| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365 |
- ///////////////////////////////////////////////////////////////////////
- // File: pageiterator.h
- // Description: Iterator for tesseract page structure that avoids using
- // tesseract internal data structures.
- // Author: Ray Smith
- // Created: Fri Feb 26 11:01:06 PST 2010
- //
- // (C) Copyright 2010, Google Inc.
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- // http://www.apache.org/licenses/LICENSE-2.0
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- //
- ///////////////////////////////////////////////////////////////////////
- #ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
- #define TESSERACT_CCMAIN_PAGEITERATOR_H_
- #include "platform.h"
- #include "publictypes.h"
- struct BlamerBundle;  // Passed to PageIterator::SetWordBlamerBundle.
- class C_BLOB_IT;  // Type behind the PageIterator::cblob_it_ member.
- class PAGE_RES;  // Type behind the PageIterator::page_res_ member.
- class PAGE_RES_IT;  // Type behind the PageIterator::it_ member.
- class WERD;  // Type behind the PageIterator::word_ member.
- struct Pix;  // Image type returned by GetBinaryImage and GetImage.
- struct Pta;  // Vertex-list type returned by BlockPolygon.
- namespace tesseract {
- class Tesseract;
- /**
- * Class to iterate over tesseract page structure, providing access to all
- * levels of the page hierarchy, without including any tesseract headers or
- * having to handle any tesseract structures.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
- * DetectOS, or anything else that changes the internal PAGE_RES.
- * See publictypes.h for the definition of PageIteratorLevel.
- * See also ResultIterator, derived from PageIterator, which adds in the
- * ability to access OCR output with text-specific methods.
- */
- class TESS_API PageIterator {
- public:
- /**
- * page_res and tesseract come directly from the BaseAPI.
- * The rectangle parameters are copied indirectly from the Thresholder,
- * via the BaseAPI. They represent the coordinates of some rectangle in an
- * original image (in top-left-origin coordinates) and therefore the top-left
- * needs to be added to any output boxes in order to specify coordinates
- * in the original image. See TessBaseAPI::SetRectangle.
- * The scale and scaled_yres are in case the Thresholder scaled the image
- * rectangle prior to thresholding. Any coordinates in tesseract's image
- * must be divided by scale before adding (rect_left, rect_top).
- * The scaled_yres indicates the effective resolution of the binary image
- * that tesseract has been given by the Thresholder.
- * After the constructor, Begin has already been called.
- */
- PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
- int scale, int scaled_yres,
- int rect_left, int rect_top,
- int rect_width, int rect_height);
- virtual ~PageIterator();
- /**
- * Page/ResultIterators may be copied! This makes it possible to iterate over
- * all the objects at a lower level, while maintaining an iterator to
- * objects at a higher level. These constructors DO NOT CALL Begin, so
- * iterations will continue from the location of src.
- * Note that operator= returns a const reference, so the result of an
- * assignment expression cannot be modified through that reference.
- */
- PageIterator(const PageIterator& src);
- const PageIterator& operator=(const PageIterator& src);
- /** Are we positioned at the same location as other? */
- bool PositionedAtSameWord(const PAGE_RES_IT* other) const;
- // ============= Moving around within the page ============.
- /**
- * Moves the iterator to point to the start of the page to begin an
- * iteration.
- */
- virtual void Begin();
- /**
- * Moves the iterator to the beginning of the paragraph.
- * This class implements this functionality by moving it to the zero indexed
- * blob of the first (leftmost) word on the first row of the paragraph.
- */
- virtual void RestartParagraph();
- /**
- * Return whether this iterator points anywhere in the first textline of a
- * paragraph.
- */
- bool IsWithinFirstTextlineOfParagraph() const;
- /**
- * Moves the iterator to the beginning of the text line.
- * This class implements this functionality by moving it to the zero indexed
- * blob of the first (leftmost) word of the row.
- */
- virtual void RestartRow();
- /**
- * Moves to the start of the next object at the given level in the
- * page hierarchy, and returns false if the end of the page was reached.
- * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
- * PageIteratorLevel level values will visit each non-text block once.
- * Think of non text blocks as containing a single para, with a single line,
- * with a single imaginary word.
- * Calls to Next with different levels may be freely intermixed.
- * This function iterates words in right-to-left scripts correctly, if
- * the appropriate language has been loaded into Tesseract.
- */
- virtual bool Next(PageIteratorLevel level);
- /**
- * Returns true if the iterator is at the start of an object at the given
- * level.
- *
- * For instance, suppose an iterator it is pointed to the first symbol of the
- * first word of the third line of the second paragraph of the first block in
- * a page, then:
- * it.IsAtBeginningOf(RIL_BLOCK) = false
- * it.IsAtBeginningOf(RIL_PARA) = false
- * it.IsAtBeginningOf(RIL_TEXTLINE) = true
- * it.IsAtBeginningOf(RIL_WORD) = true
- * it.IsAtBeginningOf(RIL_SYMBOL) = true
- */
- virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
- /**
- * Returns whether the iterator is positioned at the last element in a
- * given level. (e.g. the last word in a line, the last line in a block)
- *
- * Here's some two-paragraph example
- * text. It starts off innocuously
- * enough but quickly turns bizarre.
- * The author inserts a cornucopia
- * of words to guard against confused
- * references.
- *
- * Now take an iterator it pointed to the start of "bizarre."
- * it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false
- * it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
- * it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
- */
- virtual bool IsAtFinalElement(PageIteratorLevel level,
- PageIteratorLevel element) const;
- /**
- * Returns whether this iterator is positioned
- * before other: -1
- * equal to other: 0
- * after other: 1
- */
- int Cmp(const PageIterator &other) const;
- // ============= Accessing data ==============.
- // Coordinate system:
- // Integer coordinates are at the cracks between the pixels.
- // The top-left corner of the top-left pixel in the image is at (0,0).
- // The bottom-right corner of the bottom-right pixel in the image is at
- // (width, height).
- // Every bounding box goes from the top-left of the top-left contained
- // pixel to the bottom-right of the bottom-right contained pixel, so
- // the bounding box of the single top-left pixel in the image is:
- // (0,0)->(1,1).
- // If an image rectangle has been set in the API, then returned coordinates
- // relate to the original (full) image, rather than the rectangle.
- /**
- * Controls what to include in a bounding box. Bounding boxes of all levels
- * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
- * Between layout analysis and recognition, it isn't known where all
- * diacritics belong, so this control is used to include or exclude some
- * diacritics that are above or below the main body of the word. In most cases
- * where the placement is obvious, and after recognition, it doesn't make as
- * much difference, as the diacritics will already be included in the word.
- * The two flags are simply stored in include_upper_dots_ and
- * include_lower_dots_ for later use.
- */
- void SetBoundingBoxComponents(bool include_upper_dots,
- bool include_lower_dots) {
- include_upper_dots_ = include_upper_dots;
- include_lower_dots_ = include_lower_dots;
- }
- /**
- * Returns the bounding rectangle of the current object at the given level.
- * See comment on coordinate system above.
- * Returns false if there is no such object at the current position.
- * The returned bounding box is guaranteed to match the size and position
- * of the image returned by GetBinaryImage, but may clip foreground pixels
- * from a grey image. The padding argument to GetImage can be used to expand
- * the image to include more foreground pixels. See GetImage below.
- * A second overload additionally takes a padding argument.
- * NOTE(review): presumably padding expands the returned box in the same way
- * as the padding argument of GetImage - confirm against the implementation.
- */
- bool BoundingBox(PageIteratorLevel level,
- int* left, int* top, int* right, int* bottom) const;
- bool BoundingBox(PageIteratorLevel level, int padding,
- int* left, int* top, int* right, int* bottom) const;
- /**
- * Returns the bounding rectangle of the object in a coordinate system of the
- * working image rectangle having its origin at (rect_left_, rect_top_) with
- * respect to the original image and is scaled by a factor scale_.
- */
- bool BoundingBoxInternal(PageIteratorLevel level,
- int* left, int* top, int* right, int* bottom) const;
- /** Returns whether there is no object of a given level. */
- bool Empty(PageIteratorLevel level) const;
- /**
- * Returns the type of the current block. See publictypes.h for
- * PolyBlockType.
- */
- PolyBlockType BlockType() const;
- /**
- * Returns the polygon outline of the current block. The returned Pta must
- * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices
- * of the polygon, and the last edge is the line segment between the last
- * point and the first point. nullptr will be returned if the iterator is
- * at the end of the document or layout analysis was not used.
- */
- Pta* BlockPolygon() const;
- /**
- * Returns a binary image of the current object at the given level.
- * The position and size match the return from BoundingBoxInternal, and so
- * this could be upscaled with respect to the original input image.
- * Use pixDestroy to delete the image after use.
- */
- Pix* GetBinaryImage(PageIteratorLevel level) const;
- /**
- * Returns an image of the current object at the given level in greyscale
- * if available in the input. To guarantee a binary image use GetBinaryImage.
- * NOTE that in order to give the best possible image, the bounds are
- * expanded slightly over the binary connected component, by the supplied
- * padding, so the top-left position of the returned image is returned
- * in (left,top). These will most likely not match the coordinates
- * returned by BoundingBox.
- * If you do not supply an original image, you will get a binary one.
- * Use pixDestroy to delete the image after use.
- */
- Pix* GetImage(PageIteratorLevel level, int padding, Pix* original_img,
- int* left, int* top) const;
- /**
- * Returns the baseline of the current object at the given level.
- * The baseline is the line that passes through (x1, y1) and (x2, y2).
- * WARNING: with vertical text, baselines may be vertical!
- * Returns false if there is no baseline at the current position.
- */
- bool Baseline(PageIteratorLevel level,
- int* x1, int* y1, int* x2, int* y2) const;
- /**
- * Returns orientation for the block the iterator points to.
- * orientation, writing_direction, textline_order: see publictypes.h
- * deskew_angle: after rotating the block so the text orientation is
- * upright, how many radians does one have to rotate the
- * block anti-clockwise for it to be level?
- * -Pi/4 <= deskew_angle <= Pi/4
- */
- void Orientation(tesseract::Orientation *orientation,
- tesseract::WritingDirection *writing_direction,
- tesseract::TextlineOrder *textline_order,
- float *deskew_angle) const;
- /**
- * Returns information about the current paragraph, if available.
- *
- * justification -
- * LEFT if ragged right, or fully justified and script is left-to-right.
- * RIGHT if ragged left, or fully justified and script is right-to-left.
- * unknown if it looks like source code or we have very few lines.
- * is_list_item -
- * true if we believe this is a member of an ordered or unordered list.
- * is_crown -
- * true if the first line of the paragraph is aligned with the other
- * lines of the paragraph even though subsequent paragraphs have first
- * line indents. This typically indicates that this is the continuation
- * of a previous paragraph or that it is the very first paragraph in
- * the chapter.
- * first_line_indent -
- * For LEFT aligned paragraphs, the first text line of paragraphs of
- * this kind are indented this many pixels from the left edge of the
- * rest of the paragraph.
- * for RIGHT aligned paragraphs, the first text line of paragraphs of
- * this kind are indented this many pixels from the right edge of the
- * rest of the paragraph.
- * NOTE 1: This value may be negative.
- * NOTE 2: if *is_crown == true, the first line of this paragraph is
- * actually flush, and first_line_indent is set to the "common"
- * first_line_indent for subsequent paragraphs in this block
- * of text.
- */
- void ParagraphInfo(tesseract::ParagraphJustification *justification,
- bool *is_list_item,
- bool *is_crown,
- int *first_line_indent) const;
- // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
- // of the current word to the given pointer (takes ownership of the pointer)
- // and returns true.
- // Can only be used when iterating on the word level.
- bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
- protected:
- /**
- * Sets up the internal data for iterating the blobs of a new word, then
- * moves the iterator to the given offset.
- */
- TESS_LOCAL void BeginWord(int offset);
- /** Pointer to the page_res owned by the API. */
- PAGE_RES* page_res_;
- /** Pointer to the Tesseract object owned by the API. */
- Tesseract* tesseract_;
- /**
- * The iterator to the page_res_. Owned by this PageIterator.
- * A pointer just to avoid dragging in Tesseract includes.
- */
- PAGE_RES_IT* it_;
- /**
- * The current input WERD being iterated. If there is an output from OCR,
- * then word_ is nullptr. Owned by the API.
- */
- WERD* word_;
- /** The length of the current word_. */
- int word_length_;
- /** The current blob index within the word. */
- int blob_index_;
- /**
- * Iterator to the blobs within the word. If nullptr, then we are iterating
- * OCR results in the box_word.
- * Owned by this PageIterator.
- */
- C_BLOB_IT* cblob_it_;
- /** Control over what to include in bounding boxes. */
- bool include_upper_dots_;
- bool include_lower_dots_;
- /** Parameters saved from the Thresholder. Needed to rebuild coordinates. */
- int scale_;
- int scaled_yres_;
- int rect_left_;
- int rect_top_;
- int rect_width_;
- int rect_height_;
- };
- } // namespace tesseract.
- #endif // TESSERACT_CCMAIN_PAGEITERATOR_H_
|