pageiterator.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. ///////////////////////////////////////////////////////////////////////
  2. // File: pageiterator.h
  3. // Description: Iterator for tesseract page structure that avoids using
  4. // tesseract internal data structures.
  5. // Author: Ray Smith
  6. // Created: Fri Feb 26 11:01:06 PST 2010
  7. //
  8. // (C) Copyright 2010, Google Inc.
  9. // Licensed under the Apache License, Version 2.0 (the "License");
  10. // you may not use this file except in compliance with the License.
  11. // You may obtain a copy of the License at
  12. // http://www.apache.org/licenses/LICENSE-2.0
  13. // Unless required by applicable law or agreed to in writing, software
  14. // distributed under the License is distributed on an "AS IS" BASIS,
  15. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. // See the License for the specific language governing permissions and
  17. // limitations under the License.
  18. //
  19. ///////////////////////////////////////////////////////////////////////
  20. #ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
  21. #define TESSERACT_CCMAIN_PAGEITERATOR_H_
  22. #include "platform.h"
  23. #include "publictypes.h"
  24. struct BlamerBundle;
  25. class C_BLOB_IT;
  26. class PAGE_RES;
  27. class PAGE_RES_IT;
  28. class WERD;
  29. struct Pix;
  30. struct Pta;
  31. namespace tesseract {
  32. class Tesseract;
  33. /**
  34. * Class to iterate over tesseract page structure, providing access to all
  35. * levels of the page hierarchy, without including any tesseract headers or
  36. * having to handle any tesseract structures.
  37. * WARNING! This class points to data held within the TessBaseAPI class, and
  38. * therefore can only be used while the TessBaseAPI class still exists and
  39. * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
  40. * DetectOS, or anything else that changes the internal PAGE_RES.
  41. * See publictypes.h for the definition of PageIteratorLevel.
  42. * See also ResultIterator, derived from PageIterator, which adds in the
  43. * ability to access OCR output with text-specific methods.
  44. */
  45. class TESS_API PageIterator {
  46. public:
  47. /**
  48. * page_res and tesseract come directly from the BaseAPI.
  49. * The rectangle parameters are copied indirectly from the Thresholder,
  50. * via the BaseAPI. They represent the coordinates of some rectangle in an
  51. * original image (in top-left-origin coordinates) and therefore the top-left
  52. * needs to be added to any output boxes in order to specify coordinates
  53. * in the original image. See TessBaseAPI::SetRectangle.
  54. * The scale and scaled_yres are in case the Thresholder scaled the image
  55. * rectangle prior to thresholding. Any coordinates in tesseract's image
  56. * must be divided by scale before adding (rect_left, rect_top).
  57. * The scaled_yres indicates the effective resolution of the binary image
  58. * that tesseract has been given by the Thresholder.
  59. * After the constructor, Begin has already been called.
  60. */
  61. PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
  62. int scale, int scaled_yres,
  63. int rect_left, int rect_top,
  64. int rect_width, int rect_height);
  65. virtual ~PageIterator();
  66. /**
  67. * Page/ResultIterators may be copied! This makes it possible to iterate over
  68. * all the objects at a lower level, while maintaining an iterator to
  69. * objects at a higher level. These constructors DO NOT CALL Begin, so
  70. * iterations will continue from the location of src.
      * NOTE: operator= returns a const reference, so the result of an
      * assignment expression cannot itself be modified or assigned through.
  71. */
  72. PageIterator(const PageIterator& src);
  73. const PageIterator& operator=(const PageIterator& src);
  74. /** Are we positioned at the same location as other? */
  75. bool PositionedAtSameWord(const PAGE_RES_IT* other) const;
  76. // ============= Moving around within the page ============.
  77. /**
  78. * Moves the iterator to point to the start of the page to begin an
  79. * iteration.
  80. */
  81. virtual void Begin();
  82. /**
  83. * Moves the iterator to the beginning of the paragraph.
  84. * This class implements this functionality by moving it to the zero indexed
  85. * blob of the first (leftmost) word on the first row of the paragraph.
  86. */
  87. virtual void RestartParagraph();
  88. /**
  89. * Return whether this iterator points anywhere in the first textline of a
  90. * paragraph.
  91. */
  92. bool IsWithinFirstTextlineOfParagraph() const;
  93. /**
  94. * Moves the iterator to the beginning of the text line.
  95. * This class implements this functionality by moving it to the zero indexed
  96. * blob of the first (leftmost) word of the row.
  97. */
  98. virtual void RestartRow();
  99. /**
  100. * Moves to the start of the next object at the given level in the
  101. * page hierarchy, and returns false if the end of the page was reached.
  102. * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
  103. * PageIteratorLevel level values will visit each non-text block once.
  104. * Think of non text blocks as containing a single para, with a single line,
  105. * with a single imaginary word.
  106. * Calls to Next with different levels may be freely intermixed.
  107. * This function iterates words in right-to-left scripts correctly, if
  108. * the appropriate language has been loaded into Tesseract.
  109. */
  110. virtual bool Next(PageIteratorLevel level);
  111. /**
  112. * Returns true if the iterator is at the start of an object at the given
  113. * level.
  114. *
  115. * For instance, suppose an iterator it is pointed to the first symbol of the
  116. * first word of the third line of the second paragraph of the first block in
  117. * a page, then:
  118. * it.IsAtBeginningOf(RIL_BLOCK) = false
  119. * it.IsAtBeginningOf(RIL_PARA) = false
  120. * it.IsAtBeginningOf(RIL_TEXTLINE) = true
  121. * it.IsAtBeginningOf(RIL_WORD) = true
  122. * it.IsAtBeginningOf(RIL_SYMBOL) = true
  123. */
  124. virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
  125. /**
  126. * Returns whether the iterator is positioned at the last element in a
  127. * given level. (e.g. the last word in a line, the last line in a block)
  128. *
  129. * Here's some two-paragraph example
  130. * text. It starts off innocuously
  131. * enough but quickly turns bizarre.
  132. * The author inserts a cornucopia
  133. * of words to guard against confused
  134. * references.
  135. *
  136. * Now take an iterator it pointed to the start of "bizarre."
  137. * it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false
  138. * it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
  139. * it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
  140. */
  141. virtual bool IsAtFinalElement(PageIteratorLevel level,
  142. PageIteratorLevel element) const;
  143. /**
  144. * Returns whether this iterator is positioned
  145. * before other: -1
  146. * equal to other: 0
  147. * after other: 1
  148. */
  149. int Cmp(const PageIterator &other) const;
  150. // ============= Accessing data ==============.
  151. // Coordinate system:
  152. // Integer coordinates are at the cracks between the pixels.
  153. // The top-left corner of the top-left pixel in the image is at (0,0).
  154. // The bottom-right corner of the bottom-right pixel in the image is at
  155. // (width, height).
  156. // Every bounding box goes from the top-left of the top-left contained
  157. // pixel to the bottom-right of the bottom-right contained pixel, so
  158. // the bounding box of the single top-left pixel in the image is:
  159. // (0,0)->(1,1).
  160. // If an image rectangle has been set in the API, then returned coordinates
  161. // relate to the original (full) image, rather than the rectangle.
  162. /**
  163. * Controls what to include in a bounding box. Bounding boxes of all levels
  164. * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
  165. * Between layout analysis and recognition, it isn't known where all
  166. * diacritics belong, so this control is used to include or exclude some
  167. * diacritics that are above or below the main body of the word. In most cases
  168. * where the placement is obvious, and after recognition, it doesn't make as
  169. * much difference, as the diacritics will already be included in the word.
  170. */
  171. void SetBoundingBoxComponents(bool include_upper_dots,
  172. bool include_lower_dots) {
  173. include_upper_dots_ = include_upper_dots;
  174. include_lower_dots_ = include_lower_dots;
  175. }
  176. /**
  177. * Returns the bounding rectangle of the current object at the given level.
  178. * See comment on coordinate system above.
  179. * Returns false if there is no such object at the current position.
  180. * The returned bounding box is guaranteed to match the size and position
  181. * of the image returned by GetBinaryImage, but may clip foreground pixels
  182. * from a grey image. The padding argument to GetImage can be used to expand
  183. * the image to include more foreground pixels. See GetImage below.
       * NOTE(review): the overload taking padding is not described here;
       * presumably padding grows the box as it does for GetImage - confirm
       * against the implementation.
  184. */
  185. bool BoundingBox(PageIteratorLevel level,
  186. int* left, int* top, int* right, int* bottom) const;
  187. bool BoundingBox(PageIteratorLevel level, int padding,
  188. int* left, int* top, int* right, int* bottom) const;
  189. /**
  190. * Returns the bounding rectangle of the object in a coordinate system of the
  191. * working image rectangle having its origin at (rect_left_, rect_top_) with
  192. * respect to the original image and is scaled by a factor scale_.
  193. */
  194. bool BoundingBoxInternal(PageIteratorLevel level,
  195. int* left, int* top, int* right, int* bottom) const;
  196. /** Returns whether there is no object of a given level. */
  197. bool Empty(PageIteratorLevel level) const;
  198. /**
  199. * Returns the type of the current block. See publictypes.h for
  200. * PolyBlockType.
  201. */
  202. PolyBlockType BlockType() const;
  203. /**
  204. * Returns the polygon outline of the current block. The returned Pta must
  205. * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices
  206. * of the polygon, and the last edge is the line segment between the last
  207. * point and the first point. nullptr will be returned if the iterator is
  208. * at the end of the document or layout analysis was not used.
  209. */
  210. Pta* BlockPolygon() const;
  211. /**
  212. * Returns a binary image of the current object at the given level.
  213. * The position and size match the return from BoundingBoxInternal, and so
  214. * this could be upscaled with respect to the original input image.
  215. * Use pixDestroy to delete the image after use.
  216. */
  217. Pix* GetBinaryImage(PageIteratorLevel level) const;
  218. /**
  219. * Returns an image of the current object at the given level in greyscale
  220. * if available in the input. To guarantee a binary image use GetBinaryImage.
  221. * NOTE that in order to give the best possible image, the bounds are
  222. * expanded slightly over the binary connected component, by the supplied
  223. * padding, so the top-left position of the returned image is returned
  224. * in (left,top). These will most likely not match the coordinates
  225. * returned by BoundingBox.
  226. * If you do not supply an original image, you will get a binary one.
  227. * Use pixDestroy to delete the image after use.
  228. */
  229. Pix* GetImage(PageIteratorLevel level, int padding, Pix* original_img,
  230. int* left, int* top) const;
  231. /**
  232. * Returns the baseline of the current object at the given level.
  233. * The baseline is the line that passes through (x1, y1) and (x2, y2).
  234. * WARNING: with vertical text, baselines may be vertical!
  235. * Returns false if there is no baseline at the current position.
  236. */
  237. bool Baseline(PageIteratorLevel level,
  238. int* x1, int* y1, int* x2, int* y2) const;
  239. /**
  240. * Returns orientation for the block the iterator points to.
  241. * orientation, writing_direction, textline_order: see publictypes.h
  242. * deskew_angle: after rotating the block so the text orientation is
  243. * upright, how many radians does one have to rotate the
  244. * block anti-clockwise for it to be level?
  245. * -Pi/4 <= deskew_angle <= Pi/4
  246. */
  247. void Orientation(tesseract::Orientation *orientation,
  248. tesseract::WritingDirection *writing_direction,
  249. tesseract::TextlineOrder *textline_order,
  250. float *deskew_angle) const;
  251. /**
  252. * Returns information about the current paragraph, if available.
  253. *
  254. * justification -
  255. * LEFT if ragged right, or fully justified and script is left-to-right.
  256. * RIGHT if ragged left, or fully justified and script is right-to-left.
  257. * unknown if it looks like source code or we have very few lines.
  258. * is_list_item -
  259. * true if we believe this is a member of an ordered or unordered list.
  260. * is_crown -
  261. * true if the first line of the paragraph is aligned with the other
  262. * lines of the paragraph even though subsequent paragraphs have first
  263. * line indents. This typically indicates that this is the continuation
  264. * of a previous paragraph or that it is the very first paragraph in
  265. * the chapter.
  266. * first_line_indent -
  267. * For LEFT aligned paragraphs, the first text line of paragraphs of
  268. * this kind are indented this many pixels from the left edge of the
  269. * rest of the paragraph.
  270. * for RIGHT aligned paragraphs, the first text line of paragraphs of
  271. * this kind are indented this many pixels from the right edge of the
  272. * rest of the paragraph.
  273. * NOTE 1: This value may be negative.
  274. * NOTE 2: if *is_crown == true, the first line of this paragraph is
  275. * actually flush, and first_line_indent is set to the "common"
  276. * first_line_indent for subsequent paragraphs in this block
  277. * of text.
  278. */
  279. void ParagraphInfo(tesseract::ParagraphJustification *justification,
  280. bool *is_list_item,
  281. bool *is_crown,
  282. int *first_line_indent) const;
  283. // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
  284. // of the current word to the given pointer (takes ownership of the pointer)
  285. // and returns true.
  286. // Can only be used when iterating on the word level.
  287. bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
  288. protected:
  289. /**
  290. * Sets up the internal data for iterating the blobs of a new word, then
  291. * moves the iterator to the given offset.
  292. */
  293. TESS_LOCAL void BeginWord(int offset);
  294. /** Pointer to the page_res owned by the API. */
  295. PAGE_RES* page_res_;
  296. /** Pointer to the Tesseract object owned by the API. */
  297. Tesseract* tesseract_;
  298. /**
  299. * The iterator to the page_res_. Owned by this PageIterator.
  300. * A pointer just to avoid dragging in Tesseract includes.
  301. */
  302. PAGE_RES_IT* it_;
  303. /**
  304. * The current input WERD being iterated. If there is an output from OCR,
  305. * then word_ is nullptr. Owned by the API.
  306. */
  307. WERD* word_;
  308. /** The length of the current word_. */
  309. int word_length_;
  310. /** The current blob index within the word. */
  311. int blob_index_;
  312. /**
  313. * Iterator to the blobs within the word. If nullptr, then we are iterating
  314. * OCR results in the box_word.
  315. * Owned by this PageIterator.
  316. */
  317. C_BLOB_IT* cblob_it_;
  318. /** Control over what to include in bounding boxes. */
  319. bool include_upper_dots_;
  320. bool include_lower_dots_;
  321. /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
  322. int scale_;
  323. int scaled_yres_;
  324. int rect_left_;
  325. int rect_top_;
  326. int rect_width_;
  327. int rect_height_;
  328. };
  329. } // namespace tesseract.
  330. #endif // TESSERACT_CCMAIN_PAGEITERATOR_H_