// resultiterator.h (9.5 KB)
///////////////////////////////////////////////////////////////////////
// File:        resultiterator.h
// Description: Iterator for tesseract results that is capable of
//              iterating in proper reading order over bidirectional
//              (e.g. mixed Hebrew and English) text.
// Author:      David Eger
// Created:     Fri May 27 13:58:06 PST 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
  21. #ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
  22. #define TESSERACT_CCMAIN_RESULT_ITERATOR_H_
#include <set>      // kept from the original include list (was cited for std::pair)
#include <utility>  // for std::pair
#include <vector>   // for std::vector

#include "ltrresultiterator.h"  // for LTRResultIterator
#include "platform.h"           // for TESS_API, TESS_LOCAL
#include "publictypes.h"        // for PageIteratorLevel
#include "unichar.h"            // for StrongScriptDirection
  29. template <typename T> class GenericVector;
  30. template <typename T> class GenericVectorEqEq;
  31. class STRING;
  32. namespace tesseract {
  33. class Tesseract;
  34. class TESS_API ResultIterator : public LTRResultIterator {
  35. public:
  36. static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);
  37. /**
  38. * ResultIterator is copy constructible!
  39. * The default copy constructor works just fine for us.
  40. */
  41. ~ResultIterator() override = default;
  42. // ============= Moving around within the page ============.
  43. /**
  44. * Moves the iterator to point to the start of the page to begin
  45. * an iteration.
  46. */
  47. void Begin() override;
  48. /**
  49. * Moves to the start of the next object at the given level in the
  50. * page hierarchy in the appropriate reading order and returns false if
  51. * the end of the page was reached.
  52. * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
  53. * PageIteratorLevel level values will visit each non-text block once.
  54. * Think of non text blocks as containing a single para, with a single line,
  55. * with a single imaginary word.
  56. * Calls to Next with different levels may be freely intermixed.
  57. * This function iterates words in right-to-left scripts correctly, if
  58. * the appropriate language has been loaded into Tesseract.
  59. */
  60. bool Next(PageIteratorLevel level) override;
  61. /**
  62. * IsAtBeginningOf() returns whether we're at the logical beginning of the
  63. * given level. (as opposed to ResultIterator's left-to-right top-to-bottom
  64. * order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf().
  65. * For a full description, see pageiterator.h
  66. */
  67. bool IsAtBeginningOf(PageIteratorLevel level) const override;
  68. /**
  69. * Implement PageIterator's IsAtFinalElement correctly in a BiDi context.
  70. * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we
  71. * point at the last word in a paragraph. See PageIterator for full comment.
  72. */
  73. bool IsAtFinalElement(PageIteratorLevel level,
  74. PageIteratorLevel element) const override;
  75. // ============= Functions that refer to words only ============.
  76. // Returns the number of blanks before the current word.
  77. int BlanksBeforeWord() const;
  78. // ============= Accessing data ==============.
  79. /**
  80. * Returns the null terminated UTF-8 encoded text string for the current
  81. * object at the given level. Use delete [] to free after use.
  82. */
  83. virtual char* GetUTF8Text(PageIteratorLevel level) const;
  84. /**
  85. * Returns the LSTM choices for every LSTM timestep for the current word.
  86. */
  87. virtual std::vector<std::vector<std::pair<const char*, float>>>*
  88. GetBestLSTMSymbolChoices() const;
  89. /**
  90. * Return whether the current paragraph's dominant reading direction
  91. * is left-to-right (as opposed to right-to-left).
  92. */
  93. bool ParagraphIsLtr() const;
  94. // ============= Exposed only for testing =============.
  95. /**
  96. * Yields the reading order as a sequence of indices and (optional)
  97. * meta-marks for a set of words (given left-to-right).
  98. * The meta marks are passed as negative values:
  99. * kMinorRunStart Start of minor direction text.
  100. * kMinorRunEnd End of minor direction text.
  101. * kComplexWord The next indexed word contains both left-to-right and
  102. * right-to-left characters and was treated as neutral.
  103. *
  104. * For example, suppose we have five words in a text line,
  105. * indexed [0,1,2,3,4] from the leftmost side of the text line.
  106. * The following are all believable reading_orders:
  107. *
  108. * Left-to-Right (in ltr paragraph):
  109. * { 0, 1, 2, 3, 4 }
  110. * Left-to-Right (in rtl paragraph):
  111. * { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
  112. * Right-to-Left (in rtl paragraph):
  113. * { 4, 3, 2, 1, 0 }
  114. * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
  115. * { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
  116. */
  117. static void CalculateTextlineOrder(
  118. bool paragraph_is_ltr,
  119. const GenericVector<StrongScriptDirection> &word_dirs,
  120. GenericVectorEqEq<int> *reading_order);
  121. static const int kMinorRunStart;
  122. static const int kMinorRunEnd;
  123. static const int kComplexWord;
  124. protected:
  125. /**
  126. * We presume the data associated with the given iterator will outlive us.
  127. * NB: This is private because it does something that is non-obvious:
  128. * it resets to the beginning of the paragraph instead of staying wherever
  129. * resit might have pointed.
  130. */
  131. TESS_LOCAL explicit ResultIterator(const LTRResultIterator &resit);
  132. private:
  133. /**
  134. * Calculates the current paragraph's dominant writing direction.
  135. * Typically, members should use current_paragraph_ltr_ instead.
  136. */
  137. bool CurrentParagraphIsLtr() const;
  138. /**
  139. * Returns word indices as measured from resit->RestartRow() = index 0
  140. * for the reading order of words within a textline given an iterator
  141. * into the middle of the text line.
  142. * In addition to non-negative word indices, the following negative values
  143. * may be inserted:
  144. * kMinorRunStart Start of minor direction text.
  145. * kMinorRunEnd End of minor direction text.
  146. * kComplexWord The previous word contains both left-to-right and
  147. * right-to-left characters and was treated as neutral.
  148. */
  149. void CalculateTextlineOrder(bool paragraph_is_ltr,
  150. const LTRResultIterator &resit,
  151. GenericVectorEqEq<int> *indices) const;
  152. /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */
  153. void CalculateTextlineOrder(bool paragraph_is_ltr,
  154. const LTRResultIterator &resit,
  155. GenericVector<StrongScriptDirection> *ssd,
  156. GenericVectorEqEq<int> *indices) const;
  157. /**
  158. * What is the index of the current word in a strict left-to-right reading
  159. * of the row?
  160. */
  161. int LTRWordIndex() const;
  162. /**
  163. * Given an iterator pointing at a word, returns the logical reading order
  164. * of blob indices for the word.
  165. */
  166. void CalculateBlobOrder(GenericVector<int> *blob_indices) const;
  167. /** Precondition: current_paragraph_is_ltr_ is set. */
  168. void MoveToLogicalStartOfTextline();
  169. /**
  170. * Precondition: current_paragraph_is_ltr_ and in_minor_direction_
  171. * are set.
  172. */
  173. void MoveToLogicalStartOfWord();
  174. /** Are we pointing at the final (reading order) symbol of the word? */
  175. bool IsAtFinalSymbolOfWord() const;
  176. /** Are we pointing at the first (reading order) symbol of the word? */
  177. bool IsAtFirstSymbolOfWord() const;
  178. /**
  179. * Append any extra marks that should be appended to this word when printed.
  180. * Mostly, these are Unicode BiDi control characters.
  181. */
  182. void AppendSuffixMarks(STRING *text) const;
  183. /** Appends the current word in reading order to the given buffer.*/
  184. void AppendUTF8WordText(STRING *text) const;
  185. /**
  186. * Appends the text of the current text line, *assuming this iterator is
  187. * positioned at the beginning of the text line* This function
  188. * updates the iterator to point to the first position past the text line.
  189. * Each textline is terminated in a single newline character.
  190. * If the textline ends a paragraph, it gets a second terminal newline.
  191. */
  192. void IterateAndAppendUTF8TextlineText(STRING *text);
  193. /**
  194. * Appends the text of the current paragraph in reading order
  195. * to the given buffer.
  196. * Each textline is terminated in a single newline character, and the
  197. * paragraph gets an extra newline at the end.
  198. */
  199. void AppendUTF8ParagraphText(STRING *text) const;
  200. /** Returns whether the bidi_debug flag is set to at least min_level. */
  201. bool BidiDebug(int min_level) const;
  202. bool current_paragraph_is_ltr_;
  203. /**
  204. * Is the currently pointed-at character at the beginning of
  205. * a minor-direction run?
  206. */
  207. bool at_beginning_of_minor_run_;
  208. /** Is the currently pointed-at character in a minor-direction sequence? */
  209. bool in_minor_direction_;
  210. /**
  211. * Should detected inter-word spaces be preserved, or "compressed" to a single
  212. * space character (default behavior).
  213. */
  214. bool preserve_interword_spaces_;
  215. };
  216. } // namespace tesseract.
  217. #endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_