| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258 |
- ///////////////////////////////////////////////////////////////////////
- // File: resultiterator.h
- // Description: Iterator for tesseract results that is capable of
- // iterating in proper reading order over Bi Directional
- // (e.g. mixed Hebrew and English) text.
- // Author: David Eger
- // Created: Fri May 27 13:58:06 PST 2011
- //
- // (C) Copyright 2011, Google Inc.
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- // http://www.apache.org/licenses/LICENSE-2.0
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- //
- ///////////////////////////////////////////////////////////////////////
- #ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
- #define TESSERACT_CCMAIN_RESULT_ITERATOR_H_
- #include <set> // for std::pair
- #include <vector> // for std::vector
- #include "ltrresultiterator.h" // for LTRResultIterator
- #include "platform.h" // for TESS_API, TESS_LOCAL
- #include "publictypes.h" // for PageIteratorLevel
- #include "unichar.h" // for StrongScriptDirection
- template <typename T> class GenericVector; // Forward declaration; used in this header only as pointer/reference parameters.
- template <typename T> class GenericVectorEqEq; // Forward declaration; output type of the CalculateTextlineOrder overloads.
- class STRING; // Forward declaration; text buffer type taken by the Append* helpers.
- namespace tesseract {
- class Tesseract;
- class TESS_API ResultIterator : public LTRResultIterator { // Iterates OCR results in logical reading order, including bidirectional text.
- public:
- static ResultIterator *StartOfParagraph(const LTRResultIterator &resit); // Factory. NOTE(review): presumably caller-owned and placed at resit's paragraph start - confirm.
- /**
- * ResultIterator is copy constructible!
- * The implicitly generated copy constructor works just fine for us, so none
- * is declared here; only the (defaulted) destructor is spelled out.
- */
- ~ResultIterator() override = default;
- // ============= Moving around within the page ============.
- /**
- * Moves the iterator to point to the start of the page to begin
- * an iteration.
- */
- void Begin() override;
- /**
- * Moves to the start of the next object at the given level in the
- * page hierarchy in the appropriate reading order.
- * Returns false if the end of the page was reached.
- * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
- * PageIteratorLevel level values will visit each non-text block once.
- * Think of non text blocks as containing a single para, with a single line,
- * with a single imaginary word.
- * Calls to Next with different levels may be freely intermixed.
- * This function iterates words in right-to-left scripts correctly, if
- * the appropriate language has been loaded into Tesseract.
- */
- bool Next(PageIteratorLevel level) override;
- /**
- * IsAtBeginningOf() returns whether we're at the logical beginning of the
- * given level. (as opposed to ResultIterator's left-to-right top-to-bottom
- * order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf().
- * For a full description, see pageiterator.h
- * NOTE(review): since this class iterates in reading order, the class named
- * in the parenthesis above is presumably meant to be a base iterator
- * (LTRResultIterator / PageIterator) - confirm and correct the wording.
- */
- bool IsAtBeginningOf(PageIteratorLevel level) const override;
- /**
- * Implement PageIterator's IsAtFinalElement correctly in a BiDi context.
- * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we
- * point at the last word in a paragraph. See PageIterator for full comment.
- */
- bool IsAtFinalElement(PageIteratorLevel level,
- PageIteratorLevel element) const override;
- // ============= Functions that refer to words only ============.
- // Returns the number of blanks before the current word.
- int BlanksBeforeWord() const;
- // ============= Accessing data ==============.
- /**
- * Returns the null terminated UTF-8 encoded text string for the current
- * object at the given level. Use delete [] to free after use.
- * NOTE(review): declared `virtual` rather than `override`; whether the base
- * class already declares a virtual GetUTF8Text is not visible from this
- * header - confirm which function a call through a base pointer reaches.
- */
- virtual char* GetUTF8Text(PageIteratorLevel level) const;
- /**
- * Returns the LSTM choices for every LSTM timestep for the current word.
- * Each innermost entry pairs a C string with a float.
- * NOTE(review): presumably the outer vector is indexed by timestep and the
- * float is a score for the paired string; ownership of the returned pointer
- * and of the C strings is not stated here - confirm in the implementation.
- * NOTE(review): std::pair reaches this header through <set>; its canonical
- * header is <utility>.
- */
- virtual std::vector<std::vector<std::pair<const char*, float>>>*
- GetBestLSTMSymbolChoices() const;
- /**
- * Return whether the current paragraph's dominant reading direction
- * is left-to-right (as opposed to right-to-left).
- */
- bool ParagraphIsLtr() const;
- // ============= Exposed only for testing =============.
- /**
- * Yields the reading order as a sequence of indices and (optional)
- * meta-marks for a set of words (given left-to-right).
- * word_dirs supplies one direction per word; the result is written to
- * reading_order.
- * The meta marks are passed as negative values:
- * kMinorRunStart Start of minor direction text.
- * kMinorRunEnd End of minor direction text.
- * kComplexWord The next indexed word contains both left-to-right and
- * right-to-left characters and was treated as neutral.
- *
- * For example, suppose we have five words in a text line,
- * indexed [0,1,2,3,4] from the leftmost side of the text line.
- * The following are all believable reading_orders:
- *
- * Left-to-Right (in ltr paragraph):
- * { 0, 1, 2, 3, 4 }
- * Left-to-Right (in rtl paragraph):
- * { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
- * Right-to-Left (in rtl paragraph):
- * { 4, 3, 2, 1, 0 }
- * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
- * { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
- */
- static void CalculateTextlineOrder(
- bool paragraph_is_ltr,
- const GenericVector<StrongScriptDirection> &word_dirs,
- GenericVectorEqEq<int> *reading_order);
- static const int kMinorRunStart; // Meta-mark: start of minor direction text (value defined out of line).
- static const int kMinorRunEnd; // Meta-mark: end of minor direction text (value defined out of line).
- static const int kComplexWord; // Meta-mark: flags a word mixing left-to-right and right-to-left characters.
- protected:
- /**
- * We presume the data associated with the given iterator will outlive us.
- * NB: This is not public (it is protected and marked TESS_LOCAL) because it
- * does something that is non-obvious:
- * it resets to the beginning of the paragraph instead of staying wherever
- * resit might have pointed.
- */
- TESS_LOCAL explicit ResultIterator(const LTRResultIterator &resit);
- private:
- /**
- * Calculates the current paragraph's dominant writing direction.
- * Typically, members should use current_paragraph_is_ltr_ instead.
- */
- bool CurrentParagraphIsLtr() const;
- /**
- * Returns word indices as measured from resit->RestartRow() = index 0
- * for the reading order of words within a textline given an iterator
- * into the middle of the text line.
- * In addition to non-negative word indices, the following negative values
- * may be inserted:
- * kMinorRunStart Start of minor direction text.
- * kMinorRunEnd End of minor direction text.
- * kComplexWord The previous word contains both left-to-right and
- * right-to-left characters and was treated as neutral.
- * NOTE(review): the public static overload documents kComplexWord as
- * describing the *next* indexed word, not the previous one; one of the two
- * comments is wrong - confirm against the implementation.
- */
- void CalculateTextlineOrder(bool paragraph_is_ltr,
- const LTRResultIterator &resit,
- GenericVectorEqEq<int> *indices) const;
- /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */
- void CalculateTextlineOrder(bool paragraph_is_ltr,
- const LTRResultIterator &resit,
- GenericVector<StrongScriptDirection> *ssd,
- GenericVectorEqEq<int> *indices) const;
- /**
- * What is the index of the current word in a strict left-to-right reading
- * of the row?
- */
- int LTRWordIndex() const;
- /**
- * Given an iterator pointing at a word, returns the logical reading order
- * of blob indices for the word. The order is written to blob_indices.
- */
- void CalculateBlobOrder(GenericVector<int> *blob_indices) const;
- /** Precondition: current_paragraph_is_ltr_ is set. */
- void MoveToLogicalStartOfTextline();
- /**
- * Precondition: current_paragraph_is_ltr_ and in_minor_direction_
- * are set.
- */
- void MoveToLogicalStartOfWord();
- /** Are we pointing at the final (reading order) symbol of the word? */
- bool IsAtFinalSymbolOfWord() const;
- /** Are we pointing at the first (reading order) symbol of the word? */
- bool IsAtFirstSymbolOfWord() const;
- /**
- * Append any extra marks that should be appended to this word when printed.
- * Mostly, these are Unicode BiDi control characters.
- */
- void AppendSuffixMarks(STRING *text) const;
- /** Appends the current word in reading order to the given buffer. */
- void AppendUTF8WordText(STRING *text) const;
- /**
- * Appends the text of the current text line, *assuming this iterator is
- * positioned at the beginning of the text line*. This function
- * updates the iterator to point to the first position past the text line,
- * which is why, unlike the other Append helpers, it is not const.
- * Each textline is terminated in a single newline character.
- * If the textline ends a paragraph, it gets a second terminal newline.
- */
- void IterateAndAppendUTF8TextlineText(STRING *text);
- /**
- * Appends the text of the current paragraph in reading order
- * to the given buffer.
- * Each textline is terminated in a single newline character, and the
- * paragraph gets an extra newline at the end.
- */
- void AppendUTF8ParagraphText(STRING *text) const;
- /** Returns whether the bidi_debug flag is set to at least min_level. */
- bool BidiDebug(int min_level) const;
- bool current_paragraph_is_ltr_; // Dominant direction of the current paragraph; presumably cached from CurrentParagraphIsLtr().
- /**
- * Is the currently pointed-at character at the beginning of
- * a minor-direction run?
- */
- bool at_beginning_of_minor_run_;
- /** Is the currently pointed-at character in a minor-direction sequence? */
- bool in_minor_direction_;
- /**
- * Should detected inter-word spaces be preserved, or "compressed" to a single
- * space character (default behavior).
- */
- bool preserve_interword_spaces_;
- };
- } // namespace tesseract.
- #endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_
|