| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050 |
- ///////////////////////////////////////////////////////////////////////
- // File: unicharset.h
- // Description: Unicode character/ligature set class.
- // Author: Thomas Kielbus
- //
- // (C) Copyright 2006, Google Inc.
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- // http://www.apache.org/licenses/LICENSE-2.0
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- //
- ///////////////////////////////////////////////////////////////////////
- #ifndef TESSERACT_CCUTIL_UNICHARSET_H_
- #define TESSERACT_CCUTIL_UNICHARSET_H_
- #include "errcode.h"
- #include "genericvector.h"
- #include "helpers.h"
- #include "serialis.h"
- #include "strngs.h"
- #include "tesscallback.h"
- #include "unichar.h"
- #include "unicharmap.h"
- // Enum holding special values of unichar_id. Every unicharset has these.
- // Warning! Keep in sync with kSpecialUnicharCodes.
- enum SpecialUnicharCodes {
- UNICHAR_SPACE,
- UNICHAR_JOINED,
- UNICHAR_BROKEN,
- SPECIAL_UNICHAR_CODES_COUNT
- };
- // Boolean flag for unichar_insert. It's a bit of a double negative to allow
- // the default value to be false.
- enum class OldUncleanUnichars {
- kFalse,
- kTrue,
- };
- class CHAR_FRAGMENT {
- public:
- // Minimum number of characters used for fragment representation.
- static const int kMinLen = 6;
- // Maximum number of characters used for fragment representation.
- static const int kMaxLen = 3 + UNICHAR_LEN + 2;
- // Maximum number of fragments per character.
- static const int kMaxChunks = 5;
- // Setters and Getters.
- inline void set_all(const char *unichar, int pos, int total, bool natural) {
- set_unichar(unichar);
- set_pos(pos);
- set_total(total);
- set_natural(natural);
- }
- inline void set_unichar(const char *uch) {
- strncpy(this->unichar, uch, sizeof(this->unichar));
- this->unichar[UNICHAR_LEN] = '\0';
- }
- inline void set_pos(int p) { this->pos = p; }
- inline void set_total(int t) { this->total = t; }
- inline const char* get_unichar() const { return this->unichar; }
- inline int get_pos() const { return this->pos; }
- inline int get_total() const { return this->total; }
- // Returns the string that represents a fragment
- // with the given unichar, pos and total.
- static STRING to_string(const char *unichar, int pos, int total,
- bool natural);
- // Returns the string that represents this fragment.
- STRING to_string() const {
- return to_string(unichar, pos, total, natural);
- }
- // Checks whether a fragment has the same unichar,
- // position and total as the given inputs.
- inline bool equals(const char *other_unichar,
- int other_pos, int other_total) const {
- return (strcmp(this->unichar, other_unichar) == 0 &&
- this->pos == other_pos && this->total == other_total);
- }
- inline bool equals(const CHAR_FRAGMENT *other) const {
- return this->equals(other->get_unichar(),
- other->get_pos(),
- other->get_total());
- }
- // Checks whether a given fragment is a continuation of this fragment.
- // Assumes that the given fragment pointer is not nullptr.
- inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
- return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
- this->total == fragment->get_total() &&
- this->pos == fragment->get_pos() + 1);
- }
- // Returns true if this fragment is a beginning fragment.
- inline bool is_beginning() const { return this->pos == 0; }
- // Returns true if this fragment is an ending fragment.
- inline bool is_ending() const { return this->pos == this->total-1; }
- // Returns true if the fragment was a separate component to begin with,
- // ie did not need chopping to be isolated, but may have been separated
- // out from a multi-outline blob.
- inline bool is_natural() const { return natural; }
- void set_natural(bool value) { natural = value; }
- // Parses the string to see whether it represents a character fragment
- // (rather than a regular character). If so, allocates memory for a new
- // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
- // information. Fragments are of the form:
- // |m|1|2, meaning chunk 1 of 2 of character m, or
- // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
- // to divide the parts, as they were already separate connected components.
- //
- // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
- // instance, otherwise (if the string does not represent a fragment or it
- // looks like it does, but parsing it as a fragment fails) returns nullptr.
- //
- // Note: The caller is responsible for deallocating memory
- // associated with the returned pointer.
- static CHAR_FRAGMENT *parse_from_string(const char *str);
- private:
- char unichar[UNICHAR_LEN + 1];
- // True if the fragment was a separate component to begin with,
- // ie did not need chopping to be isolated, but may have been separated
- // out from a multi-outline blob.
- bool natural;
- int16_t pos; // fragment position in the character
- int16_t total; // total number of fragments in the character
- };
- // The UNICHARSET class is an utility class for Tesseract that holds the
- // set of characters that are used by the engine. Each character is identified
- // by a unique number, from 0 to (size - 1).
- class UNICHARSET {
- public:
- // Custom list of characters and their ligature forms (UTF8)
- // These map to unicode values in the private use area (PUC) and are supported
- // by only few font families (eg. Wyld, Adobe Caslon Pro).
- static TESS_API const char* kCustomLigatures[][2];
- // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
- static TESS_API const char* kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];
- // ICU 2.0 UCharDirection enum (from icu/include/unicode/uchar.h)
- enum Direction {
- U_LEFT_TO_RIGHT = 0,
- U_RIGHT_TO_LEFT = 1,
- U_EUROPEAN_NUMBER = 2,
- U_EUROPEAN_NUMBER_SEPARATOR = 3,
- U_EUROPEAN_NUMBER_TERMINATOR = 4,
- U_ARABIC_NUMBER = 5,
- U_COMMON_NUMBER_SEPARATOR = 6,
- U_BLOCK_SEPARATOR = 7,
- U_SEGMENT_SEPARATOR = 8,
- U_WHITE_SPACE_NEUTRAL = 9,
- U_OTHER_NEUTRAL = 10,
- U_LEFT_TO_RIGHT_EMBEDDING = 11,
- U_LEFT_TO_RIGHT_OVERRIDE = 12,
- U_RIGHT_TO_LEFT_ARABIC = 13,
- U_RIGHT_TO_LEFT_EMBEDDING = 14,
- U_RIGHT_TO_LEFT_OVERRIDE = 15,
- U_POP_DIRECTIONAL_FORMAT = 16,
- U_DIR_NON_SPACING_MARK = 17,
- U_BOUNDARY_NEUTRAL = 18,
- U_FIRST_STRONG_ISOLATE = 19,
- U_LEFT_TO_RIGHT_ISOLATE = 20,
- U_RIGHT_TO_LEFT_ISOLATE = 21,
- U_POP_DIRECTIONAL_ISOLATE = 22,
- #ifndef U_HIDE_DEPRECATED_API
- U_CHAR_DIRECTION_COUNT
- #endif // U_HIDE_DEPRECATED_API
- };
- // Create an empty UNICHARSET
- UNICHARSET();
- ~UNICHARSET();
- // Return the UNICHAR_ID of a given unichar representation within the
- // UNICHARSET.
- UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
- // Return the UNICHAR_ID of a given unichar representation within the
- // UNICHARSET. Only the first length characters from unichar_repr are used.
- UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;
- // Return the minimum number of bytes that matches a legal UNICHAR_ID,
- // while leaving the rest of the string encodable. Returns 0 if the
- // beginning of the string is not encodable.
- // WARNING: this function now encodes the whole string for precision.
- // Use encode_string in preference to repeatedly calling step.
- int step(const char* str) const;
- // Returns true if the given UTF-8 string is encodable with this UNICHARSET.
- // If not encodable, write the first byte offset which cannot be converted
- // into the second (return) argument.
- bool encodable_string(const char *str, int *first_bad_position) const;
- // Encodes the given UTF-8 string with this UNICHARSET.
- // Any part of the string that cannot be encoded (because the utf8 can't
- // be broken up into pieces that are in the unicharset) then:
- // if give_up_on_failure, stops and returns a partial encoding,
- // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.
- // Returns true if the encoding succeeds completely, false if there is at
- // least one failure.
- // If lengths is not nullptr, then it is filled with the corresponding
- // byte length of each encoded UNICHAR_ID.
- // If encoded_length is not nullptr then on return it contains the length of
- // str that was encoded. (if give_up_on_failure the location of the first
- // failure, otherwise strlen(str).)
- // WARNING: Caller must guarantee that str has already been cleaned of codes
- // that do not belong in the unicharset, or encoding may fail.
- // Use CleanupString to perform the cleaning.
- bool encode_string(const char* str, bool give_up_on_failure,
- GenericVector<UNICHAR_ID>* encoding,
- GenericVector<char>* lengths,
- int* encoded_length) const;
- // Return the unichar representation corresponding to the given UNICHAR_ID
- // within the UNICHARSET.
- const char* id_to_unichar(UNICHAR_ID id) const;
- // Return the UTF8 representation corresponding to the given UNICHAR_ID after
- // resolving any private encodings internal to Tesseract. This method is
- // preferable to id_to_unichar for outputting text that will be visible to
- // external applications.
- const char* id_to_unichar_ext(UNICHAR_ID id) const;
- // Return a STRING that reformats the utf8 str into the str followed
- // by its hex unicodes.
- static STRING debug_utf8_str(const char* str);
- // Removes/replaces content that belongs in rendered text, but not in the
- // unicharset.
- static std::string CleanupString(const char* utf8_str) {
- return CleanupString(utf8_str, strlen(utf8_str));
- }
- static std::string CleanupString(const char* utf8_str, size_t length);
- // Return a STRING containing debug information on the unichar, including
- // the id_to_unichar, its hex unicodes and the properties.
- STRING debug_str(UNICHAR_ID id) const;
- STRING debug_str(const char * unichar_repr) const {
- return debug_str(unichar_to_id(unichar_repr));
- }
- // Adds a unichar representation to the set. If old_style is true, then
- // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL
- // characters are ignored/skipped as if they don't exist and n-grams that
- // can already be encoded are not added.
- void unichar_insert(const char* const unichar_repr,
- OldUncleanUnichars old_style);
- void unichar_insert(const char* const unichar_repr) {
- unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
- }
- // Adds a unichar representation to the set. Avoids setting old_style to true,
- // unless it is necessary to make the new unichar get added.
- void unichar_insert_backwards_compatible(const char* const unichar_repr) {
- std::string cleaned = CleanupString(unichar_repr);
- if (cleaned != unichar_repr) {
- unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
- } else {
- int old_size = size();
- unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
- if (size() == old_size) {
- unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
- }
- }
- }
- // Return true if the given unichar id exists within the set.
- // Relies on the fact that unichar ids are contiguous in the unicharset.
- bool contains_unichar_id(UNICHAR_ID unichar_id) const {
- return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
- unichar_id >= 0;
- }
- // Return true if the given unichar representation exists within the set.
- bool contains_unichar(const char* const unichar_repr) const;
- bool contains_unichar(const char* const unichar_repr, int length) const;
- // Return true if the given unichar representation corresponds to the given
- // UNICHAR_ID within the set.
- bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
- // Delete CHAR_FRAGMENTs stored in properties of unichars array.
- void delete_pointers_in_unichars() {
- for (int i = 0; i < size_used; ++i) {
- delete unichars[i].properties.fragment;
- unichars[i].properties.fragment = nullptr;
- }
- }
- // Clear the UNICHARSET (all the previous data is lost).
- void clear() {
- if (script_table != nullptr) {
- for (int i = 0; i < script_table_size_used; ++i)
- delete[] script_table[i];
- delete[] script_table;
- script_table = nullptr;
- script_table_size_used = 0;
- }
- if (unichars != nullptr) {
- delete_pointers_in_unichars();
- delete[] unichars;
- unichars = nullptr;
- }
- script_table_size_reserved = 0;
- size_reserved = 0;
- size_used = 0;
- ids.clear();
- top_bottom_set_ = false;
- script_has_upper_lower_ = false;
- script_has_xheight_ = false;
- old_style_included_ = false;
- null_sid_ = 0;
- common_sid_ = 0;
- latin_sid_ = 0;
- cyrillic_sid_ = 0;
- greek_sid_ = 0;
- han_sid_ = 0;
- hiragana_sid_ = 0;
- katakana_sid_ = 0;
- thai_sid_ = 0;
- hangul_sid_ = 0;
- default_sid_ = 0;
- }
- // Return the size of the set (the number of different UNICHAR it holds).
- int size() const {
- return size_used;
- }
- // Reserve enough memory space for the given number of UNICHARS
- void reserve(int unichars_number);
- // Opens the file indicated by filename and saves unicharset to that file.
- // Returns true if the operation is successful.
- bool save_to_file(const char * const filename) const {
- FILE* file = fopen(filename, "w+b");
- if (file == nullptr) return false;
- bool result = save_to_file(file);
- fclose(file);
- return result;
- }
- // Saves the content of the UNICHARSET to the given file.
- // Returns true if the operation is successful.
- bool save_to_file(FILE *file) const {
- STRING str;
- return save_to_string(&str) &&
- tesseract::Serialize(file, &str[0], str.length());
- }
- bool save_to_file(tesseract::TFile *file) const {
- STRING str;
- return save_to_string(&str) && file->Serialize(&str[0], str.length());
- }
- // Saves the content of the UNICHARSET to the given STRING.
- // Returns true if the operation is successful.
- bool save_to_string(STRING *str) const;
- // Load a unicharset from a unicharset file that has been loaded into
- // the given memory buffer.
- // Returns true if the operation is successful.
- bool load_from_inmemory_file(const char* const memory, int mem_size,
- bool skip_fragments);
- // Returns true if the operation is successful.
- bool load_from_inmemory_file(const char* const memory, int mem_size) {
- return load_from_inmemory_file(memory, mem_size, false);
- }
- // Opens the file indicated by filename and loads the UNICHARSET
- // from the given file. The previous data is lost.
- // Returns true if the operation is successful.
- bool load_from_file(const char* const filename, bool skip_fragments) {
- FILE* file = fopen(filename, "rb");
- if (file == nullptr) return false;
- bool result = load_from_file(file, skip_fragments);
- fclose(file);
- return result;
- }
- // returns true if the operation is successful.
- bool load_from_file(const char* const filename) {
- return load_from_file(filename, false);
- }
- // Loads the UNICHARSET from the given file. The previous data is lost.
- // Returns true if the operation is successful.
- bool load_from_file(FILE *file, bool skip_fragments);
- bool load_from_file(FILE *file) { return load_from_file(file, false); }
- bool load_from_file(tesseract::TFile *file, bool skip_fragments);
- // Sets up internal data after loading the file, based on the char
- // properties. Called from load_from_file, but also needs to be run
- // during set_unicharset_properties.
- void post_load_setup();
- // Returns true if right_to_left scripts are significant in the unicharset,
- // but without being so sensitive that "universal" unicharsets containing
- // characters from many scripts, like orientation and script detection,
- // look like they are right_to_left.
- bool major_right_to_left() const;
- // Set a whitelist and/or blacklist of characters to recognize.
- // An empty or nullptr whitelist enables everything (minus any blacklist).
- // An empty or nullptr blacklist disables nothing.
- // An empty or nullptr unblacklist has no effect.
- // The blacklist overrides the whitelist.
- // The unblacklist overrides the blacklist.
- // Each list is a string of utf8 character strings. Boundaries between
- // unicharset units are worked out automatically, and characters not in
- // the unicharset are silently ignored.
- void set_black_and_whitelist(const char* blacklist, const char* whitelist,
- const char* unblacklist);
- // Set the isalpha property of the given unichar to the given value.
- void set_isalpha(UNICHAR_ID unichar_id, bool value) {
- unichars[unichar_id].properties.isalpha = value;
- }
- // Set the islower property of the given unichar to the given value.
- void set_islower(UNICHAR_ID unichar_id, bool value) {
- unichars[unichar_id].properties.islower = value;
- }
- // Set the isupper property of the given unichar to the given value.
- void set_isupper(UNICHAR_ID unichar_id, bool value) {
- unichars[unichar_id].properties.isupper = value;
- }
- // Set the isdigit property of the given unichar to the given value.
- void set_isdigit(UNICHAR_ID unichar_id, bool value) {
- unichars[unichar_id].properties.isdigit = value;
- }
- // Set the ispunctuation property of the given unichar to the given value.
- void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
- unichars[unichar_id].properties.ispunctuation = value;
- }
- // Set the isngram property of the given unichar to the given value.
- void set_isngram(UNICHAR_ID unichar_id, bool value) {
- unichars[unichar_id].properties.isngram = value;
- }
- // Set the script name of the given unichar to the given value.
- // Value is copied and thus can be a temporary;
- void set_script(UNICHAR_ID unichar_id, const char* value) {
- unichars[unichar_id].properties.script_id = add_script(value);
- }
- // Set other_case unichar id in the properties for the given unichar id.
- void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
- unichars[unichar_id].properties.other_case = other_case;
- }
- // Set the direction property of the given unichar to the given value.
- void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) {
- unichars[unichar_id].properties.direction = value;
- }
- // Set mirror unichar id in the properties for the given unichar id.
- void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
- unichars[unichar_id].properties.mirror = mirror;
- }
- // Record normalized version of unichar with the given unichar_id.
- void set_normed(UNICHAR_ID unichar_id, const char* normed) {
- unichars[unichar_id].properties.normed = normed;
- unichars[unichar_id].properties.normed_ids.truncate(0);
- }
- // Sets the normed_ids vector from the normed string. normed_ids is not
- // stored in the file, and needs to be set when the UNICHARSET is loaded.
- void set_normed_ids(UNICHAR_ID unichar_id);
- // Return the isalpha property of the given unichar.
- bool get_isalpha(UNICHAR_ID unichar_id) const {
- if (INVALID_UNICHAR_ID == unichar_id) return false;
- ASSERT_HOST(contains_unichar_id(unichar_id));
- return unichars[unichar_id].properties.isalpha;
- }
- // Return the islower property of the given unichar.
- bool get_islower(UNICHAR_ID unichar_id) const {
- if (INVALID_UNICHAR_ID == unichar_id) return false;
- ASSERT_HOST(contains_unichar_id(unichar_id));
- return unichars[unichar_id].properties.islower;
- }
- // Return the isupper property of the given unichar.
- bool get_isupper(UNICHAR_ID unichar_id) const {
- if (INVALID_UNICHAR_ID == unichar_id) return false;
- ASSERT_HOST(contains_unichar_id(unichar_id));
- return unichars[unichar_id].properties.isupper;
- }
- // Return the isdigit property of the given unichar.
- bool get_isdigit(UNICHAR_ID unichar_id) const {
- if (INVALID_UNICHAR_ID == unichar_id) return false;
- ASSERT_HOST(contains_unichar_id(unichar_id));
- return unichars[unichar_id].properties.isdigit;
- }
- // Return the ispunctuation property of the given unichar.
- bool get_ispunctuation(UNICHAR_ID unichar_id) const {
- if (INVALID_UNICHAR_ID == unichar_id) return false;
- ASSERT_HOST(contains_unichar_id(unichar_id));
- return unichars[unichar_id].properties.ispunctuation;
- }
- // Return the isngram property of the given unichar.
- bool get_isngram(UNICHAR_ID unichar_id) const {
- if (INVALID_UNICHAR_ID == unichar_id) return false;
- ASSERT_HOST(contains_unichar_id(unichar_id));
- return unichars[unichar_id].properties.isngram;
- }
- // Returns whether the unichar id represents a unicode value in the private
- // use area.
- bool get_isprivate(UNICHAR_ID unichar_id) const;
- // Returns true if the ids have useful min/max top/bottom values.
- bool top_bottom_useful() const {
- return top_bottom_set_;
- }
- // Sets all ranges to empty, so they can be expanded to set the values.
- void set_ranges_empty();
- // Sets all the properties for this unicharset given a src_unicharset with
- // everything set. The unicharsets don't have to be the same, and graphemes
- // are correctly accounted for.
- void SetPropertiesFromOther(const UNICHARSET& src) {
- PartialSetPropertiesFromOther(0, src);
- }
- // Sets properties from Other, starting only at the given index.
- void PartialSetPropertiesFromOther(int start_index, const UNICHARSET& src);
- // Expands the tops and bottoms and widths for this unicharset given a
- // src_unicharset with ranges in it. The unicharsets don't have to be the
- // same, and graphemes are correctly accounted for.
- void ExpandRangesFromOther(const UNICHARSET& src);
- // Makes this a copy of src. Clears this completely first, so the automattic
- // ids will not be present in this if not in src.
- void CopyFrom(const UNICHARSET& src);
- // For each id in src, if it does not occur in this, add it, as in
- // SetPropertiesFromOther, otherwise expand the ranges, as in
- // ExpandRangesFromOther.
- void AppendOtherUnicharset(const UNICHARSET& src);
- // Returns true if the acceptable ranges of the tops of the characters do
- // not overlap, making their x-height calculations distinct.
- bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;
- // Returns the min and max bottom and top of the given unichar in
- // baseline-normalized coordinates, ie, where the baseline is
- // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
- // (See normalis.h for the definitions).
- void get_top_bottom(UNICHAR_ID unichar_id,
- int* min_bottom, int* max_bottom,
- int* min_top, int* max_top) const {
- if (INVALID_UNICHAR_ID == unichar_id) {
- *min_bottom = *min_top = 0;
- *max_bottom = *max_top = 256; // kBlnCellHeight
- return;
- }
- ASSERT_HOST(contains_unichar_id(unichar_id));
- *min_bottom = unichars[unichar_id].properties.min_bottom;
- *max_bottom = unichars[unichar_id].properties.max_bottom;
- *min_top = unichars[unichar_id].properties.min_top;
- *max_top = unichars[unichar_id].properties.max_top;
- }
- void set_top_bottom(UNICHAR_ID unichar_id,
- int min_bottom, int max_bottom,
- int min_top, int max_top) {
- unichars[unichar_id].properties.min_bottom =
- ClipToRange<int>(min_bottom, 0, UINT8_MAX);
- unichars[unichar_id].properties.max_bottom =
- ClipToRange<int>(max_bottom, 0, UINT8_MAX);
- unichars[unichar_id].properties.min_top =
- ClipToRange<int>(min_top, 0, UINT8_MAX);
- unichars[unichar_id].properties.max_top =
- ClipToRange<int>(max_top, 0, UINT8_MAX);
- }
- // Returns the width stats (as mean, sd) of the given unichar relative to the
- // median advance of all characters in the character set.
- void get_width_stats(UNICHAR_ID unichar_id,
- float* width, float* width_sd) const {
- if (INVALID_UNICHAR_ID == unichar_id) {
- *width = 0.0f;
- *width_sd = 0.0f;;
- return;
- }
- ASSERT_HOST(contains_unichar_id(unichar_id));
- *width = unichars[unichar_id].properties.width;
- *width_sd = unichars[unichar_id].properties.width_sd;
- }
- void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) {
- unichars[unichar_id].properties.width = width;
- unichars[unichar_id].properties.width_sd = width_sd;
- }
- // Returns the stats of the x-bearing (as mean, sd) of the given unichar
- // relative to the median advance of all characters in the character set.
- void get_bearing_stats(UNICHAR_ID unichar_id,
- float* bearing, float* bearing_sd) const {
- if (INVALID_UNICHAR_ID == unichar_id) {
- *bearing = *bearing_sd = 0.0f;
- return;
- }
- ASSERT_HOST(contains_unichar_id(unichar_id));
- *bearing = unichars[unichar_id].properties.bearing;
- *bearing_sd = unichars[unichar_id].properties.bearing_sd;
- }
- void set_bearing_stats(UNICHAR_ID unichar_id,
- float bearing, float bearing_sd) {
- unichars[unichar_id].properties.bearing = bearing;
- unichars[unichar_id].properties.bearing_sd = bearing_sd;
- }
- // Returns the stats of the x-advance of the given unichar (as mean, sd)
- // relative to the median advance of all characters in the character set.
- void get_advance_stats(UNICHAR_ID unichar_id,
- float* advance, float* advance_sd) const {
- if (INVALID_UNICHAR_ID == unichar_id) {
- *advance = *advance_sd = 0;
- return;
- }
- ASSERT_HOST(contains_unichar_id(unichar_id));
- *advance = unichars[unichar_id].properties.advance;
- *advance_sd = unichars[unichar_id].properties.advance_sd;
- }
- void set_advance_stats(UNICHAR_ID unichar_id,
- float advance, float advance_sd) {
- unichars[unichar_id].properties.advance = advance;
- unichars[unichar_id].properties.advance_sd = advance_sd;
- }
- // Returns true if the font metrics properties are empty.
- bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {
- return unichars[unichar_id].properties.AnyRangeEmpty();
- }
- // Returns true if the script of the given id is space delimited.
- // Returns false for Han and Thai scripts.
- bool IsSpaceDelimited(UNICHAR_ID unichar_id) const {
- if (INVALID_UNICHAR_ID == unichar_id) return true;
- int script_id = get_script(unichar_id);
- return script_id != han_sid_ && script_id != thai_sid_ &&
- script_id != hangul_sid_ && script_id != hiragana_sid_ &&
- script_id != katakana_sid_;
- }
- // Return the script name of the given unichar.
- // The returned pointer will always be the same for the same script, it's
- // managed by unicharset and thus MUST NOT be deleted
- int get_script(UNICHAR_ID unichar_id) const {
- if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
- ASSERT_HOST(contains_unichar_id(unichar_id));
- return unichars[unichar_id].properties.script_id;
- }
- // Return the character properties, eg. alpha/upper/lower/digit/punct,
- // as a bit field of unsigned int.
- unsigned int get_properties(UNICHAR_ID unichar_id) const;
- // Return the character property as a single char. If a character has
- // multiple attributes, the main property is defined by the following order:
- // upper_case : 'A'
- // lower_case : 'a'
- // alpha : 'x'
- // digit : '0'
- // punctuation: 'p'
- char get_chartype(UNICHAR_ID unichar_id) const;
- // Get other_case unichar id in the properties for the given unichar id.
- UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
- if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
- ASSERT_HOST(contains_unichar_id(unichar_id));
- return unichars[unichar_id].properties.other_case;
- }
- // Returns the direction property of the given unichar.
- Direction get_direction(UNICHAR_ID unichar_id) const {
- if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
- ASSERT_HOST(contains_unichar_id(unichar_id));
- return unichars[unichar_id].properties.direction;
- }
- // Get mirror unichar id in the properties for the given unichar id.
- UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
- if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
- ASSERT_HOST(contains_unichar_id(unichar_id));
- return unichars[unichar_id].properties.mirror;
- }
- // Returns UNICHAR_ID of the corresponding lower-case unichar.
- UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
- if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
- ASSERT_HOST(contains_unichar_id(unichar_id));
- if (unichars[unichar_id].properties.islower) return unichar_id;
- return unichars[unichar_id].properties.other_case;
- }
- // Returns UNICHAR_ID of the corresponding upper-case unichar.
- UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
- if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
- ASSERT_HOST(contains_unichar_id(unichar_id));
- if (unichars[unichar_id].properties.isupper) return unichar_id;
- return unichars[unichar_id].properties.other_case;
- }
- // Returns true if this UNICHARSET has the special codes in
- // SpecialUnicharCodes available. If false then there are normal unichars
- // at these codes and they should not be used.
- bool has_special_codes() const {
- return get_fragment(UNICHAR_BROKEN) != nullptr &&
- strcmp(id_to_unichar(UNICHAR_BROKEN),
- kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
- }
- // Returns true if there are any repeated unicodes in the normalized
- // text of any unichar-id in the unicharset.
- bool AnyRepeatedUnicodes() const;
- // Return a pointer to the CHAR_FRAGMENT class if the given
- // unichar id represents a character fragment.
- const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
- if (INVALID_UNICHAR_ID == unichar_id) return nullptr;
- ASSERT_HOST(contains_unichar_id(unichar_id));
- return unichars[unichar_id].properties.fragment;
- }
- // Return the isalpha property of the given unichar representation.
- bool get_isalpha(const char* const unichar_repr) const {
- return get_isalpha(unichar_to_id(unichar_repr));
- }
- // Return the islower property of the given unichar representation.
- bool get_islower(const char* const unichar_repr) const {
- return get_islower(unichar_to_id(unichar_repr));
- }
- // Return the isupper property of the given unichar representation.
- bool get_isupper(const char* const unichar_repr) const {
- return get_isupper(unichar_to_id(unichar_repr));
- }
- // Return the isdigit property of the given unichar representation.
- bool get_isdigit(const char* const unichar_repr) const {
- return get_isdigit(unichar_to_id(unichar_repr));
- }
- // Return the ispunctuation property of the given unichar representation.
- bool get_ispunctuation(const char* const unichar_repr) const {
- return get_ispunctuation(unichar_to_id(unichar_repr));
- }
- // Return the character properties, eg. alpha/upper/lower/digit/punct,
- // of the given unichar representation
- unsigned int get_properties(const char* const unichar_repr) const {
- return get_properties(unichar_to_id(unichar_repr));
- }
- char get_chartype(const char* const unichar_repr) const {
- return get_chartype(unichar_to_id(unichar_repr));
- }
- // Return the script name of the given unichar representation.
- // The returned pointer will always be the same for the same script, it's
- // managed by unicharset and thus MUST NOT be deleted
- int get_script(const char* const unichar_repr) const {
- return get_script(unichar_to_id(unichar_repr));
- }
- // Return a pointer to the CHAR_FRAGMENT class struct if the given
- // unichar representation represents a character fragment.
- const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
- if (unichar_repr == nullptr || unichar_repr[0] == '\0' ||
- !ids.contains(unichar_repr, false)) {
- return nullptr;
- }
- return get_fragment(unichar_to_id(unichar_repr));
- }
- // Return the isalpha property of the given unichar representation.
- // Only the first length characters from unichar_repr are used.
- bool get_isalpha(const char* const unichar_repr,
- int length) const {
- return get_isalpha(unichar_to_id(unichar_repr, length));
- }
- // Return the islower property of the given unichar representation.
- // Only the first length characters from unichar_repr are used.
- bool get_islower(const char* const unichar_repr,
- int length) const {
- return get_islower(unichar_to_id(unichar_repr, length));
- }
- // Return the isupper property of the given unichar representation.
- // Only the first length characters from unichar_repr are used.
- bool get_isupper(const char* const unichar_repr,
- int length) const {
- return get_isupper(unichar_to_id(unichar_repr, length));
- }
- // Return the isdigit property of the given unichar representation.
- // Only the first length characters from unichar_repr are used.
- bool get_isdigit(const char* const unichar_repr,
- int length) const {
- return get_isdigit(unichar_to_id(unichar_repr, length));
- }
- // Return the ispunctuation property of the given unichar representation.
- // Only the first length characters from unichar_repr are used.
- bool get_ispunctuation(const char* const unichar_repr,
- int length) const {
- return get_ispunctuation(unichar_to_id(unichar_repr, length));
- }
- // Returns normalized version of unichar with the given unichar_id.
- const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
- if (unichar_id == UNICHAR_SPACE) return " ";
- return unichars[unichar_id].properties.normed.string();
- }
- // Returns a vector of UNICHAR_IDs that represent the ids of the normalized
- // version of the given id. There may be more than one UNICHAR_ID in the
- // vector if unichar_id represents a ligature.
- const GenericVector<UNICHAR_ID>& normed_ids(UNICHAR_ID unichar_id) const {
- return unichars[unichar_id].properties.normed_ids;
- }
- // Return the script name of the given unichar representation.
- // Only the first length characters from unichar_repr are used.
- // The returned pointer will always be the same for the same script, it's
- // managed by unicharset and thus MUST NOT be deleted
- int get_script(const char* const unichar_repr,
- int length) const {
- return get_script(unichar_to_id(unichar_repr, length));
- }
- // Return the (current) number of scripts in the script table
- int get_script_table_size() const {
- return script_table_size_used;
- }
- // Return the script string from its id
- const char* get_script_from_script_id(int id) const {
- if (id >= script_table_size_used || id < 0)
- return null_script;
- return script_table[id];
- }
- // Returns the id from the name of the script, or 0 if script is not found.
- // Note that this is an expensive operation since it involves iteratively
- // comparing strings in the script table. To avoid dependency on STL, we
- // won't use a hash. Instead, the calling function can use this to lookup
- // and save the ID for relevant scripts for fast comparisons later.
- int get_script_id_from_name(const char* script_name) const;
- // Return true if the given script is the null script
- bool is_null_script(const char* script) const {
- return script == null_script;
- }
- // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
- // then the returned pointer will be the same.
- // The script parameter is copied and thus can be a temporary.
- int add_script(const char* script);
- // Return the enabled property of the given unichar.
- bool get_enabled(UNICHAR_ID unichar_id) const {
- ASSERT_HOST(contains_unichar_id(unichar_id));
- return unichars[unichar_id].properties.enabled;
- }
- int null_sid() const { return null_sid_; }
- int common_sid() const { return common_sid_; }
- int latin_sid() const { return latin_sid_; }
- int cyrillic_sid() const { return cyrillic_sid_; }
- int greek_sid() const { return greek_sid_; }
- int han_sid() const { return han_sid_; }
- int hiragana_sid() const { return hiragana_sid_; }
- int katakana_sid() const { return katakana_sid_; }
- int thai_sid() const { return thai_sid_; }
- int hangul_sid() const { return hangul_sid_; }
- int default_sid() const { return default_sid_; }
- // Returns true if the unicharset has the concept of upper/lower case.
- bool script_has_upper_lower() const {
- return script_has_upper_lower_;
- }
- // Returns true if the unicharset has the concept of x-height.
- // script_has_xheight can be true even if script_has_upper_lower is not,
- // when the script has a sufficiently predominant top line with ascenders,
- // such as Devanagari and Thai.
- bool script_has_xheight() const {
- return script_has_xheight_;
- }
- private:
- struct UNICHAR_PROPERTIES {
- UNICHAR_PROPERTIES();
- // Initializes all properties to sensible default values.
- void Init();
- // Sets all ranges wide open. Initialization default in case there are
- // no useful values available.
- void SetRangesOpen();
- // Sets all ranges to empty. Used before expanding with font-based data.
- void SetRangesEmpty();
- // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
- // is empty.
- bool AnyRangeEmpty() const;
- // Expands the ranges with the ranges from the src properties.
- void ExpandRangesFrom(const UNICHAR_PROPERTIES& src);
- // Copies the properties from src into this.
- void CopyFrom(const UNICHAR_PROPERTIES& src);
- bool isalpha;
- bool islower;
- bool isupper;
- bool isdigit;
- bool ispunctuation;
- bool isngram;
- bool enabled;
- // Possible limits of the top and bottom of the bounding box in
- // baseline-normalized coordinates, ie, where the baseline is
- // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
- // (See normalis.h for the definitions).
- uint8_t min_bottom;
- uint8_t max_bottom;
- uint8_t min_top;
- uint8_t max_top;
- // Statstics of the widths of bounding box, relative to the median advance.
- float width;
- float width_sd;
- // Stats of the x-bearing and advance, also relative to the median advance.
- float bearing;
- float bearing_sd;
- float advance;
- float advance_sd;
- int script_id;
- UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
- Direction direction; // direction of this unichar
- // Mirror property is useful for reverse DAWG lookup for words in
- // right-to-left languages (e.g. "(word)" would be in
- // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string.
- // However, what we want in our DAWG is
- // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
- // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
- UNICHAR_ID mirror;
- // A string of unichar_ids that represent the corresponding normed string.
- // For awkward characters like em-dash, this gives hyphen.
- // For ligatures, this gives the string of normal unichars.
- GenericVector<UNICHAR_ID> normed_ids;
- STRING normed; // normalized version of this unichar
- // Contains meta information about the fragment if a unichar represents
- // a fragment of a character, otherwise should be set to nullptr.
- // It is assumed that character fragments are added to the unicharset
- // after the corresponding 'base' characters.
- CHAR_FRAGMENT *fragment;
- };
- struct UNICHAR_SLOT {
- char representation[UNICHAR_LEN + 1];
- UNICHAR_PROPERTIES properties;
- };
- // Internal recursive version of encode_string above.
- // str is the start of the whole string.
- // str_index is the current position in str.
- // str_length is the length of str.
- // encoding is a working encoding of str.
- // lengths is a working set of lengths of each element of encoding.
- // best_total_length is the longest length of str that has been successfully
- // encoded so far.
- // On return:
- // best_encoding contains the encoding that used the longest part of str.
- // best_lengths (may be null) contains the lengths of best_encoding.
- void encode_string(const char* str, int str_index, int str_length,
- GenericVector<UNICHAR_ID>* encoding,
- GenericVector<char>* lengths,
- int* best_total_length,
- GenericVector<UNICHAR_ID>* best_encoding,
- GenericVector<char>* best_lengths) const;
- // Gets the properties for a grapheme string, combining properties for
- // multiple characters in a meaningful way where possible.
- // Returns false if no valid match was found in the unicharset.
- // NOTE that script_id, mirror, and other_case refer to this unicharset on
- // return and will need redirecting if the target unicharset is different.
- bool GetStrProperties(const char* utf8_str,
- UNICHAR_PROPERTIES* props) const;
- // Load ourselves from a "file" where our only interface to the file is
- // an implementation of fgets(). This is the parsing primitive accessed by
- // the public routines load_from_file() and load_from_inmemory_file().
- bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
- bool skip_fragments);
- // List of mappings to make when ingesting strings from the outside.
- // The substitutions clean up text that should exists for rendering of
- // synthetic data, but not in the recognition set.
- static const char* kCleanupMaps[][2];
- static TESS_API const char* null_script;
- UNICHAR_SLOT* unichars;
- UNICHARMAP ids;
- int size_used;
- int size_reserved;
- char** script_table;
- int script_table_size_used;
- int script_table_size_reserved;
- // True if the unichars have their tops/bottoms set.
- bool top_bottom_set_;
- // True if the unicharset has significant upper/lower case chars.
- bool script_has_upper_lower_;
- // True if the unicharset has a significant mean-line with significant
- // ascenders above that.
- bool script_has_xheight_;
- // True if the set contains chars that would be changed by the cleanup.
- bool old_style_included_;
- // A few convenient script name-to-id mapping without using hash.
- // These are initialized when unicharset file is loaded. Anything
- // missing from this list can be looked up using get_script_id_from_name.
- int null_sid_;
- int common_sid_;
- int latin_sid_;
- int cyrillic_sid_;
- int greek_sid_;
- int han_sid_;
- int hiragana_sid_;
- int katakana_sid_;
- int thai_sid_;
- int hangul_sid_;
- // The most frequently occurring script in the charset.
- int default_sid_;
- };
- #endif // TESSERACT_CCUTIL_UNICHARSET_H_
|