unicharset.h 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050
  1. ///////////////////////////////////////////////////////////////////////
  2. // File: unicharset.h
  3. // Description: Unicode character/ligature set class.
  4. // Author: Thomas Kielbus
  5. //
  6. // (C) Copyright 2006, Google Inc.
  7. // Licensed under the Apache License, Version 2.0 (the "License");
  8. // you may not use this file except in compliance with the License.
  9. // You may obtain a copy of the License at
  10. // http://www.apache.org/licenses/LICENSE-2.0
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS,
  13. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. // See the License for the specific language governing permissions and
  15. // limitations under the License.
  16. //
  17. ///////////////////////////////////////////////////////////////////////
  18. #ifndef TESSERACT_CCUTIL_UNICHARSET_H_
  19. #define TESSERACT_CCUTIL_UNICHARSET_H_
  20. #include "errcode.h"
  21. #include "genericvector.h"
  22. #include "helpers.h"
  23. #include "serialis.h"
  24. #include "strngs.h"
  25. #include "tesscallback.h"
  26. #include "unichar.h"
  27. #include "unicharmap.h"
  // Enum holding special values of unichar_id. Every unicharset has these.
  // Warning! Keep in sync with kSpecialUnicharCodes.
  enum SpecialUnicharCodes {
    UNICHAR_SPACE,
    UNICHAR_JOINED,
    UNICHAR_BROKEN,
    // Number of special codes listed above; used to size
    // kSpecialUnicharCodes.
    SPECIAL_UNICHAR_CODES_COUNT
  };
  // Boolean flag for unichar_insert. It's a bit of a double negative to allow
  // the default value to be false.
  enum class OldUncleanUnichars {
    kFalse,
    kTrue,
  };
  42. class CHAR_FRAGMENT {
  43. public:
  44. // Minimum number of characters used for fragment representation.
  45. static const int kMinLen = 6;
  46. // Maximum number of characters used for fragment representation.
  47. static const int kMaxLen = 3 + UNICHAR_LEN + 2;
  48. // Maximum number of fragments per character.
  49. static const int kMaxChunks = 5;
  50. // Setters and Getters.
  51. inline void set_all(const char *unichar, int pos, int total, bool natural) {
  52. set_unichar(unichar);
  53. set_pos(pos);
  54. set_total(total);
  55. set_natural(natural);
  56. }
  57. inline void set_unichar(const char *uch) {
  58. strncpy(this->unichar, uch, sizeof(this->unichar));
  59. this->unichar[UNICHAR_LEN] = '\0';
  60. }
  61. inline void set_pos(int p) { this->pos = p; }
  62. inline void set_total(int t) { this->total = t; }
  63. inline const char* get_unichar() const { return this->unichar; }
  64. inline int get_pos() const { return this->pos; }
  65. inline int get_total() const { return this->total; }
  66. // Returns the string that represents a fragment
  67. // with the given unichar, pos and total.
  68. static STRING to_string(const char *unichar, int pos, int total,
  69. bool natural);
  70. // Returns the string that represents this fragment.
  71. STRING to_string() const {
  72. return to_string(unichar, pos, total, natural);
  73. }
  74. // Checks whether a fragment has the same unichar,
  75. // position and total as the given inputs.
  76. inline bool equals(const char *other_unichar,
  77. int other_pos, int other_total) const {
  78. return (strcmp(this->unichar, other_unichar) == 0 &&
  79. this->pos == other_pos && this->total == other_total);
  80. }
  81. inline bool equals(const CHAR_FRAGMENT *other) const {
  82. return this->equals(other->get_unichar(),
  83. other->get_pos(),
  84. other->get_total());
  85. }
  86. // Checks whether a given fragment is a continuation of this fragment.
  87. // Assumes that the given fragment pointer is not nullptr.
  88. inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
  89. return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
  90. this->total == fragment->get_total() &&
  91. this->pos == fragment->get_pos() + 1);
  92. }
  93. // Returns true if this fragment is a beginning fragment.
  94. inline bool is_beginning() const { return this->pos == 0; }
  95. // Returns true if this fragment is an ending fragment.
  96. inline bool is_ending() const { return this->pos == this->total-1; }
  97. // Returns true if the fragment was a separate component to begin with,
  98. // ie did not need chopping to be isolated, but may have been separated
  99. // out from a multi-outline blob.
  100. inline bool is_natural() const { return natural; }
  101. void set_natural(bool value) { natural = value; }
  102. // Parses the string to see whether it represents a character fragment
  103. // (rather than a regular character). If so, allocates memory for a new
  104. // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
  105. // information. Fragments are of the form:
  106. // |m|1|2, meaning chunk 1 of 2 of character m, or
  107. // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
  108. // to divide the parts, as they were already separate connected components.
  109. //
  110. // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
  111. // instance, otherwise (if the string does not represent a fragment or it
  112. // looks like it does, but parsing it as a fragment fails) returns nullptr.
  113. //
  114. // Note: The caller is responsible for deallocating memory
  115. // associated with the returned pointer.
  116. static CHAR_FRAGMENT *parse_from_string(const char *str);
  117. private:
  118. char unichar[UNICHAR_LEN + 1];
  119. // True if the fragment was a separate component to begin with,
  120. // ie did not need chopping to be isolated, but may have been separated
  121. // out from a multi-outline blob.
  122. bool natural;
  123. int16_t pos; // fragment position in the character
  124. int16_t total; // total number of fragments in the character
  125. };
  // The UNICHARSET class is a utility class for Tesseract that holds the
  // set of characters that are used by the engine. Each character is identified
  // by a unique number, from 0 to (size - 1).
  129. class UNICHARSET {
  130. public:
  131. // Custom list of characters and their ligature forms (UTF8)
  132. // These map to unicode values in the private use area (PUC) and are supported
  133. // by only few font families (eg. Wyld, Adobe Caslon Pro).
  134. static TESS_API const char* kCustomLigatures[][2];
  135. // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
  136. static TESS_API const char* kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];
  // ICU 2.0 UCharDirection enum (from icu/include/unicode/uchar.h)
  // The numeric values are spelled out explicitly and must not be changed.
  enum Direction {
    U_LEFT_TO_RIGHT               = 0,
    U_RIGHT_TO_LEFT               = 1,
    U_EUROPEAN_NUMBER             = 2,
    U_EUROPEAN_NUMBER_SEPARATOR   = 3,
    U_EUROPEAN_NUMBER_TERMINATOR  = 4,
    U_ARABIC_NUMBER               = 5,
    U_COMMON_NUMBER_SEPARATOR     = 6,
    U_BLOCK_SEPARATOR             = 7,
    U_SEGMENT_SEPARATOR           = 8,
    U_WHITE_SPACE_NEUTRAL         = 9,
    U_OTHER_NEUTRAL               = 10,
    U_LEFT_TO_RIGHT_EMBEDDING     = 11,
    U_LEFT_TO_RIGHT_OVERRIDE      = 12,
    U_RIGHT_TO_LEFT_ARABIC        = 13,
    U_RIGHT_TO_LEFT_EMBEDDING     = 14,
    U_RIGHT_TO_LEFT_OVERRIDE      = 15,
    U_POP_DIRECTIONAL_FORMAT      = 16,
    U_DIR_NON_SPACING_MARK        = 17,
    U_BOUNDARY_NEUTRAL            = 18,
    U_FIRST_STRONG_ISOLATE        = 19,
    U_LEFT_TO_RIGHT_ISOLATE       = 20,
    U_RIGHT_TO_LEFT_ISOLATE       = 21,
    U_POP_DIRECTIONAL_ISOLATE     = 22,
    // The count enumerator is only present when U_HIDE_DEPRECATED_API is not
    // defined.
  #ifndef U_HIDE_DEPRECATED_API
    U_CHAR_DIRECTION_COUNT
  #endif  // U_HIDE_DEPRECATED_API
  };
  166. // Create an empty UNICHARSET
  167. UNICHARSET();
  168. ~UNICHARSET();
  169. // Return the UNICHAR_ID of a given unichar representation within the
  170. // UNICHARSET.
  171. UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
  172. // Return the UNICHAR_ID of a given unichar representation within the
  173. // UNICHARSET. Only the first length characters from unichar_repr are used.
  174. UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;
  175. // Return the minimum number of bytes that matches a legal UNICHAR_ID,
  176. // while leaving the rest of the string encodable. Returns 0 if the
  177. // beginning of the string is not encodable.
  178. // WARNING: this function now encodes the whole string for precision.
  179. // Use encode_string in preference to repeatedly calling step.
  180. int step(const char* str) const;
  181. // Returns true if the given UTF-8 string is encodable with this UNICHARSET.
  182. // If not encodable, write the first byte offset which cannot be converted
  183. // into the second (return) argument.
  184. bool encodable_string(const char *str, int *first_bad_position) const;
  185. // Encodes the given UTF-8 string with this UNICHARSET.
  186. // Any part of the string that cannot be encoded (because the utf8 can't
  187. // be broken up into pieces that are in the unicharset) then:
  188. // if give_up_on_failure, stops and returns a partial encoding,
  189. // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.
  190. // Returns true if the encoding succeeds completely, false if there is at
  191. // least one failure.
  192. // If lengths is not nullptr, then it is filled with the corresponding
  193. // byte length of each encoded UNICHAR_ID.
  194. // If encoded_length is not nullptr then on return it contains the length of
  195. // str that was encoded. (if give_up_on_failure the location of the first
  196. // failure, otherwise strlen(str).)
  197. // WARNING: Caller must guarantee that str has already been cleaned of codes
  198. // that do not belong in the unicharset, or encoding may fail.
  199. // Use CleanupString to perform the cleaning.
  200. bool encode_string(const char* str, bool give_up_on_failure,
  201. GenericVector<UNICHAR_ID>* encoding,
  202. GenericVector<char>* lengths,
  203. int* encoded_length) const;
  204. // Return the unichar representation corresponding to the given UNICHAR_ID
  205. // within the UNICHARSET.
  206. const char* id_to_unichar(UNICHAR_ID id) const;
  207. // Return the UTF8 representation corresponding to the given UNICHAR_ID after
  208. // resolving any private encodings internal to Tesseract. This method is
  209. // preferable to id_to_unichar for outputting text that will be visible to
  210. // external applications.
  211. const char* id_to_unichar_ext(UNICHAR_ID id) const;
  212. // Return a STRING that reformats the utf8 str into the str followed
  213. // by its hex unicodes.
  214. static STRING debug_utf8_str(const char* str);
  215. // Removes/replaces content that belongs in rendered text, but not in the
  216. // unicharset.
  217. static std::string CleanupString(const char* utf8_str) {
  218. return CleanupString(utf8_str, strlen(utf8_str));
  219. }
  220. static std::string CleanupString(const char* utf8_str, size_t length);
  221. // Return a STRING containing debug information on the unichar, including
  222. // the id_to_unichar, its hex unicodes and the properties.
  223. STRING debug_str(UNICHAR_ID id) const;
  224. STRING debug_str(const char * unichar_repr) const {
  225. return debug_str(unichar_to_id(unichar_repr));
  226. }
  227. // Adds a unichar representation to the set. If old_style is true, then
  228. // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL
  229. // characters are ignored/skipped as if they don't exist and n-grams that
  230. // can already be encoded are not added.
  231. void unichar_insert(const char* const unichar_repr,
  232. OldUncleanUnichars old_style);
  233. void unichar_insert(const char* const unichar_repr) {
  234. unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
  235. }
  236. // Adds a unichar representation to the set. Avoids setting old_style to true,
  237. // unless it is necessary to make the new unichar get added.
  238. void unichar_insert_backwards_compatible(const char* const unichar_repr) {
  239. std::string cleaned = CleanupString(unichar_repr);
  240. if (cleaned != unichar_repr) {
  241. unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
  242. } else {
  243. int old_size = size();
  244. unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
  245. if (size() == old_size) {
  246. unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
  247. }
  248. }
  249. }
  250. // Return true if the given unichar id exists within the set.
  251. // Relies on the fact that unichar ids are contiguous in the unicharset.
  252. bool contains_unichar_id(UNICHAR_ID unichar_id) const {
  253. return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
  254. unichar_id >= 0;
  255. }
  256. // Return true if the given unichar representation exists within the set.
  257. bool contains_unichar(const char* const unichar_repr) const;
  258. bool contains_unichar(const char* const unichar_repr, int length) const;
  259. // Return true if the given unichar representation corresponds to the given
  260. // UNICHAR_ID within the set.
  261. bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
  262. // Delete CHAR_FRAGMENTs stored in properties of unichars array.
  263. void delete_pointers_in_unichars() {
  264. for (int i = 0; i < size_used; ++i) {
  265. delete unichars[i].properties.fragment;
  266. unichars[i].properties.fragment = nullptr;
  267. }
  268. }
  269. // Clear the UNICHARSET (all the previous data is lost).
  270. void clear() {
  271. if (script_table != nullptr) {
  272. for (int i = 0; i < script_table_size_used; ++i)
  273. delete[] script_table[i];
  274. delete[] script_table;
  275. script_table = nullptr;
  276. script_table_size_used = 0;
  277. }
  278. if (unichars != nullptr) {
  279. delete_pointers_in_unichars();
  280. delete[] unichars;
  281. unichars = nullptr;
  282. }
  283. script_table_size_reserved = 0;
  284. size_reserved = 0;
  285. size_used = 0;
  286. ids.clear();
  287. top_bottom_set_ = false;
  288. script_has_upper_lower_ = false;
  289. script_has_xheight_ = false;
  290. old_style_included_ = false;
  291. null_sid_ = 0;
  292. common_sid_ = 0;
  293. latin_sid_ = 0;
  294. cyrillic_sid_ = 0;
  295. greek_sid_ = 0;
  296. han_sid_ = 0;
  297. hiragana_sid_ = 0;
  298. katakana_sid_ = 0;
  299. thai_sid_ = 0;
  300. hangul_sid_ = 0;
  301. default_sid_ = 0;
  302. }
  303. // Return the size of the set (the number of different UNICHAR it holds).
  304. int size() const {
  305. return size_used;
  306. }
  307. // Reserve enough memory space for the given number of UNICHARS
  308. void reserve(int unichars_number);
  309. // Opens the file indicated by filename and saves unicharset to that file.
  310. // Returns true if the operation is successful.
  311. bool save_to_file(const char * const filename) const {
  312. FILE* file = fopen(filename, "w+b");
  313. if (file == nullptr) return false;
  314. bool result = save_to_file(file);
  315. fclose(file);
  316. return result;
  317. }
  318. // Saves the content of the UNICHARSET to the given file.
  319. // Returns true if the operation is successful.
  320. bool save_to_file(FILE *file) const {
  321. STRING str;
  322. return save_to_string(&str) &&
  323. tesseract::Serialize(file, &str[0], str.length());
  324. }
  325. bool save_to_file(tesseract::TFile *file) const {
  326. STRING str;
  327. return save_to_string(&str) && file->Serialize(&str[0], str.length());
  328. }
  329. // Saves the content of the UNICHARSET to the given STRING.
  330. // Returns true if the operation is successful.
  331. bool save_to_string(STRING *str) const;
  332. // Load a unicharset from a unicharset file that has been loaded into
  333. // the given memory buffer.
  334. // Returns true if the operation is successful.
  335. bool load_from_inmemory_file(const char* const memory, int mem_size,
  336. bool skip_fragments);
  337. // Returns true if the operation is successful.
  338. bool load_from_inmemory_file(const char* const memory, int mem_size) {
  339. return load_from_inmemory_file(memory, mem_size, false);
  340. }
  341. // Opens the file indicated by filename and loads the UNICHARSET
  342. // from the given file. The previous data is lost.
  343. // Returns true if the operation is successful.
  344. bool load_from_file(const char* const filename, bool skip_fragments) {
  345. FILE* file = fopen(filename, "rb");
  346. if (file == nullptr) return false;
  347. bool result = load_from_file(file, skip_fragments);
  348. fclose(file);
  349. return result;
  350. }
  351. // returns true if the operation is successful.
  352. bool load_from_file(const char* const filename) {
  353. return load_from_file(filename, false);
  354. }
  355. // Loads the UNICHARSET from the given file. The previous data is lost.
  356. // Returns true if the operation is successful.
  357. bool load_from_file(FILE *file, bool skip_fragments);
  358. bool load_from_file(FILE *file) { return load_from_file(file, false); }
  359. bool load_from_file(tesseract::TFile *file, bool skip_fragments);
  360. // Sets up internal data after loading the file, based on the char
  361. // properties. Called from load_from_file, but also needs to be run
  362. // during set_unicharset_properties.
  363. void post_load_setup();
  364. // Returns true if right_to_left scripts are significant in the unicharset,
  365. // but without being so sensitive that "universal" unicharsets containing
  366. // characters from many scripts, like orientation and script detection,
  367. // look like they are right_to_left.
  368. bool major_right_to_left() const;
  369. // Set a whitelist and/or blacklist of characters to recognize.
  370. // An empty or nullptr whitelist enables everything (minus any blacklist).
  371. // An empty or nullptr blacklist disables nothing.
  372. // An empty or nullptr unblacklist has no effect.
  373. // The blacklist overrides the whitelist.
  374. // The unblacklist overrides the blacklist.
  375. // Each list is a string of utf8 character strings. Boundaries between
  376. // unicharset units are worked out automatically, and characters not in
  377. // the unicharset are silently ignored.
  378. void set_black_and_whitelist(const char* blacklist, const char* whitelist,
  379. const char* unblacklist);
  380. // Set the isalpha property of the given unichar to the given value.
  381. void set_isalpha(UNICHAR_ID unichar_id, bool value) {
  382. unichars[unichar_id].properties.isalpha = value;
  383. }
  384. // Set the islower property of the given unichar to the given value.
  385. void set_islower(UNICHAR_ID unichar_id, bool value) {
  386. unichars[unichar_id].properties.islower = value;
  387. }
  388. // Set the isupper property of the given unichar to the given value.
  389. void set_isupper(UNICHAR_ID unichar_id, bool value) {
  390. unichars[unichar_id].properties.isupper = value;
  391. }
  392. // Set the isdigit property of the given unichar to the given value.
  393. void set_isdigit(UNICHAR_ID unichar_id, bool value) {
  394. unichars[unichar_id].properties.isdigit = value;
  395. }
  396. // Set the ispunctuation property of the given unichar to the given value.
  397. void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
  398. unichars[unichar_id].properties.ispunctuation = value;
  399. }
  400. // Set the isngram property of the given unichar to the given value.
  401. void set_isngram(UNICHAR_ID unichar_id, bool value) {
  402. unichars[unichar_id].properties.isngram = value;
  403. }
  404. // Set the script name of the given unichar to the given value.
  405. // Value is copied and thus can be a temporary;
  406. void set_script(UNICHAR_ID unichar_id, const char* value) {
  407. unichars[unichar_id].properties.script_id = add_script(value);
  408. }
  409. // Set other_case unichar id in the properties for the given unichar id.
  410. void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
  411. unichars[unichar_id].properties.other_case = other_case;
  412. }
  413. // Set the direction property of the given unichar to the given value.
  414. void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) {
  415. unichars[unichar_id].properties.direction = value;
  416. }
  417. // Set mirror unichar id in the properties for the given unichar id.
  418. void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
  419. unichars[unichar_id].properties.mirror = mirror;
  420. }
  421. // Record normalized version of unichar with the given unichar_id.
  422. void set_normed(UNICHAR_ID unichar_id, const char* normed) {
  423. unichars[unichar_id].properties.normed = normed;
  424. unichars[unichar_id].properties.normed_ids.truncate(0);
  425. }
  426. // Sets the normed_ids vector from the normed string. normed_ids is not
  427. // stored in the file, and needs to be set when the UNICHARSET is loaded.
  428. void set_normed_ids(UNICHAR_ID unichar_id);
  429. // Return the isalpha property of the given unichar.
  430. bool get_isalpha(UNICHAR_ID unichar_id) const {
  431. if (INVALID_UNICHAR_ID == unichar_id) return false;
  432. ASSERT_HOST(contains_unichar_id(unichar_id));
  433. return unichars[unichar_id].properties.isalpha;
  434. }
  435. // Return the islower property of the given unichar.
  436. bool get_islower(UNICHAR_ID unichar_id) const {
  437. if (INVALID_UNICHAR_ID == unichar_id) return false;
  438. ASSERT_HOST(contains_unichar_id(unichar_id));
  439. return unichars[unichar_id].properties.islower;
  440. }
  441. // Return the isupper property of the given unichar.
  442. bool get_isupper(UNICHAR_ID unichar_id) const {
  443. if (INVALID_UNICHAR_ID == unichar_id) return false;
  444. ASSERT_HOST(contains_unichar_id(unichar_id));
  445. return unichars[unichar_id].properties.isupper;
  446. }
  447. // Return the isdigit property of the given unichar.
  448. bool get_isdigit(UNICHAR_ID unichar_id) const {
  449. if (INVALID_UNICHAR_ID == unichar_id) return false;
  450. ASSERT_HOST(contains_unichar_id(unichar_id));
  451. return unichars[unichar_id].properties.isdigit;
  452. }
  453. // Return the ispunctuation property of the given unichar.
  454. bool get_ispunctuation(UNICHAR_ID unichar_id) const {
  455. if (INVALID_UNICHAR_ID == unichar_id) return false;
  456. ASSERT_HOST(contains_unichar_id(unichar_id));
  457. return unichars[unichar_id].properties.ispunctuation;
  458. }
  459. // Return the isngram property of the given unichar.
  460. bool get_isngram(UNICHAR_ID unichar_id) const {
  461. if (INVALID_UNICHAR_ID == unichar_id) return false;
  462. ASSERT_HOST(contains_unichar_id(unichar_id));
  463. return unichars[unichar_id].properties.isngram;
  464. }
  465. // Returns whether the unichar id represents a unicode value in the private
  466. // use area.
  467. bool get_isprivate(UNICHAR_ID unichar_id) const;
  468. // Returns true if the ids have useful min/max top/bottom values.
  469. bool top_bottom_useful() const {
  470. return top_bottom_set_;
  471. }
  472. // Sets all ranges to empty, so they can be expanded to set the values.
  473. void set_ranges_empty();
  474. // Sets all the properties for this unicharset given a src_unicharset with
  475. // everything set. The unicharsets don't have to be the same, and graphemes
  476. // are correctly accounted for.
  477. void SetPropertiesFromOther(const UNICHARSET& src) {
  478. PartialSetPropertiesFromOther(0, src);
  479. }
  480. // Sets properties from Other, starting only at the given index.
  481. void PartialSetPropertiesFromOther(int start_index, const UNICHARSET& src);
  482. // Expands the tops and bottoms and widths for this unicharset given a
  483. // src_unicharset with ranges in it. The unicharsets don't have to be the
  484. // same, and graphemes are correctly accounted for.
  485. void ExpandRangesFromOther(const UNICHARSET& src);
  // Makes this a copy of src. Clears this completely first, so the automatic
  // ids will not be present in this if not in src.
  488. void CopyFrom(const UNICHARSET& src);
  489. // For each id in src, if it does not occur in this, add it, as in
  490. // SetPropertiesFromOther, otherwise expand the ranges, as in
  491. // ExpandRangesFromOther.
  492. void AppendOtherUnicharset(const UNICHARSET& src);
  493. // Returns true if the acceptable ranges of the tops of the characters do
  494. // not overlap, making their x-height calculations distinct.
  495. bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;
  496. // Returns the min and max bottom and top of the given unichar in
  497. // baseline-normalized coordinates, ie, where the baseline is
  498. // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
  499. // (See normalis.h for the definitions).
  500. void get_top_bottom(UNICHAR_ID unichar_id,
  501. int* min_bottom, int* max_bottom,
  502. int* min_top, int* max_top) const {
  503. if (INVALID_UNICHAR_ID == unichar_id) {
  504. *min_bottom = *min_top = 0;
  505. *max_bottom = *max_top = 256; // kBlnCellHeight
  506. return;
  507. }
  508. ASSERT_HOST(contains_unichar_id(unichar_id));
  509. *min_bottom = unichars[unichar_id].properties.min_bottom;
  510. *max_bottom = unichars[unichar_id].properties.max_bottom;
  511. *min_top = unichars[unichar_id].properties.min_top;
  512. *max_top = unichars[unichar_id].properties.max_top;
  513. }
  514. void set_top_bottom(UNICHAR_ID unichar_id,
  515. int min_bottom, int max_bottom,
  516. int min_top, int max_top) {
  517. unichars[unichar_id].properties.min_bottom =
  518. ClipToRange<int>(min_bottom, 0, UINT8_MAX);
  519. unichars[unichar_id].properties.max_bottom =
  520. ClipToRange<int>(max_bottom, 0, UINT8_MAX);
  521. unichars[unichar_id].properties.min_top =
  522. ClipToRange<int>(min_top, 0, UINT8_MAX);
  523. unichars[unichar_id].properties.max_top =
  524. ClipToRange<int>(max_top, 0, UINT8_MAX);
  525. }
  526. // Returns the width stats (as mean, sd) of the given unichar relative to the
  527. // median advance of all characters in the character set.
  528. void get_width_stats(UNICHAR_ID unichar_id,
  529. float* width, float* width_sd) const {
  530. if (INVALID_UNICHAR_ID == unichar_id) {
  531. *width = 0.0f;
  532. *width_sd = 0.0f;;
  533. return;
  534. }
  535. ASSERT_HOST(contains_unichar_id(unichar_id));
  536. *width = unichars[unichar_id].properties.width;
  537. *width_sd = unichars[unichar_id].properties.width_sd;
  538. }
  539. void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) {
  540. unichars[unichar_id].properties.width = width;
  541. unichars[unichar_id].properties.width_sd = width_sd;
  542. }
  543. // Returns the stats of the x-bearing (as mean, sd) of the given unichar
  544. // relative to the median advance of all characters in the character set.
  545. void get_bearing_stats(UNICHAR_ID unichar_id,
  546. float* bearing, float* bearing_sd) const {
  547. if (INVALID_UNICHAR_ID == unichar_id) {
  548. *bearing = *bearing_sd = 0.0f;
  549. return;
  550. }
  551. ASSERT_HOST(contains_unichar_id(unichar_id));
  552. *bearing = unichars[unichar_id].properties.bearing;
  553. *bearing_sd = unichars[unichar_id].properties.bearing_sd;
  554. }
  555. void set_bearing_stats(UNICHAR_ID unichar_id,
  556. float bearing, float bearing_sd) {
  557. unichars[unichar_id].properties.bearing = bearing;
  558. unichars[unichar_id].properties.bearing_sd = bearing_sd;
  559. }
  560. // Returns the stats of the x-advance of the given unichar (as mean, sd)
  561. // relative to the median advance of all characters in the character set.
  562. void get_advance_stats(UNICHAR_ID unichar_id,
  563. float* advance, float* advance_sd) const {
  564. if (INVALID_UNICHAR_ID == unichar_id) {
  565. *advance = *advance_sd = 0;
  566. return;
  567. }
  568. ASSERT_HOST(contains_unichar_id(unichar_id));
  569. *advance = unichars[unichar_id].properties.advance;
  570. *advance_sd = unichars[unichar_id].properties.advance_sd;
  571. }
  572. void set_advance_stats(UNICHAR_ID unichar_id,
  573. float advance, float advance_sd) {
  574. unichars[unichar_id].properties.advance = advance;
  575. unichars[unichar_id].properties.advance_sd = advance_sd;
  576. }
  577. // Returns true if the font metrics properties are empty.
  578. bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {
  579. return unichars[unichar_id].properties.AnyRangeEmpty();
  580. }
  581. // Returns true if the script of the given id is space delimited.
  582. // Returns false for Han and Thai scripts.
  583. bool IsSpaceDelimited(UNICHAR_ID unichar_id) const {
  584. if (INVALID_UNICHAR_ID == unichar_id) return true;
  585. int script_id = get_script(unichar_id);
  586. return script_id != han_sid_ && script_id != thai_sid_ &&
  587. script_id != hangul_sid_ && script_id != hiragana_sid_ &&
  588. script_id != katakana_sid_;
  589. }
  590. // Return the script name of the given unichar.
  591. // The returned pointer will always be the same for the same script, it's
  592. // managed by unicharset and thus MUST NOT be deleted
  593. int get_script(UNICHAR_ID unichar_id) const {
  594. if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
  595. ASSERT_HOST(contains_unichar_id(unichar_id));
  596. return unichars[unichar_id].properties.script_id;
  597. }
  // Returns the character properties (e.g. alpha/upper/lower/digit/punct) of
  // the given unichar id, packed as a bit field in an unsigned int.
  unsigned int get_properties(UNICHAR_ID unichar_id) const;
  // Returns the character property of the given unichar id as a single char.
  // If a character has multiple attributes, the main property is chosen in
  // the following order:
  //   upper_case : 'A'
  //   lower_case : 'a'
  //   alpha      : 'x'
  //   digit      : '0'
  //   punctuation: 'p'
  char get_chartype(UNICHAR_ID unichar_id) const;
  609. // Get other_case unichar id in the properties for the given unichar id.
  610. UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
  611. if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
  612. ASSERT_HOST(contains_unichar_id(unichar_id));
  613. return unichars[unichar_id].properties.other_case;
  614. }
  615. // Returns the direction property of the given unichar.
  616. Direction get_direction(UNICHAR_ID unichar_id) const {
  617. if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
  618. ASSERT_HOST(contains_unichar_id(unichar_id));
  619. return unichars[unichar_id].properties.direction;
  620. }
  621. // Get mirror unichar id in the properties for the given unichar id.
  622. UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
  623. if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
  624. ASSERT_HOST(contains_unichar_id(unichar_id));
  625. return unichars[unichar_id].properties.mirror;
  626. }
  627. // Returns UNICHAR_ID of the corresponding lower-case unichar.
  628. UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
  629. if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
  630. ASSERT_HOST(contains_unichar_id(unichar_id));
  631. if (unichars[unichar_id].properties.islower) return unichar_id;
  632. return unichars[unichar_id].properties.other_case;
  633. }
  634. // Returns UNICHAR_ID of the corresponding upper-case unichar.
  635. UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
  636. if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
  637. ASSERT_HOST(contains_unichar_id(unichar_id));
  638. if (unichars[unichar_id].properties.isupper) return unichar_id;
  639. return unichars[unichar_id].properties.other_case;
  640. }
  641. // Returns true if this UNICHARSET has the special codes in
  642. // SpecialUnicharCodes available. If false then there are normal unichars
  643. // at these codes and they should not be used.
  644. bool has_special_codes() const {
  645. return get_fragment(UNICHAR_BROKEN) != nullptr &&
  646. strcmp(id_to_unichar(UNICHAR_BROKEN),
  647. kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
  648. }
  // Returns true if there are any repeated unicodes in the normalized
  // text of any unichar-id in the unicharset.
  bool AnyRepeatedUnicodes() const;
  652. // Return a pointer to the CHAR_FRAGMENT class if the given
  653. // unichar id represents a character fragment.
  654. const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
  655. if (INVALID_UNICHAR_ID == unichar_id) return nullptr;
  656. ASSERT_HOST(contains_unichar_id(unichar_id));
  657. return unichars[unichar_id].properties.fragment;
  658. }
  659. // Return the isalpha property of the given unichar representation.
  660. bool get_isalpha(const char* const unichar_repr) const {
  661. return get_isalpha(unichar_to_id(unichar_repr));
  662. }
  663. // Return the islower property of the given unichar representation.
  664. bool get_islower(const char* const unichar_repr) const {
  665. return get_islower(unichar_to_id(unichar_repr));
  666. }
  667. // Return the isupper property of the given unichar representation.
  668. bool get_isupper(const char* const unichar_repr) const {
  669. return get_isupper(unichar_to_id(unichar_repr));
  670. }
  671. // Return the isdigit property of the given unichar representation.
  672. bool get_isdigit(const char* const unichar_repr) const {
  673. return get_isdigit(unichar_to_id(unichar_repr));
  674. }
  675. // Return the ispunctuation property of the given unichar representation.
  676. bool get_ispunctuation(const char* const unichar_repr) const {
  677. return get_ispunctuation(unichar_to_id(unichar_repr));
  678. }
  679. // Return the character properties, eg. alpha/upper/lower/digit/punct,
  680. // of the given unichar representation
  681. unsigned int get_properties(const char* const unichar_repr) const {
  682. return get_properties(unichar_to_id(unichar_repr));
  683. }
  684. char get_chartype(const char* const unichar_repr) const {
  685. return get_chartype(unichar_to_id(unichar_repr));
  686. }
  687. // Return the script name of the given unichar representation.
  688. // The returned pointer will always be the same for the same script, it's
  689. // managed by unicharset and thus MUST NOT be deleted
  690. int get_script(const char* const unichar_repr) const {
  691. return get_script(unichar_to_id(unichar_repr));
  692. }
  693. // Return a pointer to the CHAR_FRAGMENT class struct if the given
  694. // unichar representation represents a character fragment.
  695. const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
  696. if (unichar_repr == nullptr || unichar_repr[0] == '\0' ||
  697. !ids.contains(unichar_repr, false)) {
  698. return nullptr;
  699. }
  700. return get_fragment(unichar_to_id(unichar_repr));
  701. }
  702. // Return the isalpha property of the given unichar representation.
  703. // Only the first length characters from unichar_repr are used.
  704. bool get_isalpha(const char* const unichar_repr,
  705. int length) const {
  706. return get_isalpha(unichar_to_id(unichar_repr, length));
  707. }
  708. // Return the islower property of the given unichar representation.
  709. // Only the first length characters from unichar_repr are used.
  710. bool get_islower(const char* const unichar_repr,
  711. int length) const {
  712. return get_islower(unichar_to_id(unichar_repr, length));
  713. }
  714. // Return the isupper property of the given unichar representation.
  715. // Only the first length characters from unichar_repr are used.
  716. bool get_isupper(const char* const unichar_repr,
  717. int length) const {
  718. return get_isupper(unichar_to_id(unichar_repr, length));
  719. }
  720. // Return the isdigit property of the given unichar representation.
  721. // Only the first length characters from unichar_repr are used.
  722. bool get_isdigit(const char* const unichar_repr,
  723. int length) const {
  724. return get_isdigit(unichar_to_id(unichar_repr, length));
  725. }
  726. // Return the ispunctuation property of the given unichar representation.
  727. // Only the first length characters from unichar_repr are used.
  728. bool get_ispunctuation(const char* const unichar_repr,
  729. int length) const {
  730. return get_ispunctuation(unichar_to_id(unichar_repr, length));
  731. }
  732. // Returns normalized version of unichar with the given unichar_id.
  733. const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
  734. if (unichar_id == UNICHAR_SPACE) return " ";
  735. return unichars[unichar_id].properties.normed.string();
  736. }
  737. // Returns a vector of UNICHAR_IDs that represent the ids of the normalized
  738. // version of the given id. There may be more than one UNICHAR_ID in the
  739. // vector if unichar_id represents a ligature.
  740. const GenericVector<UNICHAR_ID>& normed_ids(UNICHAR_ID unichar_id) const {
  741. return unichars[unichar_id].properties.normed_ids;
  742. }
  743. // Return the script name of the given unichar representation.
  744. // Only the first length characters from unichar_repr are used.
  745. // The returned pointer will always be the same for the same script, it's
  746. // managed by unicharset and thus MUST NOT be deleted
  747. int get_script(const char* const unichar_repr,
  748. int length) const {
  749. return get_script(unichar_to_id(unichar_repr, length));
  750. }
  751. // Return the (current) number of scripts in the script table
  752. int get_script_table_size() const {
  753. return script_table_size_used;
  754. }
  755. // Return the script string from its id
  756. const char* get_script_from_script_id(int id) const {
  757. if (id >= script_table_size_used || id < 0)
  758. return null_script;
  759. return script_table[id];
  760. }
  // Returns the id from the name of the script, or 0 if script is not found.
  // Note that this is an expensive operation since it involves iteratively
  // comparing strings in the script table. To avoid a dependency on the STL,
  // no hash is used. Instead, callers can use this to look up and save the
  // ids of the relevant scripts once, for fast comparisons later.
  int get_script_id_from_name(const char* script_name) const;
  767. // Return true if the given script is the null script
  768. bool is_null_script(const char* script) const {
  769. return script == null_script;
  770. }
  // Uniquifies the given script. For two scripts a and b, if
  // strcmp(a, b) == 0, then the returned value will be the same.
  // NOTE(review): the return type is int, presumably the script id; the
  // earlier wording spoke of a returned pointer - confirm against the
  // implementation.
  // The script parameter is copied and thus can be a temporary.
  int add_script(const char* script);
  775. // Return the enabled property of the given unichar.
  776. bool get_enabled(UNICHAR_ID unichar_id) const {
  777. ASSERT_HOST(contains_unichar_id(unichar_id));
  778. return unichars[unichar_id].properties.enabled;
  779. }
  780. int null_sid() const { return null_sid_; }
  781. int common_sid() const { return common_sid_; }
  782. int latin_sid() const { return latin_sid_; }
  783. int cyrillic_sid() const { return cyrillic_sid_; }
  784. int greek_sid() const { return greek_sid_; }
  785. int han_sid() const { return han_sid_; }
  786. int hiragana_sid() const { return hiragana_sid_; }
  787. int katakana_sid() const { return katakana_sid_; }
  788. int thai_sid() const { return thai_sid_; }
  789. int hangul_sid() const { return hangul_sid_; }
  790. int default_sid() const { return default_sid_; }
  791. // Returns true if the unicharset has the concept of upper/lower case.
  792. bool script_has_upper_lower() const {
  793. return script_has_upper_lower_;
  794. }
  795. // Returns true if the unicharset has the concept of x-height.
  796. // script_has_xheight can be true even if script_has_upper_lower is not,
  797. // when the script has a sufficiently predominant top line with ascenders,
  798. // such as Devanagari and Thai.
  799. bool script_has_xheight() const {
  800. return script_has_xheight_;
  801. }
  802. private:
  // All the per-unichar properties held by the unicharset: character-class
  // flags, font-metric ranges/stats, script, case/mirror links, normalized
  // form and fragment information.
  struct UNICHAR_PROPERTIES {
    UNICHAR_PROPERTIES();
    // Initializes all properties to sensible default values.
    void Init();
    // Sets all ranges wide open. Initialization default in case there are
    // no useful values available.
    void SetRangesOpen();
    // Sets all ranges to empty. Used before expanding with font-based data.
    void SetRangesEmpty();
    // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
    // is empty.
    bool AnyRangeEmpty() const;
    // Expands the ranges with the ranges from the src properties.
    void ExpandRangesFrom(const UNICHAR_PROPERTIES& src);
    // Copies the properties from src into this.
    void CopyFrom(const UNICHAR_PROPERTIES& src);
    // Character-class flags.
    bool isalpha;
    bool islower;
    bool isupper;
    bool isdigit;
    bool ispunctuation;
    // NOTE(review): presumably marks unichars that are n-grams - confirm.
    bool isngram;
    // Value reported by UNICHARSET::get_enabled().
    bool enabled;
    // Possible limits of the top and bottom of the bounding box in
    // baseline-normalized coordinates, i.e., where the baseline is
    // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
    // (See normalis.h for the definitions).
    uint8_t min_bottom;
    uint8_t max_bottom;
    uint8_t min_top;
    uint8_t max_top;
    // Statistics of the widths of bounding box, relative to the median advance.
    float width;
    float width_sd;
    // Stats of the x-bearing and advance, also relative to the median advance.
    float bearing;
    float bearing_sd;
    float advance;
    float advance_sd;
    // Id of the script of this unichar.
    int script_id;
    UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
    Direction direction; // direction of this unichar
    // Mirror property is useful for reverse DAWG lookup for words in
    // right-to-left languages (e.g. "(word)" would be in
    // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string.
    // However, what we want in our DAWG is
    // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
    // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]').
    UNICHAR_ID mirror;
    // A string of unichar_ids that represent the corresponding normed string.
    // For awkward characters like em-dash, this gives hyphen.
    // For ligatures, this gives the string of normal unichars.
    GenericVector<UNICHAR_ID> normed_ids;
    STRING normed; // normalized version of this unichar
    // Contains meta information about the fragment if a unichar represents
    // a fragment of a character, otherwise should be set to nullptr.
    // It is assumed that character fragments are added to the unicharset
    // after the corresponding 'base' characters.
    CHAR_FRAGMENT *fragment;
  };
  // One entry of the unichar table: the string representation of a unichar
  // together with its properties.
  struct UNICHAR_SLOT {
    // Representation string; sized for UNICHAR_LEN chars plus one extra
    // (NOTE(review): presumably the terminating NUL - confirm).
    char representation[UNICHAR_LEN + 1];
    UNICHAR_PROPERTIES properties;
  };
  // Internal recursive version of encode_string above.
  // Inputs / working state:
  //   str is the start of the whole string.
  //   str_index is the current position in str.
  //   str_length is the length of str.
  //   encoding is a working encoding of str.
  //   lengths is a working set of lengths of each element of encoding.
  //   best_total_length is the longest length of str that has been
  //     successfully encoded so far.
  // On return:
  //   best_encoding contains the encoding that used the longest part of str.
  //   best_lengths (may be null) contains the lengths of best_encoding.
  void encode_string(const char* str, int str_index, int str_length,
                     GenericVector<UNICHAR_ID>* encoding,
                     GenericVector<char>* lengths,
                     int* best_total_length,
                     GenericVector<UNICHAR_ID>* best_encoding,
                     GenericVector<char>* best_lengths) const;
  // Gets the properties for a grapheme string, combining properties for
  // multiple characters in a meaningful way where possible.
  // Returns false if no valid match was found in the unicharset.
  // NOTE that script_id, mirror, and other_case refer to this unicharset on
  // return and will need redirecting if the target unicharset is different.
  bool GetStrProperties(const char* utf8_str,
                        UNICHAR_PROPERTIES* props) const;
  // Loads this unicharset from a "file" where the only interface to the file
  // is an implementation of fgets(). This is the parsing primitive accessed
  // by the public routines load_from_file() and load_from_inmemory_file().
  // NOTE(review): skip_fragments presumably causes fragment entries to be
  // left out while loading - confirm against the implementation.
  bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
                      bool skip_fragments);
  // List of mappings to make when ingesting strings from the outside.
  // The substitutions clean up text that should exist for rendering of
  // synthetic data, but not in the recognition set.
  // NOTE(review): each row is a pair of strings, presumably
  // {original, replacement} - confirm against the definition.
  static const char* kCleanupMaps[][2];
  // Script string handed out by get_script_from_script_id() for ids outside
  // the script table; is_null_script() compares against this pointer.
  static TESS_API const char* null_script;
  // Table of unichar slots, indexed by UNICHAR_ID.
  UNICHAR_SLOT* unichars;
  // Lookup structure over unichar representation strings.
  UNICHARMAP ids;
  // NOTE(review): presumably the number of slots of unichars in use and the
  // number allocated - confirm.
  int size_used;
  int size_reserved;
  // Script strings, indexed by script id.
  char** script_table;
  // Number of entries of script_table currently in use.
  int script_table_size_used;
  // NOTE(review): presumably the allocated capacity of script_table - confirm.
  int script_table_size_reserved;
  // True if the unichars have their tops/bottoms set.
  bool top_bottom_set_;
  // True if the unicharset has significant upper/lower case chars.
  // Exposed through script_has_upper_lower().
  bool script_has_upper_lower_;
  // True if the unicharset has a significant mean-line with significant
  // ascenders above that. Exposed through script_has_xheight().
  bool script_has_xheight_;
  // True if the set contains chars that would be changed by the cleanup.
  bool old_style_included_;
  // A few convenient script name-to-id mappings, kept without using a hash.
  // These are initialized when the unicharset file is loaded. Anything
  // missing from this list can be looked up using get_script_id_from_name.
  // Each one is exposed through the accessor of the same name without the
  // trailing underscore.
  int null_sid_;
  int common_sid_;
  int latin_sid_;
  int cyrillic_sid_;
  int greek_sid_;
  int han_sid_;
  int hiragana_sid_;
  int katakana_sid_;
  int thai_sid_;
  int hangul_sid_;
  // The most frequently occurring script in the charset.
  int default_sid_;
  932. };
  933. #endif // TESSERACT_CCUTIL_UNICHARSET_H_