unicharcompress.h 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. ///////////////////////////////////////////////////////////////////////
  2. // File: unicharcompress.h
  3. // Description: Unicode re-encoding using a sequence of smaller numbers in
  4. // place of a single large code for CJK, similarly for Indic,
  5. // and dissection of ligatures for other scripts.
  6. // Author: Ray Smith
  7. // Created: Wed Mar 04 14:45:01 PST 2015
  8. //
  9. // (C) Copyright 2015, Google Inc.
  10. // Licensed under the Apache License, Version 2.0 (the "License");
  11. // you may not use this file except in compliance with the License.
  12. // You may obtain a copy of the License at
  13. // http://www.apache.org/licenses/LICENSE-2.0
  14. // Unless required by applicable law or agreed to in writing, software
  15. // distributed under the License is distributed on an "AS IS" BASIS,
  16. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17. // See the License for the specific language governing permissions and
  18. // limitations under the License.
  19. //
  20. ///////////////////////////////////////////////////////////////////////
  21. #ifndef TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
  22. #define TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
  23. #include <unordered_map>
  24. #include "serialis.h"
  25. #include "strngs.h"
  26. #include "unicharset.h"
  27. namespace tesseract {
  28. // Trivial class to hold the code for a recoded unichar-id.
  29. class RecodedCharID {
  30. public:
  31. // The maximum length of a code.
  32. static const int kMaxCodeLen = 9;
  33. RecodedCharID() : self_normalized_(1), length_(0) {
  34. memset(code_, 0, sizeof(code_));
  35. }
  36. void Truncate(int length) { length_ = length; }
  37. // Sets the code value at the given index in the code.
  38. void Set(int index, int value) {
  39. code_[index] = value;
  40. if (length_ <= index) length_ = index + 1;
  41. }
  42. // Shorthand for setting codes of length 3, as all Hangul and Han codes are
  43. // length 3.
  44. void Set3(int code0, int code1, int code2) {
  45. length_ = 3;
  46. code_[0] = code0;
  47. code_[1] = code1;
  48. code_[2] = code2;
  49. }
  50. // Accessors
  51. int length() const { return length_; }
  52. int operator()(int index) const { return code_[index]; }
  53. // Writes to the given file. Returns false in case of error.
  54. bool Serialize(TFile* fp) const {
  55. return fp->Serialize(&self_normalized_) &&
  56. fp->Serialize(&length_) &&
  57. fp->Serialize(&code_[0], length_);
  58. }
  59. // Reads from the given file. Returns false in case of error.
  60. bool DeSerialize(TFile* fp) {
  61. return fp->DeSerialize(&self_normalized_) &&
  62. fp->DeSerialize(&length_) &&
  63. fp->DeSerialize(&code_[0], length_);
  64. }
  65. bool operator==(const RecodedCharID& other) const {
  66. if (length_ != other.length_) return false;
  67. for (int i = 0; i < length_; ++i) {
  68. if (code_[i] != other.code_[i]) return false;
  69. }
  70. return true;
  71. }
  72. // Hash functor for RecodedCharID.
  73. struct RecodedCharIDHash {
  74. size_t operator()(const RecodedCharID& code) const {
  75. size_t result = 0;
  76. for (int i = 0; i < code.length_; ++i) {
  77. result ^= code(i) << (7 * i);
  78. }
  79. return result;
  80. }
  81. };
  82. private:
  83. // True if this code is self-normalizing, ie is the master entry for indices
  84. // that map to the same code. Has boolean value, but int8_t for serialization.
  85. int8_t self_normalized_;
  86. // The number of elements in use in code_;
  87. int32_t length_;
  88. // The re-encoded form of the unichar-id to which this RecodedCharID relates.
  89. int32_t code_[kMaxCodeLen];
  90. };
  91. // Class holds a "compression" of a unicharset to simplify the learning problem
  92. // for a neural-network-based classifier.
  93. // Objectives:
  94. // 1 (CJK): Ids of a unicharset with a large number of classes are expressed as
  95. // a sequence of 3 codes with much fewer values.
  96. // This is achieved using the Jamo coding for Hangul and the Unicode
  97. // Radical-Stroke-index for Han.
  98. // 2 (Indic): Instead of thousands of codes with one for each grapheme, re-code
  99. // as the unicode sequence (but coded in a more compact space).
  100. // 3 (the rest): Eliminate multi-path problems with ligatures and fold confusing
  101. // and not significantly distinct shapes (quotes) together, ie
  102. // represent the fi ligature as the f-i pair, and fold u+2019 and
  103. // friends all onto ascii single '
  104. // 4 The null character and mapping to target activations:
  105. // To save horizontal coding space, the compressed codes are generally mapped
  106. // to target network activations without intervening null characters, BUT
  107. // in the case of ligatures, such as ff, null characters have to be included
  108. // so existence of repeated codes is detected at codebook-building time, and
  109. // null characters are embedded directly into the codes, so the rest of the
  110. // system doesn't need to worry about the problem (much). There is still an
  111. // effect on the range of ways in which the target activations can be
  112. // generated.
  113. //
  114. // The computed code values are compact (no unused values), and, for CJK,
  115. // unique (each code position uses a disjoint set of values from each other code
  116. // position). For non-CJK, the same code value CAN be used in multiple
  117. // positions, eg the ff ligature is converted to <f> <nullchar> <f>, where <f>
  118. // is the same code as is used for the single f.
  119. class UnicharCompress {
  120. public:
  121. UnicharCompress();
  122. UnicharCompress(const UnicharCompress& src);
  123. ~UnicharCompress();
  124. UnicharCompress& operator=(const UnicharCompress& src);
  125. // The 1st Hangul unicode.
  126. static const int kFirstHangul = 0xac00;
  127. // The number of Hangul unicodes.
  128. static const int kNumHangul = 11172;
  129. // The number of Jamos for each of the 3 parts of a Hangul character, being
  130. // the Leading consonant, Vowel and Trailing consonant.
  131. static const int kLCount = 19;
  132. static const int kVCount = 21;
  133. static const int kTCount = 28;
  134. // Computes the encoding for the given unicharset. It is a requirement that
  135. // the file training/langdata/radical-stroke.txt have been read into the
  136. // input string radical_stroke_table.
  137. // Returns false if the encoding cannot be constructed.
  138. bool ComputeEncoding(const UNICHARSET& unicharset, int null_id,
  139. STRING* radical_stroke_table);
  140. // Sets up an encoder that doesn't change the unichars at all, so it just
  141. // passes them through unchanged.
  142. void SetupPassThrough(const UNICHARSET& unicharset);
  143. // Sets up an encoder directly using the given encoding vector, which maps
  144. // unichar_ids to the given codes.
  145. void SetupDirect(const GenericVector<RecodedCharID>& codes);
  146. // Returns the number of different values that can be used in a code, ie
  147. // 1 + the maximum value that will ever be used by an RecodedCharID code in
  148. // any position in its array.
  149. int code_range() const { return code_range_; }
  150. // Encodes a single unichar_id. Returns the length of the code, (or zero if
  151. // invalid input), and the encoding itself in code.
  152. int EncodeUnichar(int unichar_id, RecodedCharID* code) const;
  153. // Decodes code, returning the original unichar-id, or
  154. // INVALID_UNICHAR_ID if the input is invalid.
  155. int DecodeUnichar(const RecodedCharID& code) const;
  156. // Returns true if the given code is a valid start or single code.
  157. bool IsValidFirstCode(int code) const { return is_valid_start_[code]; }
  158. // Returns a list of valid non-final next codes for a given prefix code,
  159. // which may be empty.
  160. const GenericVector<int>* GetNextCodes(const RecodedCharID& code) const {
  161. auto it = next_codes_.find(code);
  162. return it == next_codes_.end() ? nullptr : it->second;
  163. }
  164. // Returns a list of valid final codes for a given prefix code, which may
  165. // be empty.
  166. const GenericVector<int>* GetFinalCodes(const RecodedCharID& code) const {
  167. auto it = final_codes_.find(code);
  168. return it == final_codes_.end() ? nullptr : it->second;
  169. }
  170. // Writes to the given file. Returns false in case of error.
  171. bool Serialize(TFile* fp) const;
  172. // Reads from the given file. Returns false in case of error.
  173. bool DeSerialize(TFile* fp);
  174. // Returns a STRING containing a text file that describes the encoding thus:
  175. // <index>[,<index>]*<tab><UTF8-str><newline>
  176. // In words, a comma-separated list of one or more indices, followed by a tab
  177. // and the UTF-8 string that the code represents per line. Most simple scripts
  178. // will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
  179. // and the Indic scripts will contain a many-to-many mapping.
  180. // See the class comment above for details.
  181. STRING GetEncodingAsString(const UNICHARSET& unicharset) const;
  182. // Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.
  183. // Note that the returned values are 0-based indices, NOT unicode Jamo.
  184. // Returns false if the input is not in the Hangul unicode range.
  185. static bool DecomposeHangul(int unicode, int* leading, int* vowel,
  186. int* trailing);
  187. private:
  188. // Renumbers codes to eliminate unused values.
  189. void DefragmentCodeValues(int encoded_null);
  190. // Computes the value of code_range_ from the encoder_.
  191. void ComputeCodeRange();
  192. // Initializes the decoding hash_map from the encoder_ array.
  193. void SetupDecoder();
  194. // Frees allocated memory.
  195. void Cleanup();
  196. // The encoder that maps a unichar-id to a sequence of small codes.
  197. // encoder_ is the only part that is serialized. The rest is computed on load.
  198. GenericVector<RecodedCharID> encoder_;
  199. // Decoder converts the output of encoder back to a unichar-id.
  200. std::unordered_map<RecodedCharID, int, RecodedCharID::RecodedCharIDHash>
  201. decoder_;
  202. // True if the index is a valid single or start code.
  203. GenericVector<bool> is_valid_start_;
  204. // Maps a prefix code to a list of valid next codes.
  205. // The map owns the vectors.
  206. std::unordered_map<RecodedCharID, GenericVectorEqEq<int>*,
  207. RecodedCharID::RecodedCharIDHash>
  208. next_codes_;
  209. // Maps a prefix code to a list of valid final codes.
  210. // The map owns the vectors.
  211. std::unordered_map<RecodedCharID, GenericVectorEqEq<int>*,
  212. RecodedCharID::RecodedCharIDHash>
  213. final_codes_;
  214. // Max of any value in encoder_ + 1.
  215. int code_range_;
  216. };
  217. } // namespace tesseract.
  218. #endif // TESSERACT_CCUTIL_UNICHARCOMPRESS_H_