unichar.h 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. ///////////////////////////////////////////////////////////////////////
  2. // File: unichar.h
  3. // Description: Unicode character/ligature class.
  4. // Author: Ray Smith
  5. //
  6. // (C) Copyright 2006, Google Inc.
  7. // Licensed under the Apache License, Version 2.0 (the "License");
  8. // you may not use this file except in compliance with the License.
  9. // You may obtain a copy of the License at
  10. // http://www.apache.org/licenses/LICENSE-2.0
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS,
  13. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. // See the License for the specific language governing permissions and
  15. // limitations under the License.
  16. //
  17. ///////////////////////////////////////////////////////////////////////
  18. #ifndef TESSERACT_CCUTIL_UNICHAR_H_
  19. #define TESSERACT_CCUTIL_UNICHAR_H_
  20. #include <memory.h>
  21. #include <cstring>
  22. #include <string>
  23. #include <vector>
  24. #include "platform.h"
  // Size in bytes of the UTF-8 buffer held by a UNICHAR. Must be at least 4.
  // Must not exceed 31 without changing the coding of length.
  #define UNICHAR_LEN 30
  // Enforce the documented bounds at compile time so that a future edit of
  // UNICHAR_LEN cannot silently break the length coding.
  static_assert(UNICHAR_LEN >= 4 && UNICHAR_LEN <= 31,
                "UNICHAR_LEN must be in the range [4, 31]");

  // TODO(rays) Move these to the tesseract namespace.
  // A UNICHAR_ID is the unique id of a unichar.
  using UNICHAR_ID = int;

  // A value to indicate an invalid or uninitialized unichar id.
  static const UNICHAR_ID INVALID_UNICHAR_ID = -1;

  // A special unichar that corresponds to INVALID_UNICHAR_ID.
  static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__";
  // Direction classification of a piece of text; see the per-value comments.
  // The numeric values are spelled out explicitly and must not be reordered.
  enum StrongScriptDirection {
    DIR_NEUTRAL = 0,        // Text contains only neutral characters.
    DIR_LEFT_TO_RIGHT = 1,  // Text contains no Right-to-Left characters.
    DIR_RIGHT_TO_LEFT = 2,  // Text contains no Left-to-Right characters.
    DIR_MIX = 3,            // Text contains a mixture of left-to-right
                            // and right-to-left characters.
  };
  42. namespace tesseract {
  43. using char32 = signed int;
  44. // The UNICHAR class holds a single classification result. This may be
  45. // a single Unicode character (stored as between 1 and 4 utf8 bytes) or
  46. // multiple Unicode characters representing the NFKC expansion of a ligature
  47. // such as fi, ffl etc. These are also stored as utf8.
  48. class UNICHAR {
  49. public:
  50. UNICHAR() {
  51. memset(chars, 0, UNICHAR_LEN);
  52. }
  53. // Construct from a utf8 string. If len<0 then the string is null terminated.
  54. // If the string is too long to fit in the UNICHAR then it takes only what
  55. // will fit.
  56. UNICHAR(const char* utf8_str, int len);
  57. // Construct from a single UCS4 character.
  58. explicit UNICHAR(int unicode);
  59. // Default copy constructor and operator= are OK.
  60. // Get the first character as UCS-4.
  61. int first_uni() const;
  62. // Get the length of the UTF8 string.
  63. int utf8_len() const {
  64. int len = chars[UNICHAR_LEN - 1];
  65. return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
  66. }
  67. // Get a UTF8 string, but NOT nullptr terminated.
  68. const char* utf8() const {
  69. return chars;
  70. }
  71. // Get a terminated UTF8 string: Must delete[] it after use.
  72. char* utf8_str() const;
  73. // Get the number of bytes in the first character of the given utf8 string.
  74. static int utf8_step(const char* utf8_str);
  75. // A class to simplify iterating over and accessing elements of a UTF8
  76. // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or
  77. // take ownership of the underlying byte array. It also does not permit
  78. // modification of the array (as the name suggests).
  79. //
  80. // Example:
  81. // for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len);
  82. // it != UNICHAR::end(str, len);
  83. // ++it) {
  84. // tprintf("UCS-4 symbol code = %d\n", *it);
  85. // char buf[5];
  86. // int char_len = it.get_utf8(buf); buf[char_len] = '\0';
  87. // tprintf("Char = %s\n", buf);
  88. // }
  89. class const_iterator {
  90. using CI = const_iterator;
  91. public:
  92. // Step to the next UTF8 character.
  93. // If the current position is at an illegal UTF8 character, then print an
  94. // error message and step by one byte. If the current position is at a
  95. // nullptr value, don't step past it.
  96. const_iterator& operator++();
  97. // Return the UCS-4 value at the current position.
  98. // If the current position is at an illegal UTF8 value, return a single
  99. // space character.
  100. int operator*() const;
  101. // Store the UTF-8 encoding of the current codepoint into buf, which must be
  102. // at least 4 bytes long. Return the number of bytes written.
  103. // If the current position is at an illegal UTF8 value, writes a single
  104. // space character and returns 1.
  105. // Note that this method does not null-terminate the buffer.
  106. int get_utf8(char* buf) const;
  107. // Returns the number of bytes of the current codepoint. Returns 1 if the
  108. // current position is at an illegal UTF8 value.
  109. int utf8_len() const;
  110. // Returns true if the UTF-8 encoding at the current position is legal.
  111. bool is_legal() const;
  112. // Return the pointer into the string at the current position.
  113. const char* utf8_data() const {
  114. return it_;
  115. }
  116. // Iterator equality operators.
  117. friend bool operator==(const CI& lhs, const CI& rhs) {
  118. return lhs.it_ == rhs.it_;
  119. }
  120. friend bool operator!=(const CI& lhs, const CI& rhs) {
  121. return !(lhs == rhs);
  122. }
  123. private:
  124. friend class UNICHAR;
  125. explicit const_iterator(const char* it) : it_(it) {}
  126. const char* it_; // Pointer into the string.
  127. };
  128. // Create a start/end iterator pointing to a string. Note that these methods
  129. // are static and do NOT create a copy or take ownership of the underlying
  130. // array.
  131. static const_iterator begin(const char* utf8_str, int byte_length);
  132. static const_iterator end(const char* utf8_str, int byte_length);
  133. // Converts a utf-8 string to a vector of unicodes.
  134. // Returns an empty vector if the input contains invalid UTF-8.
  135. static std::vector<char32> UTF8ToUTF32(const char* utf8_str);
  136. // Converts a vector of unicodes to a utf8 string.
  137. // Returns an empty string if the input contains an invalid unicode.
  138. static std::string UTF32ToUTF8(const std::vector<char32>& str32);
  139. private:
  140. // A UTF-8 representation of 1 or more Unicode characters.
  141. // The last element (chars[UNICHAR_LEN - 1]) is a length if
  142. // its value < UNICHAR_LEN, otherwise it is a genuine character.
  143. char chars[UNICHAR_LEN]{};
  144. };
  145. } // namespace tesseract
  146. #endif // TESSERACT_CCUTIL_UNICHAR_H_