/**********************************************************************
 * File:        strngs.h  (Formerly strings.h)
 * Description: STRING class definition.
 * Author:      Ray Smith
 *
 * (C) Copyright 1991, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
  18. #ifndef STRNGS_H
  19. #define STRNGS_H
  20. #include <cassert> // for assert
  21. #include <cstdint> // for uint32_t
  22. #include <cstdio> // for FILE
  23. #include <cstring> // for strncpy
  24. #include "platform.h" // for TESS_API
  25. namespace tesseract {
  26. class TFile;
  27. } // namespace tesseract.
  28. // STRING_IS_PROTECTED means that string[index] = X is invalid
  29. // because you have to go through strings interface to modify it.
  30. // This allows the string to ensure internal integrity and maintain
  31. // its own string length. Unfortunately this is not possible because
  32. // STRINGS are used as direct-manipulation data buffers for things
  33. // like length arrays and many places cast away the const on string()
  34. // to mutate the string. Turning this off means that internally we
  35. // cannot assume we know the strlen.
  36. #define STRING_IS_PROTECTED 0
  37. template <typename T>
  38. class GenericVector;
  39. class TESS_API STRING {
  40. public:
  41. STRING();
  42. STRING(const STRING& string);
  43. STRING(const char* string);
  44. STRING(const char* data, int length);
  45. ~STRING();
  46. // Writes to the given file. Returns false in case of error.
  47. bool Serialize(FILE* fp) const;
  48. // Reads from the given file. Returns false in case of error.
  49. // If swap is true, assumes a big/little-endian swap is needed.
  50. bool DeSerialize(bool swap, FILE* fp);
  51. // Writes to the given file. Returns false in case of error.
  52. bool Serialize(tesseract::TFile* fp) const;
  53. // Reads from the given file. Returns false in case of error.
  54. // If swap is true, assumes a big/little-endian swap is needed.
  55. bool DeSerialize(tesseract::TFile* fp);
  56. // As DeSerialize, but only seeks past the data - hence a static method.
  57. static bool SkipDeSerialize(tesseract::TFile* fp);
  58. bool contains(char c) const;
  59. int32_t length() const;
  60. int32_t size() const {
  61. return length();
  62. }
  63. // Workaround to avoid g++ -Wsign-compare warnings.
  64. uint32_t unsigned_size() const {
  65. const int32_t len = length();
  66. assert(0 <= len);
  67. return static_cast<uint32_t>(len);
  68. }
  69. const char* string() const;
  70. const char* c_str() const;
  71. inline char* strdup() const {
  72. int32_t len = length() + 1;
  73. return strncpy(new char[len], GetCStr(), len);
  74. }
  75. #if STRING_IS_PROTECTED
  76. const char& operator[](int32_t index) const;
  77. // len is number of chars in s to insert starting at index in this string
  78. void insert_range(int32_t index, const char* s, int len);
  79. void erase_range(int32_t index, int len);
  80. #else
  81. char& operator[](int32_t index) const;
  82. #endif
  83. void split(char c, GenericVector<STRING>* splited);
  84. void truncate_at(int32_t index);
  85. bool operator==(const STRING& string) const;
  86. bool operator!=(const STRING& string) const;
  87. bool operator!=(const char* string) const;
  88. STRING& operator=(const char* string);
  89. STRING& operator=(const STRING& string);
  90. STRING operator+(const STRING& string) const;
  91. STRING operator+(char ch) const;
  92. STRING& operator+=(const char* string);
  93. STRING& operator+=(const STRING& string);
  94. STRING& operator+=(char ch);
  95. // Assignment for strings which are not null-terminated.
  96. void assign(const char* cstr, int len);
  97. // Appends the given string and int (as a %d) to this.
  98. // += cannot be used for ints as there as a char += operator that would
  99. // be ambiguous, and ints usually need a string before or between them
  100. // anyway.
  101. void add_str_int(const char* str, int number);
  102. // Appends the given string and double (as a %.8g) to this.
  103. void add_str_double(const char* str, double number);
  104. // ensure capacity but keep pointer encapsulated
  105. inline void ensure(int32_t min_capacity) {
  106. ensure_cstr(min_capacity);
  107. }
  108. private:
  109. typedef struct STRING_HEADER {
  110. // How much space was allocated in the string buffer for char data.
  111. int capacity_;
  112. // used_ is how much of the capacity is currently being used,
  113. // including a '\0' terminator.
  114. //
  115. // If used_ is 0 then string is nullptr (not even the '\0')
  116. // else if used_ > 0 then it is strlen() + 1 (because it includes '\0')
  117. // else strlen is >= 0 (not nullptr) but needs to be computed.
  118. // this condition is set when encapsulation is violated because
  119. // an API returned a mutable string.
  120. //
  121. // capacity_ - used_ = excess capacity that the string can grow
  122. // without reallocating
  123. mutable int used_;
  124. } STRING_HEADER;
  125. // To preserve the behavior of the old serialization, we only have space
  126. // for one pointer in this structure. So we are embedding a data structure
  127. // at the start of the storage that will hold additional state variables,
  128. // then storing the actual string contents immediately after.
  129. STRING_HEADER* data_;
  130. // returns the header part of the storage
  131. inline STRING_HEADER* GetHeader() {
  132. return data_;
  133. }
  134. inline const STRING_HEADER* GetHeader() const {
  135. return data_;
  136. }
  137. // returns the string data part of storage
  138. inline char* GetCStr() {
  139. return (reinterpret_cast<char*>(data_)) + sizeof(STRING_HEADER);
  140. }
  141. inline const char* GetCStr() const {
  142. return (reinterpret_cast<const char*>(data_)) + sizeof(STRING_HEADER);
  143. }
  144. inline bool InvariantOk() const {
  145. #if STRING_IS_PROTECTED
  146. return (GetHeader()->used_ == 0)
  147. ? (string() == nullptr)
  148. : (GetHeader()->used_ == (strlen(string()) + 1));
  149. #else
  150. return true;
  151. #endif
  152. }
  153. // Ensure string has requested capacity as optimization
  154. // to avoid unnecessary reallocations.
  155. // The return value is a cstr buffer with at least requested capacity
  156. char* ensure_cstr(int32_t min_capacity);
  157. void FixHeader() const; // make used_ non-negative, even if const
  158. char* AllocData(int used, int capacity);
  159. void DiscardData();
  160. };
  161. #endif