generic_codecvt.hpp 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470
  1. //
  2. // Copyright (c) 2015 Artyom Beilis (Tonkikh)
  3. // Copyright (c) 2021-2023 Alexander Grund
  4. //
  5. // Distributed under the Boost Software License, Version 1.0.
  6. // https://www.boost.org/LICENSE_1_0.txt
  7. #ifndef BOOST_LOCALE_GENERIC_CODECVT_HPP
  8. #define BOOST_LOCALE_GENERIC_CODECVT_HPP
  9. #include <boost/locale/utf.hpp>
  10. #include <cstdint>
  11. #include <locale>
  12. namespace boost { namespace locale {
  13. static_assert(sizeof(std::mbstate_t) >= 2, "std::mbstate_t is to small to store an UTF-16 codepoint");
  14. namespace detail {
  15. // Avoid including cstring for std::memcpy
  16. inline void copy_uint16_t(void* dst, const void* src)
  17. {
  18. unsigned char* cdst = static_cast<unsigned char*>(dst);
  19. const unsigned char* csrc = static_cast<const unsigned char*>(src);
  20. cdst[0] = csrc[0];
  21. cdst[1] = csrc[1];
  22. }
  23. inline uint16_t read_state(const std::mbstate_t& src)
  24. {
  25. uint16_t dst;
  26. copy_uint16_t(&dst, &src);
  27. return dst;
  28. }
  29. inline void write_state(std::mbstate_t& dst, const uint16_t src)
  30. {
  31. copy_uint16_t(&dst, &src);
  32. }
  33. } // namespace detail
  34. /// \brief A base class that used to define constants for generic_codecvt
  35. class generic_codecvt_base {
  36. public:
  37. /// Initial state for converting to or from Unicode code points, used by initial_state in derived classes
  38. enum initial_convertion_state {
  39. to_unicode_state, ///< The state would be used by to_unicode functions
  40. from_unicode_state ///< The state would be used by from_unicode functions
  41. };
  42. };
  43. /// \brief Generic codecvt facet for various stateless encodings to UTF-16 and UTF-32 using wchar_t, char32_t
  44. /// and char16_t
  45. ///
  46. /// Implementations should derive from this class defining itself as CodecvtImpl and provide following members
  47. ///
  48. /// - `state_type` - a type of special object that allows to store intermediate cached data, for example `iconv_t`
  49. /// descriptor
  50. /// - `state_type initial_state(generic_codecvt_base::initial_convertion_state direction) const` - member function
  51. /// that creates initial state
  52. /// - `int max_encoding_length() const` - a maximal length that one Unicode code point is represented, for UTF-8 for
  53. /// example it is 4 from ISO-8859-1 it is 1
    /// - `utf::code_point to_unicode(state_type& state, const char*& begin, const char* end)` - extract first code
    /// point from the text in range [begin,end), in case of success begin would point to the next character sequence to
    /// be encoded to next code point, in case of incomplete sequence - utf::incomplete shall be returned, and in case
    /// of invalid input sequence utf::illegal shall be returned and begin would remain unmodified
  58. /// - `utf::len_or_error from_unicode(state_type &state, utf::code_point u, char* begin, const char* end)` - convert
  59. /// a Unicode code point `u` into a character sequence at [begin,end). Return the length of the sequence in case of
  60. /// success, utf::incomplete in case of not enough room to encode the code point, or utf::illegal in case conversion
  61. /// can not be performed
  62. ///
  63. ///
  64. /// For example implementation of codecvt for latin1/ISO-8859-1 character set
  65. ///
  66. /// \code
  67. ///
  68. /// template<typename CharType>
    /// class latin1_codecvt: public boost::locale::generic_codecvt<CharType,latin1_codecvt<CharType> >
  70. /// {
  71. /// public:
  72. ///
  73. /// /* Standard codecvt constructor */
  74. /// latin1_codecvt(size_t refs = 0): boost::locale::generic_codecvt<CharType,latin1_codecvt<CharType> >(refs)
  75. /// {
  76. /// }
  77. ///
  78. /// /* State is unused but required by generic_codecvt */
  79. /// struct state_type {};
  80. ///
  81. /// state_type initial_state(generic_codecvt_base::initial_convertion_state /*unused*/) const
  82. /// {
  83. /// return state_type();
  84. /// }
  85. ///
  86. /// int max_encoding_length() const
  87. /// {
  88. /// return 1;
  89. /// }
  90. ///
  91. /// boost::locale::utf::code_point to_unicode(state_type&, const char*& begin, const char* end) const
  92. /// {
  93. /// if(begin == end)
  94. /// return boost::locale::utf::incomplete;
  95. /// return *begin++;
  96. /// }
  97. ///
  98. /// boost::locale::utf::len_or_error from_unicode(state_type&, boost::locale::utf::code_point u,
  99. /// char* begin, const char* end) const
  100. /// {
  101. /// if(u >= 256)
  102. /// return boost::locale::utf::illegal;
  103. /// if(begin == end)
  104. /// return boost::locale::utf::incomplete;
  105. /// *begin = u;
  106. /// return 1;
  107. /// }
  108. /// };
  109. ///
  110. /// \endcode
  111. ///
  112. /// When external tools used for encoding conversion, the `state_type` is useful to save objects used for
  113. /// conversions. For example, icu::UConverter can be saved in such a state for an efficient use:
  114. ///
  115. /// \code
    /// template<typename CharType>
    /// class icu_codecvt: public boost::locale::generic_codecvt<CharType,icu_codecvt<CharType>>
    /// {
    /// public:
    ///
    /// /* Standard codecvt constructor */
    /// icu_codecvt(std::string const &name, size_t refs = 0):
    /// boost::locale::generic_codecvt<CharType,icu_codecvt<CharType>>(refs)
    /// { ... }
  125. ///
  126. /// using state_type = std::unique_ptr<UConverter,void (*)(UConverter*)>;
  127. ///
  128. /// state_type initial_state(generic_codecvt_base::initial_convertion_state /*unused*/) const
  129. /// {
  130. /// UErrorCode err = U_ZERO_ERROR;
  131. /// return state_type(ucnv_safeClone(converter_,0,0,&err),ucnv_close);
  132. /// }
  133. ///
  134. /// boost::locale::utf::code_point to_unicode(state_type &ptr,char const *&begin,char const *end) const
  135. /// {
  136. /// UErrorCode err = U_ZERO_ERROR;
  137. /// boost::locale::utf::code_point cp = ucnv_getNextUChar(ptr.get(),&begin,end,&err);
  138. /// ...
  139. /// }
  140. /// ...
  141. /// };
  142. /// \endcode
  143. ///
    // Primary template, declared but not defined here. CharSize defaults to sizeof(CharType) and picks one of
    // the partial specializations in this header: 2 (UTF-16), 4 (UTF-32) or, for CharType == char, 1.
    template<typename CharType, typename CodecvtImpl, int CharSize = sizeof(CharType)>
    class generic_codecvt;
  146. /// \brief UTF-16 to/from narrow char codecvt facet to use with char16_t or wchar_t on Windows
  147. ///
  148. /// Note in order to fit the requirements of usability by std::wfstream it uses mbstate_t
  149. /// to handle intermediate states in handling of variable length UTF-16 sequences
  150. ///
  151. /// Its member functions implement standard virtual functions of basic codecvt
  152. template<typename CharType, typename CodecvtImpl>
  153. class generic_codecvt<CharType, CodecvtImpl, 2> : public std::codecvt<CharType, char, std::mbstate_t>,
  154. public generic_codecvt_base {
  155. public:
  156. typedef CharType uchar;
  157. generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
  158. const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
  159. protected:
  160. std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
  161. {
  162. if(*reinterpret_cast<char*>(&s) != 0)
  163. return std::codecvt_base::error;
  164. next = from;
  165. return std::codecvt_base::ok;
  166. }
  167. int do_encoding() const noexcept override { return 0; }
  168. int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
  169. bool do_always_noconv() const noexcept override { return false; }
  170. int do_length(std::mbstate_t& std_state, const char* from, const char* from_end, size_t max) const override
  171. {
  172. bool state = *reinterpret_cast<char*>(&std_state) != 0;
  173. const char* save_from = from;
  174. auto cvt_state = implementation().initial_state(to_unicode_state);
  175. while(max > 0 && from < from_end) {
  176. const char* prev_from = from;
  177. const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
  178. if(ch == boost::locale::utf::incomplete || ch == boost::locale::utf::illegal) {
  179. from = prev_from;
  180. break;
  181. }
  182. max--;
  183. if(ch > 0xFFFF) {
  184. if(!state)
  185. from = prev_from;
  186. state = !state;
  187. }
  188. }
  189. *reinterpret_cast<char*>(&std_state) = state;
  190. return static_cast<int>(from - save_from);
  191. }
  192. std::codecvt_base::result do_in(std::mbstate_t& std_state,
  193. const char* from,
  194. const char* from_end,
  195. const char*& from_next,
  196. uchar* to,
  197. uchar* to_end,
  198. uchar*& to_next) const override
  199. {
  200. std::codecvt_base::result r = std::codecvt_base::ok;
  201. // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
  202. // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
  203. //
  204. // If true then only the high surrogate of a codepoint > 0xFFFF was written, but no input consumed.
  205. bool low_surrogate_pending = *reinterpret_cast<char*>(&std_state) != 0;
  206. auto cvt_state = implementation().initial_state(to_unicode_state);
  207. while(to < to_end && from < from_end) {
  208. const char* from_saved = from;
  209. utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
  210. if(ch == boost::locale::utf::illegal) {
  211. from = from_saved;
  212. r = std::codecvt_base::error;
  213. break;
  214. }
  215. if(ch == boost::locale::utf::incomplete) {
  216. from = from_saved;
  217. r = std::codecvt_base::partial;
  218. break;
  219. }
  220. // Normal codepoints go directly to stream
  221. if(ch <= 0xFFFF)
  222. *to++ = static_cast<uchar>(ch);
  223. else {
  224. // For other codepoints we can't consume our input as we may find ourselves in a state
  225. // where all input is consumed but not all output written, i.e. only the high surrogate is written.
  226. //
  227. // So we write only the high surrogate and mark this in the state.
  228. // We also set the from pointer to the previous position, i.e. don't consume the input, so this
  229. // codepoint will be read again and then we will consume our input together with writing the low
  230. // surrogate.
  231. ch -= 0x10000;
  232. const std::uint16_t w1 = static_cast<std::uint16_t>(0xD800 | (ch >> 10));
  233. const std::uint16_t w2 = static_cast<std::uint16_t>(0xDC00 | (ch & 0x3FF));
  234. if(!low_surrogate_pending) {
  235. from = from_saved;
  236. *to++ = w1;
  237. } else
  238. *to++ = w2;
  239. low_surrogate_pending = !low_surrogate_pending;
  240. }
  241. }
  242. from_next = from;
  243. to_next = to;
  244. if(r == std::codecvt_base::ok && (from != from_end || low_surrogate_pending))
  245. r = std::codecvt_base::partial;
  246. *reinterpret_cast<char*>(&std_state) = low_surrogate_pending;
  247. return r;
  248. }
  249. std::codecvt_base::result do_out(std::mbstate_t& std_state,
  250. const uchar* from,
  251. const uchar* from_end,
  252. const uchar*& from_next,
  253. char* to,
  254. char* to_end,
  255. char*& to_next) const override
  256. {
  257. std::codecvt_base::result r = std::codecvt_base::ok;
  258. // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
  259. // according to standard. We assume that sizeof(mbstate_t) >=2 in order
  260. // to be able to store first observed surrogate pair
  261. //
  262. // State: state!=0 - a first surrogate pair was observed (state = first pair),
  263. // we expect the second one to come and then zero the state
  264. std::uint16_t state = detail::read_state(std_state);
  265. auto cvt_state = implementation().initial_state(from_unicode_state);
  266. while(to < to_end && from < from_end) {
  267. utf::code_point ch = 0;
  268. if(state != 0) {
  269. // if the state indicates that 1st surrogate pair was written
  270. // we should make sure that the second one that comes is actually
  271. // second surrogate
  272. std::uint16_t w1 = state;
  273. std::uint16_t w2 = *from;
  274. // we don't forward from as writing may fail to incomplete or
  275. // partial conversion
  276. if(0xDC00 <= w2 && w2 <= 0xDFFF) {
  277. std::uint16_t vh = w1 - 0xD800;
  278. std::uint16_t vl = w2 - 0xDC00;
  279. ch = ((uint32_t(vh) << 10) | vl) + 0x10000;
  280. } else {
  281. // Invalid surrogate
  282. r = std::codecvt_base::error;
  283. break;
  284. }
  285. } else {
  286. ch = *from;
  287. if(0xD800 <= ch && ch <= 0xDBFF) {
  288. // if this is a first surrogate pair we put
  289. // it into the state and consume it, note we don't
  290. // go forward as it should be illegal so we increase
  291. // the from pointer manually
  292. state = static_cast<uint16_t>(ch);
  293. from++;
  294. continue;
  295. } else if(0xDC00 <= ch && ch <= 0xDFFF) {
  296. // if we observe second surrogate pair and
  297. // first only may be expected we should break from the loop with error
  298. // as it is illegal input
  299. r = std::codecvt_base::error;
  300. break;
  301. }
  302. }
  303. if(!boost::locale::utf::is_valid_codepoint(ch)) {
  304. r = std::codecvt_base::error;
  305. break;
  306. }
  307. const utf::code_point len = implementation().from_unicode(cvt_state, ch, to, to_end);
  308. if(len == boost::locale::utf::incomplete) {
  309. r = std::codecvt_base::partial;
  310. break;
  311. } else if(len == boost::locale::utf::illegal) {
  312. r = std::codecvt_base::error;
  313. break;
  314. } else
  315. to += len;
  316. state = 0;
  317. from++;
  318. }
  319. from_next = from;
  320. to_next = to;
  321. if(r == std::codecvt_base::ok && (from != from_end || state != 0))
  322. r = std::codecvt_base::partial;
  323. detail::write_state(std_state, state);
  324. return r;
  325. }
  326. };
  327. /// \brief UTF-32 to/from narrow char codecvt facet to use with char32_t or wchar_t on POSIX platforms
  328. ///
  329. /// Its member functions implement standard virtual functions of basic codecvt.
  330. /// mbstate_t is not used for UTF-32 handling due to fixed length encoding
  331. template<typename CharType, typename CodecvtImpl>
  332. class generic_codecvt<CharType, CodecvtImpl, 4> : public std::codecvt<CharType, char, std::mbstate_t>,
  333. public generic_codecvt_base {
  334. public:
  335. typedef CharType uchar;
  336. generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
  337. const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
  338. protected:
  339. std::codecvt_base::result
  340. do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
  341. {
  342. next = from;
  343. return std::codecvt_base::ok;
  344. }
  345. int do_encoding() const noexcept override { return 0; }
  346. int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
  347. bool do_always_noconv() const noexcept override { return false; }
  348. int do_length(std::mbstate_t& /*state*/, const char* from, const char* from_end, size_t max) const override
  349. {
  350. const char* start_from = from;
  351. auto cvt_state = implementation().initial_state(to_unicode_state);
  352. while(max > 0 && from < from_end) {
  353. const char* save_from = from;
  354. const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
  355. if(ch == boost::locale::utf::incomplete || ch == boost::locale::utf::illegal) {
  356. from = save_from;
  357. break;
  358. }
  359. max--;
  360. }
  361. return static_cast<int>(from - start_from);
  362. }
  363. std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
  364. const char* from,
  365. const char* from_end,
  366. const char*& from_next,
  367. uchar* to,
  368. uchar* to_end,
  369. uchar*& to_next) const override
  370. {
  371. std::codecvt_base::result r = std::codecvt_base::ok;
  372. auto cvt_state = implementation().initial_state(to_unicode_state);
  373. while(to < to_end && from < from_end) {
  374. const char* from_saved = from;
  375. const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
  376. if(ch == boost::locale::utf::illegal) {
  377. r = std::codecvt_base::error;
  378. from = from_saved;
  379. break;
  380. }
  381. if(ch == boost::locale::utf::incomplete) {
  382. r = std::codecvt_base::partial;
  383. from = from_saved;
  384. break;
  385. }
  386. *to++ = ch;
  387. }
  388. from_next = from;
  389. to_next = to;
  390. if(r == std::codecvt_base::ok && from != from_end)
  391. r = std::codecvt_base::partial;
  392. return r;
  393. }
  394. std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
  395. const uchar* from,
  396. const uchar* from_end,
  397. const uchar*& from_next,
  398. char* to,
  399. char* to_end,
  400. char*& to_next) const override
  401. {
  402. std::codecvt_base::result r = std::codecvt_base::ok;
  403. auto cvt_state = implementation().initial_state(from_unicode_state);
  404. while(to < to_end && from < from_end) {
  405. const std::uint32_t ch = *from;
  406. if(!boost::locale::utf::is_valid_codepoint(ch)) {
  407. r = std::codecvt_base::error;
  408. break;
  409. }
  410. const utf::code_point len = implementation().from_unicode(cvt_state, ch, to, to_end);
  411. if(len == boost::locale::utf::incomplete) {
  412. r = std::codecvt_base::partial;
  413. break;
  414. } else if(len == boost::locale::utf::illegal) {
  415. r = std::codecvt_base::error;
  416. break;
  417. }
  418. to += len;
  419. from++;
  420. }
  421. from_next = from;
  422. to_next = to;
  423. if(r == std::codecvt_base::ok && from != from_end)
  424. r = std::codecvt_base::partial;
  425. return r;
  426. }
  427. };
  428. template<typename CodecvtImpl>
  429. class generic_codecvt<char, CodecvtImpl, 1> : public std::codecvt<char, char, std::mbstate_t>,
  430. public generic_codecvt_base {
  431. public:
  432. typedef char uchar;
  433. const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
  434. generic_codecvt(size_t refs = 0) : std::codecvt<char, char, std::mbstate_t>(refs) {}
  435. };
  436. }} // namespace boost::locale
  437. #endif