unicode_iterator.hpp 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865
  1. /*
  2. *
  3. * Copyright (c) 2004
  4. * John Maddock
  5. *
  6. * Use, modification and distribution are subject to the
  7. * Boost Software License, Version 1.0. (See accompanying file
  8. * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  9. *
  10. */
  11. /*
  12. * LOCATION: see http://www.boost.org for most recent version.
  13. * FILE unicode_iterator.hpp
  14. * VERSION see <boost/version.hpp>
  15. * DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
  16. */
  17. /****************************************************************************
  18. Contents:
  19. ~~~~~~~~~
  20. 1) Read Only, Input Adapters:
  21. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  22. template <class BaseIterator, class U8Type = std::uint8_t>
  23. class u32_to_u8_iterator;
  24. Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
  25. template <class BaseIterator, class U32Type = std::uint32_t>
  26. class u8_to_u32_iterator;
  27. Adapts sequence of UTF-8 code points to "look like" a sequence of UTF-32.
  28. template <class BaseIterator, class U16Type = std::uint16_t>
  29. class u32_to_u16_iterator;
  30. Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
  31. template <class BaseIterator, class U32Type = std::uint32_t>
  32. class u16_to_u32_iterator;
  33. Adapts sequence of UTF-16 code points to "look like" a sequence of UTF-32.
  34. 2) Single pass output iterator adapters:
  35. template <class BaseIterator>
  36. class utf8_output_iterator;
  37. Accepts UTF-32 code points and forwards them on as UTF-8 code points.
  38. template <class BaseIterator>
  39. class utf16_output_iterator;
  40. Accepts UTF-32 code points and forwards them on as UTF-16 code points.
  41. ****************************************************************************/
  42. #ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP
  43. #define BOOST_REGEX_UNICODE_ITERATOR_HPP
  44. #include <boost/regex/config.hpp>
  45. #ifndef BOOST_REGEX_AS_MODULE
  46. #include <cstdint>
  47. #include <stdexcept>
  48. #include <sstream>
  49. #include <ios>
  50. #include <limits.h> // CHAR_BIT
  51. #endif
  52. #ifndef BOOST_REGEX_STANDALONE
  53. #include <boost/throw_exception.hpp>
  54. #endif
  55. namespace boost{
  56. namespace detail{
  57. BOOST_REGEX_STATIC_CONST std::uint16_t high_surrogate_base = 0xD7C0u;
  58. BOOST_REGEX_STATIC_CONST std::uint16_t low_surrogate_base = 0xDC00u;
  59. BOOST_REGEX_STATIC_CONST std::uint32_t ten_bit_mask = 0x3FFu;
  // Returns true when v is in the range 0xD800-0xDBFF.
  inline bool is_high_surrogate(std::uint16_t v)
  {
     // every value in the range shares the same top six bits:
     return (v >> 10) == (0xD800u >> 10);
  }
  // Returns true when v is in the range 0xDC00-0xDFFF.
  inline bool is_low_surrogate(std::uint16_t v)
  {
     // every value in the range shares the same top six bits:
     return (v >> 10) == (0xDC00u >> 10);
  }
  // Returns true when v is in the range 0xD800-0xDFFF (either kind of surrogate).
  template <class T>
  inline bool is_surrogate(T v)
  {
     // clearing the low eleven bits leaves 0xD800 for every value in the range:
     const unsigned range_mask = 0xFFFFF800u;
     return 0xd800 == (v & range_mask);
  }
  // Returns the length (1-4) of the UTF-8 sequence implied by the byte c, taken
  // from the number of leading one bits in c. Bytes with no leading one bit or
  // exactly one (0x00-0xBF) give 1; bytes with more than four leading one bits
  // are clamped to 4.
  inline unsigned utf8_byte_count(std::uint8_t c)
  {
     // fewer than two leading one bits:
     if(((c & 0x80u) == 0) || ((c & 0x40u) == 0))
        return 1;
     // exactly two leading one bits:
     if((c & 0x20u) == 0)
        return 2;
     // exactly three leading one bits:
     if((c & 0x10u) == 0)
        return 3;
     // four or more leading one bits:
     return 4;
  }
  86. inline unsigned utf8_trailing_byte_count(std::uint8_t c)
  87. {
  88. return utf8_byte_count(c) - 1;
  89. }
  90. #ifdef BOOST_REGEX_MSVC
  91. #pragma warning(push)
  92. #pragma warning(disable:4100)
  93. #endif
  94. #ifndef BOOST_NO_EXCEPTIONS
  95. BOOST_REGEX_NORETURN
  96. #endif
  97. inline void invalid_utf32_code_point(std::uint32_t val)
  98. {
  99. std::stringstream ss;
  100. ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
  101. std::out_of_range e(ss.str());
  102. #ifndef BOOST_REGEX_STANDALONE
  103. boost::throw_exception(e);
  104. #else
  105. throw e;
  106. #endif
  107. }
  108. #ifdef BOOST_REGEX_MSVC
  109. #pragma warning(pop)
  110. #endif
  111. } // namespace detail
  112. template <class BaseIterator, class U16Type = std::uint16_t>
  113. class u32_to_u16_iterator
  114. {
  115. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  116. static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
  117. static_assert(sizeof(U16Type)*CHAR_BIT == 16, "Incorrectly sized template argument");
  118. public:
  119. typedef std::ptrdiff_t difference_type;
  120. typedef U16Type value_type;
  121. typedef value_type const* pointer;
  122. typedef value_type const reference;
  123. typedef std::bidirectional_iterator_tag iterator_category;
  124. reference operator*()const
  125. {
  126. if(m_current == 2)
  127. extract_current();
  128. return m_values[m_current];
  129. }
  130. bool operator==(const u32_to_u16_iterator& that)const
  131. {
  132. if(m_position == that.m_position)
  133. {
  134. // Both m_currents must be equal, or both even
  135. // this is the same as saying their sum must be even:
  136. return (m_current + that.m_current) & 1u ? false : true;
  137. }
  138. return false;
  139. }
  140. bool operator!=(const u32_to_u16_iterator& that)const
  141. {
  142. return !(*this == that);
  143. }
  144. u32_to_u16_iterator& operator++()
  145. {
  146. // if we have a pending read then read now, so that we know whether
  147. // to skip a position, or move to a low-surrogate:
  148. if(m_current == 2)
  149. {
  150. // pending read:
  151. extract_current();
  152. }
  153. // move to the next surrogate position:
  154. ++m_current;
  155. // if we've reached the end skip a position:
  156. if(m_values[m_current] == 0)
  157. {
  158. m_current = 2;
  159. ++m_position;
  160. }
  161. return *this;
  162. }
  163. u32_to_u16_iterator operator++(int)
  164. {
  165. u32_to_u16_iterator r(*this);
  166. ++(*this);
  167. return r;
  168. }
  169. u32_to_u16_iterator& operator--()
  170. {
  171. if(m_current != 1)
  172. {
  173. // decrementing an iterator always leads to a valid position:
  174. --m_position;
  175. extract_current();
  176. m_current = m_values[1] ? 1 : 0;
  177. }
  178. else
  179. {
  180. m_current = 0;
  181. }
  182. return *this;
  183. }
  184. u32_to_u16_iterator operator--(int)
  185. {
  186. u32_to_u16_iterator r(*this);
  187. --(*this);
  188. return r;
  189. }
  190. BaseIterator base()const
  191. {
  192. return m_position;
  193. }
  194. // construct:
  195. u32_to_u16_iterator() : m_position(), m_current(0)
  196. {
  197. m_values[0] = 0;
  198. m_values[1] = 0;
  199. m_values[2] = 0;
  200. }
  201. u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
  202. {
  203. m_values[0] = 0;
  204. m_values[1] = 0;
  205. m_values[2] = 0;
  206. }
  207. private:
  208. void extract_current()const
  209. {
  210. // begin by checking for a code point out of range:
  211. std::uint32_t v = *m_position;
  212. if(v >= 0x10000u)
  213. {
  214. if(v > 0x10FFFFu)
  215. detail::invalid_utf32_code_point(*m_position);
  216. // split into two surrogates:
  217. m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
  218. m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
  219. m_current = 0;
  220. BOOST_REGEX_ASSERT(detail::is_high_surrogate(m_values[0]));
  221. BOOST_REGEX_ASSERT(detail::is_low_surrogate(m_values[1]));
  222. }
  223. else
  224. {
  225. // 16-bit code point:
  226. m_values[0] = static_cast<U16Type>(*m_position);
  227. m_values[1] = 0;
  228. m_current = 0;
  229. // value must not be a surrogate:
  230. if(detail::is_surrogate(m_values[0]))
  231. detail::invalid_utf32_code_point(*m_position);
  232. }
  233. }
  234. BaseIterator m_position;
  235. mutable U16Type m_values[3];
  236. mutable unsigned m_current;
  237. };
  238. template <class BaseIterator, class U32Type = std::uint32_t>
  239. class u16_to_u32_iterator
  240. {
  241. // special values for pending iterator reads:
  242. static const U32Type pending_read = 0xffffffffu;
  243. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  244. static_assert(sizeof(base_value_type)*CHAR_BIT == 16, "Incorrectly sized template argument");
  245. static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
  246. public:
  247. typedef std::ptrdiff_t difference_type;
  248. typedef U32Type value_type;
  249. typedef value_type const* pointer;
  250. typedef value_type const reference;
  251. typedef std::bidirectional_iterator_tag iterator_category;
  252. reference operator*()const
  253. {
  254. if(m_value == pending_read)
  255. extract_current();
  256. return m_value;
  257. }
  258. bool operator==(const u16_to_u32_iterator& that)const
  259. {
  260. return m_position == that.m_position;
  261. }
  262. bool operator!=(const u16_to_u32_iterator& that)const
  263. {
  264. return !(*this == that);
  265. }
  266. u16_to_u32_iterator& operator++()
  267. {
  268. // skip high surrogate first if there is one:
  269. if(detail::is_high_surrogate(*m_position)) ++m_position;
  270. ++m_position;
  271. m_value = pending_read;
  272. return *this;
  273. }
  274. u16_to_u32_iterator operator++(int)
  275. {
  276. u16_to_u32_iterator r(*this);
  277. ++(*this);
  278. return r;
  279. }
  280. u16_to_u32_iterator& operator--()
  281. {
  282. --m_position;
  283. // if we have a low surrogate then go back one more:
  284. if(detail::is_low_surrogate(*m_position))
  285. --m_position;
  286. m_value = pending_read;
  287. return *this;
  288. }
  289. u16_to_u32_iterator operator--(int)
  290. {
  291. u16_to_u32_iterator r(*this);
  292. --(*this);
  293. return r;
  294. }
  295. BaseIterator base()const
  296. {
  297. return m_position;
  298. }
  299. // construct:
  300. u16_to_u32_iterator() : m_position()
  301. {
  302. m_value = pending_read;
  303. }
  304. u16_to_u32_iterator(BaseIterator b) : m_position(b)
  305. {
  306. m_value = pending_read;
  307. }
  308. //
  309. // Range checked version:
  310. //
  311. u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
  312. {
  313. m_value = pending_read;
  314. //
  315. // The range must not start with a low surrogate, or end in a high surrogate,
  316. // otherwise we run the risk of running outside the underlying input range.
  317. // Likewise b must not be located at a low surrogate.
  318. //
  319. std::uint16_t val;
  320. if(start != end)
  321. {
  322. if((b != start) && (b != end))
  323. {
  324. val = *b;
  325. if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
  326. invalid_code_point(val);
  327. }
  328. val = *start;
  329. if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
  330. invalid_code_point(val);
  331. val = *--end;
  332. if(detail::is_high_surrogate(val))
  333. invalid_code_point(val);
  334. }
  335. }
  336. private:
  337. static void invalid_code_point(std::uint16_t val)
  338. {
  339. std::stringstream ss;
  340. ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
  341. std::out_of_range e(ss.str());
  342. #ifndef BOOST_REGEX_STANDALONE
  343. boost::throw_exception(e);
  344. #else
  345. throw e;
  346. #endif
  347. }
  348. void extract_current()const
  349. {
  350. m_value = static_cast<U32Type>(static_cast< std::uint16_t>(*m_position));
  351. // if the last value is a high surrogate then adjust m_position and m_value as needed:
  352. if(detail::is_high_surrogate(*m_position))
  353. {
  354. // precondition; next value must have be a low-surrogate:
  355. BaseIterator next(m_position);
  356. std::uint16_t t = *++next;
  357. if((t & 0xFC00u) != 0xDC00u)
  358. invalid_code_point(t);
  359. m_value = (m_value - detail::high_surrogate_base) << 10;
  360. m_value |= (static_cast<U32Type>(static_cast< std::uint16_t>(t)) & detail::ten_bit_mask);
  361. }
  362. // postcondition; result must not be a surrogate:
  363. if(detail::is_surrogate(m_value))
  364. invalid_code_point(static_cast< std::uint16_t>(m_value));
  365. }
  366. BaseIterator m_position;
  367. mutable U32Type m_value;
  368. };
  369. template <class BaseIterator, class U8Type = std::uint8_t>
  370. class u32_to_u8_iterator
  371. {
  372. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  373. static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
  374. static_assert(sizeof(U8Type)*CHAR_BIT == 8, "Incorrectly sized template argument");
  375. public:
  376. typedef std::ptrdiff_t difference_type;
  377. typedef U8Type value_type;
  378. typedef value_type const* pointer;
  379. typedef value_type const reference;
  380. typedef std::bidirectional_iterator_tag iterator_category;
  381. reference operator*()const
  382. {
  383. if(m_current == 4)
  384. extract_current();
  385. return m_values[m_current];
  386. }
  387. bool operator==(const u32_to_u8_iterator& that)const
  388. {
  389. if(m_position == that.m_position)
  390. {
  391. // either the m_current's must be equal, or one must be 0 and
  392. // the other 4: which means neither must have bits 1 or 2 set:
  393. return (m_current == that.m_current)
  394. || (((m_current | that.m_current) & 3) == 0);
  395. }
  396. return false;
  397. }
  398. bool operator!=(const u32_to_u8_iterator& that)const
  399. {
  400. return !(*this == that);
  401. }
  402. u32_to_u8_iterator& operator++()
  403. {
  404. // if we have a pending read then read now, so that we know whether
  405. // to skip a position, or move to a low-surrogate:
  406. if(m_current == 4)
  407. {
  408. // pending read:
  409. extract_current();
  410. }
  411. // move to the next surrogate position:
  412. ++m_current;
  413. // if we've reached the end skip a position:
  414. if(m_values[m_current] == 0)
  415. {
  416. m_current = 4;
  417. ++m_position;
  418. }
  419. return *this;
  420. }
  421. u32_to_u8_iterator operator++(int)
  422. {
  423. u32_to_u8_iterator r(*this);
  424. ++(*this);
  425. return r;
  426. }
  427. u32_to_u8_iterator& operator--()
  428. {
  429. if((m_current & 3) == 0)
  430. {
  431. --m_position;
  432. extract_current();
  433. m_current = 3;
  434. while(m_current && (m_values[m_current] == 0))
  435. --m_current;
  436. }
  437. else
  438. --m_current;
  439. return *this;
  440. }
  441. u32_to_u8_iterator operator--(int)
  442. {
  443. u32_to_u8_iterator r(*this);
  444. --(*this);
  445. return r;
  446. }
  447. BaseIterator base()const
  448. {
  449. return m_position;
  450. }
  451. // construct:
  452. u32_to_u8_iterator() : m_position(), m_current(0)
  453. {
  454. m_values[0] = 0;
  455. m_values[1] = 0;
  456. m_values[2] = 0;
  457. m_values[3] = 0;
  458. m_values[4] = 0;
  459. }
  460. u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
  461. {
  462. m_values[0] = 0;
  463. m_values[1] = 0;
  464. m_values[2] = 0;
  465. m_values[3] = 0;
  466. m_values[4] = 0;
  467. }
  468. private:
  469. void extract_current()const
  470. {
  471. std::uint32_t c = *m_position;
  472. if(c > 0x10FFFFu)
  473. detail::invalid_utf32_code_point(c);
  474. if(c < 0x80u)
  475. {
  476. m_values[0] = static_cast<unsigned char>(c);
  477. m_values[1] = static_cast<unsigned char>(0u);
  478. m_values[2] = static_cast<unsigned char>(0u);
  479. m_values[3] = static_cast<unsigned char>(0u);
  480. }
  481. else if(c < 0x800u)
  482. {
  483. m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
  484. m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  485. m_values[2] = static_cast<unsigned char>(0u);
  486. m_values[3] = static_cast<unsigned char>(0u);
  487. }
  488. else if(c < 0x10000u)
  489. {
  490. m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
  491. m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  492. m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  493. m_values[3] = static_cast<unsigned char>(0u);
  494. }
  495. else
  496. {
  497. m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
  498. m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
  499. m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  500. m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  501. }
  502. m_current= 0;
  503. }
  504. BaseIterator m_position;
  505. mutable U8Type m_values[5];
  506. mutable unsigned m_current;
  507. };
  508. template <class BaseIterator, class U32Type = std::uint32_t>
  509. class u8_to_u32_iterator
  510. {
  511. // special values for pending iterator reads:
  512. static const U32Type pending_read = 0xffffffffu;
  513. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  514. static_assert(sizeof(base_value_type)*CHAR_BIT == 8, "Incorrectly sized template argument");
  515. static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
  516. public:
  517. typedef std::ptrdiff_t difference_type;
  518. typedef U32Type value_type;
  519. typedef value_type const* pointer;
  520. typedef value_type const reference;
  521. typedef std::bidirectional_iterator_tag iterator_category;
  522. reference operator*()const
  523. {
  524. if(m_value == pending_read)
  525. extract_current();
  526. return m_value;
  527. }
  528. bool operator==(const u8_to_u32_iterator& that)const
  529. {
  530. return m_position == that.m_position;
  531. }
  532. bool operator!=(const u8_to_u32_iterator& that)const
  533. {
  534. return !(*this == that);
  535. }
  536. u8_to_u32_iterator& operator++()
  537. {
  538. // We must not start with a continuation character:
  539. if((static_cast<std::uint8_t>(*m_position) & 0xC0) == 0x80)
  540. invalid_sequence();
  541. // skip high surrogate first if there is one:
  542. unsigned c = detail::utf8_byte_count(*m_position);
  543. if(m_value == pending_read)
  544. {
  545. // Since we haven't read in a value, we need to validate the code points:
  546. for(unsigned i = 0; i < c; ++i)
  547. {
  548. ++m_position;
  549. // We must have a continuation byte:
  550. if((i != c - 1) && ((static_cast<std::uint8_t>(*m_position) & 0xC0) != 0x80))
  551. invalid_sequence();
  552. }
  553. }
  554. else
  555. {
  556. std::advance(m_position, c);
  557. }
  558. m_value = pending_read;
  559. return *this;
  560. }
  561. u8_to_u32_iterator operator++(int)
  562. {
  563. u8_to_u32_iterator r(*this);
  564. ++(*this);
  565. return r;
  566. }
  567. u8_to_u32_iterator& operator--()
  568. {
  569. // Keep backtracking until we don't have a trailing character:
  570. unsigned count = 0;
  571. while((*--m_position & 0xC0u) == 0x80u) ++count;
  572. // now check that the sequence was valid:
  573. if(count != detail::utf8_trailing_byte_count(*m_position))
  574. invalid_sequence();
  575. m_value = pending_read;
  576. return *this;
  577. }
  578. u8_to_u32_iterator operator--(int)
  579. {
  580. u8_to_u32_iterator r(*this);
  581. --(*this);
  582. return r;
  583. }
  584. BaseIterator base()const
  585. {
  586. return m_position;
  587. }
  588. // construct:
  589. u8_to_u32_iterator() : m_position()
  590. {
  591. m_value = pending_read;
  592. }
  593. u8_to_u32_iterator(BaseIterator b) : m_position(b)
  594. {
  595. m_value = pending_read;
  596. }
  597. //
  598. // Checked constructor:
  599. //
  600. u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
  601. {
  602. m_value = pending_read;
  603. //
  604. // We must not start with a continuation character, or end with a
  605. // truncated UTF-8 sequence otherwise we run the risk of going past
  606. // the start/end of the underlying sequence:
  607. //
  608. if(start != end)
  609. {
  610. unsigned char v = *start;
  611. if((v & 0xC0u) == 0x80u)
  612. invalid_sequence();
  613. if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u))
  614. invalid_sequence();
  615. BaseIterator pos = end;
  616. do
  617. {
  618. v = *--pos;
  619. }
  620. while((start != pos) && ((v & 0xC0u) == 0x80u));
  621. std::ptrdiff_t extra = detail::utf8_byte_count(v);
  622. if(std::distance(pos, end) < extra)
  623. invalid_sequence();
  624. }
  625. }
  626. private:
  627. static void invalid_sequence()
  628. {
  629. std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
  630. #ifndef BOOST_REGEX_STANDALONE
  631. boost::throw_exception(e);
  632. #else
  633. throw e;
  634. #endif
  635. }
  636. void extract_current()const
  637. {
  638. m_value = static_cast<U32Type>(static_cast< std::uint8_t>(*m_position));
  639. // we must not have a continuation character:
  640. if((m_value & 0xC0u) == 0x80u)
  641. invalid_sequence();
  642. // see how many extra bytes we have:
  643. unsigned extra = detail::utf8_trailing_byte_count(*m_position);
  644. // extract the extra bits, 6 from each extra byte:
  645. BaseIterator next(m_position);
  646. for(unsigned c = 0; c < extra; ++c)
  647. {
  648. ++next;
  649. m_value <<= 6;
  650. // We must have a continuation byte:
  651. if((static_cast<std::uint8_t>(*next) & 0xC0) != 0x80)
  652. invalid_sequence();
  653. m_value += static_cast<std::uint8_t>(*next) & 0x3Fu;
  654. }
  655. // we now need to remove a few of the leftmost bits, but how many depends
  656. // upon how many extra bytes we've extracted:
  657. static const std::uint32_t masks[4] =
  658. {
  659. 0x7Fu,
  660. 0x7FFu,
  661. 0xFFFFu,
  662. 0x1FFFFFu,
  663. };
  664. m_value &= masks[extra];
  665. // check the result is in range:
  666. if(m_value > static_cast<U32Type>(0x10FFFFu))
  667. invalid_sequence();
  668. // The result must not be a surrogate:
  669. if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
  670. invalid_sequence();
  671. // We should not have had an invalidly encoded UTF8 sequence:
  672. if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
  673. invalid_sequence();
  674. }
  675. BaseIterator m_position;
  676. mutable U32Type m_value;
  677. };
  678. template <class BaseIterator>
  679. class utf16_output_iterator
  680. {
  681. public:
  682. typedef void difference_type;
  683. typedef void value_type;
  684. typedef std::uint32_t* pointer;
  685. typedef std::uint32_t& reference;
  686. typedef std::output_iterator_tag iterator_category;
  687. utf16_output_iterator(const BaseIterator& b)
  688. : m_position(b){}
  689. utf16_output_iterator(const utf16_output_iterator& that)
  690. : m_position(that.m_position){}
  691. utf16_output_iterator& operator=(const utf16_output_iterator& that)
  692. {
  693. m_position = that.m_position;
  694. return *this;
  695. }
  696. const utf16_output_iterator& operator*()const
  697. {
  698. return *this;
  699. }
  700. void operator=(std::uint32_t val)const
  701. {
  702. push(val);
  703. }
  704. utf16_output_iterator& operator++()
  705. {
  706. return *this;
  707. }
  708. utf16_output_iterator& operator++(int)
  709. {
  710. return *this;
  711. }
  712. BaseIterator base()const
  713. {
  714. return m_position;
  715. }
  716. private:
  717. void push(std::uint32_t v)const
  718. {
  719. if(v >= 0x10000u)
  720. {
  721. // begin by checking for a code point out of range:
  722. if(v > 0x10FFFFu)
  723. detail::invalid_utf32_code_point(v);
  724. // split into two surrogates:
  725. *m_position++ = static_cast<std::uint16_t>(v >> 10) + detail::high_surrogate_base;
  726. *m_position++ = static_cast<std::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
  727. }
  728. else
  729. {
  730. // 16-bit code point:
  731. // value must not be a surrogate:
  732. if(detail::is_surrogate(v))
  733. detail::invalid_utf32_code_point(v);
  734. *m_position++ = static_cast<std::uint16_t>(v);
  735. }
  736. }
  737. mutable BaseIterator m_position;
  738. };
  739. template <class BaseIterator>
  740. class utf8_output_iterator
  741. {
  742. public:
  743. typedef void difference_type;
  744. typedef void value_type;
  745. typedef std::uint32_t* pointer;
  746. typedef std::uint32_t& reference;
  747. typedef std::output_iterator_tag iterator_category;
  748. utf8_output_iterator(const BaseIterator& b)
  749. : m_position(b){}
  750. utf8_output_iterator(const utf8_output_iterator& that)
  751. : m_position(that.m_position){}
  752. utf8_output_iterator& operator=(const utf8_output_iterator& that)
  753. {
  754. m_position = that.m_position;
  755. return *this;
  756. }
  757. const utf8_output_iterator& operator*()const
  758. {
  759. return *this;
  760. }
  761. void operator=(std::uint32_t val)const
  762. {
  763. push(val);
  764. }
  765. utf8_output_iterator& operator++()
  766. {
  767. return *this;
  768. }
  769. utf8_output_iterator& operator++(int)
  770. {
  771. return *this;
  772. }
  773. BaseIterator base()const
  774. {
  775. return m_position;
  776. }
  777. private:
  778. void push(std::uint32_t c)const
  779. {
  780. if(c > 0x10FFFFu)
  781. detail::invalid_utf32_code_point(c);
  782. if(c < 0x80u)
  783. {
  784. *m_position++ = static_cast<unsigned char>(c);
  785. }
  786. else if(c < 0x800u)
  787. {
  788. *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
  789. *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  790. }
  791. else if(c < 0x10000u)
  792. {
  793. *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
  794. *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  795. *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  796. }
  797. else
  798. {
  799. *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
  800. *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
  801. *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  802. *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  803. }
  804. }
  805. mutable BaseIterator m_position;
  806. };
  807. } // namespace boost
  808. #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP