utf8_checker.hpp 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. //
  2. // Copyright (c) 2016-2017 Vinnie Falco (vinnie dot falco at gmail dot com)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. //
  7. // Official repository: https://github.com/boostorg/beast
  8. //
  9. #ifndef BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
  10. #define BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
  11. #include <boost/beast/core/type_traits.hpp>
  12. #include <boost/asio/buffer.hpp>
  13. #include <boost/assert.hpp>
  14. #include <algorithm>
  15. #include <cstdint>
  16. namespace boost {
  17. namespace beast {
  18. namespace websocket {
  19. namespace detail {
  /** An incremental UTF8 validator.

      Text may be supplied piecewise through `write`; a code point that is
      split across two calls is buffered inside the object until the rest
      of its bytes arrive. Once all text has been written, `finish` tells
      whether the text ended on a code point boundary.

      Objects are copyable. A copy carries its own snapshot of any
      buffered partial code point and never refers to the buffer of the
      object it was copied from.
  */
  template<class = void>
  class utf8_checker_t
  {
      std::size_t need_ = 0;      // bytes still missing from the buffered code point
      std::uint8_t* p_ = cp_;     // one past the last buffered byte, always inside our own cp_
      std::uint8_t cp_[4];        // bytes of a code point split across calls to write

      // Take over the state of `other`. Only the bytes actually buffered
      // are copied, and p_ is rebased onto this object's cp_ so it never
      // aliases (or dangles into) the source object.
      void
      copy_from(utf8_checker_t const& other) noexcept
      {
          auto const used = other.p_ - other.cp_;
          std::copy_n(other.cp_, used, cp_);
          need_ = other.need_;
          p_ = cp_ + used;
      }

  public:
      /// Construct a checker with no buffered text
      utf8_checker_t() = default;

      /// Construct an independent copy of another checker
      utf8_checker_t(utf8_checker_t const& other) noexcept
      {
          copy_from(other);
      }

      /// Replace this checker's state with an independent copy of another's
      utf8_checker_t&
      operator=(utf8_checker_t const& other) noexcept
      {
          if(this != &other)
              copy_from(other);
          return *this;
      }

      /** Prepare to process text as valid utf8

          Discards any buffered partial code point.
      */
      void
      reset();

      /** Check that all processed text is valid utf8

          @return `true` if no code point was left incomplete by the
          preceding writes. The checker is reset as a side effect.
      */
      bool
      finish();

      /** Check if text is valid UTF8

          @param in Pointer to the first byte of the text.

          @param size The number of bytes to check.

          @return `true` if the text is valid utf8 or false otherwise.
      */
      bool
      write(std::uint8_t const* in, std::size_t size);

      /** Check if text is valid UTF8

          @param bs A sequence of constant buffers holding the text; the
          buffers are checked in order.

          @return `true` if the text is valid utf8 or false otherwise.
      */
      template<class ConstBufferSequence>
      bool
      write(ConstBufferSequence const& bs);
  };
  52. template<class _>
  53. void
  54. utf8_checker_t<_>::
  55. reset()
  56. {
  57. need_ = 0;
  58. p_ = cp_;
  59. }
  60. template<class _>
  61. bool
  62. utf8_checker_t<_>::
  63. finish()
  64. {
  65. auto const success = need_ == 0;
  66. reset();
  67. return success;
  68. }
  69. template<class _>
  70. template<class ConstBufferSequence>
  71. bool
  72. utf8_checker_t<_>::
  73. write(ConstBufferSequence const& bs)
  74. {
  75. static_assert(boost::asio::is_const_buffer_sequence<ConstBufferSequence>::value,
  76. "ConstBufferSequence requirements not met");
  77. for(auto b : beast::detail::buffers_range(bs))
  78. if(! write(static_cast<
  79. std::uint8_t const*>(b.data()),
  80. b.size()))
  81. return false;
  82. return true;
  83. }
  84. template<class _>
  85. bool
  86. utf8_checker_t<_>::
  87. write(std::uint8_t const* in, std::size_t size)
  88. {
  89. auto const valid =
  90. [](std::uint8_t const*& p)
  91. {
  92. if(p[0] < 128)
  93. {
  94. ++p;
  95. return true;
  96. }
  97. if((p[0] & 0xe0) == 0xc0)
  98. {
  99. if( (p[1] & 0xc0) != 0x80 ||
  100. (p[0] & 0xfe) == 0xc0) // overlong
  101. return false;
  102. p += 2;
  103. return true;
  104. }
  105. if((p[0] & 0xf0) == 0xe0)
  106. {
  107. if( (p[1] & 0xc0) != 0x80
  108. || (p[2] & 0xc0) != 0x80
  109. || (p[0] == 0xe0 && (p[1] & 0xe0) == 0x80) // overlong
  110. || (p[0] == 0xed && (p[1] & 0xe0) == 0xa0) // surrogate
  111. //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF
  112. )
  113. return false;
  114. p += 3;
  115. return true;
  116. }
  117. if((p[0] & 0xf8) == 0xf0)
  118. {
  119. if( (p[1] & 0xc0) != 0x80
  120. || (p[2] & 0xc0) != 0x80
  121. || (p[3] & 0xc0) != 0x80
  122. || (p[0] == 0xf0 && (p[1] & 0xf0) == 0x80) // overlong
  123. || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4 // > U+10FFFF
  124. )
  125. return false;
  126. p += 4;
  127. return true;
  128. }
  129. return false;
  130. };
  131. auto const fail_fast =
  132. [&]()
  133. {
  134. auto const n = p_ - cp_;
  135. switch(n)
  136. {
  137. default:
  138. BOOST_ASSERT(false);
  139. BOOST_FALLTHROUGH;
  140. case 1:
  141. cp_[1] = 0x81;
  142. BOOST_FALLTHROUGH;
  143. case 2:
  144. cp_[2] = 0x81;
  145. BOOST_FALLTHROUGH;
  146. case 3:
  147. cp_[3] = 0x81;
  148. break;
  149. }
  150. std::uint8_t const* p = cp_;
  151. return ! valid(p);
  152. };
  153. auto const needed =
  154. [](std::uint8_t const v)
  155. {
  156. if(v < 128)
  157. return 1;
  158. if(v < 192)
  159. return 0;
  160. if(v < 224)
  161. return 2;
  162. if(v < 240)
  163. return 3;
  164. if(v < 248)
  165. return 4;
  166. return 0;
  167. };
  168. auto const end = in + size;
  169. // Finish up any incomplete code point
  170. if(need_ > 0)
  171. {
  172. // Calculate what we have
  173. auto n = (std::min)(size, need_);
  174. size -= n;
  175. need_ -= n;
  176. // Add characters to the code point
  177. while(n--)
  178. *p_++ = *in++;
  179. BOOST_ASSERT(p_ <= cp_ + 4);
  180. // Still incomplete?
  181. if(need_ > 0)
  182. {
  183. // Incomplete code point
  184. BOOST_ASSERT(in == end);
  185. // Do partial validation on the incomplete
  186. // code point, this is called "Fail fast"
  187. // in Autobahn|Testsuite parlance.
  188. return ! fail_fast();
  189. }
  190. // Complete code point, validate it
  191. std::uint8_t const* p = &cp_[0];
  192. if(! valid(p))
  193. return false;
  194. p_ = cp_;
  195. }
  196. if(size <= sizeof(std::size_t))
  197. goto slow;
  198. // Align `in` to sizeof(std::size_t) boundary
  199. {
  200. auto const in0 = in;
  201. auto last = reinterpret_cast<std::uint8_t const*>(
  202. ((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) /
  203. sizeof(std::size_t)) * sizeof(std::size_t));
  204. // Check one character at a time for low-ASCII
  205. while(in < last)
  206. {
  207. if(*in & 0x80)
  208. {
  209. // Not low-ASCII so switch to slow loop
  210. size = size - (in - in0);
  211. goto slow;
  212. }
  213. ++in;
  214. }
  215. size = size - (in - in0);
  216. }
  217. // Fast loop: Process 4 or 8 low-ASCII characters at a time
  218. {
  219. auto const in0 = in;
  220. auto last = in + size - 7;
  221. auto constexpr mask = static_cast<
  222. std::size_t>(0x8080808080808080 & ~std::size_t{0});
  223. while(in < last)
  224. {
  225. #if 0
  226. std::size_t temp;
  227. std::memcpy(&temp, in, sizeof(temp));
  228. if((temp & mask) != 0)
  229. #else
  230. // Technically UB but works on all known platforms
  231. if((*reinterpret_cast<std::size_t const*>(in) & mask) != 0)
  232. #endif
  233. {
  234. size = size - (in - in0);
  235. goto slow;
  236. }
  237. in += sizeof(std::size_t);
  238. }
  239. // There's at least one more full code point left
  240. last += 4;
  241. while(in < last)
  242. if(! valid(in))
  243. return false;
  244. goto tail;
  245. }
  246. slow:
  247. // Slow loop: Full validation on one code point at a time
  248. {
  249. auto last = in + size - 3;
  250. while(in < last)
  251. if(! valid(in))
  252. return false;
  253. }
  254. tail:
  255. // Handle the remaining bytes. The last
  256. // characters could split a code point so
  257. // we save the partial code point for later.
  258. //
  259. // On entry to the loop, `in` points to the
  260. // beginning of a code point.
  261. //
  262. for(;;)
  263. {
  264. // Number of chars left
  265. auto n = end - in;
  266. if(! n)
  267. break;
  268. // Chars we need to finish this code point
  269. auto const need = needed(*in);
  270. if(need == 0)
  271. return false;
  272. if(need <= n)
  273. {
  274. // Check a whole code point
  275. if(! valid(in))
  276. return false;
  277. }
  278. else
  279. {
  280. // Calculate how many chars we need
  281. // to finish this partial code point
  282. need_ = need - n;
  283. // Save the partial code point
  284. while(n--)
  285. *p_++ = *in++;
  286. BOOST_ASSERT(in == end);
  287. BOOST_ASSERT(p_ <= cp_ + 4);
  288. // Do partial validation on the incomplete
  289. // code point, this is called "Fail fast"
  290. // in Autobahn|Testsuite parlance.
  291. return ! fail_fast();
  292. }
  293. }
  294. return true;
  295. }
  296. using utf8_checker = utf8_checker_t<>;
  297. template<class = void>
  298. bool
  299. check_utf8(char const* p, std::size_t n)
  300. {
  301. utf8_checker c;
  302. if(! c.write(reinterpret_cast<const uint8_t*>(p), n))
  303. return false;
  304. return c.finish();
  305. }
  306. } // detail
  307. } // websocket
  308. } // beast
  309. } // boost
  310. #endif