// Copyright (C) 2018 Robert N. Steagall // Copyright (C) 2019 T. Zachary Laine // // Distributed under the Boost Software License, Version 1.0. (See // accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #ifndef BOOST_PARSER_DETAIL_TEXT_TRANSCODE_ALGORITHM_HPP #define BOOST_PARSER_DETAIL_TEXT_TRANSCODE_ALGORITHM_HPP #include #include #include #if BOOST_PARSER_DETAIL_TEXT_USE_CONCEPTS #include #endif #include namespace boost::parser::detail { namespace text { /** An alias for `in_out_result` returned by algorithms that perform a transcoding copy. */ template using transcode_result = in_out_result; namespace detail { template constexpr OutIter read_into_utf8_iter(uint32_t cp, OutIter out) { if (cp < 0x80) { *out = static_cast(cp); ++out; } else if (cp < 0x800) { *out = static_cast(0xC0 + (cp >> 6)); ++out; *out = static_cast(0x80 + (cp & 0x3f)); ++out; } else if (cp < 0x10000) { *out = static_cast(0xe0 + (cp >> 12)); ++out; *out = static_cast(0x80 + ((cp >> 6) & 0x3f)); ++out; *out = static_cast(0x80 + (cp & 0x3f)); ++out; } else { *out = static_cast(0xf0 + (cp >> 18)); ++out; *out = static_cast(0x80 + ((cp >> 12) & 0x3f)); ++out; *out = static_cast(0x80 + ((cp >> 6) & 0x3f)); ++out; *out = static_cast(0x80 + (cp & 0x3f)); ++out; } return out; } template constexpr OutIter read_into_utf16_iter(uint32_t cp, OutIter out) { uint16_t const high_surrogate_base = 0xd7c0; uint16_t const low_surrogate_base = 0xdc00; if (cp < 0x10000) { *out = static_cast(cp); ++out; } else { *out = static_cast(cp >> 10) + high_surrogate_base; ++out; *out = static_cast(cp & 0x3ff) + low_surrogate_base; ++out; } return out; } template< bool UseN, typename InputIter, typename Sentinel, typename OutIter> transcode_result transcode_utf_8_to_16( InputIter first, Sentinel last, std::ptrdiff_t n, OutIter out, std::input_iterator_tag) { for (; first != last && (!UseN || n); --n) { unsigned char const c = *first; if (c < 0x80) { *out = *first; ++first; ++out; } else { auto const cp = detail::advance(first, last); out = detail::read_into_utf16_iter(cp, out); } } return {first, out}; } template transcode_result transcode_utf_8_to_16( Iter first, Iter last, std::ptrdiff_t n, OutIter out, std::random_access_iterator_tag) { return transcode_utf_8_to_16( first, last, n, out, std::input_iterator_tag{}); } template< bool UseN, typename InputIter, typename Sentinel, typename OutIter> transcode_result transcode_utf_8_to_32( InputIter first, Sentinel last, std::ptrdiff_t n, OutIter out, std::input_iterator_tag) { for (; first != last && (!UseN || n); --n) { unsigned char const c = *first; if (c < 0x80) { *out = *first; ++first; ++out; } else { *out = detail::advance(first, last); ++out; } } return {first, out}; } template transcode_result transcode_utf_8_to_32( Iter first, Iter last, std::ptrdiff_t n, OutIter out, std::random_access_iterator_tag) { return transcode_utf_8_to_32( first, last, n, out, std::input_iterator_tag{}); } template struct tag_t {}; template transcode_result transcode_to_8( tag_t, Iter first, Sentinel last, std::ptrdiff_t n, OutIter out) { for (; first != last && (!UseN || n); ++first, ++out) { *out = *first; --n; } return {first, out}; } template transcode_result transcode_to_16( tag_t, Iter first, Sentinel last, std::ptrdiff_t n, OutIter out) { return detail::transcode_utf_8_to_16( first, last, n, out, typename std::iterator_traits::iterator_category{}); } template transcode_result transcode_to_32( tag_t, Iter first, Sentinel last, std::ptrdiff_t n, OutIter out) { return detail::transcode_utf_8_to_32( first, last, n, out, typename std::iterator_traits::iterator_category{}); } template transcode_result transcode_to_8( tag_t, Iter first, Sentinel last, std::ptrdiff_t n, OutIter out) { uint32_t const high_surrogate_max = 0xdbff; uint16_t const high_surrogate_base = 0xd7c0; uint16_t const low_surrogate_base = 0xdc00; for (; first != last && (!UseN || n); ++first, --n) { uint32_t const hi = *first; if (surrogate(hi)) { if (hi <= high_surrogate_max) { ++first; if (first == last) { uint32_t const cp = replacement_character; out = detail::read_into_utf8_iter(cp, out); ++out; return {first, out}; } uint32_t const lo = *first; if (low_surrogate(lo)) { uint32_t const cp = ((hi - high_surrogate_base) << 10) + (lo - low_surrogate_base); out = detail::read_into_utf8_iter(cp, out); continue; } } out = detail::read_into_utf8_iter( replacement_character, out); } else { out = detail::read_into_utf8_iter(hi, out); } } return {first, out}; } template transcode_result transcode_to_16( tag_t, Iter first, Sentinel last, std::ptrdiff_t n, OutIter out) { for (; first != last && (!UseN || n); ++first, ++out, --n) { *out = *first; } return {first, out}; } template transcode_result transcode_to_32( tag_t, Iter first, Sentinel last, std::ptrdiff_t n, OutIter out) { uint32_t const high_surrogate_max = 0xdbff; uint16_t const high_surrogate_base = 0xd7c0; uint16_t const low_surrogate_base = 0xdc00; for (; first != last && (!UseN || n); ++first, --n) { uint32_t const hi = *first; if (surrogate(hi)) { if (hi <= high_surrogate_max) { ++first; if (first == last) { *out = replacement_character; ++out; return {first, out}; } uint32_t const lo = *first; if (low_surrogate(lo)) { uint32_t const cp = ((hi - high_surrogate_base) << 10) + (lo - low_surrogate_base); *out = cp; ++out; continue; } } *out = replacement_character; ++out; } else { *out = hi; ++out; } } return {first, out}; } template transcode_result transcode_to_8( tag_t, Iter first, Sentinel last, std::ptrdiff_t n, OutIter out) { for (; first != last && (!UseN || n); ++first, --n) { out = detail::read_into_utf8_iter(*first, out); } return {first, out}; } template transcode_result transcode_to_16( tag_t, Iter first, Sentinel last, std::ptrdiff_t n, OutIter out) { for (; first != last && (!UseN || n); ++first, --n) { out = detail::read_into_utf16_iter(*first, out); } return {first, out}; } template transcode_result transcode_to_32( tag_t, Iter first, Sentinel last, std::ptrdiff_t n, OutIter out) { for (; first != last && (!UseN || n); ++first, ++out, --n) { *out = *first; } return {first, out}; } } #if 0 /** Copies the code points in the range [first, last) to out, changing the encoding from UTF-8 to UTF-32. */ template transcode_result transcode_utf_8_to_32_take_n( InputIter first, Sentinel last, std::ptrdiff_t n, OutIter out) { auto const r = detail::unpack_iterator_and_sentinel(first, last); return detail::transcode_to_32( detail::tag_t{}, r.first, r.last, n, out); } /** Copies the first `n` code points in the range [first, last) to out, changing the encoding from UTF-8 to UTF-32. */ template transcode_result transcode_utf_8_to_32_take_n( InputIter first, Sentinel last, std::ptrdiff_t n, OutIter out) { auto const r = detail::unpack_iterator_and_sentinel(first, last); return detail::transcode_to_32( detail::tag_t{}, r.first, r.last, n, out); } /** Copies the first `n` code points in the range [first, last) to out, changing the encoding from UTF-8 to UTF-32. */ template transcode_result transcode_utf_8_to_32_take_n(Range && r, std::ptrdiff_t n, OutIter out) { return detail::transcode_utf_8_to_32_dispatch:: call(r, n, out) .out; } #endif }} namespace boost::parser::detail { namespace text { BOOST_PARSER_DETAIL_TEXT_NAMESPACE_V1 { #if defined(BOOST_TEXT_DOXYGEN) // -> utf8 /** Copies the code points in the range `[first, last)` to `out`, changing the encoding to UTF-8. */ template< std::input_iterator I, std::sentinel_for S, std::output_iterator O> requires( utf16_code_unit> || utf32_code_unit>) transcode_result transcode_to_utf8(I first, S last, O out); /** Copies the code points in the range `[p, null_sentinel)` to `out`, changing the encoding to UTF-8. */ template O> requires(utf16_pointer || utf32_pointer) transcode_result transcode_to_utf8(Ptr p, O out); /** Copies the code points in the array `arr` to `out`, changing the encoding to UTF-8. */ template O> requires (utf16_code_unit || utf32_code_unit) transcode_result transcode_to_utf8(Char (&arr)[N], O out); /** Copies the code points in the range `r` to `out`, changing the encoding to UTF-8. */ template O> requires (utf16_code_unit> || utf32_code_unit>) transcode_result, O> transcode_to_utf8(R && r, O out); // -> utf16 /** Copies the code points in the range `[first, last)` to `out`, changing the encoding to UTF-16. */ template< std::input_iterator I, std::sentinel_for S, std::output_iterator O> requires (utf8_code_unit> || utf32_code_unit>) transcode_result transcode_to_utf16(I first, S last, O out); /** Copies the code points in the range `[p, null_sentinel)` to `out`, changing the encoding to UTF-16. */ template O> requires (utf8_pointer || utf32_pointer) transcode_result transcode_to_utf16(Ptr p, O out); /** Copies the code points in the array `arr` to `out`, changing the encoding to UTF-16. */ template O> requires (utf8_code_unit || utf32_code_unit) transcode_result transcode_to_utf16(Char (&arr)[N], O out); /** Copies the code points in the range `r` to `out`, changing the encoding to UTF-16. */ template O> requires (utf8_code_unit> || utf32_code_unit>) transcode_result, O> transcode_to_utf16(R && r, O out); // -> utf32 /** Copies the code points in the range `[first, last)` to `out`, changing the encoding to UTF-32. */ template< std::input_iterator I, std::sentinel_for S, std::output_iterator O> requires (utf8_code_unit> || utf16_code_unit>) transcode_result transcode_to_utf32(I first, S last, O out); /** Copies the code points in the range `[p, null_sentinel)` to `out`, changing the encoding to UTF-32. */ template O> requires (utf8_pointer || utf16_pointer) transcode_result transcode_to_utf32(Ptr p, O out); /** Copies the code points in the array `arr` to `out`, changing the encoding to UTF-32. */ template O> requires (utf8_code_unit || utf16_code_unit) transcode_result transcode_to_utf32(Char (&arr)[N], O out); /** Copies the code points in the range `r` to `out`, changing the encoding to UTF-32. */ template O> requires (utf8_code_unit> || utf16_code_unit>) transcode_result, O> transcode_to_utf32(R && r, O out); #endif namespace dtl { template< bool UseN, typename Range, typename OutIter, bool _16Ptr = detail::is_16_ptr_v, bool CPPtr = detail::is_cp_ptr_v> struct transcode_to_8_dispatch { static constexpr auto call(Range && r, std::ptrdiff_t n, OutIter out) -> transcode_result { auto const u = text::unpack_iterator_and_sentinel( detail::begin(r), detail::end(r)); auto unpacked = detail::transcode_to_8( detail::tag_t{}, u.first, u.last, n, out); return {u.repack(unpacked.in), unpacked.out}; } }; template struct transcode_to_8_dispatch { static constexpr auto call(Ptr p, std::ptrdiff_t n, OutIter out) { return detail::transcode_to_8( detail::tag_t{}, p, null_sentinel, n, out); } }; template struct transcode_to_8_dispatch { static constexpr auto call(Ptr p, std::ptrdiff_t n, OutIter out) { return detail::transcode_to_8( detail::tag_t{}, p, null_sentinel, n, out); } }; template< bool UseN, typename Range, typename OutIter, bool CharPtr = detail::is_char_ptr_v, bool CPPtr = detail::is_cp_ptr_v> struct transcode_to_16_dispatch { static constexpr auto call(Range && r, std::ptrdiff_t n, OutIter out) -> transcode_result { auto const u = text::unpack_iterator_and_sentinel( detail::begin(r), detail::end(r)); auto unpacked = detail::transcode_to_16( detail::tag_t{}, u.first, u.last, n, out); return {u.repack(unpacked.in), unpacked.out}; } }; template struct transcode_to_16_dispatch { static constexpr auto call(Ptr p, std::ptrdiff_t n, OutIter out) { return detail::transcode_to_16( detail::tag_t{}, p, null_sentinel, n, out); } }; template struct transcode_to_16_dispatch { static constexpr auto call(Ptr p, std::ptrdiff_t n, OutIter out) { return detail::transcode_to_16( detail::tag_t{}, p, null_sentinel, n, out); } }; template< bool UseN, typename Range, typename OutIter, bool CharPtr = detail::is_char_ptr_v, bool _16Ptr = detail::is_16_ptr_v> struct transcode_to_32_dispatch { static constexpr auto call(Range && r, std::ptrdiff_t n, OutIter out) -> transcode_result { auto const u = text::unpack_iterator_and_sentinel( detail::begin(r), detail::end(r)); auto unpacked = detail::transcode_to_32( detail::tag_t{}, u.first, u.last, n, out); return {u.repack(unpacked.in), unpacked.out}; } }; template struct transcode_to_32_dispatch { static constexpr auto call(Ptr p, std::ptrdiff_t n, OutIter out) { return detail::transcode_to_32( detail::tag_t{}, p, null_sentinel, n, out); } }; template struct transcode_to_32_dispatch { static constexpr auto call(Ptr p, std::ptrdiff_t n, OutIter out) { return detail::transcode_to_32( detail::tag_t{}, p, null_sentinel, n, out); } }; } template transcode_result transcode_to_utf8( Iter first, Sentinel last, OutIter out) { auto const r = text::unpack_iterator_and_sentinel(first, last); auto unpacked = detail::transcode_to_8( detail::tag_t{}, r.first, r.last, -1, out); return {r.repack(unpacked.in), unpacked.out}; } template transcode_result, OutIter> transcode_to_utf8(Range && r, OutIter out) { return dtl::transcode_to_8_dispatch::call( r, -1, out); } template transcode_result transcode_to_utf16( Iter first, Sentinel last, OutIter out) { auto const r = text::unpack_iterator_and_sentinel(first, last); auto unpacked = detail::transcode_to_16( detail::tag_t{}, r.first, r.last, -1, out); return {r.repack(unpacked.in), unpacked.out}; } template transcode_result, OutIter> transcode_to_utf16(Range && r, OutIter out) { return dtl::transcode_to_16_dispatch::call( r, -1, out); } template transcode_result transcode_to_utf32( Iter first, Sentinel last, OutIter out) { auto const r = text::unpack_iterator_and_sentinel(first, last); auto unpacked = detail::transcode_to_32( detail::tag_t{}, r.first, r.last, -1, out); return {r.repack(unpacked.in), unpacked.out}; } template transcode_result, OutIter> transcode_to_utf32(Range && r, OutIter out) { return dtl::transcode_to_32_dispatch::call( r, -1, out); } }}} #if BOOST_PARSER_DETAIL_TEXT_USE_CONCEPTS namespace boost::parser::detail { namespace text { BOOST_PARSER_DETAIL_TEXT_NAMESPACE_V2 { // -> utf8 template< std::input_iterator I, std::sentinel_for S, std::output_iterator O> requires( utf16_code_unit> || utf32_code_unit>) transcode_result transcode_to_utf8(I first, S last, O out) { auto const r = text::unpack_iterator_and_sentinel(first, last); auto unpacked = detail::transcode_to_8( detail::tag_t{}, r.first, r.last, -1, out); return {r.repack(unpacked.in), unpacked.out}; } template O> requires(utf16_range || utf32_range) transcode_result, O> transcode_to_utf8( R && r, O out) { return text::transcode_to_utf8( std::ranges::begin(r), std::ranges::end(r), out); } // -> utf16 template< std::input_iterator I, std::sentinel_for S, std::output_iterator O> requires( utf8_code_unit> || utf32_code_unit>) transcode_result transcode_to_utf16(I first, S last, O out) { auto const r = text::unpack_iterator_and_sentinel(first, last); auto unpacked = detail::transcode_to_16( detail::tag_t{}, r.first, r.last, -1, out); return {r.repack(unpacked.in), unpacked.out}; } template O> requires(utf8_range || utf32_range) transcode_result, O> transcode_to_utf16( R && r, O out) { return text::transcode_to_utf16( std::ranges::begin(r), std::ranges::end(r), out); } // -> utf32 template< std::input_iterator I, std::sentinel_for S, std::output_iterator O> requires( utf8_code_unit> || utf16_code_unit>) transcode_result transcode_to_utf32(I first, S last, O out) { auto const r = text::unpack_iterator_and_sentinel(first, last); auto unpacked = detail::transcode_to_32( detail::tag_t{}, r.first, r.last, -1, out); return {r.repack(unpacked.in), unpacked.out}; } template O> requires(utf8_range || utf16_range) transcode_result, O> transcode_to_utf32( R && r, O out) { return text::transcode_to_utf32( std::ranges::begin(r), std::ranges::end(r), out); } }}} #endif #endif