cppx-core
utf8-Generator.hpp
Go to the documentation of this file.
1 #pragma once // Source encoding: UTF-8 with BOM (π is a lowercase Greek "pi").
2 #include <cppx-core/collections/Span_.hpp> // cppx::Span_
3 #include <cppx-core/language/syntax/macro-items_of.hpp> // CPPX_ITEMS_OF
4 #include <cppx-core/language/syntax/macro-use.hpp> // CPPX_USE_STD
6 #include <cppx-core/language/types/byte-types.hpp> // cppx::Byte
8 #include <cppx-core/language/types/Truth.hpp> // cppx::Truth
9 #include <cppx-core/text/data/ascii-character-names.hpp> // cppx::ascii::*
10 #include <cppx-core/text/pointers-from-string_view.hpp> // cppx::(p_first_of, p_beyond_of)
11 #include <cppx-core/text/unicode/utf16-surrogate-pairs.hpp> // cppx::utf16::*
12 
13 #include <c/assert.hpp> // assert
14 #include <iterator> // std::(next, iterator_traits)
15 #include <string> // std::string
16 #include <utility> // std::(exchange, move)
17 
18 namespace cppx::utf8
19 {
20  CPPX_USE_STD( basic_string, basic_string_view, iterator_traits, exchange, move, next, string );
21 
23  namespace impl
24  {
25  template< class Out_iterator >
26  auto output_utf8( const uint32_t code, const Out_iterator destination )
27  -> Out_iterator
28  {
29  using Out_code = typename iterator_traits<Out_iterator>::value_type;
30 
31  Out_iterator out = destination;
32  static_assert( ascii::last_char == 0x7F );
33  if( code <= 0x7F )
34  {
35  *out = static_cast<Out_code>( code ); ++out;
36  }
37  else if( code <= 0x7FF )
38  {
39  // high bits = 0b11111 << 6
40  *out = static_cast<Out_code>( 0b11000000 | (code >> 6) ); ++out;
41  *out = static_cast<Out_code>( 0b10000000 | (code & 0x3F) ); ++out;
42  }
43  else if( code <= 0xFFFF )
44  {
45  // high bits = 0b1111 << 12
46  *out = static_cast<Out_code>( 0b11100000 | (code >> 12) ); ++out;
47  *out = static_cast<Out_code>( 0b10000000 | ((code >> 6) & 0x3F) ); ++out;
48  *out = static_cast<Out_code>( 0b10000000 | (code & 0x3F) ); ++out;
49  }
50  else // code <= 0x10FFFF
51  {
52  // high bits = 0b111 << 18
53  *out = static_cast<Out_code>( 0b11110000 | (code >> 18) ); ++out;
54  *out = static_cast<Out_code>( 0b10000000 | ((code >> 12) & 0x3F) ); ++out;
55  *out = static_cast<Out_code>( 0b10000000 | ((code >> 6) & 0x3F) ); ++out;
56  *out = static_cast<Out_code>( 0b10000000 | (code & 0x3F) ); ++out;
57  }
58  return out;
59  }
60  } // namespace impl
62 
63  class Generator
64  {
65  uint32_t m_surrogate_1 = 0;
66  Size m_n_bad_chars = 0;
67 
68  public:
69  auto n_bad_chars() const noexcept -> Size { return m_n_bad_chars; }
70 
71  template< class Out_iterator >
72  auto utf8_from_code( const uint32_t code, const Out_iterator destination )
73  -> Out_iterator
74  {
75  Out_iterator it = destination;
76  if( m_surrogate_1 != 0 )
77  {
78  const Truth ok = utf16::range_of_pair_value_2.contains( code );
79  if( ok )
80  {
81  const uint32_t first_value = exchange( m_surrogate_1, 0 );
82  const uint32_t second_value = code;
83  const uint32_t full_code = utf16::code_from_pair( first_value, second_value );
84  return impl::output_utf8( full_code, it );
85  }
86  else
87  {
88  m_surrogate_1 = 0;
89  ++m_n_bad_chars;
90  it = impl::output_utf8( ascii::bad_char, it ); // For the `m_surrogate_1`.
91  // Fall through.
92  }
93  }
94 
96  {
97  m_surrogate_1 = code;
98  return it;
99  }
100  else if( code > 0x10FFFF or utf16::range_of_pair_value_2.contains( code ) )
101  {
102  ++m_n_bad_chars;
103  return impl::output_utf8( ascii::bad_char, it ); // For the `code`.
104  }
105  return impl::output_utf8( code, it );
106  }
107 
108  template< class In_iterator, class Out_iterator >
110  const Span_<In_iterator> range,
111  const Out_iterator destination
112  ) -> Out_iterator
113  {
114  Out_iterator current = destination;
115  for( const uint32_t code : range )
116  {
117  current = utf8_from_code( code, current );
118  }
119  return current;
120  }
121 
122  template< class In_iterator, class Out_iterator >
124  const In_iterator first,
125  const In_iterator beyond,
126  const Out_iterator destination
127  ) -> Out_iterator
128  { return utf8_from_codes( Span_( first, beyond ), destination ); }
129  };
130 } // namespace cppx::utf8
auto utf8_from_code(const uint32_t code, const Out_iterator destination) -> Out_iterator
A drop-in replacement for bool without implicit conversion from/to types other than bool.
Definition: Truth.hpp:34
CPPX_USE_STD(basic_string, basic_string_view, iterator_traits, move, next, string, string_view)
constexpr Range_< unsigned > range_of_pair_value_2(0xDC00, 0xDFFF)
auto n_bad_chars() const noexcept -> Size
auto utf8_from_codes(const Span_< In_iterator > range, const Out_iterator destination) -> Out_iterator
Truth is a drop-in replacement for bool without implicit conversion from/to types other than bool.
constexpr Range_< unsigned > range_of_pair_value_1(0xD800, 0xDBFF)
constexpr auto code_from_pair(const unsigned v1, const unsigned v2) noexcept -> uint32_t
auto utf8_from_codes(const In_iterator first, const In_iterator beyond, const Out_iterator destination) -> Out_iterator
Simple type builders Type_, P_, R_, Raw_array_ and Raw_array_of_.
auto next(P_< const char > p) -> P_< const char >
$items_of eases use of standard library functions, e.g. sort( $items_of( numbers ) ).
Signed_< size_t > Size
A Signed_ equivalent of size_t.
Macros for generating more concise and clear using statements, primarily $use_cppx and $use_std,...
Byte and Signed_byte, + std::byte support definitions as_number and as_std_byte.
auto contains(const Integer v) -> Truth
Definition: ascii-util.hpp:46
Signed Size and Index, plus unsigned equivalents Unsigned_size and Unsigned_index.