cppx-core
utf32-Generator.hpp
Go to the documentation of this file.
1 #pragma once // Source encoding: UTF-8 with BOM (π is a lowercase Greek "pi").
2 
3 #include <cppx-core/collections/Span_.hpp> // cppx::Span_
4 #include <cppx-core/failure-handling/macro-fail.hpp> // CPPX_FAIL
5 #include <cppx-core/language/syntax/macro-use.hpp> // CPPX_USE_STD
7 #include <cppx-core/meta-template/Enable_if_.hpp> // cppx::Enable_if_
8 #include <cppx-core/parameters/In_out_ref_.hpp> // cppx::In_out_ref_, std::ref
9 #include <cppx-core/text/data/ascii-character-names.hpp> // cppx::ascii::bad_char
10 #include <cppx-core/text/unicode/utf8-iteration.hpp> // cppx::utf8::*
11 #include <cppx-core/text/unicode/utf16-surrogate-pairs.hpp> // cppx::utf16::*
12 
13 #include <iterator> // std::(distance, next)
14 #include <functional> // std::invoke
15 
16 namespace cppx::utf32
17 {
18  CPPX_USE_STD( distance, invoke, ref, next );
19 
20  class Generator
21  {
22  int m_n_bad_groups = 0;
23  int m_n_noncanonical_groups = 0;
24 
25  public:
26  auto n_bad_groups() const noexcept -> Size { return m_n_bad_groups; }
27  auto n_noncanonical_groups() const noexcept -> Size { return m_n_noncanonical_groups; }
28 
29  template< class In_iterator >
30  auto code_point_from_bytes( In_out_ref_<In_iterator> it_ref, const In_iterator beyond )
31  -> uint32_t
32  {
33  auto& it = it_ref.get();
34  hopefully( it != beyond )
35  or CPPX_FAIL( "Called with an empty byte sequence `it == beyond`" );
36 
37  const Byte byte_value = *it; ++it;
38  if( utf8::is_single_byte( byte_value ) ) { return byte_value; }
39  if( not utf8::is_lead_byte( byte_value ) ) { return ascii::bad_char; }
40 
41  const int n_bytes = utf8::group_size_for_lead_byte( byte_value );
42  if( n_bytes > 4 ) // More than Unicode's 21 bits per code point.
43  {
44  while( it != beyond and utf8::is_continuation_byte( *it ) )
45  {
46  ++it;
47  }
48  return ascii::bad_char;
49  }
50  m_n_noncanonical_groups += int( not utf8::is_valid_lead_byte( byte_value ) );
51 
52  const int n_first_bits = (8 - (n_bytes + 1));
53  const uint32_t first_bits_mask = ((1 << n_first_bits) - 1);
54 
55  uint32_t code_point = (byte_value & first_bits_mask); // i == 0
56  for( int i = 1; i < n_bytes; ++i )
57  {
58  if( it != beyond )
59  {
60  const Byte continuation_byte = *it;
61  if( utf8::is_continuation_byte( continuation_byte ) )
62  {
63  ++it;
64  code_point = (code_point << 6) | (continuation_byte & 0x3F);
65  continue;
66  }
67  }
68  ++m_n_bad_groups;
69  return ascii::bad_char;
70  }
71  return code_point;
72  }
73 
74  template<
75  class In_iterator,
76  class Out_iterator,
77  class = Enable_if_<(magnitude_bits_per_<Item_for_iterator_<Out_iterator>> >= 21)>
78  >
80  const Span_<In_iterator> bytes_range,
81  const Out_iterator destination
82  ) -> Out_iterator
83  {
84  using Out_value = Item_for_iterator_<Out_iterator>;
85 
86  Out_iterator current = destination;
87  const In_iterator beyond_bytes = bytes_range.beyond();
88  for( auto it = bytes_range.first(); it != beyond_bytes; )
89  {
90  const uint32_t code_point = code_point_from_bytes( it, beyond_bytes );
91  *current = static_cast<Out_value>( code_point );
92  ++current;
93  }
94  return current;
95  }
96 
97  template<
98  class In_iterator,
99  class Out_iterator,
100  class = Enable_if_<(magnitude_bits_per_<Item_for_iterator_<Out_iterator>> >= 21)>
101  >
103  const In_iterator first,
104  const In_iterator beyond,
105  const Out_iterator destination
106  ) -> Out_iterator
107  { return utf32_from_bytes( Span_( first, beyond ), destination ); }
108  };
109 } // namespace cppx::utf32
auto first() const -> Iterator
Definition: Span_.hpp:43
auto n_bad_groups() const noexcept -> Size
auto utf32_from_bytes(const Span_< In_iterator > bytes_range, const Out_iterator destination) -> Out_iterator
auto is_continuation_byte(const char ch) -> Truth
auto n_noncanonical_groups() const noexcept -> Size
CPPX_USE_STD(distance, invoke, ref, next)
auto is_lead_byte(const char ch) -> Truth
auto hopefully(const Truth condition) -> Truth
auto code_point_from_bytes(In_out_ref_< In_iterator > it_ref, const In_iterator beyond) -> uint32_t
auto utf32_from_bytes(const In_iterator first, const In_iterator beyond, const Out_iterator destination) -> Out_iterator
auto is_single_byte(const char ch) -> Truth
auto group_size_for_lead_byte(const char lead_byte) -> int
auto next(P_< const char > p) -> P_< const char >
unsigned char Byte
Default choice of byte type.
Definition: byte-types.hpp:19
Signed_< size_t > Size
A Signed_ equivalent of size_t.
typename std::iterator_traits< Iterator >::value_type Item_for_iterator_
Definition: type-traits.hpp:50
auto beyond() const -> Iterator
Definition: Span_.hpp:44
#define CPPX_FAIL(...)
Definition: macro-fail.hpp:10
std::enable_if_t< condition, Result > Enable_if_
Just more readable than enable_if_t.
Macros for generating more concise and clear using statements, primarily $use_cppx and $use_std,...
auto is_valid_lead_byte(const char ch) -> Truth
Signed Size and Index, plus unsigned equivalents Unsigned_size and Unsigned_index.