cppx-core
unicode-Code_point_generator.hpp
Go to the documentation of this file.
1 #pragma once // Source encoding: UTF-8 with BOM (π is a lowercase Greek "pi").
2 
3 #include <cppx-core/collections/Span_.hpp> // cppx::Span_
4 #include <cppx-core/failure-handling/macro-fail.hpp> // CPPX_FAIL, std::exception
5 #include <cppx-core/language/syntax/macro-use.hpp> // CPPX_USE_STD
7 #include <cppx-core/meta-template/Enable_if_.hpp> // cppx::Enable_if_
8 #include <cppx-core/parameters/In_out_ref_.hpp> // cppx::In_out_ref_, std::ref
9 #include <cppx-core/text/data/ascii-character-names.hpp> // cppx::ascii::bad_char
10 #include <cppx-core/text/unicode/utf8-iteration.hpp> // cppx::utf8::*
11 
12 #include <iterator> // std::(distance, next)
13 #include <functional> // std::invoke
14 
15 namespace cppx::unicode
16 {
17  CPPX_USE_STD( distance, invalid_argument, invoke, ref, runtime_error, next );
18 
20  {
 // Incremented by code_point_from_bytes each time it returns ascii::bad_char:
 // a first byte that is neither single-byte nor a lead byte, a lead byte whose
 // group size exceeds 4, or a non-continuation byte in the middle of a group.
21  int m_n_bad_groups = 0;
 // Incremented by code_point_from_bytes when a group's lead byte (group size <= 4)
 // fails utf8::is_valid_lead_byte.
22  int m_n_noncanonical_groups = 0;
 // Bits accumulated so far when the input ended in the middle of a group; stored
 // just before the Incomplete_group_exception failure. Not read in the visible code.
23  int m_incomplete_code_point = 0;
 // Set to `n_bytes - (i - 1)` at the same point as m_incomplete_code_point.
 // NOTE(review): presumably the number of bytes still needed to complete the
 // group -- confirm, since `i` bytes have already been consumed at that point.
24  int m_n_completion_bytes = 0;
25 
26  public:
27  auto n_bad_groups() const noexcept -> Size { return m_n_bad_groups; }
28  auto n_noncanonical_groups() const noexcept -> Size { return m_n_noncanonical_groups; }
29 
31  public runtime_error
32  {
33  public:
34  using runtime_error::runtime_error;
35  };
36 
37  template< class In_iterator >
38  auto code_point_from_bytes( In_out_ref_<In_iterator> it_ref, const In_iterator beyond )
39  -> uint32_t
40  {
41  auto& it = it_ref.get();
42  hopefully( it != beyond )
43  or CPPX_FAIL_( invalid_argument, "Called with an empty byte sequence `it == beyond`" );
44 
45  const Byte byte_value = *it; ++it;
46  if( utf8::is_single_byte( byte_value ) )
47  {
48  return byte_value;
49  }
50 
51  if( not utf8::is_lead_byte( byte_value ) )
52  {
53  ++m_n_bad_groups;
54  return ascii::bad_char;
55  }
56 
57  const int n_bytes = utf8::group_size_for_lead_byte( byte_value );
58  if( n_bytes > 4 ) // More than Unicode's 21 bits per code point.
59  {
60  while( it != beyond and utf8::is_continuation_byte( *it ) )
61  {
62  ++it;
63  }
64  ++m_n_bad_groups;
65  return ascii::bad_char;
66  }
67  m_n_noncanonical_groups += int( not utf8::is_valid_lead_byte( byte_value ) );
68 
69  const int n_first_value_bits = (8 - (n_bytes + 1));
70  const uint32_t first_value_bits_mask = ((1 << n_first_value_bits) - 1);
71 
72  uint32_t code_point = (byte_value & first_value_bits_mask); // i == 0
73  for( int i = 1; i < n_bytes; ++i )
74  {
75  if( it == beyond )
76  {
77  m_incomplete_code_point = code_point;
78  m_n_completion_bytes = n_bytes - (i - 1);
79  CPPX_FAIL_( Incomplete_group_exception, "Incomplete group at end of buffer" );
80  }
81 
82  const Byte continuation_byte = *it;
83  if( utf8::is_continuation_byte( continuation_byte ) )
84  {
85  ++it;
86  code_point = (code_point << 6) | (continuation_byte & 0x3F);
87  }
88  else
89  {
90  ++m_n_bad_groups;
91  return ascii::bad_char;
92  }
93  }
94  return code_point;
95  }
96  };
97 } // namespace cppx::unicode
auto is_continuation_byte(const char ch) -> Truth
auto is_lead_byte(const char ch) -> Truth
auto hopefully(const Truth condition) -> Truth
auto n_noncanonical_groups() const noexcept -> Size
auto is_single_byte(const char ch) -> Truth
CPPX_USE_STD(distance, invalid_argument, invoke, ref, runtime_error, next)
auto group_size_for_lead_byte(const char lead_byte) -> int
auto next(P_< const char > p) -> P_< const char >
auto code_point_from_bytes(In_out_ref_< In_iterator > it_ref, const In_iterator beyond) -> uint32_t
unsigned char Byte
Default choice of byte type.
Definition: byte-types.hpp:19
Signed_< size_t > Size
A Signed_ equivalent of size_t.
Macros for generating more concise and clear using statements, primarily $use_cppx and $use_std,...
auto is_valid_lead_byte(const char ch) -> Truth
Signed Size and Index, plus unsigned equivalents Unsigned_size and Unsigned_index.
#define CPPX_FAIL_(X,...)
Definition: macro-fail.hpp:13