cppx-core
utf8-iteration.hpp
Go to the documentation of this file.
1 #pragma once // Source encoding: UTF-8 with BOM (π is a lowercase Greek "pi").
2 //
3 // UTF-8 code point iteration.
4 // For the classifiers see <url: https://en.wikipedia.org/wiki/UTF-8#Codepage_layout>.
5 
6 #include <cppx-core/collections/is_empty.hpp> // cppx::is_empty
7 #include <cppx-core/collections/Range_.hpp> // cppx::up_to
8 #include <cppx-core/failure-handling/macro-fail.hpp> // cppx::(hopefully, fail) CPPX_FAIL
10 #include <cppx-core/language/types/byte-types.hpp> // cppx::Byte
12 #include <cppx-core/language/types/Truth.hpp> // cppx::Truth
13 #include <cppx-core/text/pointers-from-string_view.hpp> // cppx::(p_first_of, p_beyond_of)
14 
15 #include <string_view> // std::(string_view )
16 
17 namespace cppx::utf8
18 {
19  CPPX_USE_STD( string_view );
20 
21  inline auto is_single_byte( const char ch )
22  -> Truth
23  { return (Byte( ch ) & 0x80) == 0x00; }
24 
25  inline auto is_valid_single_byte( const char ch )
26  -> Truth
27  { return is_single_byte( ch ); }
28 
29  inline auto is_continuation_byte( const char ch )
30  -> Truth
31  {
32  // The general scheme is that bit pattern 0b10xxxxxx denotes a continuation byte,
33  // providing 6 bits to the result value.
34  return (Byte( ch ) & 0xC0) == 0x80;
35  }
36 
37  inline auto is_valid_continuation_byte( const char ch )
38  -> Truth
39  { return is_continuation_byte( ch ); }
40 
41  inline auto is_lead_byte( const char ch )
42  -> Truth
43  {
44  // The general scheme is that bit pattern 0b11xxxxxx denotes a lead byte.
45  // The number of leading 1's equals the total number of bytes in this group.
46  return (Byte( ch ) & 0xC0) == 0xC0;
47  }
48 
49  inline auto is_valid_lead_byte( const char ch )
50  -> Truth
51  {
52  // The general scheme is that bit pattern 0b11xxxxxx denotes a lead byte, but
53  // some code points are ruled out because those encodings are not permitted.
54  // E.g. 0b1100'000x would start an invalid encoding of a single ASCII character.
55  return (0xC2 <= Byte( ch ) and Byte( ch ) <= 0xF4);
56  }
57 
58  // Intentionally does no validity checking.
59  inline auto group_size_for_lead_byte( const char lead_byte )
60  -> int
61  {
62  int n_bytes = 0;
63  for( auto bits = Byte( lead_byte ), mask = Byte( 0x80 ); (bits & mask) != 0; mask >>= 1 )
64  {
65  ++n_bytes;
66  }
67  return n_bytes;
68  }
69 
70  // Move to the next Unicode code point, not necessarily the next character!
71  inline void move_to_next( P_<const char>& p )
72  {
73  do{ ++p; } while( is_continuation_byte( *p ) );
74  }
75 
76  inline void move_to_next( P_<const char>& p, const P_<const char> p_beyond )
77  {
78  // assert( p != nullptr )
79  // assert( p != p_beyond )
80  do{ ++p; } while( p != p_beyond and is_continuation_byte( *p ) );
81  }
82 
83  // Move to the previous Unicode code point, not necessarily the previous character!
84  inline void move_to_prev( P_<const char>& p )
85  {
86  do{ --p; } while( is_continuation_byte( *p ) );
87  }
88 
89  inline auto move_to_prev( P_<const char>& p, const P_<const char> p_first )
90  -> Truth
91  {
92  // assert( p != nullptr )
93  // assert( p != p_first )
94  do{ --p; } while( p != p_first and is_continuation_byte( *p ) );
95 
96  return not( p == p_first and is_continuation_byte( *p ) );
97  }
98 
99  inline void advance( P_<const char>& p, const Size distance )
100  {
101  if( distance >= 0 )
102  {
103  for( auto _ : up_to( distance ) ) { (void)_; move_to_next( p ); }
104  }
105  else
106  {
107  for( auto _ : up_to( -distance ) ) { (void)_; move_to_prev( p ); }
108  }
109  }
110 
111  inline auto advance( P_<const char>& p, const Size distance, const string_view& range )
112  -> Truth
113  {
114  if( distance >= 0 )
115  {
116  const auto p_beyond = p_beyond_of( range );
117  for( auto _ : up_to( distance ) ) { (void)_; move_to_next( p, p_beyond ); }
118  }
119  else
120  {
121  const auto p_first = p_first_of( range );
122  for( auto _ : up_to( -distance ) )
123  {
124  (void)_;
125  if( not move_to_prev( p, p_first ) )
126  {
127  return false;
128  }
129  }
130  }
131  return true;
132  }
133 
134  inline auto next( P_<const char> p )
135  -> P_<const char>
136  {
137  move_to_next( p );
138  return p;
139  }
140 
141  inline auto next( P_<const char> p, const P_<const char> p_beyond )
142  -> P_<const char>
143  {
144  move_to_next( p, p_beyond );
145  return p;
146  }
147 
148  inline auto prev( P_<const char> p )
149  -> P_<const char>
150  {
151  move_to_prev( p );
152  return p;
153  }
154 
155  inline auto to_prev_code_point( P_<const char> p, const P_<const char> p_first )
156  -> P_<const char>
157  {
158  move_to_prev( p, p_first );
159  return p;
160  }
161 
162  inline auto n_code_points_in( const string_view& view )
163  -> Size
164  {
165  if( is_empty( view ) )
166  {
167  return 0;
168  }
169 
170  Size n = 0;
171  for( auto p = p_first_of( view ), beyond = p_beyond_of( view );
172  p != beyond;
173  move_to_next( p, beyond ) )
174  { ++n; }
175  return n;
176  }
177 
178 } // namespace cppx::utf8
auto n_code_points_in(const string_view &view) -> Size
Some_type * P_
Creates a raw pointer type.
auto is_continuation_byte(const char ch) -> Truth
auto prev(P_< const char > p) -> P_< const char >
auto is_lead_byte(const char ch) -> Truth
A drop-in replacement for bool without implicit conversion from/to types other than bool.
Definition: Truth.hpp:34
CPPX_USE_STD(basic_string, basic_string_view, iterator_traits, move, next, string, string_view)
void advance(P_< const char > &p, const Size distance)
void move_to_prev(P_< const char > &p)
auto is_single_byte(const char ch) -> Truth
Truth is a drop-in replacement for bool without implicit conversion from/to types other than bool.
auto is_valid_continuation_byte(const char ch) -> Truth
auto p_first_of(const basic_string_view< Char > &view) noexcept -> C_str_< Char >
auto group_size_for_lead_byte(const char lead_byte) -> int
void move_to_next(P_< const char > &p)
Simple type builders Type_, P_, R_, Raw_array_ and Raw_array_of_.
auto is_empty(const Collection &c) -> Truth
Definition: is_empty.hpp:33
auto next(P_< const char > p) -> P_< const char >
constexpr auto up_to(const Integer n) noexcept -> Range_< Integer >
Definition: Range_.hpp:61
unsigned char Byte
Default choice of byte type.
Definition: byte-types.hpp:19
Signed_< size_t > Size
A Signed_ equivalent of size_t.
auto is_valid_single_byte(const char ch) -> Truth
auto to_prev_code_point(P_< const char > p, const P_< const char > p_first) -> P_< const char >
Byte and Signed_byte, + std::byte support definitions as_number and as_std_byte.
auto is_valid_lead_byte(const char ch) -> Truth
Signed Size and Index, plus unsigned equivalents Unsigned_size and Unsigned_index.
auto p_beyond_of(const basic_string_view< Char > &view) noexcept -> C_str_< Char >