E-MailRelay
gconvert.cpp
Go to the documentation of this file.
1//
2// Copyright (C) 2001-2024 Graeme Walker <graeme_walker@users.sourceforge.net>
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program. If not, see <http://www.gnu.org/licenses/>.
16// ===
17///
18/// \file gconvert.cpp
19///
20
21#include "gdef.h"
22#include "gconvert.h"
23#include <algorithm>
24#include <type_traits>
#include <cwchar> // std::wcslen()
26
27bool G::Convert::m_utf16 = sizeof(wchar_t) == 2 ;
28
29#ifndef G_LIB_SMALL
30bool G::Convert::utf16( bool b )
31{
32 std::swap( m_utf16 , b ) ;
33 return b ;
34}
35#endif
36
37#ifndef G_LIB_SMALL
38std::wstring G::Convert::widen( std::string_view sv )
39{
40 return sv.empty() ? std::wstring() : widenImp( sv.data() , sv.size() ) ;
41}
42#endif
43
44bool G::Convert::valid( std::string_view sv ) noexcept
45{
46 if( sv.empty() ) return true ;
47 bool valid = true ;
48 widenImp( sv.data() , sv.size() , nullptr , &valid ) ;
49 return valid ;
50}
51
52#ifndef G_LIB_SMALL
53std::string G::Convert::narrow( const std::wstring & s )
54{
55 return s.empty() ? std::string() : narrowImp( s.data() , s.size() ) ;
56}
57#endif
58
59#ifndef G_LIB_SMALL
60std::string G::Convert::narrow( const wchar_t * p )
61{
62 return p && *p ? narrowImp( p , std::wcslen(p) ) : std::string() ;
63}
64#endif
65
66#ifndef G_LIB_SMALL
67std::string G::Convert::narrow( const wchar_t * p , std::size_t n )
68{
69 return p && n ? narrowImp( p , n ) : std::string() ;
70}
71#endif
72
73#ifndef G_LIB_SMALL
74bool G::Convert::invalid( const std::wstring & s )
75{
76 return s.find( L'\xFFFD' ) != std::string::npos ;
77}
78#endif
79
80#ifndef G_LIB_SMALL
81bool G::Convert::invalid( const std::string & s )
82{
83 return s.find( "\xEF\xBF\xBD" ) != std::string::npos ;
84}
85#endif
86
87// ==
88
89std::wstring G::Convert::widenImp( const char * p_in , std::size_t n_in )
90{
91 std::size_t n_out = widenImp( p_in , n_in , nullptr ) ;
92 if( n_out == 0U ) return {} ;
93 std::wstring out ;
94 out.resize( n_out ) ;
95 widenImp( p_in , n_in , &*out.begin() ) ;
96 return out ;
97}
98
99std::size_t G::Convert::widenImp( const char * p_in , std::size_t n_in , wchar_t * p_out , bool * valid_out ) noexcept
100{
101 const unsigned char * p = reinterpret_cast<const unsigned char*>( p_in ) ;
102 std::size_t n_out = 0U ;
103 std::size_t d = 0U ;
104 for( std::size_t i = 0U ; i < n_in ; i += d , p += d )
105 {
106 // UTF-8 in
107 auto pair = u8in( p , n_in-i ) ;
108 unicode_type u = pair.first ;
109 d = pair.second ;
110 if( u == unicode_error )
111 {
112 u = 0xFFFD ;
113 if( valid_out )
114 *valid_out = false ;
115 }
116
117 if( m_utf16 ) // UTF-16 out
118 {
119 if( u <= 0xD7FF || ( u >= 0xE000 && u <= 0xFFFF ) )
120 {
121 if( p_out )
122 *p_out++ = static_cast<wchar_t>(u) ;
123 n_out++ ;
124 }
125 else if( u >= 0x10000 && u <= 0x10FFFF )
126 {
127 static_assert( (0x10FFFF - 0x10000) == 0xFFFFF , "" ) ;
128 u -= 0x10000 ;
129 if( p_out )
130 {
131 *p_out++ = static_cast<wchar_t>( 0xD800 | (u >> 10) ) ;
132 *p_out++ = static_cast<wchar_t>( 0xDC00 | (u & 0x3FF) ) ;
133 }
134 n_out += 2 ;
135 }
136 else
137 {
138 if( p_out )
139 *p_out++ = L'\xFFFD' ;
140 n_out++ ;
141 }
142 }
143 else // UCS-4 out
144 {
145 if( p_out )
146 *p_out++ = static_cast<wchar_t>(u) ;
147 n_out++ ;
148 }
149 }
150 return n_out ;
151}
152
153std::pair<G::Convert::unicode_type,std::size_t> G::Convert::u8in( const unsigned char * p , std::size_t n ) noexcept
154{
155 unicode_type u = unicode_error ;
156 std::size_t d = 1U ;
157 if( n == 0U )
158 {
159 }
160 else if( ( p[0] & 0x80U ) == 0U ) // 0...
161 {
162 u = p[0] ;
163 }
164 else if( ( p[0] & 0xC0 ) == 0x80 ) // 10...
165 {
166 }
167 else if( ( p[0] & 0xE0U ) == 0xC0U && n > 1U && // 110...
168 !( ( p[0] & 0x1EU ) == 0U ) && // (not overlong)
169 ( p[1] & 0xC0 ) == 0x80U )
170 {
171 d = 2U ;
172 u = ( static_cast<unicode_type>(p[0]) & 0x1FU ) << 6 ;
173 u |= ( static_cast<unicode_type>(p[1]) & 0x3FU ) << 0 ;
174 }
175 else if( ( p[0] & 0xE0U ) == 0xC0U )
176 {
177 //d = 2U ;
178 }
179 else if( ( p[0] & 0xF0U ) == 0xE0U && n > 2U && // 1110...
180 !( ( p[0] & 0x0FU ) == 0U && ( p[1] & 0x20U ) == 0U ) && // (not overlong)
181 !( ( p[0] & 0x0FU ) == 0x0DU && ( p[1] & 0x20U ) == 0x20U ) && // (not UTF-16 surrogate)
182 ( p[1] & 0xC0 ) == 0x80U &&
183 ( p[2] & 0xC0 ) == 0x80U )
184 {
185 d = 3U ;
186 u = ( static_cast<unicode_type>(p[0]) & 0x0FU ) << 12 ;
187 u |= ( static_cast<unicode_type>(p[1]) & 0x3FU ) << 6 ;
188 u |= ( static_cast<unicode_type>(p[2]) & 0x3FU ) << 0 ;
189 }
190 else if( ( p[0] & 0xF0U ) == 0xE0U )
191 {
192 //d = 3U ;
193 }
194 else if( ( p[0] & 0xF8U ) == 0xF0U && n > 3U && // 11110...
195 !( ( p[0] & 0x07U ) == 0U && ( p[1] & 0x30U ) == 0U ) && // (not overlong)
196 ( p[1] & 0xC0 ) == 0x80U &&
197 ( p[2] & 0xC0 ) == 0x80U &&
198 ( p[3] & 0xC0 ) == 0x80U )
199 {
200 d = 4U ;
201 u = ( static_cast<unicode_type>(p[0]) & 0x07U ) << 18 ;
202 u |= ( static_cast<unicode_type>(p[1]) & 0x3FU ) << 12 ;
203 u |= ( static_cast<unicode_type>(p[2]) & 0x3FU ) << 6 ;
204 u |= ( static_cast<unicode_type>(p[3]) & 0x3FU ) << 0 ;
205 }
206 else if( ( p[0] & 0xF8U ) == 0xF0U )
207 {
208 //d = 4U ;
209 }
210 else // 11111...
211 {
212 }
213 return std::make_pair( u , d ) ;
214}
215
216void G::Convert::u8parse( std::string_view s , ParseFn fn )
217{
218 const unsigned char * p = reinterpret_cast<const unsigned char*>( s.data() ) ;
219 std::size_t n = s.size() ;
220 std::size_t d = 0U ;
221 for( std::size_t i = 0U ; i < n ; i += d , p += d )
222 {
223 auto pair = u8in( p , n-i ) ;
224 if( pair.first == unicode_error ) pair.first = L'\xFFFD' ;
225 if( !fn( pair.first , pair.second , i ) ) break ;
226 d = std::max( pair.second , std::size_t(1U) ) ;
227 }
228}
229
230std::string G::Convert::narrowImp( const wchar_t * p_in , std::size_t n_in )
231{
232 std::size_t n_out = narrowImp( p_in , n_in , nullptr ) ;
233 std::string s ;
234 s.resize( n_out ) ;
235 G::Convert::narrowImp( p_in , n_in , &*s.begin() ) ;
236 return s ;
237}
238
239std::size_t G::Convert::narrowImp( const wchar_t * p_in , std::size_t n_in , char * p_out ) noexcept
240{
241 std::size_t n_out = 0U ;
242 std::size_t d = 1U ;
243 for( std::size_t i = 0U ; i < n_in ; i += d , p_in += d )
244 {
245 unicode_type u = unicode_cast( p_in[0] ) ;
246 bool error = false ;
247 d = 1U ;
248 if( m_utf16 ) // UTF-16 in
249 {
250 auto u1 = unicode_cast( m_utf16 && (i+1U) < n_in ? p_in[1] : L'\0' ) ;
251 const bool u0_high = u >= 0xD800U && u <= 0xDBFFU ;
252 const bool u0_low = u >= 0xDC00U && u <= 0xDFFFU ;
253 const bool u1_low = u1 >= 0xDC00U && u1 <= 0xDFFFU ;
254 if( u0_high && u1_low )
255 {
256 d = 2U ;
257 u = 0x10000U | ((u & 0x3FFU) << 10) | (u1 & 0x3FFU) ;
258 }
259 else if( u0_high || u0_low )
260 {
261 error = true ;
262 }
263 }
264
265 // UTF-8 out
266 if( error || u > 0x10FFFFU )
267 {
268 if( p_out )
269 {
270 *p_out++ = '\xEF' ;
271 *p_out++ = '\xBF' ;
272 *p_out++ = '\xBD' ;
273 }
274 n_out += 3 ;
275 }
276 else
277 {
278 n_out += u8out( u , p_out ) ;
279 }
280 }
281 return n_out ;
282}
283
284std::size_t G::Convert::u8out( unicode_type u , char* & p_out ) noexcept
285{
286 if( u > 0x10FFFFU )
287 {
288 return 0U ;
289 }
290 else if( u <= 0x7FU )
291 {
292 if( p_out )
293 *p_out++ = static_cast<unsigned char>(u) ; // NOLINT
294 return 1U ;
295 }
296 else if( u >= 0x80U && u <= 0x7FFU )
297 {
298 if( p_out )
299 {
300 *p_out++ = char_cast( 0xC0U | ((u >> 6) & 0x1FU) ) ;
301 *p_out++ = char_cast( 0x80U | ((u >> 0) & 0x3FU) ) ;
302 }
303 return 2U ;
304 }
305 else if( u >= 0x800U && u <= 0xFFFFU )
306 {
307 if( p_out )
308 {
309 *p_out++ = char_cast( 0xE0U | ((u >> 12) & 0x0FU) ) ;
310 *p_out++ = char_cast( 0x80U | ((u >> 6) & 0x3FU) ) ;
311 *p_out++ = char_cast( 0x80U | ((u >> 0) & 0x3FU) ) ;
312 }
313 return 3U ;
314 }
315 else
316 {
317 if( p_out )
318 {
319 *p_out++ = char_cast( 0xF0U | ((u >> 18) & 0x07U) ) ;
320 *p_out++ = char_cast( 0x80U | ((u >> 12) & 0x3FU) ) ;
321 *p_out++ = char_cast( 0x80U | ((u >> 6) & 0x3FU) ) ;
322 *p_out++ = char_cast( 0x80U | ((u >> 0) & 0x3FU) ) ;
323 }
324 return 4U ;
325 }
326}
327
328G::Convert::unicode_type G::Convert::unicode_cast( wchar_t c ) noexcept
329{
330 return static_cast<unicode_type>( static_cast<std::make_unsigned<wchar_t>::type>(c) ) ; // NOLINT clang-tidy confusion
331}
332
333char G::Convert::char_cast( unsigned int i ) noexcept
334{
335 return static_cast<char>( static_cast<unsigned char>( i ) ) ; // NOLINT narrowing
336}
337
static std::wstring widen(std::string_view)
Widens from UTF-8 to UTF-16/UCS-4 wstring.
Definition: gconvert.cpp:38
static std::pair< unicode_type, std::size_t > u8in(const unsigned char *, std::size_t n) noexcept
Reads a Unicode character from a UTF-8 buffer together with the number of bytes consumed.
Definition: gconvert.cpp:153
static bool invalid(const std::wstring &)
Returns true if the string contains L'\xFFFD'.
Definition: gconvert.cpp:74
static void u8parse(std::string_view, ParseFn)
Calls a function for each Unicode value in the given UTF-8 string.
Definition: gconvert.cpp:216
static std::string narrow(const std::wstring &)
Narrows from UTF-16/UCS-4 wstring to UTF-8.
Definition: gconvert.cpp:53
static std::size_t u8out(unicode_type, char *&) noexcept
Puts a Unicode character value into a character buffer with UTF-8 encoding.
Definition: gconvert.cpp:284
static bool valid(std::string_view) noexcept
Returns true if the string is valid UTF-8.
Definition: gconvert.cpp:44
static bool utf16(bool)
Forces UTF-16 even if wchar_t is 4 bytes. Used in testing.
Definition: gconvert.cpp:30