E-MailRelay
gconvert.h
Go to the documentation of this file.
1//
2// Copyright (C) 2001-2024 Graeme Walker <graeme_walker@users.sourceforge.net>
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program. If not, see <http://www.gnu.org/licenses/>.
16// ===
17///
18/// \file gconvert.h
19///
20
21#ifndef G_CONVERT_H
22#define G_CONVERT_H
23
24#include "gdef.h"
25#include "gstringview.h"
26#include "gexception.h"
27#include <string>
28#include <vector>
29#include <type_traits>
30#include <cstdint> // std::uint_least32_t
31#include <functional>
32
33namespace G
34{
35 class Convert ; // forward declaration; the class body appears later in this header
36}
37
38//| \class G::Convert
39/// A static class which provides string encoding conversion functions between
40/// UTF-8 and wchar_t. On Unix wchar_t strings are unencoded UCS-4; on Windows
41/// wchar_t strings are UTF-16.
42///
43class G::Convert
44{
45public:
46 G_EXCEPTION_CLASS( NarrowError , tx("string character-set narrowing error") )
47 G_EXCEPTION_CLASS( WidenError , tx("string character-set widening error") )
48 using unicode_type = std::uint_least32_t ; // integer type used for a single Unicode character value
49 static_assert( sizeof(unicode_type) >= sizeof(wchar_t) , "" ) ; // unicode_type must be at least as large as wchar_t
50 static constexpr unicode_type unicode_error = ~(unicode_type)0 ; // all-bits-set sentinel, returned by u8in() on error
51 using ParseFn = std::function<bool(unicode_type,std::size_t,std::size_t)> ; // callback type for u8parse()
52
53 static std::wstring widen( std::string_view ) ;
54 ///< Widens from UTF-8 to UTF-16/UCS-4 wstring. Invalid input characters
55 ///< are substituted with L'\xFFFD'.
56
57 static bool valid( std::string_view ) noexcept ;
58 ///< Returns true if the string is valid UTF-8.
59
60 static std::string narrow( const std::wstring & ) ;
61 ///< Narrows from UTF-16/UCS-4 wstring to UTF-8. Invalid input characters
62 ///< are substituted with u8"\uFFFD", ie. "\xEF\xBF\xBD".
63
64 static std::string narrow( const wchar_t * ) ;
65 ///< Pointer overload. NOTE(review): no length is passed, so the string is presumably null-terminated -- confirm.
66
67 static std::string narrow( const wchar_t * , std::size_t n ) ;
68 ///< String-view-style overload taking a pointer and a length. NOTE(review): n is presumably a count of wchar_t characters, not bytes -- confirm.
69
70 static bool invalid( const std::wstring & ) ;
71 ///< Returns true if the string contains L'\xFFFD', ie. the substitution character documented for widen().
72
73 static bool invalid( const std::string & ) ;
74 ///< Returns true if the string contains u8"\uFFFD", ie. the substitution sequence documented for narrow().
75
76 static std::size_t u8out( unicode_type , char * & ) noexcept ;
77 ///< Puts a Unicode character value into a character buffer with
78 ///< UTF-8 encoding. Advances the pointer by reference and returns
79 ///< the number of bytes (1..4). Returns zero on error, without
80 ///< advancing the pointer.
81
82 static std::pair<unicode_type,std::size_t> u8in( const unsigned char * , std::size_t n ) noexcept ;
83 ///< Reads a Unicode character from a UTF-8 buffer together with
84 ///< the number of bytes consumed. Returns [unicode_error,1] on
85 ///< error.
86
87 static void u8parse( std::string_view , ParseFn ) ;
88 ///< Calls a function for each Unicode value in the given
89 ///< UTF-8 string. Stops if the callback returns false. The
90 ///< callback parameters are: Unicode value (0xFFFD on
91 ///< error), UTF-8 bytes consumed, and UTF-8 byte offset.
92
93 static bool utf16( bool ) ;
94 ///< Forces UTF-16 even if wchar_t is 4 bytes. Used in testing. NOTE(review): the meaning of the return value is not documented here; presumably the previous setting -- confirm against gconvert.cpp.
95
96public:
97 Convert() = delete ; // static-only class, never instantiated
98
99private:
100 static bool m_utf16 ; // NOTE(review): presumably the flag set by utf16() -- confirm against gconvert.cpp
101 static std::wstring widenImp( const char * , std::size_t ) ; // string-returning widening helper
102 static std::size_t widenImp( const char * , std::size_t , wchar_t * , bool * = nullptr ) noexcept ; // buffer-writing widening helper; NOTE(review): role of the optional bool* is not shown here -- confirm
103 static std::string narrowImp( const wchar_t * , std::size_t ) ; // string-returning narrowing helper
104 static std::size_t narrowImp( const wchar_t * , std::size_t , char * ) noexcept ; // buffer-writing narrowing helper
105 static unicode_type unicode_cast( wchar_t c ) noexcept ; // converts a wchar_t to unicode_type
106 static char char_cast( unsigned int ) noexcept ; // converts an unsigned int to char
107} ;
108
109#endif
A static class which provides string encoding conversion functions between UTF-8 and wchar_t.
Definition: gconvert.h:44
static std::wstring widen(std::string_view)
Widens from UTF-8 to UTF-16/UCS-4 wstring.
Definition: gconvert.cpp:38
static std::pair< unicode_type, std::size_t > u8in(const unsigned char *, std::size_t n) noexcept
Reads a Unicode character from a UTF-8 buffer together with the number of bytes consumed.
Definition: gconvert.cpp:153
static bool invalid(const std::wstring &)
Returns true if the string contains L'\xFFFD'.
Definition: gconvert.cpp:74
static void u8parse(std::string_view, ParseFn)
Calls a function for each Unicode value in the given UTF-8 string.
Definition: gconvert.cpp:216
static std::string narrow(const std::wstring &)
Narrows from UTF-16/UCS-4 wstring to UTF-8.
Definition: gconvert.cpp:53
static std::size_t u8out(unicode_type, char *&) noexcept
Puts a Unicode character value into a character buffer with UTF-8 encoding.
Definition: gconvert.cpp:284
static bool valid(std::string_view) noexcept
Returns true if the string is valid UTF-8.
Definition: gconvert.cpp:44
static bool utf16(bool)
Forces UTF-16 even if wchar_t is 4 bytes. Used in testing.
Definition: gconvert.cpp:30
Low-level classes.
Definition: garg.h:36
constexpr const char * tx(const char *p) noexcept
A briefer alternative to G::gettext_noop().
Definition: ggettext.h:84