E-MailRelay
gidn.cpp
Go to the documentation of this file.
1//
2// Copyright (C) 2001-2024 Graeme Walker <graeme_walker@users.sourceforge.net>
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program. If not, see <http://www.gnu.org/licenses/>.
16// ===
17///
18/// \file gidn.cpp
19///
20
21#include "gdef.h"
22#include "gidn.h"
23#include "gstr.h"
24#include "gconvert.h"
25#include "gstringtoken.h"
26#include "glog.h"
27#include <vector>
28#include <limits>
29#include <iomanip>
30#include <algorithm>
31#include <iterator>
32
// Forward declaration of the private implementation class used by G::Idn.
namespace G { class IdnImp ; }
37
38class G::IdnImp
39{
40public:
41 IdnImp() ;
42 IdnImp & encode( std::string_view domain ) ;
43 std::string result() const { return m_output ; }
44 static bool is7Bit( std::string_view s ) noexcept ;
45
46private:
47 using unicode_type = G::Convert::unicode_type ;
48 using value_type = g_uint32_t ;
49 using List = std::vector<unicode_type> ;
50
51private:
52 void outputPunycode( std::string_view ) ;
53 static bool parse( List& , unicode_type , std::size_t , std::size_t ) ;
54 static value_type adapt( value_type d , value_type n , bool first ) noexcept ;
55 struct div_t { value_type quot ; value_type rem ; } ;
56 static div_t div( value_type numerator , value_type demoninator ) noexcept ;
57 static value_type clamp( value_type v , value_type lo , value_type hi ) noexcept ;
58 static void check( bool ) ;
59 static void check( value_type , value_type , value_type ) ;
60 static bool is7Bit_( char ) noexcept ;
61
62private:
63 static constexpr value_type c_skew = 38U ;
64 static constexpr value_type c_damp = 700U ;
65 static constexpr value_type c_base = 36U ;
66 static constexpr value_type c_tmin = 1U ;
67 static constexpr value_type c_tmax = 26U ;
68 static constexpr value_type c_initial_bias = 72U ;
69 static constexpr value_type c_initial_n = 128U ;
70 std::string m_output ;
71 List m_ulist ;
72} ;
73
74bool G::Idn::valid( std::string_view domain )
75{
76 // TODO full IDN validation
77
78 if( domain.empty() || !G::Str::isPrintable(domain) )
79 return false ;
80
81 for( G::StringTokenView t(domain,".",1U) ; t ; ++t )
82 {
83 if( t().empty() || ( !G::Str::isPrintableAscii(t()) && !G::Convert::valid(t()) ) )
84 return false ;
85 }
86 return true ;
87}
88
89std::string G::Idn::encode( std::string_view domain )
90{
91 if( domain.empty() || IdnImp::is7Bit(domain) )
92 return G::sv_to_string( domain ) ;
93 else
94 return IdnImp().encode(domain).result() ;
95}
96
97// ==
98
99G::IdnImp::IdnImp()
100= default ;
101
102G::IdnImp & G::IdnImp::encode( std::string_view domain )
103{
104 m_output.reserve( domain.size() * 2U ) ;
105 m_ulist.reserve( domain.size() ) ;
106 bool first = true ;
107 for( G::StringTokenView t(domain,".",1U) ; t ; ++t , first = false )
108 {
109 m_output.append( first?0U:1U , '.' ) ;
110 if( is7Bit(t()) )
111 {
112 m_output.append( t.data() , t.size() ) ;
113 }
114 else
115 {
116 m_output.append( "xn--" , 4U ) ;
117 outputPunycode( t() ) ;
118 }
119 }
120 return *this ;
121}
122
123void G::IdnImp::outputPunycode( std::string_view label )
124{
125 // RFC-3492 pseudocode transliteration...
126
127 std::size_t b0 = m_output.size() ;
128 std::copy_if( label.begin() , label.end() , std::back_inserter(m_output) , &IdnImp::is7Bit_ ) ;
129 value_type b = static_cast<value_type>( m_output.size() - b0 ) ;
130 if( b ) m_output.append( 1U , '-' ) ;
131
132 using namespace std::placeholders ;
133 G::Convert::u8parse( label , std::bind(&IdnImp::parse,std::ref(m_ulist),_1,_2,_3) ) ;
134
135 static constexpr std::string_view c_map { "abcdefghijklmnopqrstuvwxyz0123456789" , 36U } ;
136 value_type n = c_initial_n ;
137 value_type delta = 0 ;
138 value_type bias = c_initial_bias ;
139 for( value_type h = b ; h < static_cast<value_type>(m_ulist.size()) ; delta++ , n++ )
140 {
141 auto m_p = std::min_element( m_ulist.begin() , m_ulist.end() ,
142 [n](unicode_type a_,unicode_type b_){return (a_<n?0x110000U:a_) < (b_<n?0x110000U:b_) ;} ) ;
143
144 G_ASSERT( m_p != m_ulist.end() && *m_p >= n ) ;
145 check( m_p != m_ulist.end() && *m_p >= n ) ;
146 G_DEBUG( "idn: next code point is " << std::hex << std::setfill('0') << std::setw(4U) << *m_p ) ;
147 check( delta , *m_p-n , h+1U ) ;
148
149 delta += (*m_p-n) * (h+1U) ;
150 n = *m_p ;
151 for( std::size_t i = 0U ; i < m_ulist.size() ; i++ ) // NOLINT modernize-loop-convert
152 {
153 if( m_ulist[i] < n ) { delta++ ; check( delta != 0U ) ; }
154 if( m_ulist[i] == n )
155 {
156 auto q = delta ;
157 const auto output_size = m_output.size() ; GDEF_IGNORE_VARIABLE(output_size) ; // NOLINT
158 for( value_type k = c_base ;; k += c_base )
159 {
160 value_type t = clamp( k-std::min(k,bias) , c_tmin , c_tmax ) ; // 1..26
161 if( q < t ) break ;
162 auto x = div( q-t , c_base-t ) ; static_assert(c_base>c_tmax,"") ;
163 q = x.quot ;
164 m_output.push_back( c_map.at(std::size_t(t)+x.rem) ) ;
165 }
166 m_output.push_back( c_map.at(q) ) ;
167 G_DEBUG( "idn: delta " << delta << ", encodes as \"" << m_output.substr(output_size) << "\"" ) ;
168 bias = adapt( delta , h+1U , h == b ) ;
169 G_DEBUG( "idn: bias becomes " << bias ) ;
170 delta = 0 ;
171 ++h ;
172 }
173 }
174 }
175}
176
177G::IdnImp::value_type G::IdnImp::adapt( value_type d , value_type n , bool first ) noexcept
178{
179 d /= ( first ? c_damp : 2U ) ;
180 d += ( d / n ) ;
181 value_type k = 0U ;
182 for( ; d > ((c_base-c_tmin)*c_tmax)/2 ; k += c_base )
183 d /= (c_base-c_tmin) ;
184 return k + ((c_base-c_tmin+1U)*d) / (d+c_skew) ;
185}
186
187bool G::IdnImp::parse( List & output , unicode_type u , std::size_t , std::size_t )
188{
189 output.push_back( {u} ) ;
190 return true ;
191}
192
193G::IdnImp::value_type G::IdnImp::clamp( value_type v , value_type lo , value_type hi ) noexcept
194{
195 // std::clamp() is c++17
196 return v < lo ? lo : ( hi < v ? hi : v ) ;
197}
198
199bool G::IdnImp::is7Bit_( char c ) noexcept
200{
201 return ( static_cast<unsigned char>(c) & 0x80U ) == 0U ;
202}
203
204bool G::IdnImp::is7Bit( std::string_view s ) noexcept
205{
206 return std::all_of( s.begin() , s.end() , &IdnImp::is7Bit_ ) ;
207}
208
209G::IdnImp::div_t G::IdnImp::div( value_type top , value_type bottom ) noexcept
210{
211 // cf. std::div()
212 return { top/bottom , top%bottom } ;
213}
214
215void G::IdnImp::check( bool b )
216{
217 if( !b )
218 throw Idn::Error() ; // never gets here
219}
220
221void G::IdnImp::check( value_type a , value_type b , value_type c )
222{
223 constexpr value_type maxint = std::numeric_limits<value_type>::max() ;
224 if( c != 0U && b > (maxint-a)/c )
225 throw Idn::Error( "domain name too long: numeric overflow multiplying by " + std::to_string(c) ) ;
226}
227
static void u8parse(std::string_view, ParseFn)
Calls a function for each Unicode value in the given UTF-8 string.
Definition: gconvert.cpp:216
static bool valid(std::string_view) noexcept
Returns true if the string is valid UTF-8.
Definition: gconvert.cpp:44
static bool isPrintable(std::string_view s) noexcept
Returns true if every character is 0x20 or above but not 0x7f.
Definition: gstr.cpp:418
static bool isPrintableAscii(std::string_view s) noexcept
Returns true if every character is between 0x20 and 0x7e inclusive.
Definition: gstr.cpp:413
A zero-copy string token iterator where the token separators are runs of whitespace characters,...
Definition: gstringtoken.h:54
std::string encode(std::string_view domain)
Returns the given domain with A-labels.
Definition: gidn.cpp:89
bool valid(std::string_view domain)
Returns true if the given domain is valid with U-labels and/or A-labels.
Definition: gidn.cpp:74
Low-level classes.
Definition: garg.h:36