Argo  1.0
A C++ library for handling JSON.
utf8.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2017 Andrew Haisley
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a copy
5  * of this software and associated documentation files (the "Software"), to deal
6  * in the Software without restriction, including without limitation the rights
7  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8  * copies of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in all
12  * copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20  * SOFTWARE.
21  */
22 
24 
25 #include "common.hpp"
26 #include "utf8.hpp"
27 #include "json_utf8_exception.hpp"
28 
29 using namespace std;
30 using namespace NAMESPACE;
31 
32 unsigned int utf8::from_hex(char c)
33 {
34  c = tolower(c);
35 
36  if (c >= '0' && c <= '9')
37  {
38  return c - '0';
39  }
40  else if (c >= 'a' && c <= 'f')
41  {
42  return c - 'a' + 10;
43  }
44  else
45  {
46  throw json_utf8_exception(json_utf8_exception::invalid_hex_char_e, c);
47  }
48 }
49 
50 char utf8::to_hex(unsigned int i)
51 {
52  if (i <= 9)
53  {
54  return '0' + i;
55  }
56  else if (i <= 15)
57  {
58  return 'a' + i - 10;
59  }
60  else
61  {
62  throw json_utf8_exception(json_exception::invalid_hex_number_e, i);
63  }
64 }
65 
66 char utf8::next_char(const string &s, size_t &index)
67 {
68  if (index < s.size())
69  {
70  return s[index++];
71  }
72  else
73  {
74  throw json_utf8_exception(json_utf8_exception::invalid_string_encoding_e, s);
75  }
76 }
77 
78 char32_t utf8::parse_hex(const string &src, size_t &src_index)
79 {
80  char32_t res = 0;
81 
82  res |= from_hex(next_char(src, src_index)) << 12;
83  res |= from_hex(next_char(src, src_index)) << 8;
84  res |= from_hex(next_char(src, src_index)) << 4;
85  res |= from_hex(next_char(src, src_index));
86 
87  return res;
88 }
89 
90 void utf8::set_and_check_unicode_byte(string &dst, size_t &dst_index, char c)
91 {
92  if (c == 0)
93  {
94  throw json_utf8_exception(json_utf8_exception::invalid_utf8_char_e, c);
95  }
96  else
97  {
98  dst[dst_index++] = c;
99  }
100 }
101 
102 void utf8::parse_unicode(
103  const string &src,
104  size_t &src_index,
105  string &dst,
106  size_t &dst_index)
107 {
108  char32_t uc = 0;
109 
110  // is this basic multilingual or a UTF-16 surrogate pair?
111  if ((src_index + 10) <= src.size() &&
112  src[src_index + 4] == '\\' &&
113  src[src_index + 5] == 'u')
114  {
115  char32_t high_surrogate = parse_hex(src, src_index);
116 
117  (void)next_char(src, src_index);
118  (void)next_char(src, src_index);
119 
120  char32_t low_surrogate = parse_hex(src, src_index);
121 
122  uc = ((high_surrogate - 0xD800) << 10 | (low_surrogate - 0xDC00)) + 0x10000;
123  }
124  else
125  {
126  uc = parse_hex(src, src_index);
127  }
128 
129  // I would use the nice new C++11 standard conversion templates for this but
130  // they're not fully supported by compilers in the wild just yet.
131 
132  if (valid_unicode(uc))
133  {
134  if (uc < 0x80)
135  {
136  dst[dst_index++] = static_cast<char>(uc);
137  }
138  else if (uc < 0x800)
139  {
140  set_and_check_unicode_byte(dst, dst_index, static_cast<char>((uc >> 6) | 0xc0));
141  set_and_check_unicode_byte(dst, dst_index, static_cast<char>((uc & 0x3f) | 0x80));
142  }
143  else if (uc < 0x10000)
144  {
145  set_and_check_unicode_byte(dst, dst_index, static_cast<char>((uc >> 12) | 0xe0));
146  set_and_check_unicode_byte(dst, dst_index, static_cast<char>(((uc >> 6) & 0x3f) | 0x80));
147  set_and_check_unicode_byte(dst, dst_index, static_cast<char>((uc & 0x3f) | 0x80));
148  }
149  else
150  {
151  set_and_check_unicode_byte(dst, dst_index, static_cast<char>((uc >> 18) | 0xf0));
152  set_and_check_unicode_byte(dst, dst_index, static_cast<char>(((uc >> 12) & 0x3f) | 0x80));
153  set_and_check_unicode_byte(dst, dst_index, static_cast<char>(((uc >> 6) & 0x3f) | 0x80));
154  set_and_check_unicode_byte(dst, dst_index, static_cast<char>((uc & 0x3f) | 0x80));
155  }
156  }
157  else
158  {
159  throw json_utf8_exception(json_utf8_exception::invalid_unicode_e, uc);
160  }
161 }
162 
163 unique_ptr<string> utf8::json_string_to_utf8(const string &src)
164 {
165  size_t src_index = 0;
166  size_t dst_index = 0;
167 
168  string *dst = new string(src.size(), '\0');
169 
170  unique_ptr<string> res(dst);
171 
172  while (src_index < src.size())
173  {
174  if (src[src_index] != '\\')
175  {
176  (*dst)[dst_index++] = src[src_index++];
177  }
178  else
179  {
180  char c = next_char(src, ++src_index);
181 
182  switch (c)
183  {
184  case '"':
185  case '\\':
186  case '/':
187  (*dst)[dst_index++] = c;
188  break;
189  case 'b':
190  (*dst)[dst_index++] = '\b';
191  break;
192  case 'f':
193  (*dst)[dst_index++] = '\f';
194  break;
195  case 'n':
196  (*dst)[dst_index++] = '\n';
197  break;
198  case 'r':
199  (*dst)[dst_index++] = '\r';
200  break;
201  case 't':
202  (*dst)[dst_index++] = '\t';
203  break;
204  case 'u':
205  parse_unicode(src, src_index, *dst, dst_index);
206  break;
207  default:
208  throw json_utf8_exception(json_utf8_exception::invalid_string_escape_e, c);
209  }
210  }
211  }
212 
213  dst->resize(dst_index);
214  return res;
215 }
216 
217 unsigned int utf8::utf8_length(uint8_t c)
218 {
219  if ((c & 0x80) == 0)
220  {
221  return 1;
222  }
223  else if ((c & 0xe0) == 0xc0)
224  {
225  return 2;
226  }
227  else if ((c & 0xf0) == 0xe0)
228  {
229  return 3;
230  }
231  else if ((c & 0xf8) == 0xf0)
232  {
233  return 4;
234  }
235  else
236  {
237  return 0;
238  }
239 }
240 
241 unique_ptr<string> utf8::utf8_to_json_string(const string &src)
242 {
243  unique_ptr<string> res(new string);
244 
245  // We need at least the same number of characters as the source.
246  res->reserve(src.size());
247 
248  size_t src_index = 0;
249 
250  while (src_index < src.size())
251  {
252  char32_t uc = src[src_index++] & 0xff;
253 
254  unsigned int l = utf8_length(uc);
255 
256  switch (l)
257  {
258  case 1:
259  break;
260  case 2:
261  uc = ((uc << 6) & 0x7ff) | (next_char(src, src_index) & 0x3f);
262  break;
263  case 3:
264  uc = ((uc << 12) & 0xffff) | ((next_char(src, src_index) << 6) & 0xfff);
265  uc |= next_char(src, src_index) & 0x3f;
266  break;
267  case 4:
268  uc = ((uc << 18) & 0x1fffff) | ((next_char(src, src_index) << 12) & 0x3ffff);
269  uc |= (next_char(src, src_index) << 6) & 0xfff;
270  uc |= next_char(src, src_index) & 0x3f;
271  break;
272  default:
273  throw json_utf8_exception(json_utf8_exception::invalid_utf8_sequence_length_e, l);
274  }
275 
276  if (uc <= 0xff)
277  {
278  switch (uc)
279  {
280  case '"':
281  *res += "\\\"";
282  break;
283  case '\\':
284  *res += "\\\\";
285  break;
286  case '\b':
287  *res += "\\b";
288  break;
289  case '\f':
290  *res += "\\f";
291  break;
292  case '\n':
293  *res += "\\n";
294  break;
295  case '\r':
296  *res += "\\r";
297  break;
298  case '\t':
299  *res += "\\t";
300  break;
301  default:
302  *res += static_cast<char>(uc);
303  }
304  }
305  else if (uc <= 0xffff)
306  {
307  add_hex_string(res, uc);
308  }
309  else
310  {
311  uc -= 0x10000;
312  add_hex_string(res, (uc >> 10) + 0xD800);
313  add_hex_string(res, (uc & 0x3ff) + 0xDC00);
314  }
315  }
316 
317  return res;
318 }
319 
320 bool utf8::valid_unicode(char32_t uc)
321 {
322  return (uc <= 0x0010ffffu) && !(uc >= 0xd800u && uc <= 0xdfffu);
323 }
324 
325 void utf8::add_hex_string(unique_ptr<string> &dst, char32_t uc)
326 {
327  *dst += "\\u";
328  *dst += to_hex((uc >> 12) & 0x0f);
329  *dst += to_hex((uc >> 8) & 0x0f);
330  *dst += to_hex((uc >> 4) & 0x0f);
331  *dst += to_hex(uc & 0x0f);
332 }
#define NAMESPACE
You can change the namespace of the whole library by changing this value.
Definition: common.hpp:29
STL namespace.
Common defs needed everywhere and, as far as is possible, platform specific changes.
The json_utf8_exception class.
The utf8 class.
Exception class for errors translating to and from UTF-8 strings.