32 unsigned int utf8::from_hex(
char c)
36 if (c >=
'0' && c <=
'9')
40 else if (c >=
'a' && c <=
'f')
50 char utf8::to_hex(
unsigned int i)
66 char utf8::next_char(
const string &s,
size_t &index)
78 char32_t utf8::parse_hex(
const string &src,
size_t &src_index)
82 res |= from_hex(next_char(src, src_index)) << 12;
83 res |= from_hex(next_char(src, src_index)) << 8;
84 res |= from_hex(next_char(src, src_index)) << 4;
85 res |= from_hex(next_char(src, src_index));
90 void utf8::set_and_check_unicode_byte(
string &dst,
size_t &dst_index,
char c)
102 void utf8::parse_unicode(
111 if ((src_index + 10) <= src.size() &&
112 src[src_index + 4] ==
'\\' &&
113 src[src_index + 5] ==
'u')
115 char32_t high_surrogate = parse_hex(src, src_index);
117 (void)next_char(src, src_index);
118 (void)next_char(src, src_index);
120 char32_t low_surrogate = parse_hex(src, src_index);
122 uc = ((high_surrogate - 0xD800) << 10 | (low_surrogate - 0xDC00)) + 0x10000;
126 uc = parse_hex(src, src_index);
132 if (valid_unicode(uc))
136 dst[dst_index++] =
static_cast<char>(uc);
140 set_and_check_unicode_byte(dst, dst_index, static_cast<char>((uc >> 6) | 0xc0));
141 set_and_check_unicode_byte(dst, dst_index, static_cast<char>((uc & 0x3f) | 0x80));
143 else if (uc < 0x10000)
145 set_and_check_unicode_byte(dst, dst_index, static_cast<char>((uc >> 12) | 0xe0));
146 set_and_check_unicode_byte(dst, dst_index, static_cast<char>(((uc >> 6) & 0x3f) | 0x80));
147 set_and_check_unicode_byte(dst, dst_index, static_cast<char>((uc & 0x3f) | 0x80));
151 set_and_check_unicode_byte(dst, dst_index, static_cast<char>((uc >> 18) | 0xf0));
152 set_and_check_unicode_byte(dst, dst_index, static_cast<char>(((uc >> 12) & 0x3f) | 0x80));
153 set_and_check_unicode_byte(dst, dst_index, static_cast<char>(((uc >> 6) & 0x3f) | 0x80));
154 set_and_check_unicode_byte(dst, dst_index, static_cast<char>((uc & 0x3f) | 0x80));
163 unique_ptr<string> utf8::json_string_to_utf8(
const string &src)
165 size_t src_index = 0;
166 size_t dst_index = 0;
168 string *dst =
new string(src.size(),
'\0');
170 unique_ptr<string> res(dst);
172 while (src_index < src.size())
174 if (src[src_index] !=
'\\')
176 (*dst)[dst_index++] = src[src_index++];
180 char c = next_char(src, ++src_index);
187 (*dst)[dst_index++] = c;
190 (*dst)[dst_index++] =
'\b';
193 (*dst)[dst_index++] =
'\f';
196 (*dst)[dst_index++] =
'\n';
199 (*dst)[dst_index++] =
'\r';
202 (*dst)[dst_index++] =
'\t';
205 parse_unicode(src, src_index, *dst, dst_index);
213 dst->resize(dst_index);
217 unsigned int utf8::utf8_length(uint8_t c)
223 else if ((c & 0xe0) == 0xc0)
227 else if ((c & 0xf0) == 0xe0)
231 else if ((c & 0xf8) == 0xf0)
241 unique_ptr<string> utf8::utf8_to_json_string(
const string &src)
243 unique_ptr<string> res(
new string);
246 res->reserve(src.size());
248 size_t src_index = 0;
250 while (src_index < src.size())
252 char32_t uc = src[src_index++] & 0xff;
254 unsigned int l = utf8_length(uc);
261 uc = ((uc << 6) & 0x7ff) | (next_char(src, src_index) & 0x3f);
264 uc = ((uc << 12) & 0xffff) | ((next_char(src, src_index) << 6) & 0xfff);
265 uc |= next_char(src, src_index) & 0x3f;
268 uc = ((uc << 18) & 0x1fffff) | ((next_char(src, src_index) << 12) & 0x3ffff);
269 uc |= (next_char(src, src_index) << 6) & 0xfff;
270 uc |= next_char(src, src_index) & 0x3f;
302 *res +=
static_cast<char>(uc);
305 else if (uc <= 0xffff)
307 add_hex_string(res, uc);
312 add_hex_string(res, (uc >> 10) + 0xD800);
313 add_hex_string(res, (uc & 0x3ff) + 0xDC00);
320 bool utf8::valid_unicode(char32_t uc)
322 return (uc <= 0x0010ffffu) && !(uc >= 0xd800u && uc <= 0xdfffu);
325 void utf8::add_hex_string(unique_ptr<string> &dst, char32_t uc)
328 *dst += to_hex((uc >> 12) & 0x0f);
329 *dst += to_hex((uc >> 8) & 0x0f);
330 *dst += to_hex((uc >> 4) & 0x0f);
331 *dst += to_hex(uc & 0x0f);
#define NAMESPACE
You can change the namespace of the whole library by changing this value.
Common definitions needed everywhere and, as far as possible, platform-specific changes.
The json_utf8_exception class.
Exception class for errors translating to and from UTF-8 strings.