Implement support for \u escapes

This commit is contained in:
Petri Lehtinen 2009-07-14 20:47:57 +03:00
parent a0435e3061
commit 9240146c10
5 changed files with 152 additions and 13 deletions

View File

@ -83,9 +83,13 @@ static void error_set(json_error_t *error, const lex_t *lex,
error->line = lex->line;
if(saved_text && saved_text[0])
{
if(lex->saved_text.length <= 20) {
snprintf(error->text, JSON_ERROR_TEXT_LENGTH,
"%s near '%s'", text, saved_text);
}
else
snprintf(error->text, JSON_ERROR_TEXT_LENGTH, "%s", text);
}
else
{
snprintf(error->text, JSON_ERROR_TEXT_LENGTH,
@ -208,11 +212,36 @@ static void lex_save_cached(lex_t *lex)
}
}
/*
 * Decode the four hexadecimal digits of a JSON \uXXXX escape.
 *
 * str must point at the 'u' of the escape (enforced with assert) and
 * must be followed by a NUL-terminated string or at least four readable
 * characters.
 *
 * Returns the decoded value in the range 0x0000..0xFFFF, or -1 if any
 * of the four characters following 'u' is not a hexadecimal digit.
 */
static int decode_unicode_escape(const char *str)
{
    int i;
    int value = 0;

    assert(str[0] == 'u');

    for(i = 1; i <= 4; i++) {
        char c = str[i];
        int digit;

        /* Explicit ASCII ranges rather than <ctype.h> classification:
           islower()/isupper() would also accept 'g'..'z' and produce a
           bogus value, and passing a negative char to them is undefined
           behaviour. */
        if(c >= '0' && c <= '9')
            digit = c - '0';
        else if(c >= 'a' && c <= 'f')
            digit = c - 'a' + 10;
        else if(c >= 'A' && c <= 'F')
            digit = c - 'A' + 10;
        else
            return -1; /* not a hex digit (this also stops at an early NUL) */

        value = (value << 4) + digit;
    }

    return value;
}
static void lex_scan_string(lex_t *lex, json_error_t *error)
{
char c;
const char *p;
char *t;
int i;
lex->token = TOKEN_INVALID;
@ -240,7 +269,7 @@ static void lex_scan_string(lex_t *lex, json_error_t *error)
c = lex_get_save(lex, error);
if(c == 'u') {
c = lex_get_save(lex, error);
for(int i = 0; i < 4; i++) {
for(i = 0; i < 4; i++) {
if(!isxdigit(c)) {
lex_unget_unsave(lex, c);
error_set(error, lex, "invalid escape");
@ -285,12 +314,57 @@ static void lex_scan_string(lex_t *lex, json_error_t *error)
if(*p == '\\') {
p++;
if(*p == 'u') {
/* TODO */
error_set(error, lex, "\\u escapes are not yet supported");
free(lex->value.string);
lex->value.string = NULL;
char buffer[4];
int length;
int value;
value = decode_unicode_escape(p);
p += 5;
if(0xD800 <= value && value <= 0xDBFF) {
/* surrogate pair */
if(*p == '\\' && *(p + 1) == 'u') {
int value2 = decode_unicode_escape(++p);
p += 5;
if(0xDC00 <= value2 && value2 <= 0xDFFF) {
/* valid second surrogate */
value = ((value - 0xD800) << 10) +
(value2 - 0xDC00) +
0x10000;
}
else {
/* invalid second surrogate */
error_set(error, lex,
"invalid Unicode '\\u%04X\\u%04X'",
value, value2);
goto out;
} else {
}
}
else {
/* no second surrogate */
error_set(error, lex, "invalid Unicode '\\u%04X'",
value);
goto out;
}
}
else if(0xDC00 <= value && value <= 0xDFFF) {
error_set(error, lex, "invalid Unicode '\\u%04X'", value);
goto out;
}
else if(value == 0)
{
error_set(error, lex, "\\u0000 is not allowed");
goto out;
}
if(utf8_encode(value, buffer, &length))
assert(0);
memcpy(t, buffer, length);
t += length;
}
else {
switch(*p) {
case '"': case '\\': case '/':
*t = *p; break;
@ -301,13 +375,12 @@ static void lex_scan_string(lex_t *lex, json_error_t *error)
case 't': *t = '\t'; break;
default: assert(0);
}
t++;
p++;
}
}
else
*t = *p;
t++;
p++;
*(t++) = *(p++);
}
*t = '\0';
lex->token = TOKEN_STRING;

View File

@ -1,5 +1,41 @@
#include <string.h>
/*
 * Encode a single Unicode code point as UTF-8.
 *
 * codepoint: value to encode; accepted range is 0..0x10FFFF.
 * buffer:    receives the encoded bytes; needs room for up to 4 bytes.
 * size:      receives the number of bytes written (1 to 4).
 *
 * Returns 0 on success. Returns -1 when codepoint is outside the
 * accepted range, in which case neither buffer nor *size is written.
 */
int utf8_encode(int codepoint, char *buffer, int *size)
{
    /* Reject everything outside the encodable range up front. */
    if(codepoint < 0 || codepoint > 0x10FFFF)
        return -1;

    if(codepoint < 0x80) {
        /* 0xxxxxxx */
        buffer[0] = (char)codepoint;
        *size = 1;
        return 0;
    }

    if(codepoint < 0x800) {
        /* 110xxxxx 10xxxxxx */
        buffer[0] = (char)(0xC0 | (codepoint >> 6));
        buffer[1] = (char)(0x80 | (codepoint & 0x3F));
        *size = 2;
        return 0;
    }

    if(codepoint < 0x10000) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        buffer[0] = (char)(0xE0 | (codepoint >> 12));
        buffer[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
        buffer[2] = (char)(0x80 | (codepoint & 0x3F));
        *size = 3;
        return 0;
    }

    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    buffer[0] = (char)(0xF0 | (codepoint >> 18));
    buffer[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
    buffer[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
    buffer[3] = (char)(0x80 | (codepoint & 0x3F));
    *size = 4;
    return 0;
}
int utf8_check_first(char byte)
{
unsigned char u = (unsigned char)byte;

View File

@ -1,6 +1,8 @@
#ifndef UTF_H
#define UTF_H
int utf8_encode(int codepoint, char *buffer, int *size);
int utf8_check_first(char byte);
int utf8_check_full(const char *buffer, int size);

20
test/testdata/invalid vendored
View File

@ -136,3 +136,23 @@ invalid token near '-0'
====
1
control character 0x9 near '"'
========
["\u0000 (null byte not allowed)"]
====
1
\u0000 is not allowed
========
["\uDADA (first surrogate without the second)"]
====
1
invalid Unicode '\uDADA'
========
["\uD888\u3210 (first surrogate and invalid second surrogate)"]
====
1
invalid Unicode '\uD888\u3210'
========
["\uDFAA (second surrogate on it's own)"]
====
1
invalid Unicode '\uDFAA'

8
test/testdata/valid vendored
View File

@ -8,6 +8,14 @@
========
["\"\\\/\b\f\n\r\t"]
========
["\u002c one-byte UTF-8"]
========
["\u0123 two-byte UTF-8"]
========
["\u0821 three-byte UTF-8"]
========
["\uD834\uDD1E surrogate, four-byte UTF-8"]
========
[0]
========
[1]