Decode and check for correct UTF-8

All strings (decoded JSON text, the argument of json_string(), the key argument of json_object_set()) are checked for valid UTF-8.
2009-07-13 21:03:09 +03:00 · 2009-07-13 21:03:09 +03:00 · 902bcdaa5e
commit 902bcdaa5e
parent 625f50f916
6 changed files with 189 additions and 4 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -5,9 +5,12 @@ libjansson_la_SOURCES = \
 	dump.c \
 	hashtable.c \
 	hashtable.h \
+	jansson_private.h \
 	load.c \
 	strbuffer.c \
 	strbuffer.h \
+	utf.c \
+	utf.h \
 	util.h \
 	value.c
 libjansson_la_LDFLAGS = -version-info 0:0:0
--- a/src/jansson_private.h
+++ b/src/jansson_private.h
@ -0,0 +1,8 @@
+#ifndef JANSSON_PRIVATE_H
+#define JANSSON_PRIVATE_H
+/* Called by json_object_set() / json_string() once their UTF-8 check has passed. */
+int json_object_set_nocheck(json_t *json, const char *key, json_t *value);
+json_t *json_string_nocheck(const char *value);
+
+
+#endif
--- a/src/load.c
+++ b/src/load.c
@ -9,7 +9,9 @@
 #include <assert.h>

 #include <jansson.h>
+#include "jansson_private.h"
 #include "strbuffer.h"
+#include "utf.h"

 #define TOKEN_INVALID         -1
 #define TOKEN_EOF              0
@ -101,8 +103,37 @@ static char stream_get(stream_t *stream)
 {
    if(!stream->buffer[stream->buffer_pos])
    {
+        char c;
+
        stream->buffer[0] = stream->get(stream->data);
        stream->buffer_pos = 0;
+
+        c = stream->buffer[0];
+
+        if(c == EOF && stream->eof(stream->data))
+            return EOF;
+
+        if(c < 0)
+        {
+            /* multi-byte UTF-8 sequence */
+            int i, count;
+
+            count = utf8_check_first(c);
+            if(!count)
+                return 0;
+
+            assert(count >= 2);
+
+            for(i = 1; i < count; i++)
+                stream->buffer[i] = stream->get(stream->data);
+
+            if(!utf8_check_full(stream->buffer, count))
+                return 0;
+
+            stream->buffer[count] = '\0';
+        }
+        else
+            stream->buffer[1] = '\0';
    }

    return (char)stream->buffer[stream->buffer_pos++];
@ -439,7 +470,7 @@ static json_t *parse_object(lex_t *lex, json_error_t *error)
            goto error;
        }

-        if(json_object_set(object, key, value)) {
+        if(json_object_set_nocheck(object, key, value)) {
            free(key);
            json_decref(value);
            goto error;
@ -513,7 +544,7 @@ static json_t *parse_value(lex_t *lex, json_error_t *error)

    switch(lex->token) {
        case TOKEN_STRING: {
-            json = json_string(lex->value.string);
+            json = json_string_nocheck(lex->value.string);
            break;
        }

--- a/src/utf.c
+++ b/src/utf.c
@ -0,0 +1,116 @@
+#include <string.h>
+
+/* Classify a byte as the first byte of a UTF-8 sequence.
+
+   Returns the total length in bytes (1-4) of the sequence that the
+   byte introduces: 1 for an ASCII byte, 2-4 for the lead byte of a
+   multi-byte sequence.  Returns 0 when the byte cannot start a valid
+   sequence (continuation byte, overlong lead byte, or a lead byte
+   above 0xF4). */
+int utf8_check_first(char byte)
+{
+    const unsigned char lead = (unsigned char)byte;
+
+    /* 0x00..0x7F: plain ASCII, a complete sequence by itself */
+    if(lead <= 0x7F)
+        return 1;
+
+    /* 0x80..0xBF: continuation bytes, which cannot lead a sequence;
+       0xC0..0xC1: would only start an overlong form of an ASCII byte */
+    if(lead <= 0xC1)
+        return 0;
+
+    /* 0xC2..0xDF: lead byte of a 2-byte sequence */
+    if(lead <= 0xDF)
+        return 2;
+
+    /* 0xE0..0xEF: lead byte of a 3-byte sequence */
+    if(lead <= 0xEF)
+        return 3;
+
+    /* 0xF0..0xF4: lead byte of a 4-byte sequence */
+    if(lead <= 0xF4)
+        return 4;
+
+    /* 0xF5..0xFF: restricted or invalid in UTF-8 */
+    return 0;
+}
+
+/* Validate one complete multi-byte UTF-8 sequence.
+
+   buffer holds the sequence and size is its length in bytes; only
+   sizes 2, 3 and 4 are accepted.  The lead byte contributes its
+   payload bits without being range-checked here.
+
+   Returns 1 if every trailing byte is a continuation byte and the
+   decoded code point is within the Unicode range, is not a UTF-16
+   surrogate half and is not an overlong encoding; 0 otherwise. */
+int utf8_check_full(const char *buffer, int size)
+{
+    const unsigned char lead = (unsigned char)buffer[0];
+    int codepoint, minimum, pos;
+
+    /* Pick the payload bits of the lead byte and the smallest code
+       point that genuinely needs a sequence of this length. */
+    switch(size) {
+        case 2:
+            codepoint = lead & 0x1F;
+            minimum = 0x80;
+            break;
+        case 3:
+            codepoint = lead & 0xF;
+            minimum = 0x800;
+            break;
+        case 4:
+            codepoint = lead & 0x7;
+            minimum = 0x10000;
+            break;
+        default:
+            return 0;
+    }
+
+    /* Each trailing byte must look like 10xxxxxx and adds six bits. */
+    for(pos = 1; pos < size; pos++)
+    {
+        const unsigned char next = (unsigned char)buffer[pos];
+
+        if((next & 0xC0) != 0x80)
+            return 0;
+        codepoint = (codepoint << 6) + (next & 0x3F);
+    }
+
+    if(codepoint > 0x10FFFF)
+        return 0; /* not in Unicode range */
+    if(codepoint >= 0xD800 && codepoint <= 0xDFFF)
+        return 0; /* UTF-16 surrogate half */
+    if(codepoint < minimum)
+        return 0; /* overlong encoding */
+    return 1;
+}
+
+/* Check that a whole string is valid UTF-8.
+
+   length is the number of bytes to examine; pass -1 to have it
+   measured with strlen().  Returns 1 if valid, 0 otherwise. */
+int utf8_check_string(const char *string, int length)
+{
+    int pos = 0;
+
+    if(length == -1)
+        length = strlen(string);
+
+    while(pos < length)
+    {
+        const int width = utf8_check_first(string[pos]);
+        if(!width)
+            return 0;
+        /* A multi-byte sequence must fit in what is left and must
+           decode to a valid code point. */
+        if(width > 1 &&
+           (pos + width > length || !utf8_check_full(&string[pos], width)))
+            return 0;
+        pos += width;
+    }
+
+    return 1;
+}
--- a/src/utf.h
+++ b/src/utf.h
@ -0,0 +1,9 @@
+#ifndef UTF_H
+#define UTF_H
+/* Per-sequence checks: length implied by a lead byte (0 = invalid); validity of one multi-byte sequence. */
+int utf8_check_first(char byte);
+int utf8_check_full(const char *buffer, int size);
+/* Whole-string check: returns 1 if valid UTF-8, 0 if not; length -1 means "measure with strlen()". */
+int utf8_check_string(const char *string, int length);
+
+#endif
--- a/src/value.c
+++ b/src/value.c
@ -4,6 +4,8 @@

 #include <jansson.h>
 #include "hashtable.h"
+#include "jansson_private.h"
+#include "utf.h"
 #include "util.h"

 #define container_of(ptr_, type_, member_)  \
@ -109,7 +111,7 @@ json_t *json_object_get(const json_t *json, const char *key)
    return hashtable_get(&object->hashtable, key);
 }

-int json_object_set(json_t *json, const char *key, json_t *value)
+int json_object_set_nocheck(json_t *json, const char *key, json_t *value)
 {
    json_object_t *object;

@ -120,6 +122,14 @@ int json_object_set(json_t *json, const char *key, json_t *value)
    return hashtable_set(&object->hashtable, strdup(key), json_incref(value));
 }

+/* Validating setter: returns -1 if key is NULL or not valid UTF-8. */
+int json_object_set(json_t *json, const char *key, json_t *value)
+{
+    if(!key || !utf8_check_string(key, -1))
+        return -1; /* NULL is rejected first: the check would strlen() it */
+    return json_object_set_nocheck(json, key, value);
+}
+
 int json_object_del(json_t *json, const char *key)
 {
    json_object_t *object;
@ -255,7 +265,7 @@ int json_array_append(json_t *json, json_t *value)

 /*** string ***/

-json_t *json_string(const char *value)
+json_t *json_string_nocheck(const char *value)
 {
    json_string_t *string = malloc(sizeof(json_string_t));
    if(!string)
@ -266,6 +276,14 @@ json_t *json_string(const char *value)
    return &string->json;
 }

+/* Validating constructor: NULL if value is NULL or not valid UTF-8. */
+json_t *json_string(const char *value)
+{
+    if(!value || !utf8_check_string(value, -1))
+        return NULL; /* NULL is rejected first: the check would strlen() it */
+    return json_string_nocheck(value);
+}
+
 const char *json_string_value(const json_t *json)
 {
    if(!json_is_string(json))