From 99b92fdd6e181aac2bd8fd021cd2718978126f49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C5=99emysl=20Janouch?=
Date: Thu, 7 May 2015 08:34:10 +0200
Subject: [PATCH] Extend UTF-8 code a bit
---
liberty.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 47 insertions(+), 8 deletions(-)
diff --git a/liberty.c b/liberty.c
index 0f37a93..8ea860f 100644
--- a/liberty.c
+++ b/liberty.c
@@ -1843,9 +1843,8 @@ isspace_ascii (int c)
// --- UTF-8 -------------------------------------------------------------------
/// Return a pointer to the next UTF-8 character, or NULL on error
-// TODO: decode the sequence while we're at it
static const char *
-utf8_next (const char *s, size_t len)
+utf8_next (const char *s, size_t len, int32_t *codepoint)
{
// End of string, we go no further
if (!len)
@@ -1869,28 +1868,33 @@ utf8_next (const char *s, size_t len)
tail_len++;
}
- p++;
-
// Check the rest of the sequence
if (tail_len > --len)
return NULL;
+ uint32_t cp = *p++ & ~mask;
while (tail_len--)
- if ((*p++ & 0xC0) != 0x80)
+ {
+ if ((*p & 0xC0) != 0x80)
return NULL;
-
+ cp = cp << 6 | (*p++ & 0x3F);
+ }
+ if (codepoint)
+ *codepoint = cp;
return (const char *) p;
}
/// Very rough UTF-8 validation, just makes sure codepoints can be iterated
-// TODO: also validate the codepoints
static bool
utf8_validate (const char *s, size_t len)
{
const char *next;
while (len)
{
- if (!(next = utf8_next (s, len)))
+ int32_t codepoint;
+ // TODO: better validations
+ if (!(next = utf8_next (s, len, &codepoint))
+ || codepoint > 0x10FFFF)
return false;
len -= next - s;
@@ -1899,6 +1903,41 @@ utf8_validate (const char *s, size_t len)
return true;
}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+/// State of a forward iteration over the codepoints of a UTF-8 string
+struct utf8_iter
+{
+ const char *s; ///< Position of the next byte to decode, points into the caller's string
+ size_t len; ///< How many bytes remain; set to 0 once decoding has failed
+};
+/// Start iterating over `s`, which must be NUL-terminated (it is measured with strlen)
+static void
+utf8_iter_init (struct utf8_iter *self, const char *s)
+{
+ self->len = strlen ((self->s = s)); // Only the pointer is stored, the string is not copied
+}
+/// Decode one codepoint and advance past it; returns -1 at the end and on invalid UTF-8
+static int32_t
+utf8_iter_next (struct utf8_iter *self, size_t *len)
+{
+ if (!self->len)
+ return -1; // Nothing left to decode, which is also the state after an earlier failure
+
+ const char *old = self->s;
+ int32_t codepoint;
+ if (!soft_assert ((self->s = utf8_next (old, self->len, &codepoint))))
+ { // NOTE(review): soft_assert() is not visible here, presumably it reports and yields the condition
+ // Invalid UTF-8: utf8_next() returned NULL, zero the length so that further calls return -1
+ self->len = 0;
+ return -1;
+ }
+
+ size_t advance = self->s - old;
+ self->len -= advance;
+ if (len) *len = advance; // Optional output: length of the decoded sequence in bytes
+ return codepoint;
+}
+
// --- Base 64 -----------------------------------------------------------------
static uint8_t g_base64_table[256] =