Extend UTF-8 code a bit
This commit is contained in:
parent
70f12a8a7d
commit
99b92fdd6e
55
liberty.c
55
liberty.c
@ -1843,9 +1843,8 @@ isspace_ascii (int c)
|
|||||||
// --- UTF-8 -------------------------------------------------------------------
|
// --- UTF-8 -------------------------------------------------------------------
|
||||||
|
|
||||||
/// Return a pointer to the next UTF-8 character, or NULL on error
|
/// Return a pointer to the next UTF-8 character, or NULL on error
|
||||||
// TODO: decode the sequence while we're at it
|
|
||||||
static const char *
|
static const char *
|
||||||
utf8_next (const char *s, size_t len)
|
utf8_next (const char *s, size_t len, int32_t *codepoint)
|
||||||
{
|
{
|
||||||
// End of string, we go no further
|
// End of string, we go no further
|
||||||
if (!len)
|
if (!len)
|
||||||
@ -1869,28 +1868,33 @@ utf8_next (const char *s, size_t len)
|
|||||||
tail_len++;
|
tail_len++;
|
||||||
}
|
}
|
||||||
|
|
||||||
p++;
|
|
||||||
|
|
||||||
// Check the rest of the sequence
|
// Check the rest of the sequence
|
||||||
if (tail_len > --len)
|
if (tail_len > --len)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
uint32_t cp = *p++ & ~mask;
|
||||||
while (tail_len--)
|
while (tail_len--)
|
||||||
if ((*p++ & 0xC0) != 0x80)
|
{
|
||||||
|
if ((*p & 0xC0) != 0x80)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
cp = cp << 6 | (*p++ & 0x3F);
|
||||||
|
}
|
||||||
|
if (codepoint)
|
||||||
|
*codepoint = cp;
|
||||||
return (const char *) p;
|
return (const char *) p;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Very rough UTF-8 validation, just makes sure codepoints can be iterated
|
/// Very rough UTF-8 validation, just makes sure codepoints can be iterated
|
||||||
// TODO: also validate the codepoints
|
|
||||||
static bool
|
static bool
|
||||||
utf8_validate (const char *s, size_t len)
|
utf8_validate (const char *s, size_t len)
|
||||||
{
|
{
|
||||||
const char *next;
|
const char *next;
|
||||||
while (len)
|
while (len)
|
||||||
{
|
{
|
||||||
if (!(next = utf8_next (s, len)))
|
int32_t codepoint;
|
||||||
|
// TODO: better validations
|
||||||
|
if (!(next = utf8_next (s, len, &codepoint))
|
||||||
|
|| codepoint > 0x10FFFF)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
len -= next - s;
|
len -= next - s;
|
||||||
@ -1899,6 +1903,41 @@ utf8_validate (const char *s, size_t len)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||||
|
|
||||||
|
struct utf8_iter
|
||||||
|
{
|
||||||
|
const char *s; ///< String iterator
|
||||||
|
size_t len; ///< How many bytes remain
|
||||||
|
};
|
||||||
|
|
||||||
|
static void
|
||||||
|
utf8_iter_init (struct utf8_iter *self, const char *s)
|
||||||
|
{
|
||||||
|
self->len = strlen ((self->s = s));
|
||||||
|
}
|
||||||
|
|
||||||
|
static int32_t
|
||||||
|
utf8_iter_next (struct utf8_iter *self, size_t *len)
|
||||||
|
{
|
||||||
|
if (!self->len)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
const char *old = self->s;
|
||||||
|
int32_t codepoint;
|
||||||
|
if (!soft_assert ((self->s = utf8_next (old, self->len, &codepoint))))
|
||||||
|
{
|
||||||
|
// Invalid UTF-8
|
||||||
|
self->len = 0;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t advance = self->s - old;
|
||||||
|
self->len -= advance;
|
||||||
|
if (len) *len = advance;
|
||||||
|
return codepoint;
|
||||||
|
}
|
||||||
|
|
||||||
// --- Base 64 -----------------------------------------------------------------
|
// --- Base 64 -----------------------------------------------------------------
|
||||||
|
|
||||||
static uint8_t g_base64_table[256] =
|
static uint8_t g_base64_table[256] =
|
||||||
|
Loading…
Reference in New Issue
Block a user