Improve the UTF-8 API
We need to be able to detect partial sequences.
This commit is contained in:
parent
9b72304963
commit
9d14562f7e
54
liberty.c
54
liberty.c
|
@ -2722,63 +2722,59 @@ isspace_ascii (int c)
|
|||
|
||||
// --- UTF-8 -------------------------------------------------------------------
|
||||
|
||||
/// Return a pointer to the next UTF-8 character, or NULL on error
|
||||
static const char *
|
||||
utf8_next (const char *s, size_t len, int32_t *codepoint)
|
||||
/// Return the value of the UTF-8 character at `*s` and advance the pointer
|
||||
/// to the next one. Returns -2 if there is only a partial but possibly valid
|
||||
/// character sequence, or -1 on other errors. On error, `*s` is untouched.
|
||||
static int32_t
|
||||
utf8_decode (const char **s, size_t len)
|
||||
{
|
||||
// End of string, we go no further
|
||||
if (!len)
|
||||
return NULL;
|
||||
return -1;
|
||||
|
||||
// Find out how long the sequence is (0 for ASCII)
|
||||
unsigned mask = 0x80;
|
||||
unsigned sequence_len = 0;
|
||||
|
||||
const uint8_t *p = (const uint8_t *) s;
|
||||
const uint8_t *p = (const uint8_t *) *s, *end = p + len;
|
||||
while ((*p & mask) == mask)
|
||||
{
|
||||
// Invalid start of sequence
|
||||
if (mask == 0xFE)
|
||||
return NULL;
|
||||
return -1;
|
||||
|
||||
mask |= mask >> 1;
|
||||
sequence_len++;
|
||||
}
|
||||
|
||||
// In the middle of a character or the input is too short
|
||||
if (sequence_len == 1 || sequence_len > len)
|
||||
return NULL;
|
||||
// In the middle of a character
|
||||
if (sequence_len == 1)
|
||||
return -1;
|
||||
|
||||
// Check the rest of the sequence
|
||||
uint32_t cp = *p++ & ~mask;
|
||||
while (sequence_len && --sequence_len)
|
||||
{
|
||||
if (p == end)
|
||||
return -2;
|
||||
if ((*p & 0xC0) != 0x80)
|
||||
return NULL;
|
||||
return -1;
|
||||
cp = cp << 6 | (*p++ & 0x3F);
|
||||
}
|
||||
if (codepoint)
|
||||
*codepoint = cp;
|
||||
return (const char *) p;
|
||||
*s = (const char *) p;
|
||||
return cp;
|
||||
}
|
||||
|
||||
/// Very rough UTF-8 validation, just makes sure codepoints can be iterated
|
||||
static bool
|
||||
utf8_validate (const char *s, size_t len)
|
||||
{
|
||||
const char *next;
|
||||
while (len)
|
||||
{
|
||||
int32_t codepoint;
|
||||
// TODO: better validations
|
||||
if (!(next = utf8_next (s, len, &codepoint))
|
||||
|| codepoint > 0x10FFFF)
|
||||
return false;
|
||||
|
||||
len -= next - s;
|
||||
s = next;
|
||||
}
|
||||
return true;
|
||||
const char *end = s + len;
|
||||
int32_t codepoint;
|
||||
while ((codepoint = utf8_decode (&s, end - s)) >= 0
|
||||
&& codepoint <= 0x10FFFF /* TODO: better validations */)
|
||||
;
|
||||
return s == end;
|
||||
}
|
||||
|
||||
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
|
@ -2802,12 +2798,12 @@ utf8_iter_next (struct utf8_iter *self, size_t *len)
|
|||
return -1;
|
||||
|
||||
const char *old = self->s;
|
||||
int32_t codepoint;
|
||||
if (!soft_assert ((self->s = utf8_next (old, self->len, &codepoint))))
|
||||
int32_t codepoint = utf8_decode (&self->s, self->len);
|
||||
if (!soft_assert (codepoint >= 0))
|
||||
{
|
||||
// Invalid UTF-8
|
||||
self->len = 0;
|
||||
return -1;
|
||||
return codepoint;
|
||||
}
|
||||
|
||||
size_t advance = self->s - old;
|
||||
|
|
|
@ -326,6 +326,11 @@ test_str_map (void)
|
|||
static void
|
||||
test_utf8 (void)
|
||||
{
|
||||
const char *full = "\xc5\x99", *partial = full, *empty = full;
|
||||
soft_assert (utf8_decode (&full, 2) == 0x0159);
|
||||
soft_assert (utf8_decode (&partial, 1) == -2);
|
||||
soft_assert (utf8_decode (&empty, 0) == -1);
|
||||
|
||||
const char valid [] = "2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm";
|
||||
const char invalid[] = "\xf0\x90\x28\xbc";
|
||||
soft_assert ( utf8_validate (valid, sizeof valid));
|
||||
|
|
Loading…
Reference in New Issue