Improve the UTF-8 API
We need to be able to detect partial sequences.
This commit is contained in:
parent
9b72304963
commit
9d14562f7e
52
liberty.c
52
liberty.c
|
@ -2722,63 +2722,59 @@ isspace_ascii (int c)
|
||||||
|
|
||||||
// --- UTF-8 -------------------------------------------------------------------
|
// --- UTF-8 -------------------------------------------------------------------
|
||||||
|
|
||||||
/// Return a pointer to the next UTF-8 character, or NULL on error
|
/// Return the value of the UTF-8 character at `*s` and advance the pointer
|
||||||
static const char *
|
/// to the next one. Returns -2 if there is only a partial but possibly valid
|
||||||
utf8_next (const char *s, size_t len, int32_t *codepoint)
|
/// character sequence, or -1 on other errors. On any error, `*s` is untouched.
|
||||||
|
static int32_t
|
||||||
|
utf8_decode (const char **s, size_t len)
|
||||||
{
|
{
|
||||||
// End of string, we go no further
|
// End of string, we go no further
|
||||||
if (!len)
|
if (!len)
|
||||||
return NULL;
|
return -1;
|
||||||
|
|
||||||
// Find out how long the sequence is (0 for ASCII)
|
// Find out how long the sequence is (0 for ASCII)
|
||||||
unsigned mask = 0x80;
|
unsigned mask = 0x80;
|
||||||
unsigned sequence_len = 0;
|
unsigned sequence_len = 0;
|
||||||
|
|
||||||
const uint8_t *p = (const uint8_t *) s;
|
const uint8_t *p = (const uint8_t *) *s, *end = p + len;
|
||||||
while ((*p & mask) == mask)
|
while ((*p & mask) == mask)
|
||||||
{
|
{
|
||||||
// Invalid start of sequence
|
// Invalid start of sequence
|
||||||
if (mask == 0xFE)
|
if (mask == 0xFE)
|
||||||
return NULL;
|
return -1;
|
||||||
|
|
||||||
mask |= mask >> 1;
|
mask |= mask >> 1;
|
||||||
sequence_len++;
|
sequence_len++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// In the middle of a character or the input is too short
|
// In the middle of a character
|
||||||
if (sequence_len == 1 || sequence_len > len)
|
if (sequence_len == 1)
|
||||||
return NULL;
|
return -1;
|
||||||
|
|
||||||
// Check the rest of the sequence
|
// Check the rest of the sequence
|
||||||
uint32_t cp = *p++ & ~mask;
|
uint32_t cp = *p++ & ~mask;
|
||||||
while (sequence_len && --sequence_len)
|
while (sequence_len && --sequence_len)
|
||||||
{
|
{
|
||||||
|
if (p == end)
|
||||||
|
return -2;
|
||||||
if ((*p & 0xC0) != 0x80)
|
if ((*p & 0xC0) != 0x80)
|
||||||
return NULL;
|
return -1;
|
||||||
cp = cp << 6 | (*p++ & 0x3F);
|
cp = cp << 6 | (*p++ & 0x3F);
|
||||||
}
|
}
|
||||||
if (codepoint)
|
*s = (const char *) p;
|
||||||
*codepoint = cp;
|
return cp;
|
||||||
return (const char *) p;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Very rough UTF-8 validation, just makes sure codepoints can be iterated
|
/// Very rough UTF-8 validation, just makes sure codepoints can be iterated
|
||||||
static bool
|
static bool
|
||||||
utf8_validate (const char *s, size_t len)
|
utf8_validate (const char *s, size_t len)
|
||||||
{
|
{
|
||||||
const char *next;
|
const char *end = s + len;
|
||||||
while (len)
|
|
||||||
{
|
|
||||||
int32_t codepoint;
|
int32_t codepoint;
|
||||||
// TODO: better validations
|
while ((codepoint = utf8_decode (&s, end - s)) >= 0
|
||||||
if (!(next = utf8_next (s, len, &codepoint))
|
&& codepoint <= 0x10FFFF /* TODO: better validations */)
|
||||||
|| codepoint > 0x10FFFF)
|
;
|
||||||
return false;
|
return s == end;
|
||||||
|
|
||||||
len -= next - s;
|
|
||||||
s = next;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||||
|
@ -2802,12 +2798,12 @@ utf8_iter_next (struct utf8_iter *self, size_t *len)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
const char *old = self->s;
|
const char *old = self->s;
|
||||||
int32_t codepoint;
|
int32_t codepoint = utf8_decode (&self->s, self->len);
|
||||||
if (!soft_assert ((self->s = utf8_next (old, self->len, &codepoint))))
|
if (!soft_assert (codepoint >= 0))
|
||||||
{
|
{
|
||||||
// Invalid UTF-8
|
// Invalid UTF-8
|
||||||
self->len = 0;
|
self->len = 0;
|
||||||
return -1;
|
return codepoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t advance = self->s - old;
|
size_t advance = self->s - old;
|
||||||
|
|
|
@ -326,6 +326,11 @@ test_str_map (void)
|
||||||
static void
|
static void
|
||||||
test_utf8 (void)
|
test_utf8 (void)
|
||||||
{
|
{
|
||||||
|
const char *full = "\xc5\x99", *partial = full, *empty = full;
|
||||||
|
soft_assert (utf8_decode (&full, 2) == 0x0159);
|
||||||
|
soft_assert (utf8_decode (&partial, 1) == -2);
|
||||||
|
soft_assert (utf8_decode (&empty, 0) == -1);
|
||||||
|
|
||||||
const char valid [] = "2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm";
|
const char valid [] = "2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm";
|
||||||
const char invalid[] = "\xf0\x90\x28\xbc";
|
const char invalid[] = "\xf0\x90\x28\xbc";
|
||||||
soft_assert ( utf8_validate (valid, sizeof valid));
|
soft_assert ( utf8_validate (valid, sizeof valid));
|
||||||
|
|
Loading…
Reference in New Issue