Improve the UTF-8 API

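We need to be able to detect partial sequences, so utf8_next() is replaced by utf8_decode(), which returns the codepoint directly, -2 for a truncated but possibly valid sequence, and -1 for any other error.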
This commit is contained in:
Přemysl Eric Janouch 2020-10-12 22:34:46 +02:00
parent 9b72304963
commit 9d14562f7e
Signed by: p
GPG Key ID: A0420B94F92B9493
2 changed files with 30 additions and 29 deletions

View File

@ -2722,63 +2722,59 @@ isspace_ascii (int c)
// --- UTF-8 ------------------------------------------------------------------- // --- UTF-8 -------------------------------------------------------------------
/// Return a pointer to the next UTF-8 character, or NULL on error /// Return the value of the UTF-8 character at `*s` and advance the pointer
static const char * /// to the next one. Returns -2 if there is only a partial but possibly valid
utf8_next (const char *s, size_t len, int32_t *codepoint) /// character sequence, or -1 on other errors. Either way, `*s` is untouched.
static int32_t
utf8_decode (const char **s, size_t len)
{ {
// End of string, we go no further // End of string, we go no further
if (!len) if (!len)
return NULL; return -1;
// Find out how long the sequence is (0 for ASCII) // Find out how long the sequence is (0 for ASCII)
unsigned mask = 0x80; unsigned mask = 0x80;
unsigned sequence_len = 0; unsigned sequence_len = 0;
const uint8_t *p = (const uint8_t *) s; const uint8_t *p = (const uint8_t *) *s, *end = p + len;
while ((*p & mask) == mask) while ((*p & mask) == mask)
{ {
// Invalid start of sequence // Invalid start of sequence
if (mask == 0xFE) if (mask == 0xFE)
return NULL; return -1;
mask |= mask >> 1; mask |= mask >> 1;
sequence_len++; sequence_len++;
} }
// In the middle of a character or the input is too short // In the middle of a character
if (sequence_len == 1 || sequence_len > len) if (sequence_len == 1)
return NULL; return -1;
// Check the rest of the sequence // Check the rest of the sequence
uint32_t cp = *p++ & ~mask; uint32_t cp = *p++ & ~mask;
while (sequence_len && --sequence_len) while (sequence_len && --sequence_len)
{ {
if (p == end)
return -2;
if ((*p & 0xC0) != 0x80) if ((*p & 0xC0) != 0x80)
return NULL; return -1;
cp = cp << 6 | (*p++ & 0x3F); cp = cp << 6 | (*p++ & 0x3F);
} }
if (codepoint) *s = (const char *) p;
*codepoint = cp; return cp;
return (const char *) p;
} }
/// Very rough UTF-8 validation, just makes sure codepoints can be iterated /// Very rough UTF-8 validation, just makes sure codepoints can be iterated
static bool static bool
utf8_validate (const char *s, size_t len) utf8_validate (const char *s, size_t len)
{ {
const char *next; const char *end = s + len;
while (len)
{
int32_t codepoint; int32_t codepoint;
// TODO: better validations while ((codepoint = utf8_decode (&s, end - s)) >= 0
if (!(next = utf8_next (s, len, &codepoint)) && codepoint <= 0x10FFFF /* TODO: better validations */)
|| codepoint > 0x10FFFF) ;
return false; return s == end;
len -= next - s;
s = next;
}
return true;
} }
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@ -2802,12 +2798,12 @@ utf8_iter_next (struct utf8_iter *self, size_t *len)
return -1; return -1;
const char *old = self->s; const char *old = self->s;
int32_t codepoint; int32_t codepoint = utf8_decode (&self->s, self->len);
if (!soft_assert ((self->s = utf8_next (old, self->len, &codepoint)))) if (!soft_assert (codepoint >= 0))
{ {
// Invalid UTF-8 // Invalid UTF-8
self->len = 0; self->len = 0;
return -1; return codepoint;
} }
size_t advance = self->s - old; size_t advance = self->s - old;

View File

@ -326,6 +326,11 @@ test_str_map (void)
static void static void
test_utf8 (void) test_utf8 (void)
{ {
const char *full = "\xc5\x99", *partial = full, *empty = full;
soft_assert (utf8_decode (&full, 2) == 0x0159);
soft_assert (utf8_decode (&partial, 1) == -2);
soft_assert (utf8_decode (&empty, 0) == -1);
const char valid [] = "2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm"; const char valid [] = "2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm";
const char invalid[] = "\xf0\x90\x28\xbc"; const char invalid[] = "\xf0\x90\x28\xbc";
soft_assert ( utf8_validate (valid, sizeof valid)); soft_assert ( utf8_validate (valid, sizeof valid));