Compare commits

..

2 Commits

2 changed files with 19 additions and 5 deletions

View File

@ -2753,6 +2753,11 @@ utf8_decode (const char **s, size_t len)
// Check the rest of the sequence
uint32_t cp = *p++ & ~mask;
// Overlong sequence (possibly MUTF-8, not supported)
if (!cp && sequence_len)
return -1;
while (sequence_len && --sequence_len)
{
if (p == end)
@ -2765,6 +2770,13 @@ utf8_decode (const char **s, size_t len)
return cp;
}
static inline bool
utf8_validate_cp (int32_t cp)
{
// RFC 3629, CESU-8 not allowed
return cp >= 0 && cp <= 0x10FFFF && (cp < 0xD800 || cp > 0xDFFF);
}
/// Very rough UTF-8 validation, just makes sure codepoints can be iterated
static bool
utf8_validate (const char *s, size_t len)
@ -2772,7 +2784,7 @@ utf8_validate (const char *s, size_t len)
const char *end = s + len;
int32_t codepoint;
while ((codepoint = utf8_decode (&s, end - s)) >= 0
&& codepoint <= 0x10FFFF /* TODO: better validations */)
&& utf8_validate_cp (codepoint))
;
return s == end;
}

View File

@ -332,9 +332,11 @@ test_utf8 (void)
soft_assert (utf8_decode (&empty, 0) == -1);
const char valid[] = "2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm";
const char invalid[] = "\xf0\x90\x28\xbc";
const char invalid_1[] = "\xf0\x90\x28\xbc";
const char invalid_2[] = "\xc0\x80";
soft_assert ( utf8_validate (valid, sizeof valid));
soft_assert (!utf8_validate (invalid, sizeof invalid));
soft_assert (!utf8_validate (invalid_1, sizeof invalid_1));
soft_assert (!utf8_validate (invalid_2, sizeof invalid_2));
struct utf8_iter iter = utf8_iter_make ("fóọ");
size_t ch_len;