Split out utf8_validate_cp(), adhere to RFC 3629

This commit is contained in:
Přemysl Eric Janouch 2020-10-21 05:20:20 +02:00
parent b08cf6c29f
commit 53bcebc2f0
Signed by: p
GPG Key ID: A0420B94F92B9493

View File

@ -2770,6 +2770,13 @@ utf8_decode (const char **s, size_t len)
return cp; return cp;
} }
/// Tell whether @a cp may legally be encoded as UTF-8 according to RFC 3629,
/// i.e., it is within 0..0x10FFFF and is not a UTF-16 surrogate
static inline bool
utf8_validate_cp (int32_t cp)
{
	// Negative values and anything past U+10FFFF lie outside of the code space
	if (cp < 0 || cp > 0x10FFFF)
		return false;

	// Surrogates U+D800..U+DFFF are rejected, which also rules out CESU-8
	if (cp >= 0xD800 && cp <= 0xDFFF)
		return false;

	return true;
}
/// Very rough UTF-8 validation, just makes sure codepoints can be iterated /// Very rough UTF-8 validation, just makes sure codepoints can be iterated
static bool static bool
utf8_validate (const char *s, size_t len) utf8_validate (const char *s, size_t len)
@ -2777,7 +2784,7 @@ utf8_validate (const char *s, size_t len)
const char *end = s + len; const char *end = s + len;
int32_t codepoint; int32_t codepoint;
while ((codepoint = utf8_decode (&s, end - s)) >= 0 while ((codepoint = utf8_decode (&s, end - s)) >= 0
&& codepoint <= 0x10FFFF /* TODO: better validations */) && utf8_validate_cp (codepoint))
; ;
return s == end; return s == end;
} }