From 53bcebc2f0bae3ba0bbcefb849bdb0ede0ea4385 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C5=99emysl=20Eric=20Janouch?= Date: Wed, 21 Oct 2020 05:20:20 +0200 Subject: [PATCH] Split out utf8_validate_cp(), adhere to RFC 3629 --- liberty.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/liberty.c b/liberty.c index c0b6bb4..d3c6c25 100644 --- a/liberty.c +++ b/liberty.c @@ -2770,6 +2770,13 @@ utf8_decode (const char **s, size_t len) return cp; } +static inline bool +utf8_validate_cp (int32_t cp) +{ + // RFC 3629: accept only U+0000..U+10FFFF minus the surrogate range U+D800..U+DFFF, so CESU-8 is not allowed + return cp >= 0 && cp <= 0x10FFFF && (cp < 0xD800 || cp > 0xDFFF); +} + /// Very rough UTF-8 validation, just makes sure codepoints can be iterated static bool utf8_validate (const char *s, size_t len) @@ -2777,7 +2784,7 @@ utf8_validate (const char *s, size_t len) const char *end = s + len; int32_t codepoint; while ((codepoint = utf8_decode (&s, end - s)) >= 0 - && codepoint <= 0x10FFFF /* TODO: better validations */) + && utf8_validate_cp (codepoint)) ; return s == end; }