Compare commits

...

3 Commits

Author SHA1 Message Date
53bcebc2f0 Split out utf8_validate_cp(), adhere to RFC 3629 2020-10-21 05:20:20 +02:00
b08cf6c29f Reject overlong UTF-8 sequences 2020-10-21 05:08:59 +02:00
69101eb155 Fix optional arguments in --help output
An equals sign is necessary.
2020-10-13 21:27:46 +02:00
2 changed files with 20 additions and 6 deletions

View File

@@ -2753,6 +2753,11 @@ utf8_decode (const char **s, size_t len)
// Check the rest of the sequence
uint32_t cp = *p++ & ~mask;
// Overlong sequence (possibly MUTF-8, not supported)
if (!cp && sequence_len)
return -1;
while (sequence_len && --sequence_len)
{
if (p == end)
@@ -2765,6 +2770,13 @@ utf8_decode (const char **s, size_t len)
return cp;
}
/// Tell whether a decoded codepoint is acceptable under RFC 3629:
/// it must lie within 0..0x10FFFF and must not be a UTF-16 surrogate
static inline bool
utf8_validate_cp (int32_t cp)
{
	// Negative values and anything past U+10FFFF are out of range
	if (cp < 0 || cp > 0x10FFFF)
		return false;

	// Surrogates U+D800..U+DFFF are excluded, CESU-8 not allowed
	bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
	return !is_surrogate;
}
/// Very rough UTF-8 validation, just makes sure codepoints can be iterated
static bool
utf8_validate (const char *s, size_t len)
@@ -2772,7 +2784,7 @@ utf8_validate (const char *s, size_t len)
const char *end = s + len;
int32_t codepoint;
while ((codepoint = utf8_decode (&s, end - s)) >= 0
&& codepoint <= 0x10FFFF /* TODO: better validations */)
&& utf8_validate_cp (codepoint))
;
return s == end;
}
@@ -3782,7 +3794,7 @@ opt_handler_usage (const struct opt_handler *self, FILE *stream)
str_append_printf (&row, "--%s", opt->long_name);
if (opt->arg_hint)
str_append_printf (&row, (opt->flags & OPT_OPTIONAL_ARG)
? " [%s]" : " %s", opt->arg_hint);
? "[=%s]" : " %s", opt->arg_hint);
// TODO: keep the indent if there are multiple lines
if (row.len + 2 <= OPT_USAGE_ALIGNMENT_COLUMN)

View File

@@ -331,10 +331,12 @@ test_utf8 (void)
soft_assert (utf8_decode (&partial, 1) == -2);
soft_assert (utf8_decode (&empty, 0) == -1);
const char valid [] = "2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm";
const char invalid[] = "\xf0\x90\x28\xbc";
soft_assert ( utf8_validate (valid, sizeof valid));
soft_assert (!utf8_validate (invalid, sizeof invalid));
const char valid[] = "2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm";
const char invalid_1[] = "\xf0\x90\x28\xbc";
const char invalid_2[] = "\xc0\x80";
soft_assert ( utf8_validate (valid, sizeof valid));
soft_assert (!utf8_validate (invalid_1, sizeof invalid_1));
soft_assert (!utf8_validate (invalid_2, sizeof invalid_2));
struct utf8_iter iter = utf8_iter_make ("fóọ");
size_t ch_len;