Extract UTF-8 sequence parser into its own function, fix bugs, apply unit tests
This commit is contained in:
parent
739be0e55d
commit
f1b3dff4c2
93
t/03utf8.c
93
t/03utf8.c
|
@ -7,7 +7,7 @@ int main(int argc, char *argv[])
|
||||||
TermKey *tk;
|
TermKey *tk;
|
||||||
TermKeyKey key;
|
TermKeyKey key;
|
||||||
|
|
||||||
plan_tests(21);
|
plan_tests(57);
|
||||||
|
|
||||||
pipe(fd);
|
pipe(fd);
|
||||||
|
|
||||||
|
@ -72,6 +72,97 @@ int main(int argc, char *argv[])
|
||||||
is_int(key.type, TERMKEY_TYPE_UNICODE, "key.type UTF-8 4 high");
|
is_int(key.type, TERMKEY_TYPE_UNICODE, "key.type UTF-8 4 high");
|
||||||
is_int(key.code.number, 0x10FFFF, "key.code.number UTF-8 4 high");
|
is_int(key.code.number, 0x10FFFF, "key.code.number UTF-8 4 high");
|
||||||
|
|
||||||
|
/* Invalid continuations */
|
||||||
|
|
||||||
|
write(fd[1], "\xC2!", 2);
|
||||||
|
|
||||||
|
termkey_advisereadable(tk);
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 2 invalid cont");
|
||||||
|
is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 2 invalid cont");
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 2 invalid after");
|
||||||
|
is_int(key.code.number, '!', "key.code.number UTF-8 2 invalid after");
|
||||||
|
|
||||||
|
write(fd[1], "\xE0!", 2);
|
||||||
|
|
||||||
|
termkey_advisereadable(tk);
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 3 invalid cont");
|
||||||
|
is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 3 invalid cont");
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 3 invalid after");
|
||||||
|
is_int(key.code.number, '!', "key.code.number UTF-8 3 invalid after");
|
||||||
|
|
||||||
|
write(fd[1], "\xE0\xA0!", 3);
|
||||||
|
|
||||||
|
termkey_advisereadable(tk);
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 3 invalid cont 2");
|
||||||
|
is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 3 invalid cont 2");
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 3 invalid after");
|
||||||
|
is_int(key.code.number, '!', "key.code.number UTF-8 3 invalid after");
|
||||||
|
|
||||||
|
write(fd[1], "\xF0!", 2);
|
||||||
|
|
||||||
|
termkey_advisereadable(tk);
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid cont");
|
||||||
|
is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 4 invalid cont");
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid after");
|
||||||
|
is_int(key.code.number, '!', "key.code.number UTF-8 4 invalid after");
|
||||||
|
|
||||||
|
write(fd[1], "\xF0\x90!", 3);
|
||||||
|
|
||||||
|
termkey_advisereadable(tk);
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid cont 2");
|
||||||
|
is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 4 invalid cont 2");
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid after");
|
||||||
|
is_int(key.code.number, '!', "key.code.number UTF-8 4 invalid after");
|
||||||
|
|
||||||
|
write(fd[1], "\xF0\x90\x80!", 4);
|
||||||
|
|
||||||
|
termkey_advisereadable(tk);
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid cont 3");
|
||||||
|
is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 4 invalid cont 3");
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid after");
|
||||||
|
is_int(key.code.number, '!', "key.code.number UTF-8 4 invalid after");
|
||||||
|
|
||||||
|
/* Partials */
|
||||||
|
|
||||||
|
write(fd[1], "\xC2", 1);
|
||||||
|
termkey_advisereadable(tk);
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 2 partial");
|
||||||
|
|
||||||
|
write(fd[1], "\xA0", 1);
|
||||||
|
termkey_advisereadable(tk);
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 2 partial");
|
||||||
|
is_int(key.code.number, 0x00A0, "key.code.number UTF-8 2 partial");
|
||||||
|
|
||||||
|
write(fd[1], "\xE0", 1);
|
||||||
|
termkey_advisereadable(tk);
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 3 partial");
|
||||||
|
|
||||||
|
write(fd[1], "\xA0", 1);
|
||||||
|
termkey_advisereadable(tk);
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 3 partial");
|
||||||
|
|
||||||
|
write(fd[1], "\x80", 1);
|
||||||
|
termkey_advisereadable(tk);
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 3 partial");
|
||||||
|
is_int(key.code.number, 0x0800, "key.code.number UTF-8 3 partial");
|
||||||
|
|
||||||
|
write(fd[1], "\xF0", 1);
|
||||||
|
termkey_advisereadable(tk);
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 4 partial");
|
||||||
|
|
||||||
|
write(fd[1], "\x90", 1);
|
||||||
|
termkey_advisereadable(tk);
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 4 partial");
|
||||||
|
|
||||||
|
write(fd[1], "\x80", 1);
|
||||||
|
termkey_advisereadable(tk);
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 4 partial");
|
||||||
|
|
||||||
|
write(fd[1], "\x80", 1);
|
||||||
|
termkey_advisereadable(tk);
|
||||||
|
is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 partial");
|
||||||
|
is_int(key.code.number, 0x10000, "key.code.number UTF-8 4 partial");
|
||||||
|
|
||||||
termkey_destroy(tk);
|
termkey_destroy(tk);
|
||||||
|
|
||||||
return exit_status();
|
return exit_status();
|
||||||
|
|
145
termkey.c
145
termkey.c
|
@ -422,6 +422,76 @@ static void fill_utf8(TermKeyKey *key)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define UTF8_INVALID 0xFFFD
|
||||||
|
static TermKeyResult parse_utf8(const unsigned char *bytes, size_t len, long *cp, size_t *nbytep)
|
||||||
|
{
|
||||||
|
unsigned int nbytes;
|
||||||
|
|
||||||
|
unsigned char b0 = bytes[0];
|
||||||
|
|
||||||
|
if(b0 < 0xc0) {
|
||||||
|
// Starts with a continuation byte - that's not right
|
||||||
|
*cp = UTF8_INVALID;
|
||||||
|
*nbytep = 1;
|
||||||
|
return TERMKEY_RES_KEY;
|
||||||
|
}
|
||||||
|
else if(b0 < 0xe0) {
|
||||||
|
nbytes = 2;
|
||||||
|
*cp = b0 & 0x1f;
|
||||||
|
}
|
||||||
|
else if(b0 < 0xf0) {
|
||||||
|
nbytes = 3;
|
||||||
|
*cp = b0 & 0x0f;
|
||||||
|
}
|
||||||
|
else if(b0 < 0xf8) {
|
||||||
|
nbytes = 4;
|
||||||
|
*cp = b0 & 0x07;
|
||||||
|
}
|
||||||
|
else if(b0 < 0xfc) {
|
||||||
|
nbytes = 5;
|
||||||
|
*cp = b0 & 0x03;
|
||||||
|
}
|
||||||
|
else if(b0 < 0xfe) {
|
||||||
|
nbytes = 6;
|
||||||
|
*cp = b0 & 0x01;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
*cp = UTF8_INVALID;
|
||||||
|
*nbytep = 1;
|
||||||
|
return TERMKEY_RES_KEY;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(unsigned int b = 1; b < nbytes; b++) {
|
||||||
|
unsigned char cb;
|
||||||
|
|
||||||
|
if(b >= len)
|
||||||
|
return TERMKEY_RES_AGAIN;
|
||||||
|
|
||||||
|
cb = bytes[b];
|
||||||
|
if(cb < 0x80 || cb >= 0xc0) {
|
||||||
|
*cp = UTF8_INVALID;
|
||||||
|
*nbytep = b;
|
||||||
|
return TERMKEY_RES_KEY;
|
||||||
|
}
|
||||||
|
|
||||||
|
*cp <<= 6;
|
||||||
|
*cp |= cb & 0x3f;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for overlong sequences
|
||||||
|
if(nbytes > utf8_seqlen(*cp))
|
||||||
|
*cp = UTF8_INVALID;
|
||||||
|
|
||||||
|
// Check for UTF-16 surrogates or invalid *cps
|
||||||
|
if((*cp >= 0xD800 && *cp <= 0xDFFF) ||
|
||||||
|
*cp == 0xFFFE ||
|
||||||
|
*cp == 0xFFFF)
|
||||||
|
*cp = UTF8_INVALID;
|
||||||
|
|
||||||
|
*nbytep = nbytes;
|
||||||
|
return TERMKEY_RES_KEY;
|
||||||
|
}
|
||||||
|
|
||||||
static void emit_codepoint(TermKey *tk, long codepoint, TermKeyKey *key)
|
static void emit_codepoint(TermKey *tk, long codepoint, TermKeyKey *key)
|
||||||
{
|
{
|
||||||
if(codepoint < 0x20) {
|
if(codepoint < 0x20) {
|
||||||
|
@ -487,8 +557,6 @@ static void emit_codepoint(TermKey *tk, long codepoint, TermKeyKey *key)
|
||||||
fill_utf8(key);
|
fill_utf8(key);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define UTF8_INVALID 0xFFFD
|
|
||||||
|
|
||||||
static TermKeyResult peekkey(TermKey *tk, TermKeyKey *key, int force, size_t *nbytep)
|
static TermKeyResult peekkey(TermKey *tk, TermKeyKey *key, int force, size_t *nbytep)
|
||||||
{
|
{
|
||||||
int again = 0;
|
int again = 0;
|
||||||
|
@ -604,83 +672,24 @@ static TermKeyResult peekkey_simple(TermKey *tk, TermKeyKey *key, int force, siz
|
||||||
}
|
}
|
||||||
else if(tk->flags & TERMKEY_FLAG_UTF8) {
|
else if(tk->flags & TERMKEY_FLAG_UTF8) {
|
||||||
// Some UTF-8
|
// Some UTF-8
|
||||||
unsigned int nbytes;
|
|
||||||
long codepoint;
|
long codepoint;
|
||||||
|
TermKeyResult res = parse_utf8(tk->buffer + tk->buffstart, tk->buffcount, &codepoint, nbytep);
|
||||||
|
|
||||||
key->type = TERMKEY_TYPE_UNICODE;
|
if(res == TERMKEY_RES_AGAIN && force) {
|
||||||
key->modifiers = 0;
|
|
||||||
|
|
||||||
if(b0 < 0xc0) {
|
|
||||||
// Starts with a continuation byte - that's not right
|
|
||||||
(*tk->method.emit_codepoint)(tk, UTF8_INVALID, key);
|
|
||||||
*nbytep = 1;
|
|
||||||
return TERMKEY_RES_KEY;
|
|
||||||
}
|
|
||||||
else if(b0 < 0xe0) {
|
|
||||||
nbytes = 2;
|
|
||||||
codepoint = b0 & 0x1f;
|
|
||||||
}
|
|
||||||
else if(b0 < 0xf0) {
|
|
||||||
nbytes = 3;
|
|
||||||
codepoint = b0 & 0x0f;
|
|
||||||
}
|
|
||||||
else if(b0 < 0xf8) {
|
|
||||||
nbytes = 4;
|
|
||||||
codepoint = b0 & 0x07;
|
|
||||||
}
|
|
||||||
else if(b0 < 0xfc) {
|
|
||||||
nbytes = 5;
|
|
||||||
codepoint = b0 & 0x03;
|
|
||||||
}
|
|
||||||
else if(b0 < 0xfe) {
|
|
||||||
nbytes = 6;
|
|
||||||
codepoint = b0 & 0x01;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
(*tk->method.emit_codepoint)(tk, UTF8_INVALID, key);
|
|
||||||
*nbytep = 1;
|
|
||||||
return TERMKEY_RES_KEY;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(tk->buffcount < nbytes) {
|
|
||||||
if(!force)
|
|
||||||
return TERMKEY_RES_AGAIN;
|
|
||||||
|
|
||||||
/* There weren't enough bytes for a complete UTF-8 sequence but caller
|
/* There weren't enough bytes for a complete UTF-8 sequence but caller
|
||||||
* demands an answer. About the best thing we can do here is eat as many
|
* demands an answer. About the best thing we can do here is eat as many
|
||||||
* bytes as we have, and emit a UTF8_INVALID. If the remaining bytes
|
* bytes as we have, and emit a UTF8_INVALID. If the remaining bytes
|
||||||
* arrive later, they'll be invalid too.
|
* arrive later, they'll be invalid too.
|
||||||
*/
|
*/
|
||||||
(*tk->method.emit_codepoint)(tk, UTF8_INVALID, key);
|
codepoint = UTF8_INVALID;
|
||||||
*nbytep = tk->buffcount;
|
*nbytep = tk->buffcount;
|
||||||
return TERMKEY_RES_KEY;
|
res = TERMKEY_RES_KEY;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(unsigned int b = 1; b < nbytes; b++) {
|
key->type = TERMKEY_TYPE_UNICODE;
|
||||||
unsigned char cb = CHARAT(b);
|
key->modifiers = 0;
|
||||||
if(cb < 0x80 || cb >= 0xc0) {
|
|
||||||
(*tk->method.emit_codepoint)(tk, UTF8_INVALID, key);
|
|
||||||
*nbytep = b - 1;
|
|
||||||
return TERMKEY_RES_KEY;
|
|
||||||
}
|
|
||||||
|
|
||||||
codepoint <<= 6;
|
|
||||||
codepoint |= cb & 0x3f;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check for overlong sequences
|
|
||||||
if(nbytes > utf8_seqlen(codepoint))
|
|
||||||
codepoint = UTF8_INVALID;
|
|
||||||
|
|
||||||
// Check for UTF-16 surrogates or invalid codepoints
|
|
||||||
if((codepoint >= 0xD800 && codepoint <= 0xDFFF) ||
|
|
||||||
codepoint == 0xFFFE ||
|
|
||||||
codepoint == 0xFFFF)
|
|
||||||
codepoint = UTF8_INVALID;
|
|
||||||
|
|
||||||
(*tk->method.emit_codepoint)(tk, codepoint, key);
|
(*tk->method.emit_codepoint)(tk, codepoint, key);
|
||||||
*nbytep = nbytes;
|
return res;
|
||||||
return TERMKEY_RES_KEY;
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// Non UTF-8 case - just report the raw byte
|
// Non UTF-8 case - just report the raw byte
|
||||||
|
|
Loading…
Reference in New Issue