From f7c67b2a2eeaa0d62c4473751e2d062b2b082cc8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C5=99emysl=20Janouch?= 
Date: Tue, 4 Oct 2016 13:21:45 +0200
Subject: [PATCH] Add termo_str{f,p}key_utf8()
Adding back some original UTF-8 handling code.
---
 termo.c | 187 +++++++++++++++++++++++++++++++++++++++++++++-----------
 termo.h |   5 ++
 2 files changed, 157 insertions(+), 35 deletions(-)
diff --git a/termo.c b/termo.c
index 75800e6..477242b 100644
--- a/termo.c
+++ b/termo.c
@@ -1048,28 +1048,25 @@ peekkey_simple (termo_t *tk, termo_key_t *key, int flags, size_t *nbytep)
 	}
 }
 
-// XXX: With the current infrastructure I'm not sure how to properly handle
-//   this.  peekkey() isn't made for skipping invalid inputs.
-#define INVALID_1005 0x20
+// REPLACEMENT CHARACTER
+#define UTF8_INVALID 0xFFFD
 
-static termo_result_t
-parse_1005_value (const unsigned char **bytes, size_t *len, uint32_t *cp)
+static size_t
+parse_utf8_fast (const unsigned char *bytes, size_t len, uint32_t *cp)
 {
-	unsigned int nbytes;
-	unsigned char b0 = (*bytes)[0];
+	size_t nbytes;
+	unsigned char b0 = bytes[0];
 	if (b0 < 0x80)
 	{
 		// Single byte ASCII
 		*cp = b0;
-		nbytes = 1;
-		goto end;
+		return 1;
 	}
 	else if (b0 < 0xc0)
 	{
 		// Starts with a continuation byte - that's not right
-		*cp = INVALID_1005;
-		nbytes = 1;
-		goto end;
+		*cp = UTF8_INVALID;
+		return 1;
 	}
 	else if (b0 < 0xe0)
 	{
@@ -1098,28 +1095,39 @@ parse_1005_value (const unsigned char **bytes, size_t *len, uint32_t *cp)
 	}
 	else
 	{
-		*cp = INVALID_1005;
-		nbytes = 1;
-		goto end;
+		*cp = UTF8_INVALID;
+		return 1;
 	}
 
-	for (unsigned int b = 1; b < nbytes; b++)
+	for (size_t b = 1; b < nbytes; b++)
 	{
-		if (b >= *len)
-			return TERMO_RES_AGAIN;
+		if (b >= len)
+			return 0;
 
-		unsigned char cb = (*bytes)[b];
+		unsigned char cb = bytes[b];
 		if (cb < 0x80 || cb >= 0xc0)
 		{
-			*cp = INVALID_1005;
-			nbytes = b;
-			goto end;
+			*cp = UTF8_INVALID;
+			return b;
 		}
 		*cp <<= 6;
 		*cp |= cb & 0x3f;
 	}
+	return nbytes;
+}
+
+static termo_result_t
+parse_1005_value (const unsigned char **bytes, size_t *len, uint32_t *cp)
+{
+	size_t nbytes = parse_utf8_fast (*bytes, *len, cp);
+	if (nbytes == 0)
+		return TERMO_RES_AGAIN;
+
+	// XXX: With the current infrastructure I'm not sure how to properly handle
+	//   this.  peekkey() isn't made for skipping invalid inputs.
+	if (*cp == UTF8_INVALID)
+		*cp = 0x20;
 
-end:
 	(*bytes) += nbytes;
 	(*len) -= nbytes;
 	return TERMO_RES_KEY;
@@ -1430,6 +1438,8 @@ register_c0_full (termo_t *tk, termo_sym_t sym,
 	return sym;
 }
 
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
 static struct modnames
 {
 	const char *shift, *alt, *ctrl;
@@ -1446,9 +1456,11 @@ modnames[] =
 	{ "shift", "meta", "ctrl" }, // LOWERMOD+ALTISMETA+LONGMOD
 };
 
-size_t
-termo_strfkey (termo_t *tk, char *buffer, size_t len,
-	termo_key_t *key, termo_format_t format)
+typedef const char *(*strfkey_emit_fn) (termo_t *, termo_key_t *, char *);
+
+static size_t
+termo_strfkey_generic (termo_t *tk, char *buffer, size_t len,
+	termo_key_t *key, termo_format_t format, strfkey_emit_fn emit)
 {
 	size_t pos = 0;
 	int l = 0;
@@ -1524,10 +1536,11 @@ termo_strfkey (termo_t *tk, char *buffer, size_t len,
 	switch (key->type)
 	{
 	case TERMO_TYPE_KEY:
-		if (!key->multibyte[0]) // In case of user-supplied key structures
-			fill_multibyte (tk, key);
-		l = snprintf (buffer + pos, len - pos, "%s", key->multibyte);
+	{
+		char buf[MB_LEN_MAX + 1];
+		l = snprintf (buffer + pos, len - pos, "%s", emit (tk, key, buf));
 		break;
+	}
 	case TERMO_TYPE_KEYSYM:
 	{
 		const char *name = termo_get_keyname (tk, key->code.sym);
@@ -1596,9 +1609,74 @@ termo_strfkey (termo_t *tk, char *buffer, size_t len,
 	return pos;
 }
 
-const char *
-termo_strpkey (termo_t *tk,
-	const char *str, termo_key_t *key, termo_format_t format)
+static const char *
+strfkey_emit_locale (termo_t *tk, termo_key_t *key, char buf[])
+{
+	(void) buf;
+	if (!key->multibyte[0]) // In case of user-supplied key structures
+		fill_multibyte (tk, key);
+	return key->multibyte;
+}
+
+size_t
+termo_strfkey (termo_t *tk, char *buffer, size_t len,
+	termo_key_t *key, termo_format_t format)
+{
+	return termo_strfkey_generic
+		(tk, buffer, len, key, format, strfkey_emit_locale);
+}
+
+static inline size_t
+utf8_seqlen (uint32_t codepoint)
+{
+	if (codepoint < 0x0000080) return 1;
+	if (codepoint < 0x0000800) return 2;
+	if (codepoint < 0x0010000) return 3;
+	if (codepoint < 0x0200000) return 4;
+	if (codepoint < 0x4000000) return 5;
+	return 6;
+}
+
+static const char *
+strfkey_emit_utf8 (termo_t *tk, termo_key_t *key, char buf[])
+{
+	(void) tk;
+	uint32_t codepoint = key->code.codepoint;
+	int nbytes = utf8_seqlen (codepoint);
+	buf[nbytes] = 0;
+
+	// This is easier done backwards
+	for (int b = nbytes; b-- > 1; codepoint >>= 6)
+		buf[b] = 0x80 | (codepoint & 0x3f);
+
+	switch (nbytes)
+	{
+	case 1: buf[0] =        (codepoint & 0x7f); break;
+	case 2: buf[0] = 0xc0 | (codepoint & 0x1f); break;
+	case 3: buf[0] = 0xe0 | (codepoint & 0x0f); break;
+	case 4: buf[0] = 0xf0 | (codepoint & 0x07); break;
+	case 5: buf[0] = 0xf8 | (codepoint & 0x03); break;
+	case 6: buf[0] = 0xfc | (codepoint & 0x01); break;
+	}
+	return buf;
+}
+
+size_t
+termo_strfkey_utf8 (termo_t *tk, char *buffer, size_t len,
+	termo_key_t *key, termo_format_t format)
+{
+	return termo_strfkey_generic
+		(tk, buffer, len, key, format, strfkey_emit_utf8);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+typedef termo_result_t (*strpkey_parse_fn)
+	(termo_t *, const unsigned char *, size_t, uint32_t *, size_t *);
+
+static const char *
+termo_strpkey_generic (termo_t *tk, const char *str, termo_key_t *key,
+	termo_format_t format, strpkey_parse_fn parse)
 {
 	struct modnames *mods = &modnames[
 		!!(format & TERMO_FORMAT_LONGMOD) +
@@ -1609,8 +1687,8 @@ termo_strpkey (termo_t *tk,
 
 	if ((format & TERMO_FORMAT_CARETCTRL) && str[0] == '^' && str[1])
 	{
-		str = termo_strpkey (tk,
-			str + 1, key, format & ~TERMO_FORMAT_CARETCTRL);
+		str = termo_strpkey_generic (tk,
+			str + 1, key, format & ~TERMO_FORMAT_CARETCTRL, parse);
 
 		if (!str
 		 || key->type != TERMO_TYPE_KEY
@@ -1660,7 +1738,7 @@ termo_strpkey (termo_t *tk,
 		str += snbytes;
 	}
 	// Multibyte must be last
-	else if (parse_multibyte (tk, (unsigned const char *) str, strlen (str),
+	else if (parse (tk, (unsigned const char *) str, strlen (str),
 		&key->code.codepoint, &nbytes) == TERMO_RES_KEY)
 	{
 		key->type = TERMO_TYPE_KEY;
@@ -1675,6 +1753,45 @@ termo_strpkey (termo_t *tk,
 	return (char *) str;
 }
 
+const char *
+termo_strpkey (termo_t *tk,
+	const char *str, termo_key_t *key, termo_format_t format)
+{
+	return termo_strpkey_generic (tk, str, key, format, parse_multibyte);
+}
+
+static termo_result_t
+parse_utf8 (termo_t *tk, const unsigned char *bytes, size_t len,
+	uint32_t *cp, size_t *nbytep)
+{
+	(void) tk;
+	size_t nbytes = parse_utf8_fast (bytes, len, cp);
+	if (nbytes == 0)
+		return TERMO_RES_AGAIN;
+
+	// Check for overlong sequences
+	if (nbytes > utf8_seqlen (*cp))
+		*cp = UTF8_INVALID;
+
+	// Check for UTF-16 surrogates or invalid *cps
+	if ((*cp >= 0xD800 && *cp <= 0xDFFF)
+	 || *cp == 0xFFFE
+	 || *cp == 0xFFFF)
+		*cp = UTF8_INVALID;
+
+	*nbytep = nbytes;
+	return TERMO_RES_KEY;
+}
+
+const char *
+termo_strpkey_utf8 (termo_t *tk,
+	const char *str, termo_key_t *key, termo_format_t format)
+{
+	return termo_strpkey_generic (tk, str, key, format, parse_utf8);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
 int
 termo_keycmp (termo_t *tk,
 	const termo_key_t *key1p, const termo_key_t *key2p)
diff --git a/termo.h b/termo.h
index 9df3e73..f32d5f8 100644
--- a/termo.h
+++ b/termo.h
@@ -299,6 +299,11 @@ size_t termo_strfkey (termo_t *tk, char *buffer, size_t len,
 const char *termo_strpkey (termo_t *tk, const char *str,
 	termo_key_t *key, termo_format_t format);
 
+size_t termo_strfkey_utf8 (termo_t *tk, char *buffer, size_t len,
+	termo_key_t *key, termo_format_t format);
+const char *termo_strpkey_utf8 (termo_t *tk, const char *str,
+	termo_key_t *key, termo_format_t format);
+
 int termo_keycmp (termo_t *tk,
 	const termo_key_t *key1, const termo_key_t *key2);