Tolerate cut-off UTF-8 messages

I've had this happen to me on Russian channels, and it's highly
annoying because you lose the entire message.  In contrast,
this change at worst screws up the last few characters of it.

Closes #2
This commit is contained in:
Přemysl Eric Janouch 2020-10-12 23:32:58 +02:00
parent 73c3ca3633
commit 9d8a7a10d0
Signed by: p
GPG Key ID: A0420B94F92B9493
1 changed files with 22 additions and 1 deletions

View File

@ -3022,7 +3022,7 @@ irc_skip_statusmsg (struct server *s, const char *target)
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// As of 2015, everything should be in UTF-8. And if it's not, we'll decode it
// As of 2020, everything should be in UTF-8. And if it's not, we'll decode it
// as ISO Latin 1. This function should not be called on the whole message.
static char *
irc_to_utf8 (const char *text)
@ -7811,9 +7811,30 @@ irc_process_numeric (struct server *s,
strv_free (&copy);
}
static void
irc_fix_cut_off_utf8 (char **line)
{
// A variation on utf8_validate(), we need to detect the -2 return
const char *p = *line, *end = strchr (p, 0);
int32_t codepoint;
while ((codepoint = utf8_decode (&p, end - p)) >= 0
&& codepoint <= 0x10FFFF /* TODO: move this check into a function */)
;
if (codepoint != -2)
return;
struct str fixed_up = str_make ();
str_append_data (&fixed_up, *line, p - *line);
str_append (&fixed_up, "\xEF\xBF\xBD" /* U+FFFD */);
cstr_set (line, str_steal (&fixed_up));
}
static void
irc_process_message (const struct irc_message *msg, struct server *s)
{
if (msg->params.len)
irc_fix_cut_off_utf8 (&msg->params.vector[msg->params.len - 1]);
// TODO: make use of IRCv3.2 server-time (with fallback to unixtime_msec())
// -> change all calls to log_{server,nick,outcoming,ctcp}*() to take
// an extra argument specifying time