wdmtg/compound-text.c

634 lines
25 KiB
C
Raw Permalink Normal View History

//
// compound-text.c: partial X11 COMPOUND_TEXT to UCS-4 transcoder
//
// Copyright (c) 2020, Přemysl Eric Janouch <p@janouch.name>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
//
#include <stdbool.h>
#include <stdlib.h> // malloc, free, NULL, size_t
#include <glib.h> // g_utf8_*, g_ucs4_to_utf8
// Character sets that can be transcoded; each value selects one row
// of compound_text_tables.
//
// None of the full Chinese, Japanese, or Korean character sets are supported,
// and their characters will be replaced by a lot of funny question marks
enum compound_text_encoding {
COMPOUND_TEXT_ASCII,
COMPOUND_TEXT_ISO8859_1_GR,
COMPOUND_TEXT_ISO8859_2_GR,
COMPOUND_TEXT_ISO8859_3_GR,
COMPOUND_TEXT_ISO8859_4_GR,
COMPOUND_TEXT_ISO8859_5_GR,
COMPOUND_TEXT_ISO8859_6_GR,
COMPOUND_TEXT_ISO8859_7_GR,
COMPOUND_TEXT_ISO8859_8_GR,
COMPOUND_TEXT_ISO8859_9_GR,
COMPOUND_TEXT_ISO8859_10_GR,
COMPOUND_TEXT_ISO8859_13_GR,
COMPOUND_TEXT_ISO8859_14_GR,
COMPOUND_TEXT_ISO8859_15_GR,
COMPOUND_TEXT_ISO8859_16_GR,
COMPOUND_TEXT_JIS_X0201_GR,
COMPOUND_TEXT_JIS_X0201_GL,
// Number of encodings above, used to size the table; must stay last
COMPOUND_TEXT_COUNT
};
/*
Generated from glibc charset data, based on the following script:
for enc in ISO-8859-{1,2,3,4,5,6,7,8,9,10,13,14,15,16} JIS_X0201; do
echo "[COMPOUND_TEXT_$enc] ="
zcat /usr/share/i18n/charmaps/$enc.gz | \
perl -nle '$x{hex($2)} = "0x$1" if m|^<U(.*?)> +/x(..) +.*|;
sub tbl { join ", ", map { $x{$_} || "0x0000" } @_ }
END { print tbl(0x20..0x7F); print tbl(0xa0..0xFF); }'
done | fmt -sw70 | sed 's|^0|\t&|; s|^|\t|'
*/
// Unicode code points for each supported character set, one row of 96 cells
// per encoding.  GL rows are indexed by (octet - 0x20), GR rows
// by (octet - 0xA0).  A cell of 0x0000 means the position is unassigned.
// The data is never modified at run time, hence const.
static const unsigned short compound_text_tables[COMPOUND_TEXT_COUNT][96] = {
	[COMPOUND_TEXT_ASCII] = {
		0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
		0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
		0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
		0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
		0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
		0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
		0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
		0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
		0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
		0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
		0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
		0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F
	},
	[COMPOUND_TEXT_ISO8859_1_GR] = {
		0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
		0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
		0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
		0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
		0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
		0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
		0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
		0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,
		0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
		0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
		0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
		0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF
	},
	[COMPOUND_TEXT_ISO8859_2_GR] = {
		0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7,
		0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B,
		0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7,
		0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C,
		0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7,
		0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E,
		0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7,
		0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF,
		0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7,
		0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F,
		0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7,
		0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9
	},
	[COMPOUND_TEXT_ISO8859_3_GR] = {
		0x00A0, 0x0126, 0x02D8, 0x00A3, 0x00A4, 0x0000, 0x0124, 0x00A7,
		0x00A8, 0x0130, 0x015E, 0x011E, 0x0134, 0x00AD, 0x0000, 0x017B,
		0x00B0, 0x0127, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x0125, 0x00B7,
		0x00B8, 0x0131, 0x015F, 0x011F, 0x0135, 0x00BD, 0x0000, 0x017C,
		0x00C0, 0x00C1, 0x00C2, 0x0000, 0x00C4, 0x010A, 0x0108, 0x00C7,
		0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
		0x0000, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x0120, 0x00D6, 0x00D7,
		0x011C, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x016C, 0x015C, 0x00DF,
		0x00E0, 0x00E1, 0x00E2, 0x0000, 0x00E4, 0x010B, 0x0109, 0x00E7,
		0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
		0x0000, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x0121, 0x00F6, 0x00F7,
		0x011D, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x016D, 0x015D, 0x02D9
	},
	[COMPOUND_TEXT_ISO8859_4_GR] = {
		0x00A0, 0x0104, 0x0138, 0x0156, 0x00A4, 0x0128, 0x013B, 0x00A7,
		0x00A8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00AD, 0x017D, 0x00AF,
		0x00B0, 0x0105, 0x02DB, 0x0157, 0x00B4, 0x0129, 0x013C, 0x02C7,
		0x00B8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014A, 0x017E, 0x014B,
		0x0100, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x012E,
		0x010C, 0x00C9, 0x0118, 0x00CB, 0x0116, 0x00CD, 0x00CE, 0x012A,
		0x0110, 0x0145, 0x014C, 0x0136, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
		0x00D8, 0x0172, 0x00DA, 0x00DB, 0x00DC, 0x0168, 0x016A, 0x00DF,
		0x0101, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x012F,
		0x010D, 0x00E9, 0x0119, 0x00EB, 0x0117, 0x00ED, 0x00EE, 0x012B,
		0x0111, 0x0146, 0x014D, 0x0137, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
		0x00F8, 0x0173, 0x00FA, 0x00FB, 0x00FC, 0x0169, 0x016B, 0x02D9
	},
	[COMPOUND_TEXT_ISO8859_5_GR] = {
		0x00A0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,
		0x0408, 0x0409, 0x040A, 0x040B, 0x040C, 0x00AD, 0x040E, 0x040F,
		0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
		0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
		0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
		0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
		0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
		0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
		0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
		0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F,
		0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457,
		0x0458, 0x0459, 0x045A, 0x045B, 0x045C, 0x00A7, 0x045E, 0x045F
	},
	[COMPOUND_TEXT_ISO8859_6_GR] = {
		0x00A0, 0x0000, 0x0000, 0x0000, 0x00A4, 0x0000, 0x0000, 0x0000,
		0x0000, 0x0000, 0x0000, 0x0000, 0x060C, 0x00AD, 0x0000, 0x0000,
		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
		0x0000, 0x0000, 0x0000, 0x061B, 0x0000, 0x0000, 0x0000, 0x061F,
		0x0000, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627,
		0x0628, 0x0629, 0x062A, 0x062B, 0x062C, 0x062D, 0x062E, 0x062F,
		0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637,
		0x0638, 0x0639, 0x063A, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
		0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647,
		0x0648, 0x0649, 0x064A, 0x064B, 0x064C, 0x064D, 0x064E, 0x064F,
		0x0650, 0x0651, 0x0652, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
	},
	[COMPOUND_TEXT_ISO8859_7_GR] = {
		0x00A0, 0x2018, 0x2019, 0x00A3, 0x20AC, 0x20AF, 0x00A6, 0x00A7,
		0x00A8, 0x00A9, 0x037A, 0x00AB, 0x00AC, 0x00AD, 0x0000, 0x2015,
		0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384, 0x0385, 0x0386, 0x00B7,
		0x0388, 0x0389, 0x038A, 0x00BB, 0x038C, 0x00BD, 0x038E, 0x038F,
		0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
		0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F,
		0x03A0, 0x03A1, 0x0000, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7,
		0x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF,
		0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7,
		0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
		0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7,
		0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0x0000
	},
	[COMPOUND_TEXT_ISO8859_8_GR] = {
		0x00A0, 0x0000, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
		0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
		0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
		0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x0000,
		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2017,
		0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7,
		0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
		0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7,
		0x05E8, 0x05E9, 0x05EA, 0x0000, 0x0000, 0x200E, 0x200F, 0x0000
	},
	[COMPOUND_TEXT_ISO8859_9_GR] = {
		0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
		0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
		0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
		0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
		0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
		0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
		0x011E, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
		0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0130, 0x015E, 0x00DF,
		0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
		0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
		0x011F, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
		0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0131, 0x015F, 0x00FF
	},
	[COMPOUND_TEXT_ISO8859_10_GR] = {
		0x00A0, 0x0104, 0x0112, 0x0122, 0x012A, 0x0128, 0x0136, 0x00A7,
		0x013B, 0x0110, 0x0160, 0x0166, 0x017D, 0x00AD, 0x016A, 0x014A,
		0x00B0, 0x0105, 0x0113, 0x0123, 0x012B, 0x0129, 0x0137, 0x00B7,
		0x013C, 0x0111, 0x0161, 0x0167, 0x017E, 0x2015, 0x016B, 0x014B,
		0x0100, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x012E,
		0x010C, 0x00C9, 0x0118, 0x00CB, 0x0116, 0x00CD, 0x00CE, 0x00CF,
		0x00D0, 0x0145, 0x014C, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x0168,
		0x00D8, 0x0172, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,
		0x0101, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x012F,
		0x010D, 0x00E9, 0x0119, 0x00EB, 0x0117, 0x00ED, 0x00EE, 0x00EF,
		0x00F0, 0x0146, 0x014D, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x0169,
		0x00F8, 0x0173, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x0138
	},
	[COMPOUND_TEXT_ISO8859_13_GR] = {
		0x00A0, 0x201D, 0x00A2, 0x00A3, 0x00A4, 0x201E, 0x00A6, 0x00A7,
		0x00D8, 0x00A9, 0x0156, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00C6,
		0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x201C, 0x00B5, 0x00B6, 0x00B7,
		0x00F8, 0x00B9, 0x0157, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00E6,
		0x0104, 0x012E, 0x0100, 0x0106, 0x00C4, 0x00C5, 0x0118, 0x0112,
		0x010C, 0x00C9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012A, 0x013B,
		0x0160, 0x0143, 0x0145, 0x00D3, 0x014C, 0x00D5, 0x00D6, 0x00D7,
		0x0172, 0x0141, 0x015A, 0x016A, 0x00DC, 0x017B, 0x017D, 0x00DF,
		0x0105, 0x012F, 0x0101, 0x0107, 0x00E4, 0x00E5, 0x0119, 0x0113,
		0x010D, 0x00E9, 0x017A, 0x0117, 0x0123, 0x0137, 0x012B, 0x013C,
		0x0161, 0x0144, 0x0146, 0x00F3, 0x014D, 0x00F5, 0x00F6, 0x00F7,
		0x0173, 0x0142, 0x015B, 0x016B, 0x00FC, 0x017C, 0x017E, 0x2019
	},
	[COMPOUND_TEXT_ISO8859_14_GR] = {
		0x00A0, 0x1E02, 0x1E03, 0x00A3, 0x010A, 0x010B, 0x1E0A, 0x00A7,
		0x1E80, 0x00A9, 0x1E82, 0x1E0B, 0x1EF2, 0x00AD, 0x00AE, 0x0178,
		0x1E1E, 0x1E1F, 0x0120, 0x0121, 0x1E40, 0x1E41, 0x00B6, 0x1E56,
		0x1E81, 0x1E57, 0x1E83, 0x1E60, 0x1EF3, 0x1E84, 0x1E85, 0x1E61,
		0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
		0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
		0x0174, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x1E6A,
		0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x0176, 0x00DF,
		0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
		0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
		0x0175, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x1E6B,
		0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x0177, 0x00FF
	},
	[COMPOUND_TEXT_ISO8859_15_GR] = {
		0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x20AC, 0x00A5, 0x0160, 0x00A7,
		0x0161, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
		0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x017D, 0x00B5, 0x00B6, 0x00B7,
		0x017E, 0x00B9, 0x00BA, 0x00BB, 0x0152, 0x0153, 0x0178, 0x00BF,
		0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
		0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
		0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
		0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,
		0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
		0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
		0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
		0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF
	},
	[COMPOUND_TEXT_ISO8859_16_GR] = {
		0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7,
		0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B,
		0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7,
		0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C,
		0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7,
		0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
		0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A,
		0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF,
		0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7,
		0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
		0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B,
		0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF
	},
	[COMPOUND_TEXT_JIS_X0201_GR] = {
		0x0000, 0x3002, 0x300C, 0x300D, 0x3001, 0x30FB, 0x30F2, 0x30A1,
		0x30A3, 0x30A5, 0x30A7, 0x30A9, 0x30E3, 0x30E5, 0x30E7, 0x30C3,
		0x30FC, 0x30A2, 0x30A4, 0x30A6, 0x30A8, 0x30AA, 0x30AB, 0x30AD,
		0x30AF, 0x30B1, 0x30B3, 0x30B5, 0x30B7, 0x30B9, 0x30BB, 0x30BD,
		0x30BF, 0x30C1, 0x30C4, 0x30C6, 0x30C8, 0x30CA, 0x30CB, 0x30CC,
		0x30CD, 0x30CE, 0x30CF, 0x30D2, 0x30D5, 0x30D8, 0x30DB, 0x30DE,
		0x30DF, 0x30E0, 0x30E1, 0x30E2, 0x30E4, 0x30E6, 0x30E8, 0x30E9,
		0x30EA, 0x30EB, 0x30EC, 0x30ED, 0x30EF, 0x30F3, 0x309B, 0x309C,
		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
		0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
	},
	[COMPOUND_TEXT_JIS_X0201_GL] = {
		0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
		0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
		0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
		0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
		0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
		0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
		0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
		0x0058, 0x0059, 0x005A, 0x005B, 0x00A5, 0x005D, 0x005E, 0x005F,
		0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
		0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
		0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
		0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x203E, 0x007F
	},
};
// Parser state shared by all the transcoding helpers: a read cursor over
// the input octets, a write cursor into the output array of code points,
// and the character sets currently designated for the GL (0x20-0x7F)
// and GR (0xA0-0xFF) octet ranges.
struct compound_text_state {
const char *in; // Current input iterator
const char *end; // End of input
int *out; // Current result iterator
// Non-negative values are enum compound_text_encoding members;
// a negative value -N marks an unsupported set using N octets per character
int gl_encoding; // Current GL encoding or -N for unknown N-octet
int gr_encoding; // Current GR encoding or -N for unknown N-octet
};
// Look at the next input octet without consuming it.
// Stores the octet in *c and returns true, or returns false and leaves *c
// untouched once the input is exhausted.
static bool
compound_text_peek(struct compound_text_state *s, unsigned char *c)
{
	if (s->in < s->end) {
		*c = *s->in;
		return true;
	}
	return false;
}
// Consume the next input octet into *c.
// Returns false, consuming nothing and leaving *c untouched,
// when no input remains.
static bool
compound_text_read(struct compound_text_state *s, unsigned char *c)
{
	bool available = compound_text_peek(s, c);
	if (available)
		s->in++;
	return available;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// When *c is one of the octets 0x21 through 0x23, consume any following
// octets in the range 0x20-0x2F and leave the first octet outside
// of that range (already consumed) in *c.  Any other *c is left as it is.
// Returns false if the input ends before such an octet is found.
static bool
compound_text_skip_I(struct compound_text_state *s, unsigned char *c)
{
	if (*c < 0x21 || *c > 0x23)
		return true;

	for (;;) {
		if (!compound_text_read(s, c))
			return false;
		if (*c < 0x20 || *c > 0x2f)
			return true;
	}
}
// Finish the designation of an unrecognized single-octet character set:
// c is the octet following the designator, optionally starting a run
// of intermediate octets.  A final octet in 0x40-0x7E stores -1
// in *encoding; anything else, including truncated input, is an error.
static bool
compound_text_unknown_1(struct compound_text_state *s, unsigned char c,
	int *encoding)
{
	if (!compound_text_skip_I(s, &c))
		return false;
	if (c < 0x40 || c > 0x7e)
		return false;

	*encoding = -1;
	return true;
}
// Finish the designation of an unrecognized multi-octet character set:
// c is the octet following the designator, optionally starting a run
// of intermediate octets.  The final octet selects the width: 0x40-0x5F
// stores -2 in *encoding, 0x60-0x6F stores -3.  Finals in 0x70-0x7E
// ("4 or more" octets) and everything else are rejected.
static bool
compound_text_unknown_N(struct compound_text_state *s, unsigned char c,
	int *encoding)
{
	if (!compound_text_skip_I(s, &c))
		return false;
	if (c < 0x40 || c > 0x6f)
		return false;

	*encoding = c < 0x60 ? -2 : -3;
	return true;
}
// Decode an embedded UTF-8 section, up to and including its terminating
// ESC 0x25 0x40 sequence, appending each character to the output.
// Returns false on invalid UTF-8 and when the terminator is missing.
static bool
compound_text_utf8(struct compound_text_state *s)
{
	// The specification isn't entirely clear about termination,
	// so only accept sections that are properly closed: at least the three
	// terminator octets must always remain
	while (s->end - s->in >= 3) {
		const char *p = s->in;
		if (p[0] == 0x1b && p[1] == 0x25 && p[2] == 0x40) {
			s->in = p + 3;
			return true;
		}

		gunichar decoded = g_utf8_get_char_validated(p, s->end - p);
		if (decoded == (gunichar) -1 || decoded == (gunichar) -2)
			return false;

		// Don't allow circumventing the rules with this stupid mode:
		// of the C0 controls, only tab and newline get through
		bool permitted = decoded >= 0x20 || decoded == '\t' || decoded == '\n';
		*s->out++ = permitted ? decoded : 0xFFFD;
		s->in = g_utf8_next_char(p);
	}
	return false;
}
// Process the remainder of an escape sequence; the introducing ESC octet
// has already been consumed by the caller.
//
// Character set designations update s->gl_encoding or s->gr_encoding,
// an embedded UTF-8 section is decoded straight to the output, and
// an extended segment is skipped whole and replaced by a single U+FFFD.
//
// Returns false on truncated input and on unsupported or malformed
// sequences.
static bool
compound_text_ESC(struct compound_text_state *s)
{
	unsigned char c;
	if (!compound_text_read(s, &c)) {
		return false;
	} else if (c == 0x28 /* GL 94 */) {
		if (!compound_text_read(s, &c))
			return false;
		else if (c == 0x42)
			s->gl_encoding = COMPOUND_TEXT_ASCII;
		else if (c == 0x4a)
			s->gl_encoding = COMPOUND_TEXT_JIS_X0201_GL;
		else if (!compound_text_unknown_1(s, c, &s->gl_encoding))
			return false;
	} else if (c == 0x29 /* GR 94 */) {
		if (!compound_text_read(s, &c))
			return false;
		else if (c == 0x49)
			s->gr_encoding = COMPOUND_TEXT_JIS_X0201_GR;
		else if (!compound_text_unknown_1(s, c, &s->gr_encoding))
			return false;
	} else if (c == 0x2d /* GR 96 */) {
		// Note that the final octets do not follow the ISO 8859 part numbers
		if (!compound_text_read(s, &c))
			return false;
		else if (c == 0x41)
			s->gr_encoding = COMPOUND_TEXT_ISO8859_1_GR;
		else if (c == 0x42)
			s->gr_encoding = COMPOUND_TEXT_ISO8859_2_GR;
		else if (c == 0x43)
			s->gr_encoding = COMPOUND_TEXT_ISO8859_3_GR;
		else if (c == 0x44)
			s->gr_encoding = COMPOUND_TEXT_ISO8859_4_GR;
		else if (c == 0x46)
			s->gr_encoding = COMPOUND_TEXT_ISO8859_7_GR;
		else if (c == 0x47)
			s->gr_encoding = COMPOUND_TEXT_ISO8859_6_GR;
		else if (c == 0x48)
			s->gr_encoding = COMPOUND_TEXT_ISO8859_8_GR;
		else if (c == 0x4c)
			s->gr_encoding = COMPOUND_TEXT_ISO8859_5_GR;
		else if (c == 0x4d)
			s->gr_encoding = COMPOUND_TEXT_ISO8859_9_GR;
		else if (c == 0x56)
			s->gr_encoding = COMPOUND_TEXT_ISO8859_10_GR;
		else if (c == 0x59)
			s->gr_encoding = COMPOUND_TEXT_ISO8859_13_GR;
		else if (c == 0x5f)
			s->gr_encoding = COMPOUND_TEXT_ISO8859_14_GR;
		else if (c == 0x62)
			s->gr_encoding = COMPOUND_TEXT_ISO8859_15_GR;
		else if (c == 0x66)
			s->gr_encoding = COMPOUND_TEXT_ISO8859_16_GR;
		else if (!compound_text_unknown_1(s, c, &s->gr_encoding))
			return false;
	} else if (c == 0x24 /* ^N */) {
		if (!compound_text_read(s, &c)) {
			return false;
		} else if (c == 0x28 /* GL 94^N */) {
			if (!compound_text_read(s, &c) ||
				!compound_text_unknown_N(s, c, &s->gl_encoding))
				return false;
		} else if (c == 0x29 /* GR 94^N */) {
			if (!compound_text_read(s, &c) ||
				!compound_text_unknown_N(s, c, &s->gr_encoding))
				return false;
		} else {
			return false;
		}
	} else if (c == 0x25 /* Non-Standard Character Set Encodings */) {
		if (!compound_text_read(s, &c))
			return false;
		if (c == 0x47 /* from version 1.1.xf86.1 */)
			return compound_text_utf8(s);
		if (c != 0x2f || !compound_text_read(s, &c) || c < 0x30 || c > 0x34)
			return false;

		// The segment length follows in two octets, M and L, which both
		// have their top bit set and carry seven bits of payload each:
		// length = (M - 128) * 128 + (L - 128)
		if (!compound_text_read(s, &c) || !(c & 0x80))
			return false;
		int skip_h = c - 128;
		if (!compound_text_read(s, &c) || !(c & 0x80))
			return false;
		int skip_l = c - 128;

		for (int skip = skip_h << 7 | skip_l; skip--; ) {
			if (!compound_text_read(s, &c))
				return false;
		}

		// TODO: this would deserve more obvious handling,
		// we're replacing an entire sequence with just one character.
		// For that, we'd need to parse the sequence, though.
		*s->out++ = 0xFFFD;
	} else if (c == 0x23 /* Extensions, starting control sequences */) {
		// NOTE: major version = c - 0x20 + 1
		if (!compound_text_read(s, &c) || c < 0x20 || c > 0x2f)
			return false;
		if (!compound_text_read(s, &c))
			return false;
		else if (c == 0x30)
			return false; // not supported: ignoring extensions is OK
		else if (c == 0x31)
			return false; // not supported: ignoring extensions is not OK
		else
			return false;
	} else if (c >= 0x20 && c <= 0x2f /* extension, Intermediate */) {
		return false; // not supported
	} else if (c >= 0x30 && c <= 0x7e /* extension, Final */) {
		return false; // not supported
	}

	// NOTE(review): octets below 0x20 and above 0x7E following ESC
	// end up here, meaning the pair is silently dropped
	return true;
}
// Process the remainder of a control sequence; the introducing CSI octet
// has already been consumed by the caller.  Only the three directionality
// sequences are understood, and are translated to Unicode embedding
// controls.  Everything else, including truncated input, yields false.
static bool
compound_text_CSI(struct compound_text_state *s)
{
	unsigned char c;
	if (!compound_text_read(s, &c))
		return false;

	int embedding;
	switch (c) {
	case 0x31:
		embedding = 0x202A; // LRE
		break;
	case 0x32:
		embedding = 0x202B; // RLE
		break;
	case 0x5d:
		*s->out++ = 0x202C; // PDF
		return true;
	default:
		// P (0x30-0x3F), Intermediate (0x20-0x2F), and Final (0x40-0x7E)
		// octets of extensions are not supported, the rest is invalid
		return false;
	}

	// Both begin-direction sequences are closed by the same final octet
	if (!compound_text_read(s, &c) || c != 0x5d)
		return false;

	*s->out++ = embedding;
	return true;
}
// Transcode one character introduced by the GL octet c (0x20-0x7F),
// using the current GL character set.  Unassigned positions and characters
// of unknown character sets come out as U+FFFD.
// Returns false when a multi-octet character is truncated or continues
// with an octet outside of the GL range.
static bool
compound_text_GL(struct compound_text_state *s, unsigned char c)
{
	int codepoint = 0;
	if (s->gl_encoding >= 0)
		codepoint = compound_text_tables[s->gl_encoding][c - 0x20];
	*s->out = codepoint ? codepoint : 0xFFFD;

	// An unknown N-octet set is stored as -N: swallow the remaining
	// N - 1 octets of the character
	for (int remaining = -s->gl_encoding - 1; remaining > 0; remaining--) {
		if (!compound_text_read(s, &c) || c < 0x20 || c >= 0x80)
			return false;
	}

	s->out++;
	return true;
}
// Transcode one character introduced by the GR octet c (0xA0-0xFF),
// using the current GR character set.  Unassigned positions and characters
// of unknown character sets come out as U+FFFD.
// Returns false when a multi-octet character is truncated or continues
// with an octet below 0xA0.
static bool
compound_text_GR(struct compound_text_state *s, unsigned char c)
{
	int codepoint = 0;
	if (s->gr_encoding >= 0)
		codepoint = compound_text_tables[s->gr_encoding][c - 0xa0];
	*s->out = codepoint ? codepoint : 0xFFFD;

	// An unknown N-octet set is stored as -N: swallow the remaining
	// N - 1 octets of the character
	for (int remaining = -s->gr_encoding - 1; remaining > 0; remaining--) {
		if (!compound_text_read(s, &c) || c < 0xa0)
			return false;
	}

	s->out++;
	return true;
}
// Transcode COMPOUND_TEXT data of the given length to an array
// of UCS-4 code points terminated by a zero.
//
// Processing starts with ASCII in GL and ISO 8859-1 in GR, and stops early
// at the first NUL octet.  Tab and newline are passed through, all other
// C0 and C1 control characters are replaced with U+FFFD.
//
// Returns NULL on parse errors as well as on allocation failure.
// The caller is responsible for calling free() on the result.
int *
compound_text_to_ucs4(const char *compound_text, size_t length)
{
	// Each output code point consumes at least one input octet, so the input
	// length is a good approximation, as well as the upper bound
	size_t capacity = length + 1;
	if (!capacity)
		return NULL; // length + 1 has wrapped around

	int *result = calloc(capacity, sizeof *result);
	if (!result)
		return NULL;

	struct compound_text_state s = {
		.in = compound_text, .end = compound_text + length, .out = result,
		.gl_encoding = COMPOUND_TEXT_ASCII,
		.gr_encoding = COMPOUND_TEXT_ISO8859_1_GR,
	};

	unsigned char c;
	while (compound_text_read(&s, &c) && c != 0) {
		bool ok = true;
		if (c == '\t' || c == '\n')
			*s.out++ = c;
		else if (c == 0x1b)
			ok = compound_text_ESC(&s);
		else if (c == 0x9b)
			ok = compound_text_CSI(&s);
		else if ((c & ~0x80) < 0x20 /* C0, C1 */)
			*s.out++ = 0xFFFD;
		else if (c < 0x80)
			ok = compound_text_GL(&s, c);
		else
			ok = compound_text_GR(&s, c);

		if (!ok) {
			// TODO: consider returning partial results
			free(result);
			return NULL;
		}
	}

	*s.out++ = 0;
	return result;
}
#if COMPOUND_TEXT_SELFTEST
// Build with -DCOMPOUND_TEXT_SELFTEST `pkg-config --cflags --libs x11 glib-2.0`
#include <X11/Xlib.h>
#include <X11/Xutil.h>
// Self-test: endlessly feed random short UTF-8 strings through Xlib's
// COMPOUND_TEXT converter and back through compound_text_to_ucs4(),
// reporting anything that fails to parse or to convert back to UTF-8.
// It only returns on set-up or Xlib conversion errors.
int
main(void)
{
	Display *dpy = XOpenDisplay(NULL);
	if (!dpy) {
		g_printerr("cannot open display\n");
		return EXIT_FAILURE;
	}

	GString *s = g_string_new("");
	while (1) {
		g_string_truncate(s, 0);
		for (gsize i = 0; i < 10; i++) {
			int c = rand() & 0x10FFFF;
			if ((c < 0xD800 || c > 0xDFFF) && // GLib rejects surrogates
				(c != 0x9b) && // Xlib inserts a lone CSI (!)
				(c >= 0x20)) // not allowed or disruptive
				g_string_append_unichar(s, c);
		}

		// Negative results are hard errors that leave prop unset,
		// positive ones merely count unconvertible characters
		XTextProperty prop;
		if (Xutf8TextListToTextProperty(dpy, (char **) &s->str, 1,
				XCompoundTextStyle, &prop) < 0) {
			g_printerr("Xlib conversion failure: '%s'\n", s->str);
			return EXIT_FAILURE;
		}

		int *ucs4 = NULL; char *x = NULL;
		if (!(ucs4 = compound_text_to_ucs4((char *) prop.value, prop.nitems)))
			g_printerr("parse error '%s' -> '%s'\n", s->str, prop.value);
		else if (!(x = g_ucs4_to_utf8((gunichar *) ucs4, -1, NULL, NULL, NULL)))
			g_printerr("total failure: %s\n", prop.value);

		// x comes from GLib, ucs4 from calloc(), prop.value from Xlib
		free(ucs4); g_free(x); XFree(prop.value);
	}
}
#endif