85 lines
2.7 KiB
Bash
85 lines
2.7 KiB
Bash
|
#!/bin/sh
|
||
|
# Base URL of the Unicode public data directory. It carries no trailing slash
# because every use below appends its own "/"-prefixed path; a trailing slash
# here would put "//" into each request URL.
u=https://www.unicode.org/Public
|
||
|
|
||
|
# Download a Unicode data file and filter it with the given category
# expression, producing a list of possibly duplicate codepoints in decimal
# format, one per line.
#   $1: Perl regex alternation of accepted property values, e.g. 'Me|Mn|Cf';
#       it is spliced verbatim into the matching pattern
#   $2: URL of the data file to fetch
retrieve() {
	# --fail makes curl report an HTTP error on stderr and emit no body,
	# rather than feeding an error page into the filter, which would
	# silently reduce it to an empty list.
	# The filter drops comments and spaces, keeps "XXXX;value" and
	# "XXXX..YYYY;value" records, and expands ranges to single codepoints.
	curl --fail --silent --show-error --location "$2" | perl -lne '
		s/#.*//; s/ //g;
		next unless /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?;('"$1"')$/;
		print for hex $1 .. hex ($2 // $1);'
}
|
||
|
|
||
|
togo() {
|
||
|
sort -nu | perl -lne '
|
||
|
sub flush { printf "{0x%04x, 0x%04x},\n", $first, $last }
|
||
|
BEGIN { $first = $last = <> }
|
||
|
if ($_ != $last + 1) { flush; $first = $_; }
|
||
|
$last = $_;
|
||
|
END { flush if defined $first }' | column -xc 72
|
||
|
}
|
||
|
|
||
|
gofmt <<EOF
|
||
|
// Code generated by running "go generate" in janouch.name/haven. DO NOT EDIT.
|
||
|
|
||
|
package $GOPACKAGE
|
||
|
|
||
|
// RuneWidth returns the column width of Go runes, using an algorithm and tables
|
||
|
// derived from https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c:
|
||
|
// - The null character (U+0000) has a column width of 0.
|
||
|
// - Other C0/C1 control characters and DEL will lead to a return value of -1.
|
||
|
// - Non-spacing and enclosing combining characters (general category code
|
||
|
// Mn or Me in the Unicode database) have a column width of 0.
|
||
|
// - SOFT HYPHEN (U+00AD) has a column width of 1.
|
||
|
// - Other format characters (general category code Cf in the Unicode database)
|
||
|
// and ZERO WIDTH SPACE (U+200B) have a column width of 0.
|
||
|
// - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) have
|
||
|
// a column width of 0.
|
||
|
// - Spacing characters in the East Asian Wide (W) or East Asian Full-width (F)
|
||
|
// category as defined in Unicode UAX #11 have a column width of 2.
|
||
|
// - All remaining characters (including all printable ISO 8859-1 and WGL4
|
||
|
// characters, Unicode control characters, etc.) have a column width of 1.
|
||
|
//
|
||
|
// Local changes:
|
||
|
// - Tables are generated from the latest available version of Unicode.
|
||
|
func RuneWidth(r rune) int {
|
||
|
switch {
|
||
|
case r == 0:
|
||
|
return 0
|
||
|
case r < 32 || r >= 0x7f && r < 0xa0:
|
||
|
return -1
|
||
|
case zeroWidthRunes.contains(r):
|
||
|
return 0
|
||
|
case fullWidthRunes.contains(r):
|
||
|
return 2
|
||
|
}
|
||
|
return 1
|
||
|
}
|
||
|
|
||
|
// runeRange is an inclusive interval of runes.
type runeRange struct{ first, last rune }

// runeRangeTable is a list of rune ranges that contains binary-searches,
// so entries are expected to be sorted in ascending order without overlaps.
type runeRangeTable []runeRange

// contains reports whether r lies within any range of the table.
func (t runeRangeTable) contains(r rune) bool {
	lo, hi := 0, len(t)-1
	for lo <= hi {
		mid := (lo + hi) / 2
		switch entry := t[mid]; {
		case entry.last < r:
			// r can only be in a later range.
			lo = mid + 1
		case entry.first > r:
			// r can only be in an earlier range.
			hi = mid - 1
		default:
			return true
		}
	}
	return false
}
|
||
|
|
||
|
var zeroWidthRunes = runeRangeTable{
|
||
|
$({ retrieve 'Me|Mn|Cf' $u/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt;
|
||
|
seq 0x1160 0x11ff; echo $((0x200B)); } | grep -xv $((0x00AD)) | togo)
|
||
|
}
|
||
|
|
||
|
var fullWidthRunes = runeRangeTable{
|
||
|
$(retrieve 'W|F' $u/UCD/latest/ucd/EastAsianWidth.txt | togo)
|
||
|
}
|
||
|
EOF
|