width data generation

This commit is contained in:
Thomas Wolff 2018-03-07 23:55:52 +01:00 committed by Corinna Vinschen
parent 8e8fd6c849
commit 37132125bc
5 changed files with 1388 additions and 0 deletions

569
newlib/libc/string/WIDTH-A Normal file
View File

@ -0,0 +1,569 @@
# UAX #11: East Asian Ambiguous
# Plane 00
# Rows Positions (Cells)
00 A1 A4 A7-A8 AA AD-AE B0-B4 B6-BA BC-BF C6 D0 D7-D8 DE-E1 E6 E8-EA
00 EC-ED F0 F2-F3 F7-FA FC FE
01 01 11 13 1B 26-27 2B 31-33 38 3F-42 44 48-4B 4D 52-53 66-67 6B
01 CE D0 D2 D4 D6 D8 DA DC
02 51 61 C4 C7 C9-CB CD D0 D8-DB DD DF
03 00-6F 91-A1 A3-A9 B1-C1 C3-C9
04 01 10-4F 51
20 10 13-16 18-19 1C-1D 20-22 24-27 30 32-33 35 3B 3E 74 7F 81-84
20 AC
21 03 05 09 13 16 21-22 26 2B 53-54 5B-5E 60-6B 70-79 89 90-99 B8-B9
21 D2 D4 E7
22 00 02-03 07-08 0B 0F 11 15 1A 1D-20 23 25 27-2C 2E 34-37 3C-3D
22 48 4C 52 60-61 64-67 6A-6B 6E-6F 82-83 86-87 95 99 A5 BF
23 12
24 60-E9 EB-FF
25 00-4B 50-73 80-8F 92-95 A0-A1 A3-A9 B2-B3 B6-B7 BC-BD C0-C1 C6-C8
25 CB CE-D1 E2-E5 EF
26 05-06 09 0E-0F 1C 1E 40 42 60-61 63-65 67-6A 6C-6D 6F 9E-9F BF
26 C6-CD CF-D3 D5-E1 E3 E8-E9 EB-F1 F4 F6-F9 FB-FC FE-FF
27 3D 76-7F
2B 56-59
32 48-4F
E0 00-FF
E1 00-FF
E2 00-FF
E3 00-FF
E4 00-FF
E5 00-FF
E6 00-FF
E7 00-FF
E8 00-FF
E9 00-FF
EA 00-FF
EB 00-FF
EC 00-FF
ED 00-FF
EE 00-FF
EF 00-FF
F0 00-FF
F1 00-FF
F2 00-FF
F3 00-FF
F4 00-FF
F5 00-FF
F6 00-FF
F7 00-FF
F8 00-FF
FE 00-0F
FF FD
1F1 00-0A 10-2D 30-69 70-8D 8F-90 9B-AC
E01 00-EF
F00 00-FF
F01 00-FF
F02 00-FF
F03 00-FF
F04 00-FF
F05 00-FF
F06 00-FF
F07 00-FF
F08 00-FF
F09 00-FF
F0A 00-FF
F0B 00-FF
F0C 00-FF
F0D 00-FF
F0E 00-FF
F0F 00-FF
F10 00-FF
F11 00-FF
F12 00-FF
F13 00-FF
F14 00-FF
F15 00-FF
F16 00-FF
F17 00-FF
F18 00-FF
F19 00-FF
F1A 00-FF
F1B 00-FF
F1C 00-FF
F1D 00-FF
F1E 00-FF
F1F 00-FF
F20 00-FF
F21 00-FF
F22 00-FF
F23 00-FF
F24 00-FF
F25 00-FF
F26 00-FF
F27 00-FF
F28 00-FF
F29 00-FF
F2A 00-FF
F2B 00-FF
F2C 00-FF
F2D 00-FF
F2E 00-FF
F2F 00-FF
F30 00-FF
F31 00-FF
F32 00-FF
F33 00-FF
F34 00-FF
F35 00-FF
F36 00-FF
F37 00-FF
F38 00-FF
F39 00-FF
F3A 00-FF
F3B 00-FF
F3C 00-FF
F3D 00-FF
F3E 00-FF
F3F 00-FF
F40 00-FF
F41 00-FF
F42 00-FF
F43 00-FF
F44 00-FF
F45 00-FF
F46 00-FF
F47 00-FF
F48 00-FF
F49 00-FF
F4A 00-FF
F4B 00-FF
F4C 00-FF
F4D 00-FF
F4E 00-FF
F4F 00-FF
F50 00-FF
F51 00-FF
F52 00-FF
F53 00-FF
F54 00-FF
F55 00-FF
F56 00-FF
F57 00-FF
F58 00-FF
F59 00-FF
F5A 00-FF
F5B 00-FF
F5C 00-FF
F5D 00-FF
F5E 00-FF
F5F 00-FF
F60 00-FF
F61 00-FF
F62 00-FF
F63 00-FF
F64 00-FF
F65 00-FF
F66 00-FF
F67 00-FF
F68 00-FF
F69 00-FF
F6A 00-FF
F6B 00-FF
F6C 00-FF
F6D 00-FF
F6E 00-FF
F6F 00-FF
F70 00-FF
F71 00-FF
F72 00-FF
F73 00-FF
F74 00-FF
F75 00-FF
F76 00-FF
F77 00-FF
F78 00-FF
F79 00-FF
F7A 00-FF
F7B 00-FF
F7C 00-FF
F7D 00-FF
F7E 00-FF
F7F 00-FF
F80 00-FF
F81 00-FF
F82 00-FF
F83 00-FF
F84 00-FF
F85 00-FF
F86 00-FF
F87 00-FF
F88 00-FF
F89 00-FF
F8A 00-FF
F8B 00-FF
F8C 00-FF
F8D 00-FF
F8E 00-FF
F8F 00-FF
F90 00-FF
F91 00-FF
F92 00-FF
F93 00-FF
F94 00-FF
F95 00-FF
F96 00-FF
F97 00-FF
F98 00-FF
F99 00-FF
F9A 00-FF
F9B 00-FF
F9C 00-FF
F9D 00-FF
F9E 00-FF
F9F 00-FF
FA0 00-FF
FA1 00-FF
FA2 00-FF
FA3 00-FF
FA4 00-FF
FA5 00-FF
FA6 00-FF
FA7 00-FF
FA8 00-FF
FA9 00-FF
FAA 00-FF
FAB 00-FF
FAC 00-FF
FAD 00-FF
FAE 00-FF
FAF 00-FF
FB0 00-FF
FB1 00-FF
FB2 00-FF
FB3 00-FF
FB4 00-FF
FB5 00-FF
FB6 00-FF
FB7 00-FF
FB8 00-FF
FB9 00-FF
FBA 00-FF
FBB 00-FF
FBC 00-FF
FBD 00-FF
FBE 00-FF
FBF 00-FF
FC0 00-FF
FC1 00-FF
FC2 00-FF
FC3 00-FF
FC4 00-FF
FC5 00-FF
FC6 00-FF
FC7 00-FF
FC8 00-FF
FC9 00-FF
FCA 00-FF
FCB 00-FF
FCC 00-FF
FCD 00-FF
FCE 00-FF
FCF 00-FF
FD0 00-FF
FD1 00-FF
FD2 00-FF
FD3 00-FF
FD4 00-FF
FD5 00-FF
FD6 00-FF
FD7 00-FF
FD8 00-FF
FD9 00-FF
FDA 00-FF
FDB 00-FF
FDC 00-FF
FDD 00-FF
FDE 00-FF
FDF 00-FF
FE0 00-FF
FE1 00-FF
FE2 00-FF
FE3 00-FF
FE4 00-FF
FE5 00-FF
FE6 00-FF
FE7 00-FF
FE8 00-FF
FE9 00-FF
FEA 00-FF
FEB 00-FF
FEC 00-FF
FED 00-FF
FEE 00-FF
FEF 00-FF
FF0 00-FF
FF1 00-FF
FF2 00-FF
FF3 00-FF
FF4 00-FF
FF5 00-FF
FF6 00-FF
FF7 00-FF
FF8 00-FF
FF9 00-FF
FFA 00-FF
FFB 00-FF
FFC 00-FF
FFD 00-FF
FFE 00-FF
FFF 00-FD
1000 00-FF
1001 00-FF
1002 00-FF
1003 00-FF
1004 00-FF
1005 00-FF
1006 00-FF
1007 00-FF
1008 00-FF
1009 00-FF
100A 00-FF
100B 00-FF
100C 00-FF
100D 00-FF
100E 00-FF
100F 00-FF
1010 00-FF
1011 00-FF
1012 00-FF
1013 00-FF
1014 00-FF
1015 00-FF
1016 00-FF
1017 00-FF
1018 00-FF
1019 00-FF
101A 00-FF
101B 00-FF
101C 00-FF
101D 00-FF
101E 00-FF
101F 00-FF
1020 00-FF
1021 00-FF
1022 00-FF
1023 00-FF
1024 00-FF
1025 00-FF
1026 00-FF
1027 00-FF
1028 00-FF
1029 00-FF
102A 00-FF
102B 00-FF
102C 00-FF
102D 00-FF
102E 00-FF
102F 00-FF
1030 00-FF
1031 00-FF
1032 00-FF
1033 00-FF
1034 00-FF
1035 00-FF
1036 00-FF
1037 00-FF
1038 00-FF
1039 00-FF
103A 00-FF
103B 00-FF
103C 00-FF
103D 00-FF
103E 00-FF
103F 00-FF
1040 00-FF
1041 00-FF
1042 00-FF
1043 00-FF
1044 00-FF
1045 00-FF
1046 00-FF
1047 00-FF
1048 00-FF
1049 00-FF
104A 00-FF
104B 00-FF
104C 00-FF
104D 00-FF
104E 00-FF
104F 00-FF
1050 00-FF
1051 00-FF
1052 00-FF
1053 00-FF
1054 00-FF
1055 00-FF
1056 00-FF
1057 00-FF
1058 00-FF
1059 00-FF
105A 00-FF
105B 00-FF
105C 00-FF
105D 00-FF
105E 00-FF
105F 00-FF
1060 00-FF
1061 00-FF
1062 00-FF
1063 00-FF
1064 00-FF
1065 00-FF
1066 00-FF
1067 00-FF
1068 00-FF
1069 00-FF
106A 00-FF
106B 00-FF
106C 00-FF
106D 00-FF
106E 00-FF
106F 00-FF
1070 00-FF
1071 00-FF
1072 00-FF
1073 00-FF
1074 00-FF
1075 00-FF
1076 00-FF
1077 00-FF
1078 00-FF
1079 00-FF
107A 00-FF
107B 00-FF
107C 00-FF
107D 00-FF
107E 00-FF
107F 00-FF
1080 00-FF
1081 00-FF
1082 00-FF
1083 00-FF
1084 00-FF
1085 00-FF
1086 00-FF
1087 00-FF
1088 00-FF
1089 00-FF
108A 00-FF
108B 00-FF
108C 00-FF
108D 00-FF
108E 00-FF
108F 00-FF
1090 00-FF
1091 00-FF
1092 00-FF
1093 00-FF
1094 00-FF
1095 00-FF
1096 00-FF
1097 00-FF
1098 00-FF
1099 00-FF
109A 00-FF
109B 00-FF
109C 00-FF
109D 00-FF
109E 00-FF
109F 00-FF
10A0 00-FF
10A1 00-FF
10A2 00-FF
10A3 00-FF
10A4 00-FF
10A5 00-FF
10A6 00-FF
10A7 00-FF
10A8 00-FF
10A9 00-FF
10AA 00-FF
10AB 00-FF
10AC 00-FF
10AD 00-FF
10AE 00-FF
10AF 00-FF
10B0 00-FF
10B1 00-FF
10B2 00-FF
10B3 00-FF
10B4 00-FF
10B5 00-FF
10B6 00-FF
10B7 00-FF
10B8 00-FF
10B9 00-FF
10BA 00-FF
10BB 00-FF
10BC 00-FF
10BD 00-FF
10BE 00-FF
10BF 00-FF
10C0 00-FF
10C1 00-FF
10C2 00-FF
10C3 00-FF
10C4 00-FF
10C5 00-FF
10C6 00-FF
10C7 00-FF
10C8 00-FF
10C9 00-FF
10CA 00-FF
10CB 00-FF
10CC 00-FF
10CD 00-FF
10CE 00-FF
10CF 00-FF
10D0 00-FF
10D1 00-FF
10D2 00-FF
10D3 00-FF
10D4 00-FF
10D5 00-FF
10D6 00-FF
10D7 00-FF
10D8 00-FF
10D9 00-FF
10DA 00-FF
10DB 00-FF
10DC 00-FF
10DD 00-FF
10DE 00-FF
10DF 00-FF
10E0 00-FF
10E1 00-FF
10E2 00-FF
10E3 00-FF
10E4 00-FF
10E5 00-FF
10E6 00-FF
10E7 00-FF
10E8 00-FF
10E9 00-FF
10EA 00-FF
10EB 00-FF
10EC 00-FF
10ED 00-FF
10EE 00-FF
10EF 00-FF
10F0 00-FF
10F1 00-FF
10F2 00-FF
10F3 00-FF
10F4 00-FF
10F5 00-FF
10F6 00-FF
10F7 00-FF
10F8 00-FF
10F9 00-FF
10FA 00-FF
10FB 00-FF
10FC 00-FF
10FD 00-FF
10FE 00-FF
10FF 00-FD

54
newlib/libc/string/mkunidata Executable file
View File

@ -0,0 +1,54 @@
#! /bin/sh
# Generate the Unicode width data tables (combining.t, ambiguous.t, wide.t)
# used by newlib/libc/string/wcwidth.c.
#
# Usage: mkunidata [-u]
#   -u   download the uniset tool and the Unicode data files;
#        without -u the Cygwin package unicode-ucd must be installed and
#        its data files are symlinked into this directory.
#
# Exit status 9 signals a failed prerequisite check.

echo generating Unicode width data for newlib/libc/string/wcwidth.c

# Work in the directory of this script; quoted so that paths containing
# blanks work, and checked so that nothing is generated in the wrong place.
cd "$(dirname "$0")" || exit 9
PATH="$PATH":. # ensure access to uniset tool

#############################################################################
# checks and (with option -u) downloads

# fetch URL
# Download URL into the current directory under its remote file name,
# keeping the server time stamp (-R).  If a local copy already exists it is
# only replaced when the remote file is newer (-z FILE).
# (wget alternative: wget -N -t 1 --timeout=55 URL)
fetch () {
    file=`basename "$1"`
    if test -r "$file"
    then curl -R -O --connect-timeout 55 -z "$file" "$1"
    else curl -R -O --connect-timeout 55 "$1"
    fi
}

case "$1" in
-u)
    echo downloading uniset tool
    fetch http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz
    gzip -dc uniset.tar.gz | tar xvf - uniset
    echo downloading data from unicode.org
    for data in UnicodeData.txt Blocks.txt EastAsianWidth.txt
    do fetch http://unicode.org/Public/UNIDATA/$data
    done
    ;;
*)  echo checking package unicode-ucd
    grep unicode-ucd /etc/setup/installed.db || exit 9
    ;;
esac

echo checking uniset tool
type uniset || exit 9

# Every data file must be readable here, either downloaded above or
# symlinked from the installed unicode-ucd package.
for data in UnicodeData.txt Blocks.txt EastAsianWidth.txt
do test -r $data || ln -s /usr/share/unicode/ucd/$data . || exit 9
done

echo generating from Unicode version `sed -e 's,[^.0-9],,g' -e 1q Blocks.txt`

#############################################################################
# table generation

echo generating combining characters table
uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B +D7B0-D7C6 +D7CB-D7FB c > combining.t

echo generating ambiguous width characters table
# ambiguous.t is only rewritten if WIDTH-A could be regenerated
sh ./mkwidthA && uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c > ambiguous.t

echo generating wide characters table
sh ./mkwide

#############################################################################
# end

49
newlib/libc/string/mkwide Executable file
View File

@ -0,0 +1,49 @@
#! /bin/sh
# generate list of wide characters, with convex closure
#
# Reads EastAsianWidth.txt, UnicodeData.txt and Blocks.txt from the current
# directory (symlinking them from /usr/share/unicode/ucd if missing) and
# writes wide.t: a C interval array of all Wide/Fullwidth characters,
# extended by every block (plus $extrablocks) that contains at least one
# wide character and no Neutral/Narrow/Ambiguous/Halfwidth character.

for data in EastAsianWidth.txt UnicodeData.txt Blocks.txt
do
    if [ ! -r $data ]
    then ln -s /usr/share/unicode/ucd/$data . || exit 1
    fi
done

# eaw_select PROPERTY
# Print the code points / ranges of EastAsianWidth.txt whose width property
# is exactly PROPERTY (a literal or a sed bracket expression).
# The comment and all blanks are removed first so that the property is
# matched as a whole word: stripping only its first letter would turn
# "27E6..27ED;Na" into "27E6..27EDa", which uniset reads as the range
# U+27E6..U+27EDA.  Removing blanks also tolerates data files that put
# spaces around the ';'.
eaw_select () {
    sed -n -e 's,[[:space:]]*#.*,,' -e 's,[[:space:]],,g' \
        -e "s,^\([^;]*\);$1\$,\1,p" EastAsianWidth.txt
}

{ eaw_select 'Na'; eaw_select '[NAH]'; } > wide.na
eaw_select '[WF]' > wide.fw

PATH="$PATH:." # for uniset

# count UNISET-ARGS...
# Print the number of characters in the set described by the arguments.
count () {
    uniset "$@" nr | sed -e 's,.*:[[:space:]]*,,'
}

nrfw=`count +wide.fw`
echo FW $nrfw
nrna=`count +wide.na`
echo NAH $nrna

extrablocks="2E80-303E"

# check all blocks

# includes RANGE fw|na TOTAL
# Succeed if RANGE contains at least one character of wide.fw / wide.na,
# i.e. if removing RANGE from that set changes its size from TOTAL.
# A failing uniset (empty count) is treated as "does not include".
includes () {
    nr=`count +wide.$2 -$1`
    test -n "$nr" && test "$nr" != "$3"
}

echo "adding compact closure of wide ranges, this may take ~10min"
for b in $extrablocks `sed -e 's,^\([0-9A-F]*\)\.\.\([0-9A-F]*\).*,\1-\2,' -e t -e d Blocks.txt`
do range=$b
    echo checking $range $* >&2
    # keep blocks that hold wide characters but no non-wide ones
    if includes $range fw $nrfw && ! includes $range na $nrna
    then echo $range
    fi
done > wide.blocks

# wide.t: version header lines of both data files as C++ comments,
# followed by the interval array of the closed set
(
    sed -e "s,^,//," -e 1q EastAsianWidth.txt
    sed -e "s,^,//," -e 1q Blocks.txt
    uniset `sed -e 's,^,+,' wide.blocks` +wide.fw c
) > wide.t

rm -f wide.na wide.fw wide.blocks

20
newlib/libc/string/mkwidthA Executable file
View File

@ -0,0 +1,20 @@
#! /bin/sh
# generate WIDTH-A file, listing Unicode characters with width property
# Ambiguous, from EastAsianWidth.txt
#
# The data files are symlinked from /usr/share/unicode/ucd if they are not
# readable in the current directory.  The exit status is that of the uniset
# run, so that callers ("sh ./mkwidthA && ...") can detect a failed
# generation; previously the trailing "rm -f" always reported success.

for data in EastAsianWidth.txt UnicodeData.txt Blocks.txt
do
    if [ ! -r $data ]
    then ln -s /usr/share/unicode/ucd/$data . || exit 1
    fi
done

# keep only the code points / ranges of lines with property A
sed -e "s,^\([^;]*\);A,\1," -e t -e d EastAsianWidth.txt > width-a-new

rm -f WIDTH-A
echo "# UAX #11: East Asian Ambiguous" > WIDTH-A
PATH="$PATH:." uniset +width-a-new compact >> WIDTH-A
status=$?

rm -f width-a-new
exit $status

696
newlib/libc/string/uniset Executable file
View File

@ -0,0 +1,696 @@
#!/usr/bin/perl
# Uniset -- Unicode subset manager -- Markus Kuhn
# http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz
require 5.008;
use open ':utf8';
use FindBin qw($RealBin); # to find directory where this file is located
binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");
my (%name, %invname, %category, %comment);
# Called without any argument: show the usage summary and leave with a
# non-zero exit status.
unless (@ARGV) {
    print <<End;
Uniset -- Unicode subset manager -- Markus Kuhn
Uniset merges and subtracts Unicode subsets. It can output and
analyse the resulting character set in various formats.
Uniset understand the following command-line arguments:
Commands to define a set of characters:
+ filename add the character set described in the file to the set
- filename remove the character set described in the file from the set
+: filename add the characters in the UTF-8 file to the set
-: filename remove the characters in the UTF-8 file from the set
+xxxx..yyyy add the range to the set (xxxx and yyyy are hex numbers)
-xxxx..yyyy remove the range from the set (xxxx and yyyy are hex numbers)
+cat=Xx add all Unicode characters with category code Xx
-cat=Xx remove all Unicode characters with category code Xx
-cat!=Xx remove all Unicode characters without category code Xx
clean remove any elements that do not appear in the Unicode database
unknown remove any elements that do appear in the Unicode database
Command to output descriptions of the constructed set of characters:
table write a full table with one line per character
compact output the set in compact MES format
c output the set as C interval array
nr output the number of characters
sources output a table that shows the number of characters contributed
by the various combinations of input sets added with +.
utf8-list output a list of all characters encoded in UTF-8
Commands to tailor the following output commands:
html write HTML tables instead of plain text
ucs add the unicode character itself to the table (UTF-8 in
plain table, numeric character reference in HTML)
Formats of character set input files read by the + and - command:
Empty lines, white space at the start and end of the line and any
comment text following a \# are ignored. The following formats are
recognized
xx yyyy xx is the hex code in an 8-bit character set and yyyy
is the corresponding Unicode value. Both can optionally
be prefixed by 0x. This is the format used in the
files on <ftp://ftp.unicode.org/Public/MAPPINGS/>.
yyyy yyyy (optionally prefixed with 0x) is a Unicode character
belonging to the specified subset.
yyyy-yyyy a range of Unicode characters belonging to
yyyy..yyyy the specified subset.
xx yy yy yy-yy yy xx denotes a row (high-byte) and the yy specify
corresponding low bytes or with a hyphen also ranges of
low bytes in the Unicode values that belong to this
subset. This is also the format that is generated by
the compact command.
End
    exit 1;
}
# Tell whether the ISO 10646/Unicode code point passed as the single
# argument is treated as East Asian Wide (W) or East Asian FullWidth (F)
# in the sense of Unicode Technical Report #11.  Returns a true value for
# wide characters and a false value otherwise; the ranges are hard-coded.
sub iswide ($) {
    my ($code) = @_;

    # nothing below Hangul Jamo is wide; U+303F is the one hole in CJK .. Yi
    return !!0 unless $code >= 0x1100;
    return !!0 if $code == 0x303f;

    # the two isolated angle brackets
    return 1 if $code == 0x2329 || $code == 0x232a;

    for my $span ([0x1100,  0x115f],    # Hangul Jamo
                  [0x2e80,  0xa4cf],    # CJK .. Yi
                  [0xac00,  0xd7a3],    # Hangul Syllables
                  [0xf900,  0xfaff],    # CJK Comp. Ideographs
                  [0xfe30,  0xfe6f],    # CJK Comp. Forms
                  [0xff00,  0xff60],    # Fullwidth Forms
                  [0xffe0,  0xffe6],
                  [0x20000, 0x2fffd],
                  [0x30000, 0x3fffd]) {
        my ($lo, $hi) = @$span;
        return 1 if $code >= $lo && $code <= $hi;
    }
    return !!0;
}
# Jamo short names, see Unicode 3.0, table 4-4, page 86
# Leading consonants; the empty entry is the silent initial (U+110B).
my @lname = ('G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '',
             'J', 'JJ', 'C', 'K', 'T', 'P', 'H'); # 1100..1112
# Vowels.
my @vname = ('A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O',
             'WA', 'WAE', 'OE', 'YO', 'U', 'WEO', 'WE', 'WI', 'YU',
             'EU', 'YI', 'I'); # 1161..1175
# Trailing consonants; a syllable without one has no entry here.
my @tname = ('G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM',
             'LB', 'LS', 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS',
             'NG', 'J', 'C', 'K', 'T', 'P', 'H'); # 11a8..11c2

# Return the Unicode name that belongs to a given character code.
#
# CJK unified ideographs and Hangul syllables get algorithmically derived
# names; every other code is looked up in %name (filled from
# UnicodeData.txt), so the result is undef for codes not listed there.
sub name {
    my $ucs = shift(@_);

    # The intervals used here reflect Unicode Version 3.2
    if (($ucs >= 0x3400 && $ucs <= 0x4db5) ||
        ($ucs >= 0x4e00 && $ucs <= 0x9fa5) ||
        ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) {
        return "CJK UNIFIED IDEOGRAPH-" . sprintf("%04X", $ucs);
    }

    if ($ucs >= 0xac00 && $ucs <= 0xd7a3) {
        # Decompose the syllable index into leading consonant, vowel and
        # trailing consonant: 19 x 21 x 28 combinations.
        my $s = $ucs - 0xac00;
        my $lead  = int($s / (21 * 28));
        my $vowel = int(($s % (21 * 28)) / 28);
        my $trail = $s % 28;
        # Trailing index 0 means "no trailing consonant".  It must not be
        # turned into array index -1, which would pick the last entry 'H'
        # (and name U+AC00 "GAH" instead of "GA").
        return "HANGUL SYLLABLE " .
               $lname[$lead] .
               $vname[$vowel] .
               ($trail ? $tname[$trail - 1] : '');
    }

    return $name{$ucs};
}
# Tell whether a code point is known to the Unicode database: either it
# lies in one of the ideograph / Hangul syllable ranges below (returns 1),
# or it has an entry in %name (returns the result of exists).
sub is_unicode {
    my ($code) = @_;

    # The intervals used here reflect Unicode Version 3.2
    for my $span ([0x3400,  0x4db5],
                  [0x4e00,  0x9fa5],
                  [0xac00,  0xd7a3],
                  [0x20000, 0x2a6d6]) {
        return 1 if $code >= $span->[0] && $code <= $span->[1];
    }

    return exists $name{$code};
}
# Directories that are tried for data and set files which cannot be opened
# under the name given: the user's and the system-wide uniset directory
# (only if they exist), then the directory of this script unless that
# starts with /usr/bin.
my @search_path = grep { -d $_ } ("$ENV{HOME}/local/share/uniset",
                                  "/usr/share/uniset");
push @search_path, $RealBin if $RealBin !~ m|^/usr/bin|;

# Open file $fn with open mode $mode and return the file handle.
# The name is tried as given first; a name without any '/' is then looked
# up in each directory of @search_path.  Returns undef if nothing opens.
sub search_open {
    my ($mode, $fn) = @_;
    my $fh;

    open($fh, $mode, $fn) and return $fh;

    # names with a directory part are never searched for
    return undef if $fn =~ m|/|;

    foreach my $dir (@search_path) {
        open($fh, $mode, "$dir/$fn") and return $fh;
    }
    return undef;
}
# Output switches; set by the html, img and ucs commands.
my ($html, $image, $adducs) = (0, 0, 0);

# Names of the Unicode data files, located via search_open().
my $unicodedata = "UnicodeData.txt";
my $blockdata = "Blocks.txt";

# read list of all Unicode names
my $data = search_open('<', $unicodedata)
    or die ("Can't open Unicode database '$unicodedata':\n$!\n\n" .
            "Please make sure that you have downloaded the file\n" .
            "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n");
while (<$data>) {
    # every line must consist of exactly 15 semicolon-separated fields
    die("Syntax error in line '$_' in file '$unicodedata'")
        unless /^([0-9,A-F]{4,8});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*)$/;
    # names in angle brackets other than '<control>' are not recorded
    # (in UnicodeData.txt these are placeholders such as range limits)
    next if $2 ne '<control>' && substr($2, 0, 1) eq '<';
    $ucs = hex($1);
    $name{$ucs} = $2;           # field 2: character name
    $invname{$2} = $ucs;        # reverse lookup, name -> code
    $category{$ucs} = $3;       # field 3: general category
    $comment{$ucs} = $12;       # field 12, printed in brackets by "table"
}
close($data);

# read list of all Unicode blocks
$data = search_open('<', $blockdata)
    or die ("Can't open Unicode blockname list '$blockdata':\n$!\n\n" .
            "Please make sure that you have downloaded the file\n" .
            "http://www.unicode.org/Public/UNIDATA/Blocks.txt\n");
my $blocks = 0;
my (@blockstart, @blockend, @blockname);
while (<$data>) {
    if (/^\s*([0-9,A-F]{4,8})\s*\.\.\s*([0-9,A-F]{4,8})\s*;\s*(.*)$/) {
        # "start..end; name": the three arrays grow in step with $blocks
        push @blockstart, hex($1);
        push @blockend, hex($2);
        push @blockname, $3;
        $blocks++;
        next;
    }
    # ignore comments and empty lines
    next if /^\s*\#/ || /^\s*$/;
    die("Syntax error in line '$_' in file '$blockdata'");
}
close($data);

# make sure the block list also covers everything above U+10FFFF
if ($blockend[$blocks-1] < 0x110000) {
    push @blockstart, 0x110000;
    push @blockend, 0x7FFFFFFF;
    push @blockname, "Beyond Plane 16";
    $blocks++;
}
# process command line arguments
# Arguments are consumed left to right; each one is either a set-building
# command that modifies %used (code point => string of "[source]" tags),
# an option that changes later output, or an output command that prints
# the set built so far.  The loop ends at the first argument that is
# false in Perl's sense (exhausted list, but also "" or "0").
while ($_ = shift(@ARGV)) {
if (/^html$/) {
# following output commands write HTML instead of plain text
$html = 1;
} elsif (/^ucs$/) {
# following table output also shows the character itself
$adducs = 1;
} elsif (/^img$/) {
# HTML output with glyph images
$html = 1;
$image = 1;
} elsif (/^template$/) {
# template FILE: copy FILE to the output, expanding lines of the form
#   #include "name"   -> contents of file "name", verbatim
#   #quote "name"     -> contents of file "name" with & and < escaped
# The nested read loops reuse the global $_.
$template = shift(@ARGV);
open(TEMPLATE, $template) || die("Can't open template file '$template': '$!'");
while (<TEMPLATE>) {
if (/^\#\s*include\s+\"([^\"]*)\"\s*$/) {
open(INCLUDE, $1) || die("Can't open template include file '$1': '$!'");
while (<INCLUDE>) {
print $_;
}
close(INCLUDE);
} elsif (/^\#\s*quote\s+\"([^\"]*)\"\s*$/) {
open(INCLUDE, $1) || die("Can't open template include file '$1': '$!'");
while (<INCLUDE>) {
s/&/&amp;/g;
s/</&lt;/g;
print $_;
}
close(INCLUDE);
} else {
print $_;
}
}
close(TEMPLATE);
} elsif (/^\+cat=(.+)$/) {
# add characters with given category
# (the source tag is assigned, not appended: tags recorded earlier for
# these characters are replaced)
$cat = $1;
for $i (keys(%category)) {
$used{$i} = "[${cat}]" if $category{$i} eq $cat;
}
} elsif (/^\-cat=(.+)$/) {
# remove characters with given category
$cat = $1;
for $i (keys(%category)) {
delete $used{$i} if $category{$i} eq $cat;
}
} elsif (/^\-cat!=(.+)$/) {
# remove characters without given category
# (only code points listed in %category are examined)
$cat = $1;
for $i (keys(%category)) {
delete $used{$i} unless $category{$i} eq $cat;
}
} elsif (/^([+-]):(.*)/) {
    # "+:file" / "-:file" (or "+:" "file"): add or remove every character
    # that occurs in the given text file.  Each character of each line,
    # including the line terminator, is processed.
    $remove = $1 eq "-";
    $setfile = $2;
    $setfile = shift(@ARGV) if $setfile eq "";
    push(@SETS, $setfile);
    # Two-argument open is kept deliberately: it is what makes the file
    # name "-" read from STDIN.
    open(SET, $setfile) || die("Can't open set file '$setfile': '$!'");
    $setname = $setfile;
    while (<SET>) {
        # Walk over the characters explicitly.  Testing the truth of the
        # remaining string instead would stop early at a remainder of "0"
        # and lose a digit zero that ends a last line without newline.
        for my $char (split(//, $_)) {
            $i = ord($char);
            if ($remove) {
                delete $used{$i};
            } else {
                $used{$i} .= "[${setname}]";
            }
        }
    }
    close SET;
} elsif (/^([+-])(.*)/) {
# Generic "+..." / "-..." argument: either a code point or range given
# directly on the command line, or the name of a set description file
# (attached to the sign or passed as the next argument).  "+" adds to
# %used, "-" removes from it.
$remove = $1 eq "-";
$setfile = $2;
# a single hex number is treated as a range of one element
$setfile = "$setfile..$setfile" if $setfile =~ /^([0-9A-Fa-f]{4,8})$/;
if ($setfile =~ /^([0-9A-Fa-f]{4,8})(-|\.\.)([0-9A-Fa-f]{4,8})$/) {
# handle interval specification on command line
$first = hex($1);
$last = hex($3);
for ($i = $first; $i <= $last; $i++) {
$used{$i} .= "[ARG]" unless $remove;
delete $used{$i} if $remove;
}
next;
}
$setfile = shift(@ARGV) if $setfile eq "";
push(@SETS, $setfile);
my $setf = search_open('<', $setfile);
die("Can't open set file '$setfile': '$!'") unless $setf;
$cedf = ($setfile =~ /cedf/); # detect Kosta Kosti's trans CEDF format by path name
# the source tag is the file name up to its first dot
$setname = $setfile;
$setname =~ s/([^.\[\]]*)\..*/$1/;
while (<$setf>) {
if (/^<code_set_name>/) {
# handle ISO 15897 (POSIX registry) charset mapping format
# (this inner loop consumes the rest of the file)
undef $comment_char;
undef $escape_char;
while (<$setf>) {
if ($comment_char && /^$comment_char/) {
# remove comments
# (the match is anchored at the line start, so the text before it is
# empty and the whole line is dropped)
$_ = $`;
}
next if (/^\032?\s*$/); # skip empty lines
if (/^<comment_char> (\S)$/) {
$comment_char = $1;
} elsif (/^<escape_char> (\S)$/) {
$escape_char = $1;
} elsif (/^(END )?CHARMAP$/) {
#ignore
} elsif (/^<.*>\s*\/x([0-9A-F]{2})\s*<U([0-9A-F]{4,8})>/) {
# "<name> /xHH <Uxxxx>": the byte value is recorded in the source tag
$used{hex($2)} .= "[${setname}{$1}]" unless $remove;
delete $used{hex($2)} if $remove;
} else {
die("Syntax error in line $. in file '$setfile':\n'$_'\n");
}
}
next;
} elsif (/^STARTFONT /) {
# handle X11 BDF file
# (each ENCODING line gives a decimal code; the rest of the file is
# consumed here)
while (<$setf>) {
if (/^ENCODING\s+([0-9]+)/) {
$used{$1} .= "[${setname}]" unless $remove;
delete $used{$1} if $remove;
}
}
next;
}
tr/a-z/A-Z/; # make input uppercase
if ($cedf) {
# the first four lines of a CEDF file are skipped
if ($. > 4) {
if (/^([0-9A-F]{2})\t.?\t(.*)$/) {
# handle Kosta Kosti's trans CEDF format
# (byte values in the C0 range and in 0x7F..0x9F are ignored; the
# character is identified by its name via %invname)
next if (hex($1) < 32 || (hex($1) > 0x7e && hex($1) < 0xa0));
$ucs = $invname{$2};
die "unknown ISO 10646 name '$2' in '$setfile' line $..\n" if ! $ucs;
$used{$ucs} .= "[${setname}{$1}]" unless $remove;
delete $used{$ucs} if $remove;
} else {
die("Syntax error in line $. in CEDF file '$setfile':\n'$_'\n");
}
}
next;
}
if (/^\s*(0X|U\+|U-)?([0-9A-F]{2})\s+\#\s*UNDEFINED\s*$/) {
# ignore ftp.unicode.org mapping file lines with #UNDEFINED
next;
}
s/^([^\#]*)\#.*$/$1/; # remove comments
next if (/^\032?\s*$/); # skip empty lines
# The remaining formats are tried in this order; the order matters
# because a line can satisfy more than one of the patterns.
if (/^\s*(0X)?([0-9A-F-]{2})\s+(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
# handle entry from a ftp.unicode.org mapping file
$used{hex($4)} .= "[${setname}{$2}]" unless $remove;
delete $used{hex($4)} if $remove;
} elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})(\s*-\s*|\s*\.\.\s*|\s+)(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
# handle interval specification
# (two code points separated by "-", ".." or just white space)
$first = hex($2);
$last = hex($5);
for ($i = $first; $i <= $last; $i++) {
$used{$i} .= "[${setname}]" unless $remove;
delete $used{$i} if $remove;
}
} elsif (/^\s*([0-9A-F]{2,6})(\s+[0-9A-F]{2},?|\s+[0-9A-F]{2}-[0-9A-F]{2},?)+/) {
# handle lines from P10 MES draft
# (a row number followed by two-digit cells or cell ranges; this is
# also the format written by the compact command)
$row = $1;
$cols = $_;
$cols =~ s/^\s*([0-9A-F]{2,6})\s*(.*)\s*$/$2/;
$cols =~ tr/,//d;
@cols = split(/\s+/, $cols);
for (@cols) {
if (/^(..)$/) {
$first = hex("$row$1");
$last = $first;
} elsif (/^(..)-(..)$/) {
$first = hex("$row$1");
$last = hex("$row$2");
} else {
die ("this should never happen '$_'");
}
for ($i = $first; $i <= $last; $i++) {
$used{$i} .= "[${setname}]" unless $remove;
delete $used{$i} if $remove;
}
}
} elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})\s*/) {
# handle single character
# (the pattern is not anchored at the end: text after the code point
# is ignored)
$used{hex($2)} .= "[${setname}]" unless $remove;
delete $used{hex($2)} if $remove;
} else {
die("Syntax error in line $. in file '$setfile':\n'$_'\n") unless /^\s*(\#.*)?$/;
}
}
close $setf;
} elsif (/^loadimages$/ || /^loadbigimages$/) {
# Fetch the glyph image of every character of the set (except those
# named "<control>") by running the external webcopy program in the
# background, one character every 0.2 seconds, then exit the program.
if (/^loadimages$/) {
$prefix = "Small.Glyphs";
} else {
$prefix = "Glyphs";
}
$total = 0;
for $i (keys(%used)) {
next if ($name{$i} eq "<control>");
$total++;
}
$count = 0;
$| = 1;
for $i (sort({$a <=> $b} keys(%used))) {
next if ($name{$i} eq "<control>");
$count++;
# images are stored in a directory named after the first two hex
# digits of the code
$j = sprintf("%04X", $i);
$j =~ /(..)(..)/;
$gif = "http://charts.unicode.org/Unicode.charts/$prefix/$1/U$j.gif";
print("\r$count/$total: $gif");
system("mkdir -p $prefix/$1; cd $prefix/$1; webcopy -u -s $gif &");
select(undef, undef, undef, 0.2);
}
print("\n");
exit 0;
} elsif (/^giftable/) {
# form a table of glyphs (requires pbmtools installed)
# The result is table.gif; temporary files tmp-*.pnm, table.pnm and
# table.pnm~ in the current directory are created and removed.
$count = 0;
for $i (keys(%used)) {
$count++ unless $name{$i} eq "<control>";
}
# default number of columns is derived from the character count; an
# explicit number can be attached to the command ("giftableN")
$width = int(sqrt($count/sqrt(2)) + 0.5);
$width = $1 if /^giftable([0-9]+)$/;
system("rm -f tmp-*.pnm table.pnm~ table.pnm");
$col = 0;
$row = 0;
for $i (sort({$a <=> $b} keys(%used))) {
next if ($name{$i} eq "<control>");
$j = sprintf("%04X", $i);
$j =~ /(..)(..)/;
$gif = "Small.Glyphs/$1/U$j.gif";
$pnm = sprintf("tmp-%02x.pnm", $col);
$fallback = "Small.Glyphs/FF/UFFFD.gif";
# a glyph that cannot be converted is replaced by the U+FFFD image
system("giftopnm $gif >$pnm || { rm $pnm ; giftopnm $fallback >$pnm ; }");
if (++$col == $width) {
# row complete: join its cells and append the row to the table
system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
if ($row == 0) {
system("mv tmp-row.pnm table.pnm");
} else {
system("mv table.pnm table.pnm~; pnmcat -tb table.pnm~ tmp-row.pnm >table.pnm");
}
$row++;
$col = 0;
system("rm -f tmp-*.pnm table.pnm~");
}
}
# append the last, incomplete row
if ($col > 0) {
system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
if ($row == 0) {
system("mv tmp-row.pnm table.pnm");
} else {
system("mv table.pnm table.pnm~; pnmcat -tb -jleft -black table.pnm~ tmp-row.pnm >table.pnm");
}
}
system("rm -f table.gif ; ppmtogif table.pnm > table.gif");
system("rm -f tmp-*.pnm table.pnm~ table.pnm");
} elsif (/^table$/) {
# go through all used names to print full table
# One line (or HTML table row) per character in code order; characters
# named "<control>" are left out.
print "<TABLE border=2>\n" if $html;
for $i (sort({$a <=> $b} keys(%used))) {
next if ($name{$i} eq "<control>");
if ($html) {
# turn the "[a][b{xx}]" source tags into "a, b<SUB>xx</SUB>"
$sources = $used{$i};
$sources =~ s/\]\[/, /g;
$sources =~ s/^\[//g;
$sources =~ s/\]$//g;
$sources =~ s/\{(..)\}/<SUB>$1<\/SUB>/g;
$j = sprintf("%04X", $i);
$j =~ /(..)(..)/;
$gif = "Small.Glyphs/$1/U$j.gif";
print "<TR>";
print "<TD><img width=32 height=32 src=\"$gif\">" if $image;
printf("<TD>&#%d;", $i) if $adducs;
print "<TD><SAMP>$j</SAMP><TD><SAMP>" . name($i);
print " ($comment{$i})" if $comment{$i};
print "</SAMP><TD><SMALL>$sources</SMALL>\n";
} else {
# plain text: "XXXX # [character] NAME"
printf("%04X \# ", $i);
print pack("U", $i) . " " if $adducs;
print name($i) ."\n";
}
}
print "</TABLE>\n" if $html;
} elsif (/^imgblock([1-9][0-9]*)?$/) {
    # imgblock / imgblockN: print the set as an HTML table of glyph
    # images, N per row (default 16), each image row followed by a row
    # with the hexadecimal codes.
    # The width used to be taken from a match against "giftable", which
    # can never succeed for this command; the optional number is now
    # captured by the command pattern itself.
    $width = 16;
    $width = $1 if defined $1;
    $col = 0;
    $subline = "";
    print "\n<P><TABLE cellspacing=0 cellpadding=0>";
    for $i (sort({$a <=> $b} keys(%used))) {
        print "<TR>" if $col == 0;
        $j = sprintf("%04X", $i);
        $j =~ /(..)(..)/;
        $gif = "Small.Glyphs/$1/U$j.gif";
        $alt = name($i);
        print "<TD><img width=32 height=32 src=\"$gif\" alt=\"$alt\">";
        # collect the code cells for the row below the images
        $subline .= "<TD><SMALL><SAMP>$j</SAMP></SMALL>";
        if (++$col == $width) {
            print "<TR align=center>$subline";
            $col = 0;
            $subline = "";
        }
    }
    # codes of the last, incomplete row
    print "<TR align=center>$subline" if ($col > 0);
    print "</TABLE>\n";
} elsif (/^sources$/) {
# count how many characters are attributed to the various source set combinations
# NOTE(review): the counts are only printed when $html is set; without
# the html option this command produces no output.  %contribs is not
# reset, so a second "sources" command adds to the earlier counts.
print "<P>Number of occurences of source character set combinations:\n<TABLE border=2>" if $html;
for $i (keys(%used)) {
next if ($name{$i} eq "<control>");
# "[a][b{xx}]" -> "a, b"
$sources = $used{$i};
$sources =~ s/\]\[/, /g;
$sources =~ s/^\[//g;
$sources =~ s/\]$//g;
$sources =~ s/\{(..)\}//g;
$contribs{$sources} += 1;
}
for $j (keys(%contribs)) {
print "<TR><TD>$contribs{$j}<TD>$j\n" if $html;
}
print "</TABLE>\n" if $html;
} elsif (/^compact$/) {
# print compact table in P10 MES format
# One output line per row (code point >> 8, at least two hex digits)
# listing the low bytes; consecutive low bytes are merged into
# "first-last".  Plain text lines are continued on a new line with the
# same row number once more than 60 characters have been written.
print "<P>Compact representation of this character set:\n<TABLE border=2>" if $html;
print "<TR><TD><B>Rows</B><TD><B>Positions (Cells)</B>" if $html;
print "\n# Plane 00\n# Rows\tPositions (Cells)\n" unless $html;
$current_row = '';
$start_col = '';
$last_col = '';
for $i (sort({$a <=> $b} keys(%used))) {
next if ($name{$i} eq "<control>");
$row = sprintf("%02X", $i >> 8);
$col = sprintf("%02X", $i & 0xff);
if ($row ne $current_row) {
# new row: finish a range still open in the previous row
if (($last_col ne '') and ($last_col ne $start_col)) {
print "-$last_col";
print "</SAMP>" if $html;
}
print "<TR><TD><SAMP>$row</SAMP><TD><SAMP>" if $html;
print "\n $row\t" unless $html;
$len = 0;
$current_row = $row;
$start_col = '';
}
if ($start_col eq '') {
# first cell of a row
print "$col";
$len += 2;
$start_col = $col;
$last_col = $col;
} elsif (hex($col) == hex($last_col) + 1) {
# cell extends the current range
$last_col = $col;
} else {
# gap: close the current range and start a new one
if ($last_col ne $start_col) {
print "-$last_col";
$len += 3;
}
if ($len > 60 && !$html) {
print "\n $row\t";
$len = 0;
};
print " " if $len;
print "$col";
$len += 2 + !! $len;
$start_col = $col;
$last_col = $col;
}
}
# finish a range still open at the very end
if (($last_col ne '') and ($last_col ne $start_col)) {
print "-$last_col";
print "</SAMP>" if $html;
}
print "\n" if ($current_row ne '');
print "</TABLE>\n" if $html;
print "\n";
} elsif (/^c$/) {
# print table as C interval array
# Output is "{", then "{ 0xFIRST, 0xLAST }" pairs for each run of
# consecutive code points, three pairs per line, then "};".
print "{";
$last_i = '';
$columns = 3;
$col = $columns;
for $i (sort({$a <=> $b} keys(%used))) {
next if ($name{$i} eq "<control>");
if ($last_i eq '') {
# very first interval
if (++$col > $columns) { $col = 1; print "\n "; }
printf(" { 0x%04X, ", $i);
$last_i = $i;
} elsif ($i == $last_i + 1) {
# extends the current interval
$last_i = $i;
} else {
# gap: close the current interval and open the next one
printf("0x%04X },", $last_i);
if (++$col > $columns) { $col = 1; print "\n "; }
printf(" { 0x%04X, ", $i);
$last_i = $i;
}
}
if ($last_i ne '') {
printf("0x%04X }", $last_i);
}
print "\n};\n";
} elsif (/^utf8-list$/) {
# Print all characters of the set themselves, grouped under block
# headings taken from Blocks.txt, wrapping lines after 64 columns
# (wide characters count as two).
$col = 0;
$block = 0;
$last = -1;
for $i (sort({$a <=> $b} keys(%used))) {
next if ($name{$i} eq "<control>");
# advance to the block that contains $i, or to the first block
# beyond it if $i lies in a gap between blocks
while ($blockend[$block] < $i && $block < $blocks - 1) {
$block++;
}
# $i lies in a gap before the current block
# (for $block == 0 the index $block-1 refers to the last block)
if ($last <= $blockend[$block-1] &&
$i < $blockstart[$block]) {
print "\n" if ($col);
printf "\nFree block (U+%04X-U+%04X):\n\n",
$blockend[$block-1] + 1, $blockstart[$block] - 1;
$col = 0;
}
# first character of the set inside the current block
if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
print "\n" if ($col);
printf "\n$blockname[$block] (U+%04X-U+%04X):\n\n",
$blockstart[$block], $blockend[$block];
$col = 0;
}
if ($category{$i} eq 'Mn') {
# prefix non-spacing character with U+25CC DOTTED CIRCLE
print "\x{25CC}";
} elsif ($category{$i} eq 'Me') {
# prefix enclosing non-spacing character with space
print " ";
}
print pack("U", $i);
$col += 1 + iswide($i);
if ($col >= 64) {
print "\n";
$col = 0;
}
$last = $i;
}
print "\n" if ($col);
} elsif (/^collections$/) {
# List every block that contains at least one character of the set:
# block name padded to 40 columns, followed by the block's range.
$block = 0;
$last = -1;
for $i (sort({$a <=> $b} keys(%used))) {
next if ($name{$i} eq "<control>");
while ($blockend[$block] < $i && $block < $blocks - 1) {
$block++;
}
if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
print $blockname[$block],
" " x (40 - length($blockname[$block]));
printf "%04X-%04X\n",
$blockstart[$block], $blockend[$block];
}
$last = $i;
}
} elsif (/^nr$/) {
# Print the size of the set, not counting characters named
# "<control>"; in plain text mode the line starts with "# ".
print "<P>" if $html;
print "# " unless $html;
print "Number of characters in above table: ";
$count = 0;
for $i (keys(%used)) {
$count++ unless $name{$i} eq "<control>";
}
print $count;
print "\n";
} elsif (/^clean$/) {
# remove characters from set that are not in $unicodedata
for $i (keys(%used)) {
delete $used{$i} unless is_unicode($i);
}
} elsif (/^unknown$/) {
# remove characters from set that are in $unicodedata
for $i (keys(%used)) {
delete $used{$i} if is_unicode($i);
}
} else {
die("Unknown command line command '$_'");
};
}