libc/newlib/libc/ctype/mkcategories

72 lines
1.9 KiB
Bash
Executable File

#! /bin/sh
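# generate table of Unicode character category ranges;
# note: each table entry covers a run of consecutive code points that share
# a category; a gap in the code points ends the run, unless the gap lies
# between the <..., First> and <..., Last> entries of a range, e.g.
#0A0A;GURMUKHI LETTER UU;Lo
#0A0B..0A0E -> not listed, so the Lo run ends at 0A0A
#0A0F;GURMUKHI LETTER EE;Lo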
# Pick the Unicode data file to read: a copy in the current directory wins
# over the system-wide one; without either there is nothing to generate.
UnicodeData=
for candidate in UnicodeData.txt /usr/share/unicode/ucd/UnicodeData.txt
do
  if [ -r "$candidate" ]
  then
    UnicodeData=$candidate
    break
  fi
done
if [ -z "$UnicodeData" ]
then
  echo UnicodeData.txt not found >&2
  exit 1
fi

# Category folding switches; each value is executed as a command
# (true/false) by the pipeline below.
# the code assumes foldall=false, foldcase=true
foldcase=true
foldall=false
(
cat <<\/EOS
# State shared by item and range: $first and $last delimit the run being
# collected and $val is its category; an empty $first means no run is open.
first=

# item CODE KIND CATEGORY
#   CODE      code point as bare hex digits
#   KIND      isRangeLast or isNormalOrRangeFirst
#   CATEGORY  category name, recorded when a run is opened
# Adds CODE to the open run, first flushing that run when CODE neither
# closes a First/Last pair nor directly follows the previous code point.
item () {
  if [ -n "$first" ] && [ "$2" != "isRangeLast" ] &&
     [ $(( 0x$1 )) -ne $(( 0x${last-0} + 1 )) ]
  then
    range
  fi
  # Nothing collected yet, or the run was flushed just above: open a new one.
  if [ -z "$first" ]
  then
    val=$3
    first=$1
  fi
  last=$1
}

# range
# Prints the open run as one initializer of the form
# {category, first code point, number of further code points} and then
# marks the run as closed.
range () {
  # Alternative output layouts (disabled):
  # echo " {0x$first, 0x$last, CAT_$val},"
  # echo " {0x$first, $((0x$last - 0x$first)), CAT_$val},"
  # echo " {0x$first | (CAT_$val << 24), $((0x$last - 0x$first))},"
  span=$(( 0x$last - 0x$first ))
  echo " {CAT_$val, 0x$first, $span},"
  first=
}
/EOS
# Turn every record of $UnicodeData into a call of the helper functions
# defined in the here-document above: one "item CODE KIND CATEGORY" line per
# record and one "range" line after each run of adjacent records that share
# a category.  The first stage optionally rewrites the category field,
# depending on the $foldall / $foldcase switches.
cat "$UnicodeData" |
if $foldall
then sed -e "s,;L[lu];,;LC;," -e "s,;C[fs];,;Cfs;," \
-e "s,;L[mo];,;Lmo;," -e "s,;Nl;,;Lmo;," \
-e "s,;P.;,;P;," -e "s,;No;,;P;," \
-e "s,;S.;,;S;," -e "s,;Z[lp];,;Zlp;," \
-e "s,;C[no];,;X;," -e "s,;M[cen];,;M;,"
# (foldall: coarse merge -- Lu/Ll -> LC, Cf/Cs -> Cfs, Lm/Lo/Nl -> Lmo,
#  P? and No -> P, S? -> S, Zl/Zp -> Zlp, Cn/Co -> X, Mc/Me/Mn -> M)
elif $foldcase
then
# fold Lu/Ll to LC only if lower/upper conversion is available:
# the address of the first expression requires a non-empty 14th
# ';'-separated field before Lu becomes LC, the second requires a non-empty
# 13th field before Ll becomes LC; records containing ;Co; are dropped
sed -e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;][^;]*\);.*/ s/;Lu;/;LC;/' \
-e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;][^;]*\);\([^;]*\);.*/ s/;Ll;/;LC;/' \
-e '/;Co;/ d'
else cat
fi |
# Reduce each record to "CODE KIND CATEGORY": KIND is isRangeLast when the
# name field has the form <..., Last>, otherwise isNormalOrRangeFirst (the
# second expression only matches lines the first one left unchanged).
sed -e "s,^\([^;]*\);<[^;]*\, Last>;\([^;]*\);.*,\1 isRangeLast \2," \
-e "s,^\([^;]*\);[^;]*;\([^;]*\);.*,\1 isNormalOrRangeFirst \2," |
# Group adjacent lines that agree once the first two fields are skipped
# (i.e. on the category); uniq --group=append puts an empty line after each
# group.  The empty lines become "range" ("t" then skips the last
# expression), every other line gets the "item " prefix.
uniq -f2 --group=append | sed -e "s,^$,range," -e t -e "s,^,item ,"
# The helper definitions followed by the generated calls are run by sh;
# the output of that run is stored in categories.t.
) | sh > categories.t
# Reduce every line of categories.t to its CAT_xxx identifier followed by a
# comma, and store the sorted list of distinct identifiers in categories.cat.
sed -e "s/.*\(CAT_[A-Za-z]*\).*/ \1,/" categories.t |
sort -u > categories.cat