#! /bin/sh

# generate table of Unicode character category ranges;
# note: a range only spans consecutive code points (or a
# <..., First>..<..., Last> block); undefined characters between two
# characters of the same category end the range and get no entry, e.g.
#0A0A;GURMUKHI LETTER UU;Lo
#0A0B..0A0E           -> not in table
#0A0F;GURMUKHI LETTER EE;Lo

# locate the Unicode character database: a copy in the current directory
# takes precedence over the system-wide one
UnicodeData=
for candidate in UnicodeData.txt /usr/share/unicode/ucd/UnicodeData.txt
do	if [ -r "$candidate" ]
	then	UnicodeData=$candidate
		break
	fi
done
if [ -z "$UnicodeData" ]
then	echo UnicodeData.txt not found >&2
	exit 1
fi

# category folding switches (each is run as a command: true or false);
# the code assumes foldall=false, foldcase=true
foldall=false
foldcase=true

(
cat <<\/EOS
# first code point (hex) of the range being collected; empty if none is open
first=

# item CODE KIND CATEGORY
# Add the hex code point CODE to the range being collected.
# An open range is flushed with range() beforehand if CODE does not
# directly follow the previously added code point -- unless KIND is
# isRangeLast, in which case CODE extends the open range regardless of
# the gap.
# Sets first and val (only when a new range is started) and last.
item () {
	if [ -n "$first" ] && [ "$2" != "isRangeLast" ] &&
	   [ "$(( 0x$1 ))" -ne "$(( 0x${last-0} + 1 ))" ]
	then	range
	fi

	# no range open (any more): this code point starts a new one
	[ -n "$first" ] || {
		first=$1
		val=$3
	}

	last=$1
}
# range
# Print the table entry for the range collected by item(): category,
# first code point, and the distance from first to last code point.
# Clearing first afterwards marks the range as closed.
# Alternative entry layouts (disabled):
#	echo "    {0x$first, 0x$last, CAT_$val},"
#	echo "    {0x$first, $((0x$last - 0x$first)), CAT_$val},"
#	echo "    {0x$first | (CAT_$val << 24), $((0x$last - 0x$first))},"
range () {
	printf '    {CAT_%s, 0x%s, %s},\n' \
		"$val" "$first" "$(( 0x$last - 0x$first ))"
	first=
}
/EOS

# fold_categories
# Filter UnicodeData.txt records from stdin to stdout, merging general
# categories (3rd field) as selected by the switches $foldall / $foldcase,
# each of which is run as a command (true or false).
fold_categories () {
	if $foldall
	then	# merge categories into a few combined classes
		sed -e "s,;L[lu];,;LC;," -e "s,;C[fs];,;Cfs;," \
		    -e "s,;L[mo];,;Lmo;," -e "s,;Nl;,;Lmo;," \
		    -e "s,;P.;,;P;,"  -e "s,;No;,;P;," \
		    -e "s,;S.;,;S;," -e "s,;Z[lp];,;Zlp;," \
		    -e "s,;C[no];,;X;," -e "s,;M[cen];,;M;,"
	elif $foldcase
	then	# fold Lu/Ll to LC only if lower/upper conversion is available
		# (Lu: 14th field non-empty, Ll: 13th field non-empty);
		# entries of category Co are dropped
		sed -e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;][^;]*\);.*/ s/;Lu;/;LC;/' \
		    -e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;][^;]*\);\([^;]*\);.*/ s/;Ll;/;LC;/' \
		    -e '/;Co;/ d'
	else	cat
	fi
}

# to_commands
# Turn records from stdin into the command list of the generated script:
# one "item CODE KIND CATEGORY" line per record, where KIND is isRangeLast
# for "<..., Last>" entries and isNormalOrRangeFirst otherwise, and a
# "range" line after each run of consecutive records of equal CATEGORY.
# The grouping is done with awk rather than "uniq -f2 --group=append",
# which is a GNU extension not available in other userlands.
to_commands () {
	sed -e "s,^\([^;]*\);<[^;]*\, Last>;\([^;]*\);.*,\1 isRangeLast \2," \
	    -e "s,^\([^;]*\);[^;]*;\([^;]*\);.*,\1 isNormalOrRangeFirst \2," |
	awk '
		NR > 1 && ($3 "") != prev { print "range" }
		{ print "item " $0; prev = $3 "" }
		END { if (NR > 0) print "range" }
	'
}

fold_categories < "$UnicodeData" | to_commands
) | sh > categories.t

# list the distinct CAT_* identifiers used in categories.t, sorted,
# one per line, each indented by two blanks and followed by a comma
sed -e "s/.*\(CAT_[A-Za-z]*\).*/  \1,/" < categories.t |
sort -u > categories.cat