#! /bin/sh

# generate table of Unicode character category ranges;
# note: undefined characters between two characters of the same category
# are associated with the same category, e.g.
#	0A0A;GURMUKHI LETTER UU;Lo
#	0A0B..0A0E -> Lo
#	0A0F;GURMUKHI LETTER EE;Lo
# NOTE(review): the generated item() helper further down closes the current
# range whenever the next code point is not last+1 (unless the entry is a
# "<..., Last>" range end), so this gap merging may no longer take place;
# confirm which behaviour is intended.

# Locate UnicodeData.txt: a readable copy in the current directory takes
# precedence over /usr/share/unicode/ucd; give up if neither is readable.
UnicodeData=
for candidate in UnicodeData.txt /usr/share/unicode/ucd/UnicodeData.txt
do	if [ -r "$candidate" ]
	then	UnicodeData=$candidate
		break
	fi
done
if [ -z "$UnicodeData" ]
then	echo UnicodeData.txt not found >&2
	exit 1
fi

# Category folding switches (each one is run as a command: true or false).
#   foldall  - collapse the general categories into coarse groups
#              (LC, Cfs, Lmo, P, S, Zlp, X, M); takes precedence over foldcase
#   foldcase - rename Lu/Ll to LC, but only for letters that have a
#              lower/upper case mapping; private use (Co) entries are dropped
# With both switches false the categories are passed through unchanged.
# the code assumes foldall=false, foldcase=true
# NOTE(review): "the code" presumably means the consumer of categories.t;
# confirm before changing these values.
foldall=false
foldcase=true

# Write to stdout a shell script that prints the table entries when run:
# first the helper functions item and range, then one "item" call per
# UnicodeData entry and one "range" call after each run of consecutive
# entries that share a category.
gen_category_script () {
	cat <<\/EOS
# start of the range being collected (empty: no range is open)
first=
# item CODE KIND CATEGORY
#   CODE is a hex code point, KIND is isRangeLast or isNormalOrRangeFirst
item () {
	# close the open range if CODE does not directly follow the previous
	# code point; a <..., Last> entry always extends the open range
	if [ -n "$first" ] && [ "$2" != "isRangeLast" ] &&
	   [ $(( 0x$1 )) -ne $(( 0x$last + 1 )) ]
	then	range
	fi

	if [ -z "$first" ]
	then	first=$1
		val=$3
	fi

	last=$1
}
# print the collected range as {category, first code point, last - first}
# and mark the range as closed
range () {
	# alternative entry layouts, kept for reference:
#	echo " {0x$first, 0x$last, CAT_$val},"
#	echo " {0x$first, $((0x$last - 0x$first)), CAT_$val},"
#	echo " {0x$first | (CAT_$val << 24), $((0x$last - 0x$first))},"
	echo " {CAT_$val, 0x$first, $((0x$last - 0x$first))},"
	first=
}
/EOS

	# Pipeline stages:
	# 1. optional category folding (see the switches above)
	# 2. reduce each entry to "CODE KIND CATEGORY"; only "<..., Last>"
	#    entries get KIND isRangeLast
	# 3. uniq -f2 compares the category only; --group=append (GNU uniq)
	#    prints an empty line after each run of equal categories
	# 4. empty line -> "range" call, any other line -> "item" call
	if $foldall
	then	sed -e "s,;L[lu];,;LC;," -e "s,;C[fs];,;Cfs;," \
		    -e "s,;L[mo];,;Lmo;," -e "s,;Nl;,;Lmo;," \
		    -e "s,;P.;,;P;," -e "s,;No;,;P;," \
		    -e "s,;S.;,;S;," -e "s,;Z[lp];,;Zlp;," \
		    -e "s,;C[no];,;X;," -e "s,;M[cen];,;M;,"
	elif $foldcase
	then
		# fold Lu/Ll to LC only if lower/upper conversion is available:
		# Lu needs a non-empty 14th field, Ll a non-empty 13th field
		sed -e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;][^;]*\);.*/ s/;Lu;/;LC;/' \
		    -e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;][^;]*\);\([^;]*\);.*/ s/;Ll;/;LC;/' \
		    -e '/;Co;/ d'
	else	cat
	fi < "$UnicodeData" |
	sed -e "s,^\([^;]*\);<[^;]*\, Last>;\([^;]*\);.*,\1 isRangeLast \2," \
	    -e "s,^\([^;]*\);[^;]*;\([^;]*\);.*,\1 isNormalOrRangeFirst \2," |
	uniq -f2 --group=append |
	sed -e "s,^$,range," -e t -e "s,^,item ,"
}

# Run the generated script into a temporary file and install it only if
# sh succeeded and produced output, so that an unreadable input or an
# unsupported uniq option cannot leave an empty or truncated categories.t.
if gen_category_script | sh > categories.t.tmp && [ -s categories.t.tmp ]
then	mv -f categories.t.tmp categories.t || exit 1
else	rm -f categories.t.tmp
	echo "cannot generate categories.t" >&2
	exit 1
fi

# Derive the list of distinct category names used in the generated table:
# reduce every line of categories.t to its CAT_xxx identifier followed by
# a comma, then sort the result and drop duplicate lines.
cat_name_only='s/.*\(CAT_[A-Za-z]*\).*/ \1,/'
sed -e "$cat_name_only" categories.t | sort -u > categories.cat