character data generation
This commit is contained in:
parent
41f72ab4d7
commit
a352730004
128
newlib/libc/ctype/mkcaseconv
Executable file
128
newlib/libc/ctype/mkcaseconv
Executable file
@ -0,0 +1,128 @@
|
|||||||
|
#! /bin/sh -f
|
||||||
|
|
||||||
|
# generate a table for Unicode case conversion; entries:
|
||||||
|
# struct caseconv_entry defined in towctrans_l.c
|
||||||
|
|
||||||
|
if [ -r UnicodeData.txt ]
|
||||||
|
then UnicodeData=UnicodeData.txt
|
||||||
|
elif [ -r /usr/share/unicode/ucd/UnicodeData.txt ]
|
||||||
|
then UnicodeData=/usr/share/unicode/ucd/UnicodeData.txt
|
||||||
|
else echo UnicodeData.txt not found >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
LC_ALL=C
|
||||||
|
export LC_ALL
|
||||||
|
|
||||||
|
compact=true
|
||||||
|
|
||||||
|
#0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
|
||||||
|
#0061;LATIN SMALL LETTER A;Ll;0;L;;;;;N;;;0041;;0041
|
||||||
|
#0130;LATIN CAPITAL LETTER I WITH DOT ABOVE;Lu;0;L;0049 0307;;;;N;LATIN CAPITAL LETTER I DOT;;;0069;
|
||||||
|
#01C4;LATIN CAPITAL LETTER DZ WITH CARON;Lu;0;L;<compat> 0044 017D;;;;N;LATIN CAPITAL LETTER D Z HACEK;;;01C6;01C5
|
||||||
|
#01C5;LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON;Lt;0;L;<compat> 0044 017E;;;;N;LATIN LETTER CAPITAL D SMALL Z HACEK;;01C4;01C6;01C5
|
||||||
|
#01C6;LATIN SMALL LETTER DZ WITH CARON;Ll;0;L;<compat> 0064 017E;;;;N;LATIN SMALL LETTER D Z HACEK;;01C4;;01C5
|
||||||
|
|
||||||
|
tr -d '\015' < $UnicodeData |
|
||||||
|
sed \
|
||||||
|
-e 's,^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;][^;]*\);\([^;]*\);\([^;]*\)$,src \1 upper "\2" lower "\3" title "\4",' \
|
||||||
|
-e t \
|
||||||
|
-e 's,^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;][^;]*\);\([^;]*\)$,src \1 upper "\2" lower "\3" title "\4",' \
|
||||||
|
-e t \
|
||||||
|
-e 's,^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;]*\);\([^;][^;]*\)$,src \1 upper "\2" lower "\3" title "\4",' \
|
||||||
|
-e t \
|
||||||
|
-e d |
|
||||||
|
(#src 01C5 upper "01C4" lower "01C6" title "01C5"
|
||||||
|
if $compact
|
||||||
|
then
|
||||||
|
(
|
||||||
|
cat <<\/EOS
|
||||||
|
src () {
|
||||||
|
if [ -n "$3" ]
|
||||||
|
then tohi=$(( 0x0$3 - 0x0$1 ))
|
||||||
|
else tohi=0
|
||||||
|
fi
|
||||||
|
if [ -n "$5" ]
|
||||||
|
then tolo=$(( 0x0$5 - 0x0$1 ))
|
||||||
|
else tolo=0
|
||||||
|
fi
|
||||||
|
case "$tolo.$tohi" in
|
||||||
|
0.0) true;;
|
||||||
|
0.*)
|
||||||
|
case "$1.$tohi" in
|
||||||
|
*[02468ACE].1) echo "'#error' U+$1 ODDSML";;
|
||||||
|
*[02468ACE].-1) echo " 0x$1 TO1 ODDCAP";;
|
||||||
|
*[13579BDF].1) echo "'#error' U+$1 EVENSML";;
|
||||||
|
*[13579BDF].-1) echo " 0x$1 TO1 EVENCAP";;
|
||||||
|
*) echo " 0x$1 TOUP $tohi";;
|
||||||
|
esac;;
|
||||||
|
*.0)
|
||||||
|
case "$1.$tolo" in
|
||||||
|
*[02468ACE].1) echo " 0x$1 TO1 EVENCAP";;
|
||||||
|
*[02468ACE].-1) echo "'#error' U+$1 EVENSML";;
|
||||||
|
*[13579BDF].1) echo " 0x$1 TO1 ODDCAP";;
|
||||||
|
*[13579BDF].-1) echo "'#error' U+$1 ODDSML";;
|
||||||
|
*) echo " 0x$1 TOLO $tolo";;
|
||||||
|
esac;;
|
||||||
|
*) case "$tolo.$tohi" in
|
||||||
|
1.-1) echo " 0x$1 TOBOTH 0";;
|
||||||
|
*) echo "'#error' U+$1";;
|
||||||
|
esac;;
|
||||||
|
esac
|
||||||
|
}
|
||||||
|
/EOS
|
||||||
|
cat
|
||||||
|
) | sh |
|
||||||
|
uniq -f1 --group=append | sed -e "s,^$,range," -e t -e "s,^,item ," |
|
||||||
|
(
|
||||||
|
cat <<\/EOS
|
||||||
|
first=
|
||||||
|
diff=-1
|
||||||
|
max=255
|
||||||
|
range () {
|
||||||
|
# $diff == $(($last - $first))
|
||||||
|
if [ "$diff" -ge 0 ]
|
||||||
|
then # we have items at all
|
||||||
|
echo " {$first, $diff, $v2, $v3},"
|
||||||
|
fi
|
||||||
|
first=
|
||||||
|
diff=-1
|
||||||
|
}
|
||||||
|
item () {
|
||||||
|
if [ "$1" == "#error" ]
|
||||||
|
then echo "$*"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $diff -eq $max ]
|
||||||
|
then range
|
||||||
|
elif [ -n "$first" ]
|
||||||
|
then if [ $(( $1 )) -ne $(( ${last-0} + 1 )) ]
|
||||||
|
then range
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "$first" ]
|
||||||
|
then first=$1
|
||||||
|
v2=$2
|
||||||
|
v3=$3
|
||||||
|
fi
|
||||||
|
|
||||||
|
last=$1
|
||||||
|
diff=$(( $diff + 1 ))
|
||||||
|
}
|
||||||
|
/EOS
|
||||||
|
cat
|
||||||
|
) | sh
|
||||||
|
elif false
|
||||||
|
then
|
||||||
|
sed -e 's/src \([^ ]*\) upper "\([^ ]*\)" lower "\([^ ]*\)" title "\([^ ]*\)"/ {0x\1, 0x\2 - 0x\1, 0x\3 - 0x\1},/' \
|
||||||
|
-e 's/0x - 0x[^ ,}]*/0/g' -e 's/0x}/0}/' \
|
||||||
|
-e 's/\(0x[0-9A-F][0-9A-F]*\) - \(0x[0-9A-F][0-9A-F]*\)/$((`printf %d \1` - `printf %d \2`))/g' \
|
||||||
|
-e 's/^/echo "/' -e 's/$/"/' |
|
||||||
|
sh
|
||||||
|
else
|
||||||
|
sed -e 's/src \([^ ]*\) upper "\([^ ]*\)" lower "\([^ ]*\)" title "\([^ ]*\)"/ {0x\1, 0x\2 - 0x\1, 0x\3 - 0x\1},/' \
|
||||||
|
-e 's/0x - 0x[^ ,}]*/0/g' -e 's/0x}/0}/'
|
||||||
|
fi
|
||||||
|
) > caseconv.t
|
69
newlib/libc/ctype/mkcategories
Executable file
69
newlib/libc/ctype/mkcategories
Executable file
@ -0,0 +1,69 @@
|
|||||||
|
#! /bin/sh
|
||||||
|
|
||||||
|
# generate table of Unicode character category ranges;
|
||||||
|
# note: undefined characters between two characters of the same category
|
||||||
|
# are associated to the same category, e.g.
|
||||||
|
#0A0A;GURMUKHI LETTER UU;Lo
|
||||||
|
#0A0B..0A0E -> Lo
|
||||||
|
#0A0F;GURMUKHI LETTER EE;Lo
|
||||||
|
|
||||||
|
if [ -r UnicodeData.txt ]
|
||||||
|
then UnicodeData=UnicodeData.txt
|
||||||
|
elif [ -r /usr/share/unicode/ucd/UnicodeData.txt ]
|
||||||
|
then UnicodeData=/usr/share/unicode/ucd/UnicodeData.txt
|
||||||
|
else echo UnicodeData.txt not found >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# the code assumes foldall=false, foldcase=true
|
||||||
|
foldall=false
|
||||||
|
foldcase=true
|
||||||
|
|
||||||
|
(
|
||||||
|
cat <<\/EOS
|
||||||
|
first=
|
||||||
|
item () {
|
||||||
|
if [ -n "$first" ]
|
||||||
|
then if [ $(( 0x$1 )) -ne $(( 0x${last-0} + 1 )) ]
|
||||||
|
then range
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "$first" ]
|
||||||
|
then first=$1
|
||||||
|
val=$2
|
||||||
|
fi
|
||||||
|
|
||||||
|
last=$1
|
||||||
|
}
|
||||||
|
range () {
|
||||||
|
# echo " {0x$first, 0x$last, CAT_$val},"
|
||||||
|
# echo " {0x$first, $((0x$last - 0x$first)), CAT_$val},"
|
||||||
|
# echo " {0x$first | (CAT_$val << 24), $((0x$last - 0x$first))},"
|
||||||
|
echo " {CAT_$val, 0x$first, $((0x$last - 0x$first))},"
|
||||||
|
first=
|
||||||
|
}
|
||||||
|
/EOS
|
||||||
|
|
||||||
|
cat "$UnicodeData" |
|
||||||
|
if $foldall
|
||||||
|
then sed -e "s,;L[lu];,;LC;," -e "s,;C[fs];,;Cfs;," \
|
||||||
|
-e "s,;L[mo];,;Lmo;," -e "s,;Nl;,;Lmo;," \
|
||||||
|
-e "s,;P.;,;P;," -e "s,;No;,;P;," \
|
||||||
|
-e "s,;S.;,;S;," -e "s,;Z[lp];,;Zlp;," \
|
||||||
|
-e "s,;C[no];,;X;," -e "s,;M[cen];,;M;,"
|
||||||
|
elif $foldcase
|
||||||
|
then
|
||||||
|
# fold Lu/Ll to LC only if lower/upper conversion is available
|
||||||
|
sed -e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;][^;]*\);.*/ s/;Lu;/;LC;/' \
|
||||||
|
-e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;][^;]*\);\([^;]*\);.*/ s/;Ll;/;LC;/' \
|
||||||
|
-e '/;Co;/ d'
|
||||||
|
else cat
|
||||||
|
fi |
|
||||||
|
sed -e "s,^\([^;]*\);[^;]*;\([^;]*\);.*,\1 \2," |
|
||||||
|
uniq -f1 --group=append | sed -e "s,^$,range," -e t -e "s,^,item ,"
|
||||||
|
) | sh > categories.t
|
||||||
|
|
||||||
|
sed -e "s/.*\(CAT_[A-Za-z]*\).*/ \1,/" categories.t |
|
||||||
|
sort | uniq > categories.cat
|
||||||
|
|
40
newlib/libc/ctype/mkunidata
Executable file
40
newlib/libc/ctype/mkunidata
Executable file
@ -0,0 +1,40 @@
|
|||||||
|
#! /bin/sh
|
||||||
|
|
||||||
|
echo generating Unicode character properties data for newlib/libc/ctype
|
||||||
|
|
||||||
|
cd `dirname $0`
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
# checks and (with option -u) download
|
||||||
|
|
||||||
|
case "$1" in
|
||||||
|
-u)
|
||||||
|
#WGET=wget -N -t 1 --timeout=55
|
||||||
|
WGET=curl -R -O --connect-timeout 55
|
||||||
|
WGET+=-z $@
|
||||||
|
|
||||||
|
echo downloading data from unicode.org
|
||||||
|
for data in UnicodeData.txt
|
||||||
|
do $WGET http://unicode.org/Public/UNIDATA/$data
|
||||||
|
done
|
||||||
|
;;
|
||||||
|
*) echo checking package unicode-ucd
|
||||||
|
grep unicode-ucd /etc/setup/installed.db || exit 9
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
for data in UnicodeData.txt
|
||||||
|
do test -r $data || ln -s /usr/share/unicode/ucd/$data . || exit 9
|
||||||
|
done
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
# table generation
|
||||||
|
|
||||||
|
echo generating character category table for "isw*.c"
|
||||||
|
sh ./mkcategories
|
||||||
|
|
||||||
|
echo generating case conversion table for "tow*.c"
|
||||||
|
sh ./mkcaseconv
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
# end
|
Loading…
x
Reference in New Issue
Block a user