From 5eb556c8497ddd680c28e53e04c4badfe612004a Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Sat, 6 Feb 2010 18:28:33 +0000 Subject: [PATCH] * libc/ctype/ctype_cp.h (_CTYPE_GEORGIAN_PS_128_254): Define. (_CTYPE_GEORGIAN_PS_255): Define. (_CTYPE_PT154_128_254): Define. (_CTYPE_PT154_255): Define. (__ctype_cp): Add array members for above ctype definitions. * libc/locale/locale.c (loadlocale): Make TIS-620 charset name available for all targets. Add guards for setting the conversion function pointers. Add support for GEORGIAN-PS and PT154 charsets. Change documentation to reflect current behaviour more closely. * libc/locale/nl_langinfo.c (nl_langinfo): On Cygwin, translate "CP101" to "GEORGIAN-PS" and "CP102" to "PT154". * libc/stdlib/sb_charsets.c (__cp_conv): Add conversion arrays for GEORGIAN-PS and PT154. (__cp_index): Map invalid Windows codepage number 101 to GEORGIAN-PS conversion array, 102 to PT154 conversion array. --- newlib/ChangeLog | 18 +++++ newlib/libc/ctype/ctype_cp.h | 62 +++++++++++++++- newlib/libc/locale/locale.c | 121 ++++++++++++++++++++++--------- newlib/libc/locale/nl_langinfo.c | 4 + newlib/libc/stdlib/sb_charsets.c | 50 ++++++++++++- 5 files changed, 218 insertions(+), 37 deletions(-) diff --git a/newlib/ChangeLog b/newlib/ChangeLog index ee2c6a2be..ee5f3c09c 100644 --- a/newlib/ChangeLog +++ b/newlib/ChangeLog @@ -1,3 +1,21 @@ +2010-02-06 Corinna Vinschen + + * libc/ctype/ctype_cp.h (_CTYPE_GEORGIAN_PS_128_254): Define. + (_CTYPE_GEORGIAN_PS_255): Define. + (_CTYPE_PT154_128_254): Define. + (_CTYPE_PT154_255): Define. + (__ctype_cp): Add array members for above ctype definitions. + * libc/locale/locale.c (loadlocale): Make TIS-620 charset name + available for all targets. Add guards for setting the conversion + function pointers. Add support for GEORGIAN-PS and PT154 charsets. + Change documentation to reflect current behaviour more closely. 
+ * libc/locale/nl_langinfo.c (nl_langinfo): On Cygwin, translate + "CP101" to "GEORGIAN-PS" and "CP102" to "PT154". + * libc/stdlib/sb_charsets.c (__cp_conv): Add conversion arrays + for GEORGIAN-PS and PT154. + (__cp_index): Map invalid Windows codepage number 101 to + GEORGIAN-PS conversion array, 102 to PT154 conversion array. + 2010-02-06 Ralf Corsepius * libc/posix/telldir.c: Remove bogus nested prototype of lseek(). diff --git a/newlib/libc/ctype/ctype_cp.h b/newlib/libc/ctype/ctype_cp.h index 40ecd206b..7ce0ab33d 100644 --- a/newlib/libc/ctype/ctype_cp.h +++ b/newlib/libc/ctype/ctype_cp.h @@ -433,6 +433,42 @@ _U, _U, _U, _U, _U, _U, _U, _U, \ _U, _U, _U, _U, _U, _U, _U #define _CTYPE_CP21866_255 _U +#define _CTYPE_GEORGIAN_PS_128_254 \ + _P, 0, _P, _L, _P, _P, _P, _P, \ + _P, _P, _U, _P, _U, _U, 0, 0, \ + 0, _P, _P, _P, _P, _P, _P, _P, \ + _P, _P, _L, _P, _L, 0, _L, _U, \ + _S|_B, _P, _P, _P, _P, _P, _P, _P, \ + _P, _P, _P, _P, _P, _P, _P, _P, \ + _P, _P, _P, _P, _P, _P, _P, _P, \ + _P, _P, _P, _P, _P, _P, _P, _P, \ + _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, \ + _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, \ + _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, \ + _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, \ + _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _L, _L, \ + _L, _L, _L, _L, _L, _L, _L, _L, \ + _L, _L, _L, _L, _L, _L, _L, _P, \ + _L, _L, _L, _L, _L, _L, _L +#define _CTYPE_GEORGIAN_PS_255 _L +#define _CTYPE_PT154_128_254 \ + _U, _U, _U, _L, _P, _P, _U, _U, \ + _U, _L, _U, _U, _U, _U, _U, _U, \ + _L, _P, _P, _P, _P, _P, _P, _P, \ + _L, _L, _L, _L, _L, _L, _L, _L, \ + _S|_B, _U, _L, _U, _U, _U, _U, _P, \ + _U, _P, _U, _P, _P, _L, _P, _U, \ + _P, _L, _U, _L, _L, _L, _P, _P, \ + _L, _P, _L, _P, _L, _U, _L, _L, \ + _U, _U, _U, _U, _U, _U, _U, _U, \ + _U, _U, _U, _U, _U, _U, _U, _U, \ + _U, _U, _U, _U, _U, _U, _U, _U, \ + _U, _U, _U, _U, _U, _U, _U, _U, \ + _L, _L, _L, _L, _L, _L, _L, _L, \ + _L, _L, _L, _L, _L, _L, 
_L, _L, \ + _L, _L, _L, _L, _L, _L, _L, _L, \ + _L, _L, _L, _L, _L, _L, _L +#define _CTYPE_PT154_255 _L extern int __cp_index (const char *charset_ext); @@ -442,7 +478,7 @@ extern int __cp_index (const char *charset_ext); #ifndef __CYGWIN__ static _CONST #endif -char __ctype_cp[24][128 + 256] = { +char __ctype_cp[26][128 + 256] = { { _CTYPE_CP437_128_254, 0, _CTYPE_DATA_0_127, @@ -587,11 +623,23 @@ char __ctype_cp[24][128 + 256] = { _CTYPE_CP21866_128_254, _CTYPE_CP21866_255 }, + { _CTYPE_GEORGIAN_PS_128_254, + 0, + _CTYPE_DATA_0_127, + _CTYPE_GEORGIAN_PS_128_254, + _CTYPE_GEORGIAN_PS_255 + }, + { _CTYPE_PT154_128_254, + 0, + _CTYPE_DATA_0_127, + _CTYPE_PT154_128_254, + _CTYPE_PT154_255 + }, }; #else /* !defined(ALLOW_NEGATIVE_CTYPE_INDEX) */ -static _CONST char __ctype_cp[22][1 + 256] = { +static _CONST char __ctype_cp[26][1 + 256] = { { 0, _CTYPE_DATA_0_127, _CTYPE_CP437_128_254, @@ -712,6 +760,16 @@ static _CONST char __ctype_cp[22][1 + 256] = { _CTYPE_CP21866_128_254, _CTYPE_CP21866_255 }, + { 0, + _CTYPE_DATA_0_127, + _CTYPE_GEORGIAN_PS_128_254, + _CTYPE_GEORGIAN_PS_255 + }, + { 0, + _CTYPE_DATA_0_127, + _CTYPE_PT154_128_254, + _CTYPE_PT154_255 + }, }; #endif /* ALLOW_NEGATIVE_CTYPE_INDEX */ diff --git a/newlib/libc/locale/locale.c b/newlib/libc/locale/locale.c index 85069aefa..26283c5f4 100644 --- a/newlib/libc/locale/locale.c +++ b/newlib/libc/locale/locale.c @@ -56,34 +56,36 @@ for a given language, a three character string per ISO 639-3. <<"TERRITORY">> is a country code per ISO 3166. For <<"charset">> and <<"modifier">> see below. -Additionally to the POSIX specifier, seven extensions are supported for -backward compatibility with older implementations using newlib: -<<"C-UTF-8">>, <<"C-JIS">>, <<"C-eucJP">>, <<"C-SJIS">>, <>, -<>, <<"C-ISO-8859-x">> with 1 <= x <= 15, or <<"C-CPxxx">> with -xxx in [437, 720, 737, 775, 850, 852, 855, 857, 858, 862, 866, 874, 932, -1125, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258]. 
- +Instead of <<"C-">>, you can specify also <<"C.">>. Both variations allow +Additionally to the POSIX specifier, the following extension is supported +for backward compatibility with older implementations using newlib: +<<"C-charset">>. +Instead of <<"C-">>, you can also specify <<"C.">>. Both variations allow to specify language neutral locales while using other charsets than ASCII, for instance <<"C.UTF-8">>, which keeps all settings as in the C locale, but uses the UTF-8 charset. -Even when using POSIX locale strings, the only charsets allowed are +The following charsets are recognized: <<"UTF-8">>, <<"JIS">>, <<"EUCJP">>, <<"SJIS">>, <<"KOI8-R">>, <<"KOI8-U">>, -<<"ISO-8859-x">> with 1 <= x <= 15, or <<"CPxxx">> with xxx in -[437, 720, 737, 775, 850, 852, 855, 857, 858, 862, 866, 874, 932, 1125, 1250, -1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258]. +<<"GEORGIAN-PS">>, <<"PT154">>, <<"TIS-620">>, <<"ISO-8859-x">> with +1 <= x <= 16, or <<"CPxxx">> with xxx in [437, 720, 737, 775, 850, 852, 855, +857, 858, 862, 866, 874, 932, 1125, 1250, 1251, 1252, 1253, 1254, 1255, 1256, +1257, 1258]. + Charsets are case insensitive. For instance, <<"EUCJP">> and <<"eucJP">> are equivalent. Charset names with dashes can also be written without dashes, as in <<"UTF8">>, <<"iso88591">> or <<"koi8r">>. <<"EUCJP">> and <<"EUCKR"> are also recognized with dash, <<"EUC-JP">> and <<"EUC-KR">>. +Full support for all of the above charsets requires that newlib has been +built with multibyte support and support for all ISO and Windows Codepages. +Otherwise all singlebyte charsets are simply mapped to ASCII. Right now, +only newlib for Cygwin is built with full charset support by default. +Under Cygwin, this implementation additionally supports the charsets +<<"GBK">>, <<"eucKR">>, and <<"Big5">>. Cygwin does not support <<"JIS">>. + (<<"">> is also accepted; if given, the settings are read from the corresponding LC_* environment variables and $LANG according to POSIX rules.
-Under Cygwin, this implementation additionally supports the charsets -<<"GBK">>, <<"eucKR">>, <<"Big5">>, and <<"TIS-620">>. - This implementation also supports a single modifier, <<"cjknarrow">>. Any other modifier is ignored. <<"cjknarrow">>, in conjunction with one of the language specifiers <<"ja">>, <<"ko">>, and <<"zh">> specifies @@ -720,18 +722,82 @@ loadlocale(struct _reent *p, int category) l_mbtowc = __ascii_mbtowc; #endif break; -#ifdef __CYGWIN__ case 'G': case 'g': - if (strcasecmp (charset, "GBK")) - return NULL; - strcpy (charset, "GBK"); - mbc_max = 2; +#ifdef __CYGWIN__ + if (!strcasecmp (charset, "GBK")) + { + strcpy (charset, "GBK"); + mbc_max = 2; #ifdef _MB_CAPABLE - l_wctomb = __gbk_wctomb; - l_mbtowc = __gbk_mbtowc; + l_wctomb = __gbk_wctomb; + l_mbtowc = __gbk_mbtowc; +#endif + } + else +#endif /* __CYGWIN__ */ + /* GEORGIAN-PS and the alias without dash */ + if (!strncasecmp (charset, "GEORGIAN", 8)) + { + c = charset + 8; + if (*c == '-') + ++c; + if (strcasecmp (c, "PS")) + return NULL; + strcpy (charset, "CP101"); + mbc_max = 1; +#ifdef _MB_CAPABLE +#ifdef _MB_EXTENDED_CHARSETS_WINDOWS + l_wctomb = __cp_wctomb; + l_mbtowc = __cp_mbtowc; +#else /* !_MB_EXTENDED_CHARSETS_WINDOWS */ + l_wctomb = __ascii_wctomb; + l_mbtowc = __ascii_mbtowc; +#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */ +#endif + } + else + return NULL; + break; + case 'P': + case 'p': + /* PT154 */ + if (strcasecmp (charset, "PT154")) + return NULL; + strcpy (charset, "CP102"); + mbc_max = 1; +#ifdef _MB_CAPABLE +#ifdef _MB_EXTENDED_CHARSETS_WINDOWS + l_wctomb = __cp_wctomb; + l_mbtowc = __cp_mbtowc; +#else /* !_MB_EXTENDED_CHARSETS_WINDOWS */ + l_wctomb = __ascii_wctomb; + l_mbtowc = __ascii_mbtowc; +#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */ #endif break; + case 'T': + case 't': + if (strncasecmp (charset, "TIS", 3)) + return NULL; + c = charset + 3; + if (*c == '-') + ++c; + if (strcasecmp (c, "620")) + return NULL; + strcpy (charset, "CP874"); + mbc_max = 1; +#ifdef 
_MB_CAPABLE +#ifdef _MB_EXTENDED_CHARSETS_WINDOWS + l_wctomb = __cp_wctomb; + l_mbtowc = __cp_mbtowc; +#else /* !_MB_EXTENDED_CHARSETS_WINDOWS */ + l_wctomb = __ascii_wctomb; + l_mbtowc = __ascii_mbtowc; +#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */ +#endif + break; +#ifdef __CYGWIN__ case 'B': case 'b': if (strcasecmp (charset, "BIG5")) @@ -741,17 +807,6 @@ loadlocale(struct _reent *p, int category) #ifdef _MB_CAPABLE l_wctomb = __big5_wctomb; l_mbtowc = __big5_mbtowc; -#endif - break; - case 'T': - case 't': - if (strcasecmp (charset, "TIS620") && strcasecmp (charset, "TIS-620")) - return NULL; - strcpy (charset, "CP874"); - mbc_max = 1; -#ifdef _MB_CAPABLE - l_wctomb = __cp_wctomb; - l_mbtowc = __cp_mbtowc; #endif break; #endif /* __CYGWIN__ */ diff --git a/newlib/libc/locale/nl_langinfo.c b/newlib/libc/locale/nl_langinfo.c index cd86c158d..8e8856de7 100644 --- a/newlib/libc/locale/nl_langinfo.c +++ b/newlib/libc/locale/nl_langinfo.c @@ -78,6 +78,10 @@ _DEFUN(nl_langinfo, (item), ret = "KOI8-R"; else if (strcmp (ret + 2, "21866") == 0) ret = "KOI8-U"; + else if (strcmp (ret + 2, "101") == 0) + ret = "GEORGIAN-PS"; + else if (strcmp (ret + 2, "102") == 0) + ret = "PT154"; } else if (ret[0] == 'S'/*JIS*/) { diff --git a/newlib/libc/stdlib/sb_charsets.c b/newlib/libc/stdlib/sb_charsets.c index 4ab1de69a..e668c4b83 100644 --- a/newlib/libc/stdlib/sb_charsets.c +++ b/newlib/libc/stdlib/sb_charsets.c @@ -203,7 +203,7 @@ wchar_t __iso_8859_conv[14][0x60] = { value (function __cp_index), the second index is the value of the incoming character - 0x80. Values < 0x80 don't have to be converted anyway. 
*/ -wchar_t __cp_conv[24][0x80] = { +wchar_t __cp_conv[26][0x80] = { /* CP437 */ { 0xc7, 0xfc, 0xe9, 0xe2, 0xe4, 0xe0, 0xe5, 0xe7, 0xea, 0xeb, 0xe8, 0xef, 0xee, 0xec, 0xc4, 0xc5, @@ -611,7 +611,47 @@ wchar_t __cp_conv[24][0x80] = { 0x42e, 0x410, 0x411, 0x426, 0x414, 0x415, 0x424, 0x413, 0x425, 0x418, 0x419, 0x41a, 0x41b, 0x41c, 0x41d, 0x41e, 0x41f, 0x42f, 0x420, 0x421, 0x422, 0x423, 0x416, 0x412, - 0x42c, 0x42b, 0x417, 0x428, 0x42d, 0x429, 0x427, 0x42a } + 0x42c, 0x42b, 0x417, 0x428, 0x42d, 0x429, 0x427, 0x42a }, + /* The following are not valid Windows codepages, but they fit nicely here. + The CP numbers are only used internally and are guaranteed not to clash + with valid Windows codepage identifiers. */ + /* CP101 (GEORGIAN-PS) Georgian charset, used as the default charset in + the ka_GE locale (Georgian, Georgia). Apparently derived from Windows + CP1252. */ + { 0x80, 0x81, 0x201a, 0x192, 0x201e, 0x2026, 0x2020, 0x2021, + 0x2c6, 0x2030, 0x160, 0x2039, 0x152, 0x8d, 0x8e, 0x8f, + 0x90, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, + 0x2dc, 0x2122, 0x161, 0x203a, 0x153, 0x9d, 0x9e, 0x178, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0x10d0, 0x10d1, 0x10d2, 0x10d3, 0x10d4, 0x10d5, 0x10d6, 0x10f1, + 0x10d7, 0x10d8, 0x10d9, 0x10da, 0x10db, 0x10dc, 0x10f2, 0x10dd, + 0x10de, 0x10df, 0x10e0, 0x10e1, 0x10e2, 0x10f3, 0x10e3, 0x10e4, + 0x10e5, 0x10e6, 0x10e7, 0x10e8, 0x10e9, 0x10ea, 0x10eb, 0x10ec, + 0x10ed, 0x10ee, 0x10f4, 0x10ef, 0x10f0, 0x10f5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff }, + /* CP102 (PT154) Cyrillic-Asian charset, used as the default charset in + the kk_KZ locale (Kazakh, Kazakhstan).
*/ + { 0x496, 0x492, 0x4ee, 0x493, 0x201e, 0x2026, 0x4b6, 0x4ae, + 0x4b2, 0x4af, 0x4a0, 0x4e2, 0x4a2, 0x49a, 0x4ba, 0x4b8, + 0x497, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, + 0x4b3, 0x4b7, 0x4a1, 0x4e3, 0x4a3, 0x49b, 0x4bb, 0x4b9, + 0xa0, 0x40e, 0x45e, 0x408, 0x4e8, 0x498, 0x4b0, 0xa7, + 0x401, 0xa9, 0x4d8, 0xab, 0xac, 0x4ef, 0xae, 0x49c, + 0xb0, 0x4b1, 0x406, 0x456, 0x499, 0x4e9, 0xb6, 0xb7, + 0x451, 0x2116, 0x4d9, 0xbb, 0x458, 0x4aa, 0x4ab, 0x49d, + 0x410, 0x411, 0x412, 0x413, 0x414, 0x415, 0x416, 0x417, + 0x418, 0x419, 0x41a, 0x41b, 0x41c, 0x41d, 0x41e, 0x41f, + 0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427, + 0x428, 0x429, 0x42a, 0x42b, 0x42c, 0x42d, 0x42e, 0x42f, + 0x430, 0x431, 0x432, 0x433, 0x434, 0x435, 0x436, 0x437, + 0x438, 0x439, 0x43a, 0x43b, 0x43c, 0x43d, 0x43e, 0x43f, + 0x440, 0x441, 0x442, 0x443, 0x444, 0x445, 0x446, 0x447, + 0x448, 0x449, 0x44a, 0x44b, 0x44c, 0x44d, 0x44e, 0x44f } }; #endif /* _MB_EXTENDED_CHARSETS_WINDOWS */ @@ -727,6 +767,12 @@ __cp_index (const char *charset_ext) case 21866: cp_idx = 23; break; + case 101: + cp_idx = 24; + break; + case 102: + cp_idx = 25; + break; default: cp_idx = -1; break;