* ctype.cc (_CTYPE_DATA_0_127): Add _B class to TAB character.

(__ctype_default): New character class array for default ASCII
	character set.
	(__ctype_iso): New array of character class array for ISO charsets.
	(__ctype_cp): Ditto for singlebyte Windows codepages.
	(tolower): Implement as distinct function to support any singlebyte
	charset.
	(toupper): Ditto.
	(__set_ctype): New function to copy singlebyte character classes
	corresponding to current charset to ctype_b array.
	Align copyright text to upstream.
	* dcrt0.cc (dll_crt0_1): Reset current locale to "C" per POSIX.
	* environ.cc (set_file_api_mode): Remove.
	(codepage_init): Remove.
	(parse_thing): Remove "codepage" setting.
	(environ_init): Set locale according to environment settings, or
	to current codepage, before converting environment to multibyte.
	* fhandler.h (fhandler_console::write_replacement_char): Drop argument.
	* fhandler_console.cc (dev_console::str_to_con): Call sys_cp_mbstowcs
	rather than MultiByteToWideChar.
	(fhandler_console::write_replacement_char): Always print a funny
	half filled square if a character isn't in the current charset.
	(fhandler_console::write_normal): Convert to using __mbtowc
	rather than next_char.
	* fork.cc (frok::child): Drop call to set_file_api_mode.
	* globals.cc (enum codepage_type): Remove.
	(current_codepage): Remove.
	* miscfuncs.cc (cygwin_wcslwr): Unused, dangerous.  Remove.
	(cygwin_wcsupr): Ditto.
	(is_cp_multibyte): Remove.
	(next_char): Remove.
	* miscfuncs.h (is_cp_multibyte): Drop declaration.
	(next_char): Ditto.
	* strfuncs.cc (get_cp): Remove.
	(__db_wctomb): New function to implement _wctomb_r functionality for
	doublebyte charsets using WideCharToMultiByte.
	(__sjis_wctomb): New function to replace unusable newlib function.
	(__jis_wctomb): Ditto.
	(__eucjp_wctomb): Ditto.
	(__gbk_wctomb): New function.
	(__kr_wctomb): Ditto.
	(__big5_wctomb): Ditto.
	(__db_mbtowc): New function to implement _mbtowc_r functionality for
	doublebyte charsets using MultiByteToWideChar.
	(__sjis_mbtowc): New function to replace unusable newlib function.
	(__jis_mbtowc): Ditto.
	(__eucjp_mbtowc): Ditto.
	(__gbk_mbtowc): New function.
	(__kr_mbtowc): New function.
	(__big5_mbtowc): New function.
	(__set_charset_from_codepage): New function.
	(sys_wcstombs): Reimplement, basically using same wide char to multibyte
	conversion as newlib's application level functions.  Plus extras.
	Add lengthy comment to explain.  Change return type to size_t.
	(sys_wcstombs_alloc): Just use sys_wcstombs.  Change return type to
	size_t.
	(sys_cp_mbstowcs): Replace sys_mbstowcs, take additional codepage
	argument.  Explain why.  Change return type to size_t.
	(sys_mbstowcs_alloc): Just use sys_mbstowcs.  Change return type to
	size_t.
	* wchar.h: Declare internal functions implemented in strfuncs.cc.
	(wcscasecmp): Remove.
	(wcsncasecmp): Remove.
	(wcslwr): Remove.
	(wcsupr): Remove.
	* winsup.h (codepage_init): Remove declaration.
	(get_cp): Ditto.
	(sys_wcstombs): Align declaration to new implementation.
	(sys_wcstombs_alloc): Ditto.
	(sys_cp_mbstowcs): Add declaration.
	(sys_mbstowcs): Define as inline function.
	(sys_mbstowcs_alloc): Align declaration to new implementation.
	(set_file_api_mode): Remove declaration.
	* include/ctype.h (isblank): Redefine to use _B character class.
	(toupper): Remove ASCII-only definition.
	(tolower): Ditto.
This commit is contained in:
Corinna Vinschen
2009-03-24 12:18:34 +00:00
parent 6a32d500a9
commit 161211d186
14 changed files with 1337 additions and 316 deletions

View File

@@ -141,26 +141,6 @@ cygwin_strncasecmp (const char *cs, const char *ct, size_t n)
return RtlCompareUnicodeString (&us, &ut, TRUE);
}
/* Lowercase the wide character string STRING in place and return it.
   The string is wrapped in a UNICODE_STRING descriptor which is then
   passed to RtlDowncaseUnicodeString as both source and destination.
   The status returned by RtlDowncaseUnicodeString is not examined.  */
extern "C" wchar_t * __stdcall
cygwin_wcslwr (wchar_t *string)
{
  UNICODE_STRING ustr;

  RtlInitUnicodeString (&ustr, string);
  /* Same descriptor on both sides: the conversion overwrites STRING.  */
  RtlDowncaseUnicodeString (&ustr, &ustr, FALSE);
  return string;
}
/* Uppercase the wide character string STRING in place and return it.
   The string is wrapped in a UNICODE_STRING descriptor which is then
   passed to RtlUpcaseUnicodeString as both source and destination.
   The status returned by RtlUpcaseUnicodeString is not examined.  */
extern "C" wchar_t * __stdcall
cygwin_wcsupr (wchar_t *string)
{
  UNICODE_STRING ustr;

  RtlInitUnicodeString (&ustr, string);
  /* Same descriptor on both sides: the conversion overwrites STRING.  */
  RtlUpcaseUnicodeString (&ustr, &ustr, FALSE);
  return string;
}
extern "C" char * __stdcall
cygwin_strlwr (char *string)
{
@@ -189,118 +169,6 @@ cygwin_strupr (char *string)
return string;
}
/* Return true if codepage CP is one of the multibyte codepages known to
   this code, false for every other value.

   FIXME?  Only the standard ANSI/OEM codepages according to
   http://www.microsoft.com/globaldev/reference/cphome.mspx are
   recognized, plus UTF-8 and codepage 1361, which MSDN also mentions as
   a valid doublebyte codepage (e.g. in the IsDBCSLeadByteEx man page).
   Everything else will be hosed.  */
bool
is_cp_multibyte (UINT cp)
{
  return cp == 932	/* doublebyte codepages ...  */
	 || cp == 936
	 || cp == 949
	 || cp == 950
	 || cp == 1361
	 || cp == 65001;	/* ... and UTF-8 */
}
/* Return true if BYTE has the form 10xxxxxx, i.e. is a UTF-8
   continuation byte.  */
static inline bool
utf8_is_trail_byte (unsigned char byte)
{
  return (byte & 0xc0) == 0x80;
}

/* UTF-8 part of next_char.  The caller guarantees STR < END.  The return
   value follows the next_char convention: NULL for an invalid sequence,
   STR for a sequence cut off by END, STR + n for a valid sequence of
   n bytes.  */
static const unsigned char *
next_char_utf8 (const unsigned char *str, const unsigned char *end)
{
  const unsigned char lead = str[0];

  if (lead < 0x80)		/* One byte character. */
    return str + 1;
  if (lead < 0xc0)		/* Followup byte.  Invalid as first byte. */
    return NULL;

  if (lead < 0xe0)		/* Two byte character. */
    {
      /* 0xc0 and 0xc1 can only start overlong encodings of U+0000..U+007F
	 and are never valid.  */
      if (lead < 0xc2)
	return NULL;
      if (end - str < 2)
	return str;
      return utf8_is_trail_byte (str[1]) ? str + 2 : NULL;
    }

  if (lead < 0xf0)		/* Three byte character. */
    {
      if (end - str < 3)
	return str;
      /* 0xe0 must be followed by 0xa0..0xbf (no overlong forms), 0xed by
	 0x80..0x9f (no UTF-16 surrogates).  */
      if (utf8_is_trail_byte (str[1]) && utf8_is_trail_byte (str[2])
	  && (lead != 0xe0 || str[1] >= 0xa0)
	  && (lead != 0xed || str[1] <= 0x9f))
	return str + 3;
      return NULL;
    }

  /* Four byte character.  Lead bytes above 0xf4 would encode values
     beyond U+10FFFF.  */
  if (lead > 0xf4)
    return NULL;
  if (end - str < 4)
    return str;
  /* 0xf0 must be followed by 0x90..0xbf (no overlong forms), 0xf4 by
     0x80..0x8f (nothing beyond U+10FFFF).  */
  if (utf8_is_trail_byte (str[1]) && utf8_is_trail_byte (str[2])
      && utf8_is_trail_byte (str[3])
      && (lead != 0xf0 || str[1] >= 0x90)
      && (lead != 0xf4 || str[1] <= 0x8f))
    return str + 4;
  return NULL;
}

/* OMYGOD!  CharNextExA is not UTF-8 aware!  It only works fine with
   double byte charsets.  So we have to do it ourselves for UTF-8.

   While being at it, we do more.  If a double-byte or multibyte
   sequence is truncated due to an early end, we need a way to recognize
   it.  The reason is that multiple buffered write statements might
   accidentally stop and start in the middle of a single character byte
   sequence.  If we have to interpret the byte sequences (as in
   fhandler_console), we would print wrong output in these cases.

   So we have four possible return values here:

   ret = end      if str >= end
   ret = NULL     if we encounter an invalid byte sequence
   ret = str      if we encounter the start byte of a truncated byte sequence
   ret = str + n  if we encounter a valid byte sequence

   CP selects the codepage: the doublebyte codepages 932, 936, 949, 950
   and 1361 are handled via IsDBCSLeadByteEx/CharNextExA, CP_UTF8 is
   validated here, and for any other codepage every byte counts as one
   character.  */
const unsigned char *
next_char (UINT cp, const unsigned char *str, const unsigned char *end)
{
  if (str >= end)
    return end;

  switch (cp)
    {
    case 932:
    case 936:
    case 949:
    case 950:
    case 1361:
      if (*str <= 0x7f)
	return str + 1;
      /* A lead byte sitting in the very last position is a truncated
	 doublebyte character.  */
      if (end - str == 1 && IsDBCSLeadByteEx (cp, *str))
	return str;
      /* CharNextExA is only given the start pointer, not END.  */
      return (const unsigned char *) CharNextExA (cp, (const CHAR *) str, 0);
    case CP_UTF8:
      return next_char_utf8 (str, end);
    default:
      return str + 1;
    }
}
int __stdcall
check_invalid_virtual_addr (const void *s, unsigned sz)
{