* ctype.cc (_CTYPE_DATA_0_127): Add _B class to TAB character.

(__ctype_default): New character class array for default ASCII character set. (__ctype_iso): New array of character class array for ISO charsets. (__ctype_cp): Ditto for singlebyte Windows codepages. (tolower): Implement as distinct function to support any singlebyte charset. (toupper): Ditto. (__set_ctype): New function to copy singlebyte character classes corresponding to current charset to ctype_b array. Align copyright text to upstream. * dcrt0.cc (dll_crt0_1): Reset current locale to "C" per POSIX. * environ.cc (set_file_api_mode): Remove. (codepage_init): Remove. (parse_thing): Remove "codepage" setting. (environ_init): Set locale according to environment settings, or to current codepage, before converting environment to multibyte. * fhandler.h (fhandler_console::write_replacement_char): Drop argument. * fhandler_console.cc (dev_console::str_to_con): Call sys_cp_mbstowcs rather than MultiByteToWideChar. (fhandler_console::write_replacement_char): Always print a funny half filled square if a character isn't in the current charset. (fhandler_console::write_normal): Convert to using __mbtowc rather than next_char. * fork.cc (frok::child): Drop call to set_file_api_mode. * globals.cc (enum codepage_type) Remove. (current_codepage): Remove. * miscfuncs.cc (cygwin_wcslwr): Unused, dangerous. Remove. (cygwin_wcsupr): Ditto. (is_cp_multibyte): Remove. (next_char): Remove. * miscfuncs.h (is_cp_multibyte): Drop declaration. (next_char): Ditto. * strfuncs.cc (get_cp): Remove. (__db_wctomb): New function to implement _wctomb_r functionality for doublebyte charsets using WideCharToMultiByte. (__sjis_wctomb): New function to replace unusable newlib function. (__jis_wctomb): Ditto. (__eucjp_wctomb): Ditto. (__gbk_wctomb): New function. (__kr_wctomb): Ditto. (__big5_wctomb): Ditto. (__db_mbtowc): New function to implement _mbtowc_r functionality for doublebyte charsets using MultiByteToWideChar. (__sjis_mbtowc): New function to replace unusable newlib function. (__jis_mbtowc): Ditto. (__eucjp_mbtowc): Ditto. (__gbk_mbtowc): New function. (__kr_mbtowc): New function (__big5_mbtowc): New function (__set_charset_from_codepage): New function. (sys_wcstombs): Reimplement, basically using same wide char to multibyte conversion as newlib's application level functions. Plus extras. Add lengthy comment to explain. Change return type to size_t. (sys_wcstombs_alloc): Just use sys_wcstombs. Change return type to size_t. (sys_cp_mbstowcs): Replace sys_mbstowcs, take additional codepage argument. Explain why. Change return type to size_t. (sys_mbstowcs_alloc): Just use sys_mbstowcs. Change return type to size_t. * wchar.h: Declare internal functions implemented in strfuncs.cc. (wcscasecmp): Remove. (wcsncasecmp): Remove. (wcslwr): Remove. (wcsupr): Remove. * winsup.h (codepage_init): Remove declaration. (get_cp): Ditto. (sys_wcstombs): Align declaration to new implementation. (sys_wcstombs_alloc): Ditto. (sys_cp_mbstowcs): Add declaration. (sys_mbstowcs): Define as inline function. (sys_mbstowcs_alloc): Align declaration to new implementation. (set_file_api_mode): Remove declaration. * include/ctype.h (isblank): Redefine to use _B character class. (toupper): Remove ASCII-only definition. (tolower): Ditto.
2009-03-24 12:18:34 +00:00
parent 6a32d500a9
commit 161211d186
14 changed files with 1337 additions and 316 deletions
--- a/winsup/cygwin/strfuncs.cc
+++ b/winsup/cygwin/strfuncs.cc
@@ -20,45 +20,356 @@ details. */
 #include "fhandler.h"
 #include "dtable.h"
 #include "cygheap.h"
+#include "tls_pbuf.h"

-UINT
-get_cp ()
+/* The SJIS, JIS and EUCJP conversion in newlib does not use UTF as
+   wchar_t character representation.  That's unfortunate for us since
+   we require UTF for the OS.  What we do here is to have our own
+   implementation of the base functions for the conversion using
+   the MulitByteToWideChar/WideCharToMultiByte functions. */
+
+/* GBK, CP949, and Big5 conversions are not available so far in newlib. */
+
+static int
+__db_wctomb (struct _reent *r, char *s, wchar_t wchar, UINT cp)
 {
-  if (!active_codepage)
-    codepage_init ("ansi");
-  return active_codepage;
+  if (s == NULL)
+    return 0;
+
+  if (wchar < 0x80)
+    {
+      *s = (char) wchar;
+      return 1;
+    }
+
+  BOOL def_used = false;
+  int ret = WideCharToMultiByte (cp, cp > 50000 ? 0 : WC_NO_BEST_FIT_CHARS,
+				 &wchar, 1, s, MB_CUR_MAX, NULL, &def_used);
+  if (ret > 0 && !def_used)
+    return ret;
+
+  r->_errno = EILSEQ;
+  return -1;
 }

-/* tlen is always treated as the maximum buffer size, including the '\0'
-   character.  sys_wcstombs will always return a 0-terminated result, no
-   matter what. */
-int __stdcall
-sys_wcstombs (char *tgt, int tlen, const PWCHAR src, int slen)
+extern "C" int
+__sjis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
+	       mbstate_t *state)
 {
-  int ret;
+  return __db_wctomb (r,s, wchar, 932);
+}

-  /* Convert UNICODE private use area.  Reverse functionality (only for
-     path names) is transform_chars in path.cc. */
-  if (slen < 0)
-    slen = wcslen (src) + 1;
-  WCHAR sbuf[slen];
-  memcpy (sbuf, src, slen * sizeof (WCHAR));
-  const unsigned char *end = (unsigned char *) (sbuf + slen);
-  for (unsigned char *s = ((unsigned char *) sbuf) + 1; s < end;
-       s += sizeof (WCHAR))
-    if (*s == 0xf0)
-      *s = 0;
-  ret = WideCharToMultiByte (get_cp (), 0, sbuf, slen, tgt, tlen, NULL, NULL);
-  if (ret && tgt)
+extern "C" int
+__jis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
+	       mbstate_t *state)
+{
+  return __db_wctomb (r,s, wchar, 50220);
+}
+
+extern "C" int
+__eucjp_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
+	       mbstate_t *state)
+{
+  return __db_wctomb (r,s, wchar, 51932);
+}
+
+extern "C" int
+__gbk_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
+	       mbstate_t *state)
+{
+  return __db_wctomb (r,s, wchar, 936);
+}
+
+extern "C" int
+__kr_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
+	       mbstate_t *state)
+{
+  return __db_wctomb (r,s, wchar, 949);
+}
+
+extern "C" int
+__big5_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
+	       mbstate_t *state)
+{
+  return __db_wctomb (r,s, wchar, 950);
+}
+
+static int
+__db_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
+	     UINT cp, mbstate_t *state)
+{
+  wchar_t dummy;
+  char buf[2];
+  int ret;
+  
+  if (pwc == NULL)
+    pwc = &dummy;
+
+  if (s == NULL)
+    return 0;  /* not state-dependent */
+
+  if (n == 0)
+    return -2;
+
+  if (state->__count == 0)
    {
-      ret = (ret < tlen) ? ret : tlen - 1;
-      tgt[ret] = '\0';
+      if (*(unsigned char *) s < 0x80)
+	{
+	  *pwc = *(unsigned char *) s;
+	  return *s ? 1 : 0;
+	}
+      ret = MultiByteToWideChar (cp, cp > 50000 ? 0 : MB_ERR_INVALID_CHARS,
+				 s, 2, pwc, 1);
+      if (ret)
+	return *s ? 2 : 0;
+      if (n == 1)
+	{
+	  state->__count = 1;
+	  state->__value.__wchb[0] = *s;
+	  return -2;
+	}
+      else
+	{
+	  /* These Win32 functions are really crappy.  Assuming n is 2
+	     but the first byte is a singlebyte charcode, the function
+	     does not convert that byte and return 1, rather it just
+	     returns 0.  So, what we do here is to check if the first
+	     byte returns a valid value... */
+	  ret = MultiByteToWideChar (cp,
+				     cp > 50000 ? 0 : MB_ERR_INVALID_CHARS,
+				     s, 1, pwc, 1);
+	  if (ret)
+	    return *s ? 1 : 0;
+	}
+      r->_errno = EILSEQ;
+      return -1;
+    }
+  if (!*s)
+    return -2;
+  buf[0] = state->__value.__wchb[0];
+  buf[1] = *s;
+  ret = MultiByteToWideChar (cp, cp > 50000 ? 0 : MB_ERR_INVALID_CHARS,
+			     buf, 2, pwc, 1);
+  if (!ret)
+    {
+      r->_errno = EILSEQ;
+      return -1;
    }
  return ret;
 }

+extern "C" int
+__sjis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
+	       const char *charset, mbstate_t *state)
+{
+  return __db_mbtowc (r, pwc, s, n, 932, state);
+}
+
+extern "C" int
+__jis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
+	       const char *charset, mbstate_t *state)
+{
+  return __db_mbtowc (r, pwc, s, n, 50220, state);
+}
+
+extern "C" int
+__eucjp_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
+	       const char *charset, mbstate_t *state)
+{
+  return __db_mbtowc (r, pwc, s, n, 51932, state);
+}
+
+extern "C" int
+__gbk_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
+	       const char *charset, mbstate_t *state)
+{
+  return __db_mbtowc (r, pwc, s, n, 936, state);
+}
+
+extern "C" int
+__kr_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
+	       const char *charset, mbstate_t *state)
+{
+  return __db_mbtowc (r, pwc, s, n, 949, state);
+}
+
+extern "C" int
+__big5_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
+	       const char *charset, mbstate_t *state)
+{
+  return __db_mbtowc (r, pwc, s, n, 950, state);
+}
+
+/* Convert Windows codepage to a setlocale compatible character set code.
+   Called from newlib's setlocale() with the current ANSI codepage, if the
+   charset isn't given explicitely in the POSIX compatible locale specifier.
+   The function also returns a pointer to the corresponding _mbtowc_r
+   function.  This is used below in the sys_cp_mbstowcs function which
+   is called directly from fhandler_console if the "Alternate Charset" has
+   been switched on by an escape sequence. */
+extern "C" mbtowc_p
+__set_charset_from_codepage (UINT cp, char *charset)
+{
+  switch (cp)
+    {
+    case 437:
+    case 720:
+    case 737:
+    case 775:
+    case 850:
+    case 852:
+    case 855:
+    case 857:
+    case 858:
+    case 862:
+    case 866:
+    case 874:
+    case 1125:
+    case 1250:
+    case 1251:
+    case 1252:
+    case 1253:
+    case 1254:
+    case 1255:
+    case 1256:
+    case 1257:
+    case 1258:
+      __small_sprintf (charset, "CP%u", cp);
+      return __cp_mbtowc;
+    case 28591:
+    case 28592:
+    case 28593:
+    case 28594:
+    case 28595:
+    case 28596:
+    case 28597:
+    case 28598:
+    case 28599:
+    case 28603:
+    case 28605:
+      __small_sprintf (charset, "ISO-8859-%u", cp - 28590);
+      return __iso_mbtowc;
+    case 932:
+      strcpy (charset, "SJIS");
+      return __sjis_mbtowc;
+    case 936:
+      strcpy (charset, "GBK");
+      return __gbk_mbtowc;
+    case 949:
+      strcpy (charset, "CP949");
+      return __kr_mbtowc;
+    case 950:
+      strcpy (charset, "BIG5");
+      return __big5_mbtowc;
+    case 50220:
+      strcpy (charset, "JIS");
+      return __jis_mbtowc;
+    case 51932:
+      strcpy (charset, "EUCJP");
+      return __eucjp_mbtowc;
+    case 65001:
+      strcpy (charset, "UTF-8");
+      return __utf8_mbtowc;
+    default:
+      break;
+    }
+  strcpy (charset, "ASCII");
+  return __ascii_mbtowc;
+}
+
+/* Our own sys_wcstombs/sys_mbstowcs functions differ from the
+   wcstombs/mbstowcs API in three ways:
+
+   - The UNICODE private use area is used in filenames to specify
+     characters not allowed in Windows filenames ('*', '?', etc).
+     The sys_wcstombs converts characters in the private use area
+     back to the corresponding ASCII chars.
+
+   - If a wide character in a filename has no representation in the current
+     multibyte charset, then usually you wouldn't be able to access the
+     file.  To fix this problem, sys_wcstombs creates a replacement multibyte
+     sequences for the non-representable wide-char.  The sequence starts with
+     an ASCII SO (0x0e, Ctrl-N), followed by the UTF-8 representation of the
+     character.  The sys_(cp_)mbstowcs function detects ASCII SO characters
+     in the input multibyte string and converts the following multibyte
+     sequence in by treating it as an UTF-8 char.  If that fails, the ASCII
+     SO was probably standalone and it gets just copied over as ASCII SO.
+
+   - The functions always create 0-terminated results, no matter what.
+     If the result is truncated due to buffer size, it's a bug in Cygwin
+     and the buffer in the calling function should be raised. */
+size_t __stdcall
+sys_wcstombs (char *dst, size_t len, const PWCHAR src, size_t nwc)
+{
+  char buf[10];
+  char *ptr = dst;
+  wchar_t *pwcs = (wchar_t *) src;
+  size_t n = 0;
+  mbstate_t ps;
+
+  memset (&ps, 0, sizeof ps);
+  if (dst == NULL)
+    len = (size_t) -1;
+  while (n < len && nwc-- > 0)
+    {
+      wchar_t pw = *pwcs;
+      /* Convert UNICODE private use area.  Reverse functionality (only for
+	 path names) is transform_chars in path.cc. */
+      if ((pw & 0xff00) == 0xf000)
+	pw &= 0xff;
+      int bytes = _wctomb_r (_REENT, buf, pw, &ps);
+      /* Convert chars invalid in the current codepage to a sequence
+         ASCII SO; UTF-8 representation of invalid char.
+	 Do the same for ASCII SO itself. */
+      if ((bytes == -1 || pw == 0x0e) && *__locale_charset () != 'U'/*TF-8*/)
+        {
+	  buf[0] = 0x0e; /* ASCII SO */
+	  bytes = __utf8_wctomb (_REENT, buf + 1, pw, __locale_charset (), &ps);
+	  if (bytes == -1)
+	    {
+	      ++pwcs;
+	      ps.__count = 0;
+	      continue;
+	    }
+	  ++bytes; /* Add the ASCII SO to the byte count. */
+	  if (ps.__count == -4) /* First half of a surrogate pair. */
+	    {
+	      ++pwcs;
+	      if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. */
+		{
+		  ++pwcs;
+		  ps.__count = 0;
+		  continue;
+		}
+	      bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs,
+				      __locale_charset (), &ps);
+	    }
+        }
+      if (n + bytes <= len)
+        {
+          n += bytes;
+          if (dst)
+            {
+              for (int i = 0; i < bytes; ++i)
+                *ptr++ = buf[i];
+            }
+          if (*pwcs++ == 0x00)
+	    break;
+        }
+      else
+        break;
+    }
+  if (n && dst)
+    {
+      n = (n < len) ? n : len - 1;
+      dst[n] = '\0';
+    }
+
+  return n;
+}
+
 /* Allocate a buffer big enough for the string, always including the
-   terminating '\0'.  The buffer pointer is returned in *tgt_p, the return
+   terminating '\0'.  The buffer pointer is returned in *dst_p, the return
   value is the number of bytes written to the buffer, as usual.
   The "type" argument determines where the resulting buffer is stored.
   It's either one of the cygheap_types values, or it's "HEAP_NOTHEAP".
@@ -67,57 +378,129 @@ sys_wcstombs (char *tgt, int tlen, const PWCHAR src, int slen)
   Note that this code is shared by cygserver (which requires it via
   __small_vsprintf) and so when built there plain calloc is the
   only choice.  */
-int __stdcall
-sys_wcstombs_alloc (char **tgt_p, int type, const PWCHAR src, int slen)
+size_t __stdcall
+sys_wcstombs_alloc (char **dst_p, int type, const PWCHAR src, size_t nwc)
 {
-  int ret;
+  size_t ret;

-  ret = WideCharToMultiByte (get_cp (), 0, src, slen, NULL, 0 ,NULL, NULL);
-  if (ret)
+  ret = sys_wcstombs (NULL, (size_t) -1, src, nwc);
+  if (ret > 0)
    {
-      size_t tlen = (slen == -1) ? ret : ret + 1;
+      size_t dlen = ret + 1;

      if (type == HEAP_NOTHEAP)
-	*tgt_p = (char *) calloc (tlen, sizeof (char));
+	*dst_p = (char *) calloc (dlen, sizeof (char));
      else
-	*tgt_p = (char *) ccalloc ((cygheap_types) type, tlen, sizeof (char));
-      if (!*tgt_p)
+	*dst_p = (char *) ccalloc ((cygheap_types) type, dlen, sizeof (char));
+      if (!*dst_p)
 	return 0;
-      ret = sys_wcstombs (*tgt_p, tlen, src, slen);
+      ret = sys_wcstombs (*dst_p, dlen, src, nwc);
    }
  return ret;
 }

-int __stdcall
-sys_mbstowcs (PWCHAR tgt, int tlen, const char *src, int slen)
+/* sys_cp_mbstowcs is actually most of the time called as sys_mbstowcs with
+   a 0 codepage.  If cp is not 0, the codepage is evaluated and used for the
+   conversion.  This is so that fhandler_console can switch to an alternate
+   charset, which is the charset returned by GetConsoleCP ().  Most of the
+   time this is used for box and line drawing characters. */
+size_t __stdcall
+sys_cp_mbstowcs (UINT cp, PWCHAR dst, size_t dlen, const char *src, size_t nms)
 {
-  int ret = MultiByteToWideChar (get_cp (), 0, src, slen, tgt, tlen);
-  if (ret && tgt)
+  wchar_t *ptr = dst;
+  char *pmbs = (char *) src;
+  size_t count = 0;
+  size_t len = dlen;
+  int bytes;
+  mbstate_t ps;
+  char charsetbuf[32];
+  char *charset = __locale_charset ();
+  mbtowc_p f_mbtowc = __mbtowc;
+
+  if (cp)
+    f_mbtowc = __set_charset_from_codepage (cp, charset = charsetbuf);
+
+  memset (&ps, 0, sizeof ps);
+  if (dst == NULL)
+    len = (size_t)-1;
+  while (len > 0)
    {
-      ret = (ret < tlen) ? ret : tlen - 1;
-      tgt[ret] = L'\0';
+      /* ASCII SO.  Convert following UTF-8 sequence (if not UTF-8 anyway). */
+      if (*pmbs == 0x0e && *charset != 'U'/*TF-8*/)
+	{
+	  pmbs++;
+	  bytes = __utf8_mbtowc (_REENT, ptr, pmbs, nms, charset, &ps);
+	  if (bytes < 0)
+	    {
+	      /* Invalid UTF-8 sequence?  Treat the ASCII SO character as
+	         stand-alone ASCII SO char. */
+	      bytes = 1;
+	      if (dst)
+	      	*ptr = 0x0e;
+	      memset (&ps, 0, sizeof ps);
+	      break;
+	    }
+	  if (bytes == 0)
+	    break;
+	  if (ps.__count == 4) /* First half of a surrogate. */
+	    {
+	      wchar_t *ptr2 = dst ? ptr + 1 : NULL;
+	      int bytes2 = __utf8_mbtowc (_REENT, ptr2, pmbs + bytes,
+					  nms - bytes, charset, &ps);
+	      if (bytes2 < 0)
+		break;
+	      pmbs += bytes2;
+	      nms -= bytes2;
+	      ++count;
+	      ptr = dst ? ptr + 1 : NULL;
+	      --len;
+	    }
+	}
+      else
+	bytes = f_mbtowc (_REENT, ptr, pmbs, nms, charset, &ps);
+      if (bytes > 0)
+        {
+          pmbs += bytes;
+          nms -= bytes;
+          ++count;
+	  ptr = dst ? ptr + 1 : NULL;
+          --len;
+        }
+      else
+	{
+	  if (bytes == 0)
+	    ++count;
+	  break;
+	}
    }
-  return ret;
+
+  if (count && dst)
+    {
+      count = (count < dlen) ? count : dlen - 1;
+      dst[count] = L'\0';
+    }
+
+  return count;
 }

 /* Same as sys_wcstombs_alloc, just backwards. */
-int __stdcall
-sys_mbstowcs_alloc (PWCHAR *tgt_p, int type, const char *src, int slen)
+size_t __stdcall
+sys_mbstowcs_alloc (PWCHAR *dst_p, int type, const char *src, size_t nms)
 {
-  int ret;
+  size_t ret;

-  ret = MultiByteToWideChar (get_cp (), 0, src, slen, NULL, 0);
-  if (ret)
+  ret = sys_mbstowcs (NULL, (size_t) -1, src, nms);
+  if (ret > 0)
    {
-      size_t tlen = (slen == -1 ? ret : ret + 1);
+      size_t dlen = ret + 1;

      if (type == HEAP_NOTHEAP)
-	*tgt_p = (PWCHAR) calloc (tlen, sizeof (WCHAR));
+	*dst_p = (PWCHAR) calloc (dlen, sizeof (WCHAR));
      else
-	*tgt_p = (PWCHAR) ccalloc ((cygheap_types) type, tlen, sizeof (WCHAR));
-      if (!*tgt_p)
+	*dst_p = (PWCHAR) ccalloc ((cygheap_types) type, dlen, sizeof (WCHAR));
+      if (!*dst_p)
 	return 0;
-      ret = sys_mbstowcs (*tgt_p, tlen, src, slen);
+      ret = sys_mbstowcs (*dst_p, dlen, src, nms);
    }
  return ret;
 }