* fhandler.h (fhandler_console::trunc_buf): Add to use as cache for

truncated multibyte characters on input. (fhandler_console::write_replacement_char): Declare new method. * fhandler_console.cc (CONVERT_LIMIT): Raise to 64K. (fhandler_console::fhandler_console): Initialize trunc_buf. (ERR): Define as independent value again. (fhandler_console::write_replacement_char): New method to print replacement chars. (fhandler_console::write_normal): Add handling for truncated multibyte sequences. Call next_char instead of pathetic CharNextExA function. Don't change src, rather just work with found later on. * miscfuncs.cc (is_cp_multibyte): Move here from strfuncs.cc. Don't call Windows function, restrict to well-known ANSI/OEM codepages and UTF-8. (next_char): Call CharNextExA only for doublebyte codepages. Implement for UTF-8 here. * strfuncs.cc (is_cp_multibyte): Move to miscfuncs.cc. * winsup.h (next_char): Declare. * include/limits.h (MB_LEN_MAX): Set to maximum value of MB_CUR_MAX as defined by newlib for now.
2008-02-06 18:24:50 +00:00
parent a7197550f3
commit 4b65f19045
7 changed files with 226 additions and 34 deletions
--- a/winsup/cygwin/miscfuncs.cc
+++ b/winsup/cygwin/miscfuncs.cc
@@ -17,7 +17,8 @@ details. */
 #include <alloca.h>
 #include <limits.h>
 #include <wchar.h>
-#include <winbase.h>
+#include <wingdi.h>
+#include <winuser.h>
 #include <winnls.h>
 #include "cygthread.h"
 #include "cygtls.h"
@@ -192,6 +193,118 @@ cygwin_strupr (char *string)
  return string;
 }

+/* Return true if CP is one of the multibyte codepages known here.
+
+   FIXME?  Only the standard ANSI/OEM codepages according to
+   http://www.microsoft.com/globaldev/reference/cphome.mspx are
+   supported, as well as UTF-8 and codepage 1361, which is also
+   mentioned as a valid double-byte codepage in MSDN man pages
+   (e.g. IsDBCSLeadByteEx).  Everything else will be hosed.  */
+
+bool
+is_cp_multibyte (UINT cp)
+{
+  /* Double-byte codepages. */
+  if (cp == 932 || cp == 936 || cp == 949 || cp == 950 || cp == 1361)
+    return true;
+
+  /* UTF-8. */
+  if (cp == 65001)
+    return true;
+
+  return false;
+}
+
+/* CharNextExA is not UTF-8 aware; it only works with double-byte
+   charsets.  So UTF-8 sequences are parsed by hand here, strictly:
+   overlong forms, surrogates and values above U+10FFFF are invalid.
+
+   A double-byte or multibyte sequence which is truncated by an early
+   end of the buffer must be recognizable, too.  Multiple buffered
+   write statements might accidentally stop and start in the middle of
+   a single character byte sequence.  A caller which interprets the
+   byte sequences (as fhandler_console does) would otherwise print
+   wrong output in these cases.  So there are four possible return
+   values:
+
+   ret = end      if str >= end
+   ret = NULL     if we encounter an invalid byte sequence
+   ret = str      if we encounter the start byte of a truncated byte sequence
+   ret = str + n  if we encounter a valid byte sequence  */
+
+const unsigned char *
+next_char (UINT cp, const unsigned char *str, const unsigned char *end)
+{
+  const unsigned char *ret;
+
+  if (str >= end)
+    return end;
+
+  switch (cp)
+    {
+    case 932:
+    case 936:
+    case 949:
+    case 950:
+    case 1361:
+      if (*str <= 0x7f)
+	ret = str + 1;
+      else if (str == end - 1 && IsDBCSLeadByteEx (cp, *str))
+	ret = str;	/* Lead byte is the last byte available: truncated. */
+      else
+	ret = (const unsigned char *) CharNextExA (cp, (const CHAR *) str, 0);
+      break;
+    case CP_UTF8:
+      switch (str[0] >> 4)
+	{
+	case 0x0 ... 0x7:	/* One byte character. */
+	  ret = str + 1;
+	  break;
+	case 0x8 ... 0xb:	/* Followup byte.  Invalid as first byte. */
+	  ret = NULL;
+	  break;
+	case 0xc ... 0xd:	/* Two byte character. */
+	  if (str[0] < 0xc2)	/* 0xc0 and 0xc1 only start overlong forms. */
+	    ret = NULL;
+	  else if (end - str < 2)
+	    ret = str;
+	  else if ((str[1] & 0xc0) == 0x80)
+	    ret = str + 2;
+	  else
+	    ret = NULL;
+	  break;
+	case 0xe:		/* Three byte character. */
+	  if (end - str < 3)
+	    ret = str;
+	  else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80
+		   && (str[0] != 0xe0 || str[1] >= 0xa0)	/* No overlongs. */
+		   && (str[0] != 0xed || str[1] <= 0x9f))	/* No surrogates. */
+	    ret = str + 3;
+	  else
+	    ret = NULL;
+	  break;
+	case 0xf:		/* Four byte character. */
+	  if (str[0] > 0xf4)	/* Would encode a value above U+10FFFF. */
+	    ret = NULL;
+	  else if (end - str < 4)
+	    ret = str;
+	  else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80
+		   && (str[3] & 0xc0) == 0x80
+		   && (str[0] != 0xf0 || str[1] >= 0x90)	/* No overlongs. */
+		   && (str[0] != 0xf4 || str[1] <= 0x8f))	/* <= U+10FFFF. */
+	    ret = str + 4;
+	  else
+	    ret = NULL;
+	  break;
+	}
+      break;
+    default:
+      ret = str + 1;
+      break;
+    }
+  return ret;
+}
+
 int __stdcall
 check_invalid_virtual_addr (const void *s, unsigned sz)
 {