* fhandler.h (fhandler_console::trunc_buf): Add to use as cache for
truncated multibyte characters on input. (fhandler_console::write_replacement_char): Declare new method. * fhandler_console.cc (CONVERT_LIMIT): Raise to 64K. (fhandler_console::fhandler_console): Initialize trunc_buf. (ERR): Define as independent value again. (fhandler_console::write_replacement_char): New method to print replacement chars. (fhandler_console::write_normal): Add handling for truncated multibyte sequences. Call next_char instead of pathetic CharNextExA function. Don't change src, rather just work with found later on. * miscfuncs.cc (is_cp_multibyte): Move here from strfuncs.cc. Don't call Windows function, restrict to well-known ANSI/OEM codepages and UTF-8. (next_char): Call CharNextExA only for doublebyte codepages. Implement for UTF-8 here. * strfuncs.cc (is_cp_multibyte): Move to miscfuncs.cc. * winsup.h (next_char): Declare. * include/limits.h (MB_LEN_MAX): Set to maximum value of MB_CUR_MAX as defined by newlib for now.
This commit is contained in:
		| @@ -17,7 +17,8 @@ details. */ | ||||
| #include <alloca.h> | ||||
| #include <limits.h> | ||||
| #include <wchar.h> | ||||
| #include <winbase.h> | ||||
| #include <wingdi.h> | ||||
| #include <winuser.h> | ||||
| #include <winnls.h> | ||||
| #include "cygthread.h" | ||||
| #include "cygtls.h" | ||||
| @@ -192,6 +193,118 @@ cygwin_strupr (char *string) | ||||
|   return string; | ||||
| } | ||||
|  | ||||
| /* FIXME?  We only support standard ANSI/OEM codepages according to | ||||
|    http://www.microsoft.com/globaldev/reference/cphome.mspx as well | ||||
|    as UTF-8 and codepage 1361, which is also mentioned as valid | ||||
|    doublebyte codepage in MSDN man pages (e.g. IsDBCSLeadByteEx). | ||||
|    Everything else will be hosed. */ | ||||
|  | ||||
| bool | ||||
| is_cp_multibyte (UINT cp) | ||||
| { | ||||
|   switch (cp) | ||||
|     { | ||||
|     case 932: | ||||
|     case 936: | ||||
|     case 949: | ||||
|     case 950: | ||||
|     case 1361: | ||||
|     case 65001: | ||||
|       return true; | ||||
|     } | ||||
|   return false; | ||||
| } | ||||
|  | ||||
| /* OMYGOD!  CharNextExA is not UTF-8 aware!  It only works fine with | ||||
|    double byte charsets.  So we have to do it ourselves for UTF-8. | ||||
|     | ||||
|    While being at it, we do more.  If a double-byte or multibyte | ||||
|    sequence is trucated due to an early end, we need a way to recognize | ||||
|    it.  The reason is that multiple buffered write statements might | ||||
|    accidentally stop and start in the middle of a single character byte | ||||
|    sequence.  If we have to interpret the byte sequences (as in | ||||
|    fhandler_console, we would print wrong output in these cases. | ||||
|     | ||||
|    So we have four possible return values here: | ||||
|  | ||||
|    ret = end      if str >= end | ||||
|    ret = NULL	  if we encounter an invalid byte sequence | ||||
|    ret = str      if we encounter the start byte of a truncated byte sequence | ||||
|    ret = str + n  if we encounter a vaild byte sequence | ||||
| */ | ||||
|  | ||||
| const unsigned char * | ||||
| next_char (UINT cp, const unsigned char *str, const unsigned char *end) | ||||
| { | ||||
|   const unsigned char *ret; | ||||
|  | ||||
|   if (str >= end) | ||||
|     return end; | ||||
|  | ||||
|   switch (cp) | ||||
|     { | ||||
|     case 932: | ||||
|     case 936: | ||||
|     case 949: | ||||
|     case 950: | ||||
|     case 1361: | ||||
|       if (*str <= 0x7f) | ||||
|         ret = str + 1; | ||||
|       else if (str == end - 1 && IsDBCSLeadByteEx (cp, *str)) | ||||
| 	ret = str; | ||||
|       else | ||||
| 	ret = (const unsigned char *) CharNextExA (cp, (const CHAR *) str, 0); | ||||
|       break; | ||||
|     case CP_UTF8: | ||||
|       switch (str[0] >> 4) | ||||
| 	{ | ||||
| 	case 0x0 ... 0x7:	/* One byte character. */ | ||||
| 	  ret = str + 1; | ||||
| 	  break; | ||||
| 	case 0x8 ... 0xb:	/* Followup byte.  Invalid as first byte. */ | ||||
| 	  ret = NULL; | ||||
| 	  break; | ||||
| 	case 0xc ... 0xd:	/* Two byte character. */ | ||||
| 	  /* Check followup bytes for validity. */ | ||||
| 	  if (str >= end - 1) | ||||
| 	    ret = str; | ||||
| 	  else if (str[1] <= 0xbf) | ||||
| 	    ret = str + 2; | ||||
| 	  else | ||||
| 	    ret = NULL; | ||||
| 	  break; | ||||
| 	case 0xe:		/* Three byte character. */ | ||||
| 	  if (str >= end - 2) | ||||
| 	    ret = str; | ||||
| 	  else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80 | ||||
| 		   && (str[0] != 0xe0 || str[1] >= 0xa0) | ||||
| 		   && (str[0] != 0xed || str[1] <= 0x9f)) | ||||
| 	    ret = str + 3; | ||||
| 	  else | ||||
| 	    ret = NULL; | ||||
| 	  break; | ||||
| 	case 0xf:		/* Four byte character. */ | ||||
| 	  if (str[0] >= 0xf8) | ||||
| 	    ret = NULL; | ||||
| 	  else if (str >= end - 3) | ||||
| 	    ret = str; | ||||
| 	  else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80 | ||||
| 		   && (str[3] & 0xc0) == 0x80 | ||||
| 		   && (str[0] == 0xf0 || str[1] >= 0x90) | ||||
| 		   && (str[0] == 0xf4 || str[1] <= 0x8f)) | ||||
| 	    ret = str + 4; | ||||
| 	  else | ||||
| 	    ret = NULL; | ||||
| 	  break; | ||||
| 	} | ||||
|       break; | ||||
|     default: | ||||
|       ret = str + 1; | ||||
|       break; | ||||
|     } | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| int __stdcall | ||||
| check_invalid_virtual_addr (const void *s, unsigned sz) | ||||
| { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user