* libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Rework UTF-16 surrogate
pair handling to be more bullet-proof even with incomplete UTF-8 sequences. Add check for 4 byte sequences resulting in values outside the valid Unicode range. Add a comment to clarify checking for invalid CESU-8 sequences.
This commit is contained in:
		| @@ -1,3 +1,11 @@ | ||||
| 2009-07-29  Corinna Vinschen  <corinna@vinschen.de> | ||||
|  | ||||
| 	* libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Rework UTF-16 surrogate | ||||
| 	pair handling to be more bullet-proof even with incomplete UTF-8 | ||||
| 	sequences.  Add check for 4 byte sequences resulting in values | ||||
| 	outside the valid Unicode range.  Add a comment to clarify checking | ||||
| 	for invalid CESU-8 sequences. | ||||
|  | ||||
| 2009-07-28  Corinna Vinschen  <corinna@vinschen.de> | ||||
|  | ||||
| 	* libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Fix incrementing n in case | ||||
|   | ||||
| @@ -205,18 +205,6 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state), | ||||
|   if (n == 0) | ||||
|     return -2; | ||||
|  | ||||
|   if (state->__count == 4) | ||||
|     { | ||||
|       /* Create the second half of the surrogate pair.  For a description | ||||
| 	 see the comment below. */ | ||||
|       wint_t tmp = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18) | ||||
| 	|   (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12) | ||||
| 	|   (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6) | ||||
| 	|   (wchar_t)(state->__value.__wchb[3] & 0x3f); | ||||
|       state->__count = 0; | ||||
|       *pwc = 0xdc00 | ((tmp - 0x10000) & 0x3ff); | ||||
|       return 2; | ||||
|     } | ||||
|   if (state->__count == 0) | ||||
|     ch = t[i++]; | ||||
|   else | ||||
| @@ -303,7 +291,7 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state), | ||||
|       tmp = (wchar_t)((state->__value.__wchb[0] & 0x0f) << 12) | ||||
| 	|    (wchar_t)((state->__value.__wchb[1] & 0x3f) << 6) | ||||
| 	|     (wchar_t)(ch & 0x3f); | ||||
|      | ||||
|       /* Check for invalid CESU-8 encoding of UTF-16 surrogate values. */ | ||||
|       if (tmp >= 0xd800 && tmp <= 0xdfff) | ||||
| 	{ | ||||
| 	  r->_errno = EILSEQ; | ||||
| @@ -312,7 +300,7 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state), | ||||
|       *pwc = tmp; | ||||
|       return i; | ||||
|     } | ||||
|   if (ch >= 0xf0 && ch <= 0xf7) | ||||
|   if (ch >= 0xf0 && ch <= 0xf4) | ||||
|     { | ||||
|       /* four-byte sequence */ | ||||
|       wint_t tmp; | ||||
| @@ -324,9 +312,10 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state), | ||||
|       if (n < 2) | ||||
| 	return -2; | ||||
|       ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1]; | ||||
|       if (state->__value.__wchb[0] == 0xf0 && ch < 0x90) | ||||
|       if ((state->__value.__wchb[0] == 0xf0 && ch < 0x90) | ||||
| 	  || (state->__value.__wchb[0] == 0xf4 && ch >= 0x90)) | ||||
| 	{ | ||||
| 	  /* overlong UTF-8 sequence */ | ||||
| 	  /* overlong UTF-8 sequence or result is > 0x10ffff */ | ||||
| 	  r->_errno = EILSEQ; | ||||
| 	  return -1; | ||||
| 	} | ||||
| @@ -353,6 +342,26 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state), | ||||
| 	state->__count = 3; | ||||
|       else if (n < (size_t)-1) | ||||
| 	++n; | ||||
|       if (state->__count == 3 && sizeof(wchar_t) == 2) | ||||
| 	{ | ||||
| 	  /* On systems which have wchar_t being UTF-16 values, the value | ||||
| 	     doesn't fit into a single wchar_t in this case.  So what we | ||||
| 	     do here is to store the state with a special value of __count | ||||
| 	     and return the first half of a surrogate pair.  The first | ||||
| 	     three bytes of a UTF-8 sequence are enough to generate the | ||||
| 	     first half of a UTF-16 surrogate pair.  As return value we | ||||
| 	     choose to return the number of bytes actually read up to | ||||
| 	     here. | ||||
| 	     The second half of the surrogate pair is returned in case we | ||||
| 	     recognize the special __count value of four, and the next | ||||
| 	     byte is actually a valid value.  See below. */ | ||||
| 	  tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18) | ||||
| 	    |   (wint_t)((state->__value.__wchb[1] & 0x3f) << 12) | ||||
| 	    |   (wint_t)((state->__value.__wchb[2] & 0x3f) << 6); | ||||
| 	  state->__count = 4; | ||||
| 	  *pwc = 0xd800 | ((tmp - 0x10000) >> 10); | ||||
| 	  return i; | ||||
| 	} | ||||
|       if (n < 4) | ||||
| 	return -2; | ||||
|       ch = t[i++]; | ||||
| @@ -365,20 +374,11 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state), | ||||
| 	|   (wint_t)((state->__value.__wchb[1] & 0x3f) << 12) | ||||
| 	|   (wint_t)((state->__value.__wchb[2] & 0x3f) << 6) | ||||
| 	|   (wint_t)(ch & 0x3f); | ||||
|       if (tmp > 0xffff && sizeof(wchar_t) == 2) | ||||
| 	{ | ||||
| 	  /* On systems which have wchar_t being UTF-16 values, the value | ||||
| 	     doesn't fit into a single wchar_t in this case.  So what we | ||||
| 	     do here is to store the state with a special value of __count | ||||
| 	     and return the first half of a surrogate pair.  As return | ||||
| 	     value we choose to return the half of the actual UTF-8 char. | ||||
| 	     The second half is returned in case we recognize the special | ||||
| 	     __count value above. */ | ||||
| 	  state->__value.__wchb[3] = ch; | ||||
| 	  state->__count = 4; | ||||
| 	  *pwc = 0xd800 | (((tmp - 0x10000) >> 10) & 0x3ff); | ||||
| 	  return 2; | ||||
| 	} | ||||
|       if (state->__count == 4 && sizeof(wchar_t) == 2) | ||||
| 	/* Create the second half of the surrogate pair for systems with | ||||
| 	   wchar_t == UTF-16 . */ | ||||
| 	*pwc = 0xdc00 | (tmp & 0x3ff); | ||||
|       else | ||||
| 	*pwc = tmp; | ||||
|       state->__count = 0; | ||||
|       return i; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user