* mbtowc_r.c (_mbtowc_r): Remove conversion of 5 and 6 byte UTF-8 sequences since they are invalid in the Unicode standard. Handle surrogate pairs in case of wchar_t == UTF-16.

* wctomb_r.c (_wctomb_r): Don't convert invalid Unicode wchar_t values beyond 0x10ffff into UTF-8 chars. Handle surrogate pairs in case of wchar_t == UTF-16.
2009-02-25 09:10:09 +00:00
parent 56eafaf6e3
commit 8d8bf5a5e2
3 changed files with 79 additions and 144 deletions
--- a/newlib/ChangeLog
+++ b/newlib/ChangeLog
@@ -1,3 +1,12 @@
+2009-02-25  Corinna Vinschen  <corinna@vinschen.de>
+
+	* mbtowc_r.c (_mbtowc_r): Remove conversion of 5 and 6 byte UTF-8
+	sequences since they are invalid in the Unicode standard.
+	Handle surrogate pairs in case of wchar_t == UTF-16.
+	* wctomb_r.c (_wctomb_r): Don't convert invalid Unicode wchar_t
+	values beyond 0x10ffff into UTF-8 chars.  Handle surrogate pairs in
+	case of wchar_t == UTF-16.
+
 2009-02-24  Kevin Buettner  <kevinb@redhat.com>

 	* libc/stdio/open_memstream.c (stdint.h): Include.
--- a/newlib/libc/stdlib/mbtowc_r.c
+++ b/newlib/libc/stdlib/mbtowc_r.c
@@ -75,6 +75,18 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
      if (s == NULL)
        return 0; /* UTF-8 character encodings are not state-dependent */

+      if (state->__count == 4)
+	{
+	  /* Create the second half of the surrogate pair.  For a description
+	     see the comment below. */
+	  wint_t tmp = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18)
+	    |   (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12)
+	    |   (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6)
+	    |   (wchar_t)(state->__value.__wchb[3] & 0x3f);
+	  state->__count = 0;
+	  *pwc = 0xdc00 | ((tmp - 0x10000) & 0x3ff);
+	  return 2;
+	}
      if (state->__count == 0)
 	ch = t[i++];
      else
@@ -153,8 +165,7 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
      else if (ch >= 0xf0 && ch <= 0xf7)
 	{
 	  /* four-byte sequence */
-	  if (sizeof(wchar_t) < 4)
-	    return -1; /* we can't store such a value */
+	  wint_t tmp;
 	  state->__value.__wchb[0] = ch;
 	  if (state->__count == 0)
 	    state->__count = 1;
@@ -185,125 +196,25 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
 	  ch = t[i++];
 	  if (ch < 0x80 || ch > 0xbf)
 	    return -1;
-	  *pwc = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18)
-	    |    (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12)
-	    |    (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6)
-	    |    (wchar_t)(ch & 0x3f);
-	
-	  state->__count = 0;
-	  return i;
-	}
-      else if (ch >= 0xf8 && ch <= 0xfb)
-	{
-	  /* five-byte sequence */
-	  if (sizeof(wchar_t) < 4)
-	    return -1; /* we can't store such a value */
-	  state->__value.__wchb[0] = ch;
-	  if (state->__count == 0)
-	    state->__count = 1;
-	  else if (n < (size_t)-1)
-	    ++n;
-	  if (n < 2)
-	    return -2;
-	  ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
-	  if (state->__value.__wchb[0] == 0xf8 && ch < 0x88)
-	    /* overlong UTF-8 sequence */
-	    return -1;
-	  if (ch < 0x80 || ch > 0xbf)
-	    return -1;
-	  state->__value.__wchb[1] = ch;
-	  if (state->__count == 1)
-	    state->__count = 2;
-	  else if (n < (size_t)-1)
-	    ++n;
-	  if (n < 3)
-	    return -2;
-	  ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
-	  if (ch < 0x80 || ch > 0xbf)
-	    return -1;
-	  state->__value.__wchb[2] = ch;
-	  if (state->__count == 2)
-	    state->__count = 3;
-	  else if (n < (size_t)-1)
-	    ++n;
-	  if (n < 4)
-	    return -2;
-	  ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
-	  if (ch < 0x80 || ch > 0xbf)
-	    return -1;
-	  state->__value.__wchb[3] = ch;
-	  state->__count = 4;
-	  if (n < 5)
-	    return -2;
-	  ch = t[i++];
-	  *pwc = (wchar_t)((state->__value.__wchb[0] & 0x03) << 24)
-	    |    (wchar_t)((state->__value.__wchb[1] & 0x3f) << 18)
-	    |    (wchar_t)((state->__value.__wchb[2] & 0x3f) << 12)
-	    |    (wchar_t)((state->__value.__wchb[3] & 0x3f) << 6)
-	    |    (wchar_t)(ch & 0x3f);
-	
-	  state->__count = 0;
-	  return i;
-	}
-      else if (ch >= 0xfc && ch <= 0xfd)
-        {
-          /* six-byte sequence */
-	  int ch2;
-	  if (sizeof(wchar_t) < 4)
-	    return -1; /* we can't store such a value */
-	  state->__value.__wchb[0] = ch;
-	  if (state->__count == 0)
-	    state->__count = 1;
-	  else if (n < (size_t)-1)
-	    ++n;
-	  if (n < 2)
-	    return -2;
-	  ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
-	  if (state->__value.__wchb[0] == 0xfc && ch < 0x84)
-	    /* overlong UTF-8 sequence */
-	    return -1;
-	  if (ch < 0x80 || ch > 0xbf)
-	    return -1;
-	  state->__value.__wchb[1] = ch;
-	  if (state->__count == 1)
-	    state->__count = 2;
-	  else if (n < (size_t)-1)
-	    ++n;
-	  if (n < 3)
-	    return -2;
-	  ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
-	  if (ch < 0x80 || ch > 0xbf)
-	    return -1;
-	  state->__value.__wchb[2] = ch;
-	  if (state->__count == 2)
-	    state->__count = 3;
-	  else if (n < (size_t)-1)
-	    ++n;
-	  if (n < 4)
-	    return -2;
-	  ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
-	  if (ch < 0x80 || ch > 0xbf)
-	    return -1;
-	  state->__value.__wchb[3] = ch;
-	  if (state->__count == 3)
-	    state->__count = 4;
-	  else if (n < (size_t)-1)
-	    ++n;
-	  if (n < 5)
-	    return -2;
-	  if (n == 5)
-	    return -1; /* at this point we can't save enough to restart */
-	  ch = t[i++];
-	  if (ch < 0x80 || ch > 0xbf)
-	    return -1;
-	  ch2 = t[i++];
-	  *pwc = (wchar_t)((state->__value.__wchb[0] & 0x01) << 30)
-	    |    (wchar_t)((state->__value.__wchb[1] & 0x3f) << 24)
-	    |    (wchar_t)((state->__value.__wchb[2] & 0x3f) << 18)
-	    |    (wchar_t)((state->__value.__wchb[3] & 0x3f) << 12)
-	    |    (wchar_t)((ch & 0x3f) << 6)
-	    |    (wchar_t)(ch2 & 0x3f);
-	
+	  tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
+	    |   (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
+	    |   (wint_t)((state->__value.__wchb[2] & 0x3f) << 6)
+	    |   (wint_t)(ch & 0x3f);
+	  if (tmp > 0xffff && sizeof(wchar_t) == 2)
+	    {
+	      /* On systems where wchar_t holds UTF-16 values, the value
+		 doesn't fit into a single wchar_t in this case.  So what we
+		 do here is to store the state with a special value of __count
+		 and return the first half of a surrogate pair.  As return
+		 value we report half the length of the actual UTF-8 char.
+		 The second half is returned when we recognize the special
+		 __count value above. */
+	      state->__value.__wchb[3] = ch;
+	      state->__count = 4;
+	      *pwc = 0xd800 | (((tmp - 0x10000) >> 10) & 0x3ff);
+	      return 2;
+	    }
+	  *pwc = tmp;
 	  state->__count = 0;
 	  return i;
 	}
--- a/newlib/libc/stdlib/wctomb_r.c
+++ b/newlib/libc/stdlib/wctomb_r.c
@@ -28,6 +28,11 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
      if (s == NULL)
        return 0; /* UTF-8 encoding is not state-dependent */

+      if (state->__count == -4 && (wchar < 0xdc00 || wchar >= 0xdfff))
+	{
+	  /* At this point only the second half of a surrogate pair is valid. */
+	  return -1;
+	}
      if (wchar <= 0x7f)
        {
          *s = wchar;
@@ -41,16 +46,45 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
        }
      else if (wchar >= 0x800 && wchar <= 0xffff)
        {
-          /* UTF-16 surrogates -- must not occur in normal UCS-4 data */
          if (wchar >= 0xd800 && wchar <= 0xdfff)
-            return -1;
-
+	    {
+	      wint_t tmp;
+	      /* UTF-16 surrogates -- must not occur in normal UCS-4 data */
+	      if (sizeof (wchar_t) != 2)
+		return -1;
+	      if (wchar >= 0xdc00)
+		{
+		  /* Second half of a surrogate pair.  It's not valid if
+		     we haven't already read the first half of a surrogate
+		     pair before. */
+		  if (state->__count != -4)
+		    return -1;
+		  /* If it's valid, reconstruct the full Unicode value and
+		     return the trailing three bytes of the UTF-8 char. */
+		  tmp = (state->__value.__wchb[0] << 16)
+			| (state->__value.__wchb[1] << 8)
+			| (wchar & 0x3ff);
+		  state->__count = 0;
+		  *s++ = 0x80 | ((tmp &  0x3f000) >> 12);
+		  *s++ = 0x80 | ((tmp &    0xfc0) >> 6);
+		  *s   = 0x80 |  (tmp &     0x3f);
+		  return 3;
+	      	}
+	      /* First half of a surrogate pair.  Store the state and return
+	         the first byte of the UTF-8 char. */
+	      tmp = ((wchar & 0x3ff) << 10) + 0x10000;
+	      state->__value.__wchb[0] = (tmp >> 16) & 0xff;
+	      state->__value.__wchb[1] = (tmp >> 8) & 0xff;
+	      state->__count = -4;
+	      *s = (0xf0 | ((tmp & 0x1c0000) >> 18));
+	      return 1;
+	    }
          *s++ = 0xe0 | ((wchar & 0xf000) >> 12);
          *s++ = 0x80 | ((wchar &  0xfc0) >> 6);
          *s   = 0x80 |  (wchar &   0x3f);
          return 3;
        }
-      else if (wchar >= 0x10000 && wchar <= 0x1fffff)
+      else if (wchar >= 0x10000 && wchar <= 0x10ffff)
        {
          *s++ = 0xf0 | ((wchar & 0x1c0000) >> 18);
          *s++ = 0x80 | ((wchar &  0x3f000) >> 12);
@@ -58,25 +92,6 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
          *s   = 0x80 |  (wchar &     0x3f);
          return 4;
        }
-      else if (wchar >= 0x200000 && wchar <= 0x3ffffff)
-        {
-          *s++ = 0xf8 | ((wchar & 0x3000000) >> 24);
-          *s++ = 0x80 | ((wchar &  0xfc0000) >> 18);
-          *s++ = 0x80 | ((wchar &   0x3f000) >> 12);
-          *s++ = 0x80 | ((wchar &     0xfc0) >> 6);
-          *s   = 0x80 |  (wchar &      0x3f);
-          return 5;
-        }
-      else if (wchar >= 0x4000000 && wchar <= 0x7fffffff)
-        {
-          *s++ = 0xfc | ((wchar & 0x40000000) >> 30);
-          *s++ = 0x80 | ((wchar & 0x3f000000) >> 24);
-          *s++ = 0x80 | ((wchar &   0xfc0000) >> 18);
-          *s++ = 0x80 | ((wchar &    0x3f000) >> 12);
-          *s++ = 0x80 | ((wchar &      0xfc0) >> 6);
-          *s   = 0x80 |  (wchar &       0x3f);
-          return 6;
-        }
      else
        return -1;
    }