* mbtowc_r.c (_mbtowc_r): Remove conversion of 5 and 6 byte UTF-8
	sequences since they are invalid in the Unicode standard.
	Handle surrogate pairs in case of wchar_t == UTF-16.
	* wctomb_r.c (_wctomb_r): Don't convert invalid Unicode wchar_t
	values beyond 0x10ffff into UTF-8 chars.  Handle surrogate pairs in
	case of wchar_t == UTF-16.
This commit is contained in:
Corinna Vinschen 2009-02-25 09:10:09 +00:00
parent 56eafaf6e3
commit 8d8bf5a5e2
3 changed files with 79 additions and 144 deletions

View File

@ -1,3 +1,12 @@
2009-02-25 Corinna Vinschen <corinna@vinschen.de>
* mbtowc_r.c (_mbtowc_r): Remove conversion of 5 and 6 byte UTF-8
sequences since they are invalid in the Unicode standard.
Handle surrogate pairs in case of wchar_t == UTF-16.
* wctomb_r.c (_wctomb_r): Don't convert invalid Unicode wchar_t
values beyond 0x10ffff into UTF-8 chars. Handle surrogate pairs in
case of wchar_t == UTF-16.
2009-02-24 Kevin Buettner <kevinb@redhat.com> 2009-02-24 Kevin Buettner <kevinb@redhat.com>
* libc/stdio/open_memstream.c (stdint.h): Include. * libc/stdio/open_memstream.c (stdint.h): Include.

View File

@ -75,6 +75,18 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
if (s == NULL) if (s == NULL)
return 0; /* UTF-8 character encodings are not state-dependent */ return 0; /* UTF-8 character encodings are not state-dependent */
if (state->__count == 4)
{
/* Create the second half of the surrogate pair. For a description
see the comment below. */
wint_t tmp = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18)
| (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12)
| (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6)
| (wchar_t)(state->__value.__wchb[3] & 0x3f);
state->__count = 0;
*pwc = 0xdc00 | ((tmp - 0x10000) & 0x3ff);
return 2;
}
if (state->__count == 0) if (state->__count == 0)
ch = t[i++]; ch = t[i++];
else else
@ -153,8 +165,7 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
else if (ch >= 0xf0 && ch <= 0xf7) else if (ch >= 0xf0 && ch <= 0xf7)
{ {
/* four-byte sequence */ /* four-byte sequence */
if (sizeof(wchar_t) < 4) wint_t tmp;
return -1; /* we can't store such a value */
state->__value.__wchb[0] = ch; state->__value.__wchb[0] = ch;
if (state->__count == 0) if (state->__count == 0)
state->__count = 1; state->__count = 1;
@ -185,125 +196,25 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
ch = t[i++]; ch = t[i++];
if (ch < 0x80 || ch > 0xbf) if (ch < 0x80 || ch > 0xbf)
return -1; return -1;
*pwc = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18) tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
| (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12) | (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
| (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6) | (wint_t)((state->__value.__wchb[2] & 0x3f) << 6)
| (wchar_t)(ch & 0x3f); | (wint_t)(ch & 0x3f);
if (tmp > 0xffff && sizeof(wchar_t) == 2)
state->__count = 0;
return i;
}
else if (ch >= 0xf8 && ch <= 0xfb)
{ {
/* five-byte sequence */ /* On systems which have wchar_t being UTF-16 values, the value
if (sizeof(wchar_t) < 4) doesn't fit into a single wchar_t in this case. So what we
return -1; /* we can't store such a value */ do here is to store the state with a special value of __count
state->__value.__wchb[0] = ch; and return the first half of a surrogate pair. As return
if (state->__count == 0) value we choose to return the half of the actual UTF-8 char.
state->__count = 1; The second half is returned in case we recognize the special
else if (n < (size_t)-1) __count value above. */
++n;
if (n < 2)
return -2;
ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
if (state->__value.__wchb[0] == 0xf8 && ch < 0x88)
/* overlong UTF-8 sequence */
return -1;
if (ch < 0x80 || ch > 0xbf)
return -1;
state->__value.__wchb[1] = ch;
if (state->__count == 1)
state->__count = 2;
else if (n < (size_t)-1)
++n;
if (n < 3)
return -2;
ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
if (ch < 0x80 || ch > 0xbf)
return -1;
state->__value.__wchb[2] = ch;
if (state->__count == 2)
state->__count = 3;
else if (n < (size_t)-1)
++n;
if (n < 4)
return -2;
ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
if (ch < 0x80 || ch > 0xbf)
return -1;
state->__value.__wchb[3] = ch; state->__value.__wchb[3] = ch;
state->__count = 4; state->__count = 4;
if (n < 5) *pwc = 0xd800 | (((tmp - 0x10000) >> 10) & 0x3ff);
return -2; return 2;
ch = t[i++];
*pwc = (wchar_t)((state->__value.__wchb[0] & 0x03) << 24)
| (wchar_t)((state->__value.__wchb[1] & 0x3f) << 18)
| (wchar_t)((state->__value.__wchb[2] & 0x3f) << 12)
| (wchar_t)((state->__value.__wchb[3] & 0x3f) << 6)
| (wchar_t)(ch & 0x3f);
state->__count = 0;
return i;
} }
else if (ch >= 0xfc && ch <= 0xfd) *pwc = tmp;
{
/* six-byte sequence */
int ch2;
if (sizeof(wchar_t) < 4)
return -1; /* we can't store such a value */
state->__value.__wchb[0] = ch;
if (state->__count == 0)
state->__count = 1;
else if (n < (size_t)-1)
++n;
if (n < 2)
return -2;
ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
if (state->__value.__wchb[0] == 0xfc && ch < 0x84)
/* overlong UTF-8 sequence */
return -1;
if (ch < 0x80 || ch > 0xbf)
return -1;
state->__value.__wchb[1] = ch;
if (state->__count == 1)
state->__count = 2;
else if (n < (size_t)-1)
++n;
if (n < 3)
return -2;
ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
if (ch < 0x80 || ch > 0xbf)
return -1;
state->__value.__wchb[2] = ch;
if (state->__count == 2)
state->__count = 3;
else if (n < (size_t)-1)
++n;
if (n < 4)
return -2;
ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
if (ch < 0x80 || ch > 0xbf)
return -1;
state->__value.__wchb[3] = ch;
if (state->__count == 3)
state->__count = 4;
else if (n < (size_t)-1)
++n;
if (n < 5)
return -2;
if (n == 5)
return -1; /* at this point we can't save enough to restart */
ch = t[i++];
if (ch < 0x80 || ch > 0xbf)
return -1;
ch2 = t[i++];
*pwc = (wchar_t)((state->__value.__wchb[0] & 0x01) << 30)
| (wchar_t)((state->__value.__wchb[1] & 0x3f) << 24)
| (wchar_t)((state->__value.__wchb[2] & 0x3f) << 18)
| (wchar_t)((state->__value.__wchb[3] & 0x3f) << 12)
| (wchar_t)((ch & 0x3f) << 6)
| (wchar_t)(ch2 & 0x3f);
state->__count = 0; state->__count = 0;
return i; return i;
} }

View File

@ -28,6 +28,11 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
if (s == NULL) if (s == NULL)
return 0; /* UTF-8 encoding is not state-dependent */ return 0; /* UTF-8 encoding is not state-dependent */
if (state->__count == -4 && (wchar < 0xdc00 || wchar >= 0xdfff))
{
/* At this point only the second half of a surrogate pair is valid. */
return -1;
}
if (wchar <= 0x7f) if (wchar <= 0x7f)
{ {
*s = wchar; *s = wchar;
@ -41,16 +46,45 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
} }
else if (wchar >= 0x800 && wchar <= 0xffff) else if (wchar >= 0x800 && wchar <= 0xffff)
{ {
/* UTF-16 surrogates -- must not occur in normal UCS-4 data */
if (wchar >= 0xd800 && wchar <= 0xdfff) if (wchar >= 0xd800 && wchar <= 0xdfff)
{
wint_t tmp;
/* UTF-16 surrogates -- must not occur in normal UCS-4 data */
if (sizeof (wchar_t) != 2)
return -1; return -1;
if (wchar >= 0xdc00)
{
/* Second half of a surrogate pair. It's not valid if
we don't have already read a first half of a surrogate
before. */
if (state->__count != -4)
return -1;
/* If it's valid, reconstruct the full Unicode value and
return the trailing three bytes of the UTF-8 char. */
tmp = (state->__value.__wchb[0] << 16)
| (state->__value.__wchb[1] << 8)
| (wchar & 0x3ff);
state->__count = 0;
*s++ = 0x80 | ((tmp & 0x3f000) >> 12);
*s++ = 0x80 | ((tmp & 0xfc0) >> 6);
*s = 0x80 | (tmp & 0x3f);
return 3;
}
/* First half of a surrogate pair. Store the state and return
the first byte of the UTF-8 char. */
tmp = ((wchar & 0x3ff) << 10) + 0x10000;
state->__value.__wchb[0] = (tmp >> 16) & 0xff;
state->__value.__wchb[1] = (tmp >> 8) & 0xff;
state->__count = -4;
*s = (0xf0 | ((tmp & 0x1c0000) >> 18));
return 1;
}
*s++ = 0xe0 | ((wchar & 0xf000) >> 12); *s++ = 0xe0 | ((wchar & 0xf000) >> 12);
*s++ = 0x80 | ((wchar & 0xfc0) >> 6); *s++ = 0x80 | ((wchar & 0xfc0) >> 6);
*s = 0x80 | (wchar & 0x3f); *s = 0x80 | (wchar & 0x3f);
return 3; return 3;
} }
else if (wchar >= 0x10000 && wchar <= 0x1fffff) else if (wchar >= 0x10000 && wchar <= 0x10ffff)
{ {
*s++ = 0xf0 | ((wchar & 0x1c0000) >> 18); *s++ = 0xf0 | ((wchar & 0x1c0000) >> 18);
*s++ = 0x80 | ((wchar & 0x3f000) >> 12); *s++ = 0x80 | ((wchar & 0x3f000) >> 12);
@ -58,25 +92,6 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
*s = 0x80 | (wchar & 0x3f); *s = 0x80 | (wchar & 0x3f);
return 4; return 4;
} }
else if (wchar >= 0x200000 && wchar <= 0x3ffffff)
{
*s++ = 0xf8 | ((wchar & 0x3000000) >> 24);
*s++ = 0x80 | ((wchar & 0xfc0000) >> 18);
*s++ = 0x80 | ((wchar & 0x3f000) >> 12);
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
*s = 0x80 | (wchar & 0x3f);
return 5;
}
else if (wchar >= 0x4000000 && wchar <= 0x7fffffff)
{
*s++ = 0xfc | ((wchar & 0x40000000) >> 30);
*s++ = 0x80 | ((wchar & 0x3f000000) >> 24);
*s++ = 0x80 | ((wchar & 0xfc0000) >> 18);
*s++ = 0x80 | ((wchar & 0x3f000) >> 12);
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
*s = 0x80 | (wchar & 0x3f);
return 6;
}
else else
return -1; return -1;
} }