* mbtowc_r.c (_mbtowc_r): Remove conversion of 5 and 6 byte UTF-8
sequences since they are invalid in the Unicode standard. Handle surrogate pairs in case of wchar_t == UTF-16.
* wctomb_r.c (_wctomb_r): Don't convert invalid Unicode wchar_t values beyond 0x10ffff into UTF-8 chars. Handle surrogate pairs in case of wchar_t == UTF-16.
This commit is contained in:
parent
56eafaf6e3
commit
8d8bf5a5e2
|
@ -1,3 +1,12 @@
|
||||||
|
2009-02-25 Corinna Vinschen <corinna@vinschen.de>
|
||||||
|
|
||||||
|
* mbtowc_r.c (_mbtowc_r): Remove conversion of 5 and 6 byte UTF-8
|
||||||
|
sequences since they are invalid in the Unicode standard.
|
||||||
|
Handle surrogate pairs in case of wchar_t == UTF-16.
|
||||||
|
* wctomb_r.c (_wctomb_r): Don't convert invalid Unicode wchar_t
|
||||||
|
values beyond 0x10ffff into UTF-8 chars. Handle surrogate pairs in
|
||||||
|
case of wchar_t == UTF-16.
|
||||||
|
|
||||||
2009-02-24 Kevin Buettner <kevinb@redhat.com>
|
2009-02-24 Kevin Buettner <kevinb@redhat.com>
|
||||||
|
|
||||||
* libc/stdio/open_memstream.c (stdint.h): Include.
|
* libc/stdio/open_memstream.c (stdint.h): Include.
|
||||||
|
|
|
@ -75,6 +75,18 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
|
||||||
if (s == NULL)
|
if (s == NULL)
|
||||||
return 0; /* UTF-8 character encodings are not state-dependent */
|
return 0; /* UTF-8 character encodings are not state-dependent */
|
||||||
|
|
||||||
|
if (state->__count == 4)
|
||||||
|
{
|
||||||
|
/* Create the second half of the surrogate pair. For a description
|
||||||
|
see the comment below. */
|
||||||
|
wint_t tmp = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18)
|
||||||
|
| (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12)
|
||||||
|
| (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6)
|
||||||
|
| (wchar_t)(state->__value.__wchb[3] & 0x3f);
|
||||||
|
state->__count = 0;
|
||||||
|
*pwc = 0xdc00 | ((tmp - 0x10000) & 0x3ff);
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
if (state->__count == 0)
|
if (state->__count == 0)
|
||||||
ch = t[i++];
|
ch = t[i++];
|
||||||
else
|
else
|
||||||
|
@ -153,8 +165,7 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
|
||||||
else if (ch >= 0xf0 && ch <= 0xf7)
|
else if (ch >= 0xf0 && ch <= 0xf7)
|
||||||
{
|
{
|
||||||
/* four-byte sequence */
|
/* four-byte sequence */
|
||||||
if (sizeof(wchar_t) < 4)
|
wint_t tmp;
|
||||||
return -1; /* we can't store such a value */
|
|
||||||
state->__value.__wchb[0] = ch;
|
state->__value.__wchb[0] = ch;
|
||||||
if (state->__count == 0)
|
if (state->__count == 0)
|
||||||
state->__count = 1;
|
state->__count = 1;
|
||||||
|
@ -185,125 +196,25 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
|
||||||
ch = t[i++];
|
ch = t[i++];
|
||||||
if (ch < 0x80 || ch > 0xbf)
|
if (ch < 0x80 || ch > 0xbf)
|
||||||
return -1;
|
return -1;
|
||||||
*pwc = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18)
|
tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
|
||||||
| (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12)
|
| (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
|
||||||
| (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6)
|
| (wint_t)((state->__value.__wchb[2] & 0x3f) << 6)
|
||||||
| (wchar_t)(ch & 0x3f);
|
| (wint_t)(ch & 0x3f);
|
||||||
|
if (tmp > 0xffff && sizeof(wchar_t) == 2)
|
||||||
state->__count = 0;
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
else if (ch >= 0xf8 && ch <= 0xfb)
|
|
||||||
{
|
{
|
||||||
/* five-byte sequence */
|
/* On systems which have wchar_t being UTF-16 values, the value
|
||||||
if (sizeof(wchar_t) < 4)
|
doesn't fit into a single wchar_t in this case. So what we
|
||||||
return -1; /* we can't store such a value */
|
do here is to store the state with a special value of __count
|
||||||
state->__value.__wchb[0] = ch;
|
and return the first half of a surrogate pair. As return
|
||||||
if (state->__count == 0)
|
value we choose to return the half of the actual UTF-8 char.
|
||||||
state->__count = 1;
|
The second half is returned in case we recognize the special
|
||||||
else if (n < (size_t)-1)
|
__count value above. */
|
||||||
++n;
|
|
||||||
if (n < 2)
|
|
||||||
return -2;
|
|
||||||
ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
|
|
||||||
if (state->__value.__wchb[0] == 0xf8 && ch < 0x88)
|
|
||||||
/* overlong UTF-8 sequence */
|
|
||||||
return -1;
|
|
||||||
if (ch < 0x80 || ch > 0xbf)
|
|
||||||
return -1;
|
|
||||||
state->__value.__wchb[1] = ch;
|
|
||||||
if (state->__count == 1)
|
|
||||||
state->__count = 2;
|
|
||||||
else if (n < (size_t)-1)
|
|
||||||
++n;
|
|
||||||
if (n < 3)
|
|
||||||
return -2;
|
|
||||||
ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
|
|
||||||
if (ch < 0x80 || ch > 0xbf)
|
|
||||||
return -1;
|
|
||||||
state->__value.__wchb[2] = ch;
|
|
||||||
if (state->__count == 2)
|
|
||||||
state->__count = 3;
|
|
||||||
else if (n < (size_t)-1)
|
|
||||||
++n;
|
|
||||||
if (n < 4)
|
|
||||||
return -2;
|
|
||||||
ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
|
|
||||||
if (ch < 0x80 || ch > 0xbf)
|
|
||||||
return -1;
|
|
||||||
state->__value.__wchb[3] = ch;
|
state->__value.__wchb[3] = ch;
|
||||||
state->__count = 4;
|
state->__count = 4;
|
||||||
if (n < 5)
|
*pwc = 0xd800 | (((tmp - 0x10000) >> 10) & 0x3ff);
|
||||||
return -2;
|
return 2;
|
||||||
ch = t[i++];
|
|
||||||
*pwc = (wchar_t)((state->__value.__wchb[0] & 0x03) << 24)
|
|
||||||
| (wchar_t)((state->__value.__wchb[1] & 0x3f) << 18)
|
|
||||||
| (wchar_t)((state->__value.__wchb[2] & 0x3f) << 12)
|
|
||||||
| (wchar_t)((state->__value.__wchb[3] & 0x3f) << 6)
|
|
||||||
| (wchar_t)(ch & 0x3f);
|
|
||||||
|
|
||||||
state->__count = 0;
|
|
||||||
return i;
|
|
||||||
}
|
}
|
||||||
else if (ch >= 0xfc && ch <= 0xfd)
|
*pwc = tmp;
|
||||||
{
|
|
||||||
/* six-byte sequence */
|
|
||||||
int ch2;
|
|
||||||
if (sizeof(wchar_t) < 4)
|
|
||||||
return -1; /* we can't store such a value */
|
|
||||||
state->__value.__wchb[0] = ch;
|
|
||||||
if (state->__count == 0)
|
|
||||||
state->__count = 1;
|
|
||||||
else if (n < (size_t)-1)
|
|
||||||
++n;
|
|
||||||
if (n < 2)
|
|
||||||
return -2;
|
|
||||||
ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
|
|
||||||
if (state->__value.__wchb[0] == 0xfc && ch < 0x84)
|
|
||||||
/* overlong UTF-8 sequence */
|
|
||||||
return -1;
|
|
||||||
if (ch < 0x80 || ch > 0xbf)
|
|
||||||
return -1;
|
|
||||||
state->__value.__wchb[1] = ch;
|
|
||||||
if (state->__count == 1)
|
|
||||||
state->__count = 2;
|
|
||||||
else if (n < (size_t)-1)
|
|
||||||
++n;
|
|
||||||
if (n < 3)
|
|
||||||
return -2;
|
|
||||||
ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
|
|
||||||
if (ch < 0x80 || ch > 0xbf)
|
|
||||||
return -1;
|
|
||||||
state->__value.__wchb[2] = ch;
|
|
||||||
if (state->__count == 2)
|
|
||||||
state->__count = 3;
|
|
||||||
else if (n < (size_t)-1)
|
|
||||||
++n;
|
|
||||||
if (n < 4)
|
|
||||||
return -2;
|
|
||||||
ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
|
|
||||||
if (ch < 0x80 || ch > 0xbf)
|
|
||||||
return -1;
|
|
||||||
state->__value.__wchb[3] = ch;
|
|
||||||
if (state->__count == 3)
|
|
||||||
state->__count = 4;
|
|
||||||
else if (n < (size_t)-1)
|
|
||||||
++n;
|
|
||||||
if (n < 5)
|
|
||||||
return -2;
|
|
||||||
if (n == 5)
|
|
||||||
return -1; /* at this point we can't save enough to restart */
|
|
||||||
ch = t[i++];
|
|
||||||
if (ch < 0x80 || ch > 0xbf)
|
|
||||||
return -1;
|
|
||||||
ch2 = t[i++];
|
|
||||||
*pwc = (wchar_t)((state->__value.__wchb[0] & 0x01) << 30)
|
|
||||||
| (wchar_t)((state->__value.__wchb[1] & 0x3f) << 24)
|
|
||||||
| (wchar_t)((state->__value.__wchb[2] & 0x3f) << 18)
|
|
||||||
| (wchar_t)((state->__value.__wchb[3] & 0x3f) << 12)
|
|
||||||
| (wchar_t)((ch & 0x3f) << 6)
|
|
||||||
| (wchar_t)(ch2 & 0x3f);
|
|
||||||
|
|
||||||
state->__count = 0;
|
state->__count = 0;
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,6 +28,11 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
|
||||||
if (s == NULL)
|
if (s == NULL)
|
||||||
return 0; /* UTF-8 encoding is not state-dependent */
|
return 0; /* UTF-8 encoding is not state-dependent */
|
||||||
|
|
||||||
|
if (state->__count == -4 && (wchar < 0xdc00 || wchar >= 0xdfff))
|
||||||
|
{
|
||||||
|
/* At this point only the second half of a surrogate pair is valid. */
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
if (wchar <= 0x7f)
|
if (wchar <= 0x7f)
|
||||||
{
|
{
|
||||||
*s = wchar;
|
*s = wchar;
|
||||||
|
@ -41,16 +46,45 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
|
||||||
}
|
}
|
||||||
else if (wchar >= 0x800 && wchar <= 0xffff)
|
else if (wchar >= 0x800 && wchar <= 0xffff)
|
||||||
{
|
{
|
||||||
/* UTF-16 surrogates -- must not occur in normal UCS-4 data */
|
|
||||||
if (wchar >= 0xd800 && wchar <= 0xdfff)
|
if (wchar >= 0xd800 && wchar <= 0xdfff)
|
||||||
|
{
|
||||||
|
wint_t tmp;
|
||||||
|
/* UTF-16 surrogates -- must not occur in normal UCS-4 data */
|
||||||
|
if (sizeof (wchar_t) != 2)
|
||||||
return -1;
|
return -1;
|
||||||
|
if (wchar >= 0xdc00)
|
||||||
|
{
|
||||||
|
/* Second half of a surrogate pair. It's not valid if
|
||||||
|
we don't have already read a first half of a surrogate
|
||||||
|
before. */
|
||||||
|
if (state->__count != -4)
|
||||||
|
return -1;
|
||||||
|
/* If it's valid, reconstruct the full Unicode value and
|
||||||
|
return the trailing three bytes of the UTF-8 char. */
|
||||||
|
tmp = (state->__value.__wchb[0] << 16)
|
||||||
|
| (state->__value.__wchb[1] << 8)
|
||||||
|
| (wchar & 0x3ff);
|
||||||
|
state->__count = 0;
|
||||||
|
*s++ = 0x80 | ((tmp & 0x3f000) >> 12);
|
||||||
|
*s++ = 0x80 | ((tmp & 0xfc0) >> 6);
|
||||||
|
*s = 0x80 | (tmp & 0x3f);
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
/* First half of a surrogate pair. Store the state and return
|
||||||
|
the first byte of the UTF-8 char. */
|
||||||
|
tmp = ((wchar & 0x3ff) << 10) + 0x10000;
|
||||||
|
state->__value.__wchb[0] = (tmp >> 16) & 0xff;
|
||||||
|
state->__value.__wchb[1] = (tmp >> 8) & 0xff;
|
||||||
|
state->__count = -4;
|
||||||
|
*s = (0xf0 | ((tmp & 0x1c0000) >> 18));
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
*s++ = 0xe0 | ((wchar & 0xf000) >> 12);
|
*s++ = 0xe0 | ((wchar & 0xf000) >> 12);
|
||||||
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
|
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
|
||||||
*s = 0x80 | (wchar & 0x3f);
|
*s = 0x80 | (wchar & 0x3f);
|
||||||
return 3;
|
return 3;
|
||||||
}
|
}
|
||||||
else if (wchar >= 0x10000 && wchar <= 0x1fffff)
|
else if (wchar >= 0x10000 && wchar <= 0x10ffff)
|
||||||
{
|
{
|
||||||
*s++ = 0xf0 | ((wchar & 0x1c0000) >> 18);
|
*s++ = 0xf0 | ((wchar & 0x1c0000) >> 18);
|
||||||
*s++ = 0x80 | ((wchar & 0x3f000) >> 12);
|
*s++ = 0x80 | ((wchar & 0x3f000) >> 12);
|
||||||
|
@ -58,25 +92,6 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
|
||||||
*s = 0x80 | (wchar & 0x3f);
|
*s = 0x80 | (wchar & 0x3f);
|
||||||
return 4;
|
return 4;
|
||||||
}
|
}
|
||||||
else if (wchar >= 0x200000 && wchar <= 0x3ffffff)
|
|
||||||
{
|
|
||||||
*s++ = 0xf8 | ((wchar & 0x3000000) >> 24);
|
|
||||||
*s++ = 0x80 | ((wchar & 0xfc0000) >> 18);
|
|
||||||
*s++ = 0x80 | ((wchar & 0x3f000) >> 12);
|
|
||||||
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
|
|
||||||
*s = 0x80 | (wchar & 0x3f);
|
|
||||||
return 5;
|
|
||||||
}
|
|
||||||
else if (wchar >= 0x4000000 && wchar <= 0x7fffffff)
|
|
||||||
{
|
|
||||||
*s++ = 0xfc | ((wchar & 0x40000000) >> 30);
|
|
||||||
*s++ = 0x80 | ((wchar & 0x3f000000) >> 24);
|
|
||||||
*s++ = 0x80 | ((wchar & 0xfc0000) >> 18);
|
|
||||||
*s++ = 0x80 | ((wchar & 0x3f000) >> 12);
|
|
||||||
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
|
|
||||||
*s = 0x80 | (wchar & 0x3f);
|
|
||||||
return 6;
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue