diff --git a/newlib/ChangeLog b/newlib/ChangeLog
index b627b1c92..b0c5088ab 100644
--- a/newlib/ChangeLog
+++ b/newlib/ChangeLog
@@ -1,3 +1,12 @@
+2009-02-25 Corinna Vinschen
+
+	* mbtowc_r.c (_mbtowc_r): Remove conversion of 5 and 6 byte UTF-8
+	sequences since they are invalid in the Unicode standard.
+	Handle surrogate pairs in case of wchar_t == UTF-16.
+	* wctomb_r.c (_wctomb_r): Don't convert invalid Unicode wchar_t
+	values beyond 0x10ffff into UTF-8 chars. Handle surrogate pairs in
+	case of wchar_t == UTF-16.
+
 2009-02-24 Kevin Buettner
 
 	* libc/stdio/open_memstream.c (stdint.h): Include.
diff --git a/newlib/libc/stdlib/mbtowc_r.c b/newlib/libc/stdlib/mbtowc_r.c
index 71bbf8537..00021beff 100644
--- a/newlib/libc/stdlib/mbtowc_r.c
+++ b/newlib/libc/stdlib/mbtowc_r.c
@@ -75,6 +75,18 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
       if (s == NULL)
         return 0; /* UTF-8 character encodings are not state-dependent */
 
+      if (state->__count == 4)
+        {
+          /* Create the second half of the surrogate pair. For a description
+             see the comment below. */
+          wint_t tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
+            | (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
+            | (wint_t)((state->__value.__wchb[2] & 0x3f) << 6)
+            | (wint_t)(state->__value.__wchb[3] & 0x3f);
+          state->__count = 0;
+          *pwc = 0xdc00 | ((tmp - 0x10000) & 0x3ff);
+          return 2;
+        }
       if (state->__count == 0)
         ch = t[i++];
       else
@@ -153,8 +165,7 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
       else if (ch >= 0xf0 && ch <= 0xf7)
         {
           /* four-byte sequence */
-          if (sizeof(wchar_t) < 4)
-            return -1; /* we can't store such a value */
+          wint_t tmp;
           state->__value.__wchb[0] = ch;
           if (state->__count == 0)
             state->__count = 1;
@@ -185,125 +196,25 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
           ch = t[i++];
           if (ch < 0x80 || ch > 0xbf)
             return -1;
-          *pwc = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18)
-            | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12)
-            | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6)
-            | (wchar_t)(ch & 0x3f);
-
-          state->__count = 0;
-          return i;
-        }
-      else if (ch >= 0xf8 && ch <= 0xfb)
-        {
-          /* five-byte sequence */
-          if (sizeof(wchar_t) < 4)
-            return -1; /* we can't store such a value */
-          state->__value.__wchb[0] = ch;
-          if (state->__count == 0)
-            state->__count = 1;
-          else if (n < (size_t)-1)
-            ++n;
-          if (n < 2)
-            return -2;
-          ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
-          if (state->__value.__wchb[0] == 0xf8 && ch < 0x88)
-            /* overlong UTF-8 sequence */
-            return -1;
-          if (ch < 0x80 || ch > 0xbf)
-            return -1;
-          state->__value.__wchb[1] = ch;
-          if (state->__count == 1)
-            state->__count = 2;
-          else if (n < (size_t)-1)
-            ++n;
-          if (n < 3)
-            return -2;
-          ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
-          if (ch < 0x80 || ch > 0xbf)
-            return -1;
-          state->__value.__wchb[2] = ch;
-          if (state->__count == 2)
-            state->__count = 3;
-          else if (n < (size_t)-1)
-            ++n;
-          if (n < 4)
-            return -2;
-          ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
-          if (ch < 0x80 || ch > 0xbf)
-            return -1;
-          state->__value.__wchb[3] = ch;
-          state->__count = 4;
-          if (n < 5)
-            return -2;
-          ch = t[i++];
-          *pwc = (wchar_t)((state->__value.__wchb[0] & 0x03) << 24)
-            | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 18)
-            | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 12)
-            | (wchar_t)((state->__value.__wchb[3] & 0x3f) << 6)
-            | (wchar_t)(ch & 0x3f);
-
-          state->__count = 0;
-          return i;
-        }
-      else if (ch >= 0xfc && ch <= 0xfd)
-        {
-          /* six-byte sequence */
-          int ch2;
-          if (sizeof(wchar_t) < 4)
-            return -1; /* we can't store such a value */
-          state->__value.__wchb[0] = ch;
-          if (state->__count == 0)
-            state->__count = 1;
-          else if (n < (size_t)-1)
-            ++n;
-          if (n < 2)
-            return -2;
-          ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
-          if (state->__value.__wchb[0] == 0xfc && ch < 0x84)
-            /* overlong UTF-8 sequence */
-            return -1;
-          if (ch < 0x80 || ch > 0xbf)
-            return -1;
-          state->__value.__wchb[1] = ch;
-          if (state->__count == 1)
-            state->__count = 2;
-          else if (n < (size_t)-1)
-            ++n;
-          if (n < 3)
-            return -2;
-          ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
-          if (ch < 0x80 || ch > 0xbf)
-            return -1;
-          state->__value.__wchb[2] = ch;
-          if (state->__count == 2)
-            state->__count = 3;
-          else if (n < (size_t)-1)
-            ++n;
-          if (n < 4)
-            return -2;
-          ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
-          if (ch < 0x80 || ch > 0xbf)
-            return -1;
-          state->__value.__wchb[3] = ch;
-          if (state->__count == 3)
-            state->__count = 4;
-          else if (n < (size_t)-1)
-            ++n;
-          if (n < 5)
-            return -2;
-          if (n == 5)
-            return -1; /* at this point we can't save enough to restart */
-          ch = t[i++];
-          if (ch < 0x80 || ch > 0xbf)
-            return -1;
-          ch2 = t[i++];
-          *pwc = (wchar_t)((state->__value.__wchb[0] & 0x01) << 30)
-            | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 24)
-            | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 18)
-            | (wchar_t)((state->__value.__wchb[3] & 0x3f) << 12)
-            | (wchar_t)((ch & 0x3f) << 6)
-            | (wchar_t)(ch2 & 0x3f);
-
+          tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
+            | (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
+            | (wint_t)((state->__value.__wchb[2] & 0x3f) << 6)
+            | (wint_t)(ch & 0x3f);
+          if (tmp > 0xffff && sizeof(wchar_t) == 2)
+            {
+              /* On systems where wchar_t holds UTF-16 values, a code point
+                 above 0xffff doesn't fit into a single wchar_t. So we store
+                 the state with a special __count value of 4 and return the
+                 first half of a surrogate pair. As return value we report 2,
+                 half the length of the four-byte UTF-8 sequence. The second
+                 half is returned by the next call, which recognizes the
+                 special __count value above. */
+              state->__value.__wchb[3] = ch;
+              state->__count = 4;
+              *pwc = 0xd800 | (((tmp - 0x10000) >> 10) & 0x3ff);
+              return 2;
+            }
+          *pwc = tmp;
           state->__count = 0;
           return i;
         }
diff --git a/newlib/libc/stdlib/wctomb_r.c b/newlib/libc/stdlib/wctomb_r.c
index 82730424f..c96d954a1 100644
--- a/newlib/libc/stdlib/wctomb_r.c
+++ b/newlib/libc/stdlib/wctomb_r.c
@@ -28,6 +28,11 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
       if (s == NULL)
         return 0; /* UTF-8 encoding is not state-dependent */
 
+      if (state->__count == -4 && (wchar < 0xdc00 || wchar > 0xdfff))
+        {
+          /* Only a low surrogate (0xdc00..0xdfff inclusive) is valid here. */
+          return -1;
+        }
       if (wchar <= 0x7f)
         {
           *s = wchar;
@@ -41,16 +46,45 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
         }
       else if (wchar >= 0x800 && wchar <= 0xffff)
         {
-          /* UTF-16 surrogates -- must not occur in normal UCS-4 data */
           if (wchar >= 0xd800 && wchar <= 0xdfff)
-            return -1;
-
+            {
+              wint_t tmp;
+              /* UTF-16 surrogates -- must not occur in normal UCS-4 data */
+              if (sizeof (wchar_t) != 2)
+                return -1;
+              if (wchar >= 0xdc00)
+                {
+                  /* Second half of a surrogate pair. It's only valid if
+                     a first half has been stored by the preceding call
+                     (__count == -4). */
+                  if (state->__count != -4)
+                    return -1;
+                  /* If it's valid, reconstruct the full Unicode value and
+                     return the trailing three bytes of the UTF-8 char. */
+                  tmp = (state->__value.__wchb[0] << 16)
+                    | (state->__value.__wchb[1] << 8)
+                    | (wchar & 0x3ff);
+                  state->__count = 0;
+                  *s++ = 0x80 | ((tmp & 0x3f000) >> 12);
+                  *s++ = 0x80 | ((tmp & 0xfc0) >> 6);
+                  *s = 0x80 | (tmp & 0x3f);
+                  return 3;
+                }
+              /* First half of a surrogate pair. Store the upper bits of the
+                 value in the state and return the first byte of the UTF-8 char. */
+              tmp = ((wchar & 0x3ff) << 10) + 0x10000;
+              state->__value.__wchb[0] = (tmp >> 16) & 0xff;
+              state->__value.__wchb[1] = (tmp >> 8) & 0xff;
+              state->__count = -4;
+              *s = (0xf0 | ((tmp & 0x1c0000) >> 18));
+              return 1;
+            }
           *s++ = 0xe0 | ((wchar & 0xf000) >> 12);
           *s++ = 0x80 | ((wchar & 0xfc0) >> 6);
           *s = 0x80 | (wchar & 0x3f);
           return 3;
         }
-      else if (wchar >= 0x10000 && wchar <= 0x1fffff)
+      else if (wchar >= 0x10000 && wchar <= 0x10ffff)
         {
           *s++ = 0xf0 | ((wchar & 0x1c0000) >> 18);
           *s++ = 0x80 | ((wchar & 0x3f000) >> 12);
@@ -58,25 +92,6 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
           *s = 0x80 | (wchar & 0x3f);
           return 4;
         }
-      else if (wchar >= 0x200000 && wchar <= 0x3ffffff)
-        {
-          *s++ = 0xf8 | ((wchar & 0x3000000) >> 24);
-          *s++ = 0x80 | ((wchar & 0xfc0000) >> 18);
-          *s++ = 0x80 | ((wchar & 0x3f000) >> 12);
-          *s++ = 0x80 | ((wchar & 0xfc0) >> 6);
-          *s = 0x80 | (wchar & 0x3f);
-          return 5;
-        }
-      else if (wchar >= 0x4000000 && wchar <= 0x7fffffff)
-        {
-          *s++ = 0xfc | ((wchar & 0x40000000) >> 30);
-          *s++ = 0x80 | ((wchar & 0x3f000000) >> 24);
-          *s++ = 0x80 | ((wchar & 0xfc0000) >> 18);
-          *s++ = 0x80 | ((wchar & 0x3f000) >> 12);
-          *s++ = 0x80 | ((wchar & 0xfc0) >> 6);
-          *s = 0x80 | (wchar & 0x3f);
-          return 6;
-        }
       else
         return -1;
     }