Consolidate wctomb/mbtowc calls for POSIX.1-2008

- Remove charset parameter from low level __foo_wctomb/__foo_mbtowc calls.
- Instead, create arrays of functions for ISO and Windows codepages, each
  entry pointing to a function which does not need to evaluate the charset
  string on each call.  Create matching helper functions, i.e., __iso_wctomb,
  __iso_mbtowc, __cp_wctomb and __cp_mbtowc are now functions returning the
  right function pointer.
- Create __WCTOMB/__MBTOWC macros utilizing per-reent locale and replace
  calls to __wctomb/__mbtowc with calls to __WCTOMB/__MBTOWC.
- Drop global __wctomb/__mbtowc vars.
- Utilize the aforementioned changes in Cygwin to get rid of the charset
  parameter in other calling functions and simplify the code.
- In Cygwin restrict global cygheap locale info to the job performed
  by internal_setlocale.  Use UTF-8 instead of ASCII on the fly in
  internal conversion functions.
- In Cygwin dll_entry, make sure to initialize a TLS area with a NULL
  _REENT->_locale pointer.  Add comment to explain why.

Signed-off-by: Corinna Vinschen <corinna@vinschen.de>
This commit is contained in:
Corinna Vinschen
2016-07-20 22:05:59 +02:00
parent 88208d3735
commit d16a56306d
31 changed files with 941 additions and 355 deletions

View File

@ -140,15 +140,13 @@ __db_wctomb (struct _reent *r, char *s, wchar_t wchar, UINT cp)
}
extern "C" int
__sjis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
mbstate_t *state)
__sjis_wctomb (struct _reent *r, char *s, wchar_t wchar, mbstate_t *state)
{
return __db_wctomb (r,s, wchar, 932);
}
extern "C" int
__eucjp_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
mbstate_t *state)
__eucjp_wctomb (struct _reent *r, char *s, wchar_t wchar, mbstate_t *state)
{
/* Unfortunately, the Windows eucJP codepage 20932 is not really 100%
compatible to eucJP. It's a cute approximation which makes it a
@ -192,22 +190,19 @@ __eucjp_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
}
extern "C" int
__gbk_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
mbstate_t *state)
__gbk_wctomb (struct _reent *r, char *s, wchar_t wchar, mbstate_t *state)
{
return __db_wctomb (r,s, wchar, 936);
}
extern "C" int
__kr_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
mbstate_t *state)
__kr_wctomb (struct _reent *r, char *s, wchar_t wchar, mbstate_t *state)
{
return __db_wctomb (r,s, wchar, 949);
}
extern "C" int
__big5_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
mbstate_t *state)
__big5_wctomb (struct _reent *r, char *s, wchar_t wchar, mbstate_t *state)
{
return __db_wctomb (r,s, wchar, 950);
}
@ -268,14 +263,14 @@ __db_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, UINT cp,
extern "C" int
__sjis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
const char *charset, mbstate_t *state)
mbstate_t *state)
{
return __db_mbtowc (r, pwc, s, n, 932, state);
}
extern "C" int
__eucjp_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
const char *charset, mbstate_t *state)
mbstate_t *state)
{
/* See comment in __eucjp_wctomb above. */
wchar_t dummy;
@ -352,21 +347,21 @@ jis_x_0212:
extern "C" int
__gbk_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
const char *charset, mbstate_t *state)
mbstate_t *state)
{
return __db_mbtowc (r, pwc, s, n, 936, state);
}
extern "C" int
__kr_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
const char *charset, mbstate_t *state)
mbstate_t *state)
{
return __db_mbtowc (r, pwc, s, n, 949, state);
}
extern "C" int
__big5_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
const char *charset, mbstate_t *state)
mbstate_t *state)
{
return __db_mbtowc (r, pwc, s, n, 950, state);
}
@ -408,7 +403,7 @@ __big5_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
*/
static size_t __reg3
sys_wcstombs (char *dst, size_t len, const wchar_t *src, size_t nwc,
bool is_path)
bool is_path)
{
char buf[10];
char *ptr = dst;
@ -416,9 +411,10 @@ sys_wcstombs (char *dst, size_t len, const wchar_t *src, size_t nwc,
size_t n = 0;
mbstate_t ps;
save_errno save;
wctomb_p f_wctomb = cygheap->locale.wctomb;
const char *charset = cygheap->locale.charset;
wctomb_p f_wctomb = __WCTOMB;
if (f_wctomb == __ascii_wctomb)
f_wctomb = __utf8_wctomb;
memset (&ps, 0, sizeof ps);
if (dst == NULL)
len = (size_t) -1;
@ -441,13 +437,13 @@ sys_wcstombs (char *dst, size_t len, const wchar_t *src, size_t nwc,
}
else
{
bytes = f_wctomb (_REENT, buf, pw, charset, &ps);
if (bytes == -1 && *charset != 'U'/*TF-8*/)
bytes = f_wctomb (_REENT, buf, pw, &ps);
if (bytes == -1 && f_wctomb != __utf8_wctomb)
{
/* Convert chars invalid in the current codepage to a sequence
ASCII CAN; UTF-8 representation of invalid char. */
buf[0] = 0x18; /* ASCII CAN */
bytes = __utf8_wctomb (_REENT, buf + 1, pw, charset, &ps);
bytes = __utf8_wctomb (_REENT, buf + 1, pw, &ps);
if (bytes == -1)
{
++pwcs;
@ -465,8 +461,7 @@ sys_wcstombs (char *dst, size_t len, const wchar_t *src, size_t nwc,
ps.__count = 0;
continue;
}
bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, charset,
&ps);
bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, &ps);
nwc--;
}
}
@ -557,8 +552,8 @@ sys_wcstombs_alloc_no_path (char **dst_p, int type, const wchar_t *src,
charset, which is the charset returned by GetConsoleCP (). Most of the
time this is used for box and line drawing characters. */
size_t __reg3
sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
size_t dlen, const char *src, size_t nms)
sys_cp_mbstowcs (mbtowc_p f_mbtowc, wchar_t *dst, size_t dlen,
const char *src, size_t nms)
{
wchar_t *ptr = dst;
unsigned const char *pmbs = (unsigned const char *) src;
@ -581,10 +576,11 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
next byte must be a valid UTF-8 start byte. If the charset
isn't UTF-8 anyway, try to convert the following bytes as UTF-8
sequence. */
if (nms > 2 && pmbs[1] >= 0xc2 && pmbs[1] <= 0xf4 && *charset != 'U'/*TF-8*/)
if (nms > 2 && pmbs[1] >= 0xc2 && pmbs[1] <= 0xf4
&& f_mbtowc != __utf8_mbtowc)
{
bytes = __utf8_mbtowc (_REENT, ptr, (const char *) pmbs + 1,
nms - 1, charset, &ps);
nms - 1, &ps);
if (bytes < 0)
{
/* Invalid UTF-8 sequence? Treat the ASCII CAN character as
@ -603,7 +599,7 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
wchar_t *ptr2 = dst ? ptr + 1 : NULL;
int bytes2 = __utf8_mbtowc (_REENT, ptr2,
(const char *) pmbs + bytes,
nms - bytes, charset, &ps);
nms - bytes, &ps);
if (bytes2 < 0)
memset (&ps, 0, sizeof ps);
else
@ -625,7 +621,7 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
}
}
else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
charset, &ps)) < 0)
&ps)) < 0)
{
/* The technique is based on a discussion here:
http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
@ -668,8 +664,10 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
size_t __reg3
sys_mbstowcs (wchar_t * dst, size_t dlen, const char *src, size_t nms)
{
return sys_cp_mbstowcs (cygheap->locale.mbtowc, cygheap->locale.charset,
dst, dlen, src, nms);
mbtowc_p f_mbtowc = __MBTOWC;
if (f_mbtowc == __ascii_mbtowc)
f_mbtowc = __utf8_mbtowc;
return sys_cp_mbstowcs (f_mbtowc, dst, dlen, src, nms);
}
/* Same as sys_wcstombs_alloc, just backwards. */