/* nlsfuncs.cc: NLS helper functions Copyright 2010 Red Hat, Inc. This file is part of Cygwin. This software is a copyrighted work licensed under the terms of the Cygwin license. Please consult the file "CYGWIN_LICENSE" for details. */ #include "winsup.h" #include #include #include #include #include "path.h" #include "fhandler.h" #include "dtable.h" #include "cygheap.h" #include "tls_pbuf.h" /* Internal headers from newlib */ #include "../locale/timelocal.h" #include "../locale/lnumeric.h" #include "../locale/lmonetary.h" static char *lc_time_buf; static char *lc_numeric_buf; static char *lc_monetary_buf; #define _LC(x) &lc_##x##_ptr,lc_##x##_end-lc_##x##_ptr #define getlocaleinfo(category,type) \ __getlocaleinfo(lcid,(type),_LC(category),f_wctomb,charset) #define eval_datetimefmt(type,force) \ __eval_datetimefmt(lcid,(type),(force),&lc_time_ptr,\ lc_time_end-lc_time_ptr,f_wctomb, charset) #define has_modifier(x) ((x)[0] && !strcmp (modifier, (x))) /* Vista and later. Not defined in w32api yet. */ extern "C" { WINBASEAPI LCID WINAPI LocaleNameToLCID (LPCWSTR, DWORD); }; static char last_locale[ENCODING_LEN + 1]; static LCID last_lcid; /* Fetch LCID from POSIX locale specifier. Return values: -1: Invalid locale 0: C or POSIX >0: LCID */ static LCID __get_lcid_from_locale (const char *name) { char locale[ENCODING_LEN + 1]; char *c; LCID lcid; /* Speed up reusing the same locale as before, for instance in LC_ALL case. */ if (!strcmp (name, last_locale)) { debug_printf ("LCID=0x%04x", last_lcid); return last_lcid; } stpcpy (last_locale, name); stpcpy (locale, name); /* Store modifier for later use. */ const char *modifier = strchr (last_locale, '@') ? : ""; /* Drop charset and modifier */ c = strchr (locale, '.'); if (!c) c = strchr (locale, '@'); if (c) *c = '\0'; /* "POSIX" already converted to "C" in loadlocale. */ if (!strcmp (locale, "C")) return last_lcid = 0; c = strchr (locale, '_'); if (!c) return last_lcid = (LCID) -1; if (wincap.has_localenames ()) { wchar_t wlocale[ENCODING_LEN + 1]; /* Convert to RFC 4646 syntax which is the standard for the locale names replacing LCIDs starting with Vista. */ *c = '-'; mbstowcs (wlocale, locale, ENCODING_LEN + 1); lcid = LocaleNameToLCID (wlocale, 0); if (lcid == 0) { /* Unfortunately there are a couple of locales for which no form without a Script part per RFC 4646 exists. Linux also supports no_NO which is equivalent to nb_NO. */ struct { const char *loc; const wchar_t *wloc; } sc_only_locale[] = { { "az-AZ" , L"az-Latn-AZ" }, { "bs-BA" , L"bs-Latn-BA" }, { "ha-NG" , L"ha-Latn-NG" }, { "iu-CA" , L"iu-Cans-CA" }, { "mn-CN" , L"mn-Mong-CN" }, { "no-NO" , L"nb-NO" }, { "sr-BA" , L"sr-Cyrl-BA" }, { "sr-CS" , L"sr-Cyrl-CS" }, { "sr-ME" , L"sr-Cyrl-ME" }, { "sr-RS" , L"sr-Cyrl-RS" }, { "tg-TJ" , L"tg-Cyrl-TJ" }, { "tzm-DZ", L"tzm-Latn-DZ" }, { "uz-UZ" , L"uz-Latn-UZ" }, { NULL , NULL } }; for (int i = 0; sc_only_locale[i].loc && sc_only_locale[i].loc[0] <= locale[0]; ++i) if (!strcmp (locale, sc_only_locale[i].loc)) { lcid = LocaleNameToLCID (sc_only_locale[i].wloc, 0); if (!strncmp (locale, "sr-", 3)) { /* Vista/2K8 is missing sr-ME and sr-RS. It has only the deprecated sr-CS. So we map ME and RS to CS here. */ if (lcid == 0) lcid = LocaleNameToLCID (L"sr-Cyrl-CS", 0); /* "@latin" modifier for the sr_XY locales changes collation behaviour so lcid should accommodate that by being set to the Latin sublang. */ if (lcid != 0 && has_modifier ("@latin")) lcid = MAKELANGID (lcid & 0x3ff, (lcid >> 10) - 1); } break; } } last_lcid = lcid ?: (LCID) -1; debug_printf ("LCID=0x%04x", last_lcid); return last_lcid; } /* Pre-Vista we have to loop through the LCID values and see if they match language and TERRITORY. */ *c++ = '\0'; /* locale now points to the language, c points to the TERRITORY */ const char *language = locale; const char *territory = c; LCID lang, sublang; char iso[10]; /* In theory the lang part takes 10 bits (0x3ff), but up to Windows 2003 R2 the highest lang value is 0x81. */ for (lang = 1; lang <= 0x81; ++lang) if (GetLocaleInfo (lang, LOCALE_SISO639LANGNAME, iso, 10) && !strcmp (language, iso)) break; if (lang > 0x81) lcid = 0; else if (!territory) lcid = lang; else { /* In theory the sublang part takes 7 bits (0x3f), but up to Windows 2003 R2 the highest sublang value is 0x14. */ for (sublang = 1; sublang <= 0x14; ++sublang) { lcid = (sublang << 10) | lang; if (GetLocaleInfo (lcid, LOCALE_SISO3166CTRYNAME, iso, 10) && !strcmp (territory, iso)) break; } if (sublang > 0x14) lcid = 0; } if (lcid == 0 && territory) { /* Unfortunately there are four language LCID number areas representing multiple languages. Fortunately only two of them already existed pre-Vista. The concealed languages have to be tested explicitly, since they are not catched by the above loops. This also enables the serbian ISO 3166 territory codes which have been changed post 2003, and maps them to the old wrong (SP was never a valid ISO 3166 code) territory code sr_SP which fortunately has the same LCID as the newer sr_CS. Linux also supports no_NO which is equivalent to nb_NO. */ struct { const char *loc; LCID lcid; } ambiguous_locale[] = { { "bs_BA", MAKELANGID (LANG_BOSNIAN, 0x05) }, { "nn_NO", MAKELANGID (LANG_NORWEGIAN, SUBLANG_NORWEGIAN_NYNORSK) }, { "no_NO", MAKELANGID (LANG_NORWEGIAN, SUBLANG_NORWEGIAN_BOKMAL) }, { "sr_BA", MAKELANGID (LANG_BOSNIAN, SUBLANG_SERBIAN_BOSNIA_HERZEGOVINA_CYRILLIC) }, { "sr_CS", MAKELANGID (LANG_SERBIAN, SUBLANG_SERBIAN_CYRILLIC) }, { "sr_ME", MAKELANGID (LANG_SERBIAN, SUBLANG_SERBIAN_CYRILLIC) }, { "sr_RS", MAKELANGID (LANG_SERBIAN, SUBLANG_SERBIAN_CYRILLIC) }, { "sr_SP", MAKELANGID (LANG_SERBIAN, SUBLANG_SERBIAN_CYRILLIC) }, { NULL, 0 }, }; *--c = '_'; for (int i = 0; ambiguous_locale[i].loc && ambiguous_locale[i].loc[0] <= locale[0]; ++i) if (!strcmp (locale, ambiguous_locale[i].loc) && GetLocaleInfo (ambiguous_locale[i].lcid, LOCALE_SISO639LANGNAME, iso, 10)) { lcid = ambiguous_locale[i].lcid; /* "@latin" modifier for the sr_XY locales changes collation behaviour so lcid should accommodate that by being set to the Latin sublang. */ if (!strncmp (locale, "sr_", 3) && has_modifier ("@latin")) lcid = MAKELANGID (lcid & 0x3ff, (lcid >> 10) - 1); break; } } last_lcid = lcid ?: (LCID) -1; debug_printf ("LCID=0x%04x", last_lcid); return last_lcid; } /* Never returns -1, *iff* s is not NULL. Just skips invalid chars instead. s==NULL returns -1 since it's used to recognize invalid strings in the used charset. */ static size_t lc_wcstombs (wctomb_p f_wctomb, const char *charset, char *s, const wchar_t *pwcs, size_t n) { char *ptr = s; size_t max = n; char buf[8]; size_t i, bytes, num_to_copy; mbstate_t state; memset (&state, 0, sizeof state); if (s == NULL) { size_t num_bytes = 0; while (*pwcs != 0) { bytes = f_wctomb (_REENT, buf, *pwcs++, charset, &state); if (bytes == (size_t) -1) return (size_t) -1; num_bytes += bytes; } return num_bytes; } while (n > 0) { bytes = f_wctomb (_REENT, buf, *pwcs, charset, &state); if (bytes == (size_t) -1) { memset (&state, 0, sizeof state); ++pwcs; continue; } num_to_copy = (n > bytes ? bytes : n); for (i = 0; i < num_to_copy; ++i) *ptr++ = buf[i]; if (*pwcs == 0x00) return ptr - s - (n >= bytes); ++pwcs; n -= num_to_copy; } return max; } /* Never returns -1. Invalid sequences are translated to replacement wide-chars. */ static size_t lc_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *pwcs, const char *s, size_t n) { size_t ret = 0; char *t = (char *) s; size_t bytes; mbstate_t state; memset (&state, 0, sizeof state); if (!pwcs) n = 1; while (n > 0) { bytes = f_mbtowc (_REENT, pwcs, t, MB_CUR_MAX, charset, &state); if (bytes == (size_t) -1) { state.__count = 0; bytes = 1; if (pwcs) *pwcs = L' '; } else if (bytes == 0) break; t += bytes; ++ret; if (pwcs) { ++pwcs; --n; } } return ret; } static char * __getlocaleinfo (LCID lcid, LCTYPE type, char **ptr, size_t size, wctomb_p f_wctomb, const char *charset) { wchar_t wbuf[80]; size_t num; char *ret; GetLocaleInfoW (lcid, type, wbuf, 80); num = lc_wcstombs (f_wctomb, charset, ret = *ptr, wbuf, size); *ptr += num + 1; return ret; } static UINT getlocaleint (LCID lcid, LCTYPE type) { UINT val; return GetLocaleInfoW (lcid, type | LOCALE_RETURN_NUMBER, (PWCHAR) &val, sizeof val) ? val : 0; } static char * __eval_datetimefmt (LCID lcid, LCTYPE type, int force, char **ptr, size_t size, wctomb_p f_wctomb, const char *charset) { wchar_t buf[80]; wchar_t fc; size_t num; mbstate_t mb; size_t idx; const char *day_str = "edaA"; const char *mon_str = "mmbB"; const char *year_str = "yyyY"; const char *hour12_str = "lI"; const char *hour24_str = "kH"; const char *t_str; char *ret = *ptr; char *p = *ptr; GetLocaleInfoW (lcid, type, buf, 80); memset (&mb, 0, sizeof mb); for (wchar_t *fmt = buf; *fmt; ++fmt) switch (fc = *fmt) { case L'\'': if (fmt[1] == L'\'') *p++ = '\''; else while (fmt[1] && *++fmt != L'\'') { num = f_wctomb (_REENT, p, *fmt, charset, &mb); if (num == (size_t) -1) memset (&mb, 0, sizeof mb); else p += num; } break; case L'd': case L'M': case L'y': t_str = (fc == L'd' ? day_str : fc == L'M' ? mon_str : year_str); if (fc == L'y') force = 0; for (idx = 1; fmt[1] == fc; ++idx, ++fmt); if (--idx > 3) idx = 3; if (force && idx == 3) idx = 2; *p++ = '%'; *p++ = t_str[idx]; break; case L'g': break; case L'h': case L'H': t_str = (fc == L'h' || force ? hour12_str : hour24_str); idx = 0; if (fmt[1] == fc) { ++fmt; idx = 1; } *p++ = '%'; *p++ = t_str[idx]; break; case L'm': case L's': case L't': if (fmt[1] == fc) ++fmt; *p++ = '%'; *p++ = (fc == L'm' ? 'M' : fc == L's' ? 'S' : 'p'); break; case L'\t': case L'\n': case L'%': *p++ = '%'; *p++ = (char) fc; break; default: num = f_wctomb (_REENT, p, *fmt, charset, &mb); if (num == (size_t) -1) memset (&mb, 0, sizeof mb); else p += num; break; } *p++ = '\0'; *ptr = p; return ret; } /* Convert Windows grouping format into POSIX grouping format. */ static char * conv_grouping (LCID lcid, LCTYPE type, char **lc_ptr) { char buf[10]; /* Per MSDN max size of LOCALE_SGROUPING element incl. NUL */ bool repeat = false; char *ptr = *lc_ptr; char *ret = ptr; GetLocaleInfoA (lcid, type, buf, 10); /* Convert Windows grouping format into POSIX grouping format. */ for (char *c = buf; *c; ++c) { if (*c < '0' || *c > '9') continue; char val = *c - '0'; if (!val) { repeat = true; break; } *ptr++ = val; } if (!repeat) *ptr++ = CHAR_MAX; *ptr++ = '\0'; *lc_ptr = ptr; return ret; } /* Called from newlib's setlocale() via __time_load_locale() if category is LC_TIME. Returns LC_TIME values fetched from Windows locale data in the structure pointed to by _time_locale. This is subsequently accessed by functions like nl_langinfo, strftime, strptime. */ extern "C" int __set_lc_time_from_win (const char *name, struct lc_time_T *_time_locale, wctomb_p f_wctomb, const char *charset) { LCID lcid = __get_lcid_from_locale (name); if (!lcid || lcid == (LCID) -1) return lcid; char *new_lc_time_buf = (char *) malloc (4096); const char *lc_time_end = new_lc_time_buf + 4096; if (!new_lc_time_buf) return -1; char *lc_time_ptr = new_lc_time_buf; /* mon */ for (int i = 0; i < 12; ++i) _time_locale->mon[i] = getlocaleinfo (time, LOCALE_SABBREVMONTHNAME1 + i); /* month and alt_month */ for (int i = 0; i < 12; ++i) _time_locale->month[i] = _time_locale->alt_month[i] = getlocaleinfo (time, LOCALE_SMONTHNAME1 + i); /* wday */ _time_locale->wday[0] = getlocaleinfo (time, LOCALE_SABBREVDAYNAME7); for (int i = 0; i < 6; ++i) _time_locale->wday[i + 1] = getlocaleinfo (time, LOCALE_SABBREVDAYNAME1 + i); /* weekday */ _time_locale->weekday[0] = getlocaleinfo (time, LOCALE_SDAYNAME7); for (int i = 0; i < 6; ++i) _time_locale->weekday[i + 1] = getlocaleinfo (time, LOCALE_SDAYNAME1 + i); /* X_fmt */ _time_locale->X_fmt = eval_datetimefmt (LOCALE_STIMEFORMAT, 0); /* x_fmt */ _time_locale->x_fmt = eval_datetimefmt (LOCALE_SSHORTDATE, 0); /* c_fmt */ _time_locale->c_fmt = eval_datetimefmt (LOCALE_SLONGDATE, 1); --lc_time_ptr; *lc_time_ptr++ = ' '; eval_datetimefmt (LOCALE_STIMEFORMAT, 0); /* AM/PM */ _time_locale->am_pm[0] = getlocaleinfo (time, LOCALE_S1159); _time_locale->am_pm[1] = getlocaleinfo (time, LOCALE_S2359); /* date_fmt */ _time_locale->date_fmt = eval_datetimefmt (LOCALE_SLONGDATE, 1); --lc_time_ptr; *lc_time_ptr++ = ' '; eval_datetimefmt (LOCALE_STIMEFORMAT, 0); --lc_time_ptr; lc_time_ptr = stpcpy (lc_time_ptr, " %Z") + 1; /* md */ { wchar_t buf[80]; GetLocaleInfoW (lcid, LOCALE_IDATE, buf, 80); lc_time_ptr = stpcpy (lc_time_ptr, *buf == L'1' ? "dm" : "md") + 1; } /* ampm_fmt */ _time_locale->ampm_fmt = eval_datetimefmt (LOCALE_STIMEFORMAT, 1); char *tmp = (char *) realloc (new_lc_time_buf, lc_time_ptr - new_lc_time_buf); if (!tmp) { free (new_lc_time_buf); return -1; } if (lc_time_buf) free (lc_time_buf); lc_time_buf = tmp; return 1; } /* Called from newlib's setlocale() via __numeric_load_locale() if category is LC_NUMERIC. Returns LC_NUMERIC values fetched from Windows locale data in the structure pointed to by _numeric_locale. This is subsequently accessed by functions like nl_langinfo, localeconv, printf, etc. */ extern "C" int __set_lc_numeric_from_win (const char *name, struct lc_numeric_T *_numeric_locale, wctomb_p f_wctomb, const char *charset) { LCID lcid = __get_lcid_from_locale (name); if (!lcid || lcid == (LCID) -1) return lcid; char *new_lc_numeric_buf = (char *) malloc (48); const char *lc_numeric_end = new_lc_numeric_buf + 48; if (!new_lc_numeric_buf) return -1; char *lc_numeric_ptr = new_lc_numeric_buf; /* decimal_point */ _numeric_locale->decimal_point = getlocaleinfo (numeric, LOCALE_SDECIMAL); /* thousands_sep */ _numeric_locale->thousands_sep = getlocaleinfo (numeric, LOCALE_STHOUSAND); /* grouping */ _numeric_locale->grouping = conv_grouping (lcid, LOCALE_SGROUPING, &lc_numeric_ptr); char *tmp = (char *) realloc (new_lc_numeric_buf, lc_numeric_ptr - new_lc_numeric_buf); if (!tmp) { free (new_lc_numeric_buf); return -1; } if (lc_numeric_buf) free (lc_numeric_buf); lc_numeric_buf = tmp; return 1; } /* Called from newlib's setlocale() via __monetary_load_locale() if category is LC_MONETARY. Returns LC_MONETARY values fetched from Windows locale data in the structure pointed to by _monetary_locale. This is subsequently accessed by functions like nl_langinfo, localeconv, printf, etc. */ extern "C" int __set_lc_monetary_from_win (const char *name, struct lc_monetary_T *_monetary_locale, wctomb_p f_wctomb, const char *charset) { LCID lcid = __get_lcid_from_locale (name); if (!lcid || lcid == (LCID) -1) return lcid; char *new_lc_monetary_buf = (char *) malloc (256); const char *lc_monetary_end = new_lc_monetary_buf + 256; if (!new_lc_monetary_buf) return -1; char *lc_monetary_ptr = new_lc_monetary_buf; /* int_curr_symbol */ _monetary_locale->int_curr_symbol = getlocaleinfo (monetary, LOCALE_SINTLSYMBOL); /* No spacing char means space. */ if (!_monetary_locale->int_curr_symbol[3]) { lc_monetary_ptr[-1] = ' '; *lc_monetary_ptr++ = '\0'; } /* currency_symbol */ { /* As on Linux: If the currency_symbol can't be represented in the given charset, use int_curr_symbol. */ wchar_t wbuf[14]; GetLocaleInfoW (lcid, LOCALE_SCURRENCY, wbuf, 14); if (lc_wcstombs (f_wctomb, charset, NULL, wbuf, 0) == (size_t) -1) { _monetary_locale->currency_symbol = lc_monetary_ptr; lc_monetary_ptr = stpncpy (lc_monetary_ptr, _monetary_locale->int_curr_symbol, 3); *lc_monetary_ptr++ = '\0'; } else _monetary_locale->currency_symbol = getlocaleinfo (monetary, LOCALE_SCURRENCY); } /* mon_decimal_point */ _monetary_locale->mon_decimal_point = getlocaleinfo (monetary, LOCALE_SMONDECIMALSEP); /* mon_thousands_sep */ _monetary_locale->mon_thousands_sep = getlocaleinfo (monetary, LOCALE_SMONTHOUSANDSEP); /* mon_grouping */ _monetary_locale->mon_grouping = conv_grouping (lcid, LOCALE_SMONGROUPING, &lc_monetary_ptr); /* positive_sign */ _monetary_locale->positive_sign = getlocaleinfo (monetary, LOCALE_SPOSITIVESIGN); /* negative_sign */ _monetary_locale->negative_sign = getlocaleinfo (monetary, LOCALE_SNEGATIVESIGN); /* int_frac_digits */ *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_IINTLCURRDIGITS); _monetary_locale->int_frac_digits = lc_monetary_ptr++; /* frac_digits */ *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_ICURRDIGITS); _monetary_locale->frac_digits = lc_monetary_ptr++; /* p_cs_precedes */ *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_IPOSSYMPRECEDES); _monetary_locale->p_cs_precedes = lc_monetary_ptr++; /* p_sep_by_space */ *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_IPOSSEPBYSPACE); _monetary_locale->p_sep_by_space = lc_monetary_ptr++; /* n_cs_precedes */ *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_INEGSYMPRECEDES); _monetary_locale->n_cs_precedes = lc_monetary_ptr++; /* n_sep_by_space */ *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_INEGSEPBYSPACE); _monetary_locale->n_sep_by_space = lc_monetary_ptr++; /* p_sign_posn */ *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_IPOSSIGNPOSN); _monetary_locale->p_sign_posn = lc_monetary_ptr++; /* p_sign_posn */ *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_INEGSIGNPOSN); _monetary_locale->n_sign_posn = lc_monetary_ptr++; char *tmp = (char *) realloc (new_lc_monetary_buf, lc_monetary_ptr - new_lc_monetary_buf); if (!tmp) { free (new_lc_monetary_buf); return -1; } if (lc_monetary_buf) free (lc_monetary_buf); lc_monetary_buf = tmp; return 1; } LCID collate_lcid = 0; static mbtowc_p collate_mbtowc = __ascii_mbtowc; char collate_charset[ENCODING_LEN + 1] = "ASCII"; /* Called from newlib's setlocale() if category is LC_COLLATE. Stores LC_COLLATE locale information. This is subsequently accessed by the below functions strcoll, strxfrm, wcscoll, wcsxfrm. */ extern "C" int __collate_load_locale (const char *name, mbtowc_p f_mbtowc, const char *charset) { LCID lcid = __get_lcid_from_locale (name); if (lcid == (LCID) -1) return -1; collate_lcid = lcid; collate_mbtowc = f_mbtowc; stpcpy (collate_charset, charset); return 0; } /* We use the Windows functions for locale-specific string comparison and transformation. The advantage is that we don't need any files with collation information. */ extern "C" int wcscoll (const wchar_t *ws1, const wchar_t *ws2) { int ret; if (!collate_lcid) return wcscmp (ws1, ws2); ret = CompareStringW (collate_lcid, 0, ws1, -1, ws2, -1); if (!ret) set_errno (EINVAL); return ret - CSTR_EQUAL; } extern "C" int strcoll (const char *s1, const char *s2) { size_t n1, n2; wchar_t *ws1, *ws2; tmp_pathbuf tp; int ret; if (!collate_lcid) return strcmp (s1, s2); /* The ANSI version of CompareString uses the default charset of the lcid, so we must use the Unicode version. */ n1 = lc_mbstowcs (collate_mbtowc, collate_charset, NULL, s1, 0) + 1; ws1 = (n1 > NT_MAX_PATH ? (wchar_t *) malloc (n1 * sizeof (wchar_t)) : tp.w_get ()); lc_mbstowcs (collate_mbtowc, collate_charset, ws1, s1, n1); n2 = lc_mbstowcs (collate_mbtowc, collate_charset, NULL, s2, 0) + 1; ws2 = (n2 > NT_MAX_PATH ? (wchar_t *) malloc (n2 * sizeof (wchar_t)) : tp.w_get ()); lc_mbstowcs (collate_mbtowc, collate_charset, ws2, s2, n2); ret = CompareStringW (collate_lcid, 0, ws1, -1, ws2, -1); if (n1 > NT_MAX_PATH) free (ws1); if (n2 > NT_MAX_PATH) free (ws2); if (!ret) set_errno (EINVAL); return ret - CSTR_EQUAL; } extern "C" size_t wcsxfrm (wchar_t *ws1, const wchar_t *ws2, size_t wsn) { size_t ret; if (!collate_lcid) return wcslcpy (ws1, ws2, wsn); ret = LCMapStringW (collate_lcid, LCMAP_SORTKEY | LCMAP_BYTEREV, ws2, -1, ws1, wsn * sizeof (wchar_t)); /* LCMapStringW returns byte count including the terminating NUL character, wcsxfrm is supposed to return length in wchar_t excluding the NUL. Since the array is only single byte NUL-terminated we must make sure the result is wchar_t-NUL terminated. */ if (ret) { ret = (ret + 1) / sizeof (wchar_t); if (ret >= wsn) return wsn; ws1[ret] = L'\0'; return ret; } if (GetLastError () != ERROR_INSUFFICIENT_BUFFER) set_errno (EINVAL); return wsn; } extern "C" size_t strxfrm (char *s1, const char *s2, size_t sn) { size_t ret; size_t n2; wchar_t *ws2; tmp_pathbuf tp; if (!collate_lcid) return strlcpy (s1, s2, sn); /* The ANSI version of LCMapString uses the default charset of the lcid, so we must use the Unicode version. */ n2 = lc_mbstowcs (collate_mbtowc, collate_charset, NULL, s2, 0) + 1; ws2 = (n2 > NT_MAX_PATH ? (wchar_t *) malloc (n2 * sizeof (wchar_t)) : tp.w_get ()); lc_mbstowcs (collate_mbtowc, collate_charset, ws2, s2, n2); /* The sort key is a NUL-terminated byte string. */ ret = LCMapStringW (collate_lcid, LCMAP_SORTKEY, ws2, -1, (PWCHAR) s1, sn); if (n2 > NT_MAX_PATH) free (ws2); if (ret == 0) { if (GetLastError () != ERROR_INSUFFICIENT_BUFFER) set_errno (EINVAL); return sn; } /* LCMapStringW returns byte count including the terminating NUL character. strxfrm is supposed to return length excluding the NUL. */ return ret - 1; } /* Fetch default ANSI codepage from locale info and generate a setlocale compatible character set code. Called from newlib's setlocale(), if the charset isn't given explicitely in the POSIX compatible locale specifier. */ extern "C" void __set_charset_from_locale (const char *locale, char *charset) { UINT cp; LCID lcid = __get_lcid_from_locale (locale); /* "C" locale, or invalid locale? */ if (lcid == 0 || lcid == (LCID) -1) cp = 20127; else if (!GetLocaleInfoW (lcid, LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER, (PWCHAR) &cp, sizeof cp)) cp = 0; /* Translate codepage and lcid to a charset closely aligned with the default charsets defined in Glibc. */ const char *cs; const char *modifier = strchr (locale, '@') ?: ""; switch (cp) { case 20127: cs = "ASCII"; break; case 874: cs = "CP874"; break; case 932: cs = "EUCJP"; break; case 936: cs = "GBK"; break; case 949: cs = "EUCKR"; break; case 950: cs = "BIG5"; break; case 1250: if (lcid == 0x081a /* sr_CS (Serbian Language/Former Serbia and Montenegro) */ || lcid == 0x181a /* sr_BA (Serbian Language/Bosnia and Herzegovina) */ || lcid == 0x241a /* sr_RS (Serbian Language/Serbia) */ || lcid == 0x2c1a /* sr_ME (Serbian Language/Montenegro)*/ || lcid == 0x0442) /* tk_TM (Turkmen/Turkmenistan) */ cs = "UTF-8"; else if (lcid == 0x041c) /* sq_AL (Albanian/Albania) */ cs = "ISO-8859-1"; else cs = "ISO-8859-2"; break; case 1251: if (lcid == 0x0c1a /* sr_CS (Serbian Language/Former Serbia and Montenegro) */ || lcid == 0x1c1a /* sr_BA (Serbian Language/Bosnia and Herzegovina) */ || lcid == 0x281a /* sr_RS (Serbian Language/Serbia) */ || lcid == 0x301a /* sr_ME (Serbian Language/Montenegro)*/ || lcid == 0x0440 /* ky_KG (Kyrgyz/Kyrgyzstan) */ || lcid == 0x0450 /* mn_MN (Mongolian/Mongolia) */ /* tt_RU (Tatar/Russia), IQTElif alphabet */ || (lcid == 0x0444 && has_modifier ("@iqtelif"))) cs = "UTF-8"; else if (lcid == 0x0423) /* be_BY (Belarusian/Belarus) */ cs = has_modifier ("@latin") ? "UTF-8" : "CP1251"; else if (lcid == 0x0402) /* bg_BG (Bulgarian/Bulgaria) */ cs = "CP1251"; else if (lcid == 0x0422) /* uk_UA (Ukrainian/Ukraine) */ cs = "KOI8-U"; else cs = "ISO-8859-5"; break; case 1252: if (lcid == 0x0452) /* cy_GB (Welsh/Great Britain) */ cs = "ISO-8859-14"; else if (lcid == 0x4009 /* en_IN (English/India) */ || lcid == 0x0464 /* fil_PH (Filipino/Philippines) */ || lcid == 0x0462 /* fy_NL (Frisian/Netherlands) */ || lcid == 0x0468 /* ha_NG (Hausa/Nigeria) */ || lcid == 0x0470 /* ig_NG (Igbo/Nigeria) */ || lcid == 0x046c /* nso_ZA (Northern Sotho/South Africa) */ || lcid == 0x0487 /* rw_RW (Kinyarwanda/Rwanda) */ || lcid == 0x043b /* se_NO (Northern Saami/Norway) */ || lcid == 0x0432 /* tn_ZA (Tswana/South Africa) */ || lcid == 0x0488 /* wo_SN (Wolof/Senegal) */ || lcid == 0x046a) /* yo_NG (Yoruba/Nigeria) */ cs = "UTF-8"; else if (lcid == 0x042e) /* hsb_DE (Upper Sorbian/Germany) */ cs = "ISO-8859-2"; else if (lcid == 0x0491 /* gd_GB (Scots Gaelic/Great Britain) */ || has_modifier ("@euro")) cs = "ISO-8859-15"; else cs = "ISO-8859-1"; break; case 1253: cs = "ISO-8859-7"; break; case 1254: if (lcid == 0x042c) /* az_AZ (Azeri/Azerbaijan) */ cs = "UTF-8"; else if (lcid == 0x0443) /* uz_UZ (Uzbek/Uzbekistan) */ cs = has_modifier ("@cyrillic") ? "UTF-8" : "ISO-8859-1"; else cs = "ISO-8859-9"; break; case 1255: cs = "ISO-8859-8"; break; case 1256: if (lcid == 0x0429 /* fa_IR (Persian/Iran) */ || lcid == 0x0480 /* ug_CN (Uyghur/China) */ || lcid == 0x0420) /* ur_PK (Urdu/Pakistan) */ cs = "UTF-8"; else cs = "ISO-8859-6"; break; case 1257: if (lcid == 0x0425) /* et_EE (Estonian/Estonia) */ cs = "ISO-8859-15"; else cs = "ISO-8859-13"; break; case 1258: default: if (lcid == 0x0481) /* mi_NZ (Maori/New Zealand) */ cs = "ISO-8859-13"; else if (lcid == 0x043a) /* mt_MT (Maltese/Malta) */ cs = "ISO-8859-3"; else if (lcid == 0x0437) /* ka_GE (Georgian/Georgia) */ cs = "GEORGIAN-PS"; else if (lcid == 0x043f) /* kk_KZ (Kazakh/Kazakhstan) */ cs = "PT154"; else cs = "UTF-8"; break; } stpcpy (charset, cs); } static char * check_codepage (char *ret) { if (!wincap.has_always_all_codepages ()) { /* Prior to Windows Vista, many codepages are not installed by default, or can be deinstalled. The following codepages require that the respective conversion tables are installed into the OS. So we check if they are installed and if not, setlocale should fail. */ CPINFO cpi; UINT cp = 0; if (__mbtowc == __sjis_mbtowc) cp = 932; else if (__mbtowc == __eucjp_mbtowc) cp = 20932; else if (__mbtowc == __gbk_mbtowc) cp = 936; else if (__mbtowc == __kr_mbtowc) cp = 949; else if (__mbtowc == __big5_mbtowc) cp = 950; if (cp && !GetCPInfo (cp, &cpi) && GetLastError () == ERROR_INVALID_PARAMETER) return NULL; } return ret; } static void internal_setlocale () { /* Each setlocale from the environment potentially changes the multibyte representation of the CWD. Therefore we have to reevaluate the CWD's posix path and store in the new charset. Same for the PATH environment variable. */ /* FIXME: Other buffered paths might be affected as well. */ /* FIXME: It could be necessary to convert the entire environment, not just PATH. */ tmp_pathbuf tp; char *path = getenv ("PATH"); wchar_t *w_path = NULL, *w_cwd; debug_printf ("Cygwin charset changed from %s to %s", cygheap->locale.charset, __locale_charset ()); /* Fetch PATH and CWD and convert to wchar_t in previous charset. */ if (path && *path) /* $PATH can be potentially unset. */ { w_path = tp.w_get (); sys_mbstowcs (w_path, 32768, path); } w_cwd = tp.w_get (); cwdstuff::cwd_lock.acquire (); sys_mbstowcs (w_cwd, 32768, cygheap->cwd.get_posix ()); /* Set charset for internal conversion functions. */ if (*__locale_charset () == 'A'/*SCII*/) { cygheap->locale.mbtowc = __utf8_mbtowc; cygheap->locale.wctomb = __utf8_wctomb; } else { cygheap->locale.mbtowc = __mbtowc; cygheap->locale.wctomb = __wctomb; } strcpy (cygheap->locale.charset, __locale_charset ()); /* Restore CWD and PATH in new charset. */ cygheap->cwd.reset_posix (w_cwd); cwdstuff::cwd_lock.release (); if (w_path) { char *c_path = tp.c_get (); sys_wcstombs (c_path, 32768, w_path); setenv ("PATH", c_path, 1); } } /* Called from dll_crt0_1, before fetching the command line from Windows. Set the internal charset according to the environment locale settings. Check if a required codepage is available, and only switch internal charset if so. Make sure to reset the application locale to "C" per POSIX. */ void initial_setlocale () { char *ret = _setlocale_r (_REENT, LC_CTYPE, ""); if (ret && check_codepage (ret) && strcmp (cygheap->locale.charset, __locale_charset ()) != 0) internal_setlocale (); } /* Like newlib's setlocale, but additionally check if the charset needs OS support and the required codepage is actually installed. If codepage is not available, revert to previous locale and return NULL. For details about codepage availability, see the comment in check_codepage() above. */ extern "C" char * setlocale (int category, const char *locale) { char old[(LC_MESSAGES + 1) * (ENCODING_LEN + 1/*"/"*/ + 1)]; if (locale && !wincap.has_always_all_codepages ()) stpcpy (old, _setlocale_r (_REENT, category, NULL)); char *ret = _setlocale_r (_REENT, category, locale); if (ret && locale && !(ret = check_codepage (ret))) _setlocale_r (_REENT, category, old); return ret; }