* miscfuncs.h (transform_chars): Declare. Define inline variation here.

* mount.cc (mount_info::from_fstab): Remove extern declaration of
	transform_chars.
	* path.cc (tfx_chars): Move to strfuncs.cc.
	(transform_chars): Ditto.
	* strfuncs.cc (tfx_chars): Moved here from path.cc.
	(transform_chars): Ditto.
	(sys_cp_wcstombs): Make UNICODE private use area conversion roundtrip
	safe for all characters.
	(sys_cp_mbstowcs): Ditto, by removing special case for UTF-8 sequences
	representing U+f0XX UNICODE chars.  Fix typo in comment.
This commit is contained in:
Corinna Vinschen
2009-11-02 11:42:04 +00:00
parent 9725900d86
commit a657970571
5 changed files with 83 additions and 68 deletions

View File

@@ -22,6 +22,55 @@ details. */
#include "cygheap.h"
#include "tls_pbuf.h"
/* Lookup table indexed by a 7-bit character value (0..127), yielding the
   character to use in its place.

   Control characters 1..31 and the characters " * : < > ? | are
   relocated into the Unicode private use area as 0xf000 | c.
   The slot for the forward slash holds a backslash.  The backslash
   itself is left alone: it cannot be relocated as long as Win32 paths
   are accepted as input.  Every other slot maps to itself.

   sys_cp_wcstombs performs the reverse mapping and consults this table
   to decide which U+f0XX values originate from it. */
static const WCHAR tfx_chars[] = {
  /* 0x00 */ 0,      0xf001, 0xf002, 0xf003, 0xf004, 0xf005, 0xf006, 0xf007,
  /* 0x08 */ 0xf008, 0xf009, 0xf00a, 0xf00b, 0xf00c, 0xf00d, 0xf00e, 0xf00f,
  /* 0x10 */ 0xf010, 0xf011, 0xf012, 0xf013, 0xf014, 0xf015, 0xf016, 0xf017,
  /* 0x18 */ 0xf018, 0xf019, 0xf01a, 0xf01b, 0xf01c, 0xf01d, 0xf01e, 0xf01f,
  /* 0x20 */ ' ', '!', 0xf022 /* double quote */, '#', '$', '%', '&', '\'',
  /* 0x28 */ '(', ')', 0xf02a /* asterisk */, '+', ',', '-', '.',
	     '\\' /* slot of the forward slash */,
  /* 0x30 */ '0', '1', '2', '3', '4', '5', '6', '7',
  /* 0x38 */ '8', '9', 0xf03a /* colon */, ';', 0xf03c /* less-than */, '=',
	     0xf03e /* greater-than */, 0xf03f /* question mark */,
  /* 0x40 */ '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
  /* 0x48 */ 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
  /* 0x50 */ 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
  /* 0x58 */ 'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
  /* 0x60 */ '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
  /* 0x68 */ 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  /* 0x70 */ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
  /* 0x78 */ 'x', 'y', 'z', '{', 0xf07c /* vertical bar */, '}', '~', 0x7f
};
/* Rewrite, in place, every wide character in the range from PATH up to
   and including PATH_END through the tfx_chars table.  Only values
   below 128 are looked up; all other characters are left untouched.
   Note that PATH_END designates the last character to process, not one
   past it. */
void
transform_chars (PWCHAR path, PWCHAR path_end)
{
  PWCHAR cur = path;

  while (cur <= path_end)
    {
      WCHAR ch = *cur;

      /* The table only covers the 7-bit range. */
      if (ch < 128)
	*cur = tfx_chars[ch];
      ++cur;
    }
}
/* The SJIS, JIS and eucJP conversion in newlib does not use UTF as
wchar_t character representation. That's unfortunate for us since
we require UTF for the OS. What we do here is to have our own
@@ -426,16 +475,19 @@ sys_cp_wcstombs (wctomb_p f_wctomb, const char *charset, char *dst, size_t len,
{
wchar_t pw = *pwcs;
int bytes;
unsigned char cwc;
/* Convert UNICODE private use area. Reverse functionality for the
ASCII area <= 0x7f (only for path names) is transform_chars in
path.cc. Reverse functionality for invalid bytes in a multibyte
sequence is in sys_cp_mbstowcs. */
if ((pw & 0xff00) == 0xf000 && ((pw & 0xff) <= 0x7f || MB_CUR_MAX > 1))
ASCII area <= 0x7f (only for path names) is transform_chars above.
Reverse functionality for invalid bytes in a multibyte sequence is
in sys_cp_mbstowcs below. */
if ((pw & 0xff00) == 0xf000
&& (((cwc = (pw & 0xff)) <= 0x7f && tfx_chars[cwc] >= 0xf000)
|| (cwc >= 0x80 && MB_CUR_MAX > 1)))
{
buf[0] = pw & 0xff;
buf[0] = (char) cwc;
bytes = 1;
}
}
else
{
bytes = f_wctomb (_REENT, buf, pw, charset, &ps);
@@ -603,15 +655,14 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
}
}
else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
charset, &ps)) < 0
|| (bytes == 3 && pmbs[0] == 0xef && (pmbs[1] & 0xf4) == 0x80))
charset, &ps)) < 0)
{
/* The technique is based on a discussion here:
http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
Invalid bytes in a multibyte sequence are converted to
the private use area which is already used to store ASCII
chars invalid in Windows filenames. This techinque allows
chars invalid in Windows filenames. This technique allows
to store them in a symmetric way. */
bytes = 1;
if (dst)