* miscfuncs.h (transform_chars): Declare. Define inline variation here.
* mount.cc (mount_info::from_fstab): Remove extern declaration of transform_chars. * path.cc (tfx_chars): Move to strfuncs.cc. (transform_chars): Ditto. * strfuncs.cc (tfx_chars): Moved here from path.cc. (transform_chars): Ditto. (sys_cp_wcstombs): Make UNICODE private use area conversion roundtrip safe for all characters. (sys_cp_mbstowcs): Ditto, by removing special case for UTF-8 sequences representing U+f0XX UNICODE chars. Fix typo in comment.
This commit is contained in:
		@@ -1,3 +1,17 @@
 | 
			
		||||
2009-11-02  Corinna Vinschen  <corinna@vinschen.de>
 | 
			
		||||
 | 
			
		||||
	* miscfuncs.h (transform_chars): Declare.  Define inline variation here.
 | 
			
		||||
	* mount.cc (mount_info::from_fstab): Remove extern declaration of
 | 
			
		||||
	transform_chars.
 | 
			
		||||
	* path.cc (tfx_chars): Move to strfuncs.cc.
 | 
			
		||||
	(transform_chars): Ditto.
 | 
			
		||||
	* strfuncs.cc (tfx_chars): Moved here from path.cc.
 | 
			
		||||
	(transform_chars): Ditto.
 | 
			
		||||
	(sys_cp_wcstombs): Make UNICODE private use area conversion roundtrip
 | 
			
		||||
	safe for all characters.
 | 
			
		||||
	(sys_cp_mbstowcs): Ditto, by removing special case for UTF-8 sequences
 | 
			
		||||
	representing U+f0XX UNICODE chars.  Fix typo in comment.
 | 
			
		||||
 | 
			
		||||
2009-11-02  Corinna Vinschen  <corinna@vinschen.de>
 | 
			
		||||
 | 
			
		||||
	* path.cc (tfx_chars): Constify.
 | 
			
		||||
@@ -362,7 +376,7 @@
 | 
			
		||||
	(fhandler_console::write_normal): Always use codepage 437 for alternate
 | 
			
		||||
	charset.  Otherwise always default to the current internal locale.
 | 
			
		||||
	Replace ASCII SO with ASCII CAN.
 | 
			
		||||
	* strfuncs.cc: Tweka comments according to below changes.
 | 
			
		||||
	* strfuncs.cc: Tweak comments according to below changes.
 | 
			
		||||
	(sys_cp_wcstombs): Constify charset parameter.  Convert all wchar_t
 | 
			
		||||
	values in the Unicode private use area U+F0xx to the singlebyte
 | 
			
		||||
	counterpart.  Drop special handling creating ASCII SO sequence from
 | 
			
		||||
 
 | 
			
		||||
@@ -25,6 +25,14 @@ void backslashify (const char *, char *, bool);
 | 
			
		||||
void slashify (const char *, char *, bool);
 | 
			
		||||
#define isslash(c) ((c) == '/')
 | 
			
		||||
 | 
			
		||||
extern void transform_chars (PWCHAR, PWCHAR);
 | 
			
		||||
inline void
 | 
			
		||||
transform_chars (PUNICODE_STRING upath, USHORT start_idx)
 | 
			
		||||
{
 | 
			
		||||
  transform_chars (upath->Buffer + start_idx,
 | 
			
		||||
		   upath->Buffer + upath->Length / sizeof (WCHAR) - 1);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* Memory checking */
 | 
			
		||||
int __stdcall check_invalid_virtual_addr (const void *s, unsigned sz) __attribute__ ((regparm(2)));
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -997,7 +997,6 @@ mount_info::from_fstab (bool user, WCHAR fstab[], PWCHAR fstab_end)
 | 
			
		||||
 | 
			
		||||
  if (user)
 | 
			
		||||
    {
 | 
			
		||||
      extern void transform_chars (PWCHAR, PWCHAR);
 | 
			
		||||
      PWCHAR username;
 | 
			
		||||
      sys_mbstowcs (username = wcpcpy (fstab_end, L".d\\"),
 | 
			
		||||
		    NT_MAX_PATH - (fstab_end - fstab),
 | 
			
		||||
 
 | 
			
		||||
@@ -395,63 +395,6 @@ path_conv::set_normalized_path (const char *path_copy)
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* Transform characters invalid for Windows filenames to the Unicode private
 | 
			
		||||
   use area in the U+f0XX range.  The affected characters are all control
 | 
			
		||||
   chars 1 <= c <= 31, as well as the characters " * : < > ? |.  The backslash
 | 
			
		||||
   is affected as well, but we can't transform it as long as we accept Win32
 | 
			
		||||
   paths as input.
 | 
			
		||||
   The reverse functionality is in strfuncs.cc, function sys_cp_wcstombs. */
 | 
			
		||||
static const WCHAR tfx_chars[] = {
 | 
			
		||||
            0, 0xf000 |   1, 0xf000 |   2, 0xf000 |   3,
 | 
			
		||||
 0xf000 |   4, 0xf000 |   5, 0xf000 |   6, 0xf000 |   7,
 | 
			
		||||
 0xf000 |   8, 0xf000 |   9, 0xf000 |  10, 0xf000 |  11,
 | 
			
		||||
 0xf000 |  12, 0xf000 |  13, 0xf000 |  14, 0xf000 |  15,
 | 
			
		||||
 0xf000 |  16, 0xf000 |  17, 0xf000 |  18, 0xf000 |  19,
 | 
			
		||||
 0xf000 |  20, 0xf000 |  21, 0xf000 |  22, 0xf000 |  23,
 | 
			
		||||
 0xf000 |  24, 0xf000 |  25, 0xf000 |  26, 0xf000 |  27,
 | 
			
		||||
 0xf000 |  28, 0xf000 |  29, 0xf000 |  30, 0xf000 |  31,
 | 
			
		||||
          ' ',          '!', 0xf000 | '"',          '#',
 | 
			
		||||
          '$',          '%',          '&',           39,
 | 
			
		||||
          '(',          ')', 0xf000 | '*',          '+',
 | 
			
		||||
          ',',          '-',          '.',          '\\',
 | 
			
		||||
          '0',          '1',          '2',          '3',
 | 
			
		||||
          '4',          '5',          '6',          '7',
 | 
			
		||||
          '8',          '9', 0xf000 | ':',          ';',
 | 
			
		||||
 0xf000 | '<',          '=', 0xf000 | '>', 0xf000 | '?',
 | 
			
		||||
          '@',          'A',          'B',          'C',
 | 
			
		||||
          'D',          'E',          'F',          'G',
 | 
			
		||||
          'H',          'I',          'J',          'K',
 | 
			
		||||
          'L',          'M',          'N',          'O',
 | 
			
		||||
          'P',          'Q',          'R',          'S',
 | 
			
		||||
          'T',          'U',          'V',          'W',
 | 
			
		||||
          'X',          'Y',          'Z',          '[',
 | 
			
		||||
          '\\',          ']',          '^',          '_',
 | 
			
		||||
          '`',          'a',          'b',          'c',
 | 
			
		||||
          'd',          'e',          'f',          'g',
 | 
			
		||||
          'h',          'i',          'j',          'k',
 | 
			
		||||
          'l',          'm',          'n',          'o',
 | 
			
		||||
          'p',          'q',          'r',          's',
 | 
			
		||||
          't',          'u',          'v',          'w',
 | 
			
		||||
          'x',          'y',          'z',          '{',
 | 
			
		||||
 0xf000 | '|',          '}',          '~',          127
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
void
 | 
			
		||||
transform_chars (PWCHAR path, PWCHAR path_end)
 | 
			
		||||
{
 | 
			
		||||
  for (; path <= path_end; ++path)
 | 
			
		||||
    if (*path < 128)
 | 
			
		||||
      *path = tfx_chars[*path];
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline
 | 
			
		||||
void
 | 
			
		||||
transform_chars (PUNICODE_STRING upath, USHORT start_idx)
 | 
			
		||||
{
 | 
			
		||||
  transform_chars (upath->Buffer + start_idx,
 | 
			
		||||
		   upath->Buffer + upath->Length / sizeof (WCHAR) - 1);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline void
 | 
			
		||||
str2uni_cat (UNICODE_STRING &tgt, const char *srcstr)
 | 
			
		||||
{
 | 
			
		||||
 
 | 
			
		||||
@@ -22,6 +22,55 @@ details. */
 | 
			
		||||
#include "cygheap.h"
 | 
			
		||||
#include "tls_pbuf.h"
 | 
			
		||||
 | 
			
		||||
/* Transform characters invalid for Windows filenames to the Unicode private
 | 
			
		||||
   use area in the U+f0XX range.  The affected characters are all control
 | 
			
		||||
   chars 1 <= c <= 31, as well as the characters " * : < > ? |.  The backslash
 | 
			
		||||
   is affected as well, but we can't transform it as long as we accept Win32
 | 
			
		||||
   paths as input.
 | 
			
		||||
   The reverse functionality is in function sys_cp_wcstombs. */
 | 
			
		||||
static const WCHAR tfx_chars[] = {
 | 
			
		||||
            0, 0xf000 |   1, 0xf000 |   2, 0xf000 |   3,
 | 
			
		||||
 0xf000 |   4, 0xf000 |   5, 0xf000 |   6, 0xf000 |   7,
 | 
			
		||||
 0xf000 |   8, 0xf000 |   9, 0xf000 |  10, 0xf000 |  11,
 | 
			
		||||
 0xf000 |  12, 0xf000 |  13, 0xf000 |  14, 0xf000 |  15,
 | 
			
		||||
 0xf000 |  16, 0xf000 |  17, 0xf000 |  18, 0xf000 |  19,
 | 
			
		||||
 0xf000 |  20, 0xf000 |  21, 0xf000 |  22, 0xf000 |  23,
 | 
			
		||||
 0xf000 |  24, 0xf000 |  25, 0xf000 |  26, 0xf000 |  27,
 | 
			
		||||
 0xf000 |  28, 0xf000 |  29, 0xf000 |  30, 0xf000 |  31,
 | 
			
		||||
          ' ',          '!', 0xf000 | '"',          '#',
 | 
			
		||||
          '$',          '%',          '&',           39,
 | 
			
		||||
          '(',          ')', 0xf000 | '*',          '+',
 | 
			
		||||
          ',',          '-',          '.',          '\\',
 | 
			
		||||
          '0',          '1',          '2',          '3',
 | 
			
		||||
          '4',          '5',          '6',          '7',
 | 
			
		||||
          '8',          '9', 0xf000 | ':',          ';',
 | 
			
		||||
 0xf000 | '<',          '=', 0xf000 | '>', 0xf000 | '?',
 | 
			
		||||
          '@',          'A',          'B',          'C',
 | 
			
		||||
          'D',          'E',          'F',          'G',
 | 
			
		||||
          'H',          'I',          'J',          'K',
 | 
			
		||||
          'L',          'M',          'N',          'O',
 | 
			
		||||
          'P',          'Q',          'R',          'S',
 | 
			
		||||
          'T',          'U',          'V',          'W',
 | 
			
		||||
          'X',          'Y',          'Z',          '[',
 | 
			
		||||
          '\\',          ']',          '^',          '_',
 | 
			
		||||
          '`',          'a',          'b',          'c',
 | 
			
		||||
          'd',          'e',          'f',          'g',
 | 
			
		||||
          'h',          'i',          'j',          'k',
 | 
			
		||||
          'l',          'm',          'n',          'o',
 | 
			
		||||
          'p',          'q',          'r',          's',
 | 
			
		||||
          't',          'u',          'v',          'w',
 | 
			
		||||
          'x',          'y',          'z',          '{',
 | 
			
		||||
 0xf000 | '|',          '}',          '~',          127
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
void
 | 
			
		||||
transform_chars (PWCHAR path, PWCHAR path_end)
 | 
			
		||||
{
 | 
			
		||||
  for (; path <= path_end; ++path)
 | 
			
		||||
    if (*path < 128)
 | 
			
		||||
      *path = tfx_chars[*path];
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* The SJIS, JIS and eucJP conversion in newlib does not use UTF as
 | 
			
		||||
   wchar_t character representation.  That's unfortunate for us since
 | 
			
		||||
   we require UTF for the OS.  What we do here is to have our own
 | 
			
		||||
@@ -426,16 +475,19 @@ sys_cp_wcstombs (wctomb_p f_wctomb, const char *charset, char *dst, size_t len,
 | 
			
		||||
    {
 | 
			
		||||
      wchar_t pw = *pwcs;
 | 
			
		||||
      int bytes;
 | 
			
		||||
      unsigned char cwc;
 | 
			
		||||
 | 
			
		||||
      /* Convert UNICODE private use area.  Reverse functionality for the
 | 
			
		||||
         ASCII area <= 0x7f (only for path names) is transform_chars in
 | 
			
		||||
	 path.cc.  Reverse functionality for invalid bytes in a multibyte
 | 
			
		||||
	 sequence is in sys_cp_mbstowcs. */
 | 
			
		||||
      if ((pw & 0xff00) == 0xf000 && ((pw & 0xff) <= 0x7f || MB_CUR_MAX > 1))
 | 
			
		||||
         ASCII area <= 0x7f (only for path names) is transform_chars above.
 | 
			
		||||
	 Reverse functionality for invalid bytes in a multibyte sequence is
 | 
			
		||||
	 in sys_cp_mbstowcs below. */
 | 
			
		||||
      if ((pw & 0xff00) == 0xf000
 | 
			
		||||
	  && (((cwc = (pw & 0xff)) <= 0x7f && tfx_chars[cwc] >= 0xf000)
 | 
			
		||||
	      || (cwc >= 0x80 && MB_CUR_MAX > 1)))
 | 
			
		||||
	{
 | 
			
		||||
	  buf[0] = pw & 0xff;
 | 
			
		||||
	  buf[0] = (char) cwc;
 | 
			
		||||
	  bytes = 1;
 | 
			
		||||
      	}
 | 
			
		||||
	}
 | 
			
		||||
      else
 | 
			
		||||
	{
 | 
			
		||||
	  bytes = f_wctomb (_REENT, buf, pw, charset, &ps);
 | 
			
		||||
@@ -603,15 +655,14 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
 | 
			
		||||
	    }
 | 
			
		||||
	}
 | 
			
		||||
      else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
 | 
			
		||||
				  charset, &ps)) < 0
 | 
			
		||||
	       || (bytes == 3 && pmbs[0] == 0xef && (pmbs[1] & 0xf4) == 0x80))
 | 
			
		||||
				  charset, &ps)) < 0)
 | 
			
		||||
	{
 | 
			
		||||
	  /* The technique is based on a discussion here:
 | 
			
		||||
	     http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
 | 
			
		||||
 | 
			
		||||
	     Invalid bytes in a multibyte sequence are converted to
 | 
			
		||||
	     the private use area which is already used to store ASCII
 | 
			
		||||
	     chars invalid in Windows filenames.  This techinque allows 
 | 
			
		||||
	     chars invalid in Windows filenames.  This technique allows
 | 
			
		||||
	     to store them in a symmetric way. */
 | 
			
		||||
	  bytes = 1;
 | 
			
		||||
	  if (dst)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user