same as in commitid 1005B6CF36E3932B560, plus assorted cleanup

This commit is contained in:
tg 2018-08-10 02:53:39 +00:00
parent d5ce724471
commit 6cea111ef1
9 changed files with 36 additions and 35 deletions

18
check.t
View File

@ -1,4 +1,4 @@
# $MirOS: src/bin/mksh/check.t,v 1.807 2018/07/15 17:22:15 tg Exp $ # $MirOS: src/bin/mksh/check.t,v 1.808 2018/08/10 02:53:31 tg Exp $
# -*- mode: sh -*- # -*- mode: sh -*-
#- #-
# Copyright © 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, # Copyright © 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
@ -2487,7 +2487,7 @@ expected-stdout:
name: glob-range-3 name: glob-range-3
description: description:
Check that globbing matches the right things... Check that globbing matches the right things...
# breaks on Mac OSX (HFS+ non-standard Unicode canonical decomposition) # breaks on Mac OSX (HFS+ non-standard UTF-8 canonical decomposition)
# breaks on Cygwin 1.7 (files are now UTF-16 or something) # breaks on Cygwin 1.7 (files are now UTF-16 or something)
# breaks on QNX 6.4.1 (says RT) # breaks on QNX 6.4.1 (says RT)
category: !os:cygwin,!os:darwin,!os:msys,!os:nto,!os:os2,!os:os390 category: !os:cygwin,!os:darwin,!os:msys,!os:nto,!os:os2,!os:os390
@ -8512,7 +8512,7 @@ expected-stdout:
--- ---
name: typeset-padding-3 name: typeset-padding-3
description: description:
Check for a regression in which Unicode wasn't left-padded right Check for a regression in which UTF-8 wasn't left-padded right
stdin: stdin:
set -U set -U
nl=$'\n' nl=$'\n'
@ -8535,7 +8535,7 @@ description:
Check that the UTF-8 Byte Order Mark is ignored as the first Check that the UTF-8 Byte Order Mark is ignored as the first
multibyte character of the shell input (with -c, from standard multibyte character of the shell input (with -c, from standard
input, as file, or as eval argument), but nowhere else input, as file, or as eval argument), but nowhere else
# breaks on Mac OSX (HFS+ non-standard Unicode canonical decomposition) # breaks on Mac OSX (HFS+ non-standard UTF-8 canonical decomposition)
category: !os:darwin,!shell:ebcdic-yes category: !os:darwin,!shell:ebcdic-yes
stdin: stdin:
mkdir foo mkdir foo
@ -10414,7 +10414,7 @@ expected-stdout:
--- ---
name: integer-base-one-3Ws name: integer-base-one-3Ws
description: description:
some sample code for hexdumping Unicode some sample code for hexdumping UCS-2
not NUL safe; input lines must be NL terminated not NUL safe; input lines must be NL terminated
stdin: stdin:
set -U set -U
@ -10582,7 +10582,7 @@ expected-stdout:
--- ---
name: integer-base-one-3Wr name: integer-base-one-3Wr
description: description:
some sample code for hexdumping Unicode; NUL and binary safe some sample code for hexdumping UCS-2; NUL and binary safe
stdin: stdin:
set -U set -U
{ {
@ -10702,7 +10702,7 @@ expected-stdout:
--- ---
name: integer-base-one-5A name: integer-base-one-5A
description: description:
Check to see that we're NUL and Unicode safe Check to see that we're NUL and UCS safe
category: !shell:ebcdic-yes category: !shell:ebcdic-yes
stdin: stdin:
set +U set +U
@ -10716,7 +10716,7 @@ expected-stdout:
--- ---
name: integer-base-one-5E name: integer-base-one-5E
description: description:
Check to see that we're NUL and Unicode safe Check to see that we're NUL and UCS safe
category: !shell:ebcdic-no category: !shell:ebcdic-no
stdin: stdin:
set +U set +U
@ -10730,7 +10730,7 @@ expected-stdout:
--- ---
name: integer-base-one-5W name: integer-base-one-5W
description: description:
Check to see that we're NUL and Unicode safe Check to see that we're NUL and UCS safe
stdin: stdin:
set -U set -U
print 'a\0b€c' >x print 'a\0b€c' >x

6
expr.c
View File

@ -23,7 +23,7 @@
#include "sh.h" #include "sh.h"
__RCSID("$MirOS: src/bin/mksh/expr.c,v 1.104 2018/06/26 21:22:21 tg Exp $"); __RCSID("$MirOS: src/bin/mksh/expr.c,v 1.105 2018/08/10 02:53:33 tg Exp $");
#define EXPRTOK_DEFNS #define EXPRTOK_DEFNS
#include "exprtok.h" #include "exprtok.h"
@ -885,7 +885,7 @@ static int mb_ucsbsearch(const struct mb_ucsrange arr[], size_t elems,
unsigned int val) MKSH_A_PURE; unsigned int val) MKSH_A_PURE;
/* /*
* Generated from the Unicode Character Database, Version 11.0.0, by * Generated from the UCD 11.0.0 by
* MirOS: contrib/code/Snippets/eawparse,v 1.12 2017/09/06 16:05:45 tg Exp $ * MirOS: contrib/code/Snippets/eawparse,v 1.12 2017/09/06 16:05:45 tg Exp $
*/ */
@ -1177,7 +1177,7 @@ mb_ucsbsearch(const struct mb_ucsrange arr[], size_t elems, unsigned int val)
return (0); return (0);
} }
/* Unix column width of a wide character (Unicode code point, really) */ /* Unix column width of a wide character (UCS code point, really) */
int int
utf_wcwidth(unsigned int wc) utf_wcwidth(unsigned int wc)
{ {

View File

@ -38,7 +38,7 @@
#endif #endif
#endif #endif
__RCSID("$MirOS: src/bin/mksh/funcs.c,v 1.353 2018/01/14 01:26:49 tg Exp $"); __RCSID("$MirOS: src/bin/mksh/funcs.c,v 1.354 2018/08/10 02:53:34 tg Exp $");
#if HAVE_KILLPG #if HAVE_KILLPG
/* /*
@ -493,7 +493,7 @@ c_print(const char **wp)
Xput(xs, xp, '\\'); Xput(xs, xp, '\\');
} }
} else if ((unsigned int)c > 0xFF) { } else if ((unsigned int)c > 0xFF) {
/* generic function returned Unicode */ /* generic function returned UCS */
po.ts[utf_wctomb(po.ts, c - 0x100)] = 0; po.ts[utf_wctomb(po.ts, c - 0x100)] = 0;
c = 0; c = 0;
do { do {

10
misc.c
View File

@ -32,7 +32,7 @@
#include <grp.h> #include <grp.h>
#endif #endif
__RCSID("$MirOS: src/bin/mksh/misc.c,v 1.292 2018/03/17 22:46:09 tg Exp $"); __RCSID("$MirOS: src/bin/mksh/misc.c,v 1.293 2018/08/10 02:53:35 tg Exp $");
#define KSH_CHVT_FLAG #define KSH_CHVT_FLAG
#ifdef MKSH_SMALL #ifdef MKSH_SMALL
@ -2456,7 +2456,7 @@ getrusage(int what, struct rusage *ru)
* and fp (put back a char) for backslash escapes, * and fp (put back a char) for backslash escapes,
* assuming the first call to *fg gets the char di- * assuming the first call to *fg gets the char di-
* rectly after the backslash; return the character * rectly after the backslash; return the character
* (0..0xFF), Unicode (wc + 0x100), or -1 if no known * (0..0xFF), UCS (wc + 0x100), or -1 if no known
* escape sequence was found * escape sequence was found
*/ */
int int
@ -2538,9 +2538,9 @@ unbksl(bool cstyle, int (*fg)(void), void (*fp)(int))
/** /**
* x: look for a hexadecimal number with up to * x: look for a hexadecimal number with up to
* two (C style: arbitrary) digits; convert * two (C style: arbitrary) digits; convert
* to raw octet (C style: Unicode if >0xFF) * to raw octet (C style: UCS if >0xFF)
* u/U: look for a hexadecimal number with up to * u/U: look for a hexadecimal number with up to
* four (U: eight) digits; convert to Unicode * four (U: eight) digits; convert to UCS
*/ */
wc = 0; wc = 0;
n = 0; n = 0;
@ -2562,7 +2562,7 @@ unbksl(bool cstyle, int (*fg)(void), void (*fp)(int))
if (!n) if (!n)
goto unknown_escape; goto unknown_escape;
if ((cstyle && wc > 0xFF) || fc != 'x') if ((cstyle && wc > 0xFF) || fc != 'x')
/* Unicode marker */ /* UCS marker */
wc += 0x100; wc += 0x100;
break; break;
case '\'': case '\'':

17
mksh.1
View File

@ -1,4 +1,4 @@
.\" $MirOS: src/bin/mksh/mksh.1,v 1.458 2018/07/15 17:21:22 tg Exp $ .\" $MirOS: src/bin/mksh/mksh.1,v 1.459 2018/08/10 02:53:36 tg Exp $
.\" $OpenBSD: ksh.1,v 1.160 2015/07/04 13:27:04 feinerer Exp $ .\" $OpenBSD: ksh.1,v 1.160 2015/07/04 13:27:04 feinerer Exp $
.\"- .\"-
.\" Copyright © 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, .\" Copyright © 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
@ -77,7 +77,7 @@
.\" with -mandoc, it might implement .Mx itself, but we want to .\" with -mandoc, it might implement .Mx itself, but we want to
.\" use our own definition. And .Dd must come *first*, always. .\" use our own definition. And .Dd must come *first*, always.
.\" .\"
.Dd $Mdocdate: July 15 2018 $ .Dd $Mdocdate: August 10 2018 $
.\" .\"
.\" Check which macro package we use, and do other -mdoc setup. .\" Check which macro package we use, and do other -mdoc setup.
.\" .\"
@ -1047,7 +1047,7 @@ and
.Dq Li \eu#### , .Dq Li \eu#### ,
.Dq # .Dq #
means a hexadecimal digit, of which there may be none up to four or eight; means a hexadecimal digit, of which there may be none up to four or eight;
these escapes translate a Unicode codepoint to UTF-8. these escapes translate a Universal Coded Character Set codepoint to UTF-8.
Furthermore, Furthermore,
.Dq Li \eE .Dq Li \eE
and and
@ -1083,7 +1083,7 @@ and yield raw octets; hexadecimal sequences
greedily eat up as many hexadecimal digits greedily eat up as many hexadecimal digits
.Dq # .Dq #
as they can and terminate with the first non-hexadecimal digit; as they can and terminate with the first non-hexadecimal digit;
these translate a Unicode codepoint to UTF-8. these translate a Universal Coded Character Set codepoint to UTF-8.
The sequence The sequence
.Dq Li \ec# , .Dq Li \ec# ,
where where
@ -2652,7 +2652,8 @@ as required by the standard), as that's unsafe to do.
As a special As a special
.Nm mksh .Nm mksh
extension, numbers to the base of one are treated as either (8-bit extension, numbers to the base of one are treated as either (8-bit
transparent) ASCII or Unicode codepoints, depending on the shell's transparent) ASCII or Universal Coded Character Set codepoints,
depending on the shell's
.Ic utf8\-mode .Ic utf8\-mode
flag (current setting). flag (current setting).
The The
@ -2665,7 +2666,7 @@ instead of
is also supported. is also supported.
Note that NUL bytes (integral value of zero) cannot be used. Note that NUL bytes (integral value of zero) cannot be used.
An unset or empty parameter evaluates to 0 in integer context. An unset or empty parameter evaluates to 0 in integer context.
In Unicode mode, raw octets are mapped into the range EF80..EFFF as in In UTF-8 mode, raw octets are mapped into the range EF80..EFFF as in
OPTU-8, which is in the PUA and has been assigned by CSUR for this use. OPTU-8, which is in the PUA and has been assigned by CSUR for this use.
If more than one octet in ASCII mode, or a sequence of more than one If more than one octet in ASCII mode, or a sequence of more than one
octet not forming a valid and minimal CESU-8 sequence is passed, the octet not forming a valid and minimal CESU-8 sequence is passed, the
@ -6658,7 +6659,7 @@ locale.
.Ic utf8\-mode .Ic utf8\-mode
.Em must .Em must
be disabled in POSIX mode, and it be disabled in POSIX mode, and it
only supports the Unicode BMP (Basic Multilingual Plane) and maps only supports the BMP (Basic Multilingual Plane) of UCS and maps
raw octets into the U+EF80..U+EFFF wide character range; compare raw octets into the U+EF80..U+EFFF wide character range; compare
.Sx Arithmetic expressions . .Sx Arithmetic expressions .
The following The following
@ -6679,7 +6680,7 @@ case ${KSH_VERSION:\-} in
esac ;; esac ;;
esac esac
.Ed .Ed
In near future, (Unicode) locale tracking will be implemented though. In near future, (UTF-8) locale tracking will be implemented though.
.Pp .Pp
See also the FAQ below. See also the FAQ below.
.Sh BUGS .Sh BUGS

4
sh.h
View File

@ -182,7 +182,7 @@
#endif #endif
#ifdef EXTERN #ifdef EXTERN
__RCSID("$MirOS: src/bin/mksh/sh.h,v 1.864 2018/07/15 17:21:23 tg Exp $"); __RCSID("$MirOS: src/bin/mksh/sh.h,v 1.865 2018/08/10 02:53:37 tg Exp $");
#endif #endif
#define MKSH_VERSION "R56 2018/07/15" #define MKSH_VERSION "R56 2018/07/15"
@ -783,7 +783,7 @@ enum sh_flag {
}; };
#define Flag(f) (shell_flags[(int)(f)]) #define Flag(f) (shell_flags[(int)(f)])
#define UTFMODE Flag(FUNICODE) #define UTFMODE Flag(FUNNYCODE)
/* /*
* parsing & execution environment * parsing & execution environment

View File

@ -19,7 +19,7 @@
*/ */
@SHFLAGS_DEFNS @SHFLAGS_DEFNS
__RCSID("$MirOS: src/bin/mksh/sh_flags.opt,v 1.5 2017/02/18 02:33:15 tg Exp $"); __RCSID("$MirOS: src/bin/mksh/sh_flags.opt,v 1.6 2018/08/10 02:53:39 tg Exp $");
@SHFLAGS_ENUMS @SHFLAGS_ENUMS
#define FN(sname,cname,flags,ochar) cname, #define FN(sname,cname,flags,ochar) cname,
#define F0(sname,cname,flags,ochar) cname = 0, #define F0(sname,cname,flags,ochar) cname = 0,
@ -153,7 +153,7 @@ FN("trackall", FTRACKALL, OF_ANY
/* -U enable UTF-8 processing (non-standard) */ /* -U enable UTF-8 processing (non-standard) */
>U| >U|
FN("utf8-mode", FUNICODE, OF_ANY FN("utf8-mode", FUNNYCODE, OF_ANY
/* -v echo input */ /* -v echo input */
>v| >v|

4
shf.c
View File

@ -27,7 +27,7 @@
#include "sh.h" #include "sh.h"
__RCSID("$MirOS: src/bin/mksh/shf.c,v 1.97 2018/01/14 01:28:16 tg Exp $"); __RCSID("$MirOS: src/bin/mksh/shf.c,v 1.98 2018/08/10 02:53:39 tg Exp $");
/* flags to shf_emptybuf() */ /* flags to shf_emptybuf() */
#define EB_READSW 0x01 /* about to switch to reading */ #define EB_READSW 0x01 /* about to switch to reading */
@ -1304,7 +1304,7 @@ ebcdic_init(void)
* and the C1 control characters other than NEL are * and the C1 control characters other than NEL are
* hopeless, but we map EBCDIC NEL to ASCII LF so we * hopeless, but we map EBCDIC NEL to ASCII LF so we
* cannot even use C1 NEL. * cannot even use C1 NEL.
* If ever we map to Unicode, bump the table width to * If ever we map to UCS, bump the table width to
* an unsigned int, and or the raw unconverted EBCDIC * an unsigned int, and or the raw unconverted EBCDIC
* values with 0x01000000 instead. * values with 0x01000000 instead.
*/ */

4
tree.c
View File

@ -23,7 +23,7 @@
#include "sh.h" #include "sh.h"
__RCSID("$MirOS: src/bin/mksh/tree.c,v 1.95 2018/01/14 00:03:05 tg Exp $"); __RCSID("$MirOS: src/bin/mksh/tree.c,v 1.96 2018/08/10 02:53:39 tg Exp $");
#define INDENT 8 #define INDENT 8
@ -808,7 +808,7 @@ vistree(char *dst, size_t sz, struct op *t)
c = ksh_unctrl(c); c = ksh_unctrl(c);
} else if (UTFMODE && rtt2asc(c) > 0x7F) { } else if (UTFMODE && rtt2asc(c) > 0x7F) {
/* better not try to display broken multibyte chars */ /* better not try to display broken multibyte chars */
/* also go easy on the Unicode: no U+FFFD here */ /* also go easy on the UCS: no U+FFFD here */
c = ORD('?'); c = ORD('?');
} }
*dst++ = c; *dst++ = c;