same as in commitid 1005B6CF36E3932B560, plus assorted cleanup

This commit is contained in:
tg 2018-08-10 02:53:39 +00:00
parent d5ce724471
commit 6cea111ef1
9 changed files with 36 additions and 35 deletions

18
check.t
View File

@ -1,4 +1,4 @@
# $MirOS: src/bin/mksh/check.t,v 1.807 2018/07/15 17:22:15 tg Exp $
# $MirOS: src/bin/mksh/check.t,v 1.808 2018/08/10 02:53:31 tg Exp $
# -*- mode: sh -*-
#-
# Copyright © 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
@ -2487,7 +2487,7 @@ expected-stdout:
name: glob-range-3
description:
Check that globbing matches the right things...
# breaks on Mac OSX (HFS+ non-standard Unicode canonical decomposition)
# breaks on Mac OSX (HFS+ non-standard UTF-8 canonical decomposition)
# breaks on Cygwin 1.7 (files are now UTF-16 or something)
# breaks on QNX 6.4.1 (says RT)
category: !os:cygwin,!os:darwin,!os:msys,!os:nto,!os:os2,!os:os390
@ -8512,7 +8512,7 @@ expected-stdout:
---
name: typeset-padding-3
description:
Check for a regression in which Unicode wasn’t left-padded right
Check for a regression in which UTF-8 wasn’t left-padded right
stdin:
set -U
nl=$'\n'
@ -8535,7 +8535,7 @@ description:
Check that the UTF-8 Byte Order Mark is ignored as the first
multibyte character of the shell input (with -c, from standard
input, as file, or as eval argument), but nowhere else
# breaks on Mac OSX (HFS+ non-standard Unicode canonical decomposition)
# breaks on Mac OSX (HFS+ non-standard UTF-8 canonical decomposition)
category: !os:darwin,!shell:ebcdic-yes
stdin:
mkdir foo
@ -10414,7 +10414,7 @@ expected-stdout:
---
name: integer-base-one-3Ws
description:
some sample code for hexdumping Unicode
some sample code for hexdumping UCS-2
not NUL safe; input lines must be NL terminated
stdin:
set -U
@ -10582,7 +10582,7 @@ expected-stdout:
---
name: integer-base-one-3Wr
description:
some sample code for hexdumping Unicode; NUL and binary safe
some sample code for hexdumping UCS-2; NUL and binary safe
stdin:
set -U
{
@ -10702,7 +10702,7 @@ expected-stdout:
---
name: integer-base-one-5A
description:
Check to see that we’re NUL and Unicode safe
Check to see that we’re NUL and UCS safe
category: !shell:ebcdic-yes
stdin:
set +U
@ -10716,7 +10716,7 @@ expected-stdout:
---
name: integer-base-one-5E
description:
Check to see that we’re NUL and Unicode safe
Check to see that we’re NUL and UCS safe
category: !shell:ebcdic-no
stdin:
set +U
@ -10730,7 +10730,7 @@ expected-stdout:
---
name: integer-base-one-5W
description:
Check to see that we’re NUL and Unicode safe
Check to see that we’re NUL and UCS safe
stdin:
set -U
print 'a\0b€c' >x

6
expr.c
View File

@ -23,7 +23,7 @@
#include "sh.h"
__RCSID("$MirOS: src/bin/mksh/expr.c,v 1.104 2018/06/26 21:22:21 tg Exp $");
__RCSID("$MirOS: src/bin/mksh/expr.c,v 1.105 2018/08/10 02:53:33 tg Exp $");
#define EXPRTOK_DEFNS
#include "exprtok.h"
@ -885,7 +885,7 @@ static int mb_ucsbsearch(const struct mb_ucsrange arr[], size_t elems,
unsigned int val) MKSH_A_PURE;
/*
* Generated from the Unicode Character Database, Version 11.0.0, by
* Generated from the UCD 11.0.0 by
* MirOS: contrib/code/Snippets/eawparse,v 1.12 2017/09/06 16:05:45 tg Exp $
*/
@ -1177,7 +1177,7 @@ mb_ucsbsearch(const struct mb_ucsrange arr[], size_t elems, unsigned int val)
return (0);
}
/* Unix column width of a wide character (Unicode code point, really) */
/* Unix column width of a wide character (UCS code point, really) */
int
utf_wcwidth(unsigned int wc)
{

View File

@ -38,7 +38,7 @@
#endif
#endif
__RCSID("$MirOS: src/bin/mksh/funcs.c,v 1.353 2018/01/14 01:26:49 tg Exp $");
__RCSID("$MirOS: src/bin/mksh/funcs.c,v 1.354 2018/08/10 02:53:34 tg Exp $");
#if HAVE_KILLPG
/*
@ -493,7 +493,7 @@ c_print(const char **wp)
Xput(xs, xp, '\\');
}
} else if ((unsigned int)c > 0xFF) {
/* generic function returned Unicode */
/* generic function returned UCS */
po.ts[utf_wctomb(po.ts, c - 0x100)] = 0;
c = 0;
do {

10
misc.c
View File

@ -32,7 +32,7 @@
#include <grp.h>
#endif
__RCSID("$MirOS: src/bin/mksh/misc.c,v 1.292 2018/03/17 22:46:09 tg Exp $");
__RCSID("$MirOS: src/bin/mksh/misc.c,v 1.293 2018/08/10 02:53:35 tg Exp $");
#define KSH_CHVT_FLAG
#ifdef MKSH_SMALL
@ -2456,7 +2456,7 @@ getrusage(int what, struct rusage *ru)
* and fp (put back a char) for backslash escapes,
* assuming the first call to *fg gets the char di-
* rectly after the backslash; return the character
* (0..0xFF), Unicode (wc + 0x100), or -1 if no known
* (0..0xFF), UCS (wc + 0x100), or -1 if no known
* escape sequence was found
*/
int
@ -2538,9 +2538,9 @@ unbksl(bool cstyle, int (*fg)(void), void (*fp)(int))
/**
* x: look for a hexadecimal number with up to
* two (C style: arbitrary) digits; convert
* to raw octet (C style: Unicode if >0xFF)
* to raw octet (C style: UCS if >0xFF)
* u/U: look for a hexadecimal number with up to
* four (U: eight) digits; convert to Unicode
* four (U: eight) digits; convert to UCS
*/
wc = 0;
n = 0;
@ -2562,7 +2562,7 @@ unbksl(bool cstyle, int (*fg)(void), void (*fp)(int))
if (!n)
goto unknown_escape;
if ((cstyle && wc > 0xFF) || fc != 'x')
/* Unicode marker */
/* UCS marker */
wc += 0x100;
break;
case '\'':

17
mksh.1
View File

@ -1,4 +1,4 @@
.\" $MirOS: src/bin/mksh/mksh.1,v 1.458 2018/07/15 17:21:22 tg Exp $
.\" $MirOS: src/bin/mksh/mksh.1,v 1.459 2018/08/10 02:53:36 tg Exp $
.\" $OpenBSD: ksh.1,v 1.160 2015/07/04 13:27:04 feinerer Exp $
.\"-
.\" Copyright © 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
@ -77,7 +77,7 @@
.\" with -mandoc, it might implement .Mx itself, but we want to
.\" use our own definition. And .Dd must come *first*, always.
.\"
.Dd $Mdocdate: July 15 2018 $
.Dd $Mdocdate: August 10 2018 $
.\"
.\" Check which macro package we use, and do other -mdoc setup.
.\"
@ -1047,7 +1047,7 @@ and
.Dq Li \eu#### ,
.Dq #
means a hexadecimal digit, of which there may be none up to four or eight;
these escapes translate a Unicode codepoint to UTF-8.
these escapes translate a Universal Coded Character Set codepoint to UTF-8.
Furthermore,
.Dq Li \eE
and
@ -1083,7 +1083,7 @@ and yield raw octets; hexadecimal sequences
greedily eat up as many hexadecimal digits
.Dq #
as they can and terminate with the first non-hexadecimal digit;
these translate a Unicode codepoint to UTF-8.
these translate a Universal Coded Character Set codepoint to UTF-8.
The sequence
.Dq Li \ec# ,
where
@ -2652,7 +2652,8 @@ as required by the standard), as that's unsafe to do.
As a special
.Nm mksh
extension, numbers to the base of one are treated as either (8-bit
transparent) ASCII or Unicode codepoints, depending on the shell's
transparent) ASCII or Universal Coded Character Set codepoints,
depending on the shell's
.Ic utf8\-mode
flag (current setting).
The
@ -2665,7 +2666,7 @@ instead of
is also supported.
Note that NUL bytes (integral value of zero) cannot be used.
An unset or empty parameter evaluates to 0 in integer context.
In Unicode mode, raw octets are mapped into the range EF80..EFFF as in
In UTF-8 mode, raw octets are mapped into the range EF80..EFFF as in
OPTU-8, which is in the PUA and has been assigned by CSUR for this use.
If more than one octet in ASCII mode, or a sequence of more than one
octet not forming a valid and minimal CESU-8 sequence is passed, the
@ -6658,7 +6659,7 @@ locale.
.Ic utf8\-mode
.Em must
be disabled in POSIX mode, and it
only supports the Unicode BMP (Basic Multilingual Plane) and maps
only supports the BMP (Basic Multilingual Plane) of UCS and maps
raw octets into the U+EF80..U+EFFF wide character range; compare
.Sx Arithmetic expressions .
The following
@ -6679,7 +6680,7 @@ case ${KSH_VERSION:\-} in
esac ;;
esac
.Ed
In near future, (Unicode) locale tracking will be implemented though.
In near future, (UTF-8) locale tracking will be implemented though.
.Pp
See also the FAQ below.
.Sh BUGS

4
sh.h
View File

@ -182,7 +182,7 @@
#endif
#ifdef EXTERN
__RCSID("$MirOS: src/bin/mksh/sh.h,v 1.864 2018/07/15 17:21:23 tg Exp $");
__RCSID("$MirOS: src/bin/mksh/sh.h,v 1.865 2018/08/10 02:53:37 tg Exp $");
#endif
#define MKSH_VERSION "R56 2018/07/15"
@ -783,7 +783,7 @@ enum sh_flag {
};
#define Flag(f) (shell_flags[(int)(f)])
#define UTFMODE Flag(FUNICODE)
#define UTFMODE Flag(FUNNYCODE)
/*
* parsing & execution environment

View File

@ -19,7 +19,7 @@
*/
@SHFLAGS_DEFNS
__RCSID("$MirOS: src/bin/mksh/sh_flags.opt,v 1.5 2017/02/18 02:33:15 tg Exp $");
__RCSID("$MirOS: src/bin/mksh/sh_flags.opt,v 1.6 2018/08/10 02:53:39 tg Exp $");
@SHFLAGS_ENUMS
#define FN(sname,cname,flags,ochar) cname,
#define F0(sname,cname,flags,ochar) cname = 0,
@ -153,7 +153,7 @@ FN("trackall", FTRACKALL, OF_ANY
/* -U enable UTF-8 processing (non-standard) */
>U|
FN("utf8-mode", FUNICODE, OF_ANY
FN("utf8-mode", FUNNYCODE, OF_ANY
/* -v echo input */
>v|

4
shf.c
View File

@ -27,7 +27,7 @@
#include "sh.h"
__RCSID("$MirOS: src/bin/mksh/shf.c,v 1.97 2018/01/14 01:28:16 tg Exp $");
__RCSID("$MirOS: src/bin/mksh/shf.c,v 1.98 2018/08/10 02:53:39 tg Exp $");
/* flags to shf_emptybuf() */
#define EB_READSW 0x01 /* about to switch to reading */
@ -1304,7 +1304,7 @@ ebcdic_init(void)
* and the C1 control characters other than NEL are
* hopeless, but we map EBCDIC NEL to ASCII LF so we
* cannot even use C1 NEL.
* If ever we map to Unicode, bump the table width to
* If ever we map to UCS, bump the table width to
* an unsigned int, and or the raw unconverted EBCDIC
* values with 0x01000000 instead.
*/

4
tree.c
View File

@ -23,7 +23,7 @@
#include "sh.h"
__RCSID("$MirOS: src/bin/mksh/tree.c,v 1.95 2018/01/14 00:03:05 tg Exp $");
__RCSID("$MirOS: src/bin/mksh/tree.c,v 1.96 2018/08/10 02:53:39 tg Exp $");
#define INDENT 8
@ -808,7 +808,7 @@ vistree(char *dst, size_t sz, struct op *t)
c = ksh_unctrl(c);
} else if (UTFMODE && rtt2asc(c) > 0x7F) {
/* better not try to display broken multibyte chars */
/* also go easy on the Unicode: no U+FFFD here */
/* also go easy on the UCS: no U+FFFD here */
c = ORD('?');
}
*dst++ = c;