ok, so instead of removing the “-o utf8-hack” behaviour of the 1#* operator

we just tell the user to only throw valid CESU-8 or single octets on it
This commit is contained in:
tg 2008-04-20 01:12:52 +00:00
parent f80424e92e
commit c312619cc9
2 changed files with 72 additions and 120 deletions

184
check.t
View File

@ -1,4 +1,4 @@
# $MirOS: src/bin/mksh/check.t,v 1.184 2008/04/20 00:56:17 tg Exp $
# $MirOS: src/bin/mksh/check.t,v 1.185 2008/04/20 01:12:52 tg Exp $
# $OpenBSD: bksl-nl.t,v 1.2 2001/01/28 23:04:56 niklas Exp $
# $OpenBSD: history.t,v 1.5 2001/01/28 23:04:56 niklas Exp $
# $OpenBSD: read.t,v 1.3 2003/03/10 03:48:16 david Exp $
@ -4731,7 +4731,70 @@ expected-stderr-pattern:
/1#…: unexpected '€'/
expected-exit: e != 0
---
name: integer-base-one-3a
name: integer-base-one-2d1
description:
check if the use of fake integer base 1 handles octets okay
stdin:
set -o utf8-hack
typeset -i16 x=1#ÿ
print /$x/ # invalid utf-8
expected-stdout:
/16#efff/
---
name: integer-base-one-2d2
description:
check if the use of fake integer base 1 handles octets
stdin:
set -o utf8-hack
typeset -i16 x=1#Â
print /$x/ # invalid 2-byte
expected-stdout:
/16#efc2/
---
name: integer-base-one-2d3
description:
check if the use of fake integer base 1 handles octets
stdin:
set -o utf8-hack
typeset -i16 x=1#ï
print /$x/ # invalid 2-byte
expected-stdout:
/16#efef/
---
name: integer-base-one-2d4
description:
check if the use of fake integer base 1 stops at invalid input
stdin:
set -o utf8-hack
typeset -i16 x=1#ï¿À
print /$x/ # invalid 3-byte
expected-stderr-pattern:
/1#ï¿À: unexpected '¿'/
expected-exit: e != 0
---
name: integer-base-one-2d5
description:
check if the use of fake integer base 1 stops at invalid input
stdin:
set -o utf8-hack
typeset -i16 x=1#À€
print /$x/ # non-minimalistic
expected-stderr-pattern:
/1#À€: unexpected '€'/
expected-exit: e != 0
---
name: integer-base-one-2d6
description:
check if the use of fake integer base 1 stops at invalid input
stdin:
set -o utf8-hack
typeset -i16 x=1#à€€
print /$x/ # non-minimalistic
expected-stderr-pattern:
/1#à€€: unexpected '€'/
expected-exit: e != 0
---
name: integer-base-one-3
description:
some sample code for hexdumping
stdin:
@ -4798,120 +4861,3 @@ expected-stdout:
00000110 EF F0 F1 F2 F3 F4 F5 F6 - F7 F8 F9 FA FB FC FD FE |................|
00000120 FF 0A - |..|
---
name: integer-base-one-3b
description:
some sample code for hexdumping Unicode
stdin:
set -o utf8-hack
{
print 'Hello, World!\\\nこんにちは'
typeset -Uui16 i=0x100
# change that to 0xFF once we can handle embedded
# NUL characters in strings / here documents
while (( i++ < 0x1FF )); do
print -n "\u${i#16#1}"
done
print
print \\xff # invalid utf-8
print \\xc2 # invalid 2-byte
print \\xef\\xbf\\xc0 # invalid 3-byte
print \\xc0\\x80 # non-minimalistic
print \\xe0\\x80\\x80 # non-minimalistic
} | {
typeset -Uui16 -Z11 pos=0
typeset -Uui16 -Z5 hv
typeset -i1 wc=0x0A
dasc=
nl=${wc#1#}
integer n
while IFS= read -r line; do
line=$line$nl
while [[ -n $line ]]; do
(( hv = 1#${line::1} & 0xFF ))
if (( (hv < 0xC2) || (hv >= 0xF0) )); then
n=1
elif (( hv < 0xE0 )); then
n=2
else
n=3
fi
if (( n > 1 )); then
(( hv = 1#${line:1:1} & 0xFF ))
(( (hv & 0xC0) == 0x80 )) || n=1
fi
if (( n > 2 )); then
(( hv = 1#${line:2:1} & 0xFF ))
(( (hv & 0xC0) == 0x80 )) || n=1
fi
wc=1#${line::n}
if (( (wc & 0xFF80) == 0xEF80 )); then
#if (( ((n == 2) && ((wc < 0x80)) || \
# ((n == 3) && (wc < 0x800)) )); then
n=1
wc=1#${line::n}
fi
if (( (wc < 32) || \
((wc > 126) && (wc < 160)) )); then
dch=.
elif (( (wc & 0xFF80) == 0xEF80 )); then
dch=�
else
dch=${wc#1#}
fi
if (( (pos & 15) >= (n == 3 ? 14 : 15) )); then
dasc=$dasc$dch
dch=
fi
while (( n-- )); do
if (( (pos & 15) == 0 )); then
(( pos )) && print "$dasc|"
print -n "${pos#16#} "
dasc=' |'
fi
hv=1#${line::1}
print -n "${hv#16#} "
(( (pos++ & 15) == 7 )) && \
print -n -- '- '
line=${line:1}
done
dasc=$dasc$dch
done
done
if (( pos & 15 )); then
while (( pos & 15 )); do
print -n ' '
(( (pos++ & 15) == 7 )) && print -n -- '- '
done
print "$dasc|"
fi
}
expected-stdout:
00000000 48 65 6C 6C 6F 2C 20 57 - 6F 72 6C 64 21 5C 0A E3 |Hello, World!\.|
00000010 81 93 E3 82 93 E3 81 AB - E3 81 A1 E3 81 AF EF BC |んにちは|
00000020 81 0A 01 02 03 04 05 06 - 07 08 09 0A 0B 0C 0D 0E |...............|
00000030 0F 10 11 12 13 14 15 16 - 17 18 19 1A 1B 1C 1D 1E |................|
00000040 1F 20 21 22 23 24 25 26 - 27 28 29 2A 2B 2C 2D 2E |. !"#$%&'()*+,-.|
00000050 2F 30 31 32 33 34 35 36 - 37 38 39 3A 3B 3C 3D 3E |/0123456789:;<=>|
00000060 3F 40 41 42 43 44 45 46 - 47 48 49 4A 4B 4C 4D 4E |?@ABCDEFGHIJKLMN|
00000070 4F 50 51 52 53 54 55 56 - 57 58 59 5A 5B 5C 5D 5E |OPQRSTUVWXYZ[\]^|
00000080 5F 60 61 62 63 64 65 66 - 67 68 69 6A 6B 6C 6D 6E |_`abcdefghijklmn|
00000090 6F 70 71 72 73 74 75 76 - 77 78 79 7A 7B 7C 7D 7E |opqrstuvwxyz{|}~|
000000A0 7F C2 80 C2 81 C2 82 C2 - 83 C2 84 C2 85 C2 86 C2 |.........|
000000B0 87 C2 88 C2 89 C2 8A C2 - 8B C2 8C C2 8D C2 8E C2 |........|
000000C0 8F C2 90 C2 91 C2 92 C2 - 93 C2 94 C2 95 C2 96 C2 |........|
000000D0 97 C2 98 C2 99 C2 9A C2 - 9B C2 9C C2 9D C2 9E C2 |........|
000000E0 9F C2 A0 C2 A1 C2 A2 C2 - A3 C2 A4 C2 A5 C2 A6 C2 | ¡¢£¤¥¦§|
000000F0 A7 C2 A8 C2 A9 C2 AA C2 - AB C2 AC C2 AD C2 AE C2 |¨©ª«¬­®¯|
00000100 AF C2 B0 C2 B1 C2 B2 C2 - B3 C2 B4 C2 B5 C2 B6 C2 |°±²³´µ·|
00000110 B7 C2 B8 C2 B9 C2 BA C2 - BB C2 BC C2 BD C2 BE C2 |¸¹º»¼½¾¿|
00000120 BF C3 80 C3 81 C3 82 C3 - 83 C3 84 C3 85 C3 86 C3 |ÀÁÂÃÄÅÆÇ|
00000130 87 C3 88 C3 89 C3 8A C3 - 8B C3 8C C3 8D C3 8E C3 |ÈÉÊËÌÍÎÏ|
00000140 8F C3 90 C3 91 C3 92 C3 - 93 C3 94 C3 95 C3 96 C3 |ÐÑÒÓÔÕÖ×|
00000150 97 C3 98 C3 99 C3 9A C3 - 9B C3 9C C3 9D C3 9E C3 |ØÙÚÛÜÝÞß|
00000160 9F C3 A0 C3 A1 C3 A2 C3 - A3 C3 A4 C3 A5 C3 A6 C3 |àáâãäåæç|
00000170 A7 C3 A8 C3 A9 C3 AA C3 - AB C3 AC C3 AD C3 AE C3 |èéêëìíîï|
00000180 AF C3 B0 C3 B1 C3 B2 C3 - B3 C3 B4 C3 B5 C3 B6 C3 |ðñòóôõö÷|
00000190 B7 C3 B8 C3 B9 C3 BA C3 - BB C3 BC C3 BD C3 BE C3 |øùúûüýþÿ|
000001A0 BF 0A FF 0A C2 0A EF BF - C0 0A C0 80 0A E0 80 80 |.�.�.���.��.���|
000001B0 0A - |.|
---

8
mksh.1
View File

@ -1,4 +1,4 @@
.\" $MirOS: src/bin/mksh/mksh.1,v 1.120 2008/04/20 00:24:25 tg Exp $
.\" $MirOS: src/bin/mksh/mksh.1,v 1.121 2008/04/20 01:12:52 tg Exp $
.\" $OpenBSD: ksh.1,v 1.121 2008/03/21 12:51:19 millert Exp $
.\"-
.\" Try to make GNU groff and AT&T nroff more compatible
@ -2118,6 +2118,12 @@ transparent) ASCII or Unicode codepoints, depending on the shell's
flag (current setting).
In Unicode mode, raw octets are mapped into the range EF80..EFFF,
which is in the PUA and has been assigned by CSUR for this use.
If more than one octet is passed in ASCII mode, or if a sequence of
more than one octet that does not form a valid and minimal CESU-8
sequence is passed in Unicode mode, the behaviour is undefined
(usually, the shell aborts with a parse error; rarely, it succeeds,
e.g. on the sequence C2 20).
For this reason, always use ASCII mode unless you know that the
input is well-formed CESU-8 (UTF-8 restricted to the BMP, 0000..FFFD).
.Pp
The operators are evaluated as follows:
.Bl -tag -width Ds -offset indent