switch EBCDIC to “nega-UTF8”

This commit is contained in:
tg 2017-05-05 20:36:03 +00:00
parent 6dc1ab0379
commit cc725e67ca
6 changed files with 49 additions and 38 deletions

28
edit.c
View File

@ -28,7 +28,7 @@
#ifndef MKSH_NO_CMDLINE_EDITING #ifndef MKSH_NO_CMDLINE_EDITING
__RCSID("$MirOS: src/bin/mksh/edit.c,v 1.335 2017/04/29 22:04:26 tg Exp $"); __RCSID("$MirOS: src/bin/mksh/edit.c,v 1.336 2017/05/05 20:36:00 tg Exp $");
/* /*
* in later versions we might use libtermcap for this, but since external * in later versions we might use libtermcap for this, but since external
@ -714,8 +714,8 @@ x_longest_prefix(int nwords, char * const * words)
break; break;
} }
/* false for nwords==1 as 0 = words[0][prefix_len] then */ /* false for nwords==1 as 0 = words[0][prefix_len] then */
if (UTFMODE && prefix_len && (words[0][prefix_len] & 0xC0) == 0x80) if (UTFMODE && prefix_len && (rtt2asc(words[0][prefix_len]) & 0xC0) == 0x80)
while (prefix_len && (words[0][prefix_len] & 0xC0) != 0xC0) while (prefix_len && (rtt2asc(words[0][prefix_len]) & 0xC0) != 0xC0)
--prefix_len; --prefix_len;
return (prefix_len); return (prefix_len);
} }
@ -1186,17 +1186,19 @@ x_e_getmbc(char *sbuf)
if (c == -1) if (c == -1)
return (-1); return (-1);
if (UTFMODE) { if (UTFMODE) {
if ((buf[0] >= 0xC2) && (buf[0] < 0xF0)) { if ((rtt2asc(buf[0]) >= (unsigned char)0xC2) &&
(rtt2asc(buf[0]) < (unsigned char)0xF0)) {
c = x_e_getc(); c = x_e_getc();
if (c == -1) if (c == -1)
return (-1); return (-1);
if ((c & 0xC0) != 0x80) { if ((rtt2asc(c) & 0xC0) != 0x80) {
x_e_ungetc(c); x_e_ungetc(c);
return (1); return (1);
} }
buf[pos++] = c; buf[pos++] = c;
} }
if ((buf[0] >= 0xE0) && (buf[0] < 0xF0)) { if ((rtt2asc(buf[0]) >= (unsigned char)0xE0) &&
(rtt2asc(buf[0]) < (unsigned char)0xF0)) {
/* XXX x_e_ungetc is one-octet only */ /* XXX x_e_ungetc is one-octet only */
buf[pos++] = c = x_e_getc(); buf[pos++] = c = x_e_getc();
if (c == -1) if (c == -1)
@ -1317,7 +1319,7 @@ x_insert(int c)
return (KSTD); return (KSTD);
} }
if (UTFMODE) { if (UTFMODE) {
if (((c & 0xC0) == 0x80) && left) { if (((rtt2asc(c) & 0xC0) == 0x80) && left) {
str[pos++] = c; str[pos++] = c;
if (!--left) { if (!--left) {
str[pos] = '\0'; str[pos] = '\0';
@ -1614,7 +1616,7 @@ x_bs0(char *cp, char *lower_bound)
{ {
if (UTFMODE) if (UTFMODE)
while ((!lower_bound || (cp > lower_bound)) && while ((!lower_bound || (cp > lower_bound)) &&
((*(unsigned char *)cp & 0xC0) == 0x80)) ((rtt2asc(*cp) & 0xC0) == 0x80))
--cp; --cp;
return (cp); return (cp);
} }
@ -1635,7 +1637,7 @@ x_size2(char *cp, char **dcp)
{ {
uint8_t c = *(unsigned char *)cp; uint8_t c = *(unsigned char *)cp;
if (UTFMODE && (c > 0x7F)) if (UTFMODE && (rtt2asc(c) > 0x7F))
return (utf_widthadj(cp, (const char **)dcp)); return (utf_widthadj(cp, (const char **)dcp));
if (dcp) if (dcp)
*dcp = cp + 1; *dcp = cp + 1;
@ -2903,6 +2905,7 @@ x_e_putc2(int c)
if (ctype(c, C_CR | C_LF)) if (ctype(c, C_CR | C_LF))
x_col = 0; x_col = 0;
if (x_col < xx_cols) { if (x_col < xx_cols) {
#ifndef MKSH_EBCDIC
if (UTFMODE && (c > 0x7F)) { if (UTFMODE && (c > 0x7F)) {
char utf_tmp[3]; char utf_tmp[3];
size_t x; size_t x;
@ -2917,6 +2920,7 @@ x_e_putc2(int c)
x_putc(utf_tmp[2]); x_putc(utf_tmp[2]);
width = utf_wcwidth(c); width = utf_wcwidth(c);
} else } else
#endif
x_putc(c); x_putc(c);
switch (c) { switch (c) {
case KSH_BEL: case KSH_BEL:
@ -2950,7 +2954,13 @@ x_e_putc3(const char **cp)
width = utf_widthadj(*cp, (const char **)&cp2); width = utf_widthadj(*cp, (const char **)&cp2);
if (cp2 == *cp + 1) { if (cp2 == *cp + 1) {
(*cp)++; (*cp)++;
#ifdef MKSH_EBCDIC
x_putc(asc2rtt(0xEF));
x_putc(asc2rtt(0xBF));
x_putc(asc2rtt(0xBD));
#else
shf_puts("\xEF\xBF\xBD", shl_out); shf_puts("\xEF\xBF\xBD", shl_out);
#endif
} else } else
while (*cp < cp2) while (*cp < cp2)
x_putcf(*(*cp)++); x_putcf(*(*cp)++);

6
eval.c
View File

@ -23,7 +23,7 @@
#include "sh.h" #include "sh.h"
__RCSID("$MirOS: src/bin/mksh/eval.c,v 1.212 2017/05/03 15:36:12 tg Exp $"); __RCSID("$MirOS: src/bin/mksh/eval.c,v 1.213 2017/05/05 20:36:01 tg Exp $");
/* /*
* string expansion * string expansion
@ -1174,7 +1174,7 @@ varsub(Expand *xp, const char *sp, const char *word,
if (!UTFMODE || (len = utf_mbtowc(&wc, if (!UTFMODE || (len = utf_mbtowc(&wc,
s)) == (size_t)-1) s)) == (size_t)-1)
/* not UTFMODE or not UTF-8 */ /* not UTFMODE or not UTF-8 */
wc = (unsigned char)(*s++); wc = rtt2asc(*s++);
else else
/* UTFMODE and UTF-8 */ /* UTFMODE and UTF-8 */
s += len; s += len;
@ -1522,7 +1522,7 @@ trimsub(char *str, char *pat, int how)
goto trimsub_match; goto trimsub_match;
if (UTFMODE) { if (UTFMODE) {
char *op = p; char *op = p;
while ((p-- > str) && ((*p & 0xC0) == 0x80)) while ((p-- > str) && ((rtt2asc(*p) & 0xC0) == 0x80))
; ;
if ((p < str) || (p + utf_ptradj(p) != op)) if ((p < str) || (p + utf_ptradj(p) != op))
p = op - 1; p = op - 1;

23
expr.c
View File

@ -23,7 +23,7 @@
#include "sh.h" #include "sh.h"
__RCSID("$MirOS: src/bin/mksh/expr.c,v 1.96 2017/04/27 23:12:46 tg Exp $"); __RCSID("$MirOS: src/bin/mksh/expr.c,v 1.97 2017/05/05 20:36:02 tg Exp $");
#define EXPRTOK_DEFNS #define EXPRTOK_DEFNS
#include "exprtok.h" #include "exprtok.h"
@ -772,8 +772,7 @@ utf_ptradj(const char *src)
{ {
register size_t n; register size_t n;
if (!UTFMODE || if (!UTFMODE || rtt2asc(*src) < 0xC2 ||
*(const unsigned char *)(src) < 0xC2 ||
(n = utf_mbtowc(NULL, src)) == (size_t)-1) (n = utf_mbtowc(NULL, src)) == (size_t)-1)
n = 1; n = 1;
return (n); return (n);
@ -791,7 +790,7 @@ utf_mbtowc(unsigned int *dst, const char *src)
const unsigned char *s = (const unsigned char *)src; const unsigned char *s = (const unsigned char *)src;
unsigned int c, wc; unsigned int c, wc;
if ((wc = *s++) < 0x80) { if ((wc = ord(rtt2asc(*s++))) < 0x80) {
out: out:
if (dst != NULL) if (dst != NULL)
*dst = wc; *dst = wc;
@ -805,7 +804,7 @@ utf_mbtowc(unsigned int *dst, const char *src)
if (wc < 0xE0) { if (wc < 0xE0) {
wc = (wc & 0x1F) << 6; wc = (wc & 0x1F) << 6;
if (((c = *s++) & 0xC0) != 0x80) if (((c = ord(rtt2asc(*s++))) & 0xC0) != 0x80)
goto ilseq; goto ilseq;
wc |= c & 0x3F; wc |= c & 0x3F;
goto out; goto out;
@ -813,11 +812,11 @@ utf_mbtowc(unsigned int *dst, const char *src)
wc = (wc & 0x0F) << 12; wc = (wc & 0x0F) << 12;
if (((c = *s++) & 0xC0) != 0x80) if (((c = ord(rtt2asc(*s++))) & 0xC0) != 0x80)
goto ilseq; goto ilseq;
wc |= (c & 0x3F) << 6; wc |= (c & 0x3F) << 6;
if (((c = *s++) & 0xC0) != 0x80) if (((c = ord(rtt2asc(*s++))) & 0xC0) != 0x80)
goto ilseq; goto ilseq;
wc |= c & 0x3F; wc |= c & 0x3F;
@ -834,18 +833,18 @@ utf_wctomb(char *dst, unsigned int wc)
unsigned char *d; unsigned char *d;
if (wc < 0x80) { if (wc < 0x80) {
*dst = wc; *dst = asc2rtt(wc);
return (1); return (1);
} }
d = (unsigned char *)dst; d = (unsigned char *)dst;
if (wc < 0x0800) if (wc < 0x0800)
*d++ = (wc >> 6) | 0xC0; *d++ = asc2rtt((wc >> 6) | 0xC0);
else { else {
*d++ = ((wc = wc > 0xFFFD ? 0xFFFD : wc) >> 12) | 0xE0; *d++ = asc2rtt(((wc = wc > 0xFFFD ? 0xFFFD : wc) >> 12) | 0xE0);
*d++ = ((wc >> 6) & 0x3F) | 0x80; *d++ = asc2rtt(((wc >> 6) & 0x3F) | 0x80);
} }
*d++ = (wc & 0x3F) | 0x80; *d++ = asc2rtt((wc & 0x3F) | 0x80);
return ((char *)d - dst); return ((char *)d - dst);
} }

16
lex.c
View File

@ -23,7 +23,7 @@
#include "sh.h" #include "sh.h"
__RCSID("$MirOS: src/bin/mksh/lex.c,v 1.237 2017/04/28 00:38:31 tg Exp $"); __RCSID("$MirOS: src/bin/mksh/lex.c,v 1.238 2017/05/05 20:36:02 tg Exp $");
/* /*
* states while lexing word * states while lexing word
@ -1536,7 +1536,7 @@ pprompt(const char *cp, int ntruncate)
columns--; columns--;
} else if (*cp == delimiter) } else if (*cp == delimiter)
indelimit = !indelimit; indelimit = !indelimit;
else if (UTFMODE && ((unsigned char)*cp > 0x7F)) { else if (UTFMODE && (rtt2asc(*cp) > 0x7F)) {
const char *cp2; const char *cp2;
columns += utf_widthadj(cp, &cp2); columns += utf_widthadj(cp, &cp2);
if (doprint && (indelimit || if (doprint && (indelimit ||
@ -1754,19 +1754,19 @@ yyskiputf8bom(void)
{ {
int c; int c;
if ((unsigned char)(c = o_getsc_u()) != 0xEF) { if (rtt2asc((c = o_getsc_u())) != 0xEF) {
ungetsc_i(c); ungetsc_i(c);
return; return;
} }
if ((unsigned char)(c = o_getsc_u()) != 0xBB) { if (rtt2asc((c = o_getsc_u())) != 0xBB) {
ungetsc_i(c); ungetsc_i(c);
ungetsc_i(0xEF); ungetsc_i(asc2rtt(0xEF));
return; return;
} }
if ((unsigned char)(c = o_getsc_u()) != 0xBF) { if (rtt2asc((c = o_getsc_u())) != 0xBF) {
ungetsc_i(c); ungetsc_i(c);
ungetsc_i(0xBB); ungetsc_i(asc2rtt(0xBB));
ungetsc_i(0xEF); ungetsc_i(asc2rtt(0xEF));
return; return;
} }
UTFMODE |= 8; UTFMODE |= 8;

4
tree.c
View File

@ -23,7 +23,7 @@
#include "sh.h" #include "sh.h"
__RCSID("$MirOS: src/bin/mksh/tree.c,v 1.91 2017/04/28 03:28:19 tg Exp $"); __RCSID("$MirOS: src/bin/mksh/tree.c,v 1.92 2017/05/05 20:36:03 tg Exp $");
#define INDENT 8 #define INDENT 8
@ -805,7 +805,7 @@ vistree(char *dst, size_t sz, struct op *t)
goto vist_out; goto vist_out;
*dst++ = '^'; *dst++ = '^';
c = ksh_unctrl(c); c = ksh_unctrl(c);
} else if (UTFMODE && c > 0x7F) { } else if (UTFMODE && rtt2asc(c) > 0x7F) {
/* better not try to display broken multibyte chars */ /* better not try to display broken multibyte chars */
/* also go easy on the Unicode: no U+FFFD here */ /* also go easy on the Unicode: no U+FFFD here */
c = '?'; c = '?';

10
var.c
View File

@ -28,7 +28,7 @@
#include <sys/sysctl.h> #include <sys/sysctl.h>
#endif #endif
__RCSID("$MirOS: src/bin/mksh/var.c,v 1.217 2017/04/29 22:04:31 tg Exp $"); __RCSID("$MirOS: src/bin/mksh/var.c,v 1.218 2017/05/05 20:36:03 tg Exp $");
/*- /*-
* Variables * Variables
@ -414,9 +414,11 @@ str_val(struct tbl *vp)
*(s = strbuf) = '1'; *(s = strbuf) = '1';
s[1] = '#'; s[1] = '#';
if (!UTFMODE || ((n & 0xFF80) == 0xEF80)) if (!UTFMODE)
s[2] = (unsigned char)n;
else if ((n & 0xFF80) == 0xEF80)
/* OPTU-16 -> raw octet */ /* OPTU-16 -> raw octet */
s[2] = n & 0xFF; s[2] = asc2rtt(n & 0xFF);
else else
sz = utf_wctomb(s + 2, n); sz = utf_wctomb(s + 2, n);
s[2 + sz] = '\0'; s[2 + sz] = '\0';
@ -577,7 +579,7 @@ getint(struct tbl *vp, mksh_ari_u *nump, bool arith)
* the same as 1#\x80 does, thus is * the same as 1#\x80 does, thus is
* not round-tripping correctly XXX) * not round-tripping correctly XXX)
*/ */
wc = 0xEF00 + *(const unsigned char *)s; wc = 0xEF00 + rtt2asc(*s);
nump->u = (mksh_uari_t)wc; nump->u = (mksh_uari_t)wc;
return (1); return (1);
} else if (base > 36) } else if (base > 36)