switch EBCDIC to “nega-UTF8”

This commit is contained in:
tg 2017-05-05 20:36:03 +00:00
parent 6dc1ab0379
commit cc725e67ca
6 changed files with 49 additions and 38 deletions

28
edit.c
View File

@ -28,7 +28,7 @@
#ifndef MKSH_NO_CMDLINE_EDITING
__RCSID("$MirOS: src/bin/mksh/edit.c,v 1.335 2017/04/29 22:04:26 tg Exp $");
__RCSID("$MirOS: src/bin/mksh/edit.c,v 1.336 2017/05/05 20:36:00 tg Exp $");
/*
* in later versions we might use libtermcap for this, but since external
@ -714,8 +714,8 @@ x_longest_prefix(int nwords, char * const * words)
break;
}
/* false for nwords==1 as 0 = words[0][prefix_len] then */
if (UTFMODE && prefix_len && (words[0][prefix_len] & 0xC0) == 0x80)
while (prefix_len && (words[0][prefix_len] & 0xC0) != 0xC0)
if (UTFMODE && prefix_len && (rtt2asc(words[0][prefix_len]) & 0xC0) == 0x80)
while (prefix_len && (rtt2asc(words[0][prefix_len]) & 0xC0) != 0xC0)
--prefix_len;
return (prefix_len);
}
@ -1186,17 +1186,19 @@ x_e_getmbc(char *sbuf)
if (c == -1)
return (-1);
if (UTFMODE) {
if ((buf[0] >= 0xC2) && (buf[0] < 0xF0)) {
if ((rtt2asc(buf[0]) >= (unsigned char)0xC2) &&
(rtt2asc(buf[0]) < (unsigned char)0xF0)) {
c = x_e_getc();
if (c == -1)
return (-1);
if ((c & 0xC0) != 0x80) {
if ((rtt2asc(c) & 0xC0) != 0x80) {
x_e_ungetc(c);
return (1);
}
buf[pos++] = c;
}
if ((buf[0] >= 0xE0) && (buf[0] < 0xF0)) {
if ((rtt2asc(buf[0]) >= (unsigned char)0xE0) &&
(rtt2asc(buf[0]) < (unsigned char)0xF0)) {
/* XXX x_e_ungetc is one-octet only */
buf[pos++] = c = x_e_getc();
if (c == -1)
@ -1317,7 +1319,7 @@ x_insert(int c)
return (KSTD);
}
if (UTFMODE) {
if (((c & 0xC0) == 0x80) && left) {
if (((rtt2asc(c) & 0xC0) == 0x80) && left) {
str[pos++] = c;
if (!--left) {
str[pos] = '\0';
@ -1614,7 +1616,7 @@ x_bs0(char *cp, char *lower_bound)
{
if (UTFMODE)
while ((!lower_bound || (cp > lower_bound)) &&
((*(unsigned char *)cp & 0xC0) == 0x80))
((rtt2asc(*cp) & 0xC0) == 0x80))
--cp;
return (cp);
}
@ -1635,7 +1637,7 @@ x_size2(char *cp, char **dcp)
{
uint8_t c = *(unsigned char *)cp;
if (UTFMODE && (c > 0x7F))
if (UTFMODE && (rtt2asc(c) > 0x7F))
return (utf_widthadj(cp, (const char **)dcp));
if (dcp)
*dcp = cp + 1;
@ -2903,6 +2905,7 @@ x_e_putc2(int c)
if (ctype(c, C_CR | C_LF))
x_col = 0;
if (x_col < xx_cols) {
#ifndef MKSH_EBCDIC
if (UTFMODE && (c > 0x7F)) {
char utf_tmp[3];
size_t x;
@ -2917,6 +2920,7 @@ x_e_putc2(int c)
x_putc(utf_tmp[2]);
width = utf_wcwidth(c);
} else
#endif
x_putc(c);
switch (c) {
case KSH_BEL:
@ -2950,7 +2954,13 @@ x_e_putc3(const char **cp)
width = utf_widthadj(*cp, (const char **)&cp2);
if (cp2 == *cp + 1) {
(*cp)++;
#ifdef MKSH_EBCDIC
x_putc(asc2rtt(0xEF));
x_putc(asc2rtt(0xBF));
x_putc(asc2rtt(0xBD));
#else
shf_puts("\xEF\xBF\xBD", shl_out);
#endif
} else
while (*cp < cp2)
x_putcf(*(*cp)++);

6
eval.c
View File

@ -23,7 +23,7 @@
#include "sh.h"
__RCSID("$MirOS: src/bin/mksh/eval.c,v 1.212 2017/05/03 15:36:12 tg Exp $");
__RCSID("$MirOS: src/bin/mksh/eval.c,v 1.213 2017/05/05 20:36:01 tg Exp $");
/*
* string expansion
@ -1174,7 +1174,7 @@ varsub(Expand *xp, const char *sp, const char *word,
if (!UTFMODE || (len = utf_mbtowc(&wc,
s)) == (size_t)-1)
/* not UTFMODE or not UTF-8 */
wc = (unsigned char)(*s++);
wc = rtt2asc(*s++);
else
/* UTFMODE and UTF-8 */
s += len;
@ -1522,7 +1522,7 @@ trimsub(char *str, char *pat, int how)
goto trimsub_match;
if (UTFMODE) {
char *op = p;
while ((p-- > str) && ((*p & 0xC0) == 0x80))
while ((p-- > str) && ((rtt2asc(*p) & 0xC0) == 0x80))
;
if ((p < str) || (p + utf_ptradj(p) != op))
p = op - 1;

23
expr.c
View File

@ -23,7 +23,7 @@
#include "sh.h"
__RCSID("$MirOS: src/bin/mksh/expr.c,v 1.96 2017/04/27 23:12:46 tg Exp $");
__RCSID("$MirOS: src/bin/mksh/expr.c,v 1.97 2017/05/05 20:36:02 tg Exp $");
#define EXPRTOK_DEFNS
#include "exprtok.h"
@ -772,8 +772,7 @@ utf_ptradj(const char *src)
{
register size_t n;
if (!UTFMODE ||
*(const unsigned char *)(src) < 0xC2 ||
if (!UTFMODE || rtt2asc(*src) < 0xC2 ||
(n = utf_mbtowc(NULL, src)) == (size_t)-1)
n = 1;
return (n);
@ -791,7 +790,7 @@ utf_mbtowc(unsigned int *dst, const char *src)
const unsigned char *s = (const unsigned char *)src;
unsigned int c, wc;
if ((wc = *s++) < 0x80) {
if ((wc = ord(rtt2asc(*s++))) < 0x80) {
out:
if (dst != NULL)
*dst = wc;
@ -805,7 +804,7 @@ utf_mbtowc(unsigned int *dst, const char *src)
if (wc < 0xE0) {
wc = (wc & 0x1F) << 6;
if (((c = *s++) & 0xC0) != 0x80)
if (((c = ord(rtt2asc(*s++))) & 0xC0) != 0x80)
goto ilseq;
wc |= c & 0x3F;
goto out;
@ -813,11 +812,11 @@ utf_mbtowc(unsigned int *dst, const char *src)
wc = (wc & 0x0F) << 12;
if (((c = *s++) & 0xC0) != 0x80)
if (((c = ord(rtt2asc(*s++))) & 0xC0) != 0x80)
goto ilseq;
wc |= (c & 0x3F) << 6;
if (((c = *s++) & 0xC0) != 0x80)
if (((c = ord(rtt2asc(*s++))) & 0xC0) != 0x80)
goto ilseq;
wc |= c & 0x3F;
@ -834,18 +833,18 @@ utf_wctomb(char *dst, unsigned int wc)
unsigned char *d;
if (wc < 0x80) {
*dst = wc;
*dst = asc2rtt(wc);
return (1);
}
d = (unsigned char *)dst;
if (wc < 0x0800)
*d++ = (wc >> 6) | 0xC0;
*d++ = asc2rtt((wc >> 6) | 0xC0);
else {
*d++ = ((wc = wc > 0xFFFD ? 0xFFFD : wc) >> 12) | 0xE0;
*d++ = ((wc >> 6) & 0x3F) | 0x80;
*d++ = asc2rtt(((wc = wc > 0xFFFD ? 0xFFFD : wc) >> 12) | 0xE0);
*d++ = asc2rtt(((wc >> 6) & 0x3F) | 0x80);
}
*d++ = (wc & 0x3F) | 0x80;
*d++ = asc2rtt((wc & 0x3F) | 0x80);
return ((char *)d - dst);
}

16
lex.c
View File

@ -23,7 +23,7 @@
#include "sh.h"
__RCSID("$MirOS: src/bin/mksh/lex.c,v 1.237 2017/04/28 00:38:31 tg Exp $");
__RCSID("$MirOS: src/bin/mksh/lex.c,v 1.238 2017/05/05 20:36:02 tg Exp $");
/*
* states while lexing word
@ -1536,7 +1536,7 @@ pprompt(const char *cp, int ntruncate)
columns--;
} else if (*cp == delimiter)
indelimit = !indelimit;
else if (UTFMODE && ((unsigned char)*cp > 0x7F)) {
else if (UTFMODE && (rtt2asc(*cp) > 0x7F)) {
const char *cp2;
columns += utf_widthadj(cp, &cp2);
if (doprint && (indelimit ||
@ -1754,19 +1754,19 @@ yyskiputf8bom(void)
{
int c;
if ((unsigned char)(c = o_getsc_u()) != 0xEF) {
if (rtt2asc((c = o_getsc_u())) != 0xEF) {
ungetsc_i(c);
return;
}
if ((unsigned char)(c = o_getsc_u()) != 0xBB) {
if (rtt2asc((c = o_getsc_u())) != 0xBB) {
ungetsc_i(c);
ungetsc_i(0xEF);
ungetsc_i(asc2rtt(0xEF));
return;
}
if ((unsigned char)(c = o_getsc_u()) != 0xBF) {
if (rtt2asc((c = o_getsc_u())) != 0xBF) {
ungetsc_i(c);
ungetsc_i(0xBB);
ungetsc_i(0xEF);
ungetsc_i(asc2rtt(0xBB));
ungetsc_i(asc2rtt(0xEF));
return;
}
UTFMODE |= 8;

4
tree.c
View File

@ -23,7 +23,7 @@
#include "sh.h"
__RCSID("$MirOS: src/bin/mksh/tree.c,v 1.91 2017/04/28 03:28:19 tg Exp $");
__RCSID("$MirOS: src/bin/mksh/tree.c,v 1.92 2017/05/05 20:36:03 tg Exp $");
#define INDENT 8
@ -805,7 +805,7 @@ vistree(char *dst, size_t sz, struct op *t)
goto vist_out;
*dst++ = '^';
c = ksh_unctrl(c);
} else if (UTFMODE && c > 0x7F) {
} else if (UTFMODE && rtt2asc(c) > 0x7F) {
/* better not try to display broken multibyte chars */
/* also go easy on the Unicode: no U+FFFD here */
c = '?';

10
var.c
View File

@ -28,7 +28,7 @@
#include <sys/sysctl.h>
#endif
__RCSID("$MirOS: src/bin/mksh/var.c,v 1.217 2017/04/29 22:04:31 tg Exp $");
__RCSID("$MirOS: src/bin/mksh/var.c,v 1.218 2017/05/05 20:36:03 tg Exp $");
/*-
* Variables
@ -414,9 +414,11 @@ str_val(struct tbl *vp)
*(s = strbuf) = '1';
s[1] = '#';
if (!UTFMODE || ((n & 0xFF80) == 0xEF80))
if (!UTFMODE)
s[2] = (unsigned char)n;
else if ((n & 0xFF80) == 0xEF80)
/* OPTU-16 -> raw octet */
s[2] = n & 0xFF;
s[2] = asc2rtt(n & 0xFF);
else
sz = utf_wctomb(s + 2, n);
s[2 + sz] = '\0';
@ -577,7 +579,7 @@ getint(struct tbl *vp, mksh_ari_u *nump, bool arith)
* the same as 1#\x80 does, thus is
* not round-tripping correctly XXX)
*/
wc = 0xEF00 + *(const unsigned char *)s;
wc = 0xEF00 + rtt2asc(*s);
nump->u = (mksh_uari_t)wc;
return (1);
} else if (base > 36)