From 1692a6da66d98e3201579e1bd24dad60a556a6af Mon Sep 17 00:00:00 2001 From: tg Date: Sun, 15 Apr 2007 10:45:59 +0000 Subject: [PATCH] =?UTF-8?q?ignore=20the=20UTF-8=20Byte=20Order=20Mark=20at?= =?UTF-8?q?=20the=20beginning=20of=20the=20input=20(via=20a=20file=20given?= =?UTF-8?q?=20to=20execute,=20standard=20input=20(interactive=20or=20not),?= =?UTF-8?q?=20via=20-c=20command=20line=20argument,=20or=20after=20?= =?UTF-8?q?=E2=80=9Ceval=E2=80=9D,=20but=20not=20for=20$(=E2=80=A6)=20coms?= =?UTF-8?q?ubs,=20at=20the=20beginning=20of=20a=20subsequent=20line,=20or?= =?UTF-8?q?=20within=20a=20line,=20etc.);=20regression=20test=20for=20it?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit idea during my “week off” (despite the pain), bsiegert@ thinks it's good – and utf-8 capable tools ought to be able to do this anyway --- check.t | 46 +++++++++++++++++++++++++++++++++++++++++++++- lex.c | 19 ++++++++++++++++--- main.c | 4 +++- sh.h | 3 ++- 4 files changed, 66 insertions(+), 6 deletions(-) diff --git a/check.t b/check.t index 39842cf..097c2ae 100644 --- a/check.t +++ b/check.t @@ -1,4 +1,4 @@ -# $MirOS: src/bin/mksh/check.t,v 1.94 2007/03/14 02:41:08 tg Exp $ +# $MirOS: src/bin/mksh/check.t,v 1.95 2007/04/15 10:45:58 tg Exp $ # $OpenBSD: bksl-nl.t,v 1.2 2001/01/28 23:04:56 niklas Exp $ # $OpenBSD: history.t,v 1.5 2001/01/28 23:04:56 niklas Exp $ # $OpenBSD: read.t,v 1.3 2003/03/10 03:48:16 david Exp $ @@ -3884,3 +3884,47 @@ stdin: expected-stdout: <0hall0 > < 0hall0> <00000hall0> <0000 hallo> --- +name: utf8bom-1 +description: + Check that the UTF-8 Byte Order Mark is ignored as the first + multibyte character of the shell input (with -c, from standard + input, as file, or as eval argument), but nowhere else +category: pdksh +stdin: + mkdir foo + print '#!/bin/sh\necho ohne' >foo/fnord + print '#!/bin/sh\necho mit' >foo/fnord + print 'fnord\nfnord\nfnord\nfnord' >foo/bar + print eval \''fnord\nfnord\nfnord\nfnord'\' >foo/zoo + set -A anzahl -- foo/* + print got ${#anzahl[*]} files + chmod +x foo/* + export PATH=$(pwd)/foo:$PATH + $0 -c 'fnord' + $0 -c 'fnord; fnord; fnord; fnord' + $0 foo/bar + $0 str != '\0' && *source->str != '\\' \ - && !backslash_skip ? *source->str++ : getsc_bn()) + && !backslash_skip && !(source->flags & SF_FIRST) \ + ? *source->str++ : getsc_bn()) /* optimised getsc__() */ -#define getsc_() ((*source->str != '\0') ? *source->str++ : getsc__()) +#define getsc_() ((*source->str != '\0') && !(source->flags & SF_FIRST) \ + ? *source->str++ : getsc__()) #define STATE_BSIZE 32 @@ -856,6 +858,7 @@ getsc__(void) Source *s = source; int c; + getsc_again: while ((c = *s->str++) == 0) { s->str = NULL; /* return 0 for EOF by default */ switch (s->type) { @@ -947,6 +950,16 @@ getsc__(void) shf_flush(shl_out); } } + /* check for UTF-8 byte order mark */ + if (s->flags & SF_FIRST) { + s->flags &= ~SF_FIRST; + if (((unsigned char)c == 0xEF) && + (((const unsigned char *)(s->str))[0] == 0xBB) && + (((const unsigned char *)(s->str))[1] == 0xBF)) { + s->str += 2; + goto getsc_again; + } + } return c; } diff --git a/main.c b/main.c index 16bff12..c4d37ab 100644 --- a/main.c +++ b/main.c @@ -13,7 +13,7 @@ #include #endif -__RCSID("$MirOS: src/bin/mksh/main.c,v 1.73 2007/03/04 03:04:26 tg Exp $"); +__RCSID("$MirOS: src/bin/mksh/main.c,v 1.74 2007/04/15 10:45:59 tg Exp $"); extern char **environ; @@ -457,6 +457,8 @@ shell(Source * volatile s, volatile int toplevel) Source *volatile old_source = source; int i; + s->flags |= SF_FIRST; /* enable UTF-8 BOM check */ + newenv(E_PARSE); if (interactive) really_exit = 0; diff --git a/sh.h b/sh.h index 19c86a8..5b2a1e7 100644 --- a/sh.h +++ b/sh.h @@ -8,7 +8,7 @@ /* $OpenBSD: c_test.h,v 1.4 2004/12/20 11:34:26 otto Exp $ */ /* $OpenBSD: tty.h,v 1.5 2004/12/20 11:34:26 otto Exp $ */ -#define MKSH_SH_H_ID "$MirOS: src/bin/mksh/sh.h,v 1.121 2007/03/14 02:41:09 tg Exp $" +#define MKSH_SH_H_ID "$MirOS: src/bin/mksh/sh.h,v 1.122 2007/04/15 10:45:59 tg Exp $" #define MKSH_VERSION "R29 2007/03/14" #if HAVE_SYS_PARAM_H @@ -1088,6 +1088,7 @@ struct source { #define SF_ALIAS BIT(1) /* faking space at end of alias */ #define SF_ALIASEND BIT(2) /* faking space at end of alias */ #define SF_TTY BIT(3) /* type == SSTDIN & it is a tty */ +#define SF_FIRST BIT(4) /* initial state (to ignore UTF-8 BOM) */ /* * states while lexing word