Ignore the UTF-8 Byte Order Mark at the beginning of the input (whether it comes via a file
given to execute, standard input (interactive or not), a -c command line argument, or after “eval”), but not in $(…) command substitutions (comsubs), at the beginning of a subsequent line, within a line, etc.; add a regression test for it. The idea came up during my “week off” (despite the pain); bsiegert@ thinks it's good – and UTF-8 capable tools ought to be able to do this anyway.
This commit is contained in:
46
check.t
46
check.t
@@ -1,4 +1,4 @@
|
|||||||
# $MirOS: src/bin/mksh/check.t,v 1.94 2007/03/14 02:41:08 tg Exp $
|
# $MirOS: src/bin/mksh/check.t,v 1.95 2007/04/15 10:45:58 tg Exp $
|
||||||
# $OpenBSD: bksl-nl.t,v 1.2 2001/01/28 23:04:56 niklas Exp $
|
# $OpenBSD: bksl-nl.t,v 1.2 2001/01/28 23:04:56 niklas Exp $
|
||||||
# $OpenBSD: history.t,v 1.5 2001/01/28 23:04:56 niklas Exp $
|
# $OpenBSD: history.t,v 1.5 2001/01/28 23:04:56 niklas Exp $
|
||||||
# $OpenBSD: read.t,v 1.3 2003/03/10 03:48:16 david Exp $
|
# $OpenBSD: read.t,v 1.3 2003/03/10 03:48:16 david Exp $
|
||||||
@@ -3884,3 +3884,47 @@ stdin:
|
|||||||
expected-stdout:
|
expected-stdout:
|
||||||
<0hall0 > < 0hall0> <hall0 > <00000hall0> <0000 hallo>
|
<0hall0 > < 0hall0> <hall0 > <00000hall0> <0000 hallo>
|
||||||
---
|
---
|
||||||
|
name: utf8bom-1
|
||||||
|
description:
|
||||||
|
Check that the UTF-8 Byte Order Mark is ignored as the first
|
||||||
|
multibyte character of the shell input (with -c, from standard
|
||||||
|
input, as file, or as eval argument), but nowhere else
|
||||||
|
category: pdksh
|
||||||
|
stdin:
|
||||||
|
mkdir foo
|
||||||
|
print '#!/bin/sh\necho ohne' >foo/fnord
|
||||||
|
print '#!/bin/sh\necho mit' >foo/fnord
|
||||||
|
print 'fnord\nfnord\nfnord\nfnord' >foo/bar
|
||||||
|
print eval \''fnord\nfnord\nfnord\nfnord'\' >foo/zoo
|
||||||
|
set -A anzahl -- foo/*
|
||||||
|
print got ${#anzahl[*]} files
|
||||||
|
chmod +x foo/*
|
||||||
|
export PATH=$(pwd)/foo:$PATH
|
||||||
|
$0 -c 'fnord'
|
||||||
|
$0 -c 'fnord; fnord; fnord; fnord'
|
||||||
|
$0 foo/bar
|
||||||
|
$0 <foo/bar
|
||||||
|
$0 foo/zoo
|
||||||
|
$0 -c 'print : $(fnord)'
|
||||||
|
rm -rf foo
|
||||||
|
expected-stdout:
|
||||||
|
got 4 files
|
||||||
|
ohne
|
||||||
|
ohne
|
||||||
|
ohne
|
||||||
|
mit
|
||||||
|
ohne
|
||||||
|
ohne
|
||||||
|
ohne
|
||||||
|
mit
|
||||||
|
ohne
|
||||||
|
ohne
|
||||||
|
ohne
|
||||||
|
mit
|
||||||
|
ohne
|
||||||
|
ohne
|
||||||
|
ohne
|
||||||
|
mit
|
||||||
|
ohne
|
||||||
|
: mit
|
||||||
|
---
|
||||||
|
19
lex.c
19
lex.c
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
#include "sh.h"
|
#include "sh.h"
|
||||||
|
|
||||||
__RCSID("$MirOS: src/bin/mksh/lex.c,v 1.26 2007/03/04 03:04:26 tg Exp $");
|
__RCSID("$MirOS: src/bin/mksh/lex.c,v 1.27 2007/04/15 10:45:58 tg Exp $");
|
||||||
|
|
||||||
/* Structure to keep track of the lexing state and the various pieces of info
|
/* Structure to keep track of the lexing state and the various pieces of info
|
||||||
* needed for each particular state. */
|
* needed for each particular state. */
|
||||||
@@ -64,9 +64,11 @@ static int ignore_backslash_newline;
|
|||||||
|
|
||||||
/* optimised getsc_bn() */
|
/* optimised getsc_bn() */
|
||||||
#define getsc() (*source->str != '\0' && *source->str != '\\' \
|
#define getsc() (*source->str != '\0' && *source->str != '\\' \
|
||||||
&& !backslash_skip ? *source->str++ : getsc_bn())
|
&& !backslash_skip && !(source->flags & SF_FIRST) \
|
||||||
|
? *source->str++ : getsc_bn())
|
||||||
/* optimised getsc__() */
|
/* optimised getsc__() */
|
||||||
#define getsc_() ((*source->str != '\0') ? *source->str++ : getsc__())
|
#define getsc_() ((*source->str != '\0') && !(source->flags & SF_FIRST) \
|
||||||
|
? *source->str++ : getsc__())
|
||||||
|
|
||||||
#define STATE_BSIZE 32
|
#define STATE_BSIZE 32
|
||||||
|
|
||||||
@@ -856,6 +858,7 @@ getsc__(void)
|
|||||||
Source *s = source;
|
Source *s = source;
|
||||||
int c;
|
int c;
|
||||||
|
|
||||||
|
getsc_again:
|
||||||
while ((c = *s->str++) == 0) {
|
while ((c = *s->str++) == 0) {
|
||||||
s->str = NULL; /* return 0 for EOF by default */
|
s->str = NULL; /* return 0 for EOF by default */
|
||||||
switch (s->type) {
|
switch (s->type) {
|
||||||
@@ -947,6 +950,16 @@ getsc__(void)
|
|||||||
shf_flush(shl_out);
|
shf_flush(shl_out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* check for UTF-8 byte order mark */
|
||||||
|
if (s->flags & SF_FIRST) {
|
||||||
|
s->flags &= ~SF_FIRST;
|
||||||
|
if (((unsigned char)c == 0xEF) &&
|
||||||
|
(((const unsigned char *)(s->str))[0] == 0xBB) &&
|
||||||
|
(((const unsigned char *)(s->str))[1] == 0xBF)) {
|
||||||
|
s->str += 2;
|
||||||
|
goto getsc_again;
|
||||||
|
}
|
||||||
|
}
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
4
main.c
4
main.c
@@ -13,7 +13,7 @@
|
|||||||
#include <locale.h>
|
#include <locale.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
__RCSID("$MirOS: src/bin/mksh/main.c,v 1.73 2007/03/04 03:04:26 tg Exp $");
|
__RCSID("$MirOS: src/bin/mksh/main.c,v 1.74 2007/04/15 10:45:59 tg Exp $");
|
||||||
|
|
||||||
extern char **environ;
|
extern char **environ;
|
||||||
|
|
||||||
@@ -457,6 +457,8 @@ shell(Source * volatile s, volatile int toplevel)
|
|||||||
Source *volatile old_source = source;
|
Source *volatile old_source = source;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
s->flags |= SF_FIRST; /* enable UTF-8 BOM check */
|
||||||
|
|
||||||
newenv(E_PARSE);
|
newenv(E_PARSE);
|
||||||
if (interactive)
|
if (interactive)
|
||||||
really_exit = 0;
|
really_exit = 0;
|
||||||
|
3
sh.h
3
sh.h
@@ -8,7 +8,7 @@
|
|||||||
/* $OpenBSD: c_test.h,v 1.4 2004/12/20 11:34:26 otto Exp $ */
|
/* $OpenBSD: c_test.h,v 1.4 2004/12/20 11:34:26 otto Exp $ */
|
||||||
/* $OpenBSD: tty.h,v 1.5 2004/12/20 11:34:26 otto Exp $ */
|
/* $OpenBSD: tty.h,v 1.5 2004/12/20 11:34:26 otto Exp $ */
|
||||||
|
|
||||||
#define MKSH_SH_H_ID "$MirOS: src/bin/mksh/sh.h,v 1.121 2007/03/14 02:41:09 tg Exp $"
|
#define MKSH_SH_H_ID "$MirOS: src/bin/mksh/sh.h,v 1.122 2007/04/15 10:45:59 tg Exp $"
|
||||||
#define MKSH_VERSION "R29 2007/03/14"
|
#define MKSH_VERSION "R29 2007/03/14"
|
||||||
|
|
||||||
#if HAVE_SYS_PARAM_H
|
#if HAVE_SYS_PARAM_H
|
||||||
@@ -1088,6 +1088,7 @@ struct source {
|
|||||||
#define SF_ALIAS BIT(1) /* faking space at end of alias */
|
#define SF_ALIAS BIT(1) /* faking space at end of alias */
|
||||||
#define SF_ALIASEND BIT(2) /* faking space at end of alias */
|
#define SF_ALIASEND BIT(2) /* faking space at end of alias */
|
||||||
#define SF_TTY BIT(3) /* type == SSTDIN & it is a tty */
|
#define SF_TTY BIT(3) /* type == SSTDIN & it is a tty */
|
||||||
|
#define SF_FIRST BIT(4) /* initial state (to ignore UTF-8 BOM) */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* states while lexing word
|
* states while lexing word
|
||||||
|
Reference in New Issue
Block a user