Replace regex files with multibyte-aware version from FreeBSD.

* Makefile.in (install-headers): Remove extra command to install
	regex.h.
	(uninstall-headers): Remove extra command to uninstall regex.h.
	* nlsfuncs.cc (collate_lcid): Make externally available to allow
	access to collation internals from regex functions.
	(collate_charset): Ditto.
	* wchar.h: Add __cplusplus guards to make C-clean.
	* include/regex.h: New file, replacing regex/regex.h.  Remove UCB
	advertising clause.
	* regex/COPYRIGHT: Accommodate BSD license.  Remove UCB advertising
	clause.
	* regex/cclass.h: Remove.
	* regex/cname.h: New file from FreeBSD.
	* regex/engine.c: Ditto.
	(NONCHAR): Tweak for Cygwin.
	* regex/engine.ih: Remove.
	* regex/mkh: Remove.
	* regex/regcomp.c: New file from FreeBSD.  Tweak slightly for Cygwin.
	Import required collate internals from nlsfunc.cc.
	(p_ere_exp): Add GNU-specific \< and \> handling for word boundaries.
	(p_simp_re): Ditto.
	(__collate_range_cmp): Define.
	(p_b_term): Use Cygwin-specific collate internals.
	(findmust): Ditto.
	* regex/regcomp.ih: Remove.
	* regex/regerror.c: New file from FreeBSD.  Fix a few compiler warnings.
	* regex/regerror.ih: Remove.
	* regex/regex.7: New file from FreeBSD.  Remove UCB advertising clause.
	* regex/regex.h: Remove.  Replaced by include/regex.h.
	* regex/regexec.c: New file from FreeBSD.  Fix a few compiler warnings.
	* regex/regfree.c: New file from FreeBSD.
	* regex/tests: Remove.
	* regex/utils.h: New file from FreeBSD.
This commit is contained in:
Corinna Vinschen
2010-02-04 12:35:49 +00:00
parent c8f7d3cb48
commit e1e595a649
23 changed files with 2832 additions and 2110 deletions

View File

@ -1,3 +1,41 @@
/*-
* Copyright (c) 1992, 1993, 1994 Henry Spencer.
* Copyright (c) 1992, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Henry Spencer.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)engine.c 8.5 (Berkeley) 3/20/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: src/lib/libc/regex/engine.c,v 1.23 2009/09/16 06:32:23 dds Exp $");
/*
* The matching engine and friends. This file is #included by regexec.c
* after suitable #defines of a variety of macros used herein, so that
@ -27,25 +65,77 @@
#define at lat
#define match lmat
#endif
#ifdef MNAMES
#define matcher mmatcher
#define fast mfast
#define slow mslow
#define dissect mdissect
#define backref mbackref
#define step mstep
#define print mprint
#define at mat
#define match mmat
#endif
/* another structure passed up and down to avoid zillions of parameters */
struct match {
struct re_guts *g;
int eflags;
regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
char *offp; /* offsets work from here */
char *beginp; /* start of string -- virtual NUL precedes */
char *endp; /* end of string -- virtual NUL here */
char *coldp; /* can be no match starting before here */
char **lastpos; /* [nplus+1] */
const char *offp; /* offsets work from here */
const char *beginp; /* start of string -- virtual NUL precedes */
const char *endp; /* end of string -- virtual NUL here */
const char *coldp; /* can be no match starting before here */
const char **lastpos; /* [nplus+1] */
STATEVARS;
states st; /* current states */
states fresh; /* states for a fresh start */
states tmp; /* temporary */
states empty; /* empty set of states */
mbstate_t mbs; /* multibyte conversion state */
};
#include "engine.ih"
/* ========= begin header generated by ./mkh ========= */
#ifdef __cplusplus
extern "C" {
#endif
/* === engine.c === */
static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int);
static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft);
#define MAX_RECURSION 100
#define BOL (OUT-1)
#define EOL (BOL-1)
#define BOLEOL (BOL-2)
#define NOTHING (BOL-3)
#define BOW (BOL-4)
#define EOW (BOL-5)
#define BADCHAR (BOL-6)
#ifdef __CYGWIN__
/* In contrast to BSD, wint_t on Cygwin is unsigned. This breaks this test,
unless the compared values are casted to signed. */
#define NONCHAR(c) ((int)(c) <= (int)OUT)
#else
#define NONCHAR(c) ((c) <= OUT)
#endif
#ifdef REDEBUG
static void print(struct match *m, const char *caption, states st, int ch, FILE *d);
#endif
#ifdef REDEBUG
static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst);
#endif
#ifdef REDEBUG
static const char *pchar(int ch);
#endif
#ifdef __cplusplus
}
#endif
/* ========= end header generated by ./mkh ========= */
#ifdef REDEBUG
#define SP(t, s, c) print(m, t, s, c, stdout)
@ -59,26 +149,32 @@ struct match {
/*
- matcher - the actual matching engine
== static int matcher(register struct re_guts *g, char *string, \
== static int matcher(struct re_guts *g, const char *string, \
== size_t nmatch, regmatch_t pmatch[], int eflags);
*/
static int /* 0 success, REG_NOMATCH failure */
matcher(g, string, nmatch, pmatch, eflags)
register struct re_guts *g;
char *string;
size_t nmatch;
regmatch_t pmatch[];
int eflags;
matcher(struct re_guts *g,
const char *string,
size_t nmatch,
regmatch_t pmatch[],
int eflags)
{
register char *endp;
register size_t i;
const char *endp;
int i;
struct match mv;
register struct match *m = &mv;
register char *dp;
const register sopno gf = g->firststate+1; /* +1 for OEND */
const register sopno gl = g->laststate;
char *start;
char *stop;
struct match *m = &mv;
const char *dp;
const sopno gf = g->firststate+1; /* +1 for OEND */
const sopno gl = g->laststate;
const char *start;
const char *stop;
/* Boyer-Moore algorithms variables */
const char *pp;
int cj, mj;
const char *mustfirst;
const char *mustlast;
int *matchjump;
int *charjump;
/* simplify the situation where possible */
if (g->cflags&REG_NOSUB)
@ -95,12 +191,46 @@ int eflags;
/* prescreening; this does wonders for this rather slow code */
if (g->must != NULL) {
for (dp = start; dp < stop; dp++)
if (*dp == g->must[0] && stop - dp >= g->mlen &&
memcmp(dp, g->must, (size_t)g->mlen) == 0)
break;
if (dp == stop) /* we didn't find g->must */
return(REG_NOMATCH);
if (g->charjump != NULL && g->matchjump != NULL) {
mustfirst = g->must;
mustlast = g->must + g->mlen - 1;
charjump = g->charjump;
matchjump = g->matchjump;
pp = mustlast;
for (dp = start+g->mlen-1; dp < stop;) {
/* Fast skip non-matches */
while (dp < stop && charjump[(int)*dp])
dp += charjump[(int)*dp];
if (dp >= stop)
break;
/* Greedy matcher */
/* We depend on not being used for
* for strings of length 1
*/
while (*--dp == *--pp && pp != mustfirst);
if (*dp == *pp)
break;
/* Jump to next possible match */
mj = matchjump[pp - mustfirst];
cj = charjump[(int)*dp];
dp += (cj < mj ? mj : cj);
pp = mustlast;
}
if (pp != mustfirst)
return(REG_NOMATCH);
} else {
for (dp = start; dp < stop; dp++)
if (*dp == g->must[0] &&
stop - dp >= g->mlen &&
memcmp(dp, g->must, (size_t)g->mlen) == 0)
break;
if (dp == stop) /* we didn't find g->must */
return(REG_NOMATCH);
}
}
/* match struct setup */
@ -117,11 +247,22 @@ int eflags;
SETUP(m->tmp);
SETUP(m->empty);
CLEAR(m->empty);
ZAPSTATE(&m->mbs);
/* Adjust start according to moffset, to speed things up */
if (g->moffset > -1)
start = ((dp - g->moffset) < start) ? start : dp - g->moffset;
SP("mloop", m->st, *start);
/* this loop does only one repetition except for backrefs */
for (;;) {
endp = fast(m, start, stop, gf, gl);
if (endp == NULL) { /* a miss */
if (m->pmatch != NULL)
free((char *)m->pmatch);
if (m->lastpos != NULL)
free((char *)m->lastpos);
STATETEARDOWN(m);
return(REG_NOMATCH);
}
@ -136,7 +277,8 @@ int eflags;
if (endp != NULL)
break;
assert(m->coldp < m->endp);
m->coldp++;
m->coldp += XMBRTOWC(NULL, m->coldp,
m->endp - m->coldp, &m->mbs, 0);
}
if (nmatch == 1 && !g->backrefs)
break; /* no further info needed */
@ -156,15 +298,15 @@ int eflags;
dp = dissect(m, m->coldp, endp, gf, gl);
} else {
if (g->nplus > 0 && m->lastpos == NULL)
m->lastpos = (char **)malloc((g->nplus+1) *
sizeof(char *));
m->lastpos = malloc((g->nplus+1) *
sizeof(const char *));
if (g->nplus > 0 && m->lastpos == NULL) {
free(m->pmatch);
STATETEARDOWN(m);
return(REG_ESPACE);
}
NOTE("backref dissect");
dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
}
if (dp != NULL)
break;
@ -187,7 +329,7 @@ int eflags;
}
#endif
NOTE("backoff dissect");
dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
}
assert(dp == NULL || dp == endp);
if (dp != NULL) /* found a shorter one */
@ -195,7 +337,9 @@ int eflags;
/* despite initial appearances, there is no match here */
NOTE("false alarm");
start = m->coldp + 1; /* recycle starting later */
/* recycle starting later */
start = m->coldp + XMBRTOWC(NULL, m->coldp,
stop - m->coldp, &m->mbs, 0);
assert(start <= stop);
}
@ -225,30 +369,29 @@ int eflags;
/*
- dissect - figure out what matched what, no back references
== static char *dissect(register struct match *m, char *start, \
== char *stop, sopno startst, sopno stopst);
== static const char *dissect(struct match *m, const char *start, \
== const char *stop, sopno startst, sopno stopst);
*/
static char * /* == stop (success) always */
dissect(m, start, stop, startst, stopst)
register struct match *m;
char *start;
char *stop;
sopno startst;
sopno stopst;
static const char * /* == stop (success) always */
dissect(struct match *m,
const char *start,
const char *stop,
sopno startst,
sopno stopst)
{
register int i;
register sopno ss; /* start sop of current subRE */
register sopno es; /* end sop of current subRE */
register char *sp; /* start of string matched by it */
register char *stp; /* string matched by it cannot pass here */
register char *rest; /* start of rest of string */
register char *tail; /* string unmatched by rest of RE */
register sopno ssub; /* start sop of subsubRE */
register sopno esub; /* end sop of subsubRE */
register char *ssp; /* start of string matched by subsubRE */
register char *sep; /* end of string matched by subsubRE */
register char *oldssp; /* previous ssp */
register char *dp;
int i;
sopno ss; /* start sop of current subRE */
sopno es; /* end sop of current subRE */
const char *sp; /* start of string matched by it */
const char *stp; /* string matched by it cannot pass here */
const char *rest; /* start of rest of string */
const char *tail; /* string unmatched by rest of RE */
sopno ssub; /* start sop of subsubRE */
sopno esub; /* end sop of subsubRE */
const char *ssp; /* start of string matched by subsubRE */
const char *sep; /* end of string matched by subsubRE */
const char *oldssp; /* previous ssp */
const char *dp;
AT("diss", start, stop, startst, stopst);
sp = start;
@ -273,7 +416,7 @@ sopno stopst;
assert(nope);
break;
case OCHAR:
sp++;
sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
break;
case OBOL:
case OEOL:
@ -282,7 +425,7 @@ sopno stopst;
break;
case OANY:
case OANYOF:
sp++;
sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
break;
case OBACK_:
case O_BACK:
@ -413,30 +556,31 @@ sopno stopst;
/*
- backref - figure out what matched what, figuring in back references
== static char *backref(register struct match *m, char *start, \
== char *stop, sopno startst, sopno stopst, sopno lev);
== static const char *backref(struct match *m, const char *start, \
== const char *stop, sopno startst, sopno stopst, sopno lev);
*/
static char * /* == stop (success) or NULL (failure) */
backref(m, start, stop, startst, stopst, lev)
register struct match *m;
char *start;
char *stop;
sopno startst;
sopno stopst;
sopno lev; /* PLUS nesting level */
static const char * /* == stop (success) or NULL (failure) */
backref(struct match *m,
const char *start,
const char *stop,
sopno startst,
sopno stopst,
sopno lev, /* PLUS nesting level */
int rec)
{
register int i;
register sopno ss; /* start sop of current subRE */
register char *sp; /* start of string matched by it */
register sopno ssub; /* start sop of subsubRE */
register sopno esub; /* end sop of subsubRE */
register char *ssp; /* start of string matched by subsubRE */
register char *dp;
register size_t len;
register int hard;
register sop s;
register regoff_t offsave;
register cset *cs;
int i;
sopno ss; /* start sop of current subRE */
const char *sp; /* start of string matched by it */
sopno ssub; /* start sop of subsubRE */
sopno esub; /* end sop of subsubRE */
const char *ssp; /* start of string matched by subsubRE */
const char *dp;
size_t len;
int hard;
sop s;
regoff_t offsave;
cset *cs;
wint_t wc;
AT("back", start, stop, startst, stopst);
sp = start;
@ -446,17 +590,25 @@ sopno lev; /* PLUS nesting level */
for (ss = startst; !hard && ss < stopst; ss++)
switch (OP(s = m->g->strip[ss])) {
case OCHAR:
if (sp == stop || *sp++ != (char)OPND(s))
if (sp == stop)
return(NULL);
sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
if (wc != OPND(s))
return(NULL);
break;
case OANY:
if (sp == stop)
return(NULL);
sp++;
sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
if (wc == BADCHAR)
return (NULL);
break;
case OANYOF:
if (sp == stop)
return (NULL);
cs = &m->g->sets[OPND(s)];
if (sp == stop || !CHIN(cs, *sp++))
sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
if (wc == BADCHAR || !CHIN(cs, wc))
return(NULL);
break;
case OBOL:
@ -529,6 +681,8 @@ sopno lev; /* PLUS nesting level */
return(NULL);
assert(m->pmatch[i].rm_so != -1);
len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
if (len == 0 && rec++ > MAX_RECURSION)
return(NULL);
assert(stop - m->beginp >= len);
if (sp > stop - len)
return(NULL); /* not enough left to match */
@ -537,28 +691,28 @@ sopno lev; /* PLUS nesting level */
return(NULL);
while (m->g->strip[ss] != SOP(O_BACK, i))
ss++;
return(backref(m, sp+len, stop, ss+1, stopst, lev));
return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
break;
case OQUEST_: /* to null or not */
dp = backref(m, sp, stop, ss+1, stopst, lev);
dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
if (dp != NULL)
return(dp); /* not */
return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
break;
case OPLUS_:
assert(m->lastpos != NULL);
assert(lev+1 <= m->g->nplus);
m->lastpos[lev+1] = sp;
return(backref(m, sp, stop, ss+1, stopst, lev+1));
return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
break;
case O_PLUS:
if (sp == m->lastpos[lev]) /* last pass matched null */
return(backref(m, sp, stop, ss+1, stopst, lev-1));
return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
/* try another pass */
m->lastpos[lev] = sp;
dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
if (dp == NULL)
return(backref(m, sp, stop, ss+1, stopst, lev-1));
return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
else
return(dp);
break;
@ -567,7 +721,7 @@ sopno lev; /* PLUS nesting level */
esub = ss + OPND(s) - 1;
assert(OP(m->g->strip[esub]) == OOR1);
for (;;) { /* find first matching branch */
dp = backref(m, sp, stop, ssub, esub, lev);
dp = backref(m, sp, stop, ssub, esub, lev, rec);
if (dp != NULL)
return(dp);
/* that one missed, try next one */
@ -588,7 +742,7 @@ sopno lev; /* PLUS nesting level */
assert(0 < i && i <= m->g->nsub);
offsave = m->pmatch[i].rm_so;
m->pmatch[i].rm_so = sp - m->offp;
dp = backref(m, sp, stop, ss+1, stopst, lev);
dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
if (dp != NULL)
return(dp);
m->pmatch[i].rm_so = offsave;
@ -599,7 +753,7 @@ sopno lev; /* PLUS nesting level */
assert(0 < i && i <= m->g->nsub);
offsave = m->pmatch[i].rm_eo;
m->pmatch[i].rm_eo = sp - m->offp;
dp = backref(m, sp, stop, ss+1, stopst, lev);
dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
if (dp != NULL)
return(dp);
m->pmatch[i].rm_eo = offsave;
@ -613,42 +767,57 @@ sopno lev; /* PLUS nesting level */
/* "can't happen" */
assert(nope);
/* NOTREACHED */
return((char *)NULL); /* dummy */
return "shut up gcc";
}
/*
- fast - step through the string at top speed
== static char *fast(register struct match *m, char *start, \
== char *stop, sopno startst, sopno stopst);
== static const char *fast(struct match *m, const char *start, \
== const char *stop, sopno startst, sopno stopst);
*/
static char * /* where tentative match ended, or NULL */
fast(m, start, stop, startst, stopst)
register struct match *m;
char *start;
char *stop;
sopno startst;
sopno stopst;
static const char * /* where tentative match ended, or NULL */
fast( struct match *m,
const char *start,
const char *stop,
sopno startst,
sopno stopst)
{
register states st = m->st;
register states fresh = m->fresh;
register states tmp = m->tmp;
register char *p = start;
register int c = (start == m->beginp) ? OUT : *(start-1);
register int lastc; /* previous c */
register int flagch;
register int i;
register char *coldp; /* last p after which no match was underway */
states st = m->st;
states fresh = m->fresh;
states tmp = m->tmp;
const char *p = start;
wint_t c;
wint_t lastc; /* previous c */
wint_t flagch;
int i;
const char *coldp; /* last p after which no match was underway */
size_t clen;
CLEAR(st);
SET1(st, startst);
SP("fast", st, *p);
st = step(m->g, startst, stopst, st, NOTHING, st);
ASSIGN(fresh, st);
SP("start", st, *p);
coldp = NULL;
if (start == m->beginp)
c = OUT;
else {
/*
* XXX Wrong if the previous character was multi-byte.
* Newline never is (in encodings supported by FreeBSD),
* so this only breaks the ISWORD tests below.
*/
c = (uch)*(start - 1);
}
for (;;) {
/* next character */
lastc = c;
c = (p == m->endp) ? OUT : *p;
if (p == m->endp) {
clen = 0;
c = OUT;
} else
clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
if (EQ(st, fresh))
coldp = p;
@ -686,7 +855,7 @@ sopno stopst;
}
/* are we done? */
if (ISSET(st, stopst) || p == stop)
if (ISSET(st, stopst) || p == stop || clen > stop - p)
break; /* NOTE BREAK OUT */
/* no, we must deal with this character */
@ -696,39 +865,39 @@ sopno stopst;
st = step(m->g, startst, stopst, tmp, c, st);
SP("aft", st, c);
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
p++;
p += clen;
}
assert(coldp != NULL);
m->coldp = coldp;
if (ISSET(st, stopst))
return(p+1);
return(p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0));
else
return(NULL);
}
/*
- slow - step through the string more deliberately
== static char *slow(register struct match *m, char *start, \
== char *stop, sopno startst, sopno stopst);
== static const char *slow(struct match *m, const char *start, \
== const char *stop, sopno startst, sopno stopst);
*/
static char * /* where it ended */
slow(m, start, stop, startst, stopst)
register struct match *m;
char *start;
char *stop;
sopno startst;
sopno stopst;
static const char * /* where it ended */
slow( struct match *m,
const char *start,
const char *stop,
sopno startst,
sopno stopst)
{
register states st = m->st;
register states empty = m->empty;
register states tmp = m->tmp;
register char *p = start;
register int c = (start == m->beginp) ? OUT : *(start-1);
register int lastc; /* previous c */
register int flagch;
register int i;
register char *matchp; /* last p at which a match ended */
states st = m->st;
states empty = m->empty;
states tmp = m->tmp;
const char *p = start;
wint_t c;
wint_t lastc; /* previous c */
wint_t flagch;
int i;
const char *matchp; /* last p at which a match ended */
size_t clen;
AT("slow", start, stop, startst, stopst);
CLEAR(st);
@ -736,10 +905,24 @@ sopno stopst;
SP("sstart", st, *p);
st = step(m->g, startst, stopst, st, NOTHING, st);
matchp = NULL;
if (start == m->beginp)
c = OUT;
else {
/*
* XXX Wrong if the previous character was multi-byte.
* Newline never is (in encodings supported by FreeBSD),
* so this only breaks the ISWORD tests below.
*/
c = (uch)*(start - 1);
}
for (;;) {
/* next character */
lastc = c;
c = (p == m->endp) ? OUT : *p;
if (p == m->endp) {
c = OUT;
clen = 0;
} else
clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
/* is there an EOL and/or BOL between lastc and c? */
flagch = '\0';
@ -777,7 +960,7 @@ sopno stopst;
/* are we done? */
if (ISSET(st, stopst))
matchp = p;
if (EQ(st, empty) || p == stop)
if (EQ(st, empty) || p == stop || clen > stop - p)
break; /* NOTE BREAK OUT */
/* no, we must deal with this character */
@ -787,7 +970,7 @@ sopno stopst;
st = step(m->g, startst, stopst, tmp, c, st);
SP("saft", st, c);
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
p++;
p += clen;
}
return(matchp);
@ -796,33 +979,31 @@ sopno stopst;
/*
- step - map set of states reachable before char to set reachable after
== static states step(register struct re_guts *g, sopno start, sopno stop, \
== register states bef, int ch, register states aft);
== #define BOL (OUT+1)
== #define EOL (BOL+1)
== #define BOLEOL (BOL+2)
== #define NOTHING (BOL+3)
== #define BOW (BOL+4)
== #define EOW (BOL+5)
== #define CODEMAX (BOL+5) // highest code used
== #define NONCHAR(c) ((c) > CHAR_MAX)
== #define NNONCHAR (CODEMAX-CHAR_MAX)
== static states step(struct re_guts *g, sopno start, sopno stop, \
== states bef, int ch, states aft);
== #define BOL (OUT-1)
== #define EOL (BOL-1)
== #define BOLEOL (BOL-2)
== #define NOTHING (BOL-3)
== #define BOW (BOL-4)
== #define EOW (BOL-5)
== #define BADCHAR (BOL-6)
== #define NONCHAR(c) ((c) <= OUT)
*/
static states
step(g, start, stop, bef, ch, aft)
register struct re_guts *g;
sopno start; /* start state within strip */
sopno stop; /* state after stop state within strip */
register states bef; /* states reachable before */
int ch; /* character or NONCHAR code */
register states aft; /* states already known reachable after */
step(struct re_guts *g,
sopno start, /* start state within strip */
sopno stop, /* state after stop state within strip */
states bef, /* states reachable before */
wint_t ch, /* character or NONCHAR code */
states aft) /* states already known reachable after */
{
register cset *cs;
register sop s;
register sopno pc;
register onestate here; /* note, macros know this name */
register sopno look;
register long i;
cset *cs;
sop s;
sopno pc;
onestate here; /* note, macros know this name */
sopno look;
int i;
for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
s = g->strip[pc];
@ -832,8 +1013,8 @@ register states aft; /* states already known reachable after */
break;
case OCHAR:
/* only characters can match */
assert(!NONCHAR(ch) || ch != (char)OPND(s));
if (ch == (char)OPND(s))
assert(!NONCHAR(ch) || ch != OPND(s));
if (ch == OPND(s))
FWD(aft, bef, 1);
break;
case OBOL:
@ -900,7 +1081,7 @@ register states aft; /* states already known reachable after */
OP(s = g->strip[pc+look]) != O_CH;
look += OPND(s))
assert(OP(s) == OOR2);
FWD(aft, aft, look);
FWD(aft, aft, look + 1);
}
break;
case OOR2: /* propagate OCH_'s marking */
@ -926,21 +1107,20 @@ register states aft; /* states already known reachable after */
/*
- print - print a set of states
== #ifdef REDEBUG
== static void print(struct match *m, char *caption, states st, \
== static void print(struct match *m, const char *caption, states st, \
== int ch, FILE *d);
== #endif
*/
static void
print(m, caption, st, ch, d)
struct match *m;
char *caption;
states st;
int ch;
FILE *d;
print(struct match *m,
const char *caption,
states st,
int ch,
FILE *d)
{
register struct re_guts *g = m->g;
register int i;
register int first = 1;
struct re_guts *g = m->g;
int i;
int first = 1;
if (!(m->eflags&REG_TRACE))
return;
@ -959,18 +1139,17 @@ FILE *d;
/*
- at - print current situation
== #ifdef REDEBUG
== static void at(struct match *m, char *title, char *start, char *stop, \
== sopno startst, sopno stopst);
== static void at(struct match *m, const char *title, const char *start, \
== const char *stop, sopno startst, sopno stopst);
== #endif
*/
static void
at(m, title, start, stop, startst, stopst)
struct match *m;
char *title;
char *start;
char *stop;
sopno startst;
sopno stopst;
at( struct match *m,
const char *title,
const char *start,
const char *stop,
sopno startst,
sopno stopst)
{
if (!(m->eflags&REG_TRACE))
return;
@ -985,7 +1164,7 @@ sopno stopst;
/*
- pchar - make a character printable
== #ifdef REDEBUG
== static char *pchar(int ch);
== static const char *pchar(int ch);
== #endif
*
* Is this identical to regchar() over in debug.c? Well, yes. But a
@ -993,13 +1172,12 @@ sopno stopst;
* a matching debug.o, and this is convenient. It all disappears in
* the non-debug compilation anyway, so it doesn't matter much.
*/
static char * /* -> representation */
pchar(ch)
int ch;
static const char * /* -> representation */
pchar(int ch)
{
static char pbuf[10];
if (isprint(ch) || ch == ' ')
if (isprint((uch)ch) || ch == ' ')
sprintf(pbuf, "%c", ch);
else
sprintf(pbuf, "\\%o", ch);