* regex/engine.c (step): Drop Cygwin-specific definition.
(NONCHAR): Better cast here to make the test work. Move comment from step here. (matcher): Disable skipping initial string in multibyte case. * regex/regcomp.c (p_bracket): Don't simplify singleton in the invert case. (p_b_term): Handle early end of pattern after dash in bracket expression. (singleton): Don't ignore the wides just because there's already a singleton in the single byte chars. Fix condition for a singleton wide accordingly. (findmust): Check for LC_CTYPE charset, rather than LC_COLLATE charset. * regex2.h (CHIN): Fix condition in the icase & invert case. (ISWORD): Fix wrong cast to unsigned char.
This commit is contained in:
		| @@ -1,3 +1,20 @@ | |||||||
|  | 2010-02-11  Corinna Vinschen  <corinna@vinschen.de> | ||||||
|  |  | ||||||
|  | 	* regex/engine.c (step): Drop Cygwin-specific definition. | ||||||
|  | 	(NONCHAR): Better cast here to make the test work.  Move comment | ||||||
|  | 	from step here. | ||||||
|  | 	(matcher): Disable skipping initial string in multibyte case. | ||||||
|  | 	* regex/regcomp.c (p_bracket): Don't simplify singleton in the invert | ||||||
|  | 	case. | ||||||
|  | 	(p_b_term): Handle early end of pattern after dash in bracket | ||||||
|  | 	expression. | ||||||
|  | 	(singleton): Don't ignore the wides just because there's already a | ||||||
|  | 	singleton in the single byte chars.  Fix condition for a singleton | ||||||
|  | 	wide accordingly. | ||||||
|  | 	(findmust): Check for LC_CTYPE charset, rather than LC_COLLATE charset. | ||||||
|  | 	* regex2.h (CHIN): Fix condition in the icase & invert case. | ||||||
|  | 	(ISWORD): Fix wrong cast to unsigned char. | ||||||
|  |  | ||||||
| 2010-02-11  Andy Koppe  <andy.koppe@gmail.com> | 2010-02-11  Andy Koppe  <andy.koppe@gmail.com> | ||||||
|  |  | ||||||
| 	* nlsfuncs.cc (initial_setlocale): Move check whether charset has | 	* nlsfuncs.cc (initial_setlocale): Move check whether charset has | ||||||
|   | |||||||
| @@ -106,11 +106,7 @@ static const char *dissect(struct match *m, const char *start, const char *stop, | |||||||
| static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); | static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); | ||||||
| static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); | static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); | ||||||
| static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); | static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); | ||||||
| #ifdef __CYGWIN__ |  | ||||||
| static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft); |  | ||||||
| #else |  | ||||||
| static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); | static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); | ||||||
| #endif |  | ||||||
| #define MAX_RECURSION	100 | #define MAX_RECURSION	100 | ||||||
| #define	BOL	(OUT-1) | #define	BOL	(OUT-1) | ||||||
| #define	EOL	(BOL-1) | #define	EOL	(BOL-1) | ||||||
| @@ -119,7 +115,10 @@ static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_ | |||||||
| #define	BOW	(BOL-4) | #define	BOW	(BOL-4) | ||||||
| #define	EOW	(BOL-5) | #define	EOW	(BOL-5) | ||||||
| #define	BADCHAR	(BOL-6) | #define	BADCHAR	(BOL-6) | ||||||
| #define	NONCHAR(c)	((c) <= OUT) | /* When using wint_t, which is defined as unsigned int on BSD, | ||||||
|  |    as well as on Cygwin or Linux, the NONCHAR test is broken without | ||||||
|  |    the below cast.  I'm wondering how this is supposed to work at all... */ | ||||||
|  | #define	NONCHAR(c)	((int)(c) <= OUT) | ||||||
| #ifdef REDEBUG | #ifdef REDEBUG | ||||||
| static void print(struct match *m, const char *caption, states st, int ch, FILE *d); | static void print(struct match *m, const char *caption, states st, int ch, FILE *d); | ||||||
| #endif | #endif | ||||||
| @@ -248,9 +247,12 @@ matcher(struct re_guts *g, | |||||||
| 	ZAPSTATE(&m->mbs); | 	ZAPSTATE(&m->mbs); | ||||||
|  |  | ||||||
| 	/* Adjust start according to moffset, to speed things up */ | 	/* Adjust start according to moffset, to speed things up */ | ||||||
|  | #ifndef MNAMES | ||||||
|  | 	/* The code evaluating moffset doesn't seem to work right | ||||||
|  | 	   in the multibyte case. */ | ||||||
| 	if (g->moffset > -1) | 	if (g->moffset > -1) | ||||||
| 		start = ((dp - g->moffset) < start) ? start : dp - g->moffset; | 		start = ((dp - g->moffset) < start) ? start : dp - g->moffset; | ||||||
|  | #endif | ||||||
| 	SP("mloop", m->st, *start); | 	SP("mloop", m->st, *start); | ||||||
|  |  | ||||||
| 	/* this loop does only one repetition except for backrefs */ | 	/* this loop does only one repetition except for backrefs */ | ||||||
| @@ -993,14 +995,7 @@ step(struct re_guts *g, | |||||||
| 	sopno start,		/* start state within strip */ | 	sopno start,		/* start state within strip */ | ||||||
| 	sopno stop,		/* state after stop state within strip */ | 	sopno stop,		/* state after stop state within strip */ | ||||||
| 	states bef,		/* states reachable before */ | 	states bef,		/* states reachable before */ | ||||||
| #ifdef __CYGWIN__ |  | ||||||
| 	/* When using wint_t, which is defined as unsigned int on BSD, |  | ||||||
| 	   as well as on Cygwin or Linux, the NONCHAR test is broken. |  | ||||||
| 	   I'm wondering how this is supposed to work at all... */ |  | ||||||
| 	int ch,			/* character or NONCHAR code */ |  | ||||||
| #else |  | ||||||
| 	wint_t ch,		/* character or NONCHAR code */ | 	wint_t ch,		/* character or NONCHAR code */ | ||||||
| #endif |  | ||||||
| 	states aft)		/* states already known reachable after */ | 	states aft)		/* states already known reachable after */ | ||||||
| { | { | ||||||
| 	cset *cs; | 	cset *cs; | ||||||
|   | |||||||
| @@ -762,7 +762,8 @@ p_bracket(struct parse *p) | |||||||
| 	if (cs->invert && p->g->cflags&REG_NEWLINE) | 	if (cs->invert && p->g->cflags&REG_NEWLINE) | ||||||
| 		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); | 		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); | ||||||
|  |  | ||||||
| 	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */ | 	if ((ch = singleton(cs)) != OUT		/* optimize singleton sets */ | ||||||
|  | 	     && cs->invert == 0) {		/* But not in invert case. */ | ||||||
| 		ordinary(p, ch); | 		ordinary(p, ch); | ||||||
| 		freeset(p, cs); | 		freeset(p, cs); | ||||||
| 	} else | 	} else | ||||||
| @@ -833,6 +834,9 @@ p_b_term(struct parse *p, cset *cs) | |||||||
| 				finish = '-'; | 				finish = '-'; | ||||||
| 			else | 			else | ||||||
| 				finish = p_b_symbol(p); | 				finish = p_b_symbol(p); | ||||||
|  | 		} else if (SEE('-') && !MORE2()) { | ||||||
|  | 			SETERROR(REG_EBRACK); | ||||||
|  | 			return; | ||||||
| 		} else | 		} else | ||||||
| 			finish = start; | 			finish = start; | ||||||
| 		if (start == finish) | 		if (start == finish) | ||||||
| @@ -1212,9 +1216,9 @@ singleton(cset *cs) | |||||||
| 			n++; | 			n++; | ||||||
| 			s = i; | 			s = i; | ||||||
| 		} | 		} | ||||||
| 	if (n == 1) | 	if (n == 1 && cs->nwides == 0) | ||||||
| 		return (s); | 		return (s); | ||||||
| 	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && | 	if (n == 0 && cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && | ||||||
| 	    cs->icase == 0) | 	    cs->icase == 0) | ||||||
| 		return (cs->wides[0]); | 		return (cs->wides[0]); | ||||||
| 	/* Don't bother handling the other cases. */ | 	/* Don't bother handling the other cases. */ | ||||||
| @@ -1467,7 +1471,7 @@ findmust(struct parse *p, struct re_guts *g) | |||||||
| 	 */ | 	 */ | ||||||
| 	if (MB_CUR_MAX > 1 && | 	if (MB_CUR_MAX > 1 && | ||||||
| #ifdef __CYGWIN__ | #ifdef __CYGWIN__ | ||||||
| 	    strcmp(collate_charset, "UTF-8") != 0) | 	    strcmp(__locale_charset (), "UTF-8") != 0) | ||||||
| #else | #else | ||||||
| 	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) | 	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -151,10 +151,14 @@ CHIN(cset *cs, wint_t ch) | |||||||
| 	if (ch < NC) | 	if (ch < NC) | ||||||
| 		return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^ | 		return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^ | ||||||
| 		    cs->invert); | 		    cs->invert); | ||||||
| 	else if (cs->icase) | 	else if (cs->icase) { | ||||||
| 		return (CHIN1(cs, ch) || CHIN1(cs, towlower(ch)) || | 		if (cs->invert) | ||||||
| 		    CHIN1(cs, towupper(ch))); | 			return (CHIN1(cs, ch) && CHIN1(cs, towlower(ch)) && | ||||||
| 	else | 			    CHIN1(cs, towupper(ch))); | ||||||
|  | 		else | ||||||
|  | 			return (CHIN1(cs, ch) || CHIN1(cs, towlower(ch)) || | ||||||
|  | 			    CHIN1(cs, towupper(ch))); | ||||||
|  | 	} else | ||||||
| 		return (CHIN1(cs, ch)); | 		return (CHIN1(cs, ch)); | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -189,4 +193,4 @@ struct re_guts { | |||||||
|  |  | ||||||
| /* misc utilities */ | /* misc utilities */ | ||||||
| #define	OUT	(CHAR_MIN - 1)	/* a non-character value */ | #define	OUT	(CHAR_MIN - 1)	/* a non-character value */ | ||||||
| #define ISWORD(c)       (iswalnum((uch)(c)) || (c) == '_') | #define ISWORD(c)       (iswalnum((wint_t)(c)) || (c) == '_') | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user