jehanne/sys/src/cmd/grep/comp.c

/*
 * This file is part of the UCB release of Plan 9. It is subject to the license
 * terms in the LICENSE file found in the top-level directory of this
 * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
 * part of the UCB release of Plan 9, including this file, may be copied,
 * modified, propagated, or distributed except according to the terms contained
 * in the LICENSE file.
 */

#include	"grep.h"

/*
 * incremental compiler.
 * add the branch c to the
 * state s.
 */
void
increment(State * s, int c)
{
	int i;
	State *t, **tt;
	Re *re1, *re2;

	nfollow = 0;
	gen++;
	matched = 0;
	for (i = 0; i < s->count; i++)
		fol1(s->re[i], c);
	qsort(follow, nfollow, sizeof(*follow), fcmp);
	for (tt = &state0; t = *tt;) {
		if (t->count > nfollow) {
			tt = &t->linkleft;
			goto cont;
		}
		if (t->count < nfollow) {
			tt = &t->linkright;
			goto cont;
		}
		for (i = 0; i < nfollow; i++) {
			re1 = t->re[i];
			re2 = follow[i];
			if (re1 > re2) {
				tt = &t->linkleft;
				goto cont;
			}
			if (re1 < re2) {
				tt = &t->linkright;
				goto cont;
			}
		}
		if (! !matched && !t->match) {
			tt = &t->linkleft;
			goto cont;
		}
		if (!matched && ! !t->match) {
			tt = &t->linkright;
			goto cont;
		}
		s->next[c] = t;
		return;
cont:	;
	}

	t = sal(nfollow);
	*tt = t;
	for (i = 0; i < nfollow; i++) {
		re1 = follow[i];
		t->re[i] = re1;
	}
	s->next[c] = t;
	t->match = matched;
}

int
fcmp(const void *va, const void *vb)
{
	Re **aa, **bb;
	Re *a, *b;

	aa = (Re **)va;
	bb = (Re **)vb;
	a = *aa;
	b = *bb;
	if (a > b)
		return 1;
	if (a < b)
		return -1;
	return 0;
}

void
fol1(Re * r, int c)
{
	Re *r1;

loop:
	if (r->gen == gen)
		return;
	if (nfollow >= maxfollow)
		error("nfollow");
	r->gen = gen;
	switch (r->type) {
		default:
			error("fol1");

		case Tcase:
			if (c >= 0 && c < 256)
				if (r1 = r->cases[c])
					follow[nfollow++] = r1;
			if (r = r->next)
				goto loop;
			break;

		case Talt:
		case Tor:
			fol1(r->alt, c);
			r = r->next;
			goto loop;

		case Tbegin:
			if (c == '\n' || c == Cbegin)
				follow[nfollow++] = r->next;
			break;

		case Tend:
			if (c == '\n') {
				if (r->next == 0) {
					matched = 1;
					break;
				}
				r = r->next;
				goto loop;
			}
			break;

		case Tclass:
			if (c >= r->lo && c <= r->hi)
				follow[nfollow++] = r->next;
			break;
	}
}

Rune tab1[] = {
	0x007f,
	0x07ff,
};

Rune tab2[] = {
	0x003f,
	0x0fff,
};

Re2
rclass(Rune p0, Rune p1)
{
	char xc0[6], xc1[6];
	int i, n, m;
	Re2 x;

	if (p0 > p1)
		return re2char(0xff, 0xff);	// no match

	/*
	 * bust range into same length
	 * character sequences
	 */
	for (i = 0; i < nelem(tab1); i++) {
		m = tab1[i];
		if (p0 <= m && p1 > m)
			return re2or(rclass(p0, m), rclass(m + 1, p1));
	}

	/*
	 * bust range into part of a single page
	 * or into full pages
	 */
	for (i = 0; i < nelem(tab2); i++) {
		m = tab2[i];
		if ((p0 & ~m) != (p1 & ~m)) {
			if ((p0 & m) != 0)
				return re2or(rclass(p0, p0 | m), rclass((p0 | m) + 1, p1));
			if ((p1 & m) != m)
				return re2or(rclass(p0, (p1 & ~m) - 1), rclass(p1 & ~m, p1));
		}
	}

	n = runetochar(xc0, &p0);
	i = runetochar(xc1, &p1);
	if (i != n)
		error("length");

	x = re2char(xc0[0], xc1[0]);
	for (i = 1; i < n; i++)
		x = re2cat(x, re2char(xc0[i], xc1[i]));
	return x;
}

int
pcmp(const void *va, const void *vb)
{
	int n;
	const Rune *a, *b;

	a = va;
	b = vb;

	n = a[0] - b[0];
	if (n)
		return n;
	return a[1] - b[1];
}

/*
 * convert character chass into
 * run-pair ranges of matches.
 * this is 10646/utf specific and
 * needs to be changed for some
 * other input character set.
 * this is the key to a fast
 * regular search of characters
 * by looking at sequential bytes.
 */
Re2
re2class(char *s)
{
	Rune pairs[200 + 2], *p, *q, ov;
	int nc;
	Re2 x;

	nc = 0;
	if (*s == '^') {
		nc = 1;
		s++;
	}

	p = pairs;
	s += chartorune(p, s);
	for (;;) {
		if (*p == '\\')
			s += chartorune(p, s);
		if (*p == 0)
			break;
		p[1] = *p;
		p += 2;
		if (p >= pairs + nelem(pairs) - 2)
			error("class too big");
		s += chartorune(p, s);
		if (*p != '-')
			continue;
		s += chartorune(p, s);
		if (*p == '\\')
			s += chartorune(p, s);
		if (*p == 0)
			break;
		p[-1] = *p;
		s += chartorune(p, s);
	}
	*p = 0;
	qsort(pairs, (p - pairs) / 2, 2 * sizeof(*pairs), pcmp);

	q = pairs;
	for (p = pairs + 2; *p; p += 2) {
		if (p[0] > p[1])
			continue;
		if (p[0] > q[1] || p[1] < q[0]) {
			q[2] = p[0];
			q[3] = p[1];
			q += 2;
			continue;
		}
		if (p[0] < q[0])
			q[0] = p[0];
		if (p[1] > q[1])
			q[1] = p[1];
	}
	q[2] = 0;

	p = pairs;
	if (nc) {
		x = rclass(0, p[0] - 1);
		ov = p[1] + 1;
		for (p += 2; *p; p += 2) {
			x = re2or(x, rclass(ov, p[0] - 1));
			ov = p[1] + 1;
		}
		x = re2or(x, rclass(ov, Runemask));
	} else {
		x = rclass(p[0], p[1]);
		for (p += 2; *p; p += 2)
			x = re2or(x, rclass(p[0], p[1]));
	}
	return x;
}