Make strstr and strcasestr O(n), not O(n^2); add memmem.
* libc/string/str-two-way.h: New file. * libc/string/memmem.c (memmem): New file. * libc/include/string.h (memmem): Declare for all platforms. * libc/string/strstr.c (strstr): Provide O(n) implementation when not optimizing for space. * libc/string/strcasestr.c (strcasestr): Likewise. * libc/string/Makefile.am (ELIX_SOURCES): Rename to... (ELIX_2_SOURCES): ...this. (ELIX_4_SOURCES): New category, for memmem. (lib_a_SOURCES, libstring_la_SOURCES): Build new file. (CHEWOUT_FILES): Build documentation for memmem. * libc/string/strings.tex: Include new docs.
This commit is contained in:
		| @@ -1,3 +1,19 @@ | ||||
| 2008-01-11  Eric Blake  <ebb9@byu.net> | ||||
|  | ||||
| 	Make strstr and strcasestr O(n), not O(n^2); add memmem. | ||||
| 	* libc/string/str-two-way.h: New file. | ||||
| 	* libc/string/memmem.c (memmem): New file. | ||||
| 	* libc/include/string.h (memmem): Declare for all platforms. | ||||
| 	* libc/string/strstr.c (strstr): Provide O(n) implementation when | ||||
| 	not optimizing for space. | ||||
| 	* libc/string/strcasestr.c (strcasestr): Likewise. | ||||
| 	* libc/string/Makefile.am (ELIX_SOURCES): Rename to... | ||||
| 	(ELIX_2_SOURCES): ...this. | ||||
| 	(ELIX_4_SOURCES): New category, for memmem. | ||||
| 	(lib_a_SOURCES, libstring_la_SOURCES): Build new file. | ||||
| 	(CHEWOUT_FILES): Build documentation for memmem. | ||||
| 	* libc/string/strings.tex: Include new docs. | ||||
|  | ||||
| 2008-01-08  Jeff Johnston  <jjohnstn@redhat.com> | ||||
|  | ||||
| 	* libc/machine/m68k/memcpy.S: Remove % from register references | ||||
|   | ||||
| @@ -56,9 +56,7 @@ int	 _EXFUN(ffs,(int)); | ||||
| char 	*_EXFUN(index,(const char *, int)); | ||||
| _PTR	 _EXFUN(memccpy,(_PTR, const _PTR, int, size_t)); | ||||
| _PTR	 _EXFUN(mempcpy,(_PTR, const _PTR, size_t)); | ||||
| #ifdef __CYGWIN__ | ||||
| extern void *memmem (__const void *, size_t,  __const void *, size_t); | ||||
| #endif | ||||
| _PTR	 _EXFUN(memmem, (const _PTR, size_t, const _PTR, size_t)); | ||||
| char 	*_EXFUN(rindex,(const char *, int)); | ||||
| char 	*_EXFUN(stpcpy,(char *, const char *)); | ||||
| char 	*_EXFUN(stpncpy,(char *, const char *, size_t)); | ||||
|   | ||||
| @@ -72,9 +72,9 @@ GENERAL_SOURCES = \ | ||||
| 	wmemset.c | ||||
|  | ||||
| if ELIX_LEVEL_1 | ||||
| ELIX_SOURCES = | ||||
| ELIX_2_SOURCES = | ||||
| else | ||||
| ELIX_SOURCES = \ | ||||
| ELIX_2_SOURCES = \ | ||||
| 	bcmp.c \ | ||||
| 	memccpy.c \ | ||||
| 	mempcpy.c \ | ||||
| @@ -87,15 +87,30 @@ ELIX_SOURCES = \ | ||||
| 	wcpncpy.c \ | ||||
| endif | ||||
|  | ||||
| if ELIX_LEVEL_1 | ||||
| ELIX_4_SOURCES = | ||||
| else | ||||
| if ELIX_LEVEL_2 | ||||
| ELIX_4_SOURCES = | ||||
| else | ||||
| if ELIX_LEVEL_3 | ||||
| ELIX_4_SOURCES = | ||||
| else | ||||
| ELIX_4_SOURCES = \ | ||||
| 	memmem.c | ||||
| endif !ELIX_LEVEL_3 | ||||
| endif !ELIX_LEVEL_2 | ||||
| endif !ELIX_LEVEL_1 | ||||
|  | ||||
| libstring_la_LDFLAGS = -Xcompiler -nostdlib | ||||
|  | ||||
| if USE_LIBTOOL | ||||
| noinst_LTLIBRARIES = libstring.la | ||||
| libstring_la_SOURCES = $(GENERAL_SOURCES) $(ELIX_SOURCES) | ||||
| libstring_la_SOURCES = $(GENERAL_SOURCES) $(ELIX_2_SOURCES) $(ELIX_4_SOURCES) | ||||
| noinst_DATA = objectlist.awk.in | ||||
| else | ||||
| noinst_LIBRARIES = lib.a | ||||
| lib_a_SOURCES = $(GENERAL_SOURCES) $(ELIX_SOURCES) | ||||
| lib_a_SOURCES = $(GENERAL_SOURCES) $(ELIX_2_SOURCES) $(ELIX_4_SOURCES) | ||||
| lib_a_CFLAGS = $(AM_CFLAGS) | ||||
| noinst_DATA = | ||||
| endif # USE_LIBTOOL | ||||
| @@ -117,7 +132,8 @@ wcslcat.def	wcslcpy.def	wcslen.def	wcsncat.def \ | ||||
| wcsncmp.def	wcsncpy.def	wcsnlen.def	wcspbrk.def \ | ||||
| wcsrchr.def	wcsspn.def	wcsstr.def \ | ||||
| wcswidth.def	wcsxfrm.def	wcwidth.def	wmemchr.def \ | ||||
| wmemcmp.def	wmemcpy.def	wmemmove.def	wmemset.def | ||||
| wmemcmp.def	wmemcpy.def	wmemmove.def	wmemset.def \ | ||||
| memmem.def | ||||
|  | ||||
| SUFFIXES = .def | ||||
|  | ||||
|   | ||||
							
								
								
									
										102
									
								
								newlib/libc/string/memmem.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										102
									
								
								newlib/libc/string/memmem.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,102 @@ | ||||
| /* Byte-wise substring search, using the Two-Way algorithm. | ||||
|  * Copyright (C) 2008 Eric Blake | ||||
|  * Permission to use, copy, modify, and distribute this software | ||||
|  * is freely granted, provided that this notice is preserved. | ||||
|  */ | ||||
|  | ||||
| /* | ||||
| FUNCTION | ||||
| 	<<memmem>>---find memory segment | ||||
|  | ||||
| INDEX | ||||
| 	memmem | ||||
|  | ||||
| ANSI_SYNOPSIS | ||||
| 	#include <string.h> | ||||
| 	void *memmem(const void *<[s1]>, size_t <[l1]>, const void *<[s2]>, | ||||
| 		     size_t <[l2]>); | ||||
|  | ||||
| DESCRIPTION | ||||
|  | ||||
| 	Locates the first occurrence in the memory region pointed to | ||||
| 	by <[s1]> with length <[l1]> of the sequence of bytes pointed | ||||
| 	to by <[s2]> of length <[l2]>.  If you already know the | ||||
| 	lengths of your haystack and needle, <<memmem>> can be much | ||||
| 	faster than <<strstr>>. | ||||
|  | ||||
| RETURNS | ||||
| 	Returns a pointer to the located segment, or a null pointer if | ||||
| 	<[s2]> is not found. If <[l2]> is 0, <[s1]> is returned. | ||||
|  | ||||
| PORTABILITY | ||||
| <<memmem>> is a newlib extension. | ||||
|  | ||||
| <<memmem>> requires no supporting OS subroutines. | ||||
|  | ||||
| QUICKREF | ||||
| 	memmem pure | ||||
| */ | ||||
|  | ||||
| #include <string.h> | ||||
|  | ||||
| #if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__) | ||||
| # define RETURN_TYPE void * | ||||
| # define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l)) | ||||
| # include "str-two-way.h" | ||||
| #endif | ||||
|  | ||||
| /* Find the first occurrence of the NEEDLE_LEN bytes at NEEDLE_START | ||||
|    inside the HAYSTACK_LEN bytes at HAYSTACK_START.  Returns a pointer | ||||
|    to the match, HAYSTACK_START itself when NEEDLE_LEN is zero, or | ||||
|    NULL when there is no match.  */ | ||||
| void * | ||||
| _DEFUN (memmem, (haystack_start, haystack_len, needle_start, needle_len), | ||||
| 	const void *haystack_start _AND | ||||
| 	size_t haystack_len _AND | ||||
| 	const void *needle_start _AND | ||||
| 	size_t needle_len) | ||||
| { | ||||
|   /* Raw memory is examined as 'unsigned char', never as plain 'char' | ||||
|      (ISO C 99 section 6.2.6.1).  */ | ||||
|   const unsigned char *hay = (const unsigned char *) haystack_start; | ||||
|   const unsigned char *pat = (const unsigned char *) needle_start; | ||||
|  | ||||
|   /* An empty needle is deemed to match at the very start.  */ | ||||
|   if (!needle_len) | ||||
|     return (void *) hay; | ||||
|  | ||||
| #if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) | ||||
|  | ||||
|   /* Compact variant: try every offset in turn.  Quadratic in the | ||||
|      worst case.  */ | ||||
|   for (; haystack_len >= needle_len; ++hay, --haystack_len) | ||||
|     if (memcmp (hay, pat, needle_len) == 0) | ||||
|       return (void *) hay; | ||||
|   return NULL; | ||||
|  | ||||
| #else /* compilation for speed */ | ||||
|  | ||||
|   /* Fast variant: more code, but guaranteed linear time.  */ | ||||
|  | ||||
|   /* A needle longer than the haystack cannot match; without this | ||||
|      check the search loops could run through the whole of memory.  */ | ||||
|   if (needle_len > haystack_len) | ||||
|     return NULL; | ||||
|  | ||||
|   /* Long needles go straight to the table-driven search, which can | ||||
|      often achieve sublinear performance; memchr is not used for | ||||
|      them.  */ | ||||
|   if (needle_len >= LONG_NEEDLE_THRESHOLD) | ||||
|     return two_way_long_needle (hay, haystack_len, pat, needle_len); | ||||
|  | ||||
|   /* Short needles: let memchr skip ahead to the first byte that could | ||||
|      begin a match, since it is linear with a smaller coefficient.  */ | ||||
|   hay = memchr (hay, *pat, haystack_len); | ||||
|   if (hay == NULL) | ||||
|     return NULL; | ||||
|   if (needle_len == 1) | ||||
|     return (void *) hay; | ||||
|  | ||||
|   /* Account for the bytes memchr skipped, then re-check the length.  */ | ||||
|   haystack_len -= hay - (const unsigned char *) haystack_start; | ||||
|   if (needle_len > haystack_len) | ||||
|     return NULL; | ||||
|   return two_way_short_needle (hay, haystack_len, pat, needle_len); | ||||
| #endif /* compilation for speed */ | ||||
| } | ||||
							
								
								
									
										415
									
								
								newlib/libc/string/str-two-way.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										415
									
								
								newlib/libc/string/str-two-way.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,415 @@ | ||||
| /* Byte-wise substring search, using the Two-Way algorithm. | ||||
|  * Copyright (C) 2008 Eric Blake | ||||
|  * Permission to use, copy, modify, and distribute this software | ||||
|  * is freely granted, provided that this notice is preserved. | ||||
|  */ | ||||
|  | ||||
|  | ||||
| /* Before including this file, you need to include <string.h>, and define: | ||||
|      RETURN_TYPE		A macro that expands to the return type. | ||||
|      AVAILABLE(h, h_l, j, n_l)	A macro that returns nonzero if there are | ||||
| 				at least N_L bytes left starting at | ||||
| 				H[J].  H is 'unsigned char *', H_L, J, | ||||
| 				and N_L are 'size_t'; H_L is an | ||||
| 				lvalue.  For NUL-terminated searches, | ||||
| 				H_L can be modified each iteration to | ||||
| 				avoid having to compute the end of H | ||||
| 				up front. | ||||
|  | ||||
|   For case-insensitivity, you may optionally define: | ||||
|      CMP_FUNC(p1, p2, l)	A macro that returns 0 iff the first L | ||||
| 				characters of P1 and P2 are equal. | ||||
|      CANON_ELEMENT(c)		A macro that canonicalizes an element | ||||
| 				right after it has been fetched from | ||||
| 				one of the two strings.  The argument | ||||
| 				is an 'unsigned char'; the result must | ||||
| 				be an 'unsigned char' as well. | ||||
|  | ||||
|   This file undefines the macros documented above, and defines | ||||
|   LONG_NEEDLE_THRESHOLD. | ||||
| */ | ||||
|  | ||||
| #include <limits.h> | ||||
| #include <stdint.h> | ||||
|  | ||||
| /* We use the Two-Way string matching algorithm, which guarantees | ||||
|    linear complexity with constant space.  Additionally, for long | ||||
|    needles, we also use a bad character shift table similar to the | ||||
|    Boyer-Moore algorithm to achieve improved (potentially sub-linear) | ||||
|    performance. | ||||
|  | ||||
|    See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 | ||||
|    and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm | ||||
| */ | ||||
|  | ||||
| /* Point at which computing a bad-byte shift table is likely to be | ||||
|    worthwhile.  Small needles should not compute a table, since it | ||||
|    adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a | ||||
|    speedup no greater than a factor of NEEDLE_LEN.  The larger the | ||||
|    needle, the better the potential performance gain.  On the other | ||||
|    hand, on non-POSIX systems with CHAR_BIT larger than eight, the | ||||
|    memory required for the table is prohibitive.  */ | ||||
| #if CHAR_BIT < 10 | ||||
| # define LONG_NEEDLE_THRESHOLD 32U | ||||
| #else | ||||
| # define LONG_NEEDLE_THRESHOLD SIZE_MAX | ||||
| #endif | ||||
|  | ||||
| #define MAX(a, b) ((a < b) ? (b) : (a)) | ||||
|  | ||||
| #ifndef CANON_ELEMENT | ||||
| # define CANON_ELEMENT(c) c | ||||
| #endif | ||||
| #ifndef CMP_FUNC | ||||
| # define CMP_FUNC memcmp | ||||
| #endif | ||||
|  | ||||
| /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN. | ||||
|    Return the index of the first byte in the right half, and set | ||||
|    *PERIOD to the global period of the right half. | ||||
|  | ||||
|    The global period of a string is the smallest index (possibly its | ||||
|    length) at which all remaining bytes in the string are repetitions | ||||
|    of the prefix (the last repetition may be a subset of the prefix). | ||||
|  | ||||
|    When NEEDLE is factored into two halves, a local period is the | ||||
|    length of the smallest word that shares a suffix with the left half | ||||
|    and shares a prefix with the right half.  All factorizations of a | ||||
|    non-empty NEEDLE have a local period of at least 1 and no greater | ||||
|    than NEEDLE_LEN. | ||||
|  | ||||
|    A critical factorization has the property that the local period | ||||
|    equals the global period.  All strings have at least one critical | ||||
|    factorization with the left half smaller than the global period. | ||||
|  | ||||
|    Given an ordered alphabet, a critical factorization can be computed | ||||
|    in linear time, with 2 * NEEDLE_LEN comparisons, by computing the | ||||
|    larger of two ordered maximal suffixes.  The ordered maximal | ||||
|    suffixes are determined by lexicographic comparison of | ||||
|    periodicity.  */ | ||||
| static size_t | ||||
| critical_factorization (const unsigned char *needle, size_t needle_len, | ||||
| 			size_t *period) | ||||
| { | ||||
|   /* Index of last byte of left half, or SIZE_MAX when the left half | ||||
|      is empty.  max_suffix is produced by the first (lexicographic) | ||||
|      pass below, max_suffix_rev by the second (reversed comparison) | ||||
|      pass.  */ | ||||
|   size_t max_suffix, max_suffix_rev; | ||||
|   size_t j; /* Index into NEEDLE for current candidate suffix.  */ | ||||
|   size_t k; /* Offset into current period.  */ | ||||
|   size_t p; /* Intermediate period.  */ | ||||
|   unsigned char a, b; /* Current comparison bytes.  */ | ||||
|  | ||||
|   /* Invariants: | ||||
|      0 <= j < NEEDLE_LEN - 1 | ||||
|      -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) | ||||
|      min(max_suffix, max_suffix_rev) < global period of NEEDLE | ||||
|      1 <= p <= global period of NEEDLE | ||||
|      p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] | ||||
|      1 <= k <= p | ||||
|   */ | ||||
|  | ||||
|   /* Perform lexicographic search.  max_suffix starts at SIZE_MAX, so | ||||
|      the unsigned sum max_suffix + k wraps around to k - 1 and indexes | ||||
|      NEEDLE from its first byte.  */ | ||||
|   max_suffix = SIZE_MAX; | ||||
|   j = 0; | ||||
|   k = p = 1; | ||||
|   while (j + k < needle_len) | ||||
|     { | ||||
|       a = CANON_ELEMENT (needle[j + k]); | ||||
|       b = CANON_ELEMENT (needle[max_suffix + k]); | ||||
|       if (a < b) | ||||
| 	{ | ||||
| 	  /* Suffix is smaller, period is entire prefix so far.  */ | ||||
| 	  j += k; | ||||
| 	  k = 1; | ||||
| 	  p = j - max_suffix; | ||||
| 	} | ||||
|       else if (a == b) | ||||
| 	{ | ||||
| 	  /* Advance through repetition of the current period.  */ | ||||
| 	  if (k != p) | ||||
| 	    ++k; | ||||
| 	  else | ||||
| 	    { | ||||
| 	      j += p; | ||||
| 	      k = 1; | ||||
| 	    } | ||||
| 	} | ||||
|       else /* b < a */ | ||||
| 	{ | ||||
| 	  /* Suffix is larger, start over from current location.  */ | ||||
| 	  max_suffix = j++; | ||||
| 	  k = p = 1; | ||||
| 	} | ||||
|     } | ||||
|   *period = p; | ||||
|  | ||||
|   /* Perform reverse lexicographic search.  This is the same loop as | ||||
|      above with the sense of the byte comparison inverted; P is reused | ||||
|      and ends up holding the period found by this second pass.  */ | ||||
|   max_suffix_rev = SIZE_MAX; | ||||
|   j = 0; | ||||
|   k = p = 1; | ||||
|   while (j + k < needle_len) | ||||
|     { | ||||
|       a = CANON_ELEMENT (needle[j + k]); | ||||
|       b = CANON_ELEMENT (needle[max_suffix_rev + k]); | ||||
|       if (b < a) | ||||
| 	{ | ||||
| 	  /* Suffix is smaller, period is entire prefix so far.  */ | ||||
| 	  j += k; | ||||
| 	  k = 1; | ||||
| 	  p = j - max_suffix_rev; | ||||
| 	} | ||||
|       else if (a == b) | ||||
| 	{ | ||||
| 	  /* Advance through repetition of the current period.  */ | ||||
| 	  if (k != p) | ||||
| 	    ++k; | ||||
| 	  else | ||||
| 	    { | ||||
| 	      j += p; | ||||
| 	      k = 1; | ||||
| 	    } | ||||
| 	} | ||||
|       else /* a < b */ | ||||
| 	{ | ||||
| 	  /* Suffix is larger, start over from current location.  */ | ||||
| 	  max_suffix_rev = j++; | ||||
| 	  k = p = 1; | ||||
| 	} | ||||
|     } | ||||
|  | ||||
|   /* Choose the shorter suffix, i.e. the larger of the two split | ||||
|      indices.  Return the first byte of the right half, rather than | ||||
|      the last byte of the left half; adding 1 also turns a SIZE_MAX | ||||
|      marker into index 0.  *PERIOD already holds the period from the | ||||
|      first pass, and is overwritten with the second pass's period | ||||
|      only when the second pass's split is the one returned.  */ | ||||
|   if (max_suffix_rev + 1 < max_suffix + 1) | ||||
|     return max_suffix + 1; | ||||
|   *period = p; | ||||
|   return max_suffix_rev + 1; | ||||
| } | ||||
|  | ||||
| /* Return the first location of non-empty NEEDLE within HAYSTACK, or | ||||
|    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This | ||||
|    method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD. | ||||
|    Performance is guaranteed to be linear, with an initialization cost | ||||
|    of 2 * NEEDLE_LEN comparisons. | ||||
|  | ||||
|    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at | ||||
|    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. | ||||
|    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * | ||||
|    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */ | ||||
| static RETURN_TYPE | ||||
| two_way_short_needle (const unsigned char *haystack, size_t haystack_len, | ||||
| 		      const unsigned char *needle, size_t needle_len) | ||||
| { | ||||
|   size_t i; /* Index into current byte of NEEDLE.  */ | ||||
|   size_t j; /* Index into current window of HAYSTACK.  */ | ||||
|   size_t period; /* The period of the right half of needle.  */ | ||||
|   size_t suffix; /* The index of the right half of needle.  */ | ||||
|  | ||||
|   /* Factor the needle into two halves, such that the left half is | ||||
|      smaller than the global period, and the right half is | ||||
|      periodic (with a period as large as NEEDLE_LEN - suffix).  */ | ||||
|   suffix = critical_factorization (needle, needle_len, &period); | ||||
|  | ||||
|   /* Perform the search.  Each iteration compares the right half | ||||
|      first.  The CMP_FUNC test below checks whether the first SUFFIX | ||||
|      bytes of NEEDLE equal the SUFFIX bytes starting at offset PERIOD, | ||||
|      and so selects between the periodic and the non-periodic search | ||||
|      loop.  */ | ||||
|   if (CMP_FUNC (needle, needle + period, suffix) == 0) | ||||
|     { | ||||
|       /* Entire needle is periodic; a mismatch can only advance by the | ||||
| 	 period, so use memory to avoid rescanning known occurrences | ||||
| 	 of the period.  */ | ||||
|       size_t memory = 0; | ||||
|       j = 0; | ||||
|       while (AVAILABLE (haystack, haystack_len, j, needle_len)) | ||||
| 	{ | ||||
| 	  /* Scan for matches in right half, starting at whichever is | ||||
| 	     further along: the split point or the remembered prefix | ||||
| 	     length.  */ | ||||
| 	  i = MAX (suffix, memory); | ||||
| 	  while (i < needle_len && (CANON_ELEMENT (needle[i]) | ||||
| 				    == CANON_ELEMENT (haystack[i + j]))) | ||||
| 	    ++i; | ||||
| 	  if (needle_len <= i) | ||||
| 	    { | ||||
| 	      /* Scan for matches in left half, walking backwards from | ||||
| 		 SUFFIX - 1 and stopping once index MEMORY has been | ||||
| 		 compared.  I is unsigned and may wrap to SIZE_MAX, | ||||
| 		 which is why the tests are written against I + 1.  */ | ||||
| 	      i = suffix - 1; | ||||
| 	      while (memory < i + 1 && (CANON_ELEMENT (needle[i]) | ||||
| 					== CANON_ELEMENT (haystack[i + j]))) | ||||
| 		--i; | ||||
| 	      if (i + 1 < memory + 1) | ||||
| 		return (RETURN_TYPE) (haystack + j); | ||||
| 	      /* No match, so remember how many repetitions of period | ||||
| 		 on the right half were scanned.  */ | ||||
| 	      j += period; | ||||
| 	      memory = needle_len - period; | ||||
| 	    } | ||||
| 	  else | ||||
| 	    { | ||||
| 	      j += i - suffix + 1; | ||||
| 	      memory = 0; | ||||
| 	    } | ||||
| 	} | ||||
|     } | ||||
|   else | ||||
|     { | ||||
|       /* The two halves of needle are distinct; no extra memory is | ||||
| 	 required, and any mismatch results in a maximal shift. | ||||
| 	 PERIOD is recomputed here and is the amount the window | ||||
| 	 advances when the right half matched but the left half did | ||||
| 	 not.  */ | ||||
|       period = MAX (suffix, needle_len - suffix) + 1; | ||||
|       j = 0; | ||||
|       while (AVAILABLE (haystack, haystack_len, j, needle_len)) | ||||
| 	{ | ||||
| 	  /* Scan for matches in right half.  */ | ||||
| 	  i = suffix; | ||||
| 	  while (i < needle_len && (CANON_ELEMENT (needle[i]) | ||||
| 				    == CANON_ELEMENT (haystack[i + j]))) | ||||
| 	    ++i; | ||||
| 	  if (needle_len <= i) | ||||
| 	    { | ||||
| 	      /* Scan for matches in left half.  I wraps to SIZE_MAX | ||||
| 		 once every byte of the left half has been compared | ||||
| 		 equal, which is the success condition below.  */ | ||||
| 	      i = suffix - 1; | ||||
| 	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) | ||||
| 				       == CANON_ELEMENT (haystack[i + j]))) | ||||
| 		--i; | ||||
| 	      if (i == SIZE_MAX) | ||||
| 		return (RETURN_TYPE) (haystack + j); | ||||
| 	      j += period; | ||||
| 	    } | ||||
| 	  else | ||||
| 	    j += i - suffix + 1; | ||||
| 	} | ||||
|     } | ||||
|   return NULL; | ||||
| } | ||||
|  | ||||
| /* Return the first location of non-empty NEEDLE within HAYSTACK, or | ||||
|    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This | ||||
|    method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN. | ||||
|    Performance is guaranteed to be linear, with an initialization cost | ||||
|    of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations. | ||||
|  | ||||
|    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at | ||||
|    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, | ||||
|    and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. | ||||
|    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * | ||||
|    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and | ||||
|    sublinear performance is not possible.  */ | ||||
| static RETURN_TYPE | ||||
| two_way_long_needle (const unsigned char *haystack, size_t haystack_len, | ||||
| 		     const unsigned char *needle, size_t needle_len) | ||||
| { | ||||
|   size_t i; /* Index into current byte of NEEDLE.  */ | ||||
|   size_t j; /* Index into current window of HAYSTACK.  */ | ||||
|   size_t period; /* The period of the right half of needle.  */ | ||||
|   size_t suffix; /* The index of the right half of needle.  */ | ||||
|   size_t shift_table[1U << CHAR_BIT]; /* See below.  */ | ||||
|  | ||||
|   /* Factor the needle into two halves, such that the left half is | ||||
|      smaller than the global period, and the right half is | ||||
|      periodic (with a period as large as NEEDLE_LEN - suffix).  */ | ||||
|   suffix = critical_factorization (needle, needle_len, &period); | ||||
|  | ||||
|   /* Populate shift_table.  For each possible byte value c, | ||||
|      shift_table[c] is the distance from the last occurrence of c to | ||||
|      the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE. | ||||
|      shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */ | ||||
|   for (i = 0; i < 1U << CHAR_BIT; i++) | ||||
|     shift_table[i] = needle_len; | ||||
|   for (i = 0; i < needle_len; i++) | ||||
|     shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; | ||||
|  | ||||
|   /* Perform the search.  Each iteration compares the right half | ||||
|      first.  The CMP_FUNC test selects between the periodic and the | ||||
|      non-periodic search loop.  */ | ||||
|   if (CMP_FUNC (needle, needle + period, suffix) == 0) | ||||
|     { | ||||
|       /* Entire needle is periodic; a mismatch can only advance by the | ||||
| 	 period, so use memory to avoid rescanning known occurrences | ||||
| 	 of the period.  */ | ||||
|       size_t memory = 0; | ||||
|       size_t shift; | ||||
|       j = 0; | ||||
|       while (AVAILABLE (haystack, haystack_len, j, needle_len)) | ||||
| 	{ | ||||
| 	  /* Check the last byte first; if it does not match, then | ||||
| 	     shift to the next possible match location.  */ | ||||
| 	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; | ||||
| 	  if (0 < shift) | ||||
| 	    { | ||||
| 	      if (memory && shift < period) | ||||
| 		{ | ||||
| 		  /* Since needle is periodic, but the last period has | ||||
| 		     a byte out of place, there can be no match until | ||||
| 		     after the mismatch.  */ | ||||
| 		  shift = needle_len - period; | ||||
| 		} | ||||
| 	      /* MEMORY describes a prefix known to match at the old | ||||
| 		 window position only.  Every skip driven by the shift | ||||
| 		 table moves the window by an amount unrelated to that | ||||
| 		 knowledge, so it must always be discarded here, not | ||||
| 		 just when SHIFT < PERIOD; otherwise the next window | ||||
| 		 would skip bytes that were never compared.  */ | ||||
| 	      memory = 0; | ||||
| 	      j += shift; | ||||
| 	      continue; | ||||
| 	    } | ||||
| 	  /* Scan for matches in right half.  The last byte has | ||||
| 	     already been matched, by virtue of the shift table.  */ | ||||
| 	  i = MAX (suffix, memory); | ||||
| 	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) | ||||
| 					== CANON_ELEMENT (haystack[i + j]))) | ||||
| 	    ++i; | ||||
| 	  if (needle_len - 1 <= i) | ||||
| 	    { | ||||
| 	      /* Scan for matches in left half.  I is unsigned and may | ||||
| 		 wrap to SIZE_MAX, hence the tests against I + 1.  */ | ||||
| 	      i = suffix - 1; | ||||
| 	      while (memory < i + 1 && (CANON_ELEMENT (needle[i]) | ||||
| 					== CANON_ELEMENT (haystack[i + j]))) | ||||
| 		--i; | ||||
| 	      if (i + 1 < memory + 1) | ||||
| 		return (RETURN_TYPE) (haystack + j); | ||||
| 	      /* No match, so remember how many repetitions of period | ||||
| 		 on the right half were scanned.  */ | ||||
| 	      j += period; | ||||
| 	      memory = needle_len - period; | ||||
| 	    } | ||||
| 	  else | ||||
| 	    { | ||||
| 	      j += i - suffix + 1; | ||||
| 	      memory = 0; | ||||
| 	    } | ||||
| 	} | ||||
|     } | ||||
|   else | ||||
|     { | ||||
|       /* The two halves of needle are distinct; no extra memory is | ||||
| 	 required, and any mismatch results in a maximal shift.  */ | ||||
|       size_t shift; | ||||
|       period = MAX (suffix, needle_len - suffix) + 1; | ||||
|       j = 0; | ||||
|       while (AVAILABLE (haystack, haystack_len, j, needle_len)) | ||||
| 	{ | ||||
| 	  /* Check the last byte first; if it does not match, then | ||||
| 	     shift to the next possible match location.  */ | ||||
| 	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; | ||||
| 	  if (0 < shift) | ||||
| 	    { | ||||
| 	      j += shift; | ||||
| 	      continue; | ||||
| 	    } | ||||
| 	  /* Scan for matches in right half.  The last byte has | ||||
| 	     already been matched, by virtue of the shift table.  */ | ||||
| 	  i = suffix; | ||||
| 	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) | ||||
| 					== CANON_ELEMENT (haystack[i + j]))) | ||||
| 	    ++i; | ||||
| 	  if (needle_len - 1 <= i) | ||||
| 	    { | ||||
| 	      /* Scan for matches in left half.  I wraps to SIZE_MAX | ||||
| 		 once the whole left half has been compared equal.  */ | ||||
| 	      i = suffix - 1; | ||||
| 	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) | ||||
| 				       == CANON_ELEMENT (haystack[i + j]))) | ||||
| 		--i; | ||||
| 	      if (i == SIZE_MAX) | ||||
| 		return (RETURN_TYPE) (haystack + j); | ||||
| 	      j += period; | ||||
| 	    } | ||||
| 	  else | ||||
| 	    j += i - suffix + 1; | ||||
| 	} | ||||
|     } | ||||
|   return NULL; | ||||
| } | ||||
|  | ||||
| #undef AVAILABLE | ||||
| #undef CANON_ELEMENT | ||||
| #undef CMP_FUNC | ||||
| #undef MAX | ||||
| #undef RETURN_TYPE | ||||
| @@ -40,7 +40,7 @@ QUICKREF | ||||
|  * Copyright (c) 1990, 1993 | ||||
|  *	The Regents of the University of California.  All rights reserved. | ||||
|  * | ||||
|  * This code is derived from software contributed to Berkeley by | ||||
|  * The quadratic code is derived from software contributed to Berkeley by | ||||
|  * Chris Torek. | ||||
|  * | ||||
|  * Redistribution and use in source and binary forms, with or without | ||||
| @@ -67,12 +67,26 @@ QUICKREF | ||||
|  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||||
|  * SUCH DAMAGE. | ||||
|  */ | ||||
| /* Linear algorithm Copyright (C) 2008 Eric Blake | ||||
|  * Permission to use, copy, modify, and distribute the linear portion of | ||||
|  * software is freely granted, provided that this notice is preserved. | ||||
|  */ | ||||
|  | ||||
| #include <sys/cdefs.h> | ||||
|  | ||||
| #include <ctype.h> | ||||
| #include <string.h> | ||||
|  | ||||
| #if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__) | ||||
| # define RETURN_TYPE char * | ||||
| # define AVAILABLE(h, h_l, j, n_l)			\ | ||||
|   (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\ | ||||
|    && ((h_l) = (j) + (n_l))) | ||||
| # define CANON_ELEMENT(c) tolower (c) | ||||
| # define CMP_FUNC strncasecmp | ||||
| # include "str-two-way.h" | ||||
| #endif | ||||
|  | ||||
| /* | ||||
|  * Find the first occurrence of find in s, ignore case. | ||||
|  */ | ||||
| @@ -80,6 +94,9 @@ char * | ||||
| strcasestr(s, find) | ||||
| 	const char *s, *find; | ||||
| { | ||||
| #if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) | ||||
|  | ||||
|   /* Less code size, but quadratic performance in the worst case.  */ | ||||
| 	char c, sc; | ||||
| 	size_t len; | ||||
|  | ||||
| @@ -95,4 +112,36 @@ strcasestr(s, find) | ||||
| 		s--; | ||||
| 	} | ||||
| 	return ((char *)s); | ||||
|  | ||||
| #else /* compilation for speed */ | ||||
|  | ||||
|   /* Larger code size, but guaranteed linear performance.  */ | ||||
|   const char *haystack = s; | ||||
|   const char *needle = find; | ||||
|   size_t needle_len; /* Length of NEEDLE.  */ | ||||
|   size_t haystack_len; /* Known minimum length of HAYSTACK.  */ | ||||
|   int ok = 1; /* True if NEEDLE is prefix of HAYSTACK.  */ | ||||
|  | ||||
|   /* Determine length of NEEDLE, and in the process, make sure | ||||
|      HAYSTACK is at least as long (no point processing all of a long | ||||
|      NEEDLE if HAYSTACK is too short).  */ | ||||
|   while (*haystack && *needle) | ||||
|     ok &= (tolower ((unsigned char) *haystack++) | ||||
| 	   == tolower ((unsigned char) *needle++)); | ||||
|   if (*needle) | ||||
|     return NULL; | ||||
|   if (ok) | ||||
|     return (char *) s; | ||||
|   needle_len = needle - find; | ||||
|   haystack = s + 1; | ||||
|   haystack_len = needle_len - 1; | ||||
|  | ||||
|   /* Perform the search.  */ | ||||
|   if (needle_len < LONG_NEEDLE_THRESHOLD) | ||||
|     return two_way_short_needle ((const unsigned char *) haystack, | ||||
| 				 haystack_len, | ||||
| 				 (const unsigned char *) find, needle_len); | ||||
|   return two_way_long_needle ((const unsigned char *) haystack, haystack_len, | ||||
| 			      (const unsigned char *) find, needle_len); | ||||
| #endif /* compilation for speed */ | ||||
| } | ||||
|   | ||||
| @@ -14,6 +14,7 @@ managing areas of memory.  The corresponding declarations are in | ||||
| * memchr::      Find character in memory | ||||
| * memcmp::      Compare two memory areas | ||||
| * memcpy::      Copy memory regions | ||||
| * memmem::      Find memory segment | ||||
| * memmove::     Move possibly overlapping memory | ||||
| * mempcpy::	Copy memory regions and locate end | ||||
| * memset::      Set an area of memory | ||||
| @@ -71,6 +72,9 @@ managing areas of memory.  The corresponding declarations are in | ||||
| @page | ||||
| @include string/memcpy.def | ||||
|  | ||||
| @page | ||||
| @include string/memmem.def | ||||
|  | ||||
| @page | ||||
| @include string/memmove.def | ||||
|  | ||||
|   | ||||
| @@ -16,14 +16,14 @@ TRAD_SYNOPSIS | ||||
| 	char *<[s2]>; | ||||
|  | ||||
| DESCRIPTION | ||||
| 	Locates the first occurence in the string pointed to by <[s1]> of | ||||
| 	Locates the first occurrence in the string pointed to by <[s1]> of | ||||
| 	the sequence of characters in the string pointed to by <[s2]> | ||||
| 	(excluding the terminating null character). | ||||
|  | ||||
| RETURNS | ||||
| 	Returns a pointer to the located string segment, or a null | ||||
| 	pointer if the string <[s2]> is not found. If <[s2]> points to | ||||
| 	a string with zero length, the <[s1]> is returned. | ||||
| 	a string with zero length, <[s1]> is returned. | ||||
|  | ||||
| PORTABILITY | ||||
| <<strstr>> is ANSI C. | ||||
| @@ -36,11 +36,22 @@ QUICKREF | ||||
|  | ||||
| #include <string.h> | ||||
|  | ||||
| #if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__) | ||||
| # define RETURN_TYPE char * | ||||
| # define AVAILABLE(h, h_l, j, n_l)			\ | ||||
|   (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\ | ||||
|    && ((h_l) = (j) + (n_l))) | ||||
| # include "str-two-way.h" | ||||
| #endif | ||||
|  | ||||
| char * | ||||
| _DEFUN (strstr, (searchee, lookfor), | ||||
| 	_CONST char *searchee _AND | ||||
| 	_CONST char *lookfor) | ||||
| { | ||||
| #if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) | ||||
|  | ||||
|   /* Less code size, but quadratic performance in the worst case.  */ | ||||
|   if (*searchee == 0) | ||||
|     { | ||||
|       if (*lookfor) | ||||
| @@ -70,4 +81,41 @@ _DEFUN (strstr, (searchee, lookfor), | ||||
|     } | ||||
|  | ||||
|   return (char *) NULL; | ||||
|  | ||||
| #else /* compilation for speed */ | ||||
|  | ||||
|   /* Larger code size, but guaranteed linear performance.  */ | ||||
|   const char *haystack = searchee; | ||||
|   const char *needle = lookfor; | ||||
|   size_t needle_len; /* Length of NEEDLE.  */ | ||||
|   size_t haystack_len; /* Known minimum length of HAYSTACK.  */ | ||||
|   int ok = 1; /* True if NEEDLE is prefix of HAYSTACK.  */ | ||||
|  | ||||
|   /* Determine length of NEEDLE, and in the process, make sure | ||||
|      HAYSTACK is at least as long (no point processing all of a long | ||||
|      NEEDLE if HAYSTACK is too short).  */ | ||||
|   while (*haystack && *needle) | ||||
|     ok &= *haystack++ == *needle++; | ||||
|   if (*needle) | ||||
|     return NULL; | ||||
|   if (ok) | ||||
|     return (char *) searchee; | ||||
|  | ||||
|   /* Reduce the size of haystack using strchr, since it has a smaller | ||||
|      linear coefficient than the Two-Way algorithm.  */ | ||||
|   needle_len = needle - lookfor; | ||||
|   haystack = strchr (searchee + 1, *lookfor); | ||||
|   if (!haystack || needle_len == 1) | ||||
|     return (char *) haystack; | ||||
|   haystack_len = (haystack > searchee + needle_len ? 1 | ||||
| 		  : needle_len + searchee - haystack); | ||||
|  | ||||
|   /* Perform the search.  */ | ||||
|   if (needle_len < LONG_NEEDLE_THRESHOLD) | ||||
|     return two_way_short_needle ((const unsigned char *) haystack, | ||||
| 				 haystack_len, | ||||
| 				 (const unsigned char *) lookfor, needle_len); | ||||
|   return two_way_long_needle ((const unsigned char *) haystack, haystack_len, | ||||
| 			      (const unsigned char *) lookfor, needle_len); | ||||
| #endif /* compilation for speed */ | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user