This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH] Optimize strstr, strcasestr and memmem
- From: Maxim Kuvyrkov <maxim at codesourcery dot com>
- To: <libc-alpha at sourceware dot org>
- Date: Fri, 18 May 2012 09:22:38 +1200
- Subject: [PATCH] Optimize strstr, strcasestr and memmem
This patch optimizes strstr, strcasestr and memmem functions. This patch speeds up strstr, strcase and memmem functions with short needle inputs by more than 2 times on i686, x86_64, MIPS and other architectures.
GLIBC 2.9 introduced new, algorithmically-superior implementation of strstr, strcasestr and memmem functions. Unfortunately, this new implementation uses memchr to detect end-of-string condition which comes at significant overhead compared to piggy-backing matching procedure that GLIBC 2.8 and earlier versions used. The new implementation heavily regressed the case for short needles, making strstr more than 2 times slower. This patch cures the regression and even makes the GLIBC 2.9+ implementation faster than original GLIBC 2.8- version.
This patch adds several optimizations:
- Piggy-back the matching procedure to detect end-of-string for strstr and strcasestr when needle is short. This removes the need to use memchr in two_way_short_needle. [The long-needle case hops through parts of the string, and the same optimization cannot be used there as not every haystack character is read in by the matching loop.]
- Optimize search for the first character in strstr, strcasestr and memmem functions. This is the hot-spot of the functions.
- Use pointer references instead of array[index] references. This optimization makes it easier for compiler to generate good code, especially on architectures that don't have base+index addressing.
- Use straight tolower() for strcasestr (complexity of isupper() is about the same as tolower()).
Tested on i686-linux-gnu and x86_64-linux-gnu with no regressions.
OK to apply?
--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics
2012-05-18 Maxim Kuvyrkov <maxim@codesourcery.com>
* string/str-two-way.h (AVAILABLE1, AVAILABLE2, RET0_IF_0,)
(AVAILABLE_USES_J): New macros, define their defaults.
(two_way_short_needle): Use pointers for traversing arrays, detect
end-of-string on-the-fly. Use optimized first-character-loop.
(two_way_long_needle): Use pointers for traversing arrays.
* string/strcasestr.c, string/strstr.c (AVAILABLE): Update.
(AVAILABLE1, AVAILABLE2, RET0_IF_0, AVAILABLE_USES_J): Define.
* string/strcasestr.c (TOLOWER): Make auto-increment safe.
* string/bug-strcasestr1.c: New test.
* string/Makefile: Run it.
---
string/Makefile | 3 +-
string/bug-strcasestr1.c | 21 ++++++++
string/str-two-way.h | 129 ++++++++++++++++++++++++++++++++++++++--------
string/strcasestr.c | 8 ++-
string/strstr.c | 6 ++-
5 files changed, 142 insertions(+), 25 deletions(-)
create mode 100644 string/bug-strcasestr1.c
diff --git a/string/Makefile b/string/Makefile
index 80923a2..fb97ce4 100644
--- a/string/Makefile
+++ b/string/Makefile
@@ -56,7 +56,7 @@ tests := tester inl-tester noinl-tester testcopy test-ffs \
tst-strtok tst-strxfrm bug-strcoll1 tst-strfry \
bug-strtok1 $(addprefix test-,$(strop-tests)) \
bug-envz1 tst-strxfrm2 tst-endian tst-svc2 \
- bug-strstr1 bug-strchr1
+ bug-strstr1 bug-strcasestr1 bug-strchr1
include ../Rules
@@ -74,6 +74,7 @@ CFLAGS-stratcliff.c = -fno-builtin
CFLAGS-test-ffs.c = -fno-builtin
CFLAGS-tst-inlcall.c = -fno-builtin
CFLAGS-bug-strstr1.c = -fno-builtin
+CFLAGS-bug-strcasestr1.c = -fno-builtin
ifeq ($(cross-compiling),no)
tests: $(objpfx)tst-svc.out
diff --git a/string/bug-strcasestr1.c b/string/bug-strcasestr1.c
new file mode 100644
index 0000000..16c77c3
--- /dev/null
+++ b/string/bug-strcasestr1.c
@@ -0,0 +1,21 @@
+#include <stdio.h>
+#include <string.h>
+
+#define TEST_FUNCTION do_test ()
+static int
+do_test (void)
+{
+ const char haystack[] = "AOKB";
+ const char needle[] = "OK";
+ const char *sub = strcasestr (haystack, needle);
+
+ if (sub == NULL)
+ {
+ fprintf (stderr, "BUG: didn't find \"%s\" in \"%s\"\n", needle, haystack);
+ return 1;
+ }
+
+ return 0;
+}
+
+#include "../test-skeleton.c"
diff --git a/string/str-two-way.h b/string/str-two-way.h
index 1b2a8bd..a826741 100644
--- a/string/str-two-way.h
+++ b/string/str-two-way.h
@@ -1,5 +1,5 @@
/* Byte-wise substring search, using the Two-Way algorithm.
- Copyright (C) 2008, 2010 Free Software Foundation, Inc.
+ Copyright (C) 2008-2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Written by Eric Blake <ebb9@byu.net>, 2008.
@@ -78,6 +78,19 @@
# define CMP_FUNC memcmp
#endif
+#ifndef AVAILABLE1
+# define AVAILABLE1(h, h_l, j, n_l) AVAILABLE (h, h_l, j, n_l)
+#endif
+#ifndef AVAILABLE2
+# define AVAILABLE2(h, h_l, j, n_l) (1)
+#endif
+#ifndef RET0_IF_0
+# define RET0_IF_0(a) /* nothing */
+#endif
+#ifndef AVAILABLE_USES_J
+# define AVAILABLE_USES_J (1)
+#endif
+
/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
Return the index of the first byte in the right half, and set
*PERIOD to the global period of the right half.
@@ -233,17 +246,24 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
j = 0;
while (AVAILABLE (haystack, haystack_len, j, needle_len))
{
+ const unsigned char *pneedle;
+ const unsigned char *phaystack;
+
/* Scan for matches in right half. */
i = MAX (suffix, memory);
- while (i < needle_len && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
+ pneedle = &needle[i];
+ phaystack = &haystack[i + j];
+ while (i < needle_len && (CANON_ELEMENT (*pneedle++)
+ == CANON_ELEMENT (*phaystack++)))
++i;
if (needle_len <= i)
{
/* Scan for matches in left half. */
i = suffix - 1;
- while (memory < i + 1 && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
+ pneedle = &needle[i];
+ phaystack = &haystack[i + j];
+ while (memory < i + 1 && (CANON_ELEMENT (*pneedle--)
+ == CANON_ELEMENT (*phaystack--)))
--i;
if (i + 1 < memory + 1)
return (RETURN_TYPE) (haystack + j);
@@ -261,32 +281,81 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
}
else
{
+ const unsigned char *phaystack = &haystack[suffix];
+ /* The comparison always starts from needle[suffix], so cache it
+ and use an optimized first-character loop. */
+ unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]);
+
/* The two halves of needle are distinct; no extra memory is
required, and any mismatch results in a maximal shift. */
period = MAX (suffix, needle_len - suffix) + 1;
j = 0;
- while (AVAILABLE (haystack, haystack_len, j, needle_len))
+ while (AVAILABLE1 (haystack, haystack_len, j, needle_len))
{
+ const unsigned char *pneedle;
+ unsigned char a;
+
+ /* TODO: The first-character loop can be sped up by adapting
+ longword-at-a-time implementation of memchr/strchr. */
+ if (needle_suffix
+ != (a = CANON_ELEMENT (*phaystack++)))
+ {
+#if AVAILABLE_USES_J
+ ++j;
+#endif
+ RET0_IF_0 (a);
+ continue;
+ }
+
+#if !AVAILABLE_USES_J
+ /* Calculate J if it wasn't kept up-to-date in the first-character
+ loop. */
+ j = phaystack - &haystack[suffix] - 1;
+#endif
+
/* Scan for matches in right half. */
- i = suffix;
- while (i < needle_len && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
- ++i;
+ i = suffix + 1;
+ pneedle = &needle[i];
+ while (i < needle_len)
+ {
+ if (CANON_ELEMENT (*pneedle++)
+ != (a = CANON_ELEMENT (*phaystack++)))
+ {
+ RET0_IF_0 (a);
+ break;
+ }
+ ++i;
+ }
if (needle_len <= i)
{
/* Scan for matches in left half. */
i = suffix - 1;
- while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
- --i;
+ pneedle = &needle[i];
+ phaystack = &haystack[i + j];
+ while (i != SIZE_MAX)
+ {
+ if (CANON_ELEMENT (*pneedle--)
+ != (a = CANON_ELEMENT (*phaystack--)))
+ {
+ RET0_IF_0 (a);
+ break;
+ }
+ --i;
+ }
if (i == SIZE_MAX)
return (RETURN_TYPE) (haystack + j);
j += period;
}
else
j += i - suffix + 1;
+
+ if (!AVAILABLE2 (haystack, haystack_len, j, needle_len))
+ break;
+
+ phaystack = &haystack[suffix + j];
}
}
+ ret0: __attribute__ ((unused))
return NULL;
}
@@ -338,6 +407,9 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
j = 0;
while (AVAILABLE (haystack, haystack_len, j, needle_len))
{
+ const unsigned char *pneedle;
+ const unsigned char *phaystack;
+
/* Check the last byte first; if it does not match, then
shift to the next possible match location. */
shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
@@ -357,15 +429,19 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
/* Scan for matches in right half. The last byte has
already been matched, by virtue of the shift table. */
i = MAX (suffix, memory);
- while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
+ pneedle = &needle[i];
+ phaystack = &haystack[i + j];
+ while (i < needle_len - 1 && (CANON_ELEMENT (*pneedle++)
+ == CANON_ELEMENT (*phaystack++)))
++i;
if (needle_len - 1 <= i)
{
/* Scan for matches in left half. */
i = suffix - 1;
- while (memory < i + 1 && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
+ pneedle = &needle[i];
+ phaystack = &haystack[i + j];
+ while (memory < i + 1 && (CANON_ELEMENT (*pneedle--)
+ == CANON_ELEMENT (*phaystack--)))
--i;
if (i + 1 < memory + 1)
return (RETURN_TYPE) (haystack + j);
@@ -390,6 +466,9 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
j = 0;
while (AVAILABLE (haystack, haystack_len, j, needle_len))
{
+ const unsigned char *pneedle;
+ const unsigned char *phaystack;
+
/* Check the last byte first; if it does not match, then
shift to the next possible match location. */
shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
@@ -401,15 +480,19 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
/* Scan for matches in right half. The last byte has
already been matched, by virtue of the shift table. */
i = suffix;
- while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
+ pneedle = &needle[i];
+ phaystack = &haystack[i + j];
+ while (i < needle_len - 1 && (CANON_ELEMENT (*pneedle++)
+ == CANON_ELEMENT (*phaystack++)))
++i;
if (needle_len - 1 <= i)
{
/* Scan for matches in left half. */
i = suffix - 1;
- while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
- == CANON_ELEMENT (haystack[i + j])))
+ pneedle = &needle[i];
+ phaystack = &haystack[i + j];
+ while (i != SIZE_MAX && (CANON_ELEMENT (*pneedle--)
+ == CANON_ELEMENT (*phaystack--)))
--i;
if (i == SIZE_MAX)
return (RETURN_TYPE) (haystack + j);
@@ -423,6 +506,10 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
}
#undef AVAILABLE
+#undef AVAILABLE1
+#undef AVAILABLE2
+#undef AVAILABLE_USES_J
#undef CANON_ELEMENT
#undef CMP_FUNC
+#undef RET0_IF_0
#undef RETURN_TYPE
diff --git a/string/strcasestr.c b/string/strcasestr.c
index 9e1bde9..9848d10 100644
--- a/string/strcasestr.c
+++ b/string/strcasestr.c
@@ -1,5 +1,5 @@
/* Return the offset of one string within another.
- Copyright (C) 1994, 1996-2000, 2004, 2008, 2009, 2010
+ Copyright (C) 1994-2012
Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -37,13 +37,17 @@
#include <stdbool.h>
#include <strings.h>
-#define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch))
+#define TOLOWER(Ch) tolower (Ch)
/* Two-Way algorithm. */
#define RETURN_TYPE char *
#define AVAILABLE(h, h_l, j, n_l) \
(!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \
&& ((h_l) = (j) + (n_l)))
+#define AVAILABLE1(h, h_l, j, n_l) (true)
+#define AVAILABLE2(h, h_l, j, n_l) AVAILABLE (h, h_l, j, n_l)
+#define RET0_IF_0(a) if (!a) goto ret0
+#define AVAILABLE_USES_J (0)
#define CANON_ELEMENT(c) TOLOWER (c)
#define CMP_FUNC(p1, p2, l) \
__strncasecmp ((const char *) (p1), (const char *) (p2), l)
diff --git a/string/strstr.c b/string/strstr.c
index 10e6fdc..94d071d 100644
--- a/string/strstr.c
+++ b/string/strstr.c
@@ -1,5 +1,5 @@
/* Return the offset of one string within another.
- Copyright (C) 1994,1996,1997,2000,2001,2003,2008,2009
+ Copyright (C) 1994-2012
Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -36,6 +36,10 @@
#define AVAILABLE(h, h_l, j, n_l) \
(!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \
&& ((h_l) = (j) + (n_l)))
+#define AVAILABLE1(h, h_l, j, n_l) (true)
+#define AVAILABLE2(h, h_l, j, n_l) AVAILABLE (h, h_l, j, n_l)
+#define RET0_IF_0(a) if (!a) goto ret0
+#define AVAILABLE_USES_J (0)
#include "str-two-way.h"
#undef strstr
--
1.7.4.1