This is the mail archive of the
libc-alpha@sources.redhat.com
mailing list for the glibc project.
[PATCH] Speed-up character range regexes by up to 2x
- From: Paolo Bonzini <paolo dot bonzini at polimi dot it>
- To: Aharon Robbins <arnold at skeeve dot com>
- Cc: bonzini at gnu dot org, libc-alpha at sources dot redhat dot com
- Date: Mon, 12 Jan 2004 10:50:01 +0100
- Subject: [PATCH] Speed-up character range regexes by up to 2x
- References: <200401120811.i0C8BNOP010105@skeeve.com>
- Reply-to: bonzini at gnu dot org
On single-byte character sets that have no collation elements (or on all
SBCS outside libc), it is useless to create the COMPLEX_BRACKET node
that tries to accept multi-byte chars for a character range. This way
in those locales (including the C locale) transit_state_mb is never
called and some more book-keeping disappears: I got a speed improvement
over 40% matching [A-Z][0-9] against ABCDEFGHIJKLMNOPQRSTUVWXYZ.
This patch does this together with a few other cleanups that I found
while reading the code.
Please apply also the patch
http://sources.redhat.com/ml/libc-alpha/2004-01/msg00099.html which is
needed to compile regex on many non-gcc hosts.
What follows the review of the "gawk guy"'s regex patch:
> +#ifdef RE_ENABLE_I18N
> int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
> +#else
> + int icase = (bufp->syntax & RE_ICASE);
> +#endif
This is unneeded.
> @@ -2558,8 +2564,8 @@
> ? __btowc (start_ch) : start_elem->opr.wch);
> end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
> ? __btowc (end_ch) : end_elem->opr.wch);
> - cmp_buf[0] = start_wc;
> - cmp_buf[4] = end_wc;
> + cmp_buf[0] = start_wc != WEOF ? start_wc : start_ch;
> + cmp_buf[4] = end_wc != WEOF ? end_wc : end_ch;
> if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
> return REG_ERANGE;
I am not sure this is the fix; maybe it is better not to include the
character set if start_wc == WEOF || end_wc == WEOF, or to return
REG_ERANGE?
> +#ifdef HAVE_CONFIG_H
> +#include "config.h"
> +#endif
The alloca patch does this at the very beginning of the file.
> +
> +#if defined (_MSC_VER)
> +#include <stdio.h> /* for size_t */
> +#endif
> +
> +#include <limits.h>
This is needed.
> +# elif defined __APPLE_CC__
> +# define __restrict
This too.
> +#if 0
> +/* Don't include this here. On some systems it sets RE_DUP_MAX to a
> + * lower value than GNU regex allows. Instead, include it in
> + * regex.c, before include of <regex.h>, which correctly
> + * #undefs RE_DUP_MAX and sets it to the right value.
> + */
> #include <limits.h>
> +#endif
Can be completely removed?
> /* This is for other GNU distributions with internationalized
messages. */
> -#if HAVE_LIBINTL_H || defined _LIBC
> +#if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
Also needed.
> +#if _LIBC || __GNUC__ >= 3
> +# define BE(expr, val) __builtin_expect (expr, val)
> +#else
> +# define BE(expr, val) (expr)
> +# define inline
> +#endif
> +
Isn't this already there? Also, shouldn't "#define inline" be taken
care of in the configure script?
Paolo
2004-01-12 Paolo Bonzini <bonzini@gnu.org>
* posix/regcomp.c [_LIBC && !RE_ENABLE_I18N]:
Drop code to support this, it is never true.
(build_range_exp) [!_LIBC]: Do not create a range
in MBCSET for a single-byte character set.
(build_range_exp) [_LIBC]: Do not create a range
in MBCSET for a single-byte character set without
collation elements.
(init_dfa): Do not conditionalize on _LIBC, it
just makes the code less clear.
(parse_bracket_exp): Use NON_MATCH variable in
addition to "mbcset->non_match", not as an
alternative.
(build_charclass_op): rename NOT parameter to
NON_MATCH, use it instead of declaring a variable.
(parse_bracket_exp) [!_LIBC]: Pass NULL for MBCSET
if the character set is single-byte.
Index: regcomp.c
===================================================================
RCS file: /cvs/glibc/libc/posix/regcomp.c,v
retrieving revision 1.74
diff -u -p -r1.74 regcomp.c
--- regcomp.c 6 Jan 2004 21:59:24 -0000 1.74
+++ regcomp.c 12 Jan 2004 08:53:52 -0000
@@ -126,8 +118,8 @@ static reg_errcode_t build_charclass (un
static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
unsigned RE_TRANSLATE_TYPE trans,
const unsigned char *class_name,
- const unsigned char *extra, int not,
- reg_errcode_t *err);
+ const unsigned char *extra,
+ int non_match, reg_errcode_t *err);
static bin_tree_t *create_tree (re_dfa_t *dfa,
bin_tree_t *left, bin_tree_t *right,
re_token_type_t type, int index);
@@ -862,11 +854,9 @@ init_dfa (dfa, pat_len)
dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
if (BE (dfa->sb_char == NULL, 0))
return REG_ESPACE;
-#ifdef _LIBC
if (dfa->is_utf8)
memset (dfa->sb_char, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
else
-#endif
for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
for (j = 0; j < UINT_BITS; ++j, ++ch)
if (btowc (ch) != WEOF)
@@ -2563,32 +2553,40 @@ build_range_exp (sbcset, start_elem, end
if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
return REG_ERANGE;
- /* Check the space of the arrays. */
- if (BE (*range_alloc == mbcset->nranges, 0))
+ /* Got valid collation sequence values, add them as a new entry.
+ However, for !_LIBC we have no collation elements: if the
+ character set is single byte, the single byte character set
+ that we build below suffices. parse_bracket_exp passes
+ no MBCSET if dfa->mb_cur_max == 1. */
+ if (mbcset)
{
- /* There are not enough space, need realloc. */
- wchar_t *new_array_start, *new_array_end;
- int new_nranges;
-
- /* +1 in case of mbcset->nranges is 0. */
- new_nranges = 2 * mbcset->nranges + 1;
- /* Use realloc since mbcset->range_starts and mbcset->range_ends
- are NULL if *range_alloc == 0. */
- new_array_start = re_realloc (mbcset->range_starts, wchar_t,
- new_nranges);
- new_array_end = re_realloc (mbcset->range_ends, wchar_t,
- new_nranges);
-
- if (BE (new_array_start == NULL || new_array_end == NULL, 0))
- return REG_ESPACE;
-
- mbcset->range_starts = new_array_start;
- mbcset->range_ends = new_array_end;
- *range_alloc = new_nranges;
- }
+ /* Check the space of the arrays. */
+ if (BE (*range_alloc == mbcset->nranges, 0))
+ {
+ /* There is not enough space, need realloc. */
+ wchar_t *new_array_start, *new_array_end;
+ int new_nranges;
+
+ /* +1 in case of mbcset->nranges is 0. */
+ new_nranges = 2 * mbcset->nranges + 1;
+ /* Use realloc since mbcset->range_starts and mbcset->range_ends
+ are NULL if *range_alloc == 0. */
+ new_array_start = re_realloc (mbcset->range_starts, wchar_t,
+ new_nranges);
+ new_array_end = re_realloc (mbcset->range_ends, wchar_t,
+ new_nranges);
+
+ if (BE (new_array_start == NULL || new_array_end == NULL, 0))
+ return REG_ESPACE;
+
+ mbcset->range_starts = new_array_start;
+ mbcset->range_ends = new_array_end;
+ *range_alloc = new_nranges;
+ }
- mbcset->range_starts[mbcset->nranges] = start_wc;
- mbcset->range_ends[mbcset->nranges++] = end_wc;
+ mbcset->range_starts[mbcset->nranges] = start_wc;
+ mbcset->range_ends[mbcset->nranges++] = end_wc;
+ }
/* Build the table for single byte characters. */
for (wc = 0; wc <= SBC_MAX; ++wc)
@@ -2775,13 +2773,9 @@ parse_bracket_exp (regexp, dfa, token, s
static inline reg_errcode_t
__attribute ((always_inline))
-# ifdef RE_ENABLE_I18N
build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
re_charset_t *mbcset;
int *range_alloc;
-# else /* not RE_ENABLE_I18N */
- build_range_exp (sbcset, start_elem, end_elem)
-# endif /* not RE_ENABLE_I18N */
re_bitset_ptr_t sbcset;
bracket_elem_t *start_elem, *end_elem;
{
@@ -2789,33 +2783,6 @@ parse_bracket_exp (regexp, dfa, token, s
uint32_t start_collseq;
uint32_t end_collseq;
-# ifdef RE_ENABLE_I18N
- /* Check the space of the arrays. */
- if (BE (*range_alloc == mbcset->nranges, 0))
- {
- /* There are not enough space, need realloc. */
- uint32_t *new_array_start;
- uint32_t *new_array_end;
- int new_nranges;
-
- /* +1 in case of mbcset->nranges is 0. */
- new_nranges = 2 * mbcset->nranges + 1;
- /* Use realloc since mbcset->range_starts and mbcset->range_ends
- are NULL if *range_alloc == 0. */
- new_array_start = re_realloc (mbcset->range_starts, uint32_t,
- new_nranges);
- new_array_end = re_realloc (mbcset->range_ends, uint32_t,
- new_nranges);
-
- if (BE (new_array_start == NULL || new_array_end == NULL, 0))
- return REG_ESPACE;
-
- mbcset->range_starts = new_array_start;
- mbcset->range_ends = new_array_end;
- *range_alloc = new_nranges;
- }
-# endif /* RE_ENABLE_I18N */
-
/* Equivalence Classes and Character Classes can't be a range
start/end. */
if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
@@ -2831,11 +2798,38 @@ parse_bracket_exp (regexp, dfa, token, s
if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
return REG_ERANGE;
-# ifdef RE_ENABLE_I18N
- /* Got valid collation sequence values, add them as a new entry. */
- mbcset->range_starts[mbcset->nranges] = start_collseq;
- mbcset->range_ends[mbcset->nranges++] = end_collseq;
-# endif /* RE_ENABLE_I18N */
+ /* Got valid collation sequence values, add them as a new entry.
+ However, if we have no collation elements, and the character set
+ is single byte, the single byte character set that we
+ build below suffices. */
+ if (nrules > 0 || dfa->mb_cur_max > 1)
+ {
+ /* Check the space of the arrays. */
+ if (BE (*range_alloc == mbcset->nranges, 0))
+ {
+ /* There is not enough space, need realloc. */
+ uint32_t *new_array_start;
+ uint32_t *new_array_end;
+ int new_nranges;
+
+ /* +1 in case of mbcset->nranges is 0. */
+ new_nranges = 2 * mbcset->nranges + 1;
+ new_array_start = re_realloc (mbcset->range_starts, uint32_t,
+ new_nranges);
+ new_array_end = re_realloc (mbcset->range_ends, uint32_t,
+ new_nranges);
+
+ if (BE (new_array_start == NULL || new_array_end == NULL, 0))
+ return REG_ESPACE;
+
+ mbcset->range_starts = new_array_start;
+ mbcset->range_ends = new_array_end;
+ *range_alloc = new_nranges;
+ }
+
+ mbcset->range_starts[mbcset->nranges] = start_collseq;
+ mbcset->range_ends[mbcset->nranges++] = end_collseq;
+ }
/* Build the table for single byte characters. */
for (ch = 0; ch <= SBC_MAX; ch++)
@@ -2862,13 +2856,9 @@ parse_bracket_exp (regexp, dfa, token, s
static inline reg_errcode_t
__attribute ((always_inline))
-# ifdef RE_ENABLE_I18N
build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
re_charset_t *mbcset;
int *coll_sym_alloc;
-# else /* not RE_ENABLE_I18N */
- build_collating_symbol (sbcset, name)
-# endif /* not RE_ENABLE_I18N */
re_bitset_ptr_t sbcset;
const unsigned char *name;
{
@@ -2894,7 +2884,6 @@ parse_bracket_exp (regexp, dfa, token, s
else
return REG_ECOLLATE;
-# ifdef RE_ENABLE_I18N
/* Got valid collation sequence, add it as a new entry. */
/* Check the space of the arrays. */
if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
@@ -2912,7 +2901,6 @@ parse_bracket_exp (regexp, dfa, token, s
*coll_sym_alloc = new_coll_sym_alloc;
}
mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
-# endif /* RE_ENABLE_I18N */
return REG_NOERROR;
}
else
@@ -2934,9 +2922,8 @@ parse_bracket_exp (regexp, dfa, token, s
re_charset_t *mbcset;
int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
int equiv_class_alloc = 0, char_class_alloc = 0;
-#else /* not RE_ENABLE_I18N */
- int non_match = 0;
#endif /* not RE_ENABLE_I18N */
+ int non_match = 0;
bin_tree_t *work_tree;
int token_len;
int first_round = 1;
@@ -2981,9 +2968,8 @@ parse_bracket_exp (regexp, dfa, token, s
{
#ifdef RE_ENABLE_I18N
mbcset->non_match = 1;
-#else /* not RE_ENABLE_I18N */
- non_match = 1;
#endif /* not RE_ENABLE_I18N */
+ non_match = 1;
if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
bitset_set (sbcset, '\0');
re_string_skip_bytes (regexp, token_len); /* Skip a token. */
@@ -3062,11 +3048,18 @@ parse_bracket_exp (regexp, dfa, token, s
token_len = peek_token_bracket (token, regexp, syntax);
+#ifdef _LIBC
+ *err = build_range_exp (sbcset, mbcset, &range_alloc,
+ &start_elem, &end_elem);
+#else
+# ifdef RE_ENABLE_I18N
*err = build_range_exp (sbcset,
-#ifdef RE_ENABLE_I18N
- mbcset, &range_alloc,
+ dfa->mb_cur_max > 1 ? mbcset : NULL,
+ &range_alloc, &start_elem, &end_elem);
+# else
+ *err = build_range_exp (sbcset, &start_elem, &end_elem);
+# endif
#endif /* RE_ENABLE_I18N */
- &start_elem, &end_elem);
if (BE (*err != REG_NOERROR, 0))
goto parse_bracket_exp_free_return;
}
@@ -3140,12 +3133,9 @@ parse_bracket_exp (regexp, dfa, token, s
re_string_skip_bytes (regexp, token_len); /* Skip a token. */
/* If it is non-matching list. */
-#ifdef RE_ENABLE_I18N
- if (mbcset->non_match)
-#else /* not RE_ENABLE_I18N */
if (non_match)
-#endif /* not RE_ENABLE_I18N */
bitset_not (sbcset);
+
#ifdef RE_ENABLE_I18N
/* Ensure only single byte characters are set. */
if (dfa->mb_cur_max > 1)
@@ -3316,7 +3306,7 @@ build_equiv_class (sbcset, name)
re_bitset_ptr_t sbcset;
const unsigned char *name;
{
-#if defined _LIBC && defined RE_ENABLE_I18N
+#if defined _LIBC
uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
if (nrules != 0)
{
@@ -3385,7 +3375,7 @@ build_equiv_class (sbcset, name)
mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
}
else
-#endif /* _LIBC && RE_ENABLE_I18N */
+#endif /* _LIBC */
{
if (BE (strlen ((const char *) name) != 1, 0))
return REG_ECOLLATE;
@@ -3481,20 +3471,18 @@ build_charclass (trans, sbcset, class_na
}
static bin_tree_t *
-build_charclass_op (dfa, trans, class_name, extra, not, err)
+build_charclass_op (dfa, trans, class_name, extra, non_match, err)
re_dfa_t *dfa;
unsigned RE_TRANSLATE_TYPE trans;
const unsigned char *class_name;
const unsigned char *extra;
- int not;
+ int non_match;
reg_errcode_t *err;
{
re_bitset_ptr_t sbcset;
#ifdef RE_ENABLE_I18N
re_charset_t *mbcset;
int alloc = 0;
-#else /* not RE_ENABLE_I18N */
- int non_match = 0;
#endif /* not RE_ENABLE_I18N */
reg_errcode_t ret;
re_token_t br_token;
@@ -3515,7 +3503,7 @@ build_charclass_op (dfa, trans, class_na
return NULL;
}
- if (not)
+ if (non_match)
{
#ifdef RE_ENABLE_I18N
/*
@@ -3523,8 +3511,6 @@ build_charclass_op (dfa, trans, class_na
bitset_set(cset->sbcset, '\0');
*/
mbcset->non_match = 1;
-#else /* not RE_ENABLE_I18N */
- non_match = 1;
#endif /* not RE_ENABLE_I18N */
}
@@ -3549,11 +3535,7 @@ build_charclass_op (dfa, trans, class_na
bitset_set (sbcset, *extra);
/* If it is non-matching list. */
-#ifdef RE_ENABLE_I18N
- if (mbcset->non_match)
-#else /* not RE_ENABLE_I18N */
if (non_match)
-#endif /* not RE_ENABLE_I18N */
bitset_not (sbcset);
#ifdef RE_ENABLE_I18N