This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Regex word char speedup


Hi!

Below is a patch which implements one of the alternatives I talked about.

Another thing which could help could be to copy preg->syntax and
preg->re_nsub in re_search_internal into re_string_t as well
and then kill all passing of preg around in routines called (indirectly)
from re_search_internal.  We could pass re_dfa_t * directly, or remove the
argument altogether depending on function.  That way we'd avoid the
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
in almost every routine in regexec.c.
Also, perhaps re_string_t and re_match_context_t types could be merged,
whether by adding re_match_context_t fields to re_string_t or adding
re_string_t as first field of re_match_context_t instead of the re_string_t
pointer.

2004-01-01  Jakub Jelinek  <jakub@redhat.com>
	    Paolo Bonzini  <bonzini@gnu.org>

	* posix/regex_internal.h (re_string_t): Add newline_anchor
	and word_char fields.
	(re_string_context_at, re_string_reconstruct): Remove last argument.
	* posix/regex_internal.c (re_string_allocate): Initialize
	pstr->word_char.
	(re_string_context_at): Remove newline_anchor argument.
	Use input->newline_anchor instead, swap && conditions.
	Only use IS_WIDE_WORD_CHAR if input->word_char != NULL.
	Use input->word_char bitmap instead of IS_WORD_CHAR.
	(re_string_reconstruct): Likewise.
	Adjust re_string_context_at caller.
	* posix/regexec.c (acquire_init_state_context,
	check_halt_state_context, transit_state, transit_state_sb,
	transit_state_mb, transit_state_bkref, check_arrival,
	check_node_accept): Adjust re_string_context_at and
	re_string_reconstruct callers.
	(re_search_internal): Likewise.  Set input.newline_anchor.
	(build_trtable): Use dfa->word_char bitmap instead of IS_WORD_CHAR.
	
--- libc/posix/regexec.c.jj	2004-01-01 15:44:58.000000000 +0100
+++ libc/posix/regexec.c	2004-01-01 16:07:59.000000000 +0100
@@ -1,5 +1,5 @@
 /* Extended regular expression matching and search library.
-   Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+   Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
 
@@ -616,6 +616,7 @@ re_search_internal (preg, string, length
     goto free_return;
   input.stop = stop;
   input.raw_stop = stop;
+  input.newline_anchor = preg->newline_anchor;
 
   err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2);
   if (BE (err != REG_NOERROR, 0))
@@ -708,8 +709,7 @@ re_search_internal (preg, string, length
 		  if (input.raw_mbs_idx + input.valid_raw_len <= match_first
 		      || match_first < input.raw_mbs_idx)
 		    {
-		      err = re_string_reconstruct (&input, match_first, eflags,
-						   preg->newline_anchor);
+		      err = re_string_reconstruct (&input, match_first, eflags);
 		      if (BE (err != REG_NOERROR, 0))
 			goto free_return;
 		    }
@@ -730,8 +730,7 @@ re_search_internal (preg, string, length
 
       /* Reconstruct the buffers so that the matcher can assume that
 	 the matching starts from the beginning of the buffer.  */
-      err = re_string_reconstruct (&input, match_first, eflags,
-				   preg->newline_anchor);
+      err = re_string_reconstruct (&input, match_first, eflags);
       if (BE (err != REG_NOERROR, 0))
 	goto free_return;
 #ifdef RE_ENABLE_I18N
@@ -939,8 +938,7 @@ acquire_init_state_context (err, preg, m
   if (dfa->init_state->has_constraint)
     {
       unsigned int context;
-      context = re_string_context_at (mctx->input, idx - 1, mctx->eflags,
-				      preg->newline_anchor);
+      context = re_string_context_at (mctx->input, idx - 1, mctx->eflags);
       if (IS_WORD_CONTEXT (context))
 	return dfa->init_state_word;
       else if (IS_ORDINARY_CONTEXT (context))
@@ -1103,8 +1101,7 @@ check_halt_state_context (preg, state, m
 #ifdef DEBUG
   assert (state->halt);
 #endif
-  context = re_string_context_at (mctx->input, idx, mctx->eflags,
-				  preg->newline_anchor);
+  context = re_string_context_at (mctx->input, idx, mctx->eflags);
   for (i = 0; i < state->nodes.nelem; ++i)
     if (check_halt_node_context (dfa, state->nodes.elems[i], context))
       return state->nodes.elems[i];
@@ -2177,7 +2174,7 @@ transit_state (err, preg, mctx, state)
 	      context
 		= re_string_context_at (mctx->input,
 					re_string_cur_idx (mctx->input) - 1,
-					mctx->eflags, preg->newline_anchor);
+					mctx->eflags);
 	      if (IS_WORD_CONTEXT (context))
 		next_state = trtable[ch + SBC_MAX];
 	      else
@@ -2236,7 +2233,7 @@ transit_state (err, preg, mctx, state)
 
 	  context = re_string_context_at (mctx->input,
 					  re_string_cur_idx (mctx->input) - 1,
-					  mctx->eflags, preg->newline_anchor);
+					  mctx->eflags);
 	  next_state = mctx->state_log[cur_idx]
 	    = re_acquire_state_context (err, dfa, &next_nodes, context);
 	  /* We don't need to check errors here, since the return value of
@@ -2340,8 +2337,7 @@ transit_state_sb (err, preg, state, mctx
 	    }
 	}
     }
-  context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags,
-				  preg->newline_anchor);
+  context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags);
   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
   /* We don't need to check errors here, since the return value of
      this function is next_state and ERR is already set.  */
@@ -2375,7 +2371,7 @@ transit_state_mb (preg, pstate, mctx)
 	{
 	  context = re_string_context_at (mctx->input,
 					  re_string_cur_idx (mctx->input),
-					  mctx->eflags, preg->newline_anchor);
+					  mctx->eflags);
 	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
 					   context))
 	    continue;
@@ -2412,8 +2408,7 @@ transit_state_mb (preg, pstate, mctx)
 	  if (BE (err != REG_NOERROR, 0))
 	    return err;
 	}
-      context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags,
-				      preg->newline_anchor);
+      context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags);
       mctx->state_log[dest_idx]
 	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
       if (dest_state != NULL)
@@ -2451,7 +2446,7 @@ transit_state_bkref (preg, nodes, mctx)
       if (node->constraint)
 	{
 	  context = re_string_context_at (mctx->input, cur_str_idx,
-					  mctx->eflags, preg->newline_anchor);
+					  mctx->eflags);
 	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
 	    continue;
 	}
@@ -2483,7 +2478,7 @@ transit_state_bkref (preg, nodes, mctx)
 	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
 			  - bkref_ent->subexp_from);
 	  context = re_string_context_at (mctx->input, dest_str_idx - 1,
-					  mctx->eflags, preg->newline_anchor);
+					  mctx->eflags);
 	  dest_state = mctx->state_log[dest_str_idx];
 	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
 			: mctx->state_log[cur_str_idx]->nodes.nelem);
@@ -2757,8 +2752,7 @@ check_arrival (preg, mctx, path, top_nod
   mctx->input->cur_idx = str_idx;
 
   /* Setup initial node set.  */
-  context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
-				  preg->newline_anchor);
+  context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags);
   if (str_idx == top_str)
     {
       err = re_node_set_init_1 (&next_nodes, top_node);
@@ -2844,8 +2838,7 @@ check_arrival (preg, mctx, path, top_nod
 	      return err;
 	    }
 	}
-      context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
-				      preg->newline_anchor);
+      context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags);
       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
 	{
@@ -3304,7 +3297,8 @@ out_free:
 		;
 
 	      /* j-th destination accepts the word character ch.  */
-	      if (IS_WORD_CHAR (ch))
+	      if (BE (dfa->word_char != NULL, 0)
+		  && (dfa->word_char[i] & mask))
 		trtable[ch] = dest_states_word[j];
 	      else
 		trtable[ch] = dest_states[j];
@@ -3880,8 +3874,7 @@ check_node_accept (preg, node, mctx, idx
       /* The node has constraints.  Check whether the current context
 	 satisfies the constraints.  */
       unsigned int context = re_string_context_at (mctx->input, idx,
-						   mctx->eflags,
-						   preg->newline_anchor);
+						   mctx->eflags);
       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
 	return 0;
     }
--- libc/posix/regex_internal.h.jj	2003-12-28 00:34:45.000000000 +0100
+++ libc/posix/regex_internal.h	2004-01-01 15:53:28.000000000 +0100
@@ -1,5 +1,5 @@
 /* Extended regular expression matching and search library.
-   Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+   Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
 
@@ -337,12 +337,15 @@ struct re_string_t
   unsigned int tip_context;
   /* The translation passed as a part of an argument of re_compile_pattern.  */
   RE_TRANSLATE_TYPE trans;
+  /* Copy of re_dfa_t's word_char.  */
+  re_bitset_ptr_t word_char;
   /* 1 if REG_ICASE.  */
   unsigned char icase;
   unsigned char is_utf8;
   unsigned char map_notascii;
   unsigned char mbs_allocated;
   unsigned char offsets_needed;
+  unsigned char newline_anchor;
   int mb_cur_max;
 };
 typedef struct re_string_t re_string_t;
@@ -368,7 +371,7 @@ static reg_errcode_t re_string_construct
 					  int len, RE_TRANSLATE_TYPE trans,
 					  int icase, const re_dfa_t *dfa) internal_function;
 static reg_errcode_t re_string_reconstruct (re_string_t *pstr, int idx,
-					    int eflags, int newline) internal_function;
+					    int eflags) internal_function;
 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
 						int new_buf_len) internal_function;
 # ifdef RE_ENABLE_I18N
@@ -384,7 +387,7 @@ static inline int re_string_char_size_at
 static inline wint_t re_string_wchar_at (const re_string_t *pstr, int idx) internal_function;
 # endif /* RE_ENABLE_I18N */
 static unsigned int re_string_context_at (const re_string_t *input, int idx,
-					  int eflags, int newline_anchor) internal_function;
+					  int eflags) internal_function;
 static unsigned char re_string_peek_byte_case (const re_string_t *pstr,
 					       int idx) internal_function;
 static unsigned char re_string_fetch_byte_case (re_string_t *pstr) internal_function;
--- libc/posix/regex_internal.c.jj	2003-12-28 00:34:31.000000000 +0100
+++ libc/posix/regex_internal.c	2004-01-01 16:22:22.000000000 +0100
@@ -1,5 +1,5 @@
 /* Extended regular expression matching and search library.
-   Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+   Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
 
@@ -67,6 +67,7 @@ re_string_allocate (pstr, str, len, init
   if (BE (ret != REG_NOERROR, 0))
     return ret;
 
+  pstr->word_char = dfa->word_char;
   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
   pstr->valid_raw_len = pstr->valid_len;
@@ -572,9 +573,9 @@ re_string_translate_buffer (pstr)
    convert to upper case in case of REG_ICASE, apply translation.  */
 
 static reg_errcode_t
-re_string_reconstruct (pstr, idx, eflags, newline)
+re_string_reconstruct (pstr, idx, eflags)
      re_string_t *pstr;
-     int idx, eflags, newline;
+     int idx, eflags;
 {
   int offset = idx - pstr->raw_mbs_idx;
   if (offset < 0)
@@ -609,8 +610,7 @@ re_string_reconstruct (pstr, idx, eflags
 	 )
 	{
 	  /* Yes, move them to the front of the buffer.  */
-	  pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags,
-						    newline);
+	  pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 #ifdef RE_ENABLE_I18N
 	  if (pstr->mb_cur_max > 1)
 	    memmove (pstr->wcs, pstr->wcs + offset,
@@ -695,9 +695,12 @@ re_string_reconstruct (pstr, idx, eflags
 		    memset (pstr->mbs, 255, pstr->valid_len);
 		}
 	      pstr->valid_raw_len = pstr->valid_len;
-	      pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD
-				   : ((newline && IS_WIDE_NEWLINE (wc))
-				      ? CONTEXT_NEWLINE : 0));
+	      pstr->tip_context = (BE (pstr->word_char != NULL, 0)
+				   && IS_WIDE_WORD_CHAR (wc))
+				  ? CONTEXT_WORD
+				  : ((IS_WIDE_NEWLINE (wc)
+				      && pstr->newline_anchor)
+				     ? CONTEXT_NEWLINE : 0);
 	    }
 	  else
 #endif /* RE_ENABLE_I18N */
@@ -705,9 +708,11 @@ re_string_reconstruct (pstr, idx, eflags
 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 	      if (pstr->trans)
 		c = pstr->trans[c];
-	      pstr->tip_context = (IS_WORD_CHAR (c) ? CONTEXT_WORD
-				   : ((newline && IS_NEWLINE (c))
-				      ? CONTEXT_NEWLINE : 0));
+	      pstr->tip_context = (BE (pstr->word_char != NULL, 0)
+				   && bitset_contain (pstr->word_char, c))
+				  ? CONTEXT_WORD
+				  : ((IS_NEWLINE (c) && pstr->newline_anchor)
+				     ? CONTEXT_NEWLINE : 0);
 	    }
 	}
       if (!pstr->mbs_allocated)
@@ -843,9 +848,9 @@ re_string_destruct (pstr)
 /* Return the context at IDX in INPUT.  */
 
 static unsigned int
-re_string_context_at (input, idx, eflags, newline_anchor)
+re_string_context_at (input, idx, eflags)
      const re_string_t *input;
-     int idx, eflags, newline_anchor;
+     int idx, eflags;
 {
   int c;
   if (idx < 0 || idx == input->len)
@@ -874,17 +879,18 @@ re_string_context_at (input, idx, eflags
 	    return input->tip_context;
 	}
       wc = input->wcs[wc_idx];
-      if (IS_WIDE_WORD_CHAR (wc))
+      if (BE (input->word_char != NULL, 0) && IS_WIDE_WORD_CHAR (wc))
 	return CONTEXT_WORD;
-      return (newline_anchor && IS_WIDE_NEWLINE (wc)) ? CONTEXT_NEWLINE : 0;
+      return (IS_WIDE_NEWLINE (wc) && input->newline_anchor)
+	     ? CONTEXT_NEWLINE : 0;
     }
   else
 #endif
     {
       c = re_string_byte_at (input, idx);
-      if (IS_WORD_CHAR (c))
+      if (BE (input->word_char != NULL, 0) && bitset_contain (input->word_char, c))
 	return CONTEXT_WORD;
-      return (newline_anchor && IS_NEWLINE (c)) ? CONTEXT_NEWLINE : 0;
+      return (IS_NEWLINE (c) && input->newline_anchor) ? CONTEXT_NEWLINE : 0;
     }
 }
 

	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]