This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[regex] Speedup transit_state (with patch)


These are the optimizations that I sent last March.  They have now survived sed 4.1's inclusion in several distros and the bug that appeared in glibc was found.  In addition I've split them to ease review and isolate any other bugs better.

This one optimizes transit_state to make it faster except when iswalnum has to be called, which is very slow anyway.

The stupid webmail does not attach the file unless you click "attach" again before "send", so here it is.

Ok?

Paolo

2004-04-27  Paolo Bonzini  <bonzini@gnu.org>

	* regex_internal.h (struct re_dfastate_t): Make
	word_trtable a pointer to the 512-item transition table.
	* regexec.c (build_trtable): Fill in either state->trtable
	or state->word_trtable.  Return a boolean indicating success.
	(transit_state): Expect state->trtable to be a 256-item
	transition table.  Reorganize code to have less tests in
	the common case, and to save an indentation level.

diff -ru lib_faster/regex_internal.h lib/regex_internal.h
--- lib_faster/regex_internal.h	2004-10-27 12:09:33.000000000 +0200
+++ lib/regex_internal.h	2004-10-27 12:10:06.000000000 +0200
@@ -487,7 +487,7 @@
   unsigned int hash;
   re_node_set nodes;
   re_node_set *entrance_nodes;
-  struct re_dfastate_t **trtable;
+  struct re_dfastate_t **trtable, **word_trtable;
   unsigned int context : 4;
   unsigned int halt : 1;
   /* If this state can accept `multi byte'.
@@ -497,7 +497,6 @@
   /* If this state has backreference node(s).  */
   unsigned int has_backref : 1;
   unsigned int has_constraint : 1;
-  unsigned int word_trtable : 1;
 };
 typedef struct re_dfastate_t re_dfastate_t;
 
diff -ru lib_faster/regexec.c lib/regexec.c
--- lib_faster/regexec.c	2004-10-27 12:09:33.000000000 +0200
+++ lib/regexec.c	2004-10-27 12:17:23.000000000 +0200
@@ -172,8 +172,8 @@
 					 re_node_set *cur_nodes, int cur_str,
 					 int last_str, int subexp_num,
 					 int type) internal_function;
-static re_dfastate_t **build_trtable (re_dfa_t *dfa,
-				      re_dfastate_t *state) internal_function;
+static int build_trtable (re_dfa_t *dfa,
+			  re_dfastate_t *state) internal_function;
 #ifdef RE_ENABLE_I18N
 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 				    const re_string_t *input, int idx) internal_function;
@@ -2168,7 +2168,6 @@
      re_match_context_t *mctx;
      re_dfastate_t *state;
 {
-  re_dfa_t *const dfa = mctx->dfa;
   re_dfastate_t **trtable;
   unsigned char ch;
 
@@ -2192,21 +2192,22 @@
 #endif /* RE_ENABLE_I18N */
 
   /* Then decide the next state with the single byte.  */
-  if (1)
+#if 0
+  if (0)
+    /* don't use transition table  */
+    return transit_state_sb (err, mctx, state);
+#endif
+
+  /* Use transition table  */
+  ch = re_string_fetch_byte (&mctx->input);
+  for (;;)
     {
-      /* Use transition table  */
-      ch = re_string_fetch_byte (&mctx->input);
       trtable = state->trtable;
-      if (trtable == NULL)
-        {
-          trtable = build_trtable (dfa, state);
-          if (trtable == NULL)
-	    {
-	      *err = REG_ESPACE;
-	      return NULL;
-	    }
-	}
-      if (BE (state->word_trtable, 0))
+      if (BE (trtable != NULL, 1))
+	return trtable[ch];
+
+      trtable = state->word_trtable;
+      if (BE (trtable != NULL, 1))
         {
 	  unsigned int context;
 	  context
@@ -2218,14 +2219,15 @@
 	  else
 	    return trtable[ch];
 	}
-      else
-	return trtable[ch];
+
+      if (!build_trtable (mctx->dfa, state))
+	{
+	  *err = REG_ESPACE;
+	  return NULL;
+	}
+
+      /* Retry, we now have a transition table.  */
     }
-#if 0
-  else
-    /* don't use transition table  */
-    return transit_state_sb (err, mctx, state);
-#endif
 }
 
 /* Update the state_log if we need */
@@ -3226,15 +3228,15 @@
 }
 
 /* Build transition table for the state.
-   Return the new table if succeeded, otherwise return NULL.  */
+   Return 1 if succeeded, otherwise return NULL.  */
 
-static re_dfastate_t **
+static int
 build_trtable (dfa, state)
     re_dfa_t *dfa;
     re_dfastate_t *state;
 {
   reg_errcode_t err;
-  int i, j, ch;
+  int i, j, ch, need_word_trtable = 0;
   unsigned int elem, mask;
   int dests_node_malloced = 0, dest_states_malloced = 0;
   int ndests; /* Number of the destination states from `state'.  */
@@ -3258,13 +3260,13 @@
       dests_node = (re_node_set *)
 		   malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
       if (BE (dests_node == NULL, 0))
-	return NULL;
+	return 0;
       dests_node_malloced = 1;
     }
   dests_ch = (bitset *) (dests_node + SBC_MAX);
 
   /* Initialize transiton table.  */
-  state->word_trtable = 0;
+  state->word_trtable = state->trtable = 0;
 
   /* At first, group all nodes belonging to `state' into several
      destinations.  */
@@ -3273,14 +3275,14 @@
     {
       if (dests_node_malloced)
 	free (dests_node);
-      /* Return NULL in case of an error, trtable otherwise.  */
+      /* Return 0 in case of an error, 1 otherwise.  */
       if (ndests == 0)
 	{
 	  state->trtable = (re_dfastate_t **)
 	    calloc (sizeof (re_dfastate_t *), SBC_MAX);;
-	  return state->trtable;
+	  return 1;
 	}
-      return NULL;
+      return 0;
     }
 
   err = re_node_set_alloc (&follows, ndests + 1);
@@ -3307,7 +3309,7 @@
 	    re_node_set_free (dests_node + i);
 	  if (dests_node_malloced)
 	    free (dests_node);
-	  return NULL;
+	  return 0;
 	}
       dest_states_malloced = 1;
     }
@@ -3345,7 +3347,7 @@
 
 	  if (dest_states[i] != dest_states_word[i]
 	      && dfa->mb_cur_max > 1)
-	    state->word_trtable = 1;
+	    need_word_trtable = 1;
 
 	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
 							CONTEXT_NEWLINE);
@@ -3360,13 +3362,14 @@
       bitset_merge (acceptable, dests_ch[i]);
     }
 
-  if (!BE (state->word_trtable, 0))
+  if (!BE (need_word_trtable, 0))
     {
       /* We don't care about whether the following character is a word
 	 character, or we are in a single-byte character set so we can
 	 discern by looking at the character code: allocate a
 	 256-entry transition table.  */
-      trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
+      trtable = state->trtable =
+	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
       if (BE (trtable == NULL, 0))
 	goto out_free;
 
@@ -3396,8 +3399,8 @@
 	 by looking at the character code: build two 256-entry
 	 transition tables, one starting at trtable[0] and one
 	 starting at trtable[SBC_MAX].  */
-      trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
-					   2 * SBC_MAX);
+      trtable = state->word_trtable =
+	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
       if (BE (trtable == NULL, 0))
 	goto out_free;
 
@@ -3428,7 +3431,7 @@
 	  {
 	    /* k-th destination accepts newline character.  */
 	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
-	    if (state->word_trtable)
+	    if (need_word_trtable)
 	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
 	    /* There must be only one destination which accepts
 	       newline.  See group_nodes_into_DFAstates.  */
@@ -3446,8 +3449,7 @@
   if (dests_node_malloced)
     free (dests_node);
 
-  state->trtable = trtable;
-  return trtable;
+  return 1;
 }
 
 /* Group all nodes belonging to STATE into several destinations.
--- lib_faster/regex_internal.c	2004-10-27 12:09:33.000000000 +0200
+++ lib/regex_internal.c	2004-10-27 12:47:44.000000000 +0200
@@ -1645,6 +1645,7 @@
       re_free (state->entrance_nodes);
     }
   re_node_set_free (&state->nodes);
+  re_free (state->word_trtable);
   re_free (state->trtable);
   re_free (state);
 }

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]