This is the mail archive of the guile@sourceware.cygnus.com mailing list for the Guile project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

regex speedup ???


Hello Guilers, sorry for my english.

Some (rather long) time ago, i guessed that regexp-exec
was slow because match structures are translated
from raw C api to scheme vectors and pairs.

So i started to make these match structures opaque,
to avoid consing & co. Tonight, i resurrected this 
mostly *untested* :( stuff, and made a patch.

Oh, i have absolutly no idea about performance impacts ;)
The tradoff might be: now you do regexp-exec fast, but you might
be slower at examining results.
Anyway, i like opaque types.

Another concern about regexp-exec is the SCM_COERCE_SUBSTR(str)
done because regexec wants its string argument 0 terminated. 
I think there exists somewhere in the gnu land (at least in
my old debian) a function:

 int re_match_2(rx,str1,len1,str2,len2,start,regs,stop);

if we can use it we avoid the SCM_COERCE_SUBSTR, and then we
can do nice things with shared/copy-on write substrings
and regexps.

- New primitives:
(make-match)
Make an empty match.

(regexp-exec! match rx str start flags)
Fast? destructive version of @code{regexp-exec}.
Results are stored in the match structure @var{match}.
Return @var{match}, or @code{#f} if no match could be found.

- Procedures from regex.scm made primitives:

(regexp-match? x)
Return @code{#t} if @var{obj} is a regular expression match, or
@code{#f} otherwise.

(match:count match)
(match:string match)
(match:start match index)
(match:end match index)

For regexp-exec!:

guile> (define rx (make-regexp "ab"))                                                            
guile> (define m (make-match))
guile> (match:count m)
0
guile> (match:string m)
#f
guile> (regexp-exec! m rx "ab")                                                 
#<regexp-match 8084e98>
guile> (match:count m)
1
guile> (match:string m)
"ab"
guile> (regexp-exec! m rx "cd")
#f
guile> (match:count m)
0
guile> (match:string m)
#f
guile> 

Charbel.
diff -ru guile-core/ice-9/regex.scm guile-dev/ice-9/regex.scm
--- guile-core/ice-9/regex.scm	Thu Dec 23 04:26:35 1999
+++ guile-dev/ice-9/regex.scm	Thu Dec 23 03:03:03 1999
@@ -29,12 +29,6 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;; These procedures are not defined in SCSH, but I found them useful.
 
-(define-public (match:count match)
-  (- (vector-length match) 1))
-
-(define-public (match:string match)
-  (vector-ref match 0))
-
 (define-public (match:prefix match)
   (make-shared-substring (match:string match)
 			 0
@@ -47,17 +41,6 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;; SCSH compatibility routines.
 
-(define-public (regexp-match? match)
-  (and (vector? match)
-       (string? (vector-ref match 0))
-       (let loop ((i 1))
-	 (cond ((>= i (vector-length match)) #t)
-	       ((and (pair? (vector-ref match i))
-		     (integer? (car (vector-ref match i)))
-		     (integer? (cdr (vector-ref match i))))
-		(loop (+ 1 i)))
-	       (else #f)))))
-
 (define-public (regexp-quote regexp)
   (call-with-output-string
    (lambda (p)
@@ -69,20 +52,6 @@
 		 (write-char #\\ p)))
 	      (write-char (string-ref regexp i) p)
 	      (loop (1+ i))))))))
-
-(define-public (match:start match . args)
-  (let* ((matchnum (if (pair? args)
-		       (+ 1 (car args))
-		       1))
-	 (start (car (vector-ref match matchnum))))
-    (if (= start -1) #f start)))
-
-(define-public (match:end match . args)
-  (let* ((matchnum (if (pair? args)
-		       (+ 1 (car args))
-		       1))
-	 (end (cdr (vector-ref match matchnum))))
-    (if (= end -1) #f end)))
 
 (define-public (match:substring match . args)
   (let* ((matchnum (if (pair? args)
diff -ru guile-core/libguile/regex-posix.c guile-dev/libguile/regex-posix.c
--- guile-core/libguile/regex-posix.c	Thu Dec 23 04:25:00 1999
+++ guile-dev/libguile/regex-posix.c	Thu Dec 23 04:29:23 1999
@@ -101,6 +101,63 @@
   return sizeof(regex_t);
 }
 
+long scm_tc16_match;
+
+typedef struct scm_match {
+  SCM str;
+  int size;
+  int length;
+  regmatch_t matches[1];
+} scm_match_t;
+
+#define SCM_MATCH_INFO(X)   ((scm_match_t *)SCM_CDR(X))
+
+static SCM
+make_match(SCM str, int length)
+{
+  scm_match_t *match;
+  match = (scm_match_t *) scm_must_malloc (sizeof(scm_match_t)
+					     + length * sizeof(regmatch_t),
+					     "make_match");
+  match->str = str;
+  match->size = length;
+  match->length = length;
+  SCM_RETURN_NEWSMOB (scm_tc16_match, match);
+}
+
+static SCM
+recycle_match(SCM m, SCM str, int length)
+{
+  scm_match_t *match = SCM_MATCH_INFO(m);
+  if (match->size < length)
+    {
+      match = (scm_match_t *) scm_must_realloc (match,
+						match->size, length,
+						"recycle_match");
+      match->size = length;
+    }
+  match->str = str;
+  match->length = length;
+  SCM_SET_SMOB_DATA(m,math);
+  return m;
+}
+
+static scm_sizet
+free_match (SCM obj)
+{
+  scm_match_t *match = SCM_MATCH_INFO(obj);
+  int size = match->size;
+  free (match);
+  return sizeof(scm_match_t) + (size - 1) * sizeof(regmatch_t);
+}
+
+
+static SCM 
+mark_match (SCM ptr)
+{
+  return SCM_MATCH_INFO(ptr)->str;
+}
+
 
 
 SCM_SYMBOL (scm_regexp_error_key, "regular-expression-syntax");
@@ -110,7 +167,7 @@
 {
   SCM errmsg;
   int l;
-
+  
   /* FIXME: must we wrap any external calls in SCM_DEFER_INTS...SCM_ALLOW_INTS?
      Or are these only necessary when a SCM object may be left in an
      undetermined state (half-formed)?  If the latter then I believe we
@@ -133,6 +190,74 @@
   return SCM_CHARS (errmsg);
 }
 
+GUILE_PROC (scm_make_match, "make-match", 0, 0, 0, 
+            (void),
+"")
+#define FUNC_NAME s_scm_make_match
+{
+  return make_match(SCM_BOOL_F, 0);
+}
+#undef FUNC_NAME
+
+GUILE_PROC (scm_regexp_match_p, "regexp-match?", 1, 0, 0, 
+            (SCM x),
+"Return @code{#t} if @var{obj} is a regular expression match, or
+@code{#f} otherwise.")
+#define FUNC_NAME s_scm_regexp_match_p
+{
+  return SCM_BOOL(SCM_MATCHP (x));
+}
+#undef FUNC_NAME
+
+GUILE_PROC (scm_match_count, "match:count", 1, 0, 0, 
+            (SCM match),
+"")
+#define FUNC_NAME s_scm_match_count
+{
+  SCM_VALIDATE_MATCHP(1,match);
+  return SCM_MAKINUM(SCM_MATCH_INFO(match)->length);
+}
+#undef FUNC_NAME
+
+GUILE_PROC (scm_match_string, "match:string", 1, 0, 0, 
+            (SCM match),
+"")
+#define FUNC_NAME s_scm_match_string
+{
+  SCM_VALIDATE_MATCHP(1,match);
+  return SCM_MATCH_INFO(match)->str;
+}
+#undef FUNC_NAME
+
+
+
+GUILE_PROC (scm_match_start, "match:start", 1, 1, 0, 
+            (SCM match, SCM index),
+"")
+#define FUNC_NAME s_scm_match_start
+{
+  int i;
+  SCM_VALIDATE_MATCHP(1,match);
+  SCM_VALIDATE_INT_DEF_COPY(2,index,0,i);
+  SCM_ASSERT_RANGE (2,index,i >= 0 && i < SCM_MATCH_INFO(match)->length);
+  return SCM_MAKINUM(SCM_MATCH_INFO(match)->matches[i].rm_so);
+}
+#undef FUNC_NAME
+
+GUILE_PROC (scm_match_end, "match:end", 1, 1, 0, 
+            (SCM match, SCM index),
+"")
+#define FUNC_NAME s_scm_match_end
+{
+  int i;
+  SCM_VALIDATE_MATCHP(1,match);
+  SCM_VALIDATE_INT_DEF_COPY(2,index,0,i);
+  SCM_ASSERT_RANGE (2,index,i >= 0 && i < SCM_MATCH_INFO(match)->length);
+  return SCM_MAKINUM(SCM_MATCH_INFO(match)->matches[i].rm_eo);
+}
+#undef FUNC_NAME
+
+
 GUILE_PROC (scm_regexp_p, "regexp?", 1, 0, 0, 
             (SCM x),
 "Return @code{#t} if @var{obj} is a compiled regular expression, or
@@ -232,7 +357,6 @@
 #define FUNC_NAME s_scm_regexp_exec
 {
   int status, nmatches, offset;
-  regmatch_t *matches;
   SCM mvec = SCM_BOOL_F;
 
   SCM_VALIDATE_RGXP(1,rx);
@@ -248,36 +372,63 @@
      whole regexp, so add 1 to nmatches. */
 
   nmatches = SCM_RGX(rx)->re_nsub + 1;
-  SCM_DEFER_INTS;
-  matches = SCM_MUST_MALLOC_TYPE_NUM (regmatch_t,nmatches);
+  mvec = make_match(str, nmatches);
+
   status = regexec (SCM_RGX (rx), SCM_ROCHARS (str) + offset,
-		    nmatches, matches,
+		    nmatches, SCM_MATCH_INFO(mvec)->matches,
 		    SCM_INUM (flags));
-  if (!status)
-    {
-      int i;
-      /* The match vector must include a cell for the string that was matched,
-	 so add 1. */
-      mvec = scm_make_vector (SCM_MAKINUM (nmatches + 1), SCM_UNSPECIFIED);
-      SCM_VELTS(mvec)[0] = str;
-      for (i = 0; i < nmatches; ++i)
-	if (matches[i].rm_so == -1)
-	  SCM_VELTS(mvec)[i+1] = scm_cons (SCM_MAKINUM (-1), SCM_MAKINUM (-1));
-	else
-	  SCM_VELTS(mvec)[i+1]
-	    = scm_cons(SCM_MAKINUM(matches[i].rm_so + offset),
-		       SCM_MAKINUM(matches[i].rm_eo + offset));
-    }
-  scm_must_free ((char *) matches);
-  SCM_ALLOW_INTS;
-
   if (status != 0 && status != REG_NOMATCH)
     scm_error (scm_regexp_error_key,
 	       FUNC_NAME,
 	       scm_regexp_error_msg (status, SCM_RGX (rx)),
 	       SCM_BOOL_F,
 	       SCM_BOOL_F);
-  return mvec;
+  return status ? SCM_BOOL_F : mvec;
+}
+#undef FUNC_NAME
+
+GUILE_PROC (scm_regexp_exec_x, "regexp-exec!", 3, 2, 0, 
+            (SCM match, SCM rx, SCM str, SCM start, SCM flags),
+"Fast? destructive version of @code{regexp-exec}.
+Results are stored in the match structure @var{match}.
+Return @var{match}, or @code{#f} if no match could be found.")
+#define FUNC_NAME s_scm_regexp_exec_x
+{
+  int status, nmatches, offset;
+
+  SCM_VALIDATE_MATCHP(2,match);
+  SCM_VALIDATE_RGXP(2,rx);
+  SCM_VALIDATE_ROSTRING(3,str);
+  SCM_VALIDATE_INT_DEF_COPY(4,start,0,offset);
+  SCM_ASSERT_RANGE (4,start,offset >= 0 && (unsigned) offset <= SCM_LENGTH (str));
+  if (SCM_UNBNDP (flags))
+    flags = SCM_INUM0;
+  SCM_VALIDATE_INT(5,flags);
+  SCM_COERCE_SUBSTR (str);
+
+  /* re_nsub doesn't account for the `subexpression' representing the
+     whole regexp, so add 1 to nmatches. */
+
+  nmatches = SCM_RGX(rx)->re_nsub + 1;
+  match = recycle_match(match, str, nmatches);
+
+  status = regexec (SCM_RGX (rx), SCM_ROCHARS (str) + offset,
+		    nmatches, SCM_MATCH_INFO(match)->matches,
+		    SCM_INUM (flags));
+  if (status == 0)
+    return match;
+  else
+    {
+      SCM_MATCH_INFO(match)->length = 0;
+      SCM_MATCH_INFO(match)->str = SCM_BOOL_F;
+      if (status != REG_NOMATCH)
+	scm_error (scm_regexp_error_key,
+		   FUNC_NAME,
+		   scm_regexp_error_msg (status, SCM_RGX (rx)),
+		   SCM_BOOL_F,
+		   SCM_BOOL_F);
+      return SCM_BOOL_F;
+    }
 }
 #undef FUNC_NAME
 
@@ -286,6 +437,10 @@
 {
   scm_tc16_regex = scm_make_smob_type_mfpe ("regexp", sizeof (regex_t),
                                             NULL, free_regex, NULL, NULL);
+
+  scm_tc16_match = scm_make_smob_type_mfpe ("regexp-match",
+					    sizeof (scm_match_t), /* hmm, actually, variable sized */
+                                            mark_match, free_match, NULL, NULL);
 
   /* Compilation flags.  */
   scm_sysintern ("regexp/basic", scm_long2num (REG_BASIC));
diff -ru guile-core/libguile/regex-posix.h guile-dev/libguile/regex-posix.h
--- guile-core/libguile/regex-posix.h	Thu Dec 23 04:25:00 1999
+++ guile-dev/libguile/regex-posix.h	Thu Dec 23 01:33:49 1999
@@ -59,4 +59,13 @@
 extern SCM scm_regexp_exec SCM_P ((SCM rx, SCM str, SCM start, SCM flags));
 extern void scm_init_regex_posix SCM_P ((void));
 
+extern long scm_tc16_match;
+#define SCM_MATCHP(X)	((SCM_TYP16(X) == scm_tc16_match))
+extern SCM scm_regexp_match_p SCM_P ((SCM x));
+extern SCM scm_match_count SCM_P ((SCM match));
+extern SCM scm_match_string SCM_P ((SCM match));
+extern SCM scm_match_start SCM_P ((SCM match, SCM index));
+extern SCM scm_match_end SCM_P ((SCM match, SCM index));
+
+
 #endif
diff -ru guile-core/libguile/scm_validate.h guile-dev/libguile/scm_validate.h
--- guile-core/libguile/scm_validate.h	Thu Dec 23 04:25:00 1999
+++ guile-dev/libguile/scm_validate.h	Thu Dec 23 01:28:25 1999
@@ -227,6 +227,7 @@
 #define SCM_VALIDATE_HOOK(pos,a) SCM_MAKE_VALIDATE(pos,a,HOOKP)
 
 #define SCM_VALIDATE_RGXP(pos,a) SCM_MAKE_VALIDATE(pos,a,RGXP)
+#define SCM_VALIDATE_MATCHP(pos,a) SCM_MAKE_VALIDATE(pos,a,MATCHP)
 
 #define SCM_VALIDATE_OPDIR(pos,port) SCM_MAKE_VALIDATE(pos,port,OPDIRP)
 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]