This is the mail archive of the
guile@sourceware.cygnus.com
mailing list for the Guile project.
regex speedup ???
- To: "guile at sourceware dot cygnus dot com" <guile at sourceware dot cygnus dot com>
- Subject: regex speedup ???
- From: Charbel Jacquin <cjacquin at club-internet dot fr>
- Date: Thu, 23 Dec 1999 04:42:57 +0100
Hello Guilers, sorry for my english.
Some (rather long) time ago, i guessed that regexp-exec
was slow because match structures are translated
from raw C api to scheme vectors and pairs.
So i started to make these match structures opaque,
to avoid consing & co. Tonight, i resurrected this
mostly *untested* :( stuff, and made a patch.
Oh, i have absolutly no idea about performance impacts ;)
The tradoff might be: now you do regexp-exec fast, but you might
be slower at examining results.
Anyway, i like opaque types.
Another concern about regexp-exec is the SCM_COERCE_SUBSTR(str)
done because regexec wants its string argument 0 terminated.
I think there exists somewhere in the gnu land (at least in
my old debian) a function:
int re_match_2(rx,str1,len1,str2,len2,start,regs,stop);
if we can use it we avoid the SCM_COERCE_SUBSTR, and then we
can do nice things with shared/copy-on write substrings
and regexps.
- New primitives:
(make-match)
Make an empty match.
(regexp-exec! match rx str start flags)
Fast? destructive version of @code{regexp-exec}.
Results are stored in the match structure @var{match}.
Return @var{match}, or @code{#f} if no match could be found.
- Procedures from regex.scm made primitives:
(regexp-match? x)
Return @code{#t} if @var{obj} is a regular expression match, or
@code{#f} otherwise.
(match:count match)
(match:string match)
(match:start match index)
(match:end match index)
For regexp-exec!:
guile> (define rx (make-regexp "ab"))
guile> (define m (make-match))
guile> (match:count m)
0
guile> (match:string m)
#f
guile> (regexp-exec! m rx "ab")
#<regexp-match 8084e98>
guile> (match:count m)
1
guile> (match:string m)
"ab"
guile> (regexp-exec! m rx "cd")
#f
guile> (match:count m)
0
guile> (match:string m)
#f
guile>
Charbel.
diff -ru guile-core/ice-9/regex.scm guile-dev/ice-9/regex.scm
--- guile-core/ice-9/regex.scm Thu Dec 23 04:26:35 1999
+++ guile-dev/ice-9/regex.scm Thu Dec 23 03:03:03 1999
@@ -29,12 +29,6 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;; These procedures are not defined in SCSH, but I found them useful.
-(define-public (match:count match)
- (- (vector-length match) 1))
-
-(define-public (match:string match)
- (vector-ref match 0))
-
(define-public (match:prefix match)
(make-shared-substring (match:string match)
0
@@ -47,17 +41,6 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;; SCSH compatibility routines.
-(define-public (regexp-match? match)
- (and (vector? match)
- (string? (vector-ref match 0))
- (let loop ((i 1))
- (cond ((>= i (vector-length match)) #t)
- ((and (pair? (vector-ref match i))
- (integer? (car (vector-ref match i)))
- (integer? (cdr (vector-ref match i))))
- (loop (+ 1 i)))
- (else #f)))))
-
(define-public (regexp-quote regexp)
(call-with-output-string
(lambda (p)
@@ -69,20 +52,6 @@
(write-char #\\ p)))
(write-char (string-ref regexp i) p)
(loop (1+ i))))))))
-
-(define-public (match:start match . args)
- (let* ((matchnum (if (pair? args)
- (+ 1 (car args))
- 1))
- (start (car (vector-ref match matchnum))))
- (if (= start -1) #f start)))
-
-(define-public (match:end match . args)
- (let* ((matchnum (if (pair? args)
- (+ 1 (car args))
- 1))
- (end (cdr (vector-ref match matchnum))))
- (if (= end -1) #f end)))
(define-public (match:substring match . args)
(let* ((matchnum (if (pair? args)
diff -ru guile-core/libguile/regex-posix.c guile-dev/libguile/regex-posix.c
--- guile-core/libguile/regex-posix.c Thu Dec 23 04:25:00 1999
+++ guile-dev/libguile/regex-posix.c Thu Dec 23 04:29:23 1999
@@ -101,6 +101,63 @@
return sizeof(regex_t);
}
+long scm_tc16_match;
+
+typedef struct scm_match {
+ SCM str;
+ int size;
+ int length;
+ regmatch_t matches[1];
+} scm_match_t;
+
+#define SCM_MATCH_INFO(X) ((scm_match_t *)SCM_CDR(X))
+
+static SCM
+make_match(SCM str, int length)
+{
+ scm_match_t *match;
+ match = (scm_match_t *) scm_must_malloc (sizeof(scm_match_t)
+ + length * sizeof(regmatch_t),
+ "make_match");
+ match->str = str;
+ match->size = length;
+ match->length = length;
+ SCM_RETURN_NEWSMOB (scm_tc16_match, match);
+}
+
+static SCM
+recycle_match(SCM m, SCM str, int length)
+{
+ scm_match_t *match = SCM_MATCH_INFO(m);
+ if (match->size < length)
+ {
+ match = (scm_match_t *) scm_must_realloc (match,
+ match->size, length,
+ "recycle_match");
+ match->size = length;
+ }
+ match->str = str;
+ match->length = length;
+ SCM_SET_SMOB_DATA(m,math);
+ return m;
+}
+
+static scm_sizet
+free_match (SCM obj)
+{
+ scm_match_t *match = SCM_MATCH_INFO(obj);
+ int size = match->size;
+ free (match);
+ return sizeof(scm_match_t) + (size - 1) * sizeof(regmatch_t);
+}
+
+
+static SCM
+mark_match (SCM ptr)
+{
+ return SCM_MATCH_INFO(ptr)->str;
+}
+
SCM_SYMBOL (scm_regexp_error_key, "regular-expression-syntax");
@@ -110,7 +167,7 @@
{
SCM errmsg;
int l;
-
+
/* FIXME: must we wrap any external calls in SCM_DEFER_INTS...SCM_ALLOW_INTS?
Or are these only necessary when a SCM object may be left in an
undetermined state (half-formed)? If the latter then I believe we
@@ -133,6 +190,74 @@
return SCM_CHARS (errmsg);
}
+GUILE_PROC (scm_make_match, "make-match", 0, 0, 0,
+ (void),
+"")
+#define FUNC_NAME s_scm_make_match
+{
+ return make_match(SCM_BOOL_F, 0);
+}
+#undef FUNC_NAME
+
+GUILE_PROC (scm_regexp_match_p, "regexp-match?", 1, 0, 0,
+ (SCM x),
+"Return @code{#t} if @var{obj} is a regular expression match, or
+@code{#f} otherwise.")
+#define FUNC_NAME s_scm_regexp_match_p
+{
+ return SCM_BOOL(SCM_MATCHP (x));
+}
+#undef FUNC_NAME
+
+GUILE_PROC (scm_match_count, "match:count", 1, 0, 0,
+ (SCM match),
+"")
+#define FUNC_NAME s_scm_match_count
+{
+ SCM_VALIDATE_MATCHP(1,match);
+ return SCM_MAKINUM(SCM_MATCH_INFO(match)->length);
+}
+#undef FUNC_NAME
+
+GUILE_PROC (scm_match_string, "match:string", 1, 0, 0,
+ (SCM match),
+"")
+#define FUNC_NAME s_scm_match_string
+{
+ SCM_VALIDATE_MATCHP(1,match);
+ return SCM_MATCH_INFO(match)->str;
+}
+#undef FUNC_NAME
+
+
+
+GUILE_PROC (scm_match_start, "match:start", 1, 1, 0,
+ (SCM match, SCM index),
+"")
+#define FUNC_NAME s_scm_match_start
+{
+ int i;
+ SCM_VALIDATE_MATCHP(1,match);
+ SCM_VALIDATE_INT_DEF_COPY(2,index,0,i);
+ SCM_ASSERT_RANGE (2,index,i >= 0 && i < SCM_MATCH_INFO(match)->length);
+ return SCM_MAKINUM(SCM_MATCH_INFO(match)->matches[i].rm_so);
+}
+#undef FUNC_NAME
+
+GUILE_PROC (scm_match_end, "match:end", 1, 1, 0,
+ (SCM match, SCM index),
+"")
+#define FUNC_NAME s_scm_match_end
+{
+ int i;
+ SCM_VALIDATE_MATCHP(1,match);
+ SCM_VALIDATE_INT_DEF_COPY(2,index,0,i);
+ SCM_ASSERT_RANGE (2,index,i >= 0 && i < SCM_MATCH_INFO(match)->length);
+ return SCM_MAKINUM(SCM_MATCH_INFO(match)->matches[i].rm_eo);
+}
+#undef FUNC_NAME
+
+
GUILE_PROC (scm_regexp_p, "regexp?", 1, 0, 0,
(SCM x),
"Return @code{#t} if @var{obj} is a compiled regular expression, or
@@ -232,7 +357,6 @@
#define FUNC_NAME s_scm_regexp_exec
{
int status, nmatches, offset;
- regmatch_t *matches;
SCM mvec = SCM_BOOL_F;
SCM_VALIDATE_RGXP(1,rx);
@@ -248,36 +372,63 @@
whole regexp, so add 1 to nmatches. */
nmatches = SCM_RGX(rx)->re_nsub + 1;
- SCM_DEFER_INTS;
- matches = SCM_MUST_MALLOC_TYPE_NUM (regmatch_t,nmatches);
+ mvec = make_match(str, nmatches);
+
status = regexec (SCM_RGX (rx), SCM_ROCHARS (str) + offset,
- nmatches, matches,
+ nmatches, SCM_MATCH_INFO(mvec)->matches,
SCM_INUM (flags));
- if (!status)
- {
- int i;
- /* The match vector must include a cell for the string that was matched,
- so add 1. */
- mvec = scm_make_vector (SCM_MAKINUM (nmatches + 1), SCM_UNSPECIFIED);
- SCM_VELTS(mvec)[0] = str;
- for (i = 0; i < nmatches; ++i)
- if (matches[i].rm_so == -1)
- SCM_VELTS(mvec)[i+1] = scm_cons (SCM_MAKINUM (-1), SCM_MAKINUM (-1));
- else
- SCM_VELTS(mvec)[i+1]
- = scm_cons(SCM_MAKINUM(matches[i].rm_so + offset),
- SCM_MAKINUM(matches[i].rm_eo + offset));
- }
- scm_must_free ((char *) matches);
- SCM_ALLOW_INTS;
-
if (status != 0 && status != REG_NOMATCH)
scm_error (scm_regexp_error_key,
FUNC_NAME,
scm_regexp_error_msg (status, SCM_RGX (rx)),
SCM_BOOL_F,
SCM_BOOL_F);
- return mvec;
+ return status ? SCM_BOOL_F : mvec;
+}
+#undef FUNC_NAME
+
+GUILE_PROC (scm_regexp_exec_x, "regexp-exec!", 3, 2, 0,
+ (SCM match, SCM rx, SCM str, SCM start, SCM flags),
+"Fast? destructive version of @code{regexp-exec}.
+Results are stored in the match structure @var{match}.
+Return @var{match}, or @code{#f} if no match could be found.")
+#define FUNC_NAME s_scm_regexp_exec_x
+{
+ int status, nmatches, offset;
+
+ SCM_VALIDATE_MATCHP(2,match);
+ SCM_VALIDATE_RGXP(2,rx);
+ SCM_VALIDATE_ROSTRING(3,str);
+ SCM_VALIDATE_INT_DEF_COPY(4,start,0,offset);
+ SCM_ASSERT_RANGE (4,start,offset >= 0 && (unsigned) offset <= SCM_LENGTH (str));
+ if (SCM_UNBNDP (flags))
+ flags = SCM_INUM0;
+ SCM_VALIDATE_INT(5,flags);
+ SCM_COERCE_SUBSTR (str);
+
+ /* re_nsub doesn't account for the `subexpression' representing the
+ whole regexp, so add 1 to nmatches. */
+
+ nmatches = SCM_RGX(rx)->re_nsub + 1;
+ match = recycle_match(match, str, nmatches);
+
+ status = regexec (SCM_RGX (rx), SCM_ROCHARS (str) + offset,
+ nmatches, SCM_MATCH_INFO(match)->matches,
+ SCM_INUM (flags));
+ if (status == 0)
+ return match;
+ else
+ {
+ SCM_MATCH_INFO(match)->length = 0;
+ SCM_MATCH_INFO(match)->str = SCM_BOOL_F;
+ if (status != REG_NOMATCH)
+ scm_error (scm_regexp_error_key,
+ FUNC_NAME,
+ scm_regexp_error_msg (status, SCM_RGX (rx)),
+ SCM_BOOL_F,
+ SCM_BOOL_F);
+ return SCM_BOOL_F;
+ }
}
#undef FUNC_NAME
@@ -286,6 +437,10 @@
{
scm_tc16_regex = scm_make_smob_type_mfpe ("regexp", sizeof (regex_t),
NULL, free_regex, NULL, NULL);
+
+ scm_tc16_match = scm_make_smob_type_mfpe ("regexp-match",
+ sizeof (scm_match_t), /* hmm, actually, variable sized */
+ mark_match, free_match, NULL, NULL);
/* Compilation flags. */
scm_sysintern ("regexp/basic", scm_long2num (REG_BASIC));
diff -ru guile-core/libguile/regex-posix.h guile-dev/libguile/regex-posix.h
--- guile-core/libguile/regex-posix.h Thu Dec 23 04:25:00 1999
+++ guile-dev/libguile/regex-posix.h Thu Dec 23 01:33:49 1999
@@ -59,4 +59,13 @@
extern SCM scm_regexp_exec SCM_P ((SCM rx, SCM str, SCM start, SCM flags));
extern void scm_init_regex_posix SCM_P ((void));
+extern long scm_tc16_match;
+#define SCM_MATCHP(X) ((SCM_TYP16(X) == scm_tc16_match))
+extern SCM scm_regexp_match_p SCM_P ((SCM x));
+extern SCM scm_match_count SCM_P ((SCM match));
+extern SCM scm_match_string SCM_P ((SCM match));
+extern SCM scm_match_start SCM_P ((SCM match, SCM index));
+extern SCM scm_match_end SCM_P ((SCM match, SCM index));
+
+
#endif
diff -ru guile-core/libguile/scm_validate.h guile-dev/libguile/scm_validate.h
--- guile-core/libguile/scm_validate.h Thu Dec 23 04:25:00 1999
+++ guile-dev/libguile/scm_validate.h Thu Dec 23 01:28:25 1999
@@ -227,6 +227,7 @@
#define SCM_VALIDATE_HOOK(pos,a) SCM_MAKE_VALIDATE(pos,a,HOOKP)
#define SCM_VALIDATE_RGXP(pos,a) SCM_MAKE_VALIDATE(pos,a,RGXP)
+#define SCM_VALIDATE_MATCHP(pos,a) SCM_MAKE_VALIDATE(pos,a,MATCHP)
#define SCM_VALIDATE_OPDIR(pos,port) SCM_MAKE_VALIDATE(pos,port,OPDIRP)