This is the mail archive of the
libc-alpha@sources.redhat.com
mailing list for the glibc project.
[regex] Optimize UTF-8 and ASCII-superset character sets outside _glibc
- From: "Bonzini Paolo" <paolo dot bonzini at lu dot unisi dot ch>
- To: <libc-alpha at sources dot redhat dot com>
- Date: Wed, 27 Oct 2004 15:43:36 +0200
- Subject: [regex] Optimize UTF-8 and ASCII-superset character sets outside _glibc
This patch optimizes UTF-8 regexps even if !_LIBC. Support for this feature outside glibc was requested to replace the old (NFA) regex in gnulib.
The most important ASCII supersets are recognized, including Windows/AIX codepages, EUC, GB2312, ISO-8859, and UTF-8. The code translates between the canonical names maintained in glibc and the names used in other OSes, based on the code in gettext's config.charset and localcharset.c files. I did not want to encumber regex users with the need to distribute and install additional files, so I rewrote the code. I made it thread-safe along the way.
Hand-tested with sed on several configurations and locales, checking that optimize_utf8 is called when needed and that map_notascii is reset. Ok?
Paolo
2004-10-27 Paolo Bonzini <bonzini@gnu.org>
* regcomp.c (locale_charset, charset_non_ascii): New functions.
(init_dfa): Use them to enable optimizations outside glibc.
* regex_internal.c (build_wcs_upper_buffer): Do not conditionalize
code on _LIBC.
* regex_internal.h: Include langinfo.h.
[WIN32]: Include windows.h.
diff -ru lib_save/regcomp.c lib/regcomp.c
--- lib_save/regcomp.c 2004-10-27 10:28:37.000000000 +0200
+++ lib/regcomp.c 2004-10-27 11:52:14.000000000 +0200
@@ -138,6 +138,9 @@
static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
static void mark_opt_subexp (const bin_tree_t *src, re_dfa_t *dfa);
static void mark_opt_subexp_iter (const bin_tree_t *src, re_dfa_t *dfa, int idx);
+static const char * locale_charset (void);
+static int charset_non_ascii (const char *);
+
/* This table gives an error message for each of the error codes listed
in regex.h. Obviously the order here has to be same as there.
@@ -815,6 +818,119 @@
return err;
}
+/* Macro used to avoid problems with Turkish locales, where toupper would
+ yield locale names such as <DOTTED I>SO-8859-1. */
+#define ascii_toupper(c) ((c) >= 'a' && (c) <= 'z' ? (c) - 32 : (c))
+
+/* Determine the current locale's character encoding, and canonicalize it.
+ The result is heap-allocated and must be freed by the caller. If the
+ canonical name cannot be determined, the result is an empty string. */
+
+#ifndef _LIBC
+static const char *
+locale_charset ()
+{
+#ifdef WIN32
+ static char buf[2 + 10 + 1];
+
+ /* Win32 has a function returning the locale's codepage as a number. */
+ sprintf (buf, "CP%u", GetACP ());
+ return strdup (buf);
+
+#else
+ const char *p_prefix, *p_repl;
+
+ /* List of prefixes to be replaced, including an empty sentinel prefix at
+ the end to terminate the loop. Longer prefixes should come earlier. */
+ static const char prefixes[] = "ISO8859-\0" "ISO8859\0" "DIS-\0"
+ "UTF-\0" "UTF\0" "IBM-eucCN\0" "eucCN\0" "EUC-CN\0" "IBM-euc\0" "euc\0"
+ "SJIS\0" "IBM-\0" "646\0" "ASCII\0";
+
+ static const char replacements[] = "ISO-8859-\0" "ISO-8859-\0" "ISO-\0"
+ "UTF-\0" "UTF-\0" "GB2312\0" "GB2312\0" "GB2312\0" "EUC-\0" "EUC-\0"
+ "SHIFT-JIS\0" "CP\0" "ANSI_X3.4-1968\0" "ANSI_X3.4-1968\0";
+
+ char *codeset, *q;
+ const char *locale, *p;
+
+#if defined (HAVE_LANGINFO_CODESET)
+ /* Most systems support nl_langinfo (CODESET) nowadays. */
+ locale = nl_langinfo (CODESET);
+ if (!locale)
+ return strdup ("");
+#else
+ /* On old systems we use getenv. This includes FreeBSD, so we need it. */
+ locale = getenv ("LC_ALL");
+ if (locale == NULL || locale[0] == '\0')
+ {
+ locale = getenv ("LC_CTYPE");
+ if (locale == NULL || locale[0] == '\0')
+ locale = getenv ("LANG");
+ }
+
+ locale = strchr (locale, '.');
+ if (!locale)
+ return strdup ("");
+ ++locale;
+#endif
+
+ p = locale;
+ p_prefix = prefixes;
+ p_repl = replacements;
+ for (;;)
+ {
+ if (!*p_prefix)
+ {
+ /* Prefix matched successfully. */
+ int len_repl = strlen (p_repl);
+ int len_suffix = strlen (p);
+ codeset = malloc (len_repl + len_suffix + 1);
+ memcpy (codeset, p_repl, len_repl);
+ for (q = codeset + len_repl; *p && *p != '@'; p++, q++)
+ *q = ascii_toupper (*p), p;
+ *q = 0;
+ return codeset;
+ }
+
+ if (*p_prefix != ascii_toupper (*p))
+ {
+ /* Try the next prefix, we have a sentinel at the end. */
+ p_prefix += strlen (p_prefix) + 1;
+ p_repl += strlen (p_repl) + 1;
+ p = locale;
+ }
+ else
+ {
+ /* Go ahead with the current prefix. */
+ ++p;
+ ++p_prefix;
+ }
+ }
+#endif /* !WIN32 */
+}
+#endif /* !_LIBC */
+
+/* Return nonzero unless the current character set, passed in CODESET_NAME
+ for convenience, is known to be a superset of ASCII (ISO-646). */
+int
+charset_non_ascii (const char *codeset_name __attribute ((unused)))
+{
+#ifdef _LIBC
+ return _NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) != 0;
+#else
+ /* Windows codepages (including CP932 SHIFT-JIS), ISO-8859, and EUC
+ (including GB2312) are the most used multi-byte character sets,
+ and they're all supersets of ASCII. */
+ return strncmp (codeset_name, "CP", 2) != 0
+ && strncmp (codeset_name, "ISO-8859-", 9) != 0
+ && strncmp (codeset_name, "EUC-", 4) != 0
+ && strncmp (codeset_name, "ANSI_X3.4-", 10) != 0
+ && strcmp (codeset_name, "UTF-8") != 0
+ && strcmp (codeset_name, "GB2312") != 0
+ && strcmp (codeset_name, "SHIFT-JIS") != 0;
+#endif
+}
+
/* Initialize DFA. We use the length of the regular expression PAT_LEN
as the initial length of some arrays. */
@@ -824,6 +938,9 @@
int pat_len;
{
int table_size;
+#ifdef RE_ENABLE_I18N
+ const char *codeset_name;
+#endif
memset (dfa, '\0', sizeof (re_dfa_t));
@@ -847,14 +964,22 @@
dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
dfa->mb_cur_max = MB_CUR_MAX;
+#ifdef RE_ENABLE_I18N
#ifdef _LIBC
- if (dfa->mb_cur_max == 6
- && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
+ codeset_name = _NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME);
+#else
+ codeset_name = locale_charset ();
+#endif
+
+ if (dfa->mb_cur_max == 6 && strcmp (codeset_name, "UTF-8") == 0)
dfa->is_utf8 = 1;
- dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
- != 0);
+ else
+ dfa->map_notascii = charset_non_ascii (codeset_name);
+
+#ifndef _LIBC
+ free ((char *) codeset_name);
#endif
-#ifdef RE_ENABLE_I18N
+
if (dfa->mb_cur_max > 1)
{
int i, j, ch;
diff -ru lib_save/regex_internal.c lib/regex_internal.c
--- lib_save/regex_internal.c 2004-10-27 10:28:37.000000000 +0200
+++ lib/regex_internal.c 2004-10-27 10:56:51.000000000 +0200
@@ -293,7 +293,6 @@
byte_idx = pstr->valid_len;
end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
-#ifdef _LIBC
/* The following optimization assumes that the wchar_t encoding is
always ISO 10646. */
if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
@@ -368,14 +367,11 @@
return REG_NOERROR;
}
else
-#endif
for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
{
wchar_t wc;
const char *p;
-#ifdef _LIBC
offsets_needed:
-#endif
remain_len = end_idx - byte_idx;
prev_st = pstr->cur_state;
if (BE (pstr->trans != NULL, 0))
diff -ru lib_save/regex_internal.h lib/regex_internal.h
--- lib_save/regex_internal.h 2004-10-27 10:28:37.000000000 +0200
+++ lib/regex_internal.h 2004-10-27 11:21:39.000000000 +0200
@@ -36,6 +36,16 @@
#if defined HAVE_WCTYPE_H || defined _LIBC
# include <wctype.h>
#endif /* HAVE_WCTYPE_H || _LIBC */
+#if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
+# include <langinfo.h>
+#endif /* HAVE_LANGINFO_H || _LIBC */
+
+#if defined _WIN32 || defined __WIN32__
+# undef WIN32 /* avoid warning on mingw32 */
+# define WIN32
+# define WIN32_LEAN_AND_MEAN
+# include <windows.h>
+#endif
/* In case that the system doesn't have isblank(). */
#if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
Only in lib: utils.o