This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[regex] Optimize UTF-8 and ASCII-superset character sets outside _glibc


This patch enables the UTF-8 regexp optimizations even if !_LIBC.  Support for this feature outside glibc was requested so that this regex implementation can replace the old (NFA) regex in gnulib.

The most important ASCII supersets are recognized, including Windows/AIX codepages, EUC, GB2312, ISO-8859, and UTF-8.  The code translates between the canonical names maintained in glibc and the names used in other OSes, based on the code in gettext's config.charset and localcharset.c files.  I did not want to encumber regex users with the need to distribute and install additional files, so I rewrote the code. I made it thread-safe along the way.

Hand-tested with sed on several configurations and locales, checking that optimize_utf8 is called when needed and that map_notascii is reset.  Ok?

Paolo

2004-10-27  Paolo Bonzini  <bonzini@gnu.org>

	* regcomp.c (locale_charset, charset_non_ascii): New functions.
	(init_dfa): Use them to enable optimizations outside glibc.
	* regex_internal.c (build_wcs_upper_buffer): Do not conditionalize
	code on _LIBC.
	* regex_internal.h: Include langinfo.h.
	[WIN32]: Include windows.h.

diff -ru lib_save/regcomp.c lib/regcomp.c
--- lib_save/regcomp.c	2004-10-27 10:28:37.000000000 +0200
+++ lib/regcomp.c	2004-10-27 11:52:14.000000000 +0200
@@ -138,6 +138,9 @@
 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 static void mark_opt_subexp (const bin_tree_t *src, re_dfa_t *dfa);
 static void mark_opt_subexp_iter (const bin_tree_t *src, re_dfa_t *dfa, int idx);
+static const char * locale_charset (void);
+static int charset_non_ascii (const char *);
+
 
 /* This table gives an error message for each of the error codes listed
    in regex.h.  Obviously the order here has to be same as there.
@@ -815,6 +818,119 @@
   return err;
 }
 
+/* Macro used to avoid problems with Turkish locales, where toupper would
+   yield codeset names such as <DOTTED I>SO-8859-1.  */
+#define ascii_toupper(c)	((c) >= 'a' && (c) <= 'z' ? (c) - 32 : (c))
+
+/* Determine the current locale's character encoding, and canonicalize it.
+   The result must be freed.  If the canonical name cannot be determined,
+   the result is an empty string.  */
+
+#ifndef _LIBC
+static const char *
+locale_charset ()
+{
+#ifdef WIN32
+  static char buf[2 + 10 + 1];
+
+  /* Win32 has a function returning the locale's codepage as a number.  */
+  sprintf (buf, "CP%u", GetACP ());
+  return strdup (buf);
+
+#else
+  const char *p_prefix, *p_repl;
+
+  /* List of prefixes to be replaced, including an empty sentinel prefix at
+     the end to terminate the loop.  Longer prefixes should come earlier.  */
+  static const char prefixes[] = "ISO8859-\0" "ISO8859\0" "DIS-\0"
+    "UTF-\0" "UTF\0" "IBM-eucCN\0" "eucCN\0" "EUC-CN\0" "IBM-euc\0" "euc\0"
+    "SJIS\0" "IBM-\0" "646\0" "ASCII\0";
+
+  static const char replacements[] = "ISO-8859-\0" "ISO-8859-\0" "ISO-\0"
+    "UTF-\0" "UTF-\0" "GB2312\0" "GB2312\0" "GB2312\0" "EUC-\0" "EUC-\0"
+    "SHIFT-JIS\0" "CP\0" "ANSI_X3.4-1968\0" "ANSI_X3.4-1968\0";
+
+  char *codeset, *q;
+  const char *locale, *p;
+
+#if defined (HAVE_LANGINFO_CODESET)
+  /* Most systems support nl_langinfo (CODESET) nowadays.  */
+  locale = nl_langinfo (CODESET);
+  if (!locale)
+    return strdup ("");
+#else
+  /* On old systems we use getenv.  This includes FreeBSD, so we need it.  */
+  locale = getenv ("LC_ALL");
+  if (locale == NULL || locale[0] == '\0')
+    {
+      locale = getenv ("LC_CTYPE");
+      if (locale == NULL || locale[0] == '\0')
+	locale = getenv ("LANG");
+    }
+
+  locale = strchr (locale, '.');
+  if (!locale)
+    return strdup ("");
+  ++locale;
+#endif
+
+  p = locale;
+  p_prefix = prefixes;
+  p_repl = replacements;
+  for (;;)
+    {
+      if (!*p_prefix)
+	{
+	  /* Prefix matched successfully.  */
+	  int len_repl = strlen (p_repl);
+	  int len_suffix = strlen (p);
+	  codeset = malloc (len_repl + len_suffix + 1);
+	  memcpy (codeset, p_repl, len_repl);
+	  for (q = codeset + len_repl; *p && *p != '@'; p++, q++)
+	    *q = ascii_toupper (*p), p;
+	  *q = 0;
+	  return codeset;
+	}
+
+      if (*p_prefix != ascii_toupper (*p))
+	{
+	  /* Try the next prefix, we have a sentinel at the end.  */
+	  p_prefix += strlen (p_prefix) + 1;
+	  p_repl += strlen (p_repl) + 1;
+	  p = locale;
+	}
+      else
+	{
+	  /* Go ahead with the current prefix.  */
+	  ++p;
+	  ++p_prefix;
+	}
+    }
+#endif /* !WIN32 */
+}
+#endif /* !_LIBC */
+
+/* Return nonzero unless the current character set, passed in CODESET_NAME
+   for convenience, is known to be a superset of ASCII (ISO-646).  */
+int
+charset_non_ascii (const char *codeset_name __attribute ((unused)))
+{
+#ifdef _LIBC
+  return _NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) != 0;
+#else
+  /* Windows codepages (including CP932 SHIFT-JIS), ISO-8859, and EUC
+     (including GB2312) are the most used multi-byte character sets,
+     and they're all supersets of ASCII.  */
+  return strncmp (codeset_name, "CP", 2) != 0
+	 && strncmp (codeset_name, "ISO-8859-", 9) != 0
+	 && strncmp (codeset_name, "EUC-", 4) != 0
+	 && strncmp (codeset_name, "ANSI_X3.4-", 10) != 0
+	 && strcmp (codeset_name, "UTF-8") != 0
+	 && strcmp (codeset_name, "GB2312") != 0
+	 && strcmp (codeset_name, "SHIFT-JIS") != 0;
+#endif
+}
+
 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
    as the initial length of some arrays.  */
 
@@ -824,6 +938,9 @@
      int pat_len;
 {
   int table_size;
+#ifdef RE_ENABLE_I18N
+  const char *codeset_name;
+#endif
 
   memset (dfa, '\0', sizeof (re_dfa_t));
 
@@ -847,14 +964,22 @@
   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
 
   dfa->mb_cur_max = MB_CUR_MAX;
+#ifdef RE_ENABLE_I18N
 #ifdef _LIBC
-  if (dfa->mb_cur_max == 6
-      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
+  codeset_name = _NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME);
+#else
+  codeset_name = locale_charset ();
+#endif
+
+  if (dfa->mb_cur_max == 6 && strcmp (codeset_name, "UTF-8") == 0)
     dfa->is_utf8 = 1;
-  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
-		       != 0);
+  else
+    dfa->map_notascii = charset_non_ascii (codeset_name);
+
+#ifndef _LIBC
+  free ((char *) codeset_name);
 #endif
-#ifdef RE_ENABLE_I18N
+
   if (dfa->mb_cur_max > 1)
     {
       int i, j, ch;
diff -ru lib_save/regex_internal.c lib/regex_internal.c
--- lib_save/regex_internal.c	2004-10-27 10:28:37.000000000 +0200
+++ lib/regex_internal.c	2004-10-27 10:56:51.000000000 +0200
@@ -293,7 +293,6 @@
   byte_idx = pstr->valid_len;
   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 
-#ifdef _LIBC
   /* The following optimization assumes that the wchar_t encoding is
      always ISO 10646.  */
   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
@@ -368,14 +367,11 @@
       return REG_NOERROR;
     }
   else
-#endif
     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
       {
 	wchar_t wc;
 	const char *p;
-#ifdef _LIBC
 offsets_needed:
-#endif
 	remain_len = end_idx - byte_idx;
 	prev_st = pstr->cur_state;
 	if (BE (pstr->trans != NULL, 0))
diff -ru lib_save/regex_internal.h lib/regex_internal.h
--- lib_save/regex_internal.h	2004-10-27 10:28:37.000000000 +0200
+++ lib/regex_internal.h	2004-10-27 11:21:39.000000000 +0200
@@ -36,6 +36,16 @@
 #if defined HAVE_WCTYPE_H || defined _LIBC
 # include <wctype.h>
 #endif /* HAVE_WCTYPE_H || _LIBC */
+#if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
+# include <langinfo.h>
+#endif /* HAVE_LANGINFO_H || HAVE_LANGINFO_CODESET || _LIBC */
+
+#if defined _WIN32 || defined __WIN32__
+# undef WIN32   /* avoid warning on mingw32 */
+# define WIN32
+# define WIN32_LEAN_AND_MEAN
+# include <windows.h>
+#endif
 
 /* In case that the system doesn't have isblank().  */
 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
Only in lib: utils.o

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]