This is the mail archive of the
libc-alpha@sources.redhat.com
mailing list for the glibc project.
Unicode 3.1 support (1)
- To: libc-alpha at sources dot redhat dot com
- Subject: Unicode 3.1 support (1)
- From: Bruno Haible <haible at ilog dot fr>
- Date: Tue, 29 May 2001 14:50:50 +0200 (CEST)
This is the second patch for supporting Unicode 3.1 in libc. (The first one
was the UTF-32 encoding.)
While Unicode 3.0 ended at U+FFFD, Unicode 3.1 extends up to U+10FFFD.
First, there was a special hack in localedef for speeding up the find_idx
function for values < 0x10000. Here is a patch which uses the 3-level table
data structure to extend this up to 0xFFFFFFFF. It doesn't make a measurable
speed difference versus the old hack. Without this patch, the generation of a
UTF-8 locale doesn't finish within a minute.
Second, without use of the ".." notation, the UTF-8 and GB18030 charmaps
would each become larger than 9 MB. So, to make these charmaps small, first
fix a bug in the ".." treatment: the ->ucs4 value was always set to
UNINITIALIZED_CHAR_VALUE.
2001-05-29 Bruno Haible <haible@clisp.cons.org>
* locale/programs/ld-ctype.c (idx_table): New struct type.
(idx_table_init, idx_table_get, idx_table_add): New functions.
(MAX_CHARNAMES_IDX): Remove macro.
(locale_ctype_t): Change type of charnames_idx field.
(ctype_startup): Change initialization of charnames_idx field.
(find_idx): Use idx_table_get and idx_table_add for speed.
* locale/programs/charmap.c (charmap_new_char): Fix ucs4 value
computation of characters in a range.
--- glibc-20010430/locale/programs/ld-ctype.c.bak Tue Feb 6 14:39:10 2001
+++ glibc-20010430/locale/programs/ld-ctype.c Sun May 27 12:23:24 2001
@@ -116,6 +116,14 @@
};
+/* Sparse table of uint32_t. */
+#define TABLE idx_table
+#define ELEMENT uint32_t
+#define DEFAULT ~((uint32_t) 0)
+#define NO_FINALIZE
+#include "3level.h"
+
+
/* The real definition of the struct for the LC_CTYPE locale. */
struct locale_ctype_t
{
@@ -123,8 +131,7 @@
size_t charnames_max;
size_t charnames_act;
/* An index lookup table, to speedup find_idx. */
-#define MAX_CHARNAMES_IDX 0x10000
- uint32_t *charnames_idx;
+ struct idx_table charnames_idx;
struct repertoire_t *repertoire;
@@ -261,10 +268,7 @@
for (cnt = 0; cnt < 256; ++cnt)
ctype->charnames[cnt] = cnt;
ctype->charnames_act = 256;
- ctype->charnames_idx =
- (uint32_t *) xmalloc (MAX_CHARNAMES_IDX * sizeof (uint32_t));
- for (cnt = 0; cnt < MAX_CHARNAMES_IDX; ++cnt)
- ctype->charnames_idx[cnt] = ~((uint32_t) 0);
+ idx_table_init (&ctype->charnames_idx);
/* Fill character class information. */
ctype->last_class_char = ILLEGAL_CHAR_VALUE;
@@ -1280,23 +1284,17 @@
if (idx < 256)
return table == NULL ? NULL : &(*table)[idx];
- /* If idx is in the usual range, use the charnames_idx lookup table
- instead of the slow search loop. */
- if (idx < MAX_CHARNAMES_IDX)
- {
- if (ctype->charnames_idx[idx] != ~((uint32_t) 0))
- /* Found. */
- cnt = ctype->charnames_idx[idx];
- else
- /* Not found. */
- cnt = ctype->charnames_act;
- }
- else
- {
- for (cnt = 256; cnt < ctype->charnames_act; ++cnt)
- if (ctype->charnames[cnt] == idx)
- break;
- }
+ /* Use the charnames_idx lookup table instead of the slow search loop. */
+#if 1
+ cnt = idx_table_get (&ctype->charnames_idx, idx);
+ if (cnt == ~((uint32_t) 0))
+ /* Not found. */
+ cnt = ctype->charnames_act;
+#else
+ for (cnt = 256; cnt < ctype->charnames_act; ++cnt)
+ if (ctype->charnames[cnt] == idx)
+ break;
+#endif
/* We have to distinguish two cases: the name is found or not. */
if (cnt == ctype->charnames_act)
@@ -1310,8 +1308,7 @@
sizeof (uint32_t) * ctype->charnames_max);
}
ctype->charnames[ctype->charnames_act++] = idx;
- if (idx < MAX_CHARNAMES_IDX)
- ctype->charnames_idx[idx] = cnt;
+ idx_table_add (&ctype->charnames_idx, idx, cnt);
}
if (table == NULL)
--- glibc-20010430/locale/programs/charmap.c.bak Thu Apr 5 22:19:13 2001
+++ glibc-20010430/locale/programs/charmap.c Tue May 29 01:19:27 2001
@@ -1042,7 +1042,7 @@
char *endp;
errno = 0;
- newp->ucs4 = strtoul (name_end, &endp, 16);
+ newp->ucs4 = strtoul (name_end + 1, &endp, 16);
if (endp - name_end != len1
|| (newp->ucs4 == ULONG_MAX && errno == ERANGE)
|| newp->ucs4 >= 0x80000000)