This is the mail archive of the
newlib@sourceware.org
mailing list for the newlib project.
[RFA] Refresh towlower and towupper (was Re: Update wctype functions to Unicode 5.2?)
- From: Corinna Vinschen <vinschen at redhat dot com>
- To: newlib at sourceware dot org
- Date: Sat, 13 Feb 2010 21:51:43 +0100
- Subject: [RFA] Refresh towlower and towupper (was Re: Update wctype functions to Unicode 5.2?)
- References: <20100212205705.GF5683@calimero.vinschen.de>
- Reply-to: newlib at sourceware dot org
On Feb 12 21:57, Corinna Vinschen wrote:
> Additionally the functions iswblank, iswspace, towlower and towupper
> could need some revamp. If an update of the aforementioned tables to
> Unicode 5.2 is not a big deal, I'd volunteer to update these functions
> as required.
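Here are the revamped towlower and towupper. I tested them against
Unicode 5.2 as follows: a small test program prints every code point
whose mapping differs from the code point itself, and that output is
diffed against the simple case mappings (fields 13 and 14) extracted
from UnicodeData.txt: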
$ cat > test-towfuncs.c << EOF
/* Print every code point whose case mapping differs from the code point
   itself, one "CODE MAPPED" pair of upper-case hex numbers per line.
   Build with -DTEST_TOLOWER or -DTEST_TOUPPER to pick the function
   under test; exactly one of the two must be defined. */
#include <stdio.h>
#include <wctype.h>

#if defined (TEST_TOLOWER)
#define towfunc towlower
#elif defined (TEST_TOUPPER)
#define towfunc towupper
#endif

int
main (void)
{
  wint_t upc, lwc;

  /* Scans 0 .. 0x1ffffe, which extends past the last Unicode code
     point (U+10FFFF). */
  for (upc = 0; upc < 0x1fffff; ++upc)
    if ((lwc = towfunc (upc)) != upc)
      /* wint_t has no printf conversion of its own, so widen both values
         explicitly instead of relying on wint_t matching the unsigned int
         that a plain %X expects.  The printed text is unchanged. */
      printf ("%04lX %04lX\n", (unsigned long) upc, (unsigned long) lwc);
  return 0;
}
EOF
$ gcc -DTEST_TOLOWER test-towfuncs.c -o test-towlower
$ gcc -DTEST_TOUPPER test-towfuncs.c -o test-towupper
$ ./test-towlower > towlower.newlib
$ ./test-towupper > towupper.newlib
$ wget http://www.unicode.org/Public/5.2.0/ucd/UnicodeData.txt
$ awk -F\; '{ if ( $14 != "" ) print $1 " " $14; }' UnicodeData.txt > towlower.unicode
$ awk -F\; '{ if ( $13 != "" ) print $1 " " $13; }' UnicodeData.txt > towupper.unicode
$ diff towupper.newlib towupper.unicode
$ diff towlower.newlib towlower.unicode
Ok to apply?
Thanks,
Corinna
* libc/ctype/towlower.c (towlower): Update to Unicode 5.2.
* libc/ctype/towupper.c (towupper): Ditto.
Index: libc/ctype/towlower.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/ctype/towlower.c,v
retrieving revision 1.7
diff -u -p -r1.7 towlower.c
--- libc/ctype/towlower.c 14 May 2009 20:16:21 -0000 1.7
+++ libc/ctype/towlower.c 13 Feb 2010 20:50:45 -0000
@@ -71,15 +71,14 @@ _DEFUN(towlower,(c), wint_t c)
{
#ifdef _MB_CAPABLE
c = _jp2uc (c);
+ /* Based on and tested against Unicode 5.2 */
if (c < 0x100)
{
if ((c >= 0x0041 && c <= 0x005a) ||
- (c >= 0x00c0 && c <= 0x00de))
+ (c >= 0x00c0 && c <= 0x00d6) ||
+ (c >= 0x00d8 && c <= 0x00de))
return (c + 0x20);
- if (c == 0x00b5)
- return 0x03bc;
-
return c;
}
else if (c < 0x300)
@@ -96,8 +95,11 @@ _DEFUN(towlower,(c), wint_t c)
return c;
}
+ if (c == 0x0130)
+ return 0x0069;
+
if ((c >= 0x0139 && c <= 0x0147) ||
- (c >= 0x01cd && c <= 0x91db))
+ (c >= 0x01cd && c <= 0x01db))
{
if (c & 0x01)
return (c + 1);
@@ -146,9 +148,6 @@ _DEFUN(towlower,(c), wint_t c)
case 0x01f4:
k = c + 1;
break;
- case 0x017f:
- k = 0x0073;
- break;
case 0x0181:
k = 0x0253;
break;
@@ -227,17 +226,56 @@ _DEFUN(towlower,(c), wint_t c)
if (k != 0)
return k;
}
-
- if (c == 0x0220)
- return 0x019e;
+ else if (c == 0x0220)
+ return 0x019e;
+ else if (c >= 0x023a && c <= 0x024e)
+ {
+ wint_t k;
+ switch (c)
+ {
+ case 0x023a:
+ k = 0x2c65;
+ break;
+ case 0x023b:
+ case 0x0241:
+ case 0x0246:
+ case 0x0248:
+ case 0x024a:
+ case 0x024c:
+ case 0x024e:
+ k = c + 1;
+ break;
+ case 0x023d:
+ k = 0x019a;
+ break;
+ case 0x023e:
+ k = 0x2c66;
+ break;
+ case 0x0243:
+ k = 0x0180;
+ break;
+ case 0x0244:
+ k = 0x0289;
+ break;
+ case 0x0245:
+ k = 0x028c;
+ break;
+ default:
+ k = 0;
+ }
+ if (k != 0)
+ return k;
+ }
}
else if (c < 0x0400)
{
+ if (c == 0x0370 || c == 0x0372 || c == 0x0376)
+ return (c + 1);
if (c >= 0x0391 && c <= 0x03ab && c != 0x03a2)
return (c + 0x20);
if (c >= 0x03d8 && c <= 0x03ee && !(c & 0x01))
return (c + 1);
- if (c >= 0x0386 && c <= 0x03f5)
+ if (c >= 0x0386 && c <= 0x03ff)
{
wint_t k;
switch (c)
@@ -261,37 +299,31 @@ _DEFUN(towlower,(c), wint_t c)
k = 0x03cd;
break;
case 0x038f:
- k = 0x038f;
+ k = 0x03ce;
break;
- case 0x03c2:
- k = 0x03c3;
+ case 0x03cf:
+ k = 0x03d7;
break;
- case 0x03d0:
- k = 0x03b2;
- break;
- case 0x03d1:
+ case 0x03f4:
k = 0x03b8;
break;
- case 0x03d5:
- k = 0x03c6;
- break;
- case 0x03d6:
- k = 0x03c0;
+ case 0x03f7:
+ k = 0x03f8;
break;
- case 0x03f0:
- k = 0x03ba;
+ case 0x03f9:
+ k = 0x03f2;
break;
- case 0x03f1:
- k = 0x03c1;
+ case 0x03fa:
+ k = 0x03fb;
break;
- case 0x03f2:
- k = 0x03c3;
+ case 0x03fd:
+ k = 0x037b;
break;
- case 0x03f4:
- k = 0x03b8;
+ case 0x03fe:
+ k = 0x037c;
break;
- case 0x03f5:
- k = 0x03b5;
+ case 0x03ff:
+ k = 0x037d;
break;
default:
k = 0;
@@ -299,9 +331,6 @@ _DEFUN(towlower,(c), wint_t c)
if (k != 0)
return k;
}
-
- if (c == 0x0345)
- return 0x03b9;
}
else if (c < 0x500)
{
@@ -313,14 +342,16 @@ _DEFUN(towlower,(c), wint_t c)
if ((c >= 0x0460 && c <= 0x0480) ||
(c >= 0x048a && c <= 0x04be) ||
- (c >= 0x04d0 && c <= 0x04f4) ||
- (c == 0x04f8))
+ (c >= 0x04d0 && c <= 0x04fe))
{
if (!(c & 0x01))
return (c + 1);
return c;
}
+ if (c == 0x04c0)
+ return 0x04cf;
+
if (c >= 0x04c1 && c <= 0x04cd)
{
if (c & 0x01)
@@ -331,6 +362,7 @@ _DEFUN(towlower,(c), wint_t c)
else if (c < 0x1f00)
{
if ((c >= 0x0500 && c <= 0x050e) ||
+ (c >= 0x0510 && c <= 0x0524) ||
(c >= 0x1e00 && c <= 0x1e94) ||
(c >= 0x1ea0 && c <= 0x1ef8))
{
@@ -342,8 +374,14 @@ _DEFUN(towlower,(c), wint_t c)
if (c >= 0x0531 && c <= 0x0556)
return (c + 0x30);
- if (c == 0x1e9b)
- return 0x1e61;
+ if (c >= 0x10a0 && c <= 0x10c5)
+ return (c + 0x1c60);
+
+ if (c == 0x1e9e)
+ return 0x00df;
+
+ if (c >= 0x1efa && c <= 0x1efe && !(c & 0x01))
+ return (c + 1);
}
else if (c < 0x2000)
{
@@ -385,9 +423,6 @@ _DEFUN(towlower,(c), wint_t c)
case 0x1fbc:
k = 0x1fb3;
break;
- case 0x1fbe:
- k = 0x03b9;
- break;
case 0x1fc8:
case 0x1fc9:
case 0x1fca:
@@ -408,6 +443,10 @@ _DEFUN(towlower,(c), wint_t c)
case 0x1fec:
k = 0x1fe5;
break;
+ case 0x1ff8:
+ case 0x1ff9:
+ k = c - 0x80;
+ break;
case 0x1ffa:
case 0x1ffb:
k = c - 0x7e;
@@ -422,26 +461,100 @@ _DEFUN(towlower,(c), wint_t c)
return k;
}
}
- else
+ else if (c < 0x2c00)
{
if (c >= 0x2160 && c <= 0x216f)
return (c + 0x10);
-
+
if (c >= 0x24b6 && c <= 0x24cf)
return (c + 0x1a);
+ switch (c)
+ {
+ case 0x2126:
+ return 0x03c9;
+ case 0x212a:
+ return 0x006b;
+ case 0x212b:
+ return 0x00e5;
+ case 0x2132:
+ return 0x214e;
+ case 0x2183:
+ return 0x2184;
+ }
+ }
+ else if (c < 0x2d00)
+ {
+ if (c >= 0x2c00 && c <= 0x2c2e)
+ return (c + 0x30);
+
+ if (c >= 0x2c80 && c <= 0x2ce2 && !(c & 0x01))
+ return (c + 1);
+
+ switch (c)
+ {
+ case 0x2c60:
+ return 0x2c61;
+ case 0x2c62:
+ return 0x026b;
+ case 0x2c63:
+ return 0x1d7d;
+ case 0x2c64:
+ return 0x027d;
+ case 0x2c67:
+ case 0x2c69:
+ case 0x2c6b:
+ case 0x2c72:
+ case 0x2c75:
+ case 0x2ceb:
+ case 0x2ced:
+ return c + 1;
+ case 0x2c6d:
+ return 0x0251;
+ case 0x2c6e:
+ return 0x0271;
+ case 0x2c6f:
+ return 0x0250;
+ case 0x2c70:
+ return 0x0252;
+ case 0x2c7e:
+ return 0x023f;
+ case 0x2c7f:
+ return 0x0240;
+ }
+ }
+ else if (c >= 0xa600 && c < 0xa800)
+ {
+ if ((c >= 0xa640 && c <= 0xa65e) ||
+ (c >= 0xa662 && c <= 0xa66c) ||
+ (c >= 0xa680 && c <= 0xa696) ||
+ (c >= 0xa722 && c <= 0xa72e) ||
+ (c >= 0xa732 && c <= 0xa76e) ||
+ (c >= 0xa77f && c <= 0xa786))
+ {
+ if (!(c & 1))
+ return (c + 1);
+ return c;
+ }
+
+ switch (c)
+ {
+ case 0xa779:
+ case 0xa77b:
+ case 0xa77e:
+ case 0xa78b:
+ return (c + 1);
+ case 0xa77d:
+ return 0x1d79;
+ }
+ }
+ else
+ {
if (c >= 0xff21 && c <= 0xff3a)
return (c + 0x20);
- if (c >= 0x10400 && c <= 0x10425)
+ if (c >= 0x10400 && c <= 0x10427)
return (c + 0x28);
-
- if (c == 0x2126)
- return 0x03c9;
- if (c == 0x212a)
- return 0x006b;
- if (c == 0x212b)
- return 0x00e5;
}
return c;
#else
Index: libc/ctype/towupper.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/ctype/towupper.c,v
retrieving revision 1.7
diff -u -p -r1.7 towupper.c
--- libc/ctype/towupper.c 14 May 2009 20:16:21 -0000 1.7
+++ libc/ctype/towupper.c 13 Feb 2010 20:50:45 -0000
@@ -71,12 +71,13 @@ _DEFUN(towupper,(c), wint_t c)
{
#ifdef _MB_CAPABLE
c = _jp2uc (c);
+ /* Based on and tested against Unicode 5.2 */
if (c < 0x100)
{
if (c == 0x00b5)
return 0x039c;
- if ((c >= 0x00e0 && c <= 0x00fe) ||
+ if ((c >= 0x00e0 && c <= 0x00fe && c != 0x00f7) ||
(c >= 0x0061 && c <= 0x007a))
return (c - 0x20);
@@ -92,7 +93,8 @@ _DEFUN(towupper,(c), wint_t c)
(c >= 0x014b && c <= 0x0177) ||
(c >= 0x01df && c <= 0x01ef) ||
(c >= 0x01f9 && c <= 0x021f) ||
- (c >= 0x0223 && c <= 0x0233))
+ (c >= 0x0223 && c <= 0x0233) ||
+ (c >= 0x0247 && c <= 0x024f))
{
if (c & 0x01)
return (c - 1);
@@ -100,7 +102,8 @@ _DEFUN(towupper,(c), wint_t c)
}
if ((c >= 0x013a && c <= 0x0148) ||
- (c >= 0x01ce && c <= 0x1dc))
+ (c >= 0x01ce && c <= 0x01dc) ||
+ c == 0x023c || c == 0x0242)
{
if (!(c & 0x01))
return (c - 1);
@@ -121,6 +124,9 @@ _DEFUN(towupper,(c), wint_t c)
case 0x017f:
k = 0x0053;
break;
+ case 0x0180:
+ k = 0x0243;
+ break;
case 0x0183:
k = 0x0182;
break;
@@ -142,6 +148,9 @@ _DEFUN(towupper,(c), wint_t c)
case 0x0199:
k = 0x0198;
break;
+ case 0x019a:
+ k = 0x023d;
+ break;
case 0x019e:
k = 0x0220;
break;
@@ -176,6 +185,21 @@ _DEFUN(towupper,(c), wint_t c)
case 0x01f3:
k = 0x01f1;
break;
+ case 0x023f:
+ k = 0x2c7e;
+ break;
+ case 0x0240:
+ k = 0x2c7f;
+ break;
+ case 0x0250:
+ k = 0x2c6f;
+ break;
+ case 0x0251:
+ k = 0x2c6d;
+ break;
+ case 0x0252:
+ k = 0x2c70;
+ break;
case 0x0253:
k = 0x0181;
break;
@@ -206,15 +230,24 @@ _DEFUN(towupper,(c), wint_t c)
case 0x0269:
k = 0x0196;
break;
+ case 0x026b:
+ k = 0x2c62;
+ break;
case 0x026f:
k = 0x019c;
break;
+ case 0x0271:
+ k = 0x2c6e;
+ break;
case 0x0272:
k = 0x019d;
break;
case 0x0275:
k = 0x019f;
break;
+ case 0x027d:
+ k = 0x2c64;
+ break;
case 0x0280:
k = 0x01a6;
break;
@@ -224,12 +257,18 @@ _DEFUN(towupper,(c), wint_t c)
case 0x0288:
k = 0x01ae;
break;
+ case 0x0289:
+ k = 0x0244;
+ break;
case 0x028a:
k = 0x01b1;
break;
case 0x028b:
k = 0x01b2;
break;
+ case 0x028c:
+ k = 0x0245;
+ break;
case 0x0292:
k = 0x01b7;
break;
@@ -242,86 +281,91 @@ _DEFUN(towupper,(c), wint_t c)
}
else if (c < 0x0400)
{
- if (c == 0x03ac)
- return 0x0386;
-
- if ((c & 0xfff0) == 0x03a0 && c >= 0x03ad)
- return (c - 0x15);
-
+ wint_t k;
+
+ if (c >= 0x03ad && c <= 0x03af)
+ return (c - 0x25);
+
if (c >= 0x03b1 && c <= 0x03cb && c != 0x03c2)
return (c - 0x20);
- if (c == 0x03c2)
- return 0x03a3;
-
- if (c >= 0x03cc && c <= 0x03f5)
+ if (c >= 0x03d9 && c <= 0x03ef && (c & 1))
+ return (c - 1);
+
+ switch (c)
{
- wint_t k;
- switch (c)
- {
- case 0x03cc:
- k = 0x038c;
- break;
- case 0x03cd:
- case 0x03ce:
- k = c - 0x3f;
- break;
- case 0x03d0:
- k = 0x0392;
- break;
- case 0x03d1:
- k = 0x0398;
- break;
- case 0x03d5:
- k = 0x03a6;
- break;
- case 0x03d6:
- k = 0x03a0;
- break;
- case 0x03d9:
- case 0x03db:
- case 0x03dd:
- case 0x03df:
- case 0x03e1:
- case 0x03e3:
- case 0x03e5:
- case 0x03e7:
- case 0x03e9:
- case 0x03eb:
- case 0x03ed:
- case 0x03ef:
- k = c - 1;
- break;
- case 0x03f0:
- k = 0x039a;
- break;
- case 0x03f1:
- k = 0x03a1;
- break;
- case 0x03f2:
- k = 0x03a3;
- break;
- case 0x03f5:
- k = 0x0395;
- break;
- default:
- k = 0;
- }
- if (k != 0)
- return k;
+ case 0x0345:
+ k = 0x0399;
+ break;
+ case 0x0371:
+ case 0x0373:
+ case 0x0377:
+ case 0x03f8:
+ case 0x03fb:
+ k = c - 1;
+ break;
+ case 0x037b:
+ case 0x037c:
+ case 0x037d:
+ k = c + 0x82;
+ break;
+ case 0x03ac:
+ k = 0x0386;
+ break;
+ case 0x03c2:
+ k = 0x03a3;
+ break;
+ case 0x03cc:
+ k = 0x038c;
+ break;
+ case 0x03cd:
+ case 0x03ce:
+ k = c - 0x3f;
+ break;
+ case 0x03d0:
+ k = 0x0392;
+ break;
+ case 0x03d1:
+ k = 0x0398;
+ break;
+ case 0x03d5:
+ k = 0x03a6;
+ break;
+ case 0x03d6:
+ k = 0x03a0;
+ break;
+ case 0x03d7:
+ k = 0x03cf;
+ break;
+ case 0x03f0:
+ k = 0x039a;
+ break;
+ case 0x03f1:
+ k = 0x03a1;
+ break;
+ case 0x03f2:
+ k = 0x03f9;
+ break;
+ case 0x03f5:
+ k = 0x0395;
+ break;
+ default:
+ k = 0;
}
+ if (k != 0)
+ return k;
}
else if (c < 0x500)
{
- if (c >= 0x0450 && c <= 0x045f)
- return (c - 0x50);
-
if (c >= 0x0430 && c <= 0x044f)
return (c - 0x20);
+ if (c >= 0x0450 && c <= 0x045f)
+ return (c - 0x50);
+
if ((c >= 0x0461 && c <= 0x0481) ||
(c >= 0x048b && c <= 0x04bf) ||
- (c >= 0x04d1 && c <= 0x04f5))
+ (c >= 0x04d1 && c <= 0x04ff))
{
if (c & 0x01)
return (c - 1);
@@ -335,23 +379,36 @@ _DEFUN(towupper,(c), wint_t c)
return c;
}
- if (c == 0x04f9)
- return 0x04f8;
+ if (c == 0x04cf)
+ return 0x04c0;
+
+ if (c >= 0x04f7 && c <= 0x04f9)
+ return (c - 1);
+ }
+ else if (c < 0x0600)
+ {
+ if (c >= 0x0501 && c <= 0x0525 && (c & 1))
+ return c - 1;
+
+ if (c >= 0x0561 && c <= 0x0586)
+ return (c - 0x30);
}
else if (c < 0x1f00)
{
- if ((c >= 0x0501 && c <= 0x050f) ||
- (c >= 0x1e01 && c <= 0x1e95) ||
- (c >= 0x1ea1 && c <= 0x1ef9))
+ if (c == 0x1d79)
+ return 0xa77d;
+
+ if (c == 0x1d7d)
+ return 0x2c63;
+
+ if ((c >= 0x1e01 && c <= 0x1e95) ||
+ (c >= 0x1ea1 && c <= 0x1eff))
{
if (c & 0x01)
return (c - 1);
return c;
}
- if (c >= 0x0561 && c <= 0x0586)
- return (c - 0x30);
-
if (c == 0x1e9b)
return 0x1e60;
}
@@ -407,6 +464,9 @@ _DEFUN(towupper,(c), wint_t c)
case 0x1f75:
k = 0x1fcb;
break;
+ case 0x1fc3:
+ k = 0x1fcc;
+ break;
case 0x1fd0:
k = 0x1fd8;
break;
@@ -456,18 +516,65 @@ _DEFUN(towupper,(c), wint_t c)
return k;
}
}
- else
+ else if (c < 0x3000)
{
+ if (c == 0x214e)
+ return 0x2132;
+
+ if (c == 0x2184)
+ return 0x2183;
+
if (c >= 0x2170 && c <= 0x217f)
return (c - 0x10);
if (c >= 0x24d0 && c <= 0x24e9)
return (c - 0x1a);
+ if (c >= 0x2c30 && c <= 0x2c5e)
+ return (c - 0x30);
+
+ if ((c >= 0x2c68 && c <= 0x2c6c && !(c & 1)) ||
+ (c >= 0x2c81 && c <= 0x2ce3 && (c & 1)) ||
+ c == 0x2c73 || c == 0x2c76 ||
+ c == 0x2cec || c == 0x2cee)
+ return (c - 1);
+
+ if (c >= 0x2c81 && c <= 0x2ce3 && (c & 1))
+ return (c - 1);
+
+ if (c >= 0x2d00 && c <= 0x2d25)
+ return (c - 0x1c60);
+
+ switch (c)
+ {
+ case 0x2c61:
+ return 0x2c60;
+ case 0x2c65:
+ return 0x023a;
+ case 0x2c66:
+ return 0x023e;
+ }
+ }
+ else if (c >= 0xa000 && c < 0xb000)
+ {
+ if (((c >= 0xa641 && c <= 0xa65f) ||
+ (c >= 0xa663 && c <= 0xa66d) ||
+ (c >= 0xa681 && c <= 0xa697) ||
+ (c >= 0xa723 && c <= 0xa72f) ||
+ (c >= 0xa733 && c <= 0xa76f) ||
+ (c >= 0xa77f && c <= 0xa787)) &&
+ (c & 1))
+ return (c - 1);
+
+ if (c == 0xa77a || c == 0xa77c || c == 0xa78c)
+ return (c - 1);
+ }
+ else
+ {
if (c >= 0xff41 && c <= 0xff5a)
return (c - 0x20);
- if (c >= 0x10428 && c <= 0x1044d)
+ if (c >= 0x10428 && c <= 0x1044f)
return (c - 0x28);
}
return c;
--
Corinna Vinschen
Cygwin Project Co-Leader
Red Hat