This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Remove erroneous Unicode conversions from _wctomb_r and _mbtowc_r


Hi,

The conversion function _mbtowc_r converts 5 and 6 byte UTF-8 sequences
into a wchar_t counterpart.  Vice versa, _wctomb_r converts wchar_t
values > 0x10ffff to 4, 5 and 6 byte UTF-8 sequences.  However, per
the Unicode standard (http://www.unicode.org/standard/standard.html),
these values are invalid.  Unicode is restricted to the value range
0x000000 to 0x10ffff.  Any character outside this range has to be
treated as invalid.

The patch below fixes the two functions to handle only valid Unicode characters.


Corinna


	* mbtowc_r.c (_mbtowc_r): Remove conversion of 5 and 6 byte UTF-8
	sequences since they are invalid in the Unicode standard.
	* wctomb_r.c (_wctomb_r): Don't convert invalid Unicode wchar_t
	values beyond 0x10ffff into UTF-8 chars.


Index: libc/stdlib/mbtowc_r.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/stdlib/mbtowc_r.c,v
retrieving revision 1.7
diff -u -p -r1.7 mbtowc_r.c
--- libc/stdlib/mbtowc_r.c	23 Apr 2004 21:44:22 -0000	1.7
+++ libc/stdlib/mbtowc_r.c	17 Feb 2009 17:48:07 -0000
@@ -193,120 +193,6 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state)
 	  state->__count = 0;
 	  return i;
 	}
-      else if (ch >= 0xf8 && ch <= 0xfb)
-	{
-	  /* five-byte sequence */
-	  if (sizeof(wchar_t) < 4)
-	    return -1; /* we can't store such a value */
-	  state->__value.__wchb[0] = ch;
-	  if (state->__count == 0)
-	    state->__count = 1;
-	  else
-	    ++n;
-	  if (n < 2)
-	    return -2;
-	  ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
-	  if (state->__value.__wchb[0] == 0xf8 && ch < 0x88)
-	    /* overlong UTF-8 sequence */
-	    return -1;
-	  if (ch < 0x80 || ch > 0xbf)
-	    return -1;
-	  state->__value.__wchb[1] = ch;
-	  if (state->__count == 1)
-	    state->__count = 2;
-	  else
-	    ++n;
-	  if (n < 3)
-	    return -2;
-	  ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
-	  if (ch < 0x80 || ch > 0xbf)
-	    return -1;
-	  state->__value.__wchb[2] = ch;
-	  if (state->__count == 2)
-	    state->__count = 3;
-	  else
-	    ++n;
-	  if (n < 4)
-	    return -2;
-	  ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
-	  if (ch < 0x80 || ch > 0xbf)
-	    return -1;
-	  state->__value.__wchb[3] = ch;
-	  state->__count = 4;
-	  if (n < 5)
-	    return -2;
-	  ch = t[i++];
-	  *pwc = (wchar_t)((state->__value.__wchb[0] & 0x03) << 24)
-	    |    (wchar_t)((state->__value.__wchb[1] & 0x3f) << 18)
-	    |    (wchar_t)((state->__value.__wchb[2] & 0x3f) << 12)
-	    |    (wchar_t)((state->__value.__wchb[3] & 0x3f) << 6)
-	    |    (wchar_t)(ch & 0x3f);
-	
-	  state->__count = 0;
-	  return i;
-	}
-      else if (ch >= 0xfc && ch <= 0xfd)
-        {
-          /* six-byte sequence */
-	  int ch2;
-	  if (sizeof(wchar_t) < 4)
-	    return -1; /* we can't store such a value */
-	  state->__value.__wchb[0] = ch;
-	  if (state->__count == 0)
-	    state->__count = 1;
-	  else
-	    ++n;
-	  if (n < 2)
-	    return -2;
-	  ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
-	  if (state->__value.__wchb[0] == 0xfc && ch < 0x84)
-	    /* overlong UTF-8 sequence */
-	    return -1;
-	  if (ch < 0x80 || ch > 0xbf)
-	    return -1;
-	  state->__value.__wchb[1] = ch;
-	  if (state->__count == 1)
-	    state->__count = 2;
-	  else
-	    ++n;
-	  if (n < 3)
-	    return -2;
-	  ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
-	  if (ch < 0x80 || ch > 0xbf)
-	    return -1;
-	  state->__value.__wchb[2] = ch;
-	  if (state->__count == 2)
-	    state->__count = 3;
-	  else
-	    ++n;
-	  if (n < 4)
-	    return -2;
-	  ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
-	  if (ch < 0x80 || ch > 0xbf)
-	    return -1;
-	  state->__value.__wchb[3] = ch;
-	  if (state->__count == 3)
-	    state->__count = 4;
-	  else
-	    ++n;
-	  if (n < 5)
-	    return -2;
-	  if (n == 5)
-	    return -1; /* at this point we can't save enough to restart */
-	  ch = t[i++];
-	  if (ch < 0x80 || ch > 0xbf)
-	    return -1;
-	  ch2 = t[i++];
-	  *pwc = (wchar_t)((state->__value.__wchb[0] & 0x01) << 30)
-	    |    (wchar_t)((state->__value.__wchb[1] & 0x3f) << 24)
-	    |    (wchar_t)((state->__value.__wchb[2] & 0x3f) << 18)
-	    |    (wchar_t)((state->__value.__wchb[3] & 0x3f) << 12)
-	    |    (wchar_t)((ch & 0x3f) << 6)
-	    |    (wchar_t)(ch2 & 0x3f);
-	
-	  state->__count = 0;
-	  return i;
-	}
       else
 	return -1;
     }      
Index: libc/stdlib/wctomb_r.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/stdlib/wctomb_r.c,v
retrieving revision 1.7
diff -u -p -r1.7 wctomb_r.c
--- libc/stdlib/wctomb_r.c	16 May 2007 19:31:06 -0000	1.7
+++ libc/stdlib/wctomb_r.c	17 Feb 2009 17:48:07 -0000
@@ -50,7 +50,7 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
           *s   = 0x80 |  (wchar &   0x3f);
           return 3;
         }
-      else if (wchar >= 0x10000 && wchar <= 0x1fffff)
+      else if (wchar >= 0x10000 && wchar <= 0x10ffff)
         {
           *s++ = 0xf0 | ((wchar & 0x1c0000) >> 18);
           *s++ = 0x80 | ((wchar &  0x3f000) >> 12);
@@ -58,25 +58,6 @@ _DEFUN (_wctomb_r, (r, s, wchar, state),
           *s   = 0x80 |  (wchar &     0x3f);
           return 4;
         }
-      else if (wchar >= 0x200000 && wchar <= 0x3ffffff)
-        {
-          *s++ = 0xf8 | ((wchar & 0x3000000) >> 24);
-          *s++ = 0x80 | ((wchar &  0xfc0000) >> 18);
-          *s++ = 0x80 | ((wchar &   0x3f000) >> 12);
-          *s++ = 0x80 | ((wchar &     0xfc0) >> 6);
-          *s   = 0x80 |  (wchar &      0x3f);
-          return 5;
-        }
-      else if (wchar >= 0x4000000 && wchar <= 0x7fffffff)
-        {
-          *s++ = 0xfc | ((wchar & 0x40000000) >> 30);
-          *s++ = 0x80 | ((wchar & 0x3f000000) >> 24);
-          *s++ = 0x80 | ((wchar &   0xfc0000) >> 18);
-          *s++ = 0x80 | ((wchar &    0x3f000) >> 12);
-          *s++ = 0x80 | ((wchar &      0xfc0) >> 6);
-          *s   = 0x80 |  (wchar &       0x3f);
-          return 6;
-        }
       else
         return -1;
     }


-- 
Corinna Vinschen
Cygwin Project Co-Leader
Red Hat


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]