This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

bug in mbrtowc



ISO C 99 says that
     mbrtowc(NULL, NULL, 0, ps)
is equivalent to
     mbrtowc(NULL, "", 1, ps)
but in glibc they behave differently. Below is a test, where test1() succeeds
and test2() fails, and a fix.

================================ foo.c =======================================
#include <assert.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>

/* Feed a NUL terminator through the explicit form
   mbrtowc (NULL, "", 1, &s) at every stage of decoding the three-byte
   UTF-8 sequence E2 89 A0 (U+2260).  The terminator must be accepted
   (result 0, state initial) only when no partial character is pending;
   with a partial character pending the result must be (size_t) -1.
   Expects LC_CTYPE to have been set to a UTF-8 locale by the caller.  */
void test1 (void)
{
  wchar_t wc;
  mbstate_t s;

  /* Terminator in the initial state.  */
  memset (&s, 0, sizeof (s));	/* get s into initial state */
  assert (mbrtowc (NULL, "", 1, &s) == 0);
  assert (mbsinit (&s));

  /* Terminator after one of three bytes.  */
  wc = 42;			/* arbitrary sentinel */
  memset (&s, 0, sizeof (s));	/* get s into initial state */
  assert (mbrtowc (&wc, "\xE2", 1, &s) == (size_t) -2);	/* 1st byte processed */
  assert (wc == 42);		/* nothing stored for an incomplete character */
  assert (mbrtowc (NULL, "", 1, &s) == (size_t) -1);

  /* Terminator after two of three bytes.  */
  wc = 42;			/* arbitrary sentinel */
  memset (&s, 0, sizeof (s));	/* get s into initial state */
  assert (mbrtowc (&wc, "\xE2", 1, &s) == (size_t) -2);	/* 1st byte processed */
  assert (mbrtowc (&wc, "\x89", 1, &s) == (size_t) -2);	/* 2nd byte processed */
  assert (wc == 42);		/* nothing stored for an incomplete character */
  assert (mbrtowc (NULL, "", 1, &s) == (size_t) -1);

  /* Terminator after the complete character.  */
  wc = 42;			/* arbitrary sentinel */
  memset (&s, 0, sizeof (s));	/* get s into initial state */
  assert (mbrtowc (&wc, "\xE2", 1, &s) == (size_t) -2);	/* 1st byte processed */
  assert (mbrtowc (&wc, "\x89", 1, &s) == (size_t) -2);	/* 2nd byte processed */
  assert (mbrtowc (&wc, "\xA0", 1, &s) == 1);	/* 3rd byte processed */
  assert (wc == 0x2260);	/* E2 89 A0 = U+2260 decoded correctly */
  assert (mbrtowc (NULL, "", 1, &s) == 0);
  assert (mbsinit (&s));
}

/* Same scenarios as the explicit-terminator test, but using the
   shorthand mbrtowc (NULL, NULL, 0, &s), which ISO C 99 defines as
   equivalent to mbrtowc (NULL, "", 1, &s).  The call must return 0 and
   leave the state initial only when no partial character is pending;
   with a partial character pending it must return (size_t) -1.
   Expects LC_CTYPE to have been set to a UTF-8 locale by the caller.  */
void test2 (void)
{
  wchar_t wc;
  mbstate_t s;

  /* NULL string in the initial state.  */
  memset (&s, 0, sizeof (s));	/* get s into initial state */
  assert (mbrtowc (NULL, NULL, 0, &s) == 0);
  assert (mbsinit (&s));

  /* NULL string after one of three bytes.  */
  wc = 42;			/* arbitrary sentinel */
  memset (&s, 0, sizeof (s));	/* get s into initial state */
  assert (mbrtowc (&wc, "\xE2", 1, &s) == (size_t) -2);	/* 1st byte processed */
  assert (wc == 42);		/* nothing stored for an incomplete character */
  assert (mbrtowc (NULL, NULL, 0, &s) == (size_t) -1);

  /* NULL string after two of three bytes.  */
  wc = 42;			/* arbitrary sentinel */
  memset (&s, 0, sizeof (s));	/* get s into initial state */
  assert (mbrtowc (&wc, "\xE2", 1, &s) == (size_t) -2);	/* 1st byte processed */
  assert (mbrtowc (&wc, "\x89", 1, &s) == (size_t) -2);	/* 2nd byte processed */
  assert (wc == 42);		/* nothing stored for an incomplete character */
  assert (mbrtowc (NULL, NULL, 0, &s) == (size_t) -1);

  /* NULL string after the complete character.  */
  wc = 42;			/* arbitrary sentinel */
  memset (&s, 0, sizeof (s));	/* get s into initial state */
  assert (mbrtowc (&wc, "\xE2", 1, &s) == (size_t) -2);	/* 1st byte processed */
  assert (mbrtowc (&wc, "\x89", 1, &s) == (size_t) -2);	/* 2nd byte processed */
  assert (mbrtowc (&wc, "\xA0", 1, &s) == 1);	/* 3rd byte processed */
  assert (wc == 0x2260);	/* E2 89 A0 = U+2260 decoded correctly */
  assert (mbrtowc (NULL, NULL, 0, &s) == 0);
  assert (mbsinit (&s));
}

/* Select the UTF-8 locale that the byte sequences in the tests rely on,
   then run both tests.  Returns 1 if the locale cannot be selected and
   0 once both tests have run; a failing assertion aborts instead.  */
int main (void)
{
  /* UTF-8 single byte feeding test for mbrtowc().  */
  const char *locale = "de_DE.UTF-8";

  if (!setlocale (LC_CTYPE, locale))
    {
      fprintf (stderr, "locale '%s' not available!\n", locale);
      /* Returning from main has the same effect as exit (1) and does
	 not depend on a declaration of exit being in scope.  */
      return 1;
    }

  test1 ();
  test2 ();

  return 0;
}
===============================================================================

The testsuite also has to be changed, because it assumes that
mbrlen (NULL, 0, NULL) resets mbrlen's hidden state - which is not
true: when mbrlen has accumulated a partial (incomplete) multibyte character,
mbrlen (NULL, 0, NULL) must return -1 and set errno to EILSEQ, and its state
afterwards is unspecified. (See ISO C 99 sections 7.24.6.3.1 and 7.24.6.3.2.)


ChangeLog:
2001-05-21  Bruno Haible  <haible@clisp.cons.org>

	* wcsmbs/mbrtowc.c (mbrtowc): Remove local variable 'flush', always
	use 0 instead, and rely on the converter to do the flush.
	* wcsmbs/tst-mbrtowc.c (utf8_test_1): New function, taken from
	utf8_test.
	(utf8_test_2, utf8_test_3): New function.
	(utf8_test): Call utf8_test_1, utf8_test_2, utf8_test_3.

localedata/ChangeLog:
2001-05-21  Bruno Haible  <haible@clisp.cons.org>

	* tests-mbwc/dat_mbrlen.c (tst_mbrlen_loc): Use a private mbstate_t
	for some tests, because mbrlen (NULL, 0, &s) does not always put back
	s into the initial state.

--- glibc-20010430/wcsmbs/mbrtowc.c.bak	Mon Apr  9 16:51:43 2001
+++ glibc-20010430/wcsmbs/mbrtowc.c	Mon May 21 12:31:06 2001
@@ -42,7 +42,6 @@
   size_t dummy;
   const unsigned char *inbuf;
   char *outbuf = (char *) (pwc ?: buf);
-  int flush = 0;
 
   /* Set information for this step.  */
   data.__invocation_counter = 0;
@@ -58,7 +57,6 @@
       outbuf = (char *) buf;
       s = "";
       n = 1;
-      flush = 1;
     }
 
   /* Tell where we want the result.  */
@@ -72,7 +70,7 @@
   inbuf = (const unsigned char *) s;
   status = DL_CALL_FCT (__wcsmbs_gconv_fcts.towc->__fct,
 			(__wcsmbs_gconv_fcts.towc, &data, &inbuf, inbuf + n,
-			 NULL, &dummy, flush, 1));
+			 NULL, &dummy, 0, 1));
 
   /* There must not be any problems with the conversion but illegal input
      characters.  The output buffer must be large enough, otherwise the
--- glibc-20010430/wcsmbs/tst-mbrtowc.c.bak	Thu Jan 11 22:01:06 2001
+++ glibc-20010430/wcsmbs/tst-mbrtowc.c	Mon May 21 12:29:06 2001
@@ -27,24 +27,18 @@
 
 static int check_ascii (const char *locname);
 
-/* Test for mbrtowc, contributed by Markus Kuhn <mkuhn@acm.org>.  */
+/* UTF-8 single byte feeding test for mbrtowc(),
+   contributed by Markus Kuhn <mkuhn@acm.org>.  */
 static int
-utf8_test (void)
+utf8_test_1 (void)
 {
-  /* UTF-8 single byte feeding test for mbrtowc().  */
   wchar_t wc;
   mbstate_t s;
-  const char *locale = "de_DE.UTF-8";
 
-  if (!setlocale (LC_CTYPE, locale))
-    {
-      fprintf (stderr, "locale '%s' not available!\n", locale);
-      exit (1);
-    }
   wc = 42;			/* arbitrary number */
   memset (&s, 0, sizeof (s));	/* get s into initial state */
-  assert (mbrtowc (&wc, "\xE2", 1, &s) == (size_t) - 2);	/* 1st byte processed */
-  assert (mbrtowc (&wc, "\x89", 1, &s) == (size_t) - 2);	/* 2nd byte processed */
+  assert (mbrtowc (&wc, "\xE2", 1, &s) == (size_t) -2);	/* 1st byte processed */
+  assert (mbrtowc (&wc, "\x89", 1, &s) == (size_t) -2);	/* 2nd byte processed */
   assert (wc == 42);		/* no value has not been stored into &wc yet */
   assert (mbrtowc (&wc, "\xA0", 1, &s) == 1);	/* 3nd byte processed */
   assert (wc == 0x2260);	/* E2 89 A0 = U+2260 (not equal) decoded correctly */
@@ -54,6 +48,93 @@
   return 0;
 }
 
+/* Test for NUL byte processing via empty string.  */
+static int
+utf8_test_2 (void)
+{
+  wchar_t wc;
+  mbstate_t s;
+
+  wc = 42;			/* arbitrary number */
+  memset (&s, 0, sizeof (s));	/* get s into initial state */
+  assert (mbrtowc (NULL, "", 1, &s) == 0); /* valid terminator */
+  assert (mbsinit (&s));
+
+  wc = 42;			/* arbitrary number */
+  memset (&s, 0, sizeof (s));	/* get s into initial state */
+  assert (mbrtowc (&wc, "\xE2", 1, &s) == (size_t) -2);	/* 1st byte processed */
+  assert (mbrtowc (NULL, "", 1, &s) == (size_t) -1); /* invalid terminator */
+
+  wc = 42;			/* arbitrary number */
+  memset (&s, 0, sizeof (s));	/* get s into initial state */
+  assert (mbrtowc (&wc, "\xE2", 1, &s) == (size_t) -2);	/* 1st byte processed */
+  assert (mbrtowc (&wc, "\x89", 1, &s) == (size_t) -2);	/* 2nd byte processed */
+  assert (mbrtowc (NULL, "", 1, &s) == (size_t) -1); /* invalid terminator */
+
+  wc = 42;			/* arbitrary number */
+  memset (&s, 0, sizeof (s));	/* get s into initial state */
+  assert (mbrtowc (&wc, "\xE2", 1, &s) == (size_t) -2);	/* 1st byte processed */
+  assert (mbrtowc (&wc, "\x89", 1, &s) == (size_t) -2);	/* 2nd byte processed */
+  assert (mbrtowc (&wc, "\xA0", 1, &s) == 1);	/* 3nd byte processed */
+  assert (mbrtowc (NULL, "", 1, &s) == 0); /* valid terminator */
+  assert (mbsinit (&s));
+
+  return 0;
+}
+
+/* Test for NUL byte processing via NULL string.  */
+static int
+utf8_test_3 (void)
+{
+  wchar_t wc;
+  mbstate_t s;
+
+  wc = 42;			/* arbitrary number */
+  memset (&s, 0, sizeof (s));	/* get s into initial state */
+  assert (mbrtowc (NULL, NULL, 0, &s) == 0); /* valid terminator */
+  assert (mbsinit (&s));
+
+  wc = 42;			/* arbitrary number */
+  memset (&s, 0, sizeof (s));	/* get s into initial state */
+  assert (mbrtowc (&wc, "\xE2", 1, &s) == (size_t) -2);	/* 1st byte processed */
+  assert (mbrtowc (NULL, NULL, 0, &s) == (size_t) -1); /* invalid terminator */
+
+  wc = 42;			/* arbitrary number */
+  memset (&s, 0, sizeof (s));	/* get s into initial state */
+  assert (mbrtowc (&wc, "\xE2", 1, &s) == (size_t) -2);	/* 1st byte processed */
+  assert (mbrtowc (&wc, "\x89", 1, &s) == (size_t) -2);	/* 2nd byte processed */
+  assert (mbrtowc (NULL, NULL, 0, &s) == (size_t) -1); /* invalid terminator */
+
+  wc = 42;			/* arbitrary number */
+  memset (&s, 0, sizeof (s));	/* get s into initial state */
+  assert (mbrtowc (&wc, "\xE2", 1, &s) == (size_t) -2);	/* 1st byte processed */
+  assert (mbrtowc (&wc, "\x89", 1, &s) == (size_t) -2);	/* 2nd byte processed */
+  assert (mbrtowc (&wc, "\xA0", 1, &s) == 1);	/* 3nd byte processed */
+  assert (mbrtowc (NULL, NULL, 0, &s) == 0); /* valid terminator */
+  assert (mbsinit (&s));
+
+  return 0;
+}
+
+static int
+utf8_test (void)
+{
+  const char *locale = "de_DE.UTF-8";
+  int error = 0;
+
+  if (!setlocale (LC_CTYPE, locale))
+    {
+      fprintf (stderr, "locale '%s' not available!\n", locale);
+      exit (1);
+    }
+
+  error |= utf8_test_1 ();
+  error |= utf8_test_2 ();
+  error |= utf8_test_3 ();
+
+  return error;
+}
+
 
 int
 main (void)
--- glibc-20010430/localedata/tests-mbwc/dat_mbrlen.c.bak	Fri Aug 18 18:40:52 2000
+++ glibc-20010430/localedata/tests-mbwc/dat_mbrlen.c	Tue May 22 00:33:21 2001
@@ -99,21 +99,15 @@
       { /*----------------- #01 -----------------*/
 	{
 	  {
-	    { 1, "\317\302",   1,		   0, 0 },
-#ifdef SHOJI_IS_RIGHT
-	    { 0, "",	   0,		   0, 0 },
-#else
-	    /* XXX This test depends on the internal state being empty.
-	       XXX Therefore we must explicitly clean it.  */
-	    { 0, "",	   0,		   0, 1 },
-#endif
-	    { 1, "\317\302",   USE_MBCURMAX,   0, 0 },
+	    { 1, "\317\302",   1,		   1, 1 },
+	    { 0, "",	       0,		   1, 0 },
+	    { 1, "\317\302",   USE_MBCURMAX,	   1, 1 },
 	  }
 	},
 	{
 	  {
 	    { 0,		1, -2,		     },
-	    { 0,		1,  0,		     },
+	    { 0,		1, -1,		     },
 	    { 0,		1,  2,		     },
 	  }
 	}


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]