This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: Unicode 3.2 support (7), JISX0213 converter


Here is a small simplification of the converter that I submitted. The
FROM direction of the JISX0213-based converters need not be stateful.


2002-04-18  Bruno Haible  <bruno@clisp.org>

	* iconvdata/euc-jisx0213.c (EMIT_SHIFT_TO_INIT, BODY for
	  FROM_DIRECTION): Make the FROM direction stateless.
	* iconvdata/shift_jisx0213.c (EMIT_SHIFT_TO_INIT, BODY for
	  FROM_DIRECTION): Likewise.

--- glibc-20020408/iconvdata/euc-jisx0213.c.bak	Wed Apr 17 00:24:08 2002
+++ glibc-20020408/iconvdata/euc-jisx0213.c	Thu Apr 18 00:07:35 2002
@@ -62,9 +62,7 @@
     *statep = saved_state
 
 
-/* During EUC-JISX0213 to UCS-4 conversion, the COUNT element of the state
-   contains the last UCS-4 character, shifted by 3 bits.
-   During UCS-4 to EUC-JISX0213 conversion, the COUNT element of the state
+/* During UCS-4 to EUC-JISX0213 conversion, the COUNT element of the state
    contains the last two bytes to be output, shifted by 3 bits.  */
 
 /* Since this is a stateful encoding we have to provide code which resets
@@ -74,17 +72,8 @@
   if (data->__statep->__count != 0)					      \
     {									      \
       if (FROM_DIRECTION)						      \
-	{								      \
-	  if (__builtin_expect (outbuf + 4 <= outend, 1))		      \
-	    {								      \
-	      /* Write out the last character.  */			      \
-	      *((uint32_t *) outbuf)++ = data->__statep->__count >> 3;	      \
-	      data->__statep->__count = 0;				      \
-	    }								      \
-	  else								      \
-	    /* We don't have enough room in the output buffer.  */	      \
-	    status = __GCONV_FULL_OUTPUT;				      \
-	}								      \
+	/* We don't use shift states in the FROM_DIRECTION.  */		      \
+	data->__statep->__count = 0;					      \
       else								      \
 	{								      \
 	  if (__builtin_expect (outbuf + 2 <= outend, 1))		      \
@@ -109,33 +98,44 @@
 #define LOOPFCT			FROM_LOOP
 #define BODY \
   {									      \
-    uint32_t ch;							      \
+    uint32_t ch = *inptr;						      \
 									      \
-    /* Determine whether there is a buffered character pending.  */	      \
-    ch = *statep >> 3;							      \
-    if (__builtin_expect (ch == 0, 1))					      \
+    if (ch < 0x80)							      \
+      /* Plain ASCII character.  */					      \
+      ++inptr;								      \
+    else if ((ch >= 0xa1 && ch <= 0xfe) || ch == 0x8e || ch == 0x8f)	      \
       {									      \
-	/* No - so look at the next input byte.  */			      \
-	ch = *inptr;							      \
-	if (ch < 0x80)							      \
-	  /* Plain ASCII character.  */					      \
-	  ++inptr;							      \
-	else if ((ch >= 0xa1 && ch <= 0xfe) || ch == 0x8e || ch == 0x8f)      \
+	/* Two or three byte character.  */				      \
+	uint32_t ch2;							      \
+									      \
+	if (__builtin_expect (inptr + 1 >= inend, 0))			      \
 	  {								      \
-	    /* Two or three byte character.  */				      \
-	    uint32_t ch2;						      \
+	    /* The second byte is not available.  */			      \
+	    result = __GCONV_INCOMPLETE_INPUT;				      \
+	    break;							      \
+	  }								      \
+									      \
+	ch2 = inptr[1];							      \
 									      \
-	    if (__builtin_expect (inptr + 1 >= inend, 0))		      \
+	/* The second byte must be >= 0xa1 and <= 0xfe.  */		      \
+	if (__builtin_expect (ch2 < 0xa1 || ch2 > 0xfe, 0))		      \
+	  {								      \
+	    /* This is an illegal character.  */			      \
+	    if (! ignore_errors_p ())					      \
 	      {								      \
-		/* The second byte is not available.  */		      \
-		result = __GCONV_INCOMPLETE_INPUT;			      \
+		result = __GCONV_ILLEGAL_INPUT;				      \
 		break;							      \
 	      }								      \
 									      \
-	    ch2 = inptr[1];						      \
+	    ++inptr;							      \
+	    ++*irreversible;						      \
+	    break;							      \
+	  }								      \
 									      \
-	    /* The second byte must be >= 0xa1 and <= 0xfe.  */		      \
-	    if (__builtin_expect (ch2 < 0xa1 || ch2 > 0xfe, 0))		      \
+	if (ch == 0x8e)							      \
+	  {								      \
+	    /* Half-width katakana.  */					      \
+	    if (__builtin_expect (ch2 > 0xdf, 0))			      \
 	      {								      \
 		/* This is an illegal character.  */			      \
 		if (! ignore_errors_p ())				      \
@@ -149,107 +149,89 @@
 		break;							      \
 	      }								      \
 									      \
-	    if (ch == 0x8e)						      \
+	    ch = ch2 + 0xfec0;						      \
+	    inptr += 2;							      \
+	  }								      \
+	else								      \
+	  {								      \
+	    const unsigned char *endp;					      \
+									      \
+	    if (ch == 0x8f)						      \
 	      {								      \
-		/* Half-width katakana.  */				      \
-		if (__builtin_expect (ch2 > 0xdf, 0))			      \
-		  {							      \
-		    /* This is an illegal character.  */		      \
-		    if (! ignore_errors_p ())				      \
-		      {							      \
-			result = __GCONV_ILLEGAL_INPUT;			      \
-			break;						      \
-		      }							      \
+		/* JISX 0213 plane 2.  */				      \
+		uint32_t ch3;						      \
 									      \
-		    ++inptr;						      \
-		    ++*irreversible;					      \
+		if (__builtin_expect (inptr + 2 >= inend, 0))		      \
+		  {							      \
+		    /* The third byte is not available.  */		      \
+		    result = __GCONV_INCOMPLETE_INPUT;			      \
 		    break;						      \
 		  }							      \
 									      \
-		ch = ch2 + 0xfec0;					      \
-		inptr += 2;						      \
+		ch3 = inptr[2];						      \
+		endp = inptr + 3;					      \
+									      \
+		ch = jisx0213_to_ucs4 (0x200 - 0x80 + ch2, ch3 ^ 0x80);	      \
 	      }								      \
 	    else							      \
 	      {								      \
-		const unsigned char *endp;				      \
+		/* JISX 0213 plane 1.  */				      \
+		endp = inptr + 2;					      \
 									      \
-		if (ch == 0x8f)						      \
+		ch = jisx0213_to_ucs4 (0x100 - 0x80 + ch, ch2 ^ 0x80);	      \
+	      }								      \
+									      \
+	    if (ch == 0)						      \
+	      {								      \
+		/* This is an illegal character.  */			      \
+		if (! ignore_errors_p ())				      \
 		  {							      \
-		    /* JISX 0213 plane 2.  */				      \
-		    uint32_t ch3;					      \
+		    result = __GCONV_ILLEGAL_INPUT;			      \
+		    break;						      \
+		  }							      \
 									      \
-		    if (__builtin_expect (inptr + 2 >= inend, 0))	      \
-		      {							      \
-			/* The third byte is not available.  */		      \
-			result = __GCONV_INCOMPLETE_INPUT;		      \
-			break;						      \
-		      }							      \
+		++inptr;						      \
+		++*irreversible;					      \
+		break;							      \
+	      }								      \
 									      \
-		    ch3 = inptr[2];					      \
-		    endp = inptr + 3;					      \
+	    inptr = endp;						      \
 									      \
-		    ch = jisx0213_to_ucs4 (0x200 - 0x80 + ch2, ch3 ^ 0x80);   \
+	    if (ch < 0x80)						      \
+	      {								      \
+		/* It's a combining character.  */			      \
+		uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0];	      \
+		uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1];	      \
+									      \
+		/* See whether we have room for two characters.  */	      \
+		if (outptr + 8 <= outend)				      \
+		  {							      \
+		    put32 (outptr, u1);					      \
+		    outptr += 4;					      \
+		    put32 (outptr, u2);					      \
+		    outptr += 4;					      \
+		    continue;						      \
 		  }							      \
 		else							      \
 		  {							      \
-		    /* JISX 0213 plane 1.  */				      \
-		    endp = inptr + 2;					      \
-									      \
-		    ch = jisx0213_to_ucs4 (0x100 - 0x80 + ch, ch2 ^ 0x80);    \
-		  }							      \
-									      \
-		if (ch == 0)						      \
-		  {							      \
-		    /* This is an illegal character.  */		      \
-		    if (! ignore_errors_p ())				      \
-		      {							      \
-			result = __GCONV_ILLEGAL_INPUT;			      \
-			break;						      \
-		      }							      \
-									      \
-		    ++inptr;						      \
-		    ++*irreversible;					      \
+		    result = __GCONV_FULL_OUTPUT;			      \
 		    break;						      \
 		  }							      \
-									      \
-		inptr = endp;						      \
-									      \
-		if (ch < 0x80)						      \
-		  {							      \
-		    /* It's a combining character.  */			      \
-		    uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0];     \
-		    uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1];     \
-									      \
-		    /* See whether we have room for two characters.  */	      \
-		    if (outptr + 8 <= outend)				      \
-		      {							      \
-			put32 (outptr, u1);				      \
-			outptr += 4;					      \
-			put32 (outptr, u2);				      \
-			outptr += 4;					      \
-			continue;					      \
-		      }							      \
-									      \
-		    /* Otherwise store only the first character now, and      \
-		       put the second one into the queue.  */		      \
-		    ch = u1;						      \
-		    *statep = u2 << 3;					      \
-		  }							      \
 	      }								      \
 	  }								      \
-	else								      \
+      }									      \
+    else								      \
+      {									      \
+	/* This is illegal.  */						      \
+	if (! ignore_errors_p ())					      \
 	  {								      \
-	    /* This is illegal.  */					      \
-	    if (! ignore_errors_p ())					      \
-	      {								      \
-		result = __GCONV_ILLEGAL_INPUT;				      \
-		break;							      \
-	      }								      \
-									      \
-	    ++inptr;							      \
-	    ++*irreversible;						      \
-	    continue;							      \
+	    result = __GCONV_ILLEGAL_INPUT;				      \
+	    break;							      \
 	  }								      \
+									      \
+	++inptr;							      \
+	++*irreversible;						      \
+	continue;							      \
       }									      \
 									      \
     put32 (outptr, ch);							      \
--- glibc-20020408/iconvdata/shift_jisx0213.c.bak	Wed Apr 17 00:20:25 2002
+++ glibc-20020408/iconvdata/shift_jisx0213.c	Thu Apr 18 00:13:23 2002
@@ -62,9 +62,7 @@
     *statep = saved_state
 
 
-/* During Shift_JISX0213 to UCS-4 conversion, the COUNT element of the state
-   contains the last UCS-4 character, shifted by 3 bits.
-   During UCS-4 to Shift_JISX0213 conversion, the COUNT element of the state
+/* During UCS-4 to Shift_JISX0213 conversion, the COUNT element of the state
    contains the last two bytes to be output, shifted by 3 bits.  */
 
 /* Since this is a stateful encoding we have to provide code which resets
@@ -74,17 +72,8 @@
   if (data->__statep->__count != 0)					      \
     {									      \
       if (FROM_DIRECTION)						      \
-	{								      \
-	  if (__builtin_expect (outbuf + 4 <= outend, 1))		      \
-	    {								      \
-	      /* Write out the last character.  */			      \
-	      *((uint32_t *) outbuf)++ = data->__statep->__count >> 3;	      \
-	      data->__statep->__count = 0;				      \
-	    }								      \
-	  else								      \
-	    /* We don't have enough room in the output buffer.  */	      \
-	    status = __GCONV_FULL_OUTPUT;				      \
-	}								      \
+	/* We don't use shift states in the FROM_DIRECTION.  */		      \
+	data->__statep->__count = 0;					      \
       else								      \
 	{								      \
 	  if (__builtin_expect (outbuf + 2 <= outend, 1))		      \
@@ -109,136 +98,129 @@
 #define LOOPFCT			FROM_LOOP
 #define BODY \
   {									      \
-    uint32_t ch;							      \
+    uint32_t ch = *inptr;						      \
 									      \
-    /* Determine whether there is a buffered character pending.  */	      \
-    ch = *statep >> 3;							      \
-    if (__builtin_expect (ch == 0, 1))					      \
+    if (ch < 0x80)							      \
       {									      \
-	/* No - so look at the next input byte.  */			      \
-	ch = *inptr;							      \
-	if (ch < 0x80)							      \
-	  {								      \
-	    /* Plain ISO646-JP character.  */				      \
-	    if (__builtin_expect (ch == 0x5c, 0))			      \
-	      ch = 0xa5;						      \
-	    else if (__builtin_expect (ch == 0x7e, 0))			      \
-	      ch = 0x203e;						      \
-	    ++inptr;							      \
-	  }								      \
-	else if (ch >= 0xa1 && ch <= 0xdf)				      \
+	/* Plain ISO646-JP character.  */				      \
+	if (__builtin_expect (ch == 0x5c, 0))				      \
+	  ch = 0xa5;							      \
+	else if (__builtin_expect (ch == 0x7e, 0))			      \
+	  ch = 0x203e;							      \
+	++inptr;							      \
+      }									      \
+    else if (ch >= 0xa1 && ch <= 0xdf)					      \
+      {									      \
+	/* Half-width katakana.  */					      \
+	ch += 0xfec0;							      \
+	++inptr;							      \
+      }									      \
+    else if ((ch >= 0x81 && ch <= 0x9f) || (ch >= 0xe0 && ch <= 0xfc))	      \
+      {									      \
+	/* Two byte character.  */					      \
+	uint32_t ch2;							      \
+									      \
+	if (__builtin_expect (inptr + 1 >= inend, 0))			      \
 	  {								      \
-	    /* Half-width katakana.  */					      \
-	    ch += 0xfec0;						      \
-	    ++inptr;							      \
+	    /* The second byte is not available.  */			      \
+	    result = __GCONV_INCOMPLETE_INPUT;				      \
+	    break;							      \
 	  }								      \
-	else if ((ch >= 0x81 && ch <= 0x9f) || (ch >= 0xe0 && ch <= 0xfc))    \
-	  {								      \
-	    /* Two byte character.  */					      \
-	    uint32_t ch2;						      \
-									      \
-	    if (__builtin_expect (inptr + 1 >= inend, 0))		      \
-	      {								      \
-		/* The second byte is not available.  */		      \
-		result = __GCONV_INCOMPLETE_INPUT;			      \
-		break;							      \
-	      }								      \
 									      \
-	    ch2 = inptr[1];						      \
+	ch2 = inptr[1];							      \
 									      \
-	    /* The second byte must be in the range 0x{40..7E,80..FC}.  */    \
-	    if (__builtin_expect (ch2 < 0x40 || ch2 == 0x7f || ch2 > 0xfc, 0))\
+	/* The second byte must be in the range 0x{40..7E,80..FC}.  */	      \
+	if (__builtin_expect (ch2 < 0x40 || ch2 == 0x7f || ch2 > 0xfc, 0))    \
+	  {								      \
+	    /* This is an illegal character.  */			      \
+	    if (! ignore_errors_p ())					      \
 	      {								      \
-		/* This is an illegal character.  */			      \
-		if (! ignore_errors_p ())				      \
-		  {							      \
-		    result = __GCONV_ILLEGAL_INPUT;			      \
-		    break;						      \
-		  }							      \
-									      \
-		++inptr;						      \
-		++*irreversible;					      \
+		result = __GCONV_ILLEGAL_INPUT;				      \
 		break;							      \
 	      }								      \
 									      \
-	    /* Convert to row and column.  */				      \
-	    if (ch < 0xe0)						      \
-	      ch -= 0x81;						      \
-	    else							      \
-	      ch -= 0xc1;						      \
-	    if (ch2 < 0x80)						      \
-	      ch2 -= 0x40;						      \
-	    else							      \
-	      ch2 -= 0x41;						      \
-	    /* Now 0 <= ch <= 0x3b, 0 <= ch2 <= 0xbb.  */		      \
-	    ch = 2 * ch;						      \
-	    if (ch2 >= 0x5e)						      \
-	      ch2 -= 0x5e, ch++;					      \
-	    ch2 += 0x21;						      \
-	    if (ch >= 0x5e)						      \
-	      {								      \
-		/* Handling of JISX 0213 plane 2 rows.  */		      \
-		if (ch >= 0x67)						      \
-		  ch += 230;						      \
-		else if (ch >= 0x63 || ch == 0x5f)			      \
-		  ch += 168;						      \
-		else 							      \
-		  ch += 162;						      \
-	      }								      \
-									      \
-	    ch = jisx0213_to_ucs4 (0x121 + ch, ch2);			      \
-									      \
-	    if (ch == 0)						      \
-	      {								      \
-		/* This is an illegal character.  */			      \
-		if (! ignore_errors_p ())				      \
-		  {							      \
-		    result = __GCONV_ILLEGAL_INPUT;			      \
-		    break;						      \
-		  }							      \
+	    ++inptr;							      \
+	    ++*irreversible;						      \
+	    break;							      \
+	  }								      \
 									      \
-		++inptr;						      \
-		++*irreversible;					      \
-		break;							      \
-	      }								      \
+	/* Convert to row and column.  */				      \
+	if (ch < 0xe0)							      \
+	  ch -= 0x81;							      \
+	else								      \
+	  ch -= 0xc1;							      \
+	if (ch2 < 0x80)							      \
+	  ch2 -= 0x40;							      \
+	else								      \
+	  ch2 -= 0x41;							      \
+	/* Now 0 <= ch <= 0x3b, 0 <= ch2 <= 0xbb.  */			      \
+	ch = 2 * ch;							      \
+	if (ch2 >= 0x5e)						      \
+	  ch2 -= 0x5e, ch++;						      \
+	ch2 += 0x21;							      \
+	if (ch >= 0x5e)							      \
+	  {								      \
+	    /* Handling of JISX 0213 plane 2 rows.  */			      \
+	    if (ch >= 0x67)						      \
+	      ch += 230;						      \
+	    else if (ch >= 0x63 || ch == 0x5f)				      \
+	      ch += 168;						      \
+	    else 							      \
+	      ch += 162;						      \
+	  }								      \
 									      \
-	    inptr += 2;							      \
+	ch = jisx0213_to_ucs4 (0x121 + ch, ch2);			      \
 									      \
-	    if (ch < 0x80)						      \
-	      {								      \
-		/* It's a combining character.  */			      \
-		uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0];	      \
-		uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1];	      \
-									      \
-		/* See whether we have room for two characters.  */	      \
-		if (outptr + 8 <= outend)				      \
-		  {							      \
-		    put32 (outptr, u1);					      \
-		    outptr += 4;					      \
-		    put32 (outptr, u2);					      \
-		    outptr += 4;					      \
-		    continue;						      \
-		  }							      \
-									      \
-		/* Otherwise store only the first character now, and	      \
-		   put the second one into the queue.  */		      \
-		ch = u1;						      \
-		*statep = u2 << 3;					      \
-	      }								      \
-	  }								      \
-	else								      \
+	if (ch == 0)							      \
 	  {								      \
-	    /* This is illegal.  */					      \
+	    /* This is an illegal character.  */			      \
 	    if (! ignore_errors_p ())					      \
 	      {								      \
 		result = __GCONV_ILLEGAL_INPUT;				      \
 		break;							      \
-	      }								      \
+	     }								      \
 									      \
 	    ++inptr;							      \
 	    ++*irreversible;						      \
-	    continue;							      \
+	    break;							      \
+	  }								      \
+									      \
+	inptr += 2;							      \
+									      \
+	if (ch < 0x80)							      \
+	  {								      \
+	    /* It's a combining character.  */				      \
+	    uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0];	      \
+	    uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1];	      \
+									      \
+	    /* See whether we have room for two characters.  */		      \
+	    if (outptr + 8 <= outend)					      \
+	      {								      \
+		put32 (outptr, u1);					      \
+		outptr += 4;						      \
+		put32 (outptr, u2);					      \
+		outptr += 4;						      \
+		continue;						      \
+	      }								      \
+	    else							      \
+	      {								      \
+		result = __GCONV_FULL_OUTPUT;				      \
+		break;							      \
+	      }								      \
+	  }								      \
+      }									      \
+    else								      \
+      {									      \
+	/* This is illegal.  */						      \
+	if (! ignore_errors_p ())					      \
+	  {								      \
+	    result = __GCONV_ILLEGAL_INPUT;				      \
+	    break;							      \
 	  }								      \
+									      \
+	++inptr;							      \
+	++*irreversible;						      \
+	continue;							      \
       }									      \
 									      \
     put32 (outptr, ch);							      \


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]