This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

bugs in ISO-2022-CN-EXT converter



The ISO-2022-CN-EXT converter has a few bugs. The worst is that it cannot be
loaded because it requires symbols which are not available from libISOIR165.so:

$ nm /glibc22/lib/gconv/ISO-2022-CN-EXT.so | grep __isoir165
         U __isoir165_from_idx
         U __isoir165_from_tab
         U __isoir165_to_tab
$ nm /glibc22/lib/gconv/libISOIR165.so | grep __isoir165
00000880 R __isoir165_from_idx
000018c0 R __isoir165_tab
00008d60 R __isoir165_to_tab

Moreover,

- The CNS 11643-1992 plane 3 is not treated (it is the major part of
  CNS 11643-1986 plane 14).
- Setting GB7590_set = 0 is wrong, because that would mean that in the initial
  state, GB7590 doesn't need a shift sequence.
- In the TO direction, SS2 and SS3 shift sequences are never emitted.
- Plus the same bugs as in the ISO-2022-CN converter.
- cns11643.h uses an undeclared variable.

Here is a patch which fixes them.


2000-09-17  Bruno Haible  <haible@clisp.cons.org>

	* iconvdata/iso-ir-165.c (__isoir165_from_tab): Renamed from
	__isoir165_tab.
	* iconvdata/cns11643.h (__cns11643l1_to_ucs4_tab): New declaration.
	* iconvdata/iso-2022-cn-ext.c: Include "cns11643.h".
	(GB7590_set, GB13132_set, CNS11643_3_set, CNS11643_4_set,
	CNS11643_5_set, CNS11643_6_set, CNS11643_7_set): Change enum values.
	(BODY for FROM_LOOP): Fix buffer overrun. Treat CNS11643 plane 3.
	Return __GCONV_INCOMPLETE_INPUT instead of __GCONV_EMPTY_INPUT.
	(BODY for TO_LOOP): Fix usage of `set' vs. `used'. Fix typo that
	caused GB2312 to be used instead of ISO-IR-165. Treat CNS11643
	plane 3. Fix shift sequences. Output announcement for SS2 and SS3
	encodings when needed. When outputting an announcement, don't clear
	most other announcements.

*** glibc-20000914/iconvdata/iso-ir-165.c.bak	Thu Jan 13 07:53:29 2000
--- glibc-20000914/iconvdata/iso-ir-165.c	Sun Sep 17 19:58:52 2000
***************
*** 546,552 ****
  };
  
  
! const char __isoir165_tab[29852] =
    "\x2a\x21" "\x2a\x22" "\x2a\x23" "\x21\x67" "\x2a\x25" "\x2a\x26" "\x2a\x27"
    "\x2a\x28" "\x2a\x29" "\x2a\x2a" "\x2a\x2b" "\x2a\x2c" "\x2a\x2d" "\x2a\x2e"
    "\x2a\x2f" "\x2a\x30" "\x2a\x31" "\x2a\x32" "\x2a\x33" "\x2a\x34" "\x2a\x35"
--- 546,552 ----
  };
  
  
! const char __isoir165_from_tab[29852] =
    "\x2a\x21" "\x2a\x22" "\x2a\x23" "\x21\x67" "\x2a\x25" "\x2a\x26" "\x2a\x27"
    "\x2a\x28" "\x2a\x29" "\x2a\x2a" "\x2a\x2b" "\x2a\x2c" "\x2a\x2d" "\x2a\x2e"
    "\x2a\x2f" "\x2a\x30" "\x2a\x31" "\x2a\x32" "\x2a\x33" "\x2a\x34" "\x2a\x35"
*** glibc-20000914/iconvdata/cns11643.h.bak	Tue Sep  5 15:24:48 2000
--- glibc-20000914/iconvdata/cns11643.h	Sun Sep 17 23:33:24 2000
***************
*** 20,27 ****
--- 20,30 ----
  
  #include <stdint.h>
  
+ /* Table for CNS 11643, plane 1 to UCS4 conversion.  */
+ extern const uint16_t __cns11643l1_to_ucs4_tab[];
  /* Table for CNS 11643, plane 2 to UCS4 conversion.  */
  extern const uint16_t __cns11643l2_to_ucs4_tab[];
+ /* Table for CNS 11643, plane 14 to UCS4 conversion.  */
  extern const uint16_t __cns11643l14_to_ucs4_tab[];
  
  
*** glibc-20000914/iconvdata/iso-2022-cn-ext.c.bak	Wed Sep 13 11:09:09 2000
--- glibc-20000914/iconvdata/iso-2022-cn-ext.c	Mon Sep 18 10:54:27 2000
***************
*** 24,29 ****
--- 24,30 ----
  #include <string.h>
  #include "gb2312.h"
  #include "iso-ir-165.h"
+ #include "cns11643.h"
  #include "cns11643l1.h"
  #include "cns11643l2.h"
  
***************
*** 80,120 ****
    ISO_IR_165_set,
    SO_mask = 7,
  
!   GB7589_set = 8,
!   GB13131_set = 16,
!   CNS11643_2_set = 24,
!   SS2_mask = 24,
! 
!   GB7590_set = 0,
!   GB13132_set = 32,
!   CNS11643_3_set = 64,
!   CNS11643_4_set = 96,
!   CNS11643_5_set = 128,
!   CNS11643_6_set = 160,
!   CNS11643_7_set = 192,
!   SS3_mask = 224,
  
  #define CURRENT_MASK (SO_mask | SS2_mask | SS3_mask)
  
!   GB2312_ann = 256,
!   GB12345_ann = 512,
!   CNS11643_1_ann = 768,
!   ISO_IR_165_ann = 1024,
!   SO_ann = 1792,
! 
!   GB7589_ann = 2048,
!   GB13131_ann = 4096,
!   CNS11643_2_ann = 6144,
!   SS2_ann = 6144,
! 
!   GB7590_ann = 8192,
!   GB13132_ann = 16384,
!   CNS11643_3_ann = 24576,
!   CNS11643_4_ann = 32768,
!   CNS11643_5_ann = 40960,
!   CNS11643_6_ann = 49152,
!   CNS11643_7_ann = 57344,
!   SS3_ann = 57344
  };
  
  
--- 81,121 ----
    ISO_IR_165_set,
    SO_mask = 7,
  
!   GB7589_set = 1 << 3,
!   GB13131_set = 2 << 3,
!   CNS11643_2_set = 3 << 3,
!   SS2_mask = 3 << 3,
! 
!   GB7590_set = 1 << 5,
!   GB13132_set = 2 << 5,
!   CNS11643_3_set = 3 << 5,
!   CNS11643_4_set = 4 << 5,
!   CNS11643_5_set = 5 << 5,
!   CNS11643_6_set = 6 << 5,
!   CNS11643_7_set = 7 << 5,
!   SS3_mask = 7 << 5,
  
  #define CURRENT_MASK (SO_mask | SS2_mask | SS3_mask)
  
!   GB2312_ann = 1 << 8,
!   GB12345_ann = 2 << 8,
!   CNS11643_1_ann = 3 << 8,
!   ISO_IR_165_ann = 4 << 8,
!   SO_ann = 7 << 8,
! 
!   GB7589_ann = 1 << 11,
!   GB13131_ann = 2 << 11,
!   CNS11643_2_ann = 3 << 11,
!   SS2_ann = 3 << 11,
! 
!   GB7590_ann = 1 << 13,
!   GB13132_ann = 2 << 13,
!   CNS11643_3_ann = 3 << 13,
!   CNS11643_4_ann = 4 << 13,
!   CNS11643_5_ann = 5 << 13,
!   CNS11643_6_ann = 6 << 13,
!   CNS11643_7_ann = 7 << 13,
!   SS3_ann = 7 << 13
  };
  
  
***************
*** 190,205 ****
  	   - the initial byte of the SS2 sequence.			      \
  	   - the initial byte of the SS3 sequence.			      \
  	*/								      \
! 	if (inptr + 1 > inend						      \
  	    || (inptr[1] == '$'						      \
! 		&& (inptr + 2 > inend					      \
! 		    || (inptr[2] == ')' && inptr + 3 > inend)		      \
! 		    || (inptr[2] == '*' && inptr + 3 > inend)		      \
! 		    || (inptr[2] == '+' && inptr + 3 > inend)))		      \
! 	    || (inptr[1] == SS2_1 && inptr + 3 > inend)			      \
! 	    || (inptr[1] == SS3_1 && inptr + 3 > inend))		      \
  	  {								      \
! 	    result = __GCONV_EMPTY_INPUT;				      \
  	    break;							      \
  	  }								      \
  	if (inptr[1] == '$'						      \
--- 191,206 ----
  	   - the initial byte of the SS2 sequence.			      \
  	   - the initial byte of the SS3 sequence.			      \
  	*/								      \
! 	if (inptr + 2 > inend						      \
  	    || (inptr[1] == '$'						      \
! 		&& (inptr + 3 > inend					      \
! 		    || (inptr[2] == ')' && inptr + 4 > inend)		      \
! 		    || (inptr[2] == '*' && inptr + 4 > inend)		      \
! 		    || (inptr[2] == '+' && inptr + 4 > inend)))		      \
! 	    || (inptr[1] == SS2_1 && inptr + 4 > inend)			      \
! 	    || (inptr[1] == SS3_1 && inptr + 4 > inend))		      \
  	  {								      \
! 	    result = __GCONV_INCOMPLETE_INPUT;				      \
  	    break;							      \
  	  }								      \
  	if (inptr[1] == '$'						      \
***************
*** 285,301 ****
  	continue;							      \
        }									      \
  									      \
!     if (ch == ESC && (inend - inptr == 1 || inptr[1] == SS2_1))		      \
        {									      \
  	/* This is a character from CNS 11643 plane 2.			      \
  	   XXX We could test here whether the use of this character	      \
  	   set was announced.						      \
  	   XXX Current GB7589 and GB13131 are not supported.  */	      \
- 	if (inend - inptr < 4)						      \
- 	  {								      \
- 	    result = __GCONV_INCOMPLETE_INPUT;				      \
- 	    break;							      \
- 	  }								      \
  	inptr += 2;							      \
  	ch = cns11643l2_to_ucs4 (&inptr, 2, 0);				      \
  	if (ch == __UNKNOWN_10646_CHAR)					      \
--- 286,297 ----
  	continue;							      \
        }									      \
  									      \
!     if (ch == ESC && inptr[1] == SS2_1)					      \
        {									      \
  	/* This is a character from CNS 11643 plane 2.			      \
  	   XXX We could test here whether the use of this character	      \
  	   set was announced.						      \
  	   XXX Current GB7589 and GB13131 are not supported.  */	      \
  	inptr += 2;							      \
  	ch = cns11643l2_to_ucs4 (&inptr, 2, 0);				      \
  	if (ch == __UNKNOWN_10646_CHAR)					      \
***************
*** 306,340 ****
  		result = __GCONV_ILLEGAL_INPUT;				      \
  		break;							      \
  	      }								      \
  	    ++*irreversible;						      \
  	    continue;							      \
  	  }								      \
        }									      \
!     /* Note that we can assume here that at least bytes are available if      \
         the first byte is ESC since otherwise the first if would have been     \
         true.  */							      \
      else if (ch == ESC && inptr[1] == SS3_1)				      \
        {									      \
  	/* This is a character from CNS 11643 plane 3 or higher.	      \
! 	   XXX Current GB7590 and GB13132 are not supported.  */	      \
! 	if (inend - inptr < 4)						      \
! 	  {								      \
! 	    result = __GCONV_INCOMPLETE_INPUT;				      \
  	    break;							      \
  	  }								      \
- 	inptr += 2;							      \
- 	ch = cns11643l2_to_ucs4 (&inptr, 2, 0);				      \
  	if (ch == __UNKNOWN_10646_CHAR)					      \
  	  {								      \
  	    if (! ignore_errors_p ())					      \
  	      {								      \
- 		inptr -= 2;						      \
  		result = __GCONV_ILLEGAL_INPUT;				      \
  		break;							      \
  	      }								      \
  	    ++*irreversible;						      \
  	    continue;							      \
  	  }								      \
        }									      \
      else if (set == ASCII_set)						      \
        {									      \
--- 302,354 ----
  		result = __GCONV_ILLEGAL_INPUT;				      \
  		break;							      \
  	      }								      \
+ 	    inptr += 2;							      \
  	    ++*irreversible;						      \
  	    continue;							      \
  	  }								      \
        }									      \
!     /* Note that we can assume here that at least 4 bytes are available if    \
         the first byte is ESC since otherwise the first if would have been     \
         true.  */							      \
      else if (ch == ESC && inptr[1] == SS3_1)				      \
        {									      \
  	/* This is a character from CNS 11643 plane 3 or higher.	      \
! 	   XXX Currently GB7590 and GB13132 are not supported.  */	      \
! 	char buf[3];							      \
! 	const char *tmp = buf;						      \
! 									      \
! 	buf[1] = inptr[2];						      \
! 	buf[2] = inptr[3];						      \
! 	switch (ann & SS3_ann)						      \
! 	  {								      \
! 	  case CNS11643_3_ann:						      \
! 	    /* CNS 11643 plane 3 is part of the old CNS 11643 plane 14.  */   \
! 	    if (buf[1] < 0x62 || (buf[1] == 0x62 && buf[2] <= 0x45))	      \
! 	      {								      \
! 		buf[0] = 0x2e;						      \
! 		ch = cns11643_to_ucs4 (&tmp, 3, 0);			      \
! 	      }								      \
! 	    else							      \
! 	      ch = __UNKNOWN_10646_CHAR;				      \
! 	    break;							      \
! 	  default:							      \
! 	    /* XXX Currently planes 4 to 7 are not supported.  */	      \
! 	    ch = __UNKNOWN_10646_CHAR;					      \
  	    break;							      \
  	  }								      \
  	if (ch == __UNKNOWN_10646_CHAR)					      \
  	  {								      \
  	    if (! ignore_errors_p ())					      \
  	      {								      \
  		result = __GCONV_ILLEGAL_INPUT;				      \
  		break;							      \
  	      }								      \
+ 	    inptr += 4;							      \
  	    ++*irreversible;						      \
  	    continue;							      \
  	  }								      \
+ 	assert (tmp == buf + 3);					      \
+ 	inptr += 4;							      \
        }									      \
      else if (set == ASCII_set)						      \
        {									      \
***************
*** 361,367 ****
  									      \
  	if (ch == 0)							      \
  	  {								      \
! 	    result = __GCONV_EMPTY_INPUT;				      \
  	    break;							      \
  	  }								      \
  	else if (ch == __UNKNOWN_10646_CHAR)				      \
--- 375,381 ----
  									      \
  	if (ch == 0)							      \
  	  {								      \
! 	    result = __GCONV_INCOMPLETE_INPUT;				      \
  	    break;							      \
  	  }								      \
  	else if (ch == __UNKNOWN_10646_CHAR)				      \
***************
*** 427,442 ****
  	char buf[2];							      \
  	int used;							      \
  									      \
! 	if (set == GB2312_set || ((ann & CNS11643_1_ann) == 0		      \
! 				  && (ann & ISO_IR_165_ann) == 0))	      \
  	  {								      \
  	    written = ucs4_to_gb2312 (ch, buf, 2);			      \
  	    used = GB2312_set;						      \
  	  }								      \
! 	else if (set == ISO_IR_165_set || (ann & ISO_IR_165_set) != 0)	      \
  	  {								      \
! 	    written = ucs4_to_gb2312 (ch, buf, 2);			      \
! 	    used = GB2312_set;						      \
  	  }								      \
  	else								      \
  	  {								      \
--- 441,456 ----
  	char buf[2];							      \
  	int used;							      \
  									      \
! 	if (set == GB2312_set || ((ann & SO_ann) != CNS11643_1_ann	      \
! 				  && (ann & SO_ann) != ISO_IR_165_ann))	      \
  	  {								      \
  	    written = ucs4_to_gb2312 (ch, buf, 2);			      \
  	    used = GB2312_set;						      \
  	  }								      \
! 	else if (set == ISO_IR_165_set || (ann & SO_ann) == ISO_IR_165_set)   \
  	  {								      \
! 	    written = ucs4_to_isoir165 (ch, buf, 2);			      \
! 	    used = ISO_IR_165_set;					      \
  	  }								      \
  	else								      \
  	  {								      \
***************
*** 454,482 ****
  	      used = CNS11643_2_set;					      \
  	    else							      \
  	      {								      \
! 		/* Well, see whether we have to change the SO set.  */	      \
! 		if (set != GB2312_set)					      \
! 		  {							      \
! 		    written = ucs4_to_gb2312 (ch, buf, 2);		      \
! 		    if (written != __UNKNOWN_10646_CHAR)		      \
! 		      used = GB2312_set;				      \
! 		  }							      \
! 		if (written == __UNKNOWN_10646_CHAR && set != ISO_IR_165_set) \
! 		  {							      \
! 		    written = ucs4_to_isoir165 (ch, buf, 2);		      \
! 		    if (written != __UNKNOWN_10646_CHAR)		      \
! 		      used = ISO_IR_165_set;				      \
! 		  }							      \
! 		if (written == __UNKNOWN_10646_CHAR && set != CNS11643_1_set) \
! 		  {							      \
! 		    written = ucs4_to_cns11643l1 (ch, buf, 2);		      \
! 		    if (written != __UNKNOWN_10646_CHAR)		      \
! 		      used = CNS11643_1_set;				      \
! 		  }							      \
  									      \
! 		if (written == __UNKNOWN_10646_CHAR)			      \
  		  {							      \
  		    /* Even this does not work.  Error.  */		      \
  		    STANDARD_ERR_HANDLER (4);				      \
  		  }							      \
  	      }								      \
--- 468,533 ----
  	      used = CNS11643_2_set;					      \
  	    else							      \
  	      {								      \
! 		char tmpbuf[3];						      \
  									      \
! 		switch (0)						      \
  		  {							      \
+ 		  default:						      \
+ 		    /* Well, see whether we have to change the SO set.  */    \
+ 									      \
+ 		    if (used != GB2312_set)				      \
+ 		      {							      \
+ 			written = ucs4_to_gb2312 (ch, buf, 2);		      \
+ 			if (written != __UNKNOWN_10646_CHAR)		      \
+ 			  {						      \
+ 			    used = GB2312_set;				      \
+ 			    break;					      \
+ 			  }						      \
+ 		      }							      \
+ 									      \
+ 		    if (used != ISO_IR_165_set)				      \
+ 		      {							      \
+ 			written = ucs4_to_isoir165 (ch, buf, 2);	      \
+ 			if (written != __UNKNOWN_10646_CHAR)		      \
+ 			  {						      \
+ 			    used = ISO_IR_165_set;			      \
+ 			    break;					      \
+ 			  }						      \
+ 		      }							      \
+ 									      \
+ 		    if (used != CNS11643_1_set)				      \
+ 		      {							      \
+ 			written = ucs4_to_cns11643l1 (ch, buf, 2);	      \
+ 			if (written != __UNKNOWN_10646_CHAR)		      \
+ 			  {						      \
+ 			    used = CNS11643_1_set;			      \
+ 			    break;					      \
+ 			  }						      \
+ 		      }							      \
+ 									      \
+ 		    written = ucs4_to_cns11643 (ch, tmpbuf, 3);		      \
+ 		    if (written == 3 && tmpbuf[0] != 1 && tmpbuf[0] != 2)     \
+ 		      {							      \
+ 			buf[0] = tmpbuf[1];				      \
+ 			buf[1] = tmpbuf[2];				      \
+ 			written = 2;					      \
+ 			/* CNS 11643 plane 3 is part of the old CNS 11643     \
+ 			   plane 14.					      \
+ 			   XXX Currently planes 4 to 7 are not supported.  */ \
+ 			if (tmpbuf[0] == 14				      \
+ 			    && (tmpbuf[1] < 0x62			      \
+ 				|| (tmpbuf[1] == 0x62 && tmpbuf[2] <= 0x45))) \
+ 			  {						      \
+ 			    used = CNS11643_3_set;			      \
+ 			    break;					      \
+ 			  }						      \
+ 		      }							      \
+ 									      \
  		    /* Even this does not work.  Error.  */		      \
+ 		    used = ASCII_set;					      \
+ 		  }							      \
+ 		if (used == ASCII_set)					      \
+ 		  {							      \
  		    STANDARD_ERR_HANDLER (4);				      \
  		  }							      \
  	      }								      \
***************
*** 488,494 ****
  	  {								      \
  	    /* First see whether we announced that we use this		      \
  	       character set.  */					      \
! 	    if ((ann & (2 << used)) == 0)				      \
  	      {								      \
  		const char *escseq;					      \
  									      \
--- 539,545 ----
  	  {								      \
  	    /* First see whether we announced that we use this		      \
  	       character set.  */					      \
! 	    if ((used & SO_mask) != 0 && (ann & SO_ann) != (used << 8))	      \
  	      {								      \
  		const char *escseq;					      \
  									      \
***************
*** 499,516 ****
  		  }							      \
  									      \
  		assert (used >= 1 && used <= 4);			      \
! 		escseq = "\e$)A\e$)G\e$*H\e$)E" + (used - 1) * 4;	      \
  		*outptr++ = *escseq++;					      \
  		*outptr++ = *escseq++;					      \
  		*outptr++ = *escseq++;					      \
  		*outptr++ = *escseq++;					      \
  									      \
! 		if (used == GB2312_set)					      \
! 		  ann = (ann & CNS11643_2_ann) | GB2312_ann;		      \
! 		else if (used == CNS11643_1_set)			      \
! 		  ann = (ann & CNS11643_2_ann) | CNS11643_1_ann;	      \
! 		else							      \
! 		  ann |= CNS11643_2_ann;				      \
  	      }								      \
  									      \
  	    if (used == CNS11643_2_set)					      \
--- 550,588 ----
  		  }							      \
  									      \
  		assert (used >= 1 && used <= 4);			      \
! 		escseq = ")A\0\0)G)E" + (used - 1) * 2;			      \
! 		*outptr++ = ESC;					      \
! 		*outptr++ = '$';					      \
! 		*outptr++ = *escseq++;					      \
! 		*outptr++ = *escseq++;					      \
! 									      \
! 		ann = (ann & ~SO_ann) | (used << 8);			      \
! 	      }								      \
! 	    else if ((used & SS2_mask) != 0 && (ann & SS2_ann) != (used << 8))\
! 	      {								      \
! 		const char *escseq;					      \
! 									      \
! 		assert (used == CNS11643_2_set); /* XXX */		      \
! 		escseq = "*H";						      \
! 		*outptr++ = ESC;					      \
! 		*outptr++ = '$';					      \
  		*outptr++ = *escseq++;					      \
  		*outptr++ = *escseq++;					      \
+ 									      \
+ 		ann = (ann & ~SS2_ann) | (used << 8);			      \
+ 	      }								      \
+ 	    else if ((used & SS3_mask) != 0 && (ann & SS3_ann) != (used << 8))\
+ 	      {								      \
+ 		const char *escseq;					      \
+ 									      \
+ 		assert ((used >> 5) >= 3 && (used >> 5) <= 7);		      \
+ 		escseq = "+I+J+K+L+M" + ((used >> 5) - 3) * 2;		      \
+ 		*outptr++ = ESC;					      \
+ 		*outptr++ = '$';					      \
  		*outptr++ = *escseq++;					      \
  		*outptr++ = *escseq++;					      \
  									      \
! 		ann = (ann & ~SS3_ann) | (used << 8);			      \
  	      }								      \
  									      \
  	    if (used == CNS11643_2_set)					      \
***************
*** 523,528 ****
--- 595,610 ----
  		*outptr++ = SS2_0;					      \
  		*outptr++ = SS2_1;					      \
  	      }								      \
+ 	    else if (used >= CNS11643_3_set && used <= CNS11643_7_set)	      \
+ 	      {								      \
+ 		if (outptr + 2 > outend)				      \
+ 		  {							      \
+ 		    result = __GCONV_FULL_OUTPUT;			      \
+ 		    break;						      \
+ 		  }							      \
+ 		*outptr++ = SS3_0;					      \
+ 		*outptr++ = SS3_1;					      \
+ 	      }								      \
  	    else							      \
  	      {								      \
  		/* We only have to emit something if currently ASCII is	      \
***************
*** 555,560 ****
--- 637,643 ----
  									      \
  	*outptr++ = buf[0];						      \
  	*outptr++ = buf[1];						      \
+ 	set = used;							      \
        }									      \
  									      \
      /* Now that we wrote the output increment the input pointer.  */	      \

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]