This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

bugs in GBK converter


The GBK charmap and converter currently disagree in four aspects:

1) The charmap maps 0x80 to U+0080, the converter doesn't.

2) The charmap maps 0xA1AA to U+2014 (consistent with Solaris iconv and
   unicode.org CP936 charmap and Microsoft CP936 charts), the converter
   maps it to U+2015 (like GB2312 does).

3) The converter from GBK to Unicode accepts input > 0xFEA0 and returns
   garbage (out-of-bounds array access).

4) The converter maps some GBK characters to Unicode "Private Use" characters.
   See next mail.

The patch below fixes 1 to 3:

1) Remove the extraneous entry for 0x80 from the charmap.

2) Change the converter to map 0xA1AA to U+2014 and 0xA844 to U+2015, like
   all the other GBK converters in the world do. This gives a second
   difference between GB2312 and GBK, the first being the mapping of 0xA1A4.

3) Avoid the out-of-range array access.


2000-09-23  Bruno Haible  <haible@clisp.cons.org>

	* charmaps/GBK: Remove /x80 entry.

2000-09-23  Bruno Haible  <haible@clisp.cons.org>

	* iconvdata/gbk.c (__gbk_to_ucs): Swap U+2014 and U+2015.
	(__gbk_from_ucs4_tab4): Swap entries for U+2014 and U+2015.
	(BODY for FROM_LOOP): Reject input > 0xFEA0, avoids out-of-bounds
	array access.
	* iconvdata/gbgbk.c (BODY for FROM_LOOP): Map 0xA844 to 0xA1AA.
	* iconvdata/testdata/GBK..UTF8: Swap U+2014 and U+2015.

*** glibc-20000914/localedata/charmaps/GBK.bak	Wed Aug 30 23:44:19 2000
--- glibc-20000914/localedata/charmaps/GBK	Sat Sep 23 23:14:59 2000
***************
*** 133,139 ****
  <U007D>     /x7d         RIGHT CURLY BRACKET
  <U007E>     /x7e         TILDE
  <U007F>     /x7f         DELETE (DEL)
- <U0080>     /x80         PADDING CHARACTER (PAD)
  
  <U4E02>     /x81/x40     <CJK>
  <U4E04>     /x81/x41     <CJK>
--- 133,138 ----
*** glibc-20000914/iconvdata/gbk.c.bak	Mon Jul  3 16:39:27 2000
--- glibc-20000914/iconvdata/gbk.c	Sat Sep 23 23:42:21 2000
***************
*** 1570,1576 ****
    [0x17fb] = 0x72d6, [0x17fc] = 0x72d8, [0x17fd] = 0x72da, [0x17fe] = 0x72db,
    [0x1861] = 0x3000, [0x1862] = 0x3001, [0x1863] = 0x3002, [0x1864] = 0x00b7,
    [0x1865] = 0x02c9, [0x1866] = 0x02c7, [0x1867] = 0x00a8, [0x1868] = 0x3003,
!   [0x1869] = 0x3005, [0x186a] = 0x2015, [0x186b] = 0xff5e, [0x186c] = 0x2016,
    [0x186d] = 0x2026, [0x186e] = 0x2018, [0x186f] = 0x2019, [0x1870] = 0x201c,
    [0x1871] = 0x201d, [0x1872] = 0x3014, [0x1873] = 0x3015, [0x1874] = 0x3008,
    [0x1875] = 0x3009, [0x1876] = 0x300a, [0x1877] = 0x300b, [0x1878] = 0x300c,
--- 1576,1582 ----
    [0x17fb] = 0x72d6, [0x17fc] = 0x72d8, [0x17fd] = 0x72da, [0x17fe] = 0x72db,
    [0x1861] = 0x3000, [0x1862] = 0x3001, [0x1863] = 0x3002, [0x1864] = 0x00b7,
    [0x1865] = 0x02c9, [0x1866] = 0x02c7, [0x1867] = 0x00a8, [0x1868] = 0x3003,
!   [0x1869] = 0x3005, [0x186a] = 0x2014, [0x186b] = 0xff5e, [0x186c] = 0x2016,
    [0x186d] = 0x2026, [0x186e] = 0x2018, [0x186f] = 0x2019, [0x1870] = 0x201c,
    [0x1871] = 0x201d, [0x1872] = 0x3014, [0x1873] = 0x3015, [0x1874] = 0x3008,
    [0x1875] = 0x3009, [0x1876] = 0x300a, [0x1877] = 0x300b, [0x1878] = 0x300c,
***************
*** 1712,1718 ****
    [0x1d2a] = 0x0448, [0x1d2b] = 0x0449, [0x1d2c] = 0x044a, [0x1d2d] = 0x044b,
    [0x1d2e] = 0x044c, [0x1d2f] = 0x044d, [0x1d30] = 0x044e, [0x1d31] = 0x044f,
    [0x1d40] = 0x02ca, [0x1d41] = 0x02cb, [0x1d42] = 0x02d9, [0x1d43] = 0x2013,
!   [0x1d44] = 0x2014, [0x1d45] = 0x2025, [0x1d46] = 0x2035, [0x1d47] = 0x2105,
    [0x1d48] = 0x2109, [0x1d49] = 0x2196, [0x1d4a] = 0x2197, [0x1d4b] = 0x2198,
    [0x1d4c] = 0x2199, [0x1d4d] = 0x2215, [0x1d4e] = 0x221f, [0x1d4f] = 0x2223,
    [0x1d50] = 0x2252, [0x1d51] = 0x2266, [0x1d52] = 0x2267, [0x1d53] = 0x22bf,
--- 1718,1724 ----
    [0x1d2a] = 0x0448, [0x1d2b] = 0x0449, [0x1d2c] = 0x044a, [0x1d2d] = 0x044b,
    [0x1d2e] = 0x044c, [0x1d2f] = 0x044d, [0x1d30] = 0x044e, [0x1d31] = 0x044f,
    [0x1d40] = 0x02ca, [0x1d41] = 0x02cb, [0x1d42] = 0x02d9, [0x1d43] = 0x2013,
!   [0x1d44] = 0x2015, [0x1d45] = 0x2025, [0x1d46] = 0x2035, [0x1d47] = 0x2105,
    [0x1d48] = 0x2109, [0x1d49] = 0x2196, [0x1d4a] = 0x2197, [0x1d4b] = 0x2198,
    [0x1d4c] = 0x2199, [0x1d4d] = 0x2215, [0x1d4e] = 0x221f, [0x1d4f] = 0x2223,
    [0x1d50] = 0x2252, [0x1d51] = 0x2266, [0x1d52] = 0x2267, [0x1d53] = 0x22bf,
***************
*** 5661,5668 ****
  */
  static const char __gbk_from_ucs4_tab4[][2] =
  {
!   [0x0000] = "\xa9\x5c", [0x0003] = "\xa8\x43", [0x0004] = "\xa8\x44",
!   [0x0005] = "\xa1\xaa", [0x0006] = "\xa1\xac", [0x0008] = "\xa1\xae",
    [0x0009] = "\xa1\xaf", [0x000c] = "\xa1\xb0", [0x000d] = "\xa1\xb1",
    [0x0015] = "\xa8\x45", [0x0016] = "\xa1\xad", [0x0020] = "\xa1\xeb",
    [0x0022] = "\xa1\xe4", [0x0023] = "\xa1\xe5", [0x0025] = "\xa8\x46",
--- 5682,5689 ----
  */
  static const char __gbk_from_ucs4_tab4[][2] =
  {
!   [0x0000] = "\xa9\x5c", [0x0003] = "\xa8\x43", [0x0004] = "\xa1\xaa",
!   [0x0005] = "\xa8\x44", [0x0006] = "\xa1\xac", [0x0008] = "\xa1\xae",
    [0x0009] = "\xa1\xaf", [0x000c] = "\xa1\xb0", [0x000d] = "\xa1\xb1",
    [0x0015] = "\xa8\x45", [0x0016] = "\xa1\xad", [0x0020] = "\xa1\xeb",
    [0x0022] = "\xa1\xe4", [0x0023] = "\xa1\xe5", [0x0025] = "\xa8\x46",
***************
*** 13153,13160 ****
  									      \
  	  ch2 = inptr[1];						      \
  									      \
! 	  /* All second bytes of a multibyte character must be >= 0x40. */    \
! 	  if (__builtin_expect (ch2, 0x41) < 0x40)			      \
  	    {								      \
  	      /* This is an illegal character.  */			      \
  	      if (! ignore_errors_p ())					      \
--- 13176,13185 ----
  									      \
  	  ch2 = inptr[1];						      \
  									      \
! 	  /* All second bytes of a multibyte character must be >= 0x40, and   \
! 	     the __gbk_to_ucs table only covers the range up to 0xfe 0xa0. */ \
! 	  if (__builtin_expect (ch2, 0x41) < 0x40			      \
! 	      || (__builtin_expect (ch, 0x81) == 0xfe && ch2 > 0xa0))	      \
  	    {								      \
  	      /* This is an illegal character.  */			      \
  	      if (! ignore_errors_p ())					      \
*** glibc-20000914/iconvdata/gbgbk.c.bak	Mon Jul  3 16:39:27 2000
--- glibc-20000914/iconvdata/gbgbk.c	Sat Sep 23 22:46:28 2000
***************
*** 65,73 ****
  	   All these characters are not defined in GB2312.  Besides this      \
  	   there is an incomatibility in the mapping.  The Unicode tables     \
  	   say that 0xA1A4 maps in GB2312 to U30FB while in GBK it maps to    \
! 	   U00B7.  Since we are free to do whatever we want if a mapping      \
! 	   is not available we will not flag this as an error but instead     \
! 	   map the two positions.  But this means that the mapping	      \
  									      \
  		UCS4 -> GB2312 -> GBK -> UCS4				      \
  									      \
--- 65,75 ----
  	   All these characters are not defined in GB2312.  Besides this      \
  	   there is an incomatibility in the mapping.  The Unicode tables     \
  	   say that 0xA1A4 maps in GB2312 to U30FB while in GBK it maps to    \
! 	   U00B7.  Similarly, 0xA1AA maps in GB2312 to U2015 while in GBK     \
! 	   it maps to U2014.  Since we are free to do whatever we want if     \
! 	   a mapping is not available we will not flag this as an error	      \
! 	   but instead map the two positions.  But this means that the	      \
! 	   mapping							      \
  									      \
  		UCS4 -> GB2312 -> GBK -> UCS4				      \
  									      \
***************
*** 89,94 ****
--- 91,100 ----
  									      \
  	ch = (ch << 8) | inptr[1];					      \
  									      \
+ 	/* Map 0xA844 (U2015 in GBK) to 0xA1AA (U2015 in GB2312).  */	      \
+ 	if (__builtin_expect (ch == 0xa844, 0))				      \
+ 	  ch = 0xa1aa;							      \
+ 									      \
  	/* Now determine whether the character is valid.  */		      \
  	if (__builtin_expect (ch, 0xa1a1) < 0xa1a1			      \
  	    || __builtin_expect (ch, 0xa1a1) > 0xf7fe			      \
***************
*** 123,130 ****
  #define BODY \
    {									      \
      /* We don't have to care about characters we cannot map.  The only	      \
!        problem is the mapping of 0xA1A4 but as explained above we do not      \
!        do anything special here.  */					      \
      unsigned char ch = *inptr++;					      \
  									      \
      if (ch > 0x7f)							      \
--- 129,136 ----
  #define BODY \
    {									      \
      /* We don't have to care about characters we cannot map.  The only	      \
!        problem are the mapping of 0xA1A4 and 0xA1AA but as explained above    \
!        we do not do anything special here.  */				      \
      unsigned char ch = *inptr++;					      \
  									      \
      if (ch > 0x7f)							      \
*** glibc-20000914/iconvdata/testdata/GBK..UTF8.bak	Tue Dec 28 05:44:55 1999
--- glibc-20000914/iconvdata/testdata/GBK..UTF8	Sun Sep 24 00:12:17 2000
***************
*** 389,395 ****
   犘 犙 犚 犛 犜 犝 犞 犠 犡 犢 犣 犤 犥 犦 犧 犨
   犩 犪 犫 犮 犱 犲 犳 犵 犺 犻 犼 犽 犾 犿 狀 狅
   狆 狇 狉 狊 狋 狌 狏 狑 狓 狔 狕 狖 狘 狚 狛
!       、 。 · ˉ ˇ ¨ 〃 々 ― ~ ‖ … ‘ ’
   “ ” 〔 〕 〈 〉 《 》 「 」 『 』 〖 〗 【 】
   ± × ÷ ∶ ∧ ∨ ∑ ∏ ∪ ∩ ∈ ∷ √ ⊥ ∥ ∠
   ⌒ ⊙ ∫ ∮ ≡ ≌ ≈ ∽ ∝ ≠ ≮ ≯ ≤ ≥ ∞ ∵
--- 389,395 ----
   犘 犙 犚 犛 犜 犝 犞 犠 犡 犢 犣 犤 犥 犦 犧 犨
   犩 犪 犫 犮 犱 犲 犳 犵 犺 犻 犼 犽 犾 犿 狀 狅
   狆 狇 狉 狊 狋 狌 狏 狑 狓 狔 狕 狖 狘 狚 狛
!       、 。 · ˉ ˇ ¨ 〃 々 — ~ ‖ … ‘ ’
   “ ” 〔 〕 〈 〉 《 》 「 」 『 』 〖 〗 【 】
   ± × ÷ ∶ ∧ ∨ ∑ ∏ ∪ ∩ ∈ ∷ √ ⊥ ∥ ∠
   ⌒ ⊙ ∫ ∮ ≡ ≌ ≈ ∽ ∝ ≠ ≮ ≯ ≤ ≥ ∞ ∵
***************
*** 431,437 ****
      а б в г д е ё ж з и й к л м н
   о п р с т у ф х ц ч ш щ ъ ы ь э
   ю я
!  ˊ ˋ ˙ – — ‥ ‵ ℅ ℉ ↖ ↗ ↘ ↙ ∕ ∟ ∣
   ≒ ≦ ≧ ⊿ ═ ║ ╒ ╓ ╔ ╕ ╖ ╗ ╘ ╙ ╚ ╛
   ╜ ╝ ╞ ╟ ╠ ╡ ╢ ╣ ╤ ╥ ╦ ╧ ╨ ╩ ╪ ╫
   ╬ ╭ ╮ ╯ ╰ ╱ ╲ ╳ ▁ ▂ ▃ ▄ ▅ ▆ ▇
--- 431,437 ----
      а б в г д е ё ж з и й к л м н
   о п р с т у ф х ц ч ш щ ъ ы ь э
   ю я
!  ˊ ˋ ˙ – ― ‥ ‵ ℅ ℉ ↖ ↗ ↘ ↙ ∕ ∟ ∣
   ≒ ≦ ≧ ⊿ ═ ║ ╒ ╓ ╔ ╕ ╖ ╗ ╘ ╙ ╚ ╛
   ╜ ╝ ╞ ╟ ╠ ╡ ╢ ╣ ╤ ╥ ╦ ╧ ╨ ╩ ╪ ╫
   ╬ ╭ ╮ ╯ ╰ ╱ ╲ ╳ ▁ ▂ ▃ ▄ ▅ ▆ ▇

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]