This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH/RFA] Fix EUCJP multibyte/widechar conversion


Ping?

On Apr  6 12:21, Corinna Vinschen wrote:
> Hi,
> 
> while looking into a problem in the eucJP conversion under Cygwin it
> occurred to me that the eucJP character conversion in newlib is incomplete.
> 
> It only correctly recognizes and converts characters from the lower half
> of JIS-X-0201 (ASCII) and the characters from JIS-X-0208, which are
> implemented as doublebyte values {0xa1-0xfe, 0xa1-0xfe}.
> 
> It recognizes neither characters from the upper half of JIS-X-0201
> (Halfwidth Katakana), implemented in eucJP as doublebyte sequences
> {0x8e, 0xa1-0xdf}, nor characters from JIS-X-0212, implemented as
> triplebyte sequences {0x8f, 0xa1-0xfe, 0xa1-0xfe}.
> 
> This also points to a bug in locale.c.  __mb_cur_max is set to 2 for
> eucJP, even though eucJP contains triplebyte sequences.
> 
> Below is a patch which implements the missing sequences in __eucjp_mbtowc
> and __eucjp_wctomb.  It also sets __mb_cur_max to 3 in loadlocale.
> 
> The triplebyte sequences are converted to and from widechar using a
> trick borrowed from the implementation of the Windows codepage 20932,
> which is Windows' eucJP implementation.  It has only one minor flaw:  It
> is incompatible with eucJP.
> 
> Instead of the aforementioned triplebyte sequences it has a doublebyte
> substitute representation.  The leading 0x8f byte is skipped, the second
> byte is taken as is, the third byte is masked with 0x7f.  This leads to
> a well-defined doublebyte representation.  I'm using this method to de-
> and encode the widechar value for the eucJP triplebyte sequences.
> 
> I tested this patch by enabling these functions temporarily for Cygwin.
> 
> 
> Corinna
> 
> 
> 	* libc/locale/locale.c (loadlocale): Set mbc_max to 3 for EUCJP.
> 	* libc/stdlib/mbctype.h (_iseucjp1): Like _iseucjp, but also
> 	recognizes 0x8e and 0x8f lead bytes.
> 	(_iseucjp2): Rename from _iseucjp.
> 	* libc/stdlib/mbtowc_r.c (__eucjp_mbtowc): Convert JIS-X-0212
> 	triplebyte sequences as well.
> 	* libc/stdlib/wctomb_r.c (__eucjp_wctomb): Convert to JIS-X-0212
> 	triplebyte sequences as well.
> 
> 
> Index: libc/locale/locale.c
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/locale/locale.c,v
> retrieving revision 1.17
> diff -u -p -r1.17 locale.c
> --- libc/locale/locale.c	31 Mar 2009 09:31:38 -0000	1.17
> +++ libc/locale/locale.c	5 Apr 2009 20:52:23 -0000
> @@ -468,7 +468,7 @@ loadlocale(struct _reent *p, int categor
>        if (!strcmp (charset, "EUCJP") || !strcmp (charset, "eucJP"))
>  	{
>  	  strcpy (charset, "EUCJP");
> -	  mbc_max = 2;
> +	  mbc_max = 3;
>  #ifdef _MB_CAPABLE
>  	  l_wctomb = __eucjp_wctomb;
>  	  l_mbtowc = __eucjp_mbtowc;
> Index: libc/stdlib/mbctype.h
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/stdlib/mbctype.h,v
> retrieving revision 1.2
> diff -u -p -r1.2 mbctype.h
> --- libc/stdlib/mbctype.h	17 Apr 2000 17:10:17 -0000	1.2
> +++ libc/stdlib/mbctype.h	5 Apr 2009 20:52:23 -0000
> @@ -14,7 +14,8 @@ int _EXFUN(_isjis, (int c));
>  
>  #define _issjis1(c)    (((c) >= 0x81 && (c) <= 0x9f) || ((c) >= 0xe0 && (c) <= 0xef))
>  #define _issjis2(c)    (((c) >= 0x40 && (c) <= 0x7e) || ((c) >= 0x80 && (c) <= 0xfc))
> -#define _iseucjp(c)    ((c) >= 0xa1 && (c) <= 0xfe)
> +#define _iseucjp1(c)   ((c) == 0x8e || (c) == 0x8f || ((c) >= 0xa1 && (c) <= 0xfe))
> +#define _iseucjp2(c)   ((c) >= 0xa1 && (c) <= 0xfe)
>  #define _isjis(c)      ((c) >= 0x21 && (c) <= 0x7e)
>  
>  #endif /* _MBCTYPE_H_ */
> Index: libc/stdlib/mbtowc_r.c
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/stdlib/mbtowc_r.c,v
> retrieving revision 1.12
> diff -u -p -r1.12 mbtowc_r.c
> --- libc/stdlib/mbtowc_r.c	24 Mar 2009 10:13:27 -0000	1.12
> +++ libc/stdlib/mbtowc_r.c	6 Apr 2009 10:17:25 -0000
> @@ -470,7 +470,7 @@ _DEFUN (__eucjp_mbtowc, (r, pwc, s, n, c
>    ch = t[i++];
>    if (state->__count == 0)
>      {
> -      if (_iseucjp (ch))
> +      if (_iseucjp1 (ch))
>  	{
>  	  state->__value.__wchb[0] = ch;
>  	  state->__count = 1;
> @@ -481,9 +481,35 @@ _DEFUN (__eucjp_mbtowc, (r, pwc, s, n, c
>      }
>    if (state->__count == 1)
>      {
> -      if (_iseucjp (ch))
> +      if (_iseucjp2 (ch))
>  	{
> -	  *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch;
> +	  if (state->__value.__wchb[0] == 0x8f)
> +	    {
> +	      state->__value.__wchb[1] = ch;
> +	      state->__count = 2;
> +	      if (n <= i)
> +		return -2;
> +	      ch = t[i++];
> +	    }
> +	  else
> +	    {
> +	      *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch;
> +	      state->__count = 0;
> +	      return i;
> +	    }
> +	}
> +      else
> +	{
> +	  r->_errno = EILSEQ;
> +	  return -1;
> +	}
> +    }
> +  if (state->__count == 2)
> +    {
> +      if (_iseucjp2 (ch))
> +	{
> +	  *pwc = (((wchar_t)state->__value.__wchb[1]) << 8)
> +		 + (wchar_t)(ch & 0x7f);
>  	  state->__count = 0;
>  	  return i;
>  	}
> Index: libc/stdlib/wctomb_r.c
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/stdlib/wctomb_r.c,v
> retrieving revision 1.13
> diff -u -p -r1.13 wctomb_r.c
> --- libc/stdlib/wctomb_r.c	24 Mar 2009 10:13:27 -0000	1.13
> +++ libc/stdlib/wctomb_r.c	6 Apr 2009 10:17:26 -0000
> @@ -195,12 +195,19 @@ _DEFUN (__eucjp_wctomb, (r, s, wchar, ch
>    if (char1 != 0x00)
>      {
>      /* first byte is non-zero..validate multi-byte char */
> -      if (_iseucjp (char1) && _iseucjp (char2)) 
> +      if (_iseucjp1 (char1) && _iseucjp2 (char2)) 
>  	{
>  	  *s++ = (char)char1;
>  	  *s = (char)char2;
>  	  return 2;
>  	}
> +      else if (_iseucjp2 (char1) && _iseucjp2 (char2 | 0x80))
> +	{
> +	  *s++ = (char)0x8f;
> +	  *s++ = (char)char1;
> +	  *s = (char)(char2 | 0x80);
> +	  return 3;
> +	}
>        else
>  	{
>  	  r->_errno = EILSEQ;
> 
> 
> -- 
> Corinna Vinschen
> Cygwin Project Co-Leader
> Red Hat

-- 
Corinna Vinschen
Cygwin Project Co-Leader
Red Hat


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]