This is the mail archive of the libc-alpha@sourceware.cygnus.com mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
gettext shall use iconv

To: libc-alpha at sourceware dot cygnus dot com
Subject: gettext shall use iconv
From: Bruno Haible <haible at ilog dot fr>
Date: Mon, 20 Dec 1999 17:01:03 +0100 (MET)

Hello Ulrich,

In the current glibc CVS you have put code to use gconv (when _LIBC) or
iconv (when HAVE_ICONV) to convert the messages on the fly to the current
locale's encoding. To make this work outside of glibc, a few more patches
are needed. Appended below. Of course the AM_GNU_GETTEXT macro needs to
be modified to test for <langinfo.h> and 'iconv'.

Besides that, there is an API problem. My program, which calls gettext(),
doesn't know the encoding of the string returned by gettext().
It needs to know, because its internal representation is Unicode.
Of course, I could setenv("OUTPUT_CHARSET","UTF-8"), but that would have
an effect on other libraries in the same process, and even on subprocesses,
which is undesirable.

One solution would be to export the internal function used by _nl_load_domain
that returns the target codeset: Move the code

          outcharset = getenv ("OUTPUT_CHARSET");
          if (outcharset == NULL || outcharset[0] == '\0')
            outcharset = ....;

into a function called '_nl_langinfo_codeset', and document this function.

Another solution would be to introduce a function that lets the program
declare which encoding it desires to receive from gettext(). Similarly
to 'bindtextdomain', which declares the directory name, we would
introduce a function

      void bind_textdomain_codeset (const char *domain_name,
                                    const char *encoding);

The advantage of this latter solution is obviously that strings will not
be translated twice (by gettext() from the .gmo encoding to the current
locale's encoding and then by the application to UTF-8 or UCS-2).

What do you think?

Bruno

        * loadmsgcat.c (_GNU_SOURCE): Define before all system includes,
        otherwise it has no effect.
        (string.h): Include.
        (langinfo.h): Also include if !_LIBC && HAVE_ICONV && HAVE_LANGINFO_H.
        (get_locale_charset): New function.
        (_nl_load_domain): If !_LIBC, call get_locale_charset. Allocate a
        conversion descriptor only if outcharset and charset are different.
        * dcgettext.c (iconv_string): New function.
        (_nl_find_msg): If !_LIBC && HAVE_ICONV, use iconv_string to convert
        the result string.

*** loadmsgcat.c.orig	Sun Dec 19 20:24:49 1999
--- loadmsgcat.c	Sun Dec 19 23:19:31 1999
***************
*** 19,24 ****
--- 19,29 ----
  # include <config.h>
  #endif
  
+ /* When using the GNU C library, tell <langinfo.h> to define CODESET. */
+ #ifndef _GNU_SOURCE
+ # define _GNU_SOURCE    1
+ #endif
+ 
  #include <fcntl.h>
  #include <sys/types.h>
  #include <sys/stat.h>
***************
*** 27,37 ****
  # include <stdlib.h>
  #endif
  
  #if defined HAVE_UNISTD_H || defined _LIBC
  # include <unistd.h>
  #endif
  
! #ifdef _LIBC
  # include <langinfo.h>
  #endif
  
--- 32,48 ----
  # include <stdlib.h>
  #endif
  
+ #if defined HAVE_STRING_H || defined _LIBC
+ # include <string.h>
+ #else
+ # include <strings.h>
+ #endif
+ 
  #if defined HAVE_UNISTD_H || defined _LIBC
  # include <unistd.h>
  #endif
  
! #if defined _LIBC || (defined HAVE_ICONV && defined HAVE_LANGINFO_H)
  # include <langinfo.h>
  #endif
  
***************
*** 70,75 ****
--- 81,447 ----
  int _nl_msg_cat_cntr = 0;
  
  
+ #if ! defined _LIBC && defined HAVE_ICONV
+ 
+ /* Returns a canonical name for the character encoding used in the
+    current locale. Returns NULL if it cannot be determined.  */
+ static const char *get_locale_charset ();
+ 
+ # if HAVE_LANGINFO_H
+ 
+ static const char *
+ get_locale_charset ()
+ {
+   return nl_langinfo (CODESET);  /* codeset name as reported by <langinfo.h> */
+ }
+ 
+ # else
+ 
+ static const char *
+ get_locale_charset ()
+ {
+   const char* locale_charset = NULL;
+   const char * locale;
+   /* Precedence used here: LC_ALL first, then LC_CTYPE, then LANG.  */
+   locale = getenv ("LC_ALL");
+   if (!locale || !*locale)
+     {
+       locale = getenv ("LC_CTYPE");
+       if (!locale || !*locale)
+         locale = getenv ("LANG");
+     }
+   if (locale && *locale)
+     {
+       char* buf = (char*) malloc (strlen (locale) + 1);
+       const char* codeset = NULL;
+       if (buf == NULL)
+         return NULL;  /* out of memory: charset cannot be determined */
+       {
+         const char* cp = locale;
+         for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++)
+           {
+             if (*cp == '.')
+               {
+                 codeset = ++cp;
+                 for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++);
+                 if (*cp != '\0')
+                   {
+                     size_t n = cp - codeset;
+                     memcpy (buf, codeset, n);
+                     buf[n] = '\0';
+                     codeset = buf;
+                   }
+                 break;
+               }
+           }
+       }
+ 
+       if (codeset)
+         {
+           /* Canonicalize the charset given after the dot. */
+ 
+           if (   ! strcmp (codeset, "ISO8859-1")
+               || ! strcmp (codeset, "ISO_8859-1")
+               || ! strcmp (codeset, "iso88591")
+               || ! strcmp (codeset, "88591")
+               || ! strcmp (codeset, "88591.en")
+               || ! strcmp (codeset, "8859")
+               || ! strcmp (codeset, "8859.in")
+               || ! strcmp (codeset, "ascii")
+              )
+             locale_charset = "ISO-8859-1";
+           else
+           if (   ! strcmp (codeset, "ISO8859-2")
+               || ! strcmp (codeset, "ISO_8859-2")
+               || ! strcmp (codeset, "iso88592")
+              )
+             locale_charset = "ISO-8859-2";
+           else
+           if (   ! strcmp (codeset, "ISO8859-5")
+               || ! strcmp (codeset, "ISO_8859-5")
+               || ! strcmp (codeset, "iso88595")
+              )
+             locale_charset = "ISO-8859-5";
+           else
+           if (   ! strcmp (codeset, "ISO8859-6")
+               || ! strcmp (codeset, "ISO_8859-6")
+               || ! strcmp (codeset, "iso88596")
+              )
+             locale_charset = "ISO-8859-6";
+           else
+           if (   ! strcmp (codeset, "ISO8859-7")
+               || ! strcmp (codeset, "ISO_8859-7")
+               || ! strcmp (codeset, "iso88597")
+              )
+             locale_charset = "ISO-8859-7";
+           else
+           if (   ! strcmp (codeset, "ISO8859-8")
+               || ! strcmp (codeset, "iso88598")
+              )
+             locale_charset = "ISO-8859-8";
+           else
+           if (   ! strcmp (codeset, "ISO8859-9")
+               || ! strcmp (codeset, "ISO_8859-9")
+               || ! strcmp (codeset, "iso88599")
+              )
+             locale_charset = "ISO-8859-9";
+           else
+           if (! strcmp (codeset, "KOI8-R"))
+             locale_charset = "KOI8-R";
+           else
+           if (! strcmp (codeset, "KOI8-U"))
+             locale_charset = "KOI8-U";
+           else
+           if (   ! strcmp (codeset, "eucJP")
+               || ! strcmp (codeset, "ujis")
+               || ! strcmp (codeset, "AJEC")
+              )
+             locale_charset = "eucJP";
+           else
+           if (   ! strcmp (codeset, "JIS7")
+               || ! strcmp (codeset, "jis7")
+               || ! strcmp (codeset, "JIS")
+               || ! strcmp (codeset, "ISO-2022-JP")
+              )
+             locale_charset = "ISO-2022-JP"; /* was: "JIS7"; */
+           else
+           if (   ! strcmp (codeset, "SJIS")
+               || ! strcmp (codeset, "mscode")
+               || ! strcmp (codeset, "932")
+              )
+             locale_charset = "SJIS";
+           else
+           if (   ! strcmp (codeset, "eucKR")
+               || ! strcmp (codeset, "949")
+              )
+             locale_charset = "eucKR";
+           else
+           if (! strcmp (codeset, "eucCN"))
+             locale_charset = "eucCN";
+           else
+           if (! strcmp (codeset, "eucTW"))
+             locale_charset = "eucTW";
+           else
+           if (! strcmp (codeset, "TACTIS"))
+             locale_charset = "TIS-620"; /* was: "TACTIS"; */
+           else
+           if (! strcmp (codeset, "EUC") || ! strcmp (codeset, "euc"))
+             {
+               if (locale[0] == 'j' && locale[1] == 'a')
+                 locale_charset = "eucJP";
+               else if (locale[0] == 'k' && locale[1] == 'o')
+                 locale_charset = "eucKR";
+               else if (locale[0] == 'z' && locale[1] == 'h' && locale[2] == '_')
+                 {
+                   if (locale[3] == 'C' && locale[4] == 'N')
+                     locale_charset = "eucCN";
+                   else if (locale[3] == 'T' && locale[4] == 'W')
+                     locale_charset = "eucTW";
+                 }
+             }
+           else
+           /* The following are CLISP extensions. */
+           if (   ! strcmp (codeset, "UTF-8")
+               || ! strcmp (codeset, "utf8")
+              )
+             locale_charset = "UTF-8";
+         }
+       else
+         {
+           /* No dot found. Choose a default, based on locale. */
+ 
+           if (   !strcmp(locale,"iso_8859_1")
+               || !strcmp(locale,"ISO8859-1")
+               || !strcmp(locale,"ISO-8859-1")
+              )
+             locale_charset = "ISO-8859-1";
+           else
+           if (0)
+             locale_charset = "ISO-8859-2";
+           else
+           if (0)
+             locale_charset = "ISO-8859-5";
+           else
+           if (0)
+             locale_charset = "ISO-8859-6";
+           else
+           if (0)
+             locale_charset = "ISO-8859-7";
+           else
+           if (0)
+             locale_charset = "ISO-8859-8";
+           else
+           if (0)
+             locale_charset = "ISO-8859-9";
+           else
+           if (0)
+             locale_charset = "KOI8-R";
+           else
+           if (0)
+             locale_charset = "KOI8-U";
+           else
+           if (0)
+             locale_charset = "eucJP";
+           else
+           if (0)
+             locale_charset = "ISO-2022-JP"; /* was: "JIS7"; */
+           else
+           if (0)
+             locale_charset = "SJIS";
+           else
+           if (0)
+             locale_charset = "eucKR";
+           else
+           if (!strcmp(locale,"zh_CN") || !strcmp(locale,"zh")
+              )
+             locale_charset = "eucCN";
+           else
+           if (!strcmp(locale,"zh_TW")
+              )
+             locale_charset = "eucTW";
+           else
+           if (0)
+             locale_charset = "TIS-620"; /* was: "TACTIS"; */
+           else
+             {
+               /* Choose a default, based on the language only. */
+ 
+               const char* underscore = strchr (locale, '_');
+               const char* lang;
+ 
+               if (underscore)
+                 {
+                   size_t n = underscore - locale;
+                   memcpy (buf, locale, n);
+                   buf[n] = '\0';
+                   lang = buf;
+                 }
+               else
+                 lang = locale;
+ 
+               if (   ! strcmp (lang, "af") || ! strcmp (lang, "afrikaans")
+                   || ! strcmp (lang, "ca") || ! strcmp (lang, "catalan")
+                   || ! strcmp (lang, "da") || ! strcmp (lang, "danish")
+                                         || ! strcmp (lang, "dansk")
+                   || ! strcmp (lang, "de") || ! strcmp (lang, "german")
+                                         || ! strcmp (lang, "deutsch")
+                   || ! strcmp (lang, "en") || ! strcmp (lang, "english")
+                   || ! strcmp (lang, "es") || ! strcmp (lang, "spanish")
+                   || ! strcmp (lang, "eu") || ! strcmp (lang, "basque")
+                   || ! strcmp (lang, "fi") || ! strcmp (lang, "finnish")
+                   || ! strcmp (lang, "fo") || ! strcmp (lang, "faroese")
+                                         || ! strcmp (lang, "faeroese")
+                   || ! strcmp (lang, "fr") || ! strcmp (lang, "french")
+                   || ! strcmp (lang, "ga") || ! strcmp (lang, "irish")
+                   || ! strcmp (lang, "gd") || ! strcmp (lang, "scottish")
+                   || ! strcmp (lang, "gl") || ! strcmp (lang, "galician")
+                   || ! strcmp (lang, "is") || ! strcmp (lang, "icelandic")
+                   || ! strcmp (lang, "it") || ! strcmp (lang, "italian")
+                   || ! strcmp (lang, "nl") || ! strcmp (lang, "dutch")
+                   || ! strcmp (lang, "no") || ! strcmp (lang, "norwegian")
+                   || ! strcmp (lang, "pt") || ! strcmp (lang, "portuguese")
+                   || ! strcmp (lang, "sv") || ! strcmp (lang, "swedish")
+                  )
+                 locale_charset = "ISO-8859-1";
+               else
+               if (   ! strcmp (lang, "cs") || ! strcmp (lang, "czech")
+                   || ! strcmp (lang, "cz")
+                   || ! strcmp (lang, "hr") || ! strcmp (lang, "croatian")
+                   || ! strcmp (lang, "hu") || ! strcmp (lang, "hungarian")
+                   || ! strcmp (lang, "pl") || ! strcmp (lang, "polish")
+                   || ! strcmp (lang, "ro") || ! strcmp (lang, "romanian")
+                                         || ! strcmp (lang, "rumanian")
+                   || ! strcmp (lang, "sh")
+                   || ! strcmp (lang, "sk") || ! strcmp (lang, "slovak")
+                   || ! strcmp (lang, "sl") || ! strcmp (lang, "slovene")
+                                         || ! strcmp (lang, "slovenian")
+                   || ! strcmp (lang, "sq") || ! strcmp (lang, "albanian")
+                  )
+                 locale_charset = "ISO-8859-2";
+               else
+               if (   ! strcmp (lang, "eo") || ! strcmp (lang, "esperanto")
+                   || ! strcmp (lang, "mt") || ! strcmp (lang, "maltese")
+                  )
+                 locale_charset = "ISO-8859-3";
+               else
+               if (   ! strcmp (lang, "be") || ! strcmp (lang, "byelorussian")
+                   || ! strcmp (lang, "bg") || ! strcmp (lang, "bulgarian")
+                   || ! strcmp (lang, "mk") || ! strcmp (lang, "macedonian")
+                   || ! strcmp (lang, "sp")
+                   || ! strcmp (lang, "sr") || ! strcmp (lang, "serbian")
+                  )
+                 locale_charset = "ISO-8859-5";
+               else
+               if (! strcmp (lang, "ar") || ! strcmp (lang, "arabic")
+                  )
+                 locale_charset = "ISO-8859-6";
+               else
+               if (! strcmp (lang, "el") || ! strcmp (lang, "greek")
+                  )
+                 locale_charset = "ISO-8859-7";
+               else
+               if (! strcmp (lang, "iw") || ! strcmp (lang, "he")
+                                      || ! strcmp (lang, "hebrew")
+                  )
+                 locale_charset = "ISO-8859-8";
+               else
+               if (! strcmp (lang, "tr") || ! strcmp (lang, "turkish")
+                  )
+                 locale_charset = "ISO-8859-9";
+               else
+               if (   ! strcmp (lang, "et") || ! strcmp (lang, "estonian")
+                   || ! strcmp (lang, "lt") || ! strcmp (lang, "lithuanian")
+                   || ! strcmp (lang, "lv") || ! strcmp (lang, "latvian")
+                  )
+                 locale_charset = "ISO-8859-10";
+               else
+               if (! strcmp (lang, "ru") || ! strcmp (lang, "russian")
+                  )
+                 locale_charset = "KOI8-R";
+               else
+               if (! strcmp (lang, "uk") || ! strcmp (lang, "ukrainian")
+                  )
+                 locale_charset = "KOI8-U";
+               else
+               if (   ! strcmp (lang, "ja")
+                   || ! strcmp (lang, "Jp")
+                   || ! strcmp (lang, "japan")
+                   || ! strcmp (lang, "Japanese-EUC")
+                  )
+                 locale_charset = "eucJP";
+               else
+               if (0)
+                 locale_charset = "ISO-2022-JP"; /* was: "JIS7"; */
+               else
+               if (! strcmp (lang, "japanese")
+                  )
+                 locale_charset = "SJIS";
+               else
+               if (! strcmp (lang, "ko") || ! strcmp (lang, "korean")
+                  )
+                 locale_charset = "eucKR";
+               else
+               if (! strcmp (lang, "chinese-s")
+                  )
+                 locale_charset = "eucCN";
+               else
+               if (! strcmp (lang, "chinese-t")
+                  )
+                 locale_charset = "eucTW";
+               else
+               if (! strcmp (lang, "th")
+                  )
+                 locale_charset = "TIS-620"; /* was: "TACTIS"; */
+             }
+         }
+       free (buf);
+     }
+   return locale_charset;
+ }
+ # endif
+ #endif
+ 
+ 
  /* Load the message catalogs specified by FILENAME.  If it is no valid
     message catalog do nothing.  */
  void
***************
*** 253,259 ****
--- 625,637 ----
  	     set up so we provide a possibility to override this.  */
  	  outcharset = getenv ("OUTPUT_CHARSET");
  	  if (outcharset == NULL || outcharset[0] == '\0')
+ #ifdef _LIBC
  	    outcharset = (*_nl_current[LC_CTYPE])->values[_NL_ITEM_INDEX (CODESET)].string;
+ #else
+ # if HAVE_ICONV
+ 	    outcharset = get_locale_charset ();
+ # endif
+ #endif
  
  #ifdef _LIBC
  	  if (__gconv_open (outcharset, charset, &domain->conv,
***************
*** 262,268 ****
  	    domain->conv = (__gconv_t) -1;
  #else
  # if HAVE_ICONV
! 	  domain->conv = iconv_open (outcharset, charset);
  # endif
  #endif
  	}
--- 640,647 ----
  	    domain->conv = (__gconv_t) -1;
  #else
  # if HAVE_ICONV
! 	  if (strcmp (outcharset, charset))
! 	    domain->conv = iconv_open (outcharset, charset);
  # endif
  #endif
  	}
*** dcgettext.c.orig	Sun Dec 19 20:19:39 1999
--- dcgettext.c	Sun Dec 19 23:50:25 1999
***************
*** 397,402 ****
--- 397,486 ----
  #endif
  
  
+ #if ! defined _LIBC && defined HAVE_ICONV
+ 
+ /* Converts the NUL-terminated string starting at START using the
+    conversion descriptor CD. The result is allocated using malloc() and is
+    owned by the caller. Returns NULL if an allocation or conversion error occurs.  */
+ 
+ static char *
+ iconv_string (cd, start)
+      iconv_t cd;
+      const char *start;
+ {
+   const char *end = start + strlen (start) + 1;  /* convert the NUL too */
+   size_t dummy = 0;
+   size_t length;
+   char *result;
+ 
+   /* Determine the length we need.  E2BIG only means TMPBUF filled up.  */
+   iconv (cd, NULL, NULL, NULL, &dummy);
+   {
+     size_t count = 0;
+     char tmpbuf[4096];
+     const char *inptr = start;
+     size_t insize = end - start;
+     while (insize > 0)
+       {
+         char *outptr = tmpbuf;
+         size_t outsize = sizeof(tmpbuf);
+         size_t res = iconv (cd, &inptr, &insize, &outptr, &outsize);
+         if (res == (size_t)(-1) && errno != E2BIG)
+           {
+             if (errno == EINVAL)
+               break;
+             else
+               return NULL;
+           }
+         count += outptr - tmpbuf;
+       }
+     {
+       char *outptr = tmpbuf;
+       size_t outsize = sizeof(tmpbuf);
+       size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
+       if (res == (size_t)(-1))
+         return NULL;
+       count += outptr - tmpbuf;
+     }
+     length = count;
+   }
+ 
+   result = malloc (length);
+   if (result == NULL)
+     return NULL;
+ 
+   /* Do the conversion for real.  Every failure below must free RESULT.  */
+   iconv (cd, NULL, NULL, NULL, &dummy);
+   {
+     const char *inptr = start;
+     size_t insize = end - start;
+     char *outptr = result;
+     size_t outsize = length;
+     while (insize > 0)
+       {
+         size_t res = iconv (cd, &inptr, &insize, &outptr, &outsize);
+         if (res == (size_t)(-1))
+           {
+             if (errno == EINVAL)
+               break;
+             free (result);
+             return NULL;
+           }
+       }
+     if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
+       {
+         free (result);
+         return NULL;
+       }
+     if (outsize != 0)
+       abort ();
+   }
+ 
+   return result;
+ }
+ #endif
+ 
+ 
  char *
  internal_function
  _nl_find_msg (domain_file, msgid)
***************
*** 511,516 ****
--- 595,606 ----
  
  		out:
  		  __libc_lock_unlock (lock);
+ #else
+ # if HAVE_ICONV
+ 		  char *converted_result = iconv_string (domain->conv, result);
+ 		  if (converted_result != NULL)
+ 		    domain->conv_tab[idx] = converted_result;
+ # endif
  #endif
  		}
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]