* libc/ctype/ctype_cp.h (_CTYPE_GEORGIAN_PS_128_254): Define.

(_CTYPE_GEORGIAN_PS_255): Define. (_CTYPE_PT154_128_254): Define. (_CTYPE_PT154_255): Define. (__ctype_cp): Add array members for above ctype definitions. * libc/locale/locale.c (loadlocale): Make TIS-620 charset name available for all targets. Add guards for setting the conversion function pointers. Add support for GEORGIAN-PS and PT154 charsets. Change documentation to reflect current behaviour more closely. * libc/locale/nl_langinfo.c (nl_langinfo): On Cygwin, translate "CP101" to "GEORGIAN-PS" and "CP102" to "PT154". * libc/stdlib/sb_charsets.c (__cp_conv): Add conversion arrays for GEORGIAN-PS and PT154. (__cp_index): Map invalid Windows codepage number 101 to GEORGIAN-PS conversion array, 102 to PT154 conversion array.
2010-02-06 18:28:33 +00:00 · 2010-02-06 18:28:33 +00:00 · 5eb556c849
parent 38d9821daf
commit 5eb556c849
5 changed files with 218 additions and 37 deletions
--- a/newlib/ChangeLog
+++ b/newlib/ChangeLog
@ -1,3 +1,21 @@
+2010-02-06  Corinna Vinschen  <corinna@vinschen.de>
+
+	* libc/ctype/ctype_cp.h (_CTYPE_GEORGIAN_PS_128_254): Define.
+	(_CTYPE_GEORGIAN_PS_255): Define.
+	(_CTYPE_PT154_128_254): Define.
+	(_CTYPE_PT154_255): Define.
+	(__ctype_cp): Add array members for above ctype definitions.
+	* libc/locale/locale.c (loadlocale): Make TIS-620 charset name
+	available for all targets.  Add guards for setting the conversion
+	function pointers.  Add support for GEORGIAN-PS and PT154 charsets.
+	Change documentation to reflect current behaviour more closely.
+	* libc/locale/nl_langinfo.c (nl_langinfo): On Cygwin, translate
+	"CP101" to "GEORGIAN-PS" and "CP102" to "PT154".
+	* libc/stdlib/sb_charsets.c (__cp_conv): Add conversion arrays
+	for GEORGIAN-PS and PT154.
+	(__cp_index): Map invalid Windows codepage number 101 to
+	GEORGIAN-PS conversion array, 102 to PT154 conversion array.
+
 2010-02-06  Ralf Corsepius  <ralf.corsepius@rtems.org>

 	* libc/posix/telldir.c: Remove bogus nested prototype of lseek().
--- a/newlib/libc/ctype/ctype_cp.h
+++ b/newlib/libc/ctype/ctype_cp.h
@ -433,6 +433,42 @@
 	_U,	_U,	_U,	_U,	_U,	_U,	_U,	_U, \
 	_U,	_U,	_U,	_U,	_U,	_U,	_U
 #define _CTYPE_CP21866_255 _U
+#define _CTYPE_GEORGIAN_PS_128_254 \
+   	_P,	0,	_P,	_L,	_P,	_P,	_P,	_P, \
+	_P,	_P,	_U,	_P,	_U,	_U,	0,	0,  \
+	0,	_P,	_P,	_P,	_P,	_P,	_P,	_P, \
+	_P,	_P,	_L,	_P,	_L,	0,	_L,	_U, \
+	_S|_B,	_P,	_P,	_P,	_P,	_P,	_P,	_P, \
+	_P,	_P,	_P,	_P,	_P,	_P,	_P,	_P, \
+	_P,	_P,	_P,	_P,	_P,	_P,	_P,	_P, \
+	_P,	_P,	_P,	_P,	_P,	_P,	_P,	_P, \
+	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L, \
+	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L, \
+	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L, \
+	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L, \
+	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_U|_L,	_L,	_L, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L,	_L, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L,	_P, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L
+#define _CTYPE_GEORGIAN_PS_255 _L
+#define _CTYPE_PT154_128_254 \
+   	_U,	_U,	_U,	_L,	_P,	_P,	_U,	_U, \
+	_U,	_L,	_U,	_U,	_U,	_U,	_U,	_U, \
+	_L,	_P,	_P,	_P,	_P,	_P,	_P,	_P, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L,	_L, \
+	_S|_B,	_U,	_L,	_U,	_U,	_U,	_U,	_P, \
+	_U,	_P,	_U,	_P,	_P,	_L,	_P,	_U, \
+	_P,	_L,	_U,	_L,	_L,	_L,	_P,	_P, \
+	_L,	_P,	_L,	_P,	_L,	_U,	_L,	_L, \
+	_U,	_U,	_U,	_U,	_U,	_U,	_U,	_U, \
+	_U,	_U,	_U,	_U,	_U,	_U,	_U,	_U, \
+	_U,	_U,	_U,	_U,	_U,	_U,	_U,	_U, \
+	_U,	_U,	_U,	_U,	_U,	_U,	_U,	_U, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L,	_L, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L,	_L, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L,	_L, \
+	_L,	_L,	_L,	_L,	_L,	_L,	_L
+#define _CTYPE_PT154_255 _L


 extern int __cp_index (const char *charset_ext);
@ -442,7 +478,7 @@ extern int __cp_index (const char *charset_ext);
 #ifndef __CYGWIN__
 static _CONST
 #endif
-char __ctype_cp[24][128 + 256] = {
+char __ctype_cp[26][128 + 256] = {
  { _CTYPE_CP437_128_254,
    0,
    _CTYPE_DATA_0_127,
@ -587,11 +623,23 @@ char __ctype_cp[24][128 + 256] = {
    _CTYPE_CP21866_128_254,
    _CTYPE_CP21866_255
  },
+  { _CTYPE_GEORGIAN_PS_128_254,
+    0,
+    _CTYPE_DATA_0_127,
+    _CTYPE_GEORGIAN_PS_128_254,
+    _CTYPE_GEORGIAN_PS_255
+  },
+  { _CTYPE_PT154_128_254,
+    0,
+    _CTYPE_DATA_0_127,
+    _CTYPE_PT154_128_254,
+    _CTYPE_PT154_255
+  },
 };

 #else /* !defined(ALLOW_NEGATIVE_CTYPE_INDEX) */

-static _CONST char __ctype_cp[22][1 + 256] = {
+static _CONST char __ctype_cp[26][1 + 256] = {
  { 0,
    _CTYPE_DATA_0_127,
    _CTYPE_CP437_128_254,
@ -712,6 +760,16 @@ static _CONST char __ctype_cp[22][1 + 256] = {
    _CTYPE_CP21866_128_254,
    _CTYPE_CP21866_255
  },
+  { 0,
+    _CTYPE_DATA_0_127,
+    _CTYPE_GEORGIAN_PS_128_254,
+    _CTYPE_GEORGIAN_PS_255
+  },
+  { 0,
+    _CTYPE_DATA_0_127,
+    _CTYPE_PT154_128_254,
+    _CTYPE_PT154_255
+  },
 };

 #endif /* ALLOW_NEGATIVE_CTYPE_INDEX */
--- a/newlib/libc/locale/locale.c
+++ b/newlib/libc/locale/locale.c
@ -56,34 +56,36 @@ for a given language, a three character string per ISO 639-3.
 <<"TERRITORY">> is a country code per ISO 3166.  For <<"charset">> and
 <<"modifier">> see below.

-Additionally to the POSIX specifier, seven extensions are supported for
-backward compatibility with older implementations using newlib:
-<<"C-UTF-8">>, <<"C-JIS">>, <<"C-eucJP">>, <<"C-SJIS">>, <<C-KOI8-R>>,
-<<C-KOI8-U>>, <<"C-ISO-8859-x">> with 1 <= x <= 15, or <<"C-CPxxx">> with
-xxx in [437, 720, 737, 775, 850, 852, 855, 857, 858, 862, 866, 874, 932,
-1125, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258].
-
-Instead of <<"C-">>, you can specify also <<"C.">>.  Both variations allow
+Additionally to the POSIX specifier, the following extension is supported
+for backward compatibility with older implementations using newlib:
+<<"C-charset">>.
+Instead of <<"C-">>, you can also specify <<"C.">>.  Both variations allow
 to specify language neutral locales while using other charsets than ASCII,
 for instance <<"C.UTF-8">>, which keeps all settings as in the C locale,
 but uses the UTF-8 charset.

-Even when using POSIX locale strings, the only charsets allowed are
+The following charsets are recognized:
 <<"UTF-8">>, <<"JIS">>, <<"EUCJP">>, <<"SJIS">>, <<"KOI8-R">>, <<"KOI8-U">>,
-<<"ISO-8859-x">> with 1 <= x <= 15, or <<"CPxxx">> with xxx in
-[437, 720, 737, 775, 850, 852, 855, 857, 858, 862, 866, 874, 932, 1125, 1250,
-1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258].
+<<"GEORGIAN-PS">>, <<"PT154">>, <<"TIS-620">>, <<"ISO-8859-x">> with
+1 <= x <= 16, or <<"CPxxx">> with xxx in [437, 720, 737, 775, 850, 852, 855,
+857, 858, 862, 866, 874, 932, 1125, 1250, 1251, 1252, 1253, 1254, 1255, 1256,
+1257, 1258].
+
 Charsets are case insensitive.  For instance, <<"EUCJP">> and <<"eucJP">>
 are equivalent.  Charset names with dashes can also be written without
 dashes, as in <<"UTF8">>, <<"iso88591">> or <<"koi8r">>.  <<"EUCJP">> and
 <<"EUCKR"> are also recognized with dash, <<"EUC-JP">> and <<"EUC-KR">>.

+Full support for all of the above charsets requires that newlib has been
+built with multibyte support and support for all ISO and Windows codepages.
+Otherwise all singlebyte charsets are simply mapped to ASCII.  Right now,
+only newlib for Cygwin is built with full charset support by default.
+Under Cygwin, this implementation additionally supports the charsets
+<<"GBK">>, <<"eucKR">>, and <<"Big5">>.  Cygwin does not support <<"JIS">>.
+
 (<<"">> is also accepted; if given, the settings are read from the
 corresponding LC_* environment variables and $LANG according to POSIX rules.

-Under Cygwin, this implementation additionally supports the charsets
-<<"GBK">>, <<"eucKR">>, <<"Big5">>, and <<"TIS-620">>.
-
 This implementation also supports a single modifier, <<"cjknarrow">>.
 Any other modifier is ignored.  <<"cjknarrow">>, in conjunction with one
 of the language specifiers <<"ja">>, <<"ko">>, and <<"zh">> specifies
@ -720,18 +722,82 @@ loadlocale(struct _reent *p, int category)
      l_mbtowc = __ascii_mbtowc;
 #endif
      break;
-#ifdef __CYGWIN__
    case 'G':
    case 'g':
-      if (strcasecmp (charset, "GBK"))
-      	return NULL;
-      strcpy (charset, "GBK");
-      mbc_max = 2;
+#ifdef __CYGWIN__
+      if (!strcasecmp (charset, "GBK"))
+      	{
+	  strcpy (charset, "GBK");
+	  mbc_max = 2;
 #ifdef _MB_CAPABLE
-      l_wctomb = __gbk_wctomb;
-      l_mbtowc = __gbk_mbtowc;
+	  l_wctomb = __gbk_wctomb;
+	  l_mbtowc = __gbk_mbtowc;
+#endif
+	}
+      else
+#endif /* __CYGWIN__ */
+      /* GEORGIAN-PS and the alias without dash */
+      if (!strncasecmp (charset, "GEORGIAN", 8))
+	{
+	  c = charset + 8;
+	  if (*c == '-')
+	    ++c;
+	  if (strcasecmp (c, "PS"))
+	    return NULL;
+	  strcpy (charset, "CP101");
+	  mbc_max = 1;
+#ifdef _MB_CAPABLE
+#ifdef _MB_EXTENDED_CHARSETS_WINDOWS
+	  l_wctomb = __cp_wctomb;
+	  l_mbtowc = __cp_mbtowc;
+#else /* !_MB_EXTENDED_CHARSETS_WINDOWS */
+	  l_wctomb = __ascii_wctomb;
+	  l_mbtowc = __ascii_mbtowc;
+#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
+#endif
+	}
+      else
+	return NULL;
+      break;
+    case 'P':
+    case 'p':
+      /* PT154 */
+      if (strcasecmp (charset, "PT154"))
+	return NULL;
+      strcpy (charset, "CP102");
+      mbc_max = 1;
+#ifdef _MB_CAPABLE
+#ifdef _MB_EXTENDED_CHARSETS_WINDOWS
+      l_wctomb = __cp_wctomb;
+      l_mbtowc = __cp_mbtowc;
+#else /* !_MB_EXTENDED_CHARSETS_WINDOWS */
+      l_wctomb = __ascii_wctomb;
+      l_mbtowc = __ascii_mbtowc;
+#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
 #endif
      break;
+    case 'T':
+    case 't':
+      if (strncasecmp (charset, "TIS", 3))
+      	return NULL;
+      c = charset + 3;
+      if (*c == '-')
+	++c;
+      if (strcasecmp (c, "620"))
+      	return NULL;
+      strcpy (charset, "CP874");
+      mbc_max = 1;
+#ifdef _MB_CAPABLE
+#ifdef _MB_EXTENDED_CHARSETS_WINDOWS
+      l_wctomb = __cp_wctomb;
+      l_mbtowc = __cp_mbtowc;
+#else /* !_MB_EXTENDED_CHARSETS_WINDOWS */
+      l_wctomb = __ascii_wctomb;
+      l_mbtowc = __ascii_mbtowc;
+#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
+#endif
+      break;
+#ifdef __CYGWIN__
    case 'B':
    case 'b':
      if (strcasecmp (charset, "BIG5"))
@ -741,17 +807,6 @@ loadlocale(struct _reent *p, int category)
 #ifdef _MB_CAPABLE
      l_wctomb = __big5_wctomb;
      l_mbtowc = __big5_mbtowc;
-#endif
-      break;
-    case 'T':
-    case 't':
-      if (strcasecmp (charset, "TIS620") && strcasecmp (charset, "TIS-620"))
-      	return NULL;
-      strcpy (charset, "CP874");
-      mbc_max = 1;
-#ifdef _MB_CAPABLE
-      l_wctomb = __cp_wctomb;
-      l_mbtowc = __cp_mbtowc;
 #endif
      break;
 #endif /* __CYGWIN__ */
--- a/newlib/libc/locale/nl_langinfo.c
+++ b/newlib/libc/locale/nl_langinfo.c
@ -78,6 +78,10 @@ _DEFUN(nl_langinfo, (item),
 		      ret = "KOI8-R";
 		    else if (strcmp (ret + 2, "21866") == 0)
 		      ret = "KOI8-U";
+		    else if (strcmp (ret + 2, "101") == 0)
+		      ret = "GEORGIAN-PS";
+		    else if (strcmp (ret + 2, "102") == 0)
+		      ret = "PT154";
 		  }
 		else if (ret[0] == 'S'/*JIS*/)
 		  {
--- a/newlib/libc/stdlib/sb_charsets.c
+++ b/newlib/libc/stdlib/sb_charsets.c
@ -203,7 +203,7 @@ wchar_t __iso_8859_conv[14][0x60] = {
   value (function __cp_index), the second index is the value of the
   incoming character - 0x80.
   Values < 0x80 don't have to be converted anyway. */
-wchar_t __cp_conv[24][0x80] = {
+wchar_t __cp_conv[26][0x80] = {
  /* CP437 */
  { 0xc7, 0xfc, 0xe9, 0xe2, 0xe4, 0xe0, 0xe5, 0xe7,
    0xea, 0xeb, 0xe8, 0xef, 0xee, 0xec, 0xc4, 0xc5,
@ -611,7 +611,47 @@ wchar_t __cp_conv[24][0x80] = {
    0x42e, 0x410, 0x411, 0x426, 0x414, 0x415, 0x424, 0x413,
    0x425, 0x418, 0x419, 0x41a, 0x41b, 0x41c, 0x41d, 0x41e,
    0x41f, 0x42f, 0x420, 0x421, 0x422, 0x423, 0x416, 0x412,
-    0x42c, 0x42b, 0x417, 0x428, 0x42d, 0x429, 0x427, 0x42a }
+    0x42c, 0x42b, 0x417, 0x428, 0x42d, 0x429, 0x427, 0x42a },
+  /* The following are not valid Windows codepages, but they fit nicely here.
+     The CP numbers are only used internally and are guaranteed not to clash
+     with valid Windows codepage identifiers. */
+  /* CP101 (GEORGIAN-PS)  Georgian charset, used as the default charset in
+     the ka_GE locale (Georgian, Georgia).  Apparently derived from Windows
+     CP1252. */
+  { 0x80, 0x81, 0x201a, 0x192, 0x201e, 0x2026, 0x2020, 0x2021,
+    0x2c6, 0x2030, 0x160, 0x2039, 0x152, 0x8d, 0x8e, 0x8f,
+    0x90, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
+    0x2dc, 0x2122, 0x161, 0x203a, 0x153, 0x9d, 0x9e, 0x178, 
+    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 
+    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 
+    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 
+    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 
+    0x10d0, 0x10d1, 0x10d2, 0x10d3, 0x10d4, 0x10d5, 0x10d6, 0x10f1,
+    0x10d7, 0x10d8, 0x10d9, 0x10da, 0x10db, 0x10dc, 0x10f2, 0x10dd,
+    0x10de, 0x10df, 0x10e0, 0x10e1, 0x10e2, 0x10f3, 0x10e3, 0x10e4,
+    0x10e5, 0x10e6, 0x10e7, 0x10e8, 0x10e9, 0x10ea, 0x10eb, 0x10ec,
+    0x10ed, 0x10ee, 0x10f4, 0x10ef, 0x10f0, 0x10f5, 0xe6, 0xe7,
+    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 
+    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 
+    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff },
+  /* CP102 (PT154) Cyrillic-Asian charset, used as the default charset in
+     the kk_KZ locale (Kazakh, Kazakhstan). */
+  { 0x496, 0x492, 0x4ee, 0x493, 0x201e, 0x2026, 0x4b6, 0x4ae,
+    0x4b2, 0x4af, 0x4a0, 0x4e2, 0x4a2, 0x49a, 0x4ba, 0x4b8, 
+    0x497, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
+    0x4b3, 0x4b7, 0x4a1, 0x4e3, 0x4a3, 0x49b, 0x4bb, 0x4b9, 
+    0xa0, 0x40e, 0x45e, 0x408, 0x4e8, 0x498, 0x4b0, 0xa7,
+    0x401, 0xa9, 0x4d8, 0xab, 0xac, 0x4ef, 0xae, 0x49c,
+    0xb0, 0x4b1, 0x406, 0x456, 0x499, 0x4e9, 0xb6, 0xb7,
+    0x451, 0x2116, 0x4d9, 0xbb, 0x458, 0x4aa, 0x4ab, 0x49d, 
+    0x410, 0x411, 0x412, 0x413, 0x414, 0x415, 0x416, 0x417, 
+    0x418, 0x419, 0x41a, 0x41b, 0x41c, 0x41d, 0x41e, 0x41f, 
+    0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427, 
+    0x428, 0x429, 0x42a, 0x42b, 0x42c, 0x42d, 0x42e, 0x42f, 
+    0x430, 0x431, 0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 
+    0x438, 0x439, 0x43a, 0x43b, 0x43c, 0x43d, 0x43e, 0x43f, 
+    0x440, 0x441, 0x442, 0x443, 0x444, 0x445, 0x446, 0x447, 
+    0x448, 0x449, 0x44a, 0x44b, 0x44c, 0x44d, 0x44e, 0x44f }
 };
 #endif /* _MB_EXTENDED_CHARSETS_WINDOWS */

@ -727,6 +767,12 @@ __cp_index (const char *charset_ext)
    case 21866:
      cp_idx = 23;
      break;
+    case 101:
+      cp_idx = 24;
+      break;
+    case 102:
+      cp_idx = 25;
+      break;
    default:
      cp_idx = -1;
      break;