* libc/ctype/ctype_cp.h (_CTYPE_GEORGIAN_PS_128_254): Define.

(_CTYPE_GEORGIAN_PS_255): Define.
	(_CTYPE_PT154_128_254): Define.
	(_CTYPE_PT154_255): Define.
	(__ctype_cp): Add array members for above ctype definitions.
	* libc/locale/locale.c (loadlocale): Make TIS-620 charset name
	available for all targets.  Add guards for setting the conversion
	function pointers.  Add support for GEORGIAN-PS and PT154 charsets.
	Change documentation to reflect current behaviour more closely.
	* libc/locale/nl_langinfo.c (nl_langinfo): On Cygwin, translate
	"CP101" to "GEORGIAN-PS" and "CP102" to "PT154".
	* libc/stdlib/sb_charsets.c (__cp_conv): Add conversion arrays
	for GEORGIAN-PS and PT154.
	(__cp_index): Map invalid Windows codepage number 101 to
	GEORGIAN-PS conversion array, 102 to PT154 conversion array.
This commit is contained in:
Corinna Vinschen 2010-02-06 18:28:33 +00:00
parent 38d9821daf
commit 5eb556c849
5 changed files with 218 additions and 37 deletions

View file

@ -1,3 +1,21 @@
2010-02-06 Corinna Vinschen <corinna@vinschen.de>
* libc/ctype/ctype_cp.h (_CTYPE_GEORGIAN_PS_128_254): Define.
(_CTYPE_GEORGIAN_PS_255): Define.
(_CTYPE_PT154_128_254): Define.
(_CTYPE_PT154_255): Define.
(__ctype_cp): Add array members for above ctype definitions.
* libc/locale/locale.c (loadlocale): Make TIS-620 charset name
available for all targets. Add guards for setting the conversion
function pointers. Add support for GEORGIAN-PS and PT154 charsets.
Change documentation to reflect current behaviour more closely.
* libc/locale/nl_langinfo.c (nl_langinfo): On Cygwin, translate
"CP101" to "GEORGIAN-PS" and "CP102" to "PT154".
* libc/stdlib/sb_charsets.c (__cp_conv): Add conversion arrays
for GEORGIAN-PS and PT154.
(__cp_index): Map invalid Windows codepage number 101 to
GEORGIAN-PS conversion array, 102 to PT154 conversion array.
2010-02-06 Ralf Corsepius <ralf.corsepius@rtems.org>
* libc/posix/telldir.c: Remove bogus nested prototype of lseek().

View file

@ -433,6 +433,42 @@
_U, _U, _U, _U, _U, _U, _U, _U, \
_U, _U, _U, _U, _U, _U, _U
#define _CTYPE_CP21866_255 _U
/* Character-class flags for the upper half of the GEORGIAN-PS charset
   (handled internally as "CP101").  _CTYPE_GEORGIAN_PS_128_254 holds one
   entry per byte value 128..254 (127 entries, eight per row);
   _CTYPE_GEORGIAN_PS_255 is the entry for byte value 255.  Both are spliced
   into the per-codepage rows of __ctype_cp.
   NOTE(review): bytes 0x80, 0x8d and 0x9e are flagged _P, _U and _L here,
   while the GEORGIAN-PS row of __cp_conv maps them to U+0080, U+008D and
   U+009E respectively -- confirm these classifications are intended.  */
#define _CTYPE_GEORGIAN_PS_128_254 \
_P, 0, _P, _L, _P, _P, _P, _P, \
_P, _P, _U, _P, _U, _U, 0, 0, \
0, _P, _P, _P, _P, _P, _P, _P, \
_P, _P, _L, _P, _L, 0, _L, _U, \
_S|_B, _P, _P, _P, _P, _P, _P, _P, \
_P, _P, _P, _P, _P, _P, _P, _P, \
_P, _P, _P, _P, _P, _P, _P, _P, \
_P, _P, _P, _P, _P, _P, _P, _P, \
_U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, \
_U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, \
_U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, \
_U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, \
_U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _U|_L, _L, _L, \
_L, _L, _L, _L, _L, _L, _L, _L, \
_L, _L, _L, _L, _L, _L, _L, _P, \
_L, _L, _L, _L, _L, _L, _L
#define _CTYPE_GEORGIAN_PS_255 _L
/* Character-class flags for the upper half of the PT154 charset (handled
   internally as "CP102").  _CTYPE_PT154_128_254 holds one entry per byte
   value 128..254 (127 entries, eight per row); _CTYPE_PT154_255 is the
   entry for byte value 255.  Both are spliced into the per-codepage rows
   of __ctype_cp.  */
#define _CTYPE_PT154_128_254 \
_U, _U, _U, _L, _P, _P, _U, _U, \
_U, _L, _U, _U, _U, _U, _U, _U, \
_L, _P, _P, _P, _P, _P, _P, _P, \
_L, _L, _L, _L, _L, _L, _L, _L, \
_S|_B, _U, _L, _U, _U, _U, _U, _P, \
_U, _P, _U, _P, _P, _L, _P, _U, \
_P, _L, _U, _L, _L, _L, _P, _P, \
_L, _P, _L, _P, _L, _U, _L, _L, \
_U, _U, _U, _U, _U, _U, _U, _U, \
_U, _U, _U, _U, _U, _U, _U, _U, \
_U, _U, _U, _U, _U, _U, _U, _U, \
_U, _U, _U, _U, _U, _U, _U, _U, \
_L, _L, _L, _L, _L, _L, _L, _L, \
_L, _L, _L, _L, _L, _L, _L, _L, \
_L, _L, _L, _L, _L, _L, _L, _L, \
_L, _L, _L, _L, _L, _L, _L
#define _CTYPE_PT154_255 _L
extern int __cp_index (const char *charset_ext);
@ -442,7 +478,7 @@ extern int __cp_index (const char *charset_ext);
#ifndef __CYGWIN__
static _CONST
#endif
char __ctype_cp[24][128 + 256] = {
char __ctype_cp[26][128 + 256] = {
{ _CTYPE_CP437_128_254,
0,
_CTYPE_DATA_0_127,
@ -587,11 +623,23 @@ char __ctype_cp[24][128 + 256] = {
_CTYPE_CP21866_128_254,
_CTYPE_CP21866_255
},
{ _CTYPE_GEORGIAN_PS_128_254,
0,
_CTYPE_DATA_0_127,
_CTYPE_GEORGIAN_PS_128_254,
_CTYPE_GEORGIAN_PS_255
},
{ _CTYPE_PT154_128_254,
0,
_CTYPE_DATA_0_127,
_CTYPE_PT154_128_254,
_CTYPE_PT154_255
},
};
#else /* !defined(ALLOW_NEGATIVE_CTYPE_INDEX) */
static _CONST char __ctype_cp[22][1 + 256] = {
static _CONST char __ctype_cp[26][1 + 256] = {
{ 0,
_CTYPE_DATA_0_127,
_CTYPE_CP437_128_254,
@ -712,6 +760,16 @@ static _CONST char __ctype_cp[22][1 + 256] = {
_CTYPE_CP21866_128_254,
_CTYPE_CP21866_255
},
{ 0,
_CTYPE_DATA_0_127,
_CTYPE_GEORGIAN_PS_128_254,
_CTYPE_GEORGIAN_PS_255
},
{ 0,
_CTYPE_DATA_0_127,
_CTYPE_PT154_128_254,
_CTYPE_PT154_255
},
};
#endif /* ALLOW_NEGATIVE_CTYPE_INDEX */

View file

@ -56,34 +56,36 @@ for a given language, a three character string per ISO 639-3.
<<"TERRITORY">> is a country code per ISO 3166. For <<"charset">> and
<<"modifier">> see below.
Additionally to the POSIX specifier, seven extensions are supported for
backward compatibility with older implementations using newlib:
<<"C-UTF-8">>, <<"C-JIS">>, <<"C-eucJP">>, <<"C-SJIS">>, <<C-KOI8-R>>,
<<C-KOI8-U>>, <<"C-ISO-8859-x">> with 1 <= x <= 15, or <<"C-CPxxx">> with
xxx in [437, 720, 737, 775, 850, 852, 855, 857, 858, 862, 866, 874, 932,
1125, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258].
Instead of <<"C-">>, you can specify also <<"C.">>. Both variations allow
Additionally to the POSIX specifier, the following extension is supported
for backward compatibility with older implementations using newlib:
<<"C-charset">>.
Instead of <<"C-">>, you can also specify <<"C.">>. Both variations allow
to specify language neutral locales while using other charsets than ASCII,
for instance <<"C.UTF-8">>, which keeps all settings as in the C locale,
but uses the UTF-8 charset.
Even when using POSIX locale strings, the only charsets allowed are
The following charsets are recognized:
<<"UTF-8">>, <<"JIS">>, <<"EUCJP">>, <<"SJIS">>, <<"KOI8-R">>, <<"KOI8-U">>,
<<"ISO-8859-x">> with 1 <= x <= 15, or <<"CPxxx">> with xxx in
[437, 720, 737, 775, 850, 852, 855, 857, 858, 862, 866, 874, 932, 1125, 1250,
1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258].
<<"GEORGIAN-PS">>, <<"PT154">>, <<"TIS-620">>, <<"ISO-8859-x">> with
1 <= x <= 16, or <<"CPxxx">> with xxx in [437, 720, 737, 775, 850, 852, 855,
857, 858, 862, 866, 874, 932, 1125, 1250, 1251, 1252, 1253, 1254, 1255, 1256,
1257, 1258].
Charsets are case insensitive. For instance, <<"EUCJP">> and <<"eucJP">>
are equivalent. Charset names with dashes can also be written without
dashes, as in <<"UTF8">>, <<"iso88591">> or <<"koi8r">>. <<"EUCJP">> and
<<"EUCKR">> are also recognized with dash, <<"EUC-JP">> and <<"EUC-KR">>.
Full support for all of the above charsets requires that newlib has been
built with multibyte support and support for all ISO and Windows Codepages.
Otherwise all singlebyte charsets are simply mapped to ASCII. Right now,
only newlib for Cygwin is built with full charset support by default.
Under Cygwin, this implementation additionally supports the charsets
<<"GBK">>, <<"eucKR">>, and <<"Big5">>. Cygwin does not support <<"JIS">>.
(<<"">> is also accepted; if given, the settings are read from the
corresponding LC_* environment variables and $LANG according to POSIX rules.)
Under Cygwin, this implementation additionally supports the charsets
<<"GBK">>, <<"eucKR">>, <<"Big5">>, and <<"TIS-620">>.
This implementation also supports a single modifier, <<"cjknarrow">>.
Any other modifier is ignored. <<"cjknarrow">>, in conjunction with one
of the language specifiers <<"ja">>, <<"ko">>, and <<"zh">> specifies
@ -720,18 +722,82 @@ loadlocale(struct _reent *p, int category)
l_mbtowc = __ascii_mbtowc;
#endif
break;
#ifdef __CYGWIN__
case 'G':
case 'g':
if (strcasecmp (charset, "GBK"))
return NULL;
strcpy (charset, "GBK");
mbc_max = 2;
#ifdef __CYGWIN__
if (!strcasecmp (charset, "GBK"))
{
strcpy (charset, "GBK");
mbc_max = 2;
#ifdef _MB_CAPABLE
l_wctomb = __gbk_wctomb;
l_mbtowc = __gbk_mbtowc;
l_wctomb = __gbk_wctomb;
l_mbtowc = __gbk_mbtowc;
#endif
}
else
#endif /* __CYGWIN__ */
/* GEORGIAN-PS and the alias without dash */
if (!strncasecmp (charset, "GEORGIAN", 8))
{
c = charset + 8;
if (*c == '-')
++c;
if (strcasecmp (c, "PS"))
return NULL;
strcpy (charset, "CP101");
mbc_max = 1;
#ifdef _MB_CAPABLE
#ifdef _MB_EXTENDED_CHARSETS_WINDOWS
l_wctomb = __cp_wctomb;
l_mbtowc = __cp_mbtowc;
#else /* !_MB_EXTENDED_CHARSETS_WINDOWS */
l_wctomb = __ascii_wctomb;
l_mbtowc = __ascii_mbtowc;
#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
#endif
}
else
return NULL;
break;
case 'P':
case 'p':
/* PT154 */
if (strcasecmp (charset, "PT154"))
return NULL;
strcpy (charset, "CP102");
mbc_max = 1;
#ifdef _MB_CAPABLE
#ifdef _MB_EXTENDED_CHARSETS_WINDOWS
l_wctomb = __cp_wctomb;
l_mbtowc = __cp_mbtowc;
#else /* !_MB_EXTENDED_CHARSETS_WINDOWS */
l_wctomb = __ascii_wctomb;
l_mbtowc = __ascii_mbtowc;
#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
#endif
break;
case 'T':
case 't':
if (strncasecmp (charset, "TIS", 3))
return NULL;
c = charset + 3;
if (*c == '-')
++c;
if (strcasecmp (c, "620"))
return NULL;
strcpy (charset, "CP874");
mbc_max = 1;
#ifdef _MB_CAPABLE
#ifdef _MB_EXTENDED_CHARSETS_WINDOWS
l_wctomb = __cp_wctomb;
l_mbtowc = __cp_mbtowc;
#else /* !_MB_EXTENDED_CHARSETS_WINDOWS */
l_wctomb = __ascii_wctomb;
l_mbtowc = __ascii_mbtowc;
#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
#endif
break;
#ifdef __CYGWIN__
case 'B':
case 'b':
if (strcasecmp (charset, "BIG5"))
@ -741,17 +807,6 @@ loadlocale(struct _reent *p, int category)
#ifdef _MB_CAPABLE
l_wctomb = __big5_wctomb;
l_mbtowc = __big5_mbtowc;
#endif
break;
case 'T':
case 't':
if (strcasecmp (charset, "TIS620") && strcasecmp (charset, "TIS-620"))
return NULL;
strcpy (charset, "CP874");
mbc_max = 1;
#ifdef _MB_CAPABLE
l_wctomb = __cp_wctomb;
l_mbtowc = __cp_mbtowc;
#endif
break;
#endif /* __CYGWIN__ */

View file

@ -78,6 +78,10 @@ _DEFUN(nl_langinfo, (item),
ret = "KOI8-R";
else if (strcmp (ret + 2, "21866") == 0)
ret = "KOI8-U";
else if (strcmp (ret + 2, "101") == 0)
ret = "GEORGIAN-PS";
else if (strcmp (ret + 2, "102") == 0)
ret = "PT154";
}
else if (ret[0] == 'S'/*JIS*/)
{

View file

@ -203,7 +203,7 @@ wchar_t __iso_8859_conv[14][0x60] = {
value (function __cp_index), the second index is the value of the
incoming character - 0x80.
Values < 0x80 don't have to be converted anyway. */
wchar_t __cp_conv[24][0x80] = {
wchar_t __cp_conv[26][0x80] = {
/* CP437 */
{ 0xc7, 0xfc, 0xe9, 0xe2, 0xe4, 0xe0, 0xe5, 0xe7,
0xea, 0xeb, 0xe8, 0xef, 0xee, 0xec, 0xc4, 0xc5,
@ -611,7 +611,47 @@ wchar_t __cp_conv[24][0x80] = {
0x42e, 0x410, 0x411, 0x426, 0x414, 0x415, 0x424, 0x413,
0x425, 0x418, 0x419, 0x41a, 0x41b, 0x41c, 0x41d, 0x41e,
0x41f, 0x42f, 0x420, 0x421, 0x422, 0x423, 0x416, 0x412,
0x42c, 0x42b, 0x417, 0x428, 0x42d, 0x429, 0x427, 0x42a }
0x42c, 0x42b, 0x417, 0x428, 0x42d, 0x429, 0x427, 0x42a },
/* The following are not valid Windows codepages, but they fit nicely here.
The CP numbers are only used internally and are guaranteed not to clash
with valid Windows codepage identifiers. */
/* CP101 (GEORGIAN-PS) Georgian charset, used as the default charset in
the ka_GE locale (Georgian, Georgia). Apparently derived from Windows
CP1252. */
{ 0x80, 0x81, 0x201a, 0x192, 0x201e, 0x2026, 0x2020, 0x2021,
0x2c6, 0x2030, 0x160, 0x2039, 0x152, 0x8d, 0x8e, 0x8f,
0x90, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
0x2dc, 0x2122, 0x161, 0x203a, 0x153, 0x9d, 0x9e, 0x178,
0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
0x10d0, 0x10d1, 0x10d2, 0x10d3, 0x10d4, 0x10d5, 0x10d6, 0x10f1,
0x10d7, 0x10d8, 0x10d9, 0x10da, 0x10db, 0x10dc, 0x10f2, 0x10dd,
0x10de, 0x10df, 0x10e0, 0x10e1, 0x10e2, 0x10f3, 0x10e3, 0x10e4,
0x10e5, 0x10e6, 0x10e7, 0x10e8, 0x10e9, 0x10ea, 0x10eb, 0x10ec,
0x10ed, 0x10ee, 0x10f4, 0x10ef, 0x10f0, 0x10f5, 0xe6, 0xe7,
0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff },
/* CP102 (PT154) Cyrillic-Asian charset, used as the default charset in
the kk_KZ locale (Kazakh, Kazakhstan). */
{ 0x496, 0x492, 0x4ee, 0x493, 0x201e, 0x2026, 0x4b6, 0x4ae,
0x4b2, 0x4af, 0x4a0, 0x4e2, 0x4a2, 0x49a, 0x4ba, 0x4b8,
0x497, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
0x4b3, 0x4b7, 0x4a1, 0x4e3, 0x4a3, 0x49b, 0x4bb, 0x4b9,
0xa0, 0x40e, 0x45e, 0x408, 0x4e8, 0x498, 0x4b0, 0xa7,
0x401, 0xa9, 0x4d8, 0xab, 0xac, 0x4ef, 0xae, 0x49c,
0xb0, 0x4b1, 0x406, 0x456, 0x499, 0x4e9, 0xb6, 0xb7,
0x451, 0x2116, 0x4d9, 0xbb, 0x458, 0x4aa, 0x4ab, 0x49d,
0x410, 0x411, 0x412, 0x413, 0x414, 0x415, 0x416, 0x417,
0x418, 0x419, 0x41a, 0x41b, 0x41c, 0x41d, 0x41e, 0x41f,
0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427,
0x428, 0x429, 0x42a, 0x42b, 0x42c, 0x42d, 0x42e, 0x42f,
0x430, 0x431, 0x432, 0x433, 0x434, 0x435, 0x436, 0x437,
0x438, 0x439, 0x43a, 0x43b, 0x43c, 0x43d, 0x43e, 0x43f,
0x440, 0x441, 0x442, 0x443, 0x444, 0x445, 0x446, 0x447,
0x448, 0x449, 0x44a, 0x44b, 0x44c, 0x44d, 0x44e, 0x44f }
};
#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */
@ -727,6 +767,12 @@ __cp_index (const char *charset_ext)
case 21866:
cp_idx = 23;
break;
case 101:
cp_idx = 24;
break;
case 102:
cp_idx = 25;
break;
default:
cp_idx = -1;
break;