* fhandler.h (class dev_console): Constify charset parameter of
	str_to_con.
	* fhandler_console.cc (dev_console::con_to_str): Simplify.  Always
	default to the current internal locale.
	(dev_console::get_console_cp): Always use codepage 437 for alternate
	charset.
	(dev_console::str_to_con): Constify charset parameter.
	(fhandler_console::write_normal): Always use codepage 437 for alternate
	charset.  Otherwise always default to the current internal locale.
	Replace ASCII SO with ASCII CAN.
	* strfuncs.cc: Tweak comments according to below changes.
	(sys_cp_wcstombs): Constify charset parameter.  Convert all wchar_t
	values in the Unicode private use area U+F0xx to the singlebyte
	counterpart.  Drop special handling creating ASCII SO sequence from
	U+DCxx value.  Rearrange for performance.  Replace ASCII SO with
	ASCII CAN.
	(sys_cp_mbstowcs): Constify charset parameter.  Replace ASCII SO with
	ASCII CAN.  Drop special case for U+DCxx ASCII SO sequences.  Always
	create a replacement from the Unicode private use area U+F0xx for
	invalid byte values in a multibyte sequence.  Do the same for wchar_t
	values from the U+F0xx range to make them roundtrip safe.
	* wchar.h (sys_cp_wcstombs): Constify charset parameter.
	(sys_cp_mbstowcs): Ditto.
This commit is contained in:
Corinna Vinschen 2009-09-28 12:10:32 +00:00
parent d856640e1c
commit 587b75f7bd
5 changed files with 101 additions and 84 deletions

View File

@ -1,3 +1,29 @@
2009-09-28 Corinna Vinschen <corinna@vinschen.de>
* fhandler.h (class dev_console): Constify charset parameter of
str_to_con.
* fhandler_console.cc (dev_console::con_to_str): Simplify. Always
default to the current internal locale.
(dev_console::get_console_cp): Always use codepage 437 for alternate
charset.
(dev_console::str_to_con): Constify charset parameter.
(fhandler_console::write_normal): Always use codepage 437 for alternate
charset. Otherwise always default to the current internal locale.
Replace ASCII SO with ASCII CAN.
* strfuncs.cc: Tweak comments according to below changes.
(sys_cp_wcstombs): Constify charset parameter. Convert all wchar_t
values in the Unicode private use area U+F0xx to the singlebyte
counterpart. Drop special handling creating ASCII SO sequence from
U+DCxx value. Rearrange for performance. Replace ASCII SO with
ASCII CAN.
(sys_cp_mbstowcs): Constify charset parameter. Replace ASCII SO with
ASCII CAN. Drop special case for U+DCxx ASCII SO sequences. Always
create a replacement from the Unicode private use area U+F0xx for
invalid byte values in a multibyte sequence. Do the same for wchar_t
values from the U+F0xx range to make them roundtrip safe.
* wchar.h (sys_cp_wcstombs): Constify charset parameter.
(sys_cp_mbstowcs): Ditto.
2009-09-28 Corinna Vinschen <corinna@vinschen.de>
* cygheap.cc (cygheap_init): Default locale.charset to "UTF-8".

View File

@ -934,7 +934,7 @@ class dev_console
inline UINT get_console_cp ();
DWORD con_to_str (char *d, int dlen, WCHAR w);
DWORD str_to_con (mbtowc_p, char *, PWCHAR d, const char *s, DWORD sz);
DWORD str_to_con (mbtowc_p, const char *, PWCHAR d, const char *s, DWORD sz);
void set_color (HANDLE);
bool fillin_info (HANDLE);
void set_default_attr ();

View File

@ -127,19 +127,19 @@ tty_list::get_tty (int n)
inline DWORD
dev_console::con_to_str (char *d, int dlen, WCHAR w)
{
return sys_cp_wcstombs (*cygheap->locale.charset == 'A'
? __ascii_wctomb : cygheap->locale.wctomb,
cygheap->locale.charset, d, dlen, &w, 1);
return sys_cp_wcstombs (cygheap->locale.wctomb, cygheap->locale.charset,
d, dlen, &w, 1);
}
inline UINT
dev_console::get_console_cp ()
{
return alternate_charset_active ? GetConsoleOutputCP () : 0;
/* The alternate charset is always 437, just as in the Linux console. */
return alternate_charset_active ? 437 : 0;
}
inline DWORD
dev_console::str_to_con (mbtowc_p f_mbtowc, char *charset,
dev_console::str_to_con (mbtowc_p f_mbtowc, const char *charset,
PWCHAR d, const char *s, DWORD sz)
{
return sys_cp_mbstowcs (f_mbtowc, charset, d, CONVERT_LIMIT, s, sz);
@ -1454,16 +1454,19 @@ fhandler_console::write_normal (const unsigned char *src,
size_t ret;
mbstate_t ps;
UINT cp = dev_state->get_console_cp ();
char charsetbuf[ENCODING_LEN + 1];
char *charset;
const char *charset;
mbtowc_p f_mbtowc;
if (cp)
f_mbtowc = __set_charset_from_codepage (cp, charset = charsetbuf);
{
/* The alternate charset is always 437, just as in the Linux console. */
f_mbtowc = __cp_mbtowc;
charset = "CP437";
}
else
{
f_mbtowc = cygheap->locale.mbtowc;
charset = cygheap->locale.charset;
f_mbtowc = (*charset == 'A') ? __ascii_mbtowc : cygheap->locale.mbtowc;
}
/* First check if we have cached lead bytes of a former try to write
@ -1606,10 +1609,10 @@ fhandler_console::write_normal (const unsigned char *src,
cursor_set (false, 0, y);
break;
case ERR:
/* Don't print chars marked as ERR chars, except for a SO sequence
which is printed as singlebyte chars from the UTF Basic Latin
and Latin 1 Supplement plains. */
if (*found == 0x0e)
/* Don't print chars marked as ERR chars, except for an ASCII CAN
sequence which is printed as singlebyte chars from the UTF
Basic Latin and Latin 1 Supplement plains. */
if (*found == 0x18)
{
write_replacement_char ();
if (found + 1 < end)

View File

@ -310,8 +310,7 @@ __big5_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
Called from newlib's setlocale() with codepage set to 0, if the
charset isn't given explicitly in the POSIX compatible locale specifier.
The function also returns a pointer to the corresponding _mbtowc_r
function. Also called from fhandler_console::write_normal() if the
"Alternate Charset" has been switched on by an escape sequence. */
function. */
extern "C" mbtowc_p
__set_charset_from_codepage (UINT cp, char *charset)
{
@ -400,17 +399,17 @@ __set_charset_from_codepage (UINT cp, char *charset)
multibyte charset, then usually you wouldn't be able to access the
file. To fix this problem, sys_wcstombs creates a replacement multibyte
sequence for the non-representable wide-char. The sequence starts with
an ASCII SO (0x0e, Ctrl-N), followed by the UTF-8 representation of the
character. The sys_(cp_)mbstowcs function detects ASCII SO characters
an ASCII CAN (0x18, Ctrl-X), followed by the UTF-8 representation of the
character. The sys_(cp_)mbstowcs function detects ASCII CAN characters
in the input multibyte string and converts the following multibyte
sequence by treating it as a UTF-8 char. If that fails, the ASCII
SO was probably standalone and it gets just copied over as ASCII SO.
CAN was probably standalone and it gets just copied over as ASCII CAN.
- The functions always create 0-terminated results, no matter what.
If the result is truncated due to buffer size, it's a bug in Cygwin
and the buffer in the calling function should be raised. */
size_t __stdcall
sys_cp_wcstombs (wctomb_p f_wctomb, char *charset, char *dst, size_t len,
sys_cp_wcstombs (wctomb_p f_wctomb, const char *charset, char *dst, size_t len,
const wchar_t *src, size_t nwc)
{
char buf[10];
@ -426,46 +425,47 @@ sys_cp_wcstombs (wctomb_p f_wctomb, char *charset, char *dst, size_t len,
while (n < len && nwc-- > 0)
{
wchar_t pw = *pwcs;
/* Convert UNICODE private use area. Reverse functionality (only for
path names) is transform_chars in path.cc. */
if ((pw & 0xff00) == 0xf000)
pw &= 0xff;
int bytes = f_wctomb (_REENT, buf, pw, charset, &ps);
if (bytes == -1 && (pw & 0xff00) == 0xdc00)
int bytes;
/* Convert UNICODE private use area. Reverse functionality for the
ASCII area <= 0x7f (only for path names) is transform_chars in
path.cc. Reverse functionality for invalid bytes in a multibyte
sequence is in sys_cp_mbstowcs. */
if ((pw & 0xff00) == 0xf000 && ((pw & 0xff) <= 0x7f || MB_CUR_MAX > 1))
{
/* Reverse functionality of the single invalid second half of a
surrogate pair in the 0xDCxx range specifying an invalid byte
value when converting from MB to WC.
The comment in sys_cp_mbstowcs below explains it. */
buf[0] = 0x0e; /* ASCII SO */
buf[1] = 0xff;
buf[2] = (char) (pw & 0xff);
bytes = 3;
}
else if (bytes == -1 && *charset != 'U'/*TF-8*/)
buf[0] = pw & 0xff;
bytes = 1;
}
else
{
/* Convert chars invalid in the current codepage to a sequence
ASCII SO; UTF-8 representation of invalid char. */
buf[0] = 0x0e; /* ASCII SO */
bytes = __utf8_wctomb (_REENT, buf + 1, pw, charset, &ps);
if (bytes == -1)
bytes = f_wctomb (_REENT, buf, pw, charset, &ps);
if (bytes == -1 && *charset != 'U'/*TF-8*/)
{
++pwcs;
ps.__count = 0;
continue;
}
++bytes; /* Add the ASCII SO to the byte count. */
if (ps.__count == -4 && nwc > 0) /* First half of a surrogate pair. */
{
++pwcs;
if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. */
/* Convert chars invalid in the current codepage to a sequence
ASCII CAN; UTF-8 representation of invalid char. */
buf[0] = 0x18; /* ASCII CAN */
bytes = __utf8_wctomb (_REENT, buf + 1, pw, charset, &ps);
if (bytes == -1)
{
++pwcs;
ps.__count = 0;
continue;
}
bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, charset, &ps);
nwc--;
++bytes; /* Add the ASCII CAN to the byte count. */
if (ps.__count == -4 && nwc > 0)
{
/* First half of a surrogate pair. */
++pwcs;
if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. */
{
++pwcs;
ps.__count = 0;
continue;
}
bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, charset,
&ps);
nwc--;
}
}
}
if (n + bytes <= len)
@ -535,8 +535,8 @@ sys_wcstombs_alloc (char **dst_p, int type, const wchar_t *src, size_t nwc)
charset, which is the charset returned by GetConsoleCP (). Most of the
time this is used for box and line drawing characters. */
size_t __stdcall
sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen,
const char *src, size_t nms)
sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
size_t dlen, const char *src, size_t nms)
{
wchar_t *ptr = dst;
unsigned const char *pmbs = (unsigned const char *) src;
@ -551,10 +551,10 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen,
len = (size_t)-1;
while (len > 0 && nms > 0)
{
/* ASCII SO handling. */
if (*pmbs == 0x0e)
/* ASCII CAN handling. */
if (*pmbs == 0x18)
{
/* Sanity check: If this is a lead SO byte for a following UTF-8
/* Sanity check: If this is a lead CAN byte for a following UTF-8
sequence, there must be at least two more bytes left, and the
next byte must be a valid UTF-8 start byte. If the charset
isn't UTF-8 anyway, try to convert the following bytes as UTF-8
@ -565,16 +565,16 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen,
nms - 1, charset, &ps);
if (bytes < 0)
{
/* Invalid UTF-8 sequence? Treat the ASCII SO character as
stand-alone ASCII SO char. */
/* Invalid UTF-8 sequence? Treat the ASCII CAN character as
stand-alone ASCII CAN char. */
bytes = 1;
if (dst)
*ptr = 0x0e;
*ptr = 0x18;
memset (&ps, 0, sizeof ps);
}
else
{
++bytes; /* Count SO byte */
++bytes; /* Count CAN byte */
if (bytes > 1 && ps.__count == 4)
{
/* First half of a surrogate. */
@ -594,40 +594,28 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen,
}
}
}
/* Sequence for an invalid byte originally created in the next outer
else branch below. This must be converted back to a 0xDCxx value
as well. */
else if (nms > 2 && pmbs[1] == 0xff)
{
bytes = 3;
if (dst)
*ptr = L'\xdc80' | pmbs[2];
}
/* Otherwise it's just a simple ASCII SO. */
/* Otherwise it's just a simple ASCII CAN. */
else
{
bytes = 1;
if (dst)
*ptr = 0x0e;
*ptr = 0x18;
}
}
else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
charset, &ps)) < 0
&& *pmbs >= 0x80)
|| (bytes == 3 && pmbs[0] == 0xef && (pmbs[1] & 0xf4) == 0x80))
{
/* This should probably be handled in f_mbtowc which can operate
on sequences rather than individual characters.
The technique is based on a discussion here:
/* The technique is based on a discussion here:
http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
This is hardly perfect. Windows doesn't do anything sensical with
characters converted to this format. It does allow processing of
src to continue, however, which, since there is no way to signal
decoding errors, seems like the best we can do. */
Invalid bytes in a multibyte sequence are converted to
the private use area which is already used to store ASCII
chars invalid in Windows filenames. This technique allows
storing them in a symmetric way. */
bytes = 1;
if (dst)
*ptr = L'\xdc80' | *pmbs;
*ptr = L'\xf000' | *pmbs;
memset (&ps, 0, sizeof ps);
}

View File

@ -51,7 +51,7 @@ extern mbtowc_p __set_charset_from_codepage (unsigned int cp, char *charset);
#endif
#ifdef __INSIDE_CYGWIN__
size_t __stdcall sys_cp_wcstombs (wctomb_p, char *, char *, size_t,
size_t __stdcall sys_cp_wcstombs (wctomb_p, const char *, char *, size_t,
const wchar_t *, size_t = (size_t) -1)
__attribute__ ((regparm(3)));
size_t __stdcall sys_wcstombs (char *dst, size_t len, const wchar_t * src,
@ -61,7 +61,7 @@ size_t __stdcall sys_wcstombs_alloc (char **, int, const wchar_t *,
size_t = (size_t) -1)
__attribute__ ((regparm(3)));
size_t __stdcall sys_cp_mbstowcs (mbtowc_p, char *, wchar_t *, size_t,
size_t __stdcall sys_cp_mbstowcs (mbtowc_p, const char *, wchar_t *, size_t,
const char *, size_t = (size_t) -1)
__attribute__ ((regparm(3)));
size_t __stdcall sys_mbstowcs (wchar_t * dst, size_t dlen, const char *src,