From 4b65f190450f70bd5819bb5c18e3370d75ffebde Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Wed, 6 Feb 2008 18:24:50 +0000 Subject: [PATCH] * fhandler.h (fhandler_console::trunc_buf): Add to use as cache for truncated multibyte characters on input. (fhandler_console::write_replacement_char): Declare new method. * fhandler_console.cc (CONVERT_LIMIT): Raise to 64K. (fhandler_console::fhandler_console): Initialize trunc_buf. (ERR): Define as independent value again. (fhandler_console::write_replacement_char): New method to print replacement chars. (fhandler_console::write_normal): Add handling for truncated multibyte sequences. Call next_char instead of pathetic CharNextExA function. Don't change src, rather just work with found later on. * miscfuncs.cc (is_cp_multibyte): Move here from strfuncs.cc. Don't call Windows function, restrict to well-known ANSI/OEM codepages and UTF-8. (next_char): Call CharNextExA only for doublebyte codepages. Implement for UTF-8 here. * strfuncs.cc (is_cp_multibyte): Move to miscfuncs.cc. * winsup.h (next_char): Declare. * include/limits.h (MB_LEN_MAX): Set to maximum value of MB_CUR_MAX as defined by newlib for now. --- winsup/cygwin/ChangeLog | 23 ++++++ winsup/cygwin/fhandler.h | 8 +++ winsup/cygwin/fhandler_console.cc | 99 ++++++++++++++++++------- winsup/cygwin/include/limits.h | 4 +- winsup/cygwin/miscfuncs.cc | 115 +++++++++++++++++++++++++++++- winsup/cygwin/strfuncs.cc | 8 --- winsup/cygwin/winsup.h | 3 + 7 files changed, 226 insertions(+), 34 deletions(-) diff --git a/winsup/cygwin/ChangeLog b/winsup/cygwin/ChangeLog index 98b5076ce..a5213a705 100644 --- a/winsup/cygwin/ChangeLog +++ b/winsup/cygwin/ChangeLog @@ -1,3 +1,26 @@ +2008-02-06 Corinna Vinschen + + * fhandler.h (fhandler_console::trunc_buf): Add to use as cache for + truncated multibyte characters on input. + (fhandler_console::write_replacement_char): Declare new method. + * fhandler_console.cc (CONVERT_LIMIT): Raise to 64K. 
+ (fhandler_console::fhandler_console): Initialize trunc_buf. + (ERR): Define as independent value again. + (fhandler_console::write_replacement_char): New method to print + replacement chars. + (fhandler_console::write_normal): Add handling for truncated multibyte + sequences. Call next_char instead of pathetic CharNextExA function. + Don't change src, rather just work with found later on. + * miscfuncs.cc (is_cp_multibyte): Move here from strfuncs.cc. + Don't call Windows function, restrict to well-known ANSI/OEM codepages + and UTF-8. + (next_char): Call CharNextExA only for doublebyte codepages. + Implement for UTF-8 here. + * strfuncs.cc (is_cp_multibyte): Move to miscfuncs.cc. + * winsup.h (next_char): Declare. + * include/limits.h (MB_LEN_MAX): Set to maximum value of MB_CUR_MAX + as defined by newlib for now. + 2008-02-05 Corinna Vinschen * autoload.cc (CharToOemA): Remove. diff --git a/winsup/cygwin/fhandler.h b/winsup/cygwin/fhandler.h index 35d942f6d..28f34df45 100644 --- a/winsup/cygwin/fhandler.h +++ b/winsup/cygwin/fhandler.h @@ -896,6 +896,13 @@ class fhandler_console: public fhandler_termios static dev_console *dev_state; static bool invisible_console; + /* Used when we encounter a truncated multi-byte sequence. The + lead bytes are stored here and revisited in the next write call. */ + struct { + int len; + unsigned char buf[4]; /* Max len of valid UTF-8 sequence. 
*/ + } trunc_buf; + /* Output calls */ void set_default_attr (); @@ -904,6 +911,7 @@ class fhandler_console: public fhandler_termios void cursor_set (bool, int, int); void cursor_get (int *, int *); void cursor_rel (int, int); + void write_replacement_char (const unsigned char *); const unsigned char *write_normal (unsigned const char*, unsigned const char *); void char_command (char); bool set_raw_win32_keyboard_mode (bool); diff --git a/winsup/cygwin/fhandler_console.cc b/winsup/cygwin/fhandler_console.cc index f23bacb3e..cc972e10a 100644 --- a/winsup/cygwin/fhandler_console.cc +++ b/winsup/cygwin/fhandler_console.cc @@ -33,7 +33,7 @@ details. */ #include "cygtls.h" #include "registry.h" -#define CONVERT_LIMIT 16384 +#define CONVERT_LIMIT 65536 /* * Scroll the screen context. @@ -895,7 +895,9 @@ fhandler_console::tcgetattr (struct termios *t) fhandler_console::fhandler_console () : fhandler_termios () { + trunc_buf.len = 0; } + void dev_console::set_color (HANDLE h) { @@ -1037,7 +1039,7 @@ fhandler_console::cursor_get (int *x, int *y) #define ESC 2 #define NOR 0 #define IGN 4 -#if 0 +#if 1 #define ERR 5 #else #define ERR NOR @@ -1425,41 +1427,86 @@ beep () MessageBeep (MB_OK); } +/* This gets called when we found an invalid UTF-8 character. We try with + the default ANSI codepage. If that fails we just print a question mark. + Looks ugly but is a neat and almost sane fallback for many languages. 
*/ +void +fhandler_console::write_replacement_char (const unsigned char *char_p) +{ + int n; + WCHAR def_cp_chars[2]; + DWORD done; + + n = MultiByteToWideChar (GetACP (), 0, (const CHAR *) char_p, 1, + def_cp_chars, 2); + if (n) + WriteConsoleW (get_output_handle (), def_cp_chars, n, &done, 0); + else + WriteConsoleW (get_output_handle (), L"?", 1, &done, 0); +} + const unsigned char * fhandler_console::write_normal (const unsigned char *src, const unsigned char *end) { /* Scan forward to see what a char which needs special treatment */ DWORD done; - unsigned char *found = (unsigned char *) src; + DWORD buf_len; + const unsigned char *found = src; + const unsigned char *nfound; UINT cp = dev_state->get_console_cp (); - bool mb = is_cp_multibyte (cp); + + /* First check if we have cached lead bytes of a former try to write + a truncated multibyte sequence. If so, process it. */ + if (trunc_buf.len) + { + int cp_len = min (end - src, 4 - trunc_buf.len); + memcpy (trunc_buf.buf + trunc_buf.len, src, cp_len); + nfound = next_char (cp, trunc_buf.buf, + trunc_buf.buf + trunc_buf.len + cp_len); + if (!nfound) /* Invalid multibyte sequence. */ + { /* Give up and print replacement chars. */ + for (int i = 0; i < trunc_buf.len; ++i) + write_replacement_char (trunc_buf.buf + i); + } + else if (nfound == trunc_buf.buf) + { /* Still truncated multibyte sequence. */ + trunc_buf.len += cp_len; + return end; + } + else + { + /* Valid multibyte sequence. Process. */ + WCHAR buf[2]; + buf_len = dev_state->str_to_con (buf, (const char *) trunc_buf.buf, + nfound - trunc_buf.buf); + WriteConsoleW (get_output_handle (), buf, buf_len, &done, 0); + found = src + (nfound - trunc_buf.buf - trunc_buf.len); + } + /* Mark trunc_buf as unused. 
*/ + trunc_buf.len = 0; + } while (found < end && found - src < CONVERT_LIMIT && base_chars[*found] == NOR) { - if (mb && *found && *found >= 0x80) - { - unsigned char *nfound = (unsigned char *) - CharNextExA (cp, (const CHAR *) found, 0); - /* Sanity check for UTF-8 to workaround the problem in - MultiByteToWideChar, that it's not capable of using replacement - characters for invalid source chars in the given codepage. */ - if (nfound == found + 1 && cp == CP_UTF8) - *found++ = '?'; - else - found = nfound; + nfound = next_char (cp, found, end); + if (!nfound) /* Invalid multibyte sequence. */ + break; + if (nfound == found) /* Truncated multibyte sequence. */ + { /* Stick to it until the next write. */ + trunc_buf.len = end - found; + memcpy (trunc_buf.buf, found, trunc_buf.len); + return end; } - else - ++found; + found = nfound; } /* Print all the base ones out */ if (found != src) { DWORD len = found - src; - DWORD buf_len; PWCHAR buf = (PWCHAR) alloca (CONVERT_LIMIT * sizeof (WCHAR)); buf_len = dev_state->str_to_con (buf, (const char *) src, len); @@ -1490,13 +1537,14 @@ fhandler_console::write_normal (const unsigned char *src, buf += done; } while (buf_len > 0); - src = found; + if (len >= CONVERT_LIMIT) + return found; } - if (src < end) + if (found < end) { int x, y; - switch (base_chars[*src]) + switch (base_chars[*found]) { case BEL: beep (); @@ -1529,16 +1577,19 @@ fhandler_console::write_normal (const unsigned char *src, cursor_set (false, 0, y); break; case ERR: - WriteFile (get_output_handle (), src, 1, &done, 0); + WriteFile (get_output_handle (), found, 1, &done, 0); break; case TAB: cursor_get (&x, &y); cursor_set (false, 8 * (x / 8 + 1), y); break; + case NOR: + write_replacement_char (found); + break; } - src ++; + found++; } - return src; + return found; } int diff --git a/winsup/cygwin/include/limits.h b/winsup/cygwin/include/limits.h index 7f43cfb5c..e6e089da9 100644 --- a/winsup/cygwin/include/limits.h +++ b/winsup/cygwin/include/limits.h 
@@ -28,7 +28,9 @@ details. */ /* Maximum length of a multibyte character. */ #ifndef MB_LEN_MAX -#define MB_LEN_MAX 1 +/* TODO: This is newlib's max value. We should probably rather define our + own _mbtowc_r and _wctomb_r functions which are only codepage dependent. */ +#define MB_LEN_MAX 8 #endif /* Minimum and maximum values a `signed char' can hold. */ diff --git a/winsup/cygwin/miscfuncs.cc b/winsup/cygwin/miscfuncs.cc index 0ec0b4873..4edfbab94 100644 --- a/winsup/cygwin/miscfuncs.cc +++ b/winsup/cygwin/miscfuncs.cc @@ -17,7 +17,8 @@ details. */ #include #include #include -#include +#include +#include #include #include "cygthread.h" #include "cygtls.h" @@ -192,6 +193,118 @@ cygwin_strupr (char *string) return string; } +/* FIXME? We only support standard ANSI/OEM codepages according to + http://www.microsoft.com/globaldev/reference/cphome.mspx as well + as UTF-8 and codepage 1361, which is also mentioned as valid + doublebyte codepage in MSDN man pages (e.g. IsDBCSLeadByteEx). + Everything else will be hosed. */ + +bool +is_cp_multibyte (UINT cp) +{ + switch (cp) + { + case 932: + case 936: + case 949: + case 950: + case 1361: + case 65001: + return true; + } + return false; +} + +/* OMYGOD! CharNextExA is not UTF-8 aware! It only works fine with + double byte charsets. So we have to do it ourselves for UTF-8. + + While being at it, we do more. If a double-byte or multibyte + sequence is truncated due to an early end, we need a way to recognize + it. The reason is that multiple buffered write statements might + accidentally stop and start in the middle of a single character byte + sequence. If we have to interpret the byte sequences (as in + fhandler_console), we would print wrong output in these cases. 
+ + So we have four possible return values here: + + ret = end if str >= end + ret = NULL if we encounter an invalid byte sequence + ret = str if we encounter the start byte of a truncated byte sequence + ret = str + n if we encounter a valid byte sequence +*/ + +const unsigned char * +next_char (UINT cp, const unsigned char *str, const unsigned char *end) +{ + const unsigned char *ret; + + if (str >= end) + return end; + + switch (cp) + { + case 932: + case 936: + case 949: + case 950: + case 1361: + if (*str <= 0x7f) + ret = str + 1; + else if (str == end - 1 && IsDBCSLeadByteEx (cp, *str)) + ret = str; + else + ret = (const unsigned char *) CharNextExA (cp, (const CHAR *) str, 0); + break; + case CP_UTF8: + switch (str[0] >> 4) + { + case 0x0 ... 0x7: /* One byte character. */ + ret = str + 1; + break; + case 0x8 ... 0xb: /* Followup byte. Invalid as first byte. */ + ret = NULL; + break; + case 0xc ... 0xd: /* Two byte character. */ + /* Check followup bytes for validity. */ + if (str >= end - 1) + ret = str; + else if (str[1] <= 0xbf) + ret = str + 2; + else + ret = NULL; + break; + case 0xe: /* Three byte character. */ + if (str >= end - 2) + ret = str; + else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80 + && (str[0] != 0xe0 || str[1] >= 0xa0) + && (str[0] != 0xed || str[1] <= 0x9f)) + ret = str + 3; + else + ret = NULL; + break; + case 0xf: /* Four byte character. 
*/ + if (str[0] >= 0xf8) + ret = NULL; + else if (str >= end - 3) + ret = str; + else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80 + && (str[3] & 0xc0) == 0x80 + && (str[0] == 0xf0 || str[1] >= 0x90) + && (str[0] == 0xf4 || str[1] <= 0x8f)) + ret = str + 4; + else + ret = NULL; + break; + } + break; + default: + ret = str + 1; + break; + } + return ret; +} + int __stdcall check_invalid_virtual_addr (const void *s, unsigned sz) { diff --git a/winsup/cygwin/strfuncs.cc b/winsup/cygwin/strfuncs.cc index 130be76f1..23471d258 100644 --- a/winsup/cygwin/strfuncs.cc +++ b/winsup/cygwin/strfuncs.cc @@ -36,14 +36,6 @@ get_cp () return active_codepage; } -bool -is_cp_multibyte (UINT cp) -{ - CPINFO cpi; - GetCPInfo (cp, &cpi); - return cpi.MaxCharSize > 1; -} - /* tlen is always treated as the maximum buffer size, including the '\0' character. sys_wcstombs will always return a 0-terminated result, no matter what. */ diff --git a/winsup/cygwin/winsup.h b/winsup/cygwin/winsup.h index f88f23fab..952292862 100644 --- a/winsup/cygwin/winsup.h +++ b/winsup/cygwin/winsup.h @@ -110,6 +110,7 @@ extern const char case_folded_upper[]; /* The one function we use from winuser.h most of the time */ extern "C" DWORD WINAPI GetLastError (void); +/* Codepage and multibyte string specific stuff. */ enum codepage_type {ansi_cp, oem_cp, utf8_cp}; extern codepage_type current_codepage; extern UINT active_codepage; @@ -117,6 +118,8 @@ extern UINT active_codepage; void codepage_init (const char *buf); UINT get_cp (); bool is_cp_multibyte (UINT cp); +const unsigned char *next_char (UINT cp, const unsigned char *str, + const unsigned char *end); /* Used as type by sys_wcstombs_alloc and sys_mbstowcs_alloc. For a description see there. */