From 4b65f190450f70bd5819bb5c18e3370d75ffebde Mon Sep 17 00:00:00 2001
From: Corinna Vinschen <corinna@vinschen.de>
Date: Wed, 6 Feb 2008 18:24:50 +0000
Subject: [PATCH] 	* fhandler.h (fhandler_console::trunc_buf): Add to use
 as cache for 	truncated multibyte characters on input. 
 (fhandler_console::write_replacement_char): Declare new method. 	*
 fhandler_console.cc (CONVERT_LIMIT): Raise to 64K. 
 (fhandler_console::fhandler_console): Initialize trunc_buf. 	(ERR): Define
 as independent value again. 	(fhandler_console::write_replacement_char):
 New method to print 	replacement chars. 
 (fhandler_console::write_normal): Add handling for truncated multibyte 
 sequences.  Call next_char instead of pathetic CharNextExA function. 	Don't
 change src, rather just work with found later on. 	* miscfuncs.cc
 (is_cp_multibyte): Move here from strfuncs.cc. 	Don't call Windows
 function, restrict to well-known ANSI/OEM codepages 	and UTF-8. 
 (next_char): Call CharNextExA only for doublebyte codepages. 	Implement for
 UTF-8 here. 	* strfuncs.cc (is_cp_multibyte): Move to miscfuncs.cc. 	*
 winsup.h (next_char): Declare. 	* include/limits.h (MB_LEN_MAX): Set
 to maximum value of MB_CUR_MAX 	as defined by newlib for now.

---
 winsup/cygwin/ChangeLog           |  23 ++++++
 winsup/cygwin/fhandler.h          |   8 +++
 winsup/cygwin/fhandler_console.cc |  99 ++++++++++++++++++-------
 winsup/cygwin/include/limits.h    |   4 +-
 winsup/cygwin/miscfuncs.cc        | 115 +++++++++++++++++++++++++++++-
 winsup/cygwin/strfuncs.cc         |   8 ---
 winsup/cygwin/winsup.h            |   3 +
 7 files changed, 226 insertions(+), 34 deletions(-)

diff --git a/winsup/cygwin/ChangeLog b/winsup/cygwin/ChangeLog
index 98b5076ce..a5213a705 100644
--- a/winsup/cygwin/ChangeLog
+++ b/winsup/cygwin/ChangeLog
@@ -1,3 +1,26 @@
+2008-02-06  Corinna Vinschen  <corinna@vinschen.de>
+
+	* fhandler.h (fhandler_console::trunc_buf): Add to use as cache for
+	truncated multibyte characters on input.
+	(fhandler_console::write_replacement_char): Declare new method.
+	* fhandler_console.cc (CONVERT_LIMIT): Raise to 64K.
+	(fhandler_console::fhandler_console): Initialize trunc_buf.
+	(ERR): Define as independent value again.
+	(fhandler_console::write_replacement_char): New method to print
+	replacement chars.
+	(fhandler_console::write_normal): Add handling for truncated multibyte
+	sequences.  Call next_char instead of pathetic CharNextExA function.
+	Don't change src, rather just work with found later on.
+	* miscfuncs.cc (is_cp_multibyte): Move here from strfuncs.cc.
+	Don't call Windows function, restrict to well-known ANSI/OEM codepages
+	and UTF-8.
+	(next_char): Call CharNextExA only for doublebyte codepages.
+	Implement for UTF-8 here.
+	* strfuncs.cc (is_cp_multibyte): Move to miscfuncs.cc.
+	* winsup.h (next_char): Declare.
+	* include/limits.h (MB_LEN_MAX): Set to maximum value of MB_CUR_MAX
+	as defined by newlib for now.
+
 2008-02-05  Corinna Vinschen  <corinna@vinschen.de>
 
 	* autoload.cc (CharToOemA): Remove.
diff --git a/winsup/cygwin/fhandler.h b/winsup/cygwin/fhandler.h
index 35d942f6d..28f34df45 100644
--- a/winsup/cygwin/fhandler.h
+++ b/winsup/cygwin/fhandler.h
@@ -896,6 +896,13 @@ class fhandler_console: public fhandler_termios
   static dev_console *dev_state;
   static bool invisible_console;
 
+  /* Used when we encounter a truncated multi-byte sequence.  The
+     lead bytes are stored here and revisited in the next write call. */
+  struct {
+    int len;
+    unsigned char buf[4]; /* Max len of valid UTF-8 sequence. */
+  } trunc_buf;
+
 /* Output calls */
   void set_default_attr ();
 
@@ -904,6 +911,7 @@ class fhandler_console: public fhandler_termios
   void cursor_set (bool, int, int);
   void cursor_get (int *, int *);
   void cursor_rel (int, int);
+  void write_replacement_char (const unsigned char *);
   const unsigned char *write_normal (unsigned const char*, unsigned const char *);
   void char_command (char);
   bool set_raw_win32_keyboard_mode (bool);
diff --git a/winsup/cygwin/fhandler_console.cc b/winsup/cygwin/fhandler_console.cc
index f23bacb3e..cc972e10a 100644
--- a/winsup/cygwin/fhandler_console.cc
+++ b/winsup/cygwin/fhandler_console.cc
@@ -33,7 +33,7 @@ details. */
 #include "cygtls.h"
 #include "registry.h"
 
-#define CONVERT_LIMIT 16384
+#define CONVERT_LIMIT 65536
 
 /*
  * Scroll the screen context.
@@ -895,7 +895,9 @@ fhandler_console::tcgetattr (struct termios *t)
 fhandler_console::fhandler_console () :
   fhandler_termios ()
 {
+  trunc_buf.len = 0;
 }
+
 void
 dev_console::set_color (HANDLE h)
 {
@@ -1037,7 +1039,7 @@ fhandler_console::cursor_get (int *x, int *y)
 #define ESC 2
 #define NOR 0
 #define IGN 4
-#if 0
+#if 1
 #define ERR 5
 #else
 #define ERR NOR
@@ -1425,41 +1427,86 @@ beep ()
   MessageBeep (MB_OK);
 }
 
+/* Print a stand-in for the single byte at CHAR_P, which the caller could
+   not use as part of a valid multibyte sequence.  The byte is converted
+   using the default ANSI codepage; if that fails, a question mark is
+   printed.  Looks ugly but is a neat and almost sane fallback for many
+   languages. */
+void
+fhandler_console::write_replacement_char (const unsigned char *char_p)
+{
+  WCHAR wbuf[2];
+  DWORD written;
+
+  int cnt = MultiByteToWideChar (GetACP (), 0, (const CHAR *) char_p, 1,
+				 wbuf, 2);
+  /* A zero count means the conversion failed; fall back to "?". */
+  const WCHAR *out = cnt ? wbuf : L"?";
+  WriteConsoleW (get_output_handle (), out, cnt ? cnt : 1, &written, 0);
+}
+
 const unsigned char *
 fhandler_console::write_normal (const unsigned char *src,
 				const unsigned char *end)
 {
   /* Scan forward to see what a char which needs special treatment */
   DWORD done;
-  unsigned char *found = (unsigned char *) src;
+  DWORD buf_len;
+  const unsigned char *found = src;
+  const unsigned char *nfound;
   UINT cp = dev_state->get_console_cp ();
-  bool mb = is_cp_multibyte (cp);
+
+  /* First check if we have cached lead bytes of a former try to write 
+     a truncated multibyte sequence.  If so, process it. */
+  if (trunc_buf.len)
+    {
+      int cp_len = min (end - src, 4 - trunc_buf.len);
+      memcpy (trunc_buf.buf + trunc_buf.len, src, cp_len);
+      nfound = next_char (cp, trunc_buf.buf,
+			  trunc_buf.buf + trunc_buf.len + cp_len);
+      if (!nfound)		/* Invalid multibyte sequence. */
+        {			/* Give up and print replacement chars. */
+	  for (int i = 0; i < trunc_buf.len; ++i)
+	    write_replacement_char (trunc_buf.buf + i);
+	}
+      else if (nfound == trunc_buf.buf)
+	{			/* Still truncated multibyte sequence. */
+	  trunc_buf.len += cp_len;
+	  return end;
+	}
+      else
+	{
+	  /* Valid multibyte sequence.  Process. */
+	  WCHAR buf[2];
+	  buf_len = dev_state->str_to_con (buf, (const char *) trunc_buf.buf,
+					   nfound - trunc_buf.buf);
+	  WriteConsoleW (get_output_handle (), buf, buf_len, &done, 0);
+	  found = src + (nfound - trunc_buf.buf - trunc_buf.len);
+	}
+      /* Mark trunc_buf as unused. */
+      trunc_buf.len = 0;
+    }
 
   while (found < end
 	 && found - src < CONVERT_LIMIT
 	 && base_chars[*found] == NOR)
     {
-      if (mb && *found && *found >= 0x80)
-	{
-	  unsigned char *nfound = (unsigned char *)
-				  CharNextExA (cp, (const CHAR *) found, 0);
-	  /* Sanity check for UTF-8 to workaround the problem in
-	     MultiByteToWideChar, that it's not capable of using replacement
-	     characters for invalid source chars in the given codepage. */
-	  if (nfound == found + 1 && cp == CP_UTF8)
-	    *found++ = '?';
-	  else
-	    found = nfound;
+      nfound = next_char (cp, found, end);
+      if (!nfound)		/* Invalid multibyte sequence. */
+	break;
+      if (nfound == found)	/* Truncated multibyte sequence. */
+        {			/* Stick to it until the next write. */
+	  trunc_buf.len = end - found;
+	  memcpy (trunc_buf.buf, found, trunc_buf.len);
+	  return end;
 	}
-      else
-	++found;
+      found = nfound;
     }
 
   /* Print all the base ones out */
   if (found != src)
     {
       DWORD len = found - src;
-      DWORD buf_len;
       PWCHAR buf = (PWCHAR) alloca (CONVERT_LIMIT * sizeof (WCHAR));
 
       buf_len = dev_state->str_to_con (buf, (const char *) src, len);
@@ -1490,13 +1537,14 @@ fhandler_console::write_normal (const unsigned char *src,
 	  buf += done;
 	}
       while (buf_len > 0);
-      src = found;
+      if (len >= CONVERT_LIMIT)
+	return found;
     }
 
-  if (src < end)
+  if (found < end)
     {
       int x, y;
-      switch (base_chars[*src])
+      switch (base_chars[*found])
 	{
 	case BEL:
 	  beep ();
@@ -1529,16 +1577,19 @@ fhandler_console::write_normal (const unsigned char *src,
 	  cursor_set (false, 0, y);
 	  break;
 	case ERR:
-	  WriteFile (get_output_handle (), src, 1, &done, 0);
+	  WriteFile (get_output_handle (), found, 1, &done, 0);
 	  break;
 	case TAB:
 	  cursor_get (&x, &y);
 	  cursor_set (false, 8 * (x / 8 + 1), y);
 	  break;
+	case NOR:
+	  write_replacement_char (found);
+	  break;
 	}
-      src ++;
+      found++;
     }
-  return src;
+  return found;
 }
 
 int
diff --git a/winsup/cygwin/include/limits.h b/winsup/cygwin/include/limits.h
index 7f43cfb5c..e6e089da9 100644
--- a/winsup/cygwin/include/limits.h
+++ b/winsup/cygwin/include/limits.h
@@ -28,7 +28,9 @@ details. */
 
 /* Maximum length of a multibyte character.  */
 #ifndef MB_LEN_MAX
-#define MB_LEN_MAX 1
+/* TODO: This is newlib's max value.  We should probably rather define our
+   own _mbtowc_r and _wctomb_r functions which are only codepage dependent. */
+#define MB_LEN_MAX 8
 #endif
 
 /* Minimum and maximum values a `signed char' can hold.  */
diff --git a/winsup/cygwin/miscfuncs.cc b/winsup/cygwin/miscfuncs.cc
index 0ec0b4873..4edfbab94 100644
--- a/winsup/cygwin/miscfuncs.cc
+++ b/winsup/cygwin/miscfuncs.cc
@@ -17,7 +17,8 @@ details. */
 #include <alloca.h>
 #include <limits.h>
 #include <wchar.h>
-#include <winbase.h>
+#include <wingdi.h>
+#include <winuser.h>
 #include <winnls.h>
 #include "cygthread.h"
 #include "cygtls.h"
@@ -192,6 +193,118 @@ cygwin_strupr (char *string)
   return string;
 }
 
+/* FIXME?  We only support standard ANSI/OEM codepages according to
+   http://www.microsoft.com/globaldev/reference/cphome.mspx as well
+   as UTF-8 and codepage 1361, which is also mentioned as valid
+   doublebyte codepage in MSDN man pages (e.g. IsDBCSLeadByteEx).
+   Everything else will be hosed. */
+
+bool
+is_cp_multibyte (UINT cp)
+{
+  /* Codepages treated as multibyte: the doublebyte codepages 932, 936,
+     949, 950 and 1361, plus 65001 (UTF-8).  Any other codepage is
+     reported as not multibyte. */
+  static const UINT mb_codepages[] = { 932, 936, 949, 950, 1361, 65001 };
+  const UINT *const last = mb_codepages
+			   + sizeof mb_codepages / sizeof *mb_codepages;
+
+  for (const UINT *p = mb_codepages; p < last; ++p)
+    if (*p == cp)
+      return true;
+  return false;
+}
+
+/* OMYGOD!  CharNextExA is not UTF-8 aware!  It only works fine with
+   double byte charsets.  So we have to do it ourselves for UTF-8.
+
+   While being at it, we do more.  If a double-byte or multibyte
+   sequence is truncated due to an early end, we need a way to recognize
+   it.  The reason is that multiple buffered write statements might
+   accidentally stop and start in the middle of a single character byte
+   sequence.  If we have to interpret the byte sequences (as in
+   fhandler_console), we would print wrong output in these cases.
+
+   UTF-8 lead bytes 0xc0, 0xc1 and 0xf5 and up, overlong forms and
+   encoded surrogates are invalid.  Four possible return values:
+   ret = end      if str >= end
+   ret = NULL     if we encounter an invalid byte sequence
+   ret = str      if we encounter the start byte of a truncated byte sequence
+   ret = str + n  if we encounter a valid byte sequence */
+
+const unsigned char *
+next_char (UINT cp, const unsigned char *str, const unsigned char *end)
+{
+  const unsigned char *ret;
+
+  if (str >= end)
+    return end;
+
+  switch (cp)
+    {
+    case 932:
+    case 936:
+    case 949:
+    case 950:
+    case 1361:
+      if (*str <= 0x7f)
+	ret = str + 1;
+      else if (str == end - 1 && IsDBCSLeadByteEx (cp, *str))
+	ret = str;		/* Lead byte is the last byte: truncated. */
+      else
+	ret = (const unsigned char *) CharNextExA (cp, (const CHAR *) str, 0);
+      break;
+    case CP_UTF8:
+      switch (str[0] >> 4)
+	{
+	case 0x0 ... 0x7:	/* One byte character. */
+	  ret = str + 1;
+	  break;
+	case 0x8 ... 0xb:	/* Followup byte.  Invalid as first byte. */
+	  ret = NULL;
+	  break;
+	case 0xc ... 0xd:	/* Two byte character. */
+	  if (str[0] < 0xc2)	/* 0xc0/0xc1 only start overlong forms. */
+	    ret = NULL;
+	  else if (end - str < 2)
+	    ret = str;
+	  else if ((str[1] & 0xc0) == 0x80)	/* Must be 10xxxxxx. */
+	    ret = str + 2;
+	  else
+	    ret = NULL;
+	  break;
+	case 0xe:		/* Three byte character. */
+	  if (end - str < 3)
+	    ret = str;
+	  else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80
+		   && (str[0] != 0xe0 || str[1] >= 0xa0)	/* Not overlong. */
+		   && (str[0] != 0xed || str[1] <= 0x9f))	/* No surrogate. */
+	    ret = str + 3;
+	  else
+	    ret = NULL;
+	  break;
+	case 0xf:		/* Four byte character. */
+	  if (str[0] > 0xf4)	/* Would encode beyond U+10FFFF. */
+	    ret = NULL;
+	  else if (end - str < 4)
+	    ret = str;
+	  else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80
+		   && (str[3] & 0xc0) == 0x80
+		   && (str[0] != 0xf0 || str[1] >= 0x90)	/* Not overlong. */
+		   && (str[0] != 0xf4 || str[1] <= 0x8f))	/* <= U+10FFFF. */
+	    ret = str + 4;
+	  else
+	    ret = NULL;
+	  break;
+	}
+      break;
+    default:
+      ret = str + 1;
+      break;
+    }
+  return ret;
+}
+
 int __stdcall
 check_invalid_virtual_addr (const void *s, unsigned sz)
 {
diff --git a/winsup/cygwin/strfuncs.cc b/winsup/cygwin/strfuncs.cc
index 130be76f1..23471d258 100644
--- a/winsup/cygwin/strfuncs.cc
+++ b/winsup/cygwin/strfuncs.cc
@@ -36,14 +36,6 @@ get_cp ()
   return active_codepage;
 }
 
-bool
-is_cp_multibyte (UINT cp)
-{
-  CPINFO cpi; 
-  GetCPInfo (cp, &cpi);
-  return cpi.MaxCharSize > 1;
-}
-
 /* tlen is always treated as the maximum buffer size, including the '\0'
    character.  sys_wcstombs will always return a 0-terminated result, no
    matter what. */
diff --git a/winsup/cygwin/winsup.h b/winsup/cygwin/winsup.h
index f88f23fab..952292862 100644
--- a/winsup/cygwin/winsup.h
+++ b/winsup/cygwin/winsup.h
@@ -110,6 +110,7 @@ extern const char case_folded_upper[];
 /* The one function we use from winuser.h most of the time */
 extern "C" DWORD WINAPI GetLastError (void);
 
+/* Codepage and multibyte string specific stuff. */
 enum codepage_type {ansi_cp, oem_cp, utf8_cp};
 extern codepage_type current_codepage;
 extern UINT active_codepage;
@@ -117,6 +118,8 @@ extern UINT active_codepage;
 void codepage_init (const char *buf);
 UINT get_cp ();
 bool is_cp_multibyte (UINT cp);
+const unsigned char *next_char (UINT cp, const unsigned char *str,
+				const unsigned char *end);
 
 /* Used as type by sys_wcstombs_alloc and sys_mbstowcs_alloc.  For a
    description see there. */