* libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Allow CESU-8 surrogate

value encoding.
	* libc/stdlib/wctomb_r.c (__utf8_wctomb): Allow CESU-8 surrogate
	value decoding.
This commit is contained in:
Corinna Vinschen 2009-10-03 08:51:07 +00:00
parent 9c47bbb6e9
commit 6ff28fc3b1
3 changed files with 42 additions and 38 deletions

View File

@ -1,3 +1,10 @@
2009-10-03 Corinna Vinschen <corinna@vinschen.de>
* libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Allow CESU-8 surrogate
value encoding.
* libc/stdlib/wctomb_r.c (__utf8_wctomb): Allow CESU-8 surrogate
value decoding.
2009-09-29 Corinna Vinschen <corinna@vinschen.de>
* libc/locale/locale.c (loadlocale): Allow "C." same as "C-" as locale

View File

@ -295,12 +295,6 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state),
tmp = (wchar_t)((state->__value.__wchb[0] & 0x0f) << 12)
| (wchar_t)((state->__value.__wchb[1] & 0x3f) << 6)
| (wchar_t)(ch & 0x3f);
/* Check for invalid CESU-8 encoding of UTF-16 surrogate values. */
if (tmp >= 0xd800 && tmp <= 0xdfff)
{
r->_errno = EILSEQ;
return -1;
}
*pwc = tmp;
return i;
}

View File

@ -63,72 +63,75 @@ _DEFUN (__utf8_wctomb, (r, s, wchar, charset, state),
mbstate_t *state)
{
wint_t wchar = _wchar;
int ret = 0;
if (s == NULL)
return 0; /* UTF-8 encoding is not state-dependent */
if (state->__count == -4 && (wchar < 0xdc00 || wchar >= 0xdfff))
if (sizeof (wchar_t) == 2 && state->__count == -4
&& (wchar < 0xdc00 || wchar >= 0xdfff))
{
/* At this point only the second half of a surrogate pair is valid. */
r->_errno = EILSEQ;
return -1;
/* There's a leftover lone high surrogate. Write out the CESU-8 value
of the surrogate and proceed to convert the given character. Note
to return extra 3 bytes. */
wchar_t tmp;
tmp = (state->__value.__wchb[0] << 16 | state->__value.__wchb[1] << 8)
- 0x10000 >> 10 | 0xd80d;
*s++ = 0xe0 | ((tmp & 0xf000) >> 12);
*s++ = 0x80 | ((tmp & 0xfc0) >> 6);
*s++ = 0x80 | (tmp & 0x3f);
state->__count = 0;
ret = 3;
}
if (wchar <= 0x7f)
{
*s = wchar;
return 1;
return ret + 1;
}
if (wchar >= 0x80 && wchar <= 0x7ff)
{
*s++ = 0xc0 | ((wchar & 0x7c0) >> 6);
*s = 0x80 | (wchar & 0x3f);
return 2;
return ret + 2;
}
if (wchar >= 0x800 && wchar <= 0xffff)
{
if (wchar >= 0xd800 && wchar <= 0xdfff)
/* No UTF-16 surrogate handling in UCS-4 */
if (sizeof (wchar_t) == 2 && wchar >= 0xd800 && wchar <= 0xdfff)
{
wint_t tmp;
/* UTF-16 surrogates -- must not occur in normal UCS-4 data */
if (sizeof (wchar_t) != 2)
if (wchar <= 0xdbff)
{
r->_errno = EILSEQ;
return -1;
/* First half of a surrogate pair. Store the state and
return ret + 0. */
tmp = ((wchar & 0x3ff) << 10) + 0x10000;
state->__value.__wchb[0] = (tmp >> 16) & 0xff;
state->__value.__wchb[1] = (tmp >> 8) & 0xff;
state->__count = -4;
*s = (0xf0 | ((tmp & 0x1c0000) >> 18));
return ret;
}
if (wchar >= 0xdc00)
if (state->__count == -4)
{
/* Second half of a surrogate pair. It's not valid if
we don't have already read a first half of a surrogate
before. */
if (state->__count != -4)
{
r->_errno = EILSEQ;
return -1;
}
/* If it's valid, reconstruct the full Unicode value and
return the trailing three bytes of the UTF-8 char. */
/* Second half of a surrogate pair. Reconstruct the full
Unicode value and return the trailing three bytes of the
UTF-8 character. */
tmp = (state->__value.__wchb[0] << 16)
| (state->__value.__wchb[1] << 8)
| (wchar & 0x3ff);
state->__count = 0;
*s++ = 0xf0 | ((tmp & 0x1c0000) >> 18);
*s++ = 0x80 | ((tmp & 0x3f000) >> 12);
*s++ = 0x80 | ((tmp & 0xfc0) >> 6);
*s = 0x80 | (tmp & 0x3f);
return 3;
return 4;
}
/* First half of a surrogate pair. Store the state and return
the first byte of the UTF-8 char. */
tmp = ((wchar & 0x3ff) << 10) + 0x10000;
state->__value.__wchb[0] = (tmp >> 16) & 0xff;
state->__value.__wchb[1] = (tmp >> 8) & 0xff;
state->__count = -4;
*s = (0xf0 | ((tmp & 0x1c0000) >> 18));
return 1;
/* Otherwise translate into CESU-8 value. */
}
*s++ = 0xe0 | ((wchar & 0xf000) >> 12);
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
*s = 0x80 | (wchar & 0x3f);
return 3;
return ret + 3;
}
if (wchar >= 0x10000 && wchar <= 0x10ffff)
{