* regex/engine.c (step): Drop Cygwin-specific definition.

(NONCHAR): Cast the argument to int so that the test works.  Move
	comment from step here.
	(matcher): Disable skipping initial string in multibyte case.
	* regex/regcomp.c (p_bracket): Don't simplify singleton in the invert
	case.
	(p_b_term): Handle early end of pattern after dash in bracket
	expression.
	(singleton): Don't ignore the wides just because there's already a
	singleton in the single byte chars.  Fix condition for a singleton
	wide accordingly.
	(findmust): Check for LC_CTYPE charset, rather than LC_COLLATE charset.
	* regex2.h (CHIN): Fix condition in the icase & invert case.
	(ISWORD): Fix wrong cast to unsigned char.
This commit is contained in:
Corinna Vinschen 2010-02-11 21:19:19 +00:00
parent 45c8c6469a
commit 44caccfca2
4 changed files with 42 additions and 22 deletions

View File

@ -1,3 +1,20 @@
2010-02-11 Corinna Vinschen <corinna@vinschen.de>
* regex/engine.c (step): Drop Cygwin-specific definition.
(NONCHAR): Cast the argument to int so that the test works. Move comment
from step here.
(matcher): Disable skipping initial string in multibyte case.
* regex/regcomp.c (p_bracket): Don't simplify singleton in the invert
case.
(p_b_term): Handle early end of pattern after dash in bracket
expression.
(singleton): Don't ignore the wides just because there's already a
singleton in the single byte chars. Fix condition for a singleton
wide accordingly.
(findmust): Check for LC_CTYPE charset, rather than LC_COLLATE charset.
* regex2.h (CHIN): Fix condition in the icase & invert case.
(ISWORD): Fix wrong cast to unsigned char.
2010-02-11 Andy Koppe <andy.koppe@gmail.com>
* nlsfuncs.cc (initial_setlocale): Move check whether charset has

View File

@ -106,11 +106,7 @@ static const char *dissect(struct match *m, const char *start, const char *stop,
static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int);
static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
#ifdef __CYGWIN__
static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft);
#else
static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft);
#endif
#define MAX_RECURSION 100
#define BOL (OUT-1)
#define EOL (BOL-1)
@ -119,7 +115,10 @@ static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_
#define BOW (BOL-4)
#define EOW (BOL-5)
#define BADCHAR (BOL-6)
#define NONCHAR(c) ((c) <= OUT)
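/* When using wint_t, which is defined as unsigned int on BSD,
as well as on Cygwin or Linux, the NONCHAR test is broken without
the below cast: OUT is negative, so comparing it against an unsigned
value converts OUT to a very large unsigned number and the test
succeeds for nearly every character. I'm wondering how this is
supposed to work at all... */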
#define NONCHAR(c) ((int)(c) <= OUT)
#ifdef REDEBUG
static void print(struct match *m, const char *caption, states st, int ch, FILE *d);
#endif
@ -248,9 +247,12 @@ matcher(struct re_guts *g,
ZAPSTATE(&m->mbs);
/* Adjust start according to moffset, to speed things up */
#ifndef MNAMES
/* The code evaluating moffset doesn't seem to work right
in the multibyte case. */
if (g->moffset > -1)
start = ((dp - g->moffset) < start) ? start : dp - g->moffset;
#endif
SP("mloop", m->st, *start);
/* this loop does only one repetition except for backrefs */
@ -993,14 +995,7 @@ step(struct re_guts *g,
sopno start, /* start state within strip */
sopno stop, /* state after stop state within strip */
states bef, /* states reachable before */
#ifdef __CYGWIN__
/* When using wint_t, which is defined as unsigned int on BSD,
as well as on Cygwin or Linux, the NONCHAR test is broken.
I'm wondering how this is supposed to work at all... */
int ch, /* character or NONCHAR code */
#else
wint_t ch, /* character or NONCHAR code */
#endif
states aft) /* states already known reachable after */
{
cset *cs;

View File

@ -762,7 +762,8 @@ p_bracket(struct parse *p)
if (cs->invert && p->g->cflags&REG_NEWLINE)
cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
if ((ch = singleton(cs)) != OUT /* optimize singleton sets */
&& cs->invert == 0) { /* But not in invert case. */
ordinary(p, ch);
freeset(p, cs);
} else
@ -833,6 +834,9 @@ p_b_term(struct parse *p, cset *cs)
finish = '-';
else
finish = p_b_symbol(p);
} else if (SEE('-') && !MORE2()) {
SETERROR(REG_EBRACK);
return;
} else
finish = start;
if (start == finish)
@ -1212,9 +1216,9 @@ singleton(cset *cs)
n++;
s = i;
}
if (n == 1)
if (n == 1 && cs->nwides == 0)
return (s);
if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
if (n == 0 && cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
cs->icase == 0)
return (cs->wides[0]);
/* Don't bother handling the other cases. */
@ -1467,7 +1471,7 @@ findmust(struct parse *p, struct re_guts *g)
*/
if (MB_CUR_MAX > 1 &&
#ifdef __CYGWIN__
strcmp(collate_charset, "UTF-8") != 0)
strcmp(__locale_charset (), "UTF-8") != 0)
#else
strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
#endif

View File

@ -151,10 +151,14 @@ CHIN(cset *cs, wint_t ch)
if (ch < NC)
return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^
cs->invert);
else if (cs->icase)
return (CHIN1(cs, ch) || CHIN1(cs, towlower(ch)) ||
CHIN1(cs, towupper(ch)));
else
else if (cs->icase) {
if (cs->invert)
return (CHIN1(cs, ch) && CHIN1(cs, towlower(ch)) &&
CHIN1(cs, towupper(ch)));
else
return (CHIN1(cs, ch) || CHIN1(cs, towlower(ch)) ||
CHIN1(cs, towupper(ch)));
} else
return (CHIN1(cs, ch));
}
@ -189,4 +193,4 @@ struct re_guts {
/* misc utilities */
#define OUT (CHAR_MIN - 1) /* a non-character value */
#define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_')
#define ISWORD(c) (iswalnum((wint_t)(c)) || (c) == '_')