This is the mail archive of the libc-hacker@sources.redhat.com mailing list for the glibc project.
Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
Other format: | [Raw text] |
Hi! I wrote some new word boundary tests (though there are really many things which I haven't even touched yet, like \>, \w, \W) plus infrastructure so that UTF-8 tests are generated from the ASCII ones. None of the ASCII tests fail, but many UTF-8 tests fail, more without my patch from today and slightly fewer with the patch applied. So that the glibc testsuite passes, perhaps ret |= do_mb_tests (&tests[i]); can be replaced with do_mb_tests (&tests[i]); until this is fixed. If somebody has good ideas for new \</\>/\b/\B/\w/\W tests, feel free to add some. 2003-11-21 Jakub Jelinek <jakub@redhat.com> * posix/bug-regex19.c (BRE, ERE): Define. (tests): Add many new tests, remove UTF-8 ones. (do_one_test, do_mb_tests): New functions. (main): Rewritten using do_one_test and do_mb_tests. --- libc/posix/bug-regex19.c.jj 2003-11-20 21:24:03.000000000 +0100 +++ libc/posix/bug-regex19.c 2003-11-21 01:02:28.000000000 +0100 @@ -26,87 +26,274 @@ #include <string.h> #include <locale.h> -static struct +#define BRE RE_SYNTAX_POSIX_BASIC +#define ERE RE_SYNTAX_POSIX_EXTENDED + +static struct test_s { int syntax; const char *pattern; const char *string; int start, res; } tests[] = { - /* \xc3\x84 LATIN CAPITAL LETTER A WITH DIAERESIS - \xc3\x96 LATIN CAPITAL LETTER O WITH DIAERESIS - \xe2\x80\x94 EM DASH */ - /* Should not match. */ - {RE_SYNTAX_POSIX_BASIC, "\\<A", "aOAA", 0, -1}, - {RE_SYNTAX_POSIX_BASIC, "\\<A", "aOAA", 2, -1}, - {RE_SYNTAX_POSIX_BASIC, "A\\>", "aAAO", 1, -1}, - {RE_SYNTAX_POSIX_BASIC, "\\bA", "aOAA", 0, -1}, - {RE_SYNTAX_POSIX_BASIC, "\\bA", "aOAA", 2, -1}, - {RE_SYNTAX_POSIX_BASIC, "A\\b", "aAAO", 1, -1}, - {RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 0, -1}, - {RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 3, -1}, - {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\>", "a\xc3\x84\xc3\x84\xc3\x96", 1, -1}, -#if 0 - /* XXX these 2 tests still fail. 
*/ - {RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 0, -1}, - {RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 3, -1}, -#endif - {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84\xc3\x96", 1, -1}, - /* Should match. */ - {RE_SYNTAX_POSIX_BASIC, "\\<A", "AA", 0, 0}, - {RE_SYNTAX_POSIX_BASIC, "\\<A", "a-AA", 2, 2}, - {RE_SYNTAX_POSIX_BASIC, "A\\>", "aAA-", 1, 2}, - {RE_SYNTAX_POSIX_BASIC, "A\\>", "aAA", 1, 2}, - {RE_SYNTAX_POSIX_BASIC, "\\bA", "AA", 0, 0}, - {RE_SYNTAX_POSIX_BASIC, "\\bA", "a-AA", 2, 2}, - {RE_SYNTAX_POSIX_BASIC, "A\\b", "aAA-", 1, 2}, - {RE_SYNTAX_POSIX_BASIC, "A\\b", "aAA", 1, 2}, - {RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "\xc3\x84\xc3\x84", 0, 0}, - {RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xe2\x80\x94\xc3\x84\xc3\x84", 4, 4}, - {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\>", "a\xc3\x84\xc3\x84\xe2\x80\x94", 1, 3}, - {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\>", "a\xc3\x84\xc3\x84", 1, 3}, - {RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "\xc3\x84\xc3\x84", 0, 0}, - {RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xe2\x80\x94\xc3\x84\xc3\x84", 4, 4}, - {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84\xe2\x80\x94", 1, 3}, - {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84", 1, 3} + {BRE, "\\<A", "CBAA", 0, -1}, + {BRE, "\\<A", "CBAA", 2, -1}, + {BRE, "A\\>", "CAAB", 1, -1}, + {BRE, "\\bA", "CBAA", 0, -1}, + {BRE, "\\bA", "CBAA", 2, -1}, + {BRE, "A\\b", "CAAB", 1, -1}, + {BRE, "\\<A", "AA", 0, 0}, + {BRE, "\\<A", "C-AA", 2, 2}, + {BRE, "A\\>", "CAA-", 1, 2}, + {BRE, "A\\>", "CAA", 1, 2}, + {BRE, "\\bA", "AA", 0, 0}, + {BRE, "\\bA", "C-AA", 2, 2}, + {BRE, "A\\b", "CAA-", 1, 2}, + {BRE, "A\\b", "CAA", 1, 2}, + {ERE, "\\b(A|!|.B)", "A=AC", 0, 0}, + {ERE, "\\b(A|!|.B)", "=AC", 0, 1}, + {ERE, "\\b(A|!|.B)", "!AC", 0, 1}, + {ERE, "\\b(A|!|.B)", "=AB", 0, 1}, + {ERE, "\\b(A|!|.B)", "DA!C", 0, 2}, + {ERE, "\\b(A|!|.B)", "=CB", 0, 1}, + {ERE, "\\b(A|!|.B)", "!CB", 0, 1}, + {ERE, "\\b(A|!|.B)", "D,B", 0, 1}, + {ERE, 
"\\b(A|!|.B)", "!.C", 0, -1}, + {ERE, "\\b(A|!|.B)", "BCB", 0, -1}, + {ERE, "(A|\\b)(A|B|C)", "DAAD", 0, 1}, + {ERE, "(A|\\b)(A|B|C)", "DABD", 0, 1}, + {ERE, "(A|\\b)(A|B|C)", "AD", 0, 0}, + {ERE, "(A|\\b)(A|B|C)", "C!", 0, 0}, + {ERE, "(A|\\b)(A|B|C)", "D,B", 0, 2}, + {ERE, "(A|\\b)(A|B|C)", "DA?A", 0, 3}, + {ERE, "(A|\\b)(A|B|C)", "BBC", 0, 0}, + {ERE, "(A|\\b)(A|B|C)", "DA", 0, -1}, + {ERE, "(!|\\b)(!|=|~)", "A!=\\", 0, 1}, + {ERE, "(!|\\b)(!|=|~)", "/!=A", 0, 1}, + {ERE, "(!|\\b)(!|=|~)", "A=A", 0, 1}, + {ERE, "(!|\\b)(!|=|~)", "==!=", 0, 2}, + {ERE, "(!|\\b)(!|=|~)", "==C~", 0, 3}, + {ERE, "(!|\\b)(!|=|~)", "=~=", 0, -1}, + {ERE, "(!|\\b)(!|=|~)", "~!", 0, -1}, + {ERE, "(!|\\b)(!|=|~)", "~=~", 0, -1}, + {ERE, "(\\b|A.)[ABC]", "AC", 0, 0}, + {ERE, "(\\b|A.)[ABC]", "=A", 0, 1}, + {ERE, "(\\b|A.)[ABC]", "DACC", 0, 1}, + {ERE, "(\\b|A.)[A~C]", "AC", 0, 0}, + {ERE, "(\\b|A.)[A~C]", "=A", 0, 1}, + {ERE, "(\\b|A.)[A~C]", "DACC", 0, 1}, + {ERE, "(\\b|A.)[A~C]", "B!A=", 0, 2}, + {ERE, "(\\b|A.)[A~C]", "B~C", 0, 1}, + {ERE, ".\\b.", "AA~", 0, 1}, + {ERE, ".\\b.", "=A=", 0, 0}, + {ERE, ".\\b.", "==", 0, -1}, + {ERE, ".\\b.", "ABA", 0, -1}, + {ERE, "\\<(A|!|.B)", "A=AC", 0, 0}, + {ERE, "\\<(A|!|.B)", "=AC", 0, 1}, + {ERE, "\\<(A|!|.B)", "!AC", 0, 1}, + {ERE, "\\<(A|!|.B)", "=AB", 0, 1}, + {ERE, "\\<(A|!|.B)", "=CB", 0, 1}, + {ERE, "\\<(A|!|.B)", "!CB", 0, 1}, + {ERE, "\\<(A|!|.B)", "DA!C", 0, -1}, + {ERE, "\\<(A|!|.B)", "D,B", 0, -1}, + {ERE, "\\<(A|!|.B)", "!.C", 0, -1}, + {ERE, "\\<(A|!|.B)", "BCB", 0, -1}, + {ERE, "(A|\\<)(A|B|C)", "DAAD", 0, 1}, + {ERE, "(A|\\<)(A|B|C)", "DABD", 0, 1}, + {ERE, "(A|\\<)(A|B|C)", "AD", 0, 0}, + {ERE, "(A|\\<)(A|B|C)", "C!", 0, 0}, + {ERE, "(A|\\<)(A|B|C)", "D,B", 0, 2}, + {ERE, "(A|\\<)(A|B|C)", "DA?A", 0, 3}, + {ERE, "(A|\\<)(A|B|C)", "BBC", 0, 0}, + {ERE, "(A|\\<)(A|B|C)", "DA", 0, -1}, + {ERE, "(!|\\<)(!|=|~)", "A!=\\", 0, 1}, + {ERE, "(!|\\<)(!|=|~)", "/!=A", 0, 1}, + {ERE, "(!|\\<)(!|=|~)", "==!=", 0, 2}, + {ERE, "(!|\\<)(!|=|~)", 
"==C~", 0, -1}, + {ERE, "(!|\\<)(!|=|~)", "A=A", 0, -1}, + {ERE, "(!|\\<)(!|=|~)", "=~=", 0, -1}, + {ERE, "(!|\\<)(!|=|~)", "~!", 0, -1}, + {ERE, "(!|\\<)(!|=|~)", "~=~", 0, -1}, + {ERE, "(\\<|A.)[ABC]", "AC", 0, 0}, + {ERE, "(\\<|A.)[ABC]", "=A", 0, 1}, + {ERE, "(\\<|A.)[ABC]", "DACC", 0, 1}, + {ERE, "(\\<|A.)[A~C]", "AC", 0, 0}, + {ERE, "(\\<|A.)[A~C]", "=A", 0, 1}, + {ERE, "(\\<|A.)[A~C]", "DACC", 0, 1}, + {ERE, "(\\<|A.)[A~C]", "B!A=", 0, 2}, + {ERE, "(\\<|A.)[A~C]", "B~C", 0, 2}, + {ERE, ".\\<.", "=A=", 0, 0}, + {ERE, ".\\<.", "AA~", 0, -1}, + {ERE, ".\\<.", "==", 0, -1}, + {ERE, ".\\<.", "ABA", 0, -1}, + {ERE, ".\\B.", "ABA", 0, 0}, + {ERE, ".\\B.", "=BDC", 0, 1}, + {ERE, ".(\\b|\\B).", "=~AB", 0, 1}, + {ERE, ".(\\b|\\B).", "A=C", 0, 0}, + {ERE, ".(\\b|\\B).", "ABC", 0, 0}, + {ERE, ".(\\b|\\B).", "=~\\!", 0, -1}, }; int -main (void) +do_one_test (const struct test_s *test, const char *fail) { - struct re_pattern_buffer regbuf; + int res; const char *err; + struct re_pattern_buffer regbuf; + + re_set_syntax (test->syntax); + memset (&regbuf, '\0', sizeof (regbuf)); + err = re_compile_pattern (test->pattern, strlen (test->pattern), + &regbuf); + if (err != NULL) + { + printf ("%sre_compile_pattern \"%s\" failed: %s\n", fail, test->pattern, + err); + return 1; + } + + res = re_search (&regbuf, test->string, strlen (test->string), + test->start, strlen (test->string) - test->start, NULL); + if (res != test->res) + { + printf ("%sre_search \"%s\" \"%s\" failed: %d (expected %d)\n", + fail, test->pattern, test->string, res, test->res); + regfree (&regbuf); + return 1; + } + + if (test->res > 0 && test->start == 0) + { + res = re_search (&regbuf, test->string, strlen (test->string), + test->res, strlen (test->string) - test->res, NULL); + if (res != test->res) + { + printf ("%sre_search from expected \"%s\" \"%s\" failed: %d (expected %d)\n", + fail, test->pattern, test->string, res, test->res); + regfree (&regbuf); + return 1; + } + } + + regfree (&regbuf); + return 0; +} + +static 
inline char * +replace (char *p, char c) +{ + switch (c) + { + /* A -> A" */ + case 'A': *p++ = '\xc3'; *p++ = '\x84'; break; + /* B -> O" */ + case 'B': *p++ = '\xc3'; *p++ = '\x96'; break; + /* C -> U" */ + case 'C': *p++ = '\xc3'; *p++ = '\x9c'; break; + /* D -> a" */ + case 'D': *p++ = '\xc3'; *p++ = '\xa4'; break; + /* ! -> MULTIPLICATION SIGN */ + case '!': *p++ = '\xc3'; *p++ = '\x97'; break; + /* = -> EM DASH */ + case '=': *p++ = '\xe2'; *p++ = '\x80'; *p++ = '\x94'; break; + /* ~ -> MUSICAL SYMBOL HALF NOTE */ + case '~': *p++ = '\xf0'; *p++ = '\x9d'; *p++ = '\x85'; *p++ = '\x9e'; + break; + } + return p; +} + +int +do_mb_tests (const struct test_s *test) +{ + int i, j; + struct test_s t; + const char *const chars = "ABCD!=~"; + char repl[8], *p; + char pattern[strlen (test->pattern) * 4 + 1]; + char string[strlen (test->string) * 4 + 1]; + char fail[8 + sizeof ("UTF-8 ")]; + + t.pattern = pattern; + t.string = string; + strcpy (fail, "UTF-8 "); + for (i = 1; i < 128; ++i) + { + p = repl; + for (j = 0; j < 7; ++j) + if (i & (1 << j)) + { + if (!strchr (test->pattern, chars[j]) + && !strchr (test->string, chars[j])) + break; + *p++ = chars[j]; + } + if (j < 7) + continue; + *p = '\0'; + + for (j = 0, p = pattern; test->pattern[j]; ++j) + if (strchr (repl, test->pattern[j])) + p = replace (p, test->pattern[j]); + else if (test->pattern[j] == '\\' && test->pattern[j + 1]) + { + *p++ = test->pattern[j++]; + *p++ = test->pattern[j]; + } + else + *p++ = test->pattern[j]; + *p = '\0'; + + t.start = test->start; + t.res = test->res; + + for (j = 0, p = string; test->string[j]; ++j) + if (strchr (repl, test->string[j])) + { + char *d = replace (p, test->string[j]); + if (test->start > j) + t.start += d - p - 1; + if (test->res > j) + t.res += d - p - 1; + p = d; + } + else + *p++ = test->string[j]; + *p = '\0'; + + p = stpcpy (fail + strlen ("UTF-8 "), repl); + *p++ = ' '; + *p = '\0'; + + if (do_one_test (&t, fail)) + return 1; + } + return 0; +} + +int +main 
(void) +{ size_t i; int ret = 0; mtrace (); - setlocale (LC_ALL, "de_DE.UTF-8"); for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i) { - int res; - re_set_syntax (tests[i].syntax); - memset (&regbuf, '\0', sizeof (regbuf)); - err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern), - &regbuf); - if (err != NULL) + if (setlocale (LC_ALL, "de_DE.ISO-8859-1") == NULL) { - printf ("re_compile_pattern failed: %s\n", err); + puts ("setlocale de_DE.ISO-8859-1 failed"); ret = 1; - continue; } - - res = re_search (&regbuf, tests[i].string, strlen (tests[i].string), - tests[i].start, - strlen (tests[i].string) - tests[i].start, NULL); - if (res != tests[i].res) + ret |= do_one_test (&tests[i], ""); + if (setlocale (LC_ALL, "de_DE.UTF-8") == NULL) { - printf ("re_search %zd failed: %d\n", i, res); + puts ("setlocale de_DE.UTF-8 failed"); ret = 1; - regfree (&regbuf); - continue; } - regfree (&regbuf); + ret |= do_one_test (&tests[i], "UTF-8 "); + ret |= do_mb_tests (&tests[i]); } return ret; Jakub
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |