Date: Wed, 7 Sep 2011 13:01:26 +0000 (UTC)
From: Gabor Kovesdan <gabor@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-user@freebsd.org
Subject: svn commit: r225435 - user/gabor/grep/trunk/regex
Message-ID: <201109071301.p87D1Q5r039697@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: gabor Date: Wed Sep 7 13:01:26 2011 New Revision: 225435 URL: http://svn.freebsd.org/changeset/base/225435 Log: - Merge some fixes from TRE - Silence some warnings Modified: user/gabor/grep/trunk/regex/fastmatch.h user/gabor/grep/trunk/regex/hashtable.c user/gabor/grep/trunk/regex/tre-fastmatch.c Modified: user/gabor/grep/trunk/regex/fastmatch.h ============================================================================== --- user/gabor/grep/trunk/regex/fastmatch.h Wed Sep 7 12:53:18 2011 (r225434) +++ user/gabor/grep/trunk/regex/fastmatch.h Wed Sep 7 13:01:26 2011 (r225435) @@ -12,10 +12,12 @@ typedef struct { size_t wlen; size_t len; wchar_t *wpattern; + bool *wescmap; int hasdot; int qsBc[UCHAR_MAX + 1]; int *bmGs; char *pattern; + bool *escmap; int defBc; void *qsBc_table; int *sbmGs; Modified: user/gabor/grep/trunk/regex/hashtable.c ============================================================================== --- user/gabor/grep/trunk/regex/hashtable.c Wed Sep 7 12:53:18 2011 (r225434) +++ user/gabor/grep/trunk/regex/hashtable.c Wed Sep 7 13:01:26 2011 (r225435) @@ -27,6 +27,7 @@ #include "glue.h" #include <errno.h> +#include <inttypes.h> #include <stdlib.h> #include <string.h> @@ -60,9 +61,8 @@ hashtable { hashtable *tbl; - DPRINT(("hashtable_init: table_size %lu, key_size %lu, value_size %lu\n", - (unsigned long)table_size, (unsigned long)key_size, - (unsigned long)value_size)); + DPRINT(("hashtable_init: table_size %zu, key_size %zu, value_size %zu\n", + table_size, key_size, value_size)); tbl = malloc(sizeof(hashtable)); if (tbl == NULL) @@ -111,7 +111,7 @@ hashtable_put(hashtable *tbl, const void } hash = hash32_buf(key, tbl->key_size, hash) % tbl->table_size; - DPRINT(("hashtable_put: calculated hash %lu\n", hash)); + DPRINT(("hashtable_put: calculated hash %" PRIu32 "\n", hash)); /* * On hash collision entries are inserted at the next free space, @@ -125,15 +125,15 @@ hashtable_put(hashtable *tbl, const void else if 
(memcmp(tbl->entries[hash]->key, key, tbl->key_size) == 0) { memcpy(tbl->entries[hash]->value, value, tbl->value_size); - DPRINT(("hashtable_put: effective location is %lu, " - "entry updated\n", hash)); + DPRINT(("hashtable_put: effective location is %" PRIu32 + ", entry updated\n", hash)); return (HASH_UPDATED); } if (++hash == tbl->table_size) hash = 0; } - DPRINT(("hashtable_put: effective location is %lu\n", hash)); + DPRINT(("hashtable_put: effective location is %" PRIu32 "\n", hash)); tbl->entries[hash] = malloc(sizeof(hashtable_entry)); if (tbl->entries[hash] == NULL) @@ -186,7 +186,7 @@ static hashtable_entry return (NULL); else if (memcmp(key, tbl->entries[hash]->key, tbl->key_size) == 0) { - DPRINT(("hashtable_lookup: entry found at location %lu\n", hash)); + DPRINT(("hashtable_lookup: entry found at location %" PRIu32 "\n", hash)); return (&tbl->entries[hash]); } Modified: user/gabor/grep/trunk/regex/tre-fastmatch.c ============================================================================== --- user/gabor/grep/trunk/regex/tre-fastmatch.c Wed Sep 7 12:53:18 2011 (r225434) +++ user/gabor/grep/trunk/regex/tre-fastmatch.c Wed Sep 7 13:01:26 2011 (r225435) @@ -42,7 +42,7 @@ #include "tre-fastmatch.h" #include "xmalloc.h" -static int fastcmp(const void *, const void *, size_t, +static int fastcmp(const void *, const bool *, const void *, size_t, tre_str_type_t, bool, bool); #define FAIL_COMP(errcode) \ @@ -102,11 +102,13 @@ static int fastcmp(const void *, const v switch (type) \ { \ case STR_WIDE: \ - mismatch = fastcmp(fg->wpattern, startptr, fg->wlen, type, \ + mismatch = fastcmp(fg->wpattern, fg->wescmap, startptr, \ + fg->wlen, type, \ fg->icase, fg->newline); \ break; \ default: \ - mismatch = fastcmp(fg->pattern, startptr, fg->len, type, \ + mismatch = fastcmp(fg->pattern, fg->escmap, startptr, \ + fg->len, type, \ fg->icase, fg->newline); \ } \ @@ -206,15 +208,16 @@ static int fastcmp(const void *, const v fg->qsBc[i] = fg->len - fg->hasdot; \ for 
(int i = fg->hasdot + 1; i < fg->len; i++) \ { \ - fg->qsBc[fg->pattern[i]] = fg->len - i; \ - DPRINT(("BC shift for char %c is %d\n", fg->pattern[i], \ + fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i; \ + DPRINT(("BC shift for char %c is %zu\n", fg->pattern[i], \ fg->len - i)); \ if (fg->icase) \ { \ - char c = islower(fg->pattern[i]) ? toupper(fg->pattern[i]) \ - : tolower(fg->pattern[i]); \ - fg->qsBc[c] = fg->len - i; \ - DPRINT(("BC shift for char %c is %d\n", c, fg->len - i)); \ + char c = islower((unsigned char)fg->pattern[i]) ? \ + toupper((unsigned char)fg->pattern[i]) : \ + tolower((unsigned char)fg->pattern[i]); \ + fg->qsBc[(unsigned char)c] = fg->len - i; \ + DPRINT(("BC shift for char %c is %zu\n", c, fg->len - i)); \ } \ } @@ -244,7 +247,7 @@ static int fastcmp(const void *, const v r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \ if ((r == HASH_FAIL) || (r == HASH_FULL)) \ FAIL_COMP(REG_ESPACE); \ - DPRINT(("BC shift for wide char %lc is %d\n", fg->wpattern[i], \ + DPRINT(("BC shift for wide char %lc is %zu\n", fg->wpattern[i], \ fg->wlen - i)); \ if (fg->icase) \ { \ @@ -253,11 +256,20 @@ static int fastcmp(const void *, const v r = hashtable_put(fg->qsBc_table, &wc, &k); \ if ((r == HASH_FAIL) || (r == HASH_FULL)) \ FAIL_COMP(REG_ESPACE); \ - DPRINT(("BC shift for wide char %lc is %d\n", wc, \ + DPRINT(("BC shift for wide char %lc is %zu\n", wc, \ fg->wlen - i)); \ } \ } +#ifdef _GREP_DEBUG +#define DPRINT_BMGS(len, fmt_str, sh) \ + for (int i = 0; i < len; i++) \ + DPRINT((fmt_str, i, sh[i])); +#else +#define DPRINT_BMGS(len, fmt_str, sh) \ + do { } while(/*CONSTCOND*/0) +#endif + /* * Fills in the good suffix table for SB/MB strings. 
*/ @@ -271,6 +283,7 @@ static int fastcmp(const void *, const v fg->sbmGs[0] = 1; \ else \ _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false); \ + DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs); \ } /* @@ -286,6 +299,8 @@ static int fastcmp(const void *, const v fg->bmGs[0] = 1; \ else \ _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true); \ + DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n", \ + fg->bmGs); \ } #define _FILL_BMGS(arr, pat, plen, wide) \ @@ -428,7 +443,7 @@ tre_compile_literal(fastmatch_t *fg, con SAVE_PATTERN(pat, n, fg->pattern, fg->len); #endif - DPRINT(("tre_compile_literal: pattern: %s, len %u, icase: %c, word: %c, " + DPRINT(("tre_compile_literal: pattern: %s, len %zu, icase: %c, word: %c, " "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n')); @@ -452,6 +467,7 @@ tre_compile_fast(fastmatch_t *fg, const tre_char_t *tmp; size_t pos = 0; bool escaped = false; + bool *_escmap = NULL; INIT_COMP; @@ -490,111 +506,101 @@ tre_compile_fast(fastmatch_t *fg, const continue; \ } while (0) - /* - * Used for heuristic, only beginning ^, trailing $ and . are treated - * as special. 
- */ - if (cflags & _REG_HEUR) + for (int i = 0; i < n; i++) { - for (int i = 0; i < n; i++) - switch (pat[i]) - { - case TRE_CHAR('.'): - fg->hasdot = i; + switch (pat[i]) + { + case TRE_CHAR('\\'): + if (escaped) STORE_CHAR; - break; - case TRE_CHAR('$'): - if (i == n - 1) - fg->eol = true; - else - STORE_CHAR; - break; - default: + else + escaped = true; + continue; + case TRE_CHAR('['): + if (escaped) STORE_CHAR; - } - } - else - for (int i = 0; i < n; i++) - { - switch (pat[i]) - { - case TRE_CHAR('\\'): - if (escaped) - STORE_CHAR; - else - escaped = true; - break; - case TRE_CHAR('['): - if (escaped) - STORE_CHAR; - else - goto badpat; - break; - case TRE_CHAR('*'): - if (escaped || (!(cflags & REG_EXTENDED) && (i == 0))) - STORE_CHAR; - else - goto badpat; - break; - case TRE_CHAR('+'): - case TRE_CHAR('?'): - if ((cflags & REG_EXTENDED) && (i == 0)) - continue; - else if ((cflags & REG_EXTENDED) ^ !escaped) - STORE_CHAR; - else - goto badpat; - case TRE_CHAR('.'): - if (escaped) - goto badpat; - else - { - fg->hasdot = true; - STORE_CHAR; - } - break; - case TRE_CHAR('^'): + else + goto badpat; + continue; + case TRE_CHAR('*'): + if (escaped || (!(cflags & REG_EXTENDED) && (i == 0))) STORE_CHAR; - break; - case TRE_CHAR('$'): - if (!escaped && (i == n - 1)) - fg->eol = true; - else - STORE_CHAR; - break; - case TRE_CHAR('('): - if ((cflags & REG_EXTENDED) ^ escaped) - goto badpat; - else - STORE_CHAR; - break; - case TRE_CHAR('{'): - if (escaped && (i == 0)) - STORE_CHAR; - else if (!(cflags & REG_EXTENDED) && (i == 0)) - STORE_CHAR; - else if ((cflags & REG_EXTENDED) && (i == 0)) - continue; - else - goto badpat; - break; - case TRE_CHAR('|'): - if ((cflags & REG_EXTENDED) ^ (!escaped)) - goto badpat; - else - STORE_CHAR; - break; - default: - if (escaped) - goto badpat; - else + else + goto badpat; + continue; + case TRE_CHAR('+'): + case TRE_CHAR('?'): + if ((cflags & REG_EXTENDED) && (i == 0)) + continue; + else if ((cflags & REG_EXTENDED) ^ !escaped) 
+ STORE_CHAR; + else + goto badpat; + continue; + case TRE_CHAR('.'): + if (escaped) + { + if (!_escmap) + _escmap = xmalloc(n * sizeof(bool)); + if (!_escmap) + { + xfree(tmp); + return REG_ESPACE; + } + _escmap[i] = true; + STORE_CHAR; + } + else + { + fg->hasdot = i; STORE_CHAR; - } - continue; + } + continue; + case TRE_CHAR('^'): + STORE_CHAR; + continue; + case TRE_CHAR('$'): + if (!escaped && (i == n - 1)) + fg->eol = true; + else + STORE_CHAR; + continue; + case TRE_CHAR('('): + if ((cflags & REG_EXTENDED) ^ escaped) + goto badpat; + else + STORE_CHAR; + continue; + case TRE_CHAR('{'): + if (!(cflags & REG_EXTENDED) ^ escaped) + STORE_CHAR; + else if (!(cflags & REG_EXTENDED) && (i == 0)) + STORE_CHAR; + else if ((cflags & REG_EXTENDED) && (i == 0)) + continue; + else + goto badpat; + continue; + case TRE_CHAR('|'): + if ((cflags & REG_EXTENDED) ^ escaped) + goto badpat; + else + STORE_CHAR; + continue; + default: + if (escaped) + goto badpat; + else + STORE_CHAR; + continue; + } + continue; badpat: - xfree(tmp); - return REG_BADPAT; - } + xfree(tmp); + DPRINT(("tre_compile_fast: compilation of pattern failed, falling" + "back to NFA\n")); + return REG_BADPAT; + } /* * The pattern has been processed and copied to tmp as a literal string @@ -603,14 +609,38 @@ badpat: */ #ifdef TRE_WCHAR SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen); + fg->wescmap = _escmap; STORE_MBS_PAT; + if (fg->wescmap != NULL) + { + bool escaped = false; + + fg->escmap = xmalloc(fg->len * sizeof(bool)); + if (!fg->escmap) + { + tre_free_fast(fg); + return REG_ESPACE; + } + + for (int i = 0; i < fg->len; i++) + if (fg->pattern[i] == '\\') + escaped = ! escaped; + else if (fg->pattern[i] == '.' 
&& escaped) + { + fg->escmap[i] = true; + escaped = false; + } + else + escaped = false; + } #else SAVE_PATTERN(tmp, pos, fg->pattern, fg->len); + fg->escmap = _escmap; #endif xfree(tmp); - DPRINT(("tre_compile_fast: pattern: %s, len %u, bol %c, eol %c, " + DPRINT(("tre_compile_fast: pattern: %s, len %zu, bol %c, eol %c, " "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len, fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n', fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n', @@ -703,7 +733,7 @@ tre_match_fast(const fastmatch_t *fg, co const tre_char_t *str_wide = data; /* Calculate length if unspecified. */ - if (len == (unsigned)-1) + if (len == (size_t)-1) switch (type) { case STR_WIDE: @@ -822,10 +852,14 @@ tre_free_fast(fastmatch_t *fg) hashtable_free(fg->qsBc_table); if (!fg->hasdot) xfree(fg->bmGs); + if (fg->wescmap) + xfree(fg->wescmap); xfree(fg->wpattern); #endif if (!fg->hasdot) xfree(fg->sbmGs); + if (fg->escmap) + xfree(fg->escmap); xfree(fg->pattern); } @@ -835,7 +869,7 @@ tre_free_fast(fastmatch_t *fg) * REG_OK on success */ static inline int -fastcmp(const void *pat, const void *data, size_t len, +fastcmp(const void *pat, const bool *escmap, const void *data, size_t len, tre_str_type_t type, bool icase, bool newline) { const char *str_byte = data; @@ -851,7 +885,7 @@ fastcmp(const void *pat, const void *dat case STR_WIDE: /* Check dot */ - if (pat_wide[i] == TRE_CHAR('.') && + if (pat_wide[i] == TRE_CHAR('.') && (!escmap || !escmap[i]) && (!newline || (str_wide[i] != TRE_CHAR('\n')))) continue; @@ -862,7 +896,7 @@ fastcmp(const void *pat, const void *dat break; default: /* Check dot */ - if (pat_byte[i] == '.' && + if (pat_byte[i] == '.' && (!escmap || !escmap[i]) && (!newline || (str_byte[i] != '\n'))) continue;
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201109071301.p87D1Q5r039697>