Date: Wed, 7 Sep 2011 12:53:18 +0000 (UTC) From: Gabor Kovesdan <gabor@FreeBSD.org> To: src-committers@freebsd.org, svn-src-user@freebsd.org Subject: svn commit: r225434 - in user/gabor/tre-integration: contrib/tre/lib include Message-ID: <201109071253.p87CrI3a039403@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: gabor Date: Wed Sep 7 12:53:18 2011 New Revision: 225434 URL: http://svn.freebsd.org/changeset/base/225434 Log: - Make the heuristic code loosely coupled to the fast matcher by providing properly escaped patterns instead of using an internal flag - Add some struct fields for escaped dots, forgotten in a previous commit Modified: user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c user/gabor/tre-integration/include/fastmatch.h user/gabor/tre-integration/include/regex.h Modified: user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c ============================================================================== --- user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c Wed Sep 7 07:52:45 2011 (r225433) +++ user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c Wed Sep 7 12:53:18 2011 (r225434) @@ -263,6 +263,15 @@ static int fastcmp(const void *, const b } \ } +#ifdef TRE_DEBUG +#define DPRINT_BMGS(len, fmt_str, sh) \ + for (int i = 0; i < len; i++) \ + DPRINT((fmt_str, i, sh[i])); +#else +#define DPRINT_BMGS(len, fmt_str, sh) \ + do { } while(/*CONSTCOND*/0) +#endif + /* * Fills in the good suffix table for SB/MB strings. */ @@ -276,6 +285,7 @@ static int fastcmp(const void *, const b fg->sbmGs[0] = 1; \ else \ _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false); \ + DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs); \ } /* @@ -291,6 +301,8 @@ static int fastcmp(const void *, const b fg->bmGs[0] = 1; \ else \ _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true); \ + DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n", \ + fg->bmGs); \ } #define _FILL_BMGS(arr, pat, plen, wide) \ @@ -496,121 +508,99 @@ tre_compile_fast(fastmatch_t *fg, const continue; \ } while (0) - /* - * Used for heuristic, only beginning ^, trailing $ and . are treated - * as special. 
- */ - if (cflags & _REG_HEUR) + for (int i = 0; i < n; i++) { - for (int i = 0; i < n; i++) - switch (pat[i]) - { - case TRE_CHAR('.'): - fg->hasdot = i; + switch (pat[i]) + { + case TRE_CHAR('\\'): + if (escaped) STORE_CHAR; - break; - case TRE_CHAR('$'): - if (i == n - 1) - fg->eol = true; - else - STORE_CHAR; - break; - default: + else + escaped = true; + continue; + case TRE_CHAR('['): + if (escaped) STORE_CHAR; - } - } - else - for (int i = 0; i < n; i++) - { - switch (pat[i]) - { - case TRE_CHAR('\\'): - if (escaped) - STORE_CHAR; - else - escaped = true; - break; - case TRE_CHAR('['): - if (escaped) - STORE_CHAR; - else - goto badpat; - break; - case TRE_CHAR('*'): - if (escaped || (!(cflags & REG_EXTENDED) && (i == 0))) - STORE_CHAR; - else - goto badpat; - break; - case TRE_CHAR('+'): - case TRE_CHAR('?'): - if ((cflags & REG_EXTENDED) && (i == 0)) - continue; - else if ((cflags & REG_EXTENDED) ^ !escaped) - STORE_CHAR; - else - goto badpat; - case TRE_CHAR('.'): - if (escaped) - { - if (!_escmap) - _escmap = xmalloc(n * sizeof(bool)); - if (!_escmap) - { - xfree(tmp); - return REG_ESPACE; - } - _escmap[i] = true; - STORE_CHAR; - } - else - { - fg->hasdot = i; - STORE_CHAR; - } - break; - case TRE_CHAR('^'): + else + goto badpat; + continue; + case TRE_CHAR('*'): + if (escaped || (!(cflags & REG_EXTENDED) && (i == 0))) STORE_CHAR; - break; - case TRE_CHAR('$'): - if (!escaped && (i == n - 1)) - fg->eol = true; - else - STORE_CHAR; - break; - case TRE_CHAR('('): - if ((cflags & REG_EXTENDED) ^ escaped) - goto badpat; - else - STORE_CHAR; - break; - case TRE_CHAR('{'): - if (escaped && (i == 0)) - STORE_CHAR; - else if (!(cflags & REG_EXTENDED) && (i == 0)) - STORE_CHAR; - else if ((cflags & REG_EXTENDED) && (i == 0)) - continue; - else - goto badpat; - break; - case TRE_CHAR('|'): - if ((cflags & REG_EXTENDED) ^ (!escaped)) - goto badpat; - else - STORE_CHAR; - break; - default: - if (escaped) - goto badpat; - else + else + goto badpat; + continue; + case 
TRE_CHAR('+'): + case TRE_CHAR('?'): + if ((cflags & REG_EXTENDED) && (i == 0)) + continue; + else if ((cflags & REG_EXTENDED) ^ !escaped) + STORE_CHAR; + else + goto badpat; + continue; + case TRE_CHAR('.'): + if (escaped) + { + if (!_escmap) + _escmap = xmalloc(n * sizeof(bool)); + if (!_escmap) + { + xfree(tmp); + return REG_ESPACE; + } + _escmap[i] = true; + STORE_CHAR; + } + else + { + fg->hasdot = i; STORE_CHAR; - } - continue; + } + continue; + case TRE_CHAR('^'): + STORE_CHAR; + continue; + case TRE_CHAR('$'): + if (!escaped && (i == n - 1)) + fg->eol = true; + else + STORE_CHAR; + continue; + case TRE_CHAR('('): + if ((cflags & REG_EXTENDED) ^ escaped) + goto badpat; + else + STORE_CHAR; + continue; + case TRE_CHAR('{'): + if (!(cflags & REG_EXTENDED) ^ escaped) + STORE_CHAR; + else if (!(cflags & REG_EXTENDED) && (i == 0)) + STORE_CHAR; + else if ((cflags & REG_EXTENDED) && (i == 0)) + continue; + else + goto badpat; + continue; + case TRE_CHAR('|'): + if ((cflags & REG_EXTENDED) ^ escaped) + goto badpat; + else + STORE_CHAR; + continue; + default: + if (escaped) + goto badpat; + else + STORE_CHAR; + continue; + } + continue; badpat: - xfree(tmp); - return REG_BADPAT; - } + xfree(tmp); + return REG_BADPAT; + } /* * The pattern has been processed and copied to tmp as a literal string Modified: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c ============================================================================== --- user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c Wed Sep 7 07:52:45 2011 (r225433) +++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c Wed Sep 7 12:53:18 2011 (r225434) @@ -127,11 +127,14 @@ goto end_segment; \ } while (0) -#define STORE_CHAR \ +#define STORE_CHAR(esc) \ do \ { \ - escaped = false; \ + if (esc) \ + heur[pos++] = TRE_CHAR('\\'); \ heur[pos++] = regex[i]; \ + escaped = false; \ + continue; \ } while (0) @@ -178,11 +181,13 @@ tre_compile_heur(heur_t *h, const tre_ch * bracket is escaped. 
*/ case TRE_CHAR('['): - PARSE_BRACKETS; if (escaped) - STORE_CHAR; + STORE_CHAR(true); else - heur[pos++] = TRE_CHAR('.'); + { + PARSE_BRACKETS; + heur[pos++] = TRE_CHAR('.'); + } continue; /* @@ -192,9 +197,9 @@ tre_compile_heur(heur_t *h, const tre_ch */ case TRE_CHAR('{'): if (escaped && (i == 1)) - STORE_CHAR; + STORE_CHAR(true); else if ((i == 0) && !(cflags & REG_EXTENDED)) - STORE_CHAR; + STORE_CHAR(true); else if ((i == 0) && (cflags & REG_EXTENDED)) continue; @@ -205,7 +210,7 @@ tre_compile_heur(heur_t *h, const tre_ch END_SEGMENT; } else - STORE_CHAR; + STORE_CHAR(cflags & REG_EXTENDED); continue; /* @@ -213,11 +218,13 @@ tre_compile_heur(heur_t *h, const tre_ch * otherwise treated as a normal character. */ case TRE_CHAR('('): - PARSE_UNIT('(', ')'); if (escaped ^ (cflags & REG_EXTENDED)) - END_SEGMENT; + { + PARSE_UNIT('(', ')'); + END_SEGMENT; + } else - STORE_CHAR; + STORE_CHAR(cflags & REG_EXTENDED); continue; /* @@ -227,9 +234,9 @@ tre_compile_heur(heur_t *h, const tre_ch */ case TRE_CHAR('\\'): if (escaped) - STORE_CHAR; + STORE_CHAR(true); else - escaped = !escaped; + escaped = true; continue; /* @@ -240,7 +247,7 @@ tre_compile_heur(heur_t *h, const tre_ch */ case TRE_CHAR('*'): if (escaped || (!(cflags & REG_EXTENDED) && (i == 0))) - STORE_CHAR; + STORE_CHAR(true); else if ((i != 0)) { pos--; @@ -262,7 +269,7 @@ tre_compile_heur(heur_t *h, const tre_ch else if ((cflags & REG_EXTENDED) ^ escaped) END_SEGMENT; else - STORE_CHAR; + STORE_CHAR(cflags & REG_EXTENDED); continue; /* @@ -281,7 +288,7 @@ tre_compile_heur(heur_t *h, const tre_ch END_SEGMENT; } else - STORE_CHAR; + STORE_CHAR(true); continue; /* @@ -296,7 +303,7 @@ tre_compile_heur(heur_t *h, const tre_ch else if (!(cflags & REG_EXTENDED) && escaped) END_SEGMENT; else - STORE_CHAR; + STORE_CHAR(cflags & REG_EXTENDED); continue; /* @@ -304,10 +311,7 @@ tre_compile_heur(heur_t *h, const tre_ch * cannot handle it. 
*/ case TRE_CHAR('.'): - if (escaped) - END_SEGMENT; - else - STORE_CHAR; + STORE_CHAR(escaped); continue; /* @@ -319,7 +323,7 @@ tre_compile_heur(heur_t *h, const tre_ch if (escaped) END_SEGMENT; else - STORE_CHAR; + STORE_CHAR(false); continue; } } @@ -352,7 +356,7 @@ end_segment: goto space1; } - ret = tre_compile_fast(h->start, heur, pos, _REG_HEUR); + ret = tre_compile_fast(h->start, heur, pos, 0); if (ret != REG_OK) { errcode = REG_BADPAT; @@ -386,7 +390,7 @@ end_segment: goto space2; } - ret = tre_compile_fast(h->end, heur, pos, _REG_HEUR); + ret = tre_compile_fast(h->end, heur, pos, 0); if (ret != REG_OK) { xfree(h->end); Modified: user/gabor/tre-integration/include/fastmatch.h ============================================================================== --- user/gabor/tre-integration/include/fastmatch.h Wed Sep 7 07:52:45 2011 (r225433) +++ user/gabor/tre-integration/include/fastmatch.h Wed Sep 7 12:53:18 2011 (r225434) @@ -12,10 +12,12 @@ typedef struct { size_t wlen; size_t len; wchar_t *wpattern; + bool *wescmap; int hasdot; int qsBc[UCHAR_MAX + 1]; int *bmGs; char *pattern; + bool *escmap; int defBc; void *qsBc_table; int *sbmGs; Modified: user/gabor/tre-integration/include/regex.h ============================================================================== --- user/gabor/tre-integration/include/regex.h Wed Sep 7 07:52:45 2011 (r225433) +++ user/gabor/tre-integration/include/regex.h Wed Sep 7 12:53:18 2011 (r225434) @@ -110,7 +110,6 @@ typedef enum { #define REG_PEND (REG_UNGREEDY << 1) #define REG_GNU (REG_PEND << 1) #define REG_WORD (REG_GNU << 1) -#define _REG_HEUR (REG_WORD << 1) /* POSIX tre_regexec() flags. */ #define REG_NOTBOL 1
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201109071253.p87CrI3a039403>