Date: Fri, 12 Aug 2011 16:17:15 +0000 (UTC)
From: Gabor Kovesdan <gabor@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-user@freebsd.org
Subject: svn commit: r224802 - in user/gabor/tre-integration: contrib/tre/lib include
Message-ID: <201108121617.p7CGHFVI072840@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: gabor
Date: Fri Aug 12 16:17:15 2011
New Revision: 224802
URL: http://svn.freebsd.org/changeset/base/224802

Log:
  - Introduce new flag for word-boundary matching: REG_WORD
  - Partly recover broken word-boundary matching; rest is TODO
  - Macroify fastcomp() and fastcomp_literal() initialization code
  - Adjust a comment

Modified:
  user/gabor/tre-integration/contrib/tre/lib/fastmatch.c
  user/gabor/tre-integration/contrib/tre/lib/tre.h
  user/gabor/tre-integration/include/regex.h

Modified: user/gabor/tre-integration/contrib/tre/lib/fastmatch.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/fastmatch.c Fri Aug 12 15:13:06 2011 (r224801)
+++ user/gabor/tre-integration/contrib/tre/lib/fastmatch.c Fri Aug 12 16:17:15 2011 (r224802)
@@ -332,6 +332,18 @@ static int fastcmp(const void *, const v
   memcpy(p, pat, l * sizeof(tre_char_t)); \
   p[l] = TRE_CHAR('\0');
 
+#define INIT_COMP \
+  /* Initialize. */ \
+  memset(fg, 0, sizeof(*fg)); \
+  fg->icase = (cflags & REG_ICASE); \
+  fg->word = (cflags & REG_WORD); \
+  \
+  /* Cannot handle REG_ICASE with MB string */ \
+  if (fg->icase && (MB_CUR_MAX > 1)) \
+    return REG_BADPAT; \
+  \
+  /* Calculate length if unspecified */ \
+  n = (n == 0) ? tre_strlen(pat) : n;
 
 /*
  * Returns: REG_OK on success, error code otherwise
@@ -340,12 +352,10 @@ int
 tre_fastcomp_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n,
                      int cflags)
 {
-  /* Initialize. */
-  memset(fg, 0, sizeof(*fg));
-  fg->icase = (cflags & REG_ICASE);
+  INIT_COMP;
 
-  /* Cannot handle REG_ICASE with MB string */
-  if (fg->icase && (MB_CUR_MAX > 1))
+  /* Cannot handle word boundaries with MB string */
+  if (fg->word && (MB_CUR_MAX > 1))
     return REG_BADPAT;
 
 #ifdef TRE_WCHAR
@@ -372,15 +382,7 @@ int
 tre_fastcomp(fastmatch_t *fg, const tre_char_t *pat, size_t n,
              int cflags)
 {
-  /* Initialize. */
-  memset(fg, 0, sizeof(*fg));
-  fg->icase = (cflags & REG_ICASE);
-
-  /* Cannot handle REG_ICASE with MB string */
-  if (fg->icase && (MB_CUR_MAX > 1))
-    return REG_BADPAT;
-
-  n = (n == 0) ? tre_strlen(pat) : n;
+  INIT_COMP;
 
   /* Remove end-of-line character ('$'). */
   if ((n > 0) && (pat[n - 1] == TRE_CHAR('$')))
@@ -408,6 +410,10 @@ tre_fastcomp(fastmatch_t *fg, const tre_
       fg->word = true;
     }
 
+  /* Cannot handle word boundaries with MB string */
+  if (fg->word && (MB_CUR_MAX > 1))
+    return REG_BADPAT;
+
   /* Look for ways to cheat...er...avoid the full regex engine. */
   for (unsigned int i = 0; i < n; i++)
     {
@@ -445,6 +451,34 @@ tre_fastcomp(fastmatch_t *fg, const tre_
   return REG_OK;
 }
 
+#define CHECK_WORD_BOUNDARY \
+  { \
+    bool bbound, ebound; \
+  \
+    switch (type) \
+      { \
+      case STR_WIDE: \
+        bbound = (j == 0) || !(tre_isalnum(str_wide[j - 1]) || \
+          (str_wide[j - 1] == TRE_CHAR('_'))); \
+        ebound = (j + fg->wlen == len) || \
+          !(tre_isalnum(str_wide[j + fg->wlen]) || \
+          (str_wide[j + fg->wlen] == TRE_CHAR('_'))); \
+        break; \
+      default: \
+        bbound = (j == 0) || !(tre_isalnum(str_byte[j - 1]) || \
+          (str_byte[j - 1] == '_')); \
+        ebound = (j + fg->len == len) || \
+          !(tre_isalnum(str_byte[j + fg->len]) || \
+          (str_byte[j + fg->len] == '_')); \
+      } \
+    if (!bbound || !ebound) \
+      { \
+        shift = 1; \
+        j += shift; \
+        continue; \
+      } \
+  }
+
 /*
  * Executes matching of the precompiled pattern on the input string.
  * Returns REG_OK or REG_NOMATCH depending on if we find a match or not.
@@ -485,6 +519,7 @@ tre_fastexec(const fastmatch_t *fg, cons
       shift = fg->len;
     }
 
+  /* XXX: Fix with word boundaries */
   /* Only try once at the beginning or ending of the line. */
   if (fg->bol || fg->eol)
     {
@@ -506,7 +541,7 @@ tre_fastexec(const fastmatch_t *fg, cons
     }
   else
     {
-      /* Quick Search algorithm. */
+      /* Quick Search / Turbo Boyer-Moore algorithm. */
       j = 0;
       do
         {
@@ -514,6 +549,8 @@ tre_fastexec(const fastmatch_t *fg, cons
           COMPARE;
           if (mismatch == REG_OK)
             {
+              if (fg->word)
+                CHECK_WORD_BOUNDARY;
               pmatch[0].rm_so = j;
               pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len);
               return REG_OK;

Modified: user/gabor/tre-integration/contrib/tre/lib/tre.h
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre.h Fri Aug 12 15:13:06 2011 (r224801)
+++ user/gabor/tre-integration/contrib/tre/lib/tre.h Fri Aug 12 16:17:15 2011 (r224802)
@@ -90,6 +90,7 @@ typedef enum {
 #define REG_UNGREEDY (REG_RIGHT_ASSOC << 1)
 #define REG_PEND (REG_UNGREEDY << 1)
 #define REG_GNU (REG_PEND << 1)
+#define REG_WORD (REG_GNU << 1)
 
 /* POSIX tre_regexec() flags. */
 #define REG_NOTBOL 1

Modified: user/gabor/tre-integration/include/regex.h
==============================================================================
--- user/gabor/tre-integration/include/regex.h Fri Aug 12 15:13:06 2011 (r224801)
+++ user/gabor/tre-integration/include/regex.h Fri Aug 12 16:17:15 2011 (r224802)
@@ -82,6 +82,7 @@ typedef enum {
 #define REG_UNGREEDY (REG_RIGHT_ASSOC << 1)
 #define REG_PEND (REG_UNGREEDY << 1)
 #define REG_GNU (REG_PEND << 1)
+#define REG_WORD (REG_GNU << 1)
 
 /* POSIX tre_regexec() flags. */
 #define REG_NOTBOL 1
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201108121617.p7CGHFVI072840>