Skip site navigation (1) | Skip section navigation (2)
Date:      Fri, 12 Aug 2011 16:17:15 +0000 (UTC)
From:      Gabor Kovesdan <gabor@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r224802 - in user/gabor/tre-integration: contrib/tre/lib include
Message-ID:  <201108121617.p7CGHFVI072840@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: gabor
Date: Fri Aug 12 16:17:15 2011
New Revision: 224802
URL: http://svn.freebsd.org/changeset/base/224802

Log:
  - Introduce new flag for word-boundary matching: REG_WORD
  - Partly recover broken word-boundary matching; rest is TODO
  - Macroify fastcomp() and fastcomp_literal() initialization code
  - Adjust a comment

Modified:
  user/gabor/tre-integration/contrib/tre/lib/fastmatch.c
  user/gabor/tre-integration/contrib/tre/lib/tre.h
  user/gabor/tre-integration/include/regex.h

Modified: user/gabor/tre-integration/contrib/tre/lib/fastmatch.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/fastmatch.c	Fri Aug 12 15:13:06 2011	(r224801)
+++ user/gabor/tre-integration/contrib/tre/lib/fastmatch.c	Fri Aug 12 16:17:15 2011	(r224802)
@@ -332,6 +332,18 @@ static int	fastcmp(const void *, const v
   memcpy(p, pat, l * sizeof(tre_char_t));				\
   p[l] = TRE_CHAR('\0');
 
+#define INIT_COMP							\
+  /* Initialize. */							\
+  memset(fg, 0, sizeof(*fg));						\
+  fg->icase = (cflags & REG_ICASE);					\
+  fg->word = (cflags & REG_WORD);					\
+									\
+  /* Cannot handle REG_ICASE with MB string */				\
+  if (fg->icase && (MB_CUR_MAX > 1))					\
+    return REG_BADPAT;							\
+									\
+  /* Calculate length if unspecified */					\
+  n = (n == 0) ? tre_strlen(pat) : n;
 
 /*
  * Returns: REG_OK on success, error code otherwise
@@ -340,12 +352,10 @@ int
 tre_fastcomp_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n,
 		     int cflags)
 {
-  /* Initialize. */
-  memset(fg, 0, sizeof(*fg));
-  fg->icase = (cflags & REG_ICASE);
+  INIT_COMP;
 
-  /* Cannot handle REG_ICASE with MB string */
-  if (fg->icase && (MB_CUR_MAX > 1))
+  /* Cannot handle word boundaries with MB string */
+  if (fg->word && (MB_CUR_MAX > 1))
     return REG_BADPAT;
 
 #ifdef TRE_WCHAR
@@ -372,15 +382,7 @@ int
 tre_fastcomp(fastmatch_t *fg, const tre_char_t *pat, size_t n,
 	     int cflags)
 {
-  /* Initialize. */
-  memset(fg, 0, sizeof(*fg));
-  fg->icase = (cflags & REG_ICASE);
-
-  /* Cannot handle REG_ICASE with MB string */
-  if (fg->icase && (MB_CUR_MAX > 1))
-    return REG_BADPAT;
-
-  n = (n == 0) ? tre_strlen(pat) : n;
+  INIT_COMP;
 
   /* Remove end-of-line character ('$'). */
   if ((n > 0) && (pat[n - 1] == TRE_CHAR('$')))
@@ -408,6 +410,10 @@ tre_fastcomp(fastmatch_t *fg, const tre_
       fg->word = true;
     }
 
+  /* Cannot handle word boundaries with MB string */
+  if (fg->word && (MB_CUR_MAX > 1))
+    return REG_BADPAT;
+
   /* Look for ways to cheat...er...avoid the full regex engine. */
   for (unsigned int i = 0; i < n; i++)
     {
@@ -445,6 +451,34 @@ tre_fastcomp(fastmatch_t *fg, const tre_
   return REG_OK;
 }
 
+#define CHECK_WORD_BOUNDARY						\
+  {									\
+    bool bbound, ebound;						\
+									\
+    switch (type)							\
+      {									\
+	case STR_WIDE:							\
+	  bbound = (j == 0) || !(tre_isalnum(str_wide[j - 1]) ||	\
+	    (str_wide[j - 1] == TRE_CHAR('_')));			\
+	  ebound = (j + fg->wlen == len) ||				\
+	     !(tre_isalnum(str_wide[j + fg->wlen]) ||			\
+	     (str_wide[j + fg->wlen] == TRE_CHAR('_')));		\
+	  break;							\
+	default:							\
+	  bbound = (j == 0) || !(tre_isalnum(str_byte[j - 1]) ||	\
+	    (str_byte[j - 1] == '_'));					\
+	  ebound = (j + fg->len == len) ||				\
+	    !(tre_isalnum(str_byte[j + fg->len]) ||			\
+	    (str_byte[j + fg->len] == '_'));				\
+      }									\
+    if (!bbound || !ebound)						\
+      {									\
+	shift = 1;							\
+	j += shift;							\
+	continue;							\
+      }									\
+  }
+
 /*
  * Executes matching of the precompiled pattern on the input string.
  * Returns REG_OK or REG_NOMATCH depending on if we find a match or not.
@@ -485,6 +519,7 @@ tre_fastexec(const fastmatch_t *fg, cons
 	shift = fg->len;
     }
 
+  /* XXX: Fix with word boundaries */
   /* Only try once at the beginning or ending of the line. */
   if (fg->bol || fg->eol)
     {
@@ -506,7 +541,7 @@ tre_fastexec(const fastmatch_t *fg, cons
     }
   else
     {
-      /* Quick Search algorithm. */
+      /* Quick Search / Turbo Boyer-Moore algorithm. */
       j = 0;
       do
 	{
@@ -514,6 +549,8 @@ tre_fastexec(const fastmatch_t *fg, cons
 	  COMPARE;
 	  if (mismatch == REG_OK)
 	    {
+	      if (fg->word)
+		CHECK_WORD_BOUNDARY;
 	      pmatch[0].rm_so = j;
 	      pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len);
 	      return REG_OK;

Modified: user/gabor/tre-integration/contrib/tre/lib/tre.h
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre.h	Fri Aug 12 15:13:06 2011	(r224801)
+++ user/gabor/tre-integration/contrib/tre/lib/tre.h	Fri Aug 12 16:17:15 2011	(r224802)
@@ -90,6 +90,7 @@ typedef enum {
 #define REG_UNGREEDY    (REG_RIGHT_ASSOC << 1)
 #define REG_PEND        (REG_UNGREEDY << 1)
 #define REG_GNU		(REG_PEND << 1)
+#define REG_WORD	(REG_GNU << 1)
 
 /* POSIX tre_regexec() flags. */
 #define REG_NOTBOL 1

Modified: user/gabor/tre-integration/include/regex.h
==============================================================================
--- user/gabor/tre-integration/include/regex.h	Fri Aug 12 15:13:06 2011	(r224801)
+++ user/gabor/tre-integration/include/regex.h	Fri Aug 12 16:17:15 2011	(r224802)
@@ -82,6 +82,7 @@ typedef enum {
 #define REG_UNGREEDY    (REG_RIGHT_ASSOC << 1)
 #define REG_PEND	(REG_UNGREEDY << 1)
 #define REG_GNU         (REG_PEND << 1)
+#define REG_WORD	(REG_GNU << 1)
 
 /* POSIX tre_regexec() flags. */
 #define REG_NOTBOL 1



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201108121617.p7CGHFVI072840>