Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 7 Sep 2011 13:01:26 +0000 (UTC)
From:      Gabor Kovesdan <gabor@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r225435 - user/gabor/grep/trunk/regex
Message-ID:  <201109071301.p87D1Q5r039697@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: gabor
Date: Wed Sep  7 13:01:26 2011
New Revision: 225435
URL: http://svn.freebsd.org/changeset/base/225435

Log:
  - Merge some fixes from TRE
  - Silence some warnings

Modified:
  user/gabor/grep/trunk/regex/fastmatch.h
  user/gabor/grep/trunk/regex/hashtable.c
  user/gabor/grep/trunk/regex/tre-fastmatch.c

Modified: user/gabor/grep/trunk/regex/fastmatch.h
==============================================================================
--- user/gabor/grep/trunk/regex/fastmatch.h	Wed Sep  7 12:53:18 2011	(r225434)
+++ user/gabor/grep/trunk/regex/fastmatch.h	Wed Sep  7 13:01:26 2011	(r225435)
@@ -12,10 +12,12 @@ typedef struct {
   size_t	 wlen;
   size_t	 len;
   wchar_t	*wpattern;
+  bool		*wescmap;
   int		 hasdot;
   int		 qsBc[UCHAR_MAX + 1];
   int		*bmGs;
   char		*pattern;
+  bool		*escmap;
   int		 defBc;
   void		*qsBc_table;
   int		*sbmGs;

Modified: user/gabor/grep/trunk/regex/hashtable.c
==============================================================================
--- user/gabor/grep/trunk/regex/hashtable.c	Wed Sep  7 12:53:18 2011	(r225434)
+++ user/gabor/grep/trunk/regex/hashtable.c	Wed Sep  7 13:01:26 2011	(r225435)
@@ -27,6 +27,7 @@
 #include "glue.h"
 
 #include <errno.h>
+#include <inttypes.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -60,9 +61,8 @@ hashtable
 {
   hashtable *tbl;
 
-  DPRINT(("hashtable_init: table_size %lu, key_size %lu, value_size %lu\n",
-	(unsigned long)table_size, (unsigned long)key_size,
-	(unsigned long)value_size));
+  DPRINT(("hashtable_init: table_size %zu, key_size %zu, value_size %zu\n",
+	  table_size, key_size, value_size));
 
   tbl = malloc(sizeof(hashtable));
   if (tbl == NULL)
@@ -111,7 +111,7 @@ hashtable_put(hashtable *tbl, const void
     }
 
   hash = hash32_buf(key, tbl->key_size, hash) % tbl->table_size;
-  DPRINT(("hashtable_put: calculated hash %lu\n", hash));
+  DPRINT(("hashtable_put: calculated hash %" PRIu32 "\n", hash));
 
   /*
    * On hash collision entries are inserted at the next free space,
@@ -125,15 +125,15 @@ hashtable_put(hashtable *tbl, const void
       else if (memcmp(tbl->entries[hash]->key, key, tbl->key_size) == 0)
 	{
 	  memcpy(tbl->entries[hash]->value, value, tbl->value_size);
-	  DPRINT(("hashtable_put: effective location is %lu, "
-		  "entry updated\n", hash));
+	  DPRINT(("hashtable_put: effective location is %" PRIu32
+		  ", entry updated\n", hash));
 	  return (HASH_UPDATED);
 	}
       if (++hash == tbl->table_size)
 	hash = 0;
     }
 
-  DPRINT(("hashtable_put: effective location is %lu\n", hash));
+  DPRINT(("hashtable_put: effective location is %" PRIu32 "\n", hash));
 
   tbl->entries[hash] = malloc(sizeof(hashtable_entry));
   if (tbl->entries[hash] == NULL)
@@ -186,7 +186,7 @@ static hashtable_entry
 	return (NULL);
       else if (memcmp(key, tbl->entries[hash]->key, tbl->key_size) == 0)
 	{
-	  DPRINT(("hashtable_lookup: entry found at location %lu\n", hash));
+	  DPRINT(("hashtable_lookup: entry found at location %" PRIu32 "\n", hash));
 	  return (&tbl->entries[hash]);
 	}
 

Modified: user/gabor/grep/trunk/regex/tre-fastmatch.c
==============================================================================
--- user/gabor/grep/trunk/regex/tre-fastmatch.c	Wed Sep  7 12:53:18 2011	(r225434)
+++ user/gabor/grep/trunk/regex/tre-fastmatch.c	Wed Sep  7 13:01:26 2011	(r225435)
@@ -42,7 +42,7 @@
 #include "tre-fastmatch.h"
 #include "xmalloc.h"
 
-static int	fastcmp(const void *, const void *, size_t,
+static int	fastcmp(const void *, const bool *, const void *, size_t,
 			tre_str_type_t, bool, bool);
 
 #define FAIL_COMP(errcode)						\
@@ -102,11 +102,13 @@ static int	fastcmp(const void *, const v
   switch (type)								\
     {									\
       case STR_WIDE:							\
-	mismatch = fastcmp(fg->wpattern, startptr, fg->wlen, type,	\
+	mismatch = fastcmp(fg->wpattern, fg->wescmap, startptr,		\
+			   fg->wlen, type,				\
 			   fg->icase, fg->newline);			\
 	break;								\
       default:								\
-	mismatch = fastcmp(fg->pattern, startptr, fg->len, type,	\
+	mismatch = fastcmp(fg->pattern, fg->escmap, startptr,		\
+			   fg->len, type,				\
 			   fg->icase, fg->newline);			\
       }									\
 
@@ -206,15 +208,16 @@ static int	fastcmp(const void *, const v
     fg->qsBc[i] = fg->len - fg->hasdot;					\
   for (int i = fg->hasdot + 1; i < fg->len; i++)			\
     {									\
-      fg->qsBc[fg->pattern[i]] = fg->len - i;				\
-      DPRINT(("BC shift for char %c is %d\n", fg->pattern[i],		\
+      fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i;		\
+      DPRINT(("BC shift for char %c is %zu\n", fg->pattern[i],		\
 	     fg->len - i));						\
       if (fg->icase)							\
         {								\
-          char c = islower(fg->pattern[i]) ? toupper(fg->pattern[i])	\
-            : tolower(fg->pattern[i]);					\
-          fg->qsBc[c] = fg->len - i;					\
-	  DPRINT(("BC shift for char %c is %d\n", c, fg->len - i));	\
+          char c = islower((unsigned char)fg->pattern[i]) ?		\
+		   toupper((unsigned char)fg->pattern[i]) :		\
+		   tolower((unsigned char)fg->pattern[i]);		\
+          fg->qsBc[(unsigned char)c] = fg->len - i;			\
+	  DPRINT(("BC shift for char %c is %zu\n", c, fg->len - i));	\
         }								\
     }
 
@@ -244,7 +247,7 @@ static int	fastcmp(const void *, const v
       r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);		\
       if ((r == HASH_FAIL) || (r == HASH_FULL))				\
 	FAIL_COMP(REG_ESPACE);						\
-      DPRINT(("BC shift for wide char %lc is %d\n", fg->wpattern[i],	\
+      DPRINT(("BC shift for wide char %lc is %zu\n", fg->wpattern[i],	\
 	     fg->wlen - i));						\
       if (fg->icase)							\
 	{								\
@@ -253,11 +256,20 @@ static int	fastcmp(const void *, const v
 	  r = hashtable_put(fg->qsBc_table, &wc, &k);			\
 	  if ((r == HASH_FAIL) || (r == HASH_FULL))			\
 	    FAIL_COMP(REG_ESPACE);					\
-	  DPRINT(("BC shift for wide char %lc is %d\n", wc,		\
+	  DPRINT(("BC shift for wide char %lc is %zu\n", wc,		\
 		 fg->wlen - i));					\
 	}								\
     }
 
+#ifdef _GREP_DEBUG
+#define DPRINT_BMGS(len, fmt_str, sh)					\
+  for (int i = 0; i < len; i++)						\
+    DPRINT((fmt_str, i, sh[i]));
+#else
+#define DPRINT_BMGS(len, fmt_str, sh)					\
+  do { } while(/*CONSTCOND*/0)
+#endif
+
 /*
  * Fills in the good suffix table for SB/MB strings.
  */
@@ -271,6 +283,7 @@ static int	fastcmp(const void *, const v
 	fg->sbmGs[0] = 1;						\
       else								\
 	_FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false);		\
+      DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs);	\
     }
 
 /*
@@ -286,6 +299,8 @@ static int	fastcmp(const void *, const v
 	fg->bmGs[0] = 1;						\
       else								\
 	_FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true);		\
+      DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n",	\
+		  fg->bmGs);						\
     }
 
 #define _FILL_BMGS(arr, pat, plen, wide)				\
@@ -428,7 +443,7 @@ tre_compile_literal(fastmatch_t *fg, con
   SAVE_PATTERN(pat, n, fg->pattern, fg->len);
 #endif
 
-  DPRINT(("tre_compile_literal: pattern: %s, len %u, icase: %c, word: %c, "
+  DPRINT(("tre_compile_literal: pattern: %s, len %zu, icase: %c, word: %c, "
 	 "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n',
 	 fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n'));
 
@@ -452,6 +467,7 @@ tre_compile_fast(fastmatch_t *fg, const 
   tre_char_t *tmp;
   size_t pos = 0;
   bool escaped = false;
+  bool *_escmap = NULL;
 
   INIT_COMP;
 
@@ -490,111 +506,101 @@ tre_compile_fast(fastmatch_t *fg, const 
       continue;								\
     } while (0)
 
-  /*
-   * Used for heuristic, only beginning ^, trailing $ and . are treated
-   * as special.
-   */
-  if (cflags & _REG_HEUR)
+  for (int i = 0; i < n; i++)
     {
-      for (int i = 0; i < n; i++)
-	switch (pat[i])
-	  {
-	    case TRE_CHAR('.'):
-	      fg->hasdot = i;
+      switch (pat[i])
+	{
+	  case TRE_CHAR('\\'):
+	    if (escaped)
 	      STORE_CHAR;
-	      break;
-	    case TRE_CHAR('$'):
-	      if (i == n - 1)
-		fg->eol = true;
-	      else
-		STORE_CHAR;
-	      break;
-	    default:
+	    else
+	      escaped = true;
+	    continue;
+	  case TRE_CHAR('['):
+	    if (escaped)
 	      STORE_CHAR;
-	  }
-    }
-  else
-    for (int i = 0; i < n; i++)
-      {
-	switch (pat[i])
-	  {
-	    case TRE_CHAR('\\'):
-	      if (escaped)
-		STORE_CHAR;
-	      else
-		escaped = true;
-	      break;
-	    case TRE_CHAR('['):
-	      if (escaped)
-		STORE_CHAR;
-	      else
-		goto badpat;
-	      break;
-	    case TRE_CHAR('*'):
-	      if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
-		STORE_CHAR;
-	      else
-		goto badpat;
-	      break;
-	    case TRE_CHAR('+'):
-	    case TRE_CHAR('?'):
-	      if ((cflags & REG_EXTENDED) && (i == 0))
-		continue;
-	      else if ((cflags & REG_EXTENDED) ^ !escaped)
-		STORE_CHAR;
-	      else
-		goto badpat;
-	    case TRE_CHAR('.'):
-	      if (escaped)
-		goto badpat;
-	      else
-		{
-		  fg->hasdot = true;
-		  STORE_CHAR;
-		}
-	      break;
-	    case TRE_CHAR('^'):
+	    else
+	      goto badpat;
+	    continue;
+	  case TRE_CHAR('*'):
+	    if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
 	      STORE_CHAR;
-	      break;
-	    case TRE_CHAR('$'):
-	      if (!escaped && (i == n - 1))
-		fg->eol = true;
-	      else
-		STORE_CHAR;
-	      break;
-	    case TRE_CHAR('('):
-	      if ((cflags & REG_EXTENDED) ^ escaped)
-		goto badpat;
-	      else
-		STORE_CHAR;
-	      break;
-	    case TRE_CHAR('{'):
-	      if (escaped && (i == 0))
-		STORE_CHAR;
-	      else if (!(cflags & REG_EXTENDED) && (i == 0))
-		STORE_CHAR;
-	      else if ((cflags & REG_EXTENDED) && (i == 0))
-		continue;
-	      else
-		goto badpat;
-	      break;
-	    case TRE_CHAR('|'):
-	      if ((cflags & REG_EXTENDED) ^ (!escaped))
-		goto badpat;
-	      else
-		STORE_CHAR;
-	      break;
-	    default:
-	      if (escaped)
-		goto badpat;
-	      else
+	    else
+	      goto badpat;
+	    continue;
+	  case TRE_CHAR('+'):
+	  case TRE_CHAR('?'):
+	    if ((cflags & REG_EXTENDED) && (i == 0))
+	      continue;
+	    else if ((cflags & REG_EXTENDED) ^ !escaped)
+	      STORE_CHAR;
+	    else
+	      goto badpat;
+	    continue;
+	  case TRE_CHAR('.'):
+	    if (escaped)
+	      {
+		if (!_escmap)
+		  _escmap = xmalloc(n * sizeof(bool));
+		if (!_escmap)
+		  {
+		    xfree(tmp);
+		    return REG_ESPACE;
+		  }
+		_escmap[i] = true;
+		STORE_CHAR;
+	      }
+	    else
+	      {
+		fg->hasdot = i;
 		STORE_CHAR;
-	  }
-	continue;
+	      }
+	    continue;
+	  case TRE_CHAR('^'):
+	    STORE_CHAR;
+	    continue;
+	  case TRE_CHAR('$'):
+	    if (!escaped && (i == n - 1))
+	      fg->eol = true;
+	    else
+	      STORE_CHAR;
+	    continue;
+	  case TRE_CHAR('('):
+	    if ((cflags & REG_EXTENDED) ^ escaped)
+	      goto badpat;
+	    else
+	      STORE_CHAR;
+	    continue;
+	  case TRE_CHAR('{'):
+	    if (!(cflags & REG_EXTENDED) ^ escaped)
+	      STORE_CHAR;
+	    else if (!(cflags & REG_EXTENDED) && (i == 0))
+	      STORE_CHAR;
+	    else if ((cflags & REG_EXTENDED) && (i == 0))
+	      continue;
+	    else
+	      goto badpat;
+	    continue;
+	  case TRE_CHAR('|'):
+	    if ((cflags & REG_EXTENDED) ^ escaped)
+	      goto badpat;
+	    else
+	      STORE_CHAR;
+	    continue;
+	  default:
+	    if (escaped)
+	      goto badpat;
+	    else
+	      STORE_CHAR;
+	    continue;
+	}
+      continue;
 badpat:
-	xfree(tmp);
-	return REG_BADPAT;
-      }
+      xfree(tmp);
+      DPRINT(("tre_compile_fast: compilation of pattern failed, falling"
+	      "back to NFA\n"));
+      return REG_BADPAT;
+    }
 
   /*
    * The pattern has been processed and copied to tmp as a literal string
@@ -603,14 +609,38 @@ badpat:
    */
 #ifdef TRE_WCHAR
   SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen);
+  fg->wescmap = _escmap;
   STORE_MBS_PAT;
+  if (fg->wescmap != NULL)
+    {
+      bool escaped = false;
+
+      fg->escmap = xmalloc(fg->len * sizeof(bool));
+      if (!fg->escmap)
+	{
+	  tre_free_fast(fg);
+	  return REG_ESPACE;
+	}
+
+      for (int i = 0; i < fg->len; i++)
+	if (fg->pattern[i] == '\\')
+	  escaped = ! escaped;
+	else if (fg->pattern[i] == '.' && escaped)
+	  {
+	    fg->escmap[i] = true;
+	    escaped = false;
+	  }
+	else
+	  escaped = false;
+    }
 #else
   SAVE_PATTERN(tmp, pos, fg->pattern, fg->len);
+  fg->escmap = _escmap;
 #endif
 
   xfree(tmp);
 
-  DPRINT(("tre_compile_fast: pattern: %s, len %u, bol %c, eol %c, "
+  DPRINT(("tre_compile_fast: pattern: %s, len %zu, bol %c, eol %c, "
 	 "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len,
 	 fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n',
 	 fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n',
@@ -703,7 +733,7 @@ tre_match_fast(const fastmatch_t *fg, co
   const tre_char_t *str_wide = data;
 
   /* Calculate length if unspecified. */
-  if (len == (unsigned)-1)
+  if (len == (size_t)-1)
     switch (type)
       {
 	case STR_WIDE:
@@ -822,10 +852,14 @@ tre_free_fast(fastmatch_t *fg)
   hashtable_free(fg->qsBc_table);
   if (!fg->hasdot)
     xfree(fg->bmGs);
+  if (fg->wescmap)
+    xfree(fg->wescmap);
   xfree(fg->wpattern);
 #endif
   if (!fg->hasdot)
     xfree(fg->sbmGs);
+  if (fg->escmap)
+    xfree(fg->escmap);
   xfree(fg->pattern);
 }
 
@@ -835,7 +869,7 @@ tre_free_fast(fastmatch_t *fg)
  *		REG_OK on success
  */
 static inline int
-fastcmp(const void *pat, const void *data, size_t len,
+fastcmp(const void *pat, const bool *escmap, const void *data, size_t len,
 	tre_str_type_t type, bool icase, bool newline)
 {
   const char *str_byte = data;
@@ -851,7 +885,7 @@ fastcmp(const void *pat, const void *dat
 	case STR_WIDE:
 
 	  /* Check dot */
-	  if (pat_wide[i] == TRE_CHAR('.') &&
+	  if (pat_wide[i] == TRE_CHAR('.') && (!escmap || !escmap[i]) &&
 	      (!newline || (str_wide[i] != TRE_CHAR('\n'))))
 	    continue;
 
@@ -862,7 +896,7 @@ fastcmp(const void *pat, const void *dat
 	  break;
 	default:
 	  /* Check dot */
-	  if (pat_byte[i] == '.' &&
+	  if (pat_byte[i] == '.' && (!escmap || !escmap[i]) &&
 	      (!newline || (str_byte[i] != '\n')))
 	    continue;
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201109071301.p87D1Q5r039697>