Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 9 Oct 2011 21:39:14 +0000 (UTC)
From:      Gabor Kovesdan <gabor@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r226177 - user/gabor/tre-integration/contrib/tre/lib
Message-ID:  <201110092139.p99LdEeR027832@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: gabor
Date: Sun Oct  9 21:39:14 2011
New Revision: 226177
URL: http://svn.freebsd.org/changeset/base/226177

Log:
  - Rework heuristics to try to use a better maximum shift if possible

Modified:
  user/gabor/tre-integration/contrib/tre/lib/regexec.c
  user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c
  user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h

Modified: user/gabor/tre-integration/contrib/tre/lib/regexec.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/regexec.c	Sun Oct  9 21:36:14 2011	(r226176)
+++ user/gabor/tre-integration/contrib/tre/lib/regexec.c	Sun Oct  9 21:39:14 2011	(r226177)
@@ -187,7 +187,7 @@ tre_match(const tre_tnfa_t *tnfa, const 
   if ((heur != NULL) && (type != STR_USER))
     {
       int ret;
-      size_t st = 0, n;
+      size_t st = 0, i = 1, n;
       const char *data_byte = string;
       const tre_char_t *data_wide = string;
 
@@ -198,52 +198,50 @@ tre_match(const tre_tnfa_t *tnfa, const 
 	{
 	  SEEK_TO(st);
 
-	  /* Look for the beginning of possibly matching text. */
-	  ret = tre_match_fast(heur->start, string, len - st, type, nmatch,
+	  /* Prefix heuristic */
+	  ret = tre_match_fast(heur->heurs[0], string, len - st, type, nmatch,
 			       pmatch, eflags);
 	  if (ret != REG_OK)
 	    return ret;
 	  st += pmatch[0].rm_so;
 	  n = pmatch[0].rm_eo;
 
-	  /*
-	   * When having a fixed-length pattern there is only
-	   * one heuristic.
-	   */
-	  if (heur->end == NULL)
+	  /* Intermediate heuristics */
+	  while (!((heur->heurs[i] == NULL) ||
+		(heur->prefix && heur->heurs[i + 1] == NULL)))
 	    {
-	      SEEK_TO(st);
-
-	      DPRINT(("tre_match: calling NFA with offsets [%u/%u]\n",
-		     st, heur->prefix ? len : n + st));
+	      SEEK_TO(st + n);
+	      ret = tre_match_fast(heur->heurs[i], string, len - st - n, type,
+				   nmatch, pmatch, eflags);
+	      if (ret != REG_OK)
+		return ret;
+	      n += pmatch[0].rm_eo;
+	      i++;
+	    }
 
-	      ret = tre_match(tnfa, string,
-			      heur->prefix ? (len - st) :
-			      n, type, nmatch,
-			      pmatch, eflags, NULL, NULL);
+	  /* Suffix heuristic available */
+	  if (heur->prefix && heur->heurs[i] != NULL)
+	    {
+	      SEEK_TO(st + n);
+	      ret = tre_match_fast(heur->heurs[i], string, len - st - n, type,
+				   nmatch, pmatch, eflags);
+	      if (ret != REG_OK)
+		return ret;
+	      n += pmatch[0].rm_eo;
 
+	      SEEK_TO(st);
+	      ret = tre_match(tnfa, string, n, type, nmatch, pmatch,
+			      eflags, NULL, NULL);
+	      FIX_OFFSETS;
+	    }
+	  /* Suffix heuristic not available */
+	  else
+	    {
+	      SEEK_TO(st);
+	      ret = tre_match(tnfa, string, len - st, type, nmatch, pmatch,
+			      eflags, NULL, NULL);
 	      FIX_OFFSETS;
 	    }
-
-	  SEEK_TO(st + n);
-
-	  /* Look for the end of possibly matching text. */
-	  ret = tre_match_fast(heur->end, string, len - st - n, type,
-			       nmatch, pmatch, eflags);
-	  n += pmatch[0].rm_eo;
-
-	  if (ret != REG_OK)
-	    return ret;
-
-	  SEEK_TO(st);
-
-	  DPRINT(("tre_match: calling NFA with offsets [%u/%u]\n",
-		 st, st + n));
-
-	  ret = tre_match(tnfa, string, n,
-			  type, nmatch, pmatch, eflags, NULL, NULL);
-
-	  FIX_OFFSETS;
 	}
     }
 

Modified: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c	Sun Oct  9 21:36:14 2011	(r226176)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c	Sun Oct  9 21:39:14 2011	(r226177)
@@ -54,6 +54,8 @@
  * performance impact.
  */
 
+#define MAX_FRAGMENTS 32
+
 /*
  * Parses bracket expression seeking to the end of the enclosed text.
  * The parameters are the opening (oe) and closing elements (ce).
@@ -122,18 +124,15 @@
 int
 tre_compile_heur(heur_t *h, const tre_char_t *regex, size_t len, int cflags)
 {
-  tre_char_t *heur;
-  int st = 0, pos = 0;
+  tre_char_t *arr[MAX_FRAGMENTS], *heur;
+  size_t length[MAX_FRAGMENTS];
+  int errcode, j = 0, pos = 0, st = 0;
   bool escaped = false;
-  int errcode, ret;
 
-  /* Temporary space, len will be enough. */
-  heur = xmalloc(len);
+  heur = xmalloc(len * sizeof(tre_char_t));
   if (!heur)
     return REG_ESPACE;
 
-  memset(h, 0, sizeof(*h));
-
   while (true)
     {
 
@@ -274,7 +273,7 @@ tre_compile_heur(heur_t *h, const tre_ch
 		if ((cflags & REG_EXTENDED) && !escaped)
 		  {
 		    errcode = REG_BADPAT;
-		    goto badpat2;
+		    goto err;
 		  }
 		else if (!(cflags & REG_EXTENDED) && escaped)
 		  END_SEGMENT;
@@ -309,87 +308,102 @@ tre_compile_heur(heur_t *h, const tre_ch
  
 end_segment:
 
-      /* If it is not initialized yet, then we just got the first segment. */
-      if (h->start == NULL)
+      if (st == len && pos == 0)
 	{
-
-	  /*
-	   * An empty or a one-char prefix segment is useless,
-	   * better to just fail.
-	   */
-	  if (pos <= 1)
-	    {
-	      errcode = REG_BADPAT;
-	      DPRINT("tre_compile_heur: pattern does not have a "
-		     " fixed-length prefix that is long enough\n");
-	      goto badpat1;
-	    }
-
-	  h->start = xmalloc(sizeof(fastmatch_t));
-	  if (!h->start)
-	    {
-	      errcode = REG_ESPACE;
-	      goto space1;
-	    }
-
-	  ret = tre_compile_fast(h->start, heur, pos, 0);
-	  if (ret != REG_OK)
+	  if (j == 0)
 	    {
 	      errcode = REG_BADPAT;
-	      goto badpat2;
+	      goto err;
 	    }
-	  DPRINT(("tre_compile_heur: fixed-length prefix is %s\n",
-		 h->start->pattern));
+	  h->prefix = true;
+	  goto ok;
 	}
 
-      /*
-       * If true, this is the last segment. We do not care about the
-       * middle ones.
-       */
-      else if (st == len)
+      if (j == MAX_FRAGMENTS)
 	{
-
-	  /* If empty, we only have a prefix heuristic. */
-	  if (pos == 0)
-	    {
-	      h->prefix = true;
-	      errcode = REG_OK;
-	      DPRINT("tre-compile_heur: using only a fixed-length prefix; "
-		     "no fixed-length suffix is available\n");
-	      goto ok;
-	    }
-
-	  h->end = xmalloc(sizeof(fastmatch_t));
-	  if (!h->end)
-	    {
-	      errcode = REG_ESPACE;
-	      goto space2;
-	    }
-	    
-	  ret = tre_compile_fast(h->end, heur, pos, 0);
-	  if (ret != REG_OK)
-	    {
-	      xfree(h->end);
-	      h->prefix = true;
-	    }
-	  errcode = REG_OK;
-	  DPRINT(("tre_compile_heur: fixed-length suffix is %s\n",
-		 h->end->pattern));
-	  goto ok;
+	  errcode = REG_BADPAT;
+	  goto err;
 	}
 
-      /* Just drop middle segments by overwriting the temporary space. */
+      arr[j] = xmalloc((pos + 1) * sizeof(tre_char_t));
+      if (!arr[j])
+	{
+	  errcode = REG_ESPACE;
+	  goto err;
+	}
+      memcpy(&arr[j], &heur, pos);
+      arr[j][pos] = TRE_CHAR('\0');
+      length[j] = pos;
+      j++;
       pos = 0;
     }
 
-badpat2:
-space2:
-  if (h->start != NULL)
-    xfree(h->start);
-badpat1:
-space1:
-  DPRINT("tre_compile_heur: compiling a heuristic failed\n");
 ok:
+
+  {
+    size_t m = 1;
+    int ret;
+
+    for(int i = 1; i < j; i++)
+      m = (length[i] > length[m]) ? i : m;
+
+    if (!h->heurs)
+      {
+	errcode = REG_ESPACE;
+	goto err;
+      }
+
+    for (int i = 0; i < MIN(3, j + 1); i++)
+      {
+	h->heurs[i] = xmalloc(sizeof(fastmatch_t));
+	if (!h->heurs[i])
+	  {
+	    errcode = REG_ESPACE;
+	    goto err;
+	  }
+      }
+
+#define CHECK_ERR
+  if (ret != REG_OK)
+    {
+      errcode = REG_BADPAT;
+      goto err2;
+    }
+
+    ret = tre_compile_fast(h->heurs[0], arr[0], length[0], 0);
+    CHECK_ERR
+    if (j == 1)
+      {
+	xfree(h->heurs[1]);
+	h->heurs[1] = NULL;
+	goto finish;
+      }
+    else
+      ret = tre_compile_fast(h->heurs[1], arr[m], length[m], 0);
+    CHECK_ERR
+    if (h->prefix)
+      {
+	xfree(h->heurs[2]);
+	h->heurs[2] = NULL;
+	goto finish;
+      }
+    else
+      ret = tre_compile_fast(h->heurs[2], arr[j - 1], length[j - 1], 0);
+    CHECK_ERR
+    h->heurs[3] = NULL;
+
+    errcode = REG_OK;
+    goto finish;
+  }
+
+err2:
+  for (int i = 0; h->heurs[i] != NULL; i++)
+    tre_free_fast(h->heurs[i]);
+  xfree(h->heurs);
+err:
+finish:
+  for (int i = 0; i < j; i++)
+    xfree(arr[i]);
   xfree(heur);
   return errcode;
 }
@@ -400,10 +414,8 @@ ok:
 void
 tre_free_heur(heur_t *h)
 {
-  if (h->start != NULL)
-    xfree(h->start);
-  if (h->end != NULL)
-    xfree(h->end);
+  for (int i = 0; h->heurs[i] != NULL; i++)
+    tre_free_fast(h->heurs[i]);
 
   DPRINT("tre_free_heur: resources are freed\n");
 }

Modified: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h	Sun Oct  9 21:36:14 2011	(r226176)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h	Sun Oct  9 21:39:14 2011	(r226177)
@@ -8,12 +8,11 @@
 #include "tre-internal.h"
 
 typedef struct {
-  fastmatch_t *start;
-  fastmatch_t *end;
+  fastmatch_t *heurs[4];
   bool prefix;
+  bool newline;
 } heur_t;
 
-
 extern int tre_compile_heur(heur_t *h, const tre_char_t *regex,
 			    size_t len, int cflags);
 extern void tre_free_heur(heur_t *h);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201110092139.p99LdEeR027832>