Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 21 Aug 2011 00:51:43 +0000 (UTC)
From:      Gabor Kovesdan <gabor@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r225051 - in user/gabor/tre-integration: contrib/tre/lib include lib/libc/regex
Message-ID:  <201108210051.p7L0phFm075623@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: gabor
Date: Sun Aug 21 00:51:42 2011
New Revision: 225051
URL: http://svn.freebsd.org/changeset/base/225051

Log:
  - Add the heuristic matching code. Currently
    * it only supports BRE
    * the heuristic construction is limited because the fast matching
      algorithm is also limited and only allows simple heuristic patterns
    * it has a bug and fills in incorrectly the matching offsets
  - Even with the above limitations, some searches only require half of the
    time with BSD grep.

Added:
  user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c   (contents, props changed)
  user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h   (contents, props changed)
Modified:
  user/gabor/tre-integration/contrib/tre/lib/regcomp.c
  user/gabor/tre-integration/contrib/tre/lib/regexec.c
  user/gabor/tre-integration/contrib/tre/lib/tre-compile.c
  user/gabor/tre-integration/include/regex.h
  user/gabor/tre-integration/lib/libc/regex/Makefile.inc

Modified: user/gabor/tre-integration/contrib/tre/lib/regcomp.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/regcomp.c	Sun Aug 21 00:48:03 2011	(r225050)
+++ user/gabor/tre-integration/contrib/tre/lib/regcomp.c	Sun Aug 21 00:51:42 2011	(r225051)
@@ -16,6 +16,7 @@
 #include <stdlib.h>
 
 #include "tre-fastmatch.h"
+#include "tre-heuristic.h"
 #include "tre-internal.h"
 #include "xmalloc.h"
 
@@ -143,7 +144,15 @@ void
 tre_regfree(regex_t *preg)
 {
   if (preg->shortcut != NULL)
-    tre_free_fast(preg->shortcut);
+    {
+      tre_free_fast(preg->shortcut);
+      xfree(preg->shortcut);
+    }
+  if (preg->heur != NULL)
+    {
+      tre_free_heur(preg->heur);
+      xfree(preg->heur);
+    }
   tre_free(preg);
 }
 

Modified: user/gabor/tre-integration/contrib/tre/lib/regexec.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/regexec.c	Sun Aug 21 00:48:03 2011	(r225050)
+++ user/gabor/tre-integration/contrib/tre/lib/regexec.c	Sun Aug 21 00:51:42 2011	(r225051)
@@ -46,6 +46,7 @@ char *alloca ();
 #include <limits.h>
 
 #include "tre-fastmatch.h"
+#include "tre-heuristic.h"
 #include "tre-internal.h"
 #include "xmalloc.h"
 
@@ -151,14 +152,53 @@ tre_have_approx(const regex_t *preg)
 static int
 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
 	  tre_str_type_t type, size_t nmatch, regmatch_t pmatch[],
-	  int eflags, fastmatch_t *shortcut)
+	  int eflags, fastmatch_t *shortcut, heur_t *heur)
 {
   reg_errcode_t status;
   int *tags = NULL, eo;
 
-  /* Check if we can cheat with a faster algorithm */
+  /* Check if we can cheat with a faster algorithm. */
   if (shortcut != NULL)
-    return tre_match_fast(shortcut, string, len, type, nmatch, pmatch, eflags);
+    return tre_match_fast(shortcut, string, len, type, nmatch,
+			  pmatch, eflags);
+
+  /* Check if we have a heuristic to speed up the search. */
+  if (heur != NULL)
+    {
+      int ret;
+      size_t st = 0, j;
+      const char *data_byte = string;
+      const tre_char_t *data_wide = string;
+      const void *tmp;
+
+      /* Look for the beginning of possibly matching text. */
+      ret = tre_match_fast(heur->start, string, len, type, nmatch,
+			   pmatch, eflags);
+      if (ret != REG_OK)
+	return ret;
+      st = pmatch[0].rm_so;
+      j = pmatch[0].rm_eo;
+      string = (type == STR_WIDE) ? (void *)&data_wide[st] :
+	(void *)&data_byte[st];
+
+      /* When having a fixed-length pattern there is only one heuristic. */
+      if (heur->end == NULL)
+	return tre_match(tnfa, string,
+			 heur->prefix ? (len - st) : (j - st),
+			 type, nmatch, pmatch, eflags, NULL, NULL);
+
+      tmp = (type == STR_WIDE) ? (void *)&data_wide[j] :
+	(void *)&data_byte[j];
+
+      /* Look for the end of possibly matching text. */
+      ret = tre_match_fast(heur->end, tmp, len - j, type, nmatch,
+			   pmatch, eflags);
+      if (ret != REG_OK)
+        return ret;
+
+      return tre_match(tnfa, string, pmatch[0].rm_eo + j - st,
+		       type, nmatch, pmatch, eflags, NULL, NULL);
+    }
 
   if (tnfa->num_tags > 0 && nmatch > 0)
     {
@@ -227,7 +267,7 @@ tre_match(const tre_tnfa_t *tnfa, const 
     if ((long long)pmatch[0].rm_eo - pmatch[0].rm_so < 0)		\
       return REG_NOMATCH;						\
     ret = tre_match(tnfa, &str[offset], slen, type, nmatch,		\
-		    pmatch, eflags, preg->shortcut);			\
+		    pmatch, eflags, preg->shortcut, preg->heur);	\
     for (unsigned i = 0; (i == 0) || (!(eflags & REG_NOSUB) &&		\
 	 (i < nmatch)); i++)						\
       {									\
@@ -248,7 +288,7 @@ tre_regnexec(const regex_t *preg, const 
     ADJUST_OFFSETS
   else
     return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags,
-		     preg->shortcut);
+		     preg->shortcut, preg->heur);
 }
 
 int
@@ -272,7 +312,7 @@ tre_regwnexec(const regex_t *preg, const
     ADJUST_OFFSETS
   else
     return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags,
-		     preg->shortcut);
+		     preg->shortcut, preg->heur);
 }
 
 int
@@ -290,7 +330,7 @@ tre_reguexec(const regex_t *preg, const 
 {
   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
   return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags,
-		   preg->shortcut);
+		   preg->shortcut, preg->heur);
 }
 
 
@@ -314,7 +354,7 @@ tre_match_approx(const tre_tnfa_t *tnfa,
   if (params.max_cost == 0 && !tnfa->have_approx
       && !(eflags & REG_APPROX_MATCHER))
     return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch,
-		     eflags, NULL);
+		     eflags, NULL, NULL);
 
   /* Back references are not supported by the approximate matcher. */
   if (tnfa->have_backrefs)

Modified: user/gabor/tre-integration/contrib/tre/lib/tre-compile.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-compile.c	Sun Aug 21 00:48:03 2011	(r225050)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-compile.c	Sun Aug 21 00:51:42 2011	(r225051)
@@ -22,6 +22,7 @@
 #include <string.h>
 
 #include "tre-fastmatch.h"
+#include "tre-heuristic.h"
 #include "tre-internal.h"
 #include "tre-mem.h"
 #include "tre-stack.h"
@@ -1867,6 +1868,7 @@ tre_compile(regex_t *preg, const tre_cha
   reg_errcode_t errcode;
   tre_mem_t mem;
   fastmatch_t *shortcut;
+  heur_t *heur;
 
   /* Parse context. */
   tre_parse_ctx_t parse_ctx;
@@ -2179,6 +2181,28 @@ tre_compile(regex_t *preg, const tre_cha
   xfree(offs);
 
   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
+
+  /*
+   * If we reach here, the regex is parsed and legal. Now we try to construct
+   * a heuristic to speed up matching.
+   */
+
+  heur = xmalloc(sizeof(heur_t));
+  if (!heur)
+    {
+      errcode = REG_ESPACE;
+      goto error_exit;
+    }
+
+  ret = tre_compile_heur(heur, regex, n, cflags);
+  if (ret != REG_OK)
+    {
+      xfree(heur);
+      preg->heur = NULL;
+    }
+  else
+    preg->heur = heur;
+
   return REG_OK;
 
  error_exit:

Added: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c	Sun Aug 21 00:51:42 2011	(r225051)
@@ -0,0 +1,271 @@
+/*-
+ * Copyright (C) 2011 Gabor Kovesdan <gabor@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif /* HAVE_CONFIG_H */
+#include <regex.h>
+#include <stdbool.h>
+#include <string.h>
+#ifdef TRE_WCHAR
+#include <wctype.h>
+#endif
+
+#include "tre-fastmatch.h"
+#include "tre-heuristic.h"
+#include "tre-internal.h"
+#include "xmalloc.h"
+
+/*
+ * Parses bracket expression seeking to the end of the enclosed text.
+ * The parameters are the opening (oe) and closing elements (ce).
+ * Can handle nested bracket expressions.
+ */
+#define PARSE_UNIT(oe, ce)						\
+  {									\
+    int level = 0;							\
+									\
+    while (i < len)							\
+      {									\
+	if (regex[i] == TRE_CHAR(oe))					\
+	  level++;							\
+	else if (regex[i] == TRE_CHAR(ce))				\
+	  level--;							\
+	if (level == 0)							\
+	  break;							\
+	i++;								\
+      }									\
+  }
+
+/*
+ * Finishes a segment (fixed-length text fragment).
+ */
+#define END_SEGMENT							\
+  do									\
+    {									\
+      st = i + 1;							\
+      escaped = false;							\
+      goto end_segment;							\
+    } while (0);
+
+/*
+ * Parses a regular expression and constructs a heuristic in heur_t and
+ * returns REG_OK if successful or the corresponding error code if a
+ * heuristic cannot be constructed.
+ */
+int
+tre_compile_heur(heur_t *h, const tre_char_t *regex, size_t len, int cflags)
+{
+  tre_char_t *heur;
+  int st = 0, pos = 0;
+  bool escaped = false;
+  int errcode, ret;
+
+  /* XXX: only basic regexes are supported. */
+  if (cflags & REG_EXTENDED)
+    return REG_BADPAT;
+
+  /* Temporary space, len will be enough. */
+  heur = xmalloc(len);
+  if (!heur)
+    return REG_ESPACE;
+
+  memset(h, 0, sizeof(*h));
+
+  while (true)
+    {
+
+      /*
+       * Process the pattern char-by-char.
+       *
+       * i: position in regex
+       * st: start offset of current segment (fixed-length fragment)
+       *     to be processed
+       * pos: current position (and length) in the temporary space where
+       *      we copy the current segment
+       */
+      for (int i = st; i < len; i++)
+        {
+	  switch (regex[i])
+	    {
+
+	      /* Bracketed expression is substituted with a dot. */
+	      case TRE_CHAR('['):
+		PARSE_UNIT('[', ']');
+		heur[pos++] = TRE_CHAR('.');
+		continue;
+
+	      /*
+	       * If a repetition marker, erases the repeting character
+	       * and terminates the segment.
+	       * Otherwise just terminates the segment (XXX).
+	       */
+	      case TRE_CHAR('{'):
+		PARSE_UNIT('{', '}');
+		if (escaped)
+		  pos--;
+		END_SEGMENT;
+		break;
+
+	      /*
+	       * Terminates the current segment whether a subexpression
+	       * marker or not. (XXX)
+	       */
+	      case TRE_CHAR('('):
+		PARSE_UNIT('(', ')');
+		END_SEGMENT;
+		break;
+
+	      /*
+	       * Sets escaped flag.
+	       * Escaped escape terminates current segment. (XXX)
+	       */
+	      case TRE_CHAR('\\'):
+		if (escaped)
+		  END_SEGMENT;
+		escaped = !escaped;
+		continue;
+
+	      /*
+	       * If not the first character, erases the last character
+	       * and terminates the segment.
+	       * Otherwise heuristic construction fails. (XXX)
+	       */
+	      case TRE_CHAR('*'):
+		if (i != 0)
+		  {
+		    pos--;
+		    END_SEGMENT;
+		  }
+		else
+		  goto badpat1;
+		break;
+
+	      /*
+	       * If a backreference (escaped digit), terminates segment.
+	       * Otherwise adds current character to the current segment
+	       * by copying it to the temporary space.
+	       */
+	      default:
+		if (escaped && tre_isdigit(regex[i]))
+		  END_SEGMENT;
+		heur[pos++] = regex[i];
+		continue;
+	    }
+	}
+
+      /* We are done with the pattern if we got here. */
+      st = len;
+ 
+end_segment:
+
+      /* If it is not initialized yet, then we just got the first segment. */
+      if (h->start == NULL)
+	{
+
+	  /*
+	   * An empty or a one-char prefix segment is useless,
+	   * better to just fail.
+	   */
+	  if (pos <= 1)
+	    {
+	      errcode = REG_BADPAT;
+	      goto badpat1;
+	    }
+
+	  h->start = xmalloc(sizeof(fastmatch_t));
+	  if (!h->start)
+	    {
+	      errcode = REG_ESPACE;
+	      goto space1;
+	    }
+
+	  ret = tre_compile_fast(h->start, heur, pos, 0);
+	  if (ret != REG_OK)
+	    {
+	      errcode = REG_BADPAT;
+	      goto badpat2;
+	    }
+	}
+
+      /*
+       * If true, this is the last segment. We do not care about the
+       * middle ones.
+       */
+      else if (st == len)
+	{
+
+	  /* If empty, we only have a prefix heuristic. */
+	  if (pos == 0)
+	    {
+	      h->prefix = true;
+	      errcode = REG_OK;
+	      goto ok;
+	    }
+
+	  h->end = xmalloc(sizeof(fastmatch_t));
+	  if (!h->end)
+	    {
+	      errcode = REG_ESPACE;
+	      goto space2;
+	    }
+	    
+	  ret = tre_compile_fast(h->end, heur, pos, 0);
+	  if (ret != REG_OK)
+	    {
+	      xfree(h->end);
+	      h->prefix = true;
+	    }
+	  errcode = REG_OK;
+	  goto ok;
+	}
+
+      /* Just drop middle segments by overwriting the temporary space. */
+      pos = 0;
+    }
+
+badpat2:
+space2:
+  if (h->start != NULL)
+    xfree(h->start);
+badpat1:
+space1:
+ok:
+  xfree(heur);
+  return errcode;
+}
+
+/*
+ * Frees a heuristic.
+ */
+void
+tre_free_heur(heur_t *h)
+{
+  if (h->start != NULL)
+    xfree(h->start);
+  if (h->end != NULL)
+    xfree(h->end);
+}

Added: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h	Sun Aug 21 00:51:42 2011	(r225051)
@@ -0,0 +1,21 @@
+#ifndef TRE_HEURISTIC_H
+#define TRE_HEURISTIC_H 1
+
+#include <fastmatch.h>
+#include <stdbool.h>
+
+#include "tre-fastmatch.h"
+#include "tre-internal.h"
+
+typedef struct {
+  fastmatch_t *start;
+  fastmatch_t *end;
+  bool prefix;
+} heur_t;
+
+
+extern int tre_compile_heur(heur_t *h, const tre_char_t *regex,
+			    size_t len, int cflags);
+extern void tre_free_heur(heur_t *h);
+
+#endif	/* TRE_HEURISTIC_H */

Modified: user/gabor/tre-integration/include/regex.h
==============================================================================
--- user/gabor/tre-integration/include/regex.h	Sun Aug 21 00:48:03 2011	(r225050)
+++ user/gabor/tre-integration/include/regex.h	Sun Aug 21 00:51:42 2011	(r225051)
@@ -67,6 +67,7 @@ typedef struct {
   size_t re_nsub;  /* Number of parenthesized subexpressions. */
   void *value;	   /* For internal use only. */
   void *shortcut;  /* For internal use only. */
+  void *heur;	   /* For internal use only. */
   const char *re_endp;
 } regex_t;
 

Modified: user/gabor/tre-integration/lib/libc/regex/Makefile.inc
==============================================================================
--- user/gabor/tre-integration/lib/libc/regex/Makefile.inc	Sun Aug 21 00:48:03 2011	(r225050)
+++ user/gabor/tre-integration/lib/libc/regex/Makefile.inc	Sun Aug 21 00:51:42 2011	(r225051)
@@ -6,7 +6,7 @@
 CFLAGS+=-DHAVE_CONFIG_H -DTRE_LIBC_BUILD -I${.CURDIR}/../../contrib/tre
 
 SRCS+=	fastmatch.c hashtable.c regcomp.c regerror.c regexec.c tre-ast.c \
-	tre-compile.c tre-fastmatch.c tre-match-approx.c \
+	tre-compile.c tre-fastmatch.c tre-heuristic.c tre-match-approx.c \
 	tre-match-backtrack.c tre-match-parallel.c tre-mem.c tre-parse.c \
 	tre-stack.c xmalloc.c
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201108210051.p7L0phFm075623>