From owner-svn-src-user@FreeBSD.ORG Sun Aug 21 00:51:43 2011 Return-Path: Delivered-To: svn-src-user@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 360C9106566C; Sun, 21 Aug 2011 00:51:43 +0000 (UTC) (envelope-from gabor@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id 244788FC08; Sun, 21 Aug 2011 00:51:43 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.4/8.14.4) with ESMTP id p7L0phJ5075631; Sun, 21 Aug 2011 00:51:43 GMT (envelope-from gabor@svn.freebsd.org) Received: (from gabor@localhost) by svn.freebsd.org (8.14.4/8.14.4/Submit) id p7L0phFm075623; Sun, 21 Aug 2011 00:51:43 GMT (envelope-from gabor@svn.freebsd.org) Message-Id: <201108210051.p7L0phFm075623@svn.freebsd.org> From: Gabor Kovesdan Date: Sun, 21 Aug 2011 00:51:43 +0000 (UTC) To: src-committers@freebsd.org, svn-src-user@freebsd.org X-SVN-Group: user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r225051 - in user/gabor/tre-integration: contrib/tre/lib include lib/libc/regex X-BeenThere: svn-src-user@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: "SVN commit messages for the experimental " user" src tree" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sun, 21 Aug 2011 00:51:43 -0000 Author: gabor Date: Sun Aug 21 00:51:42 2011 New Revision: 225051 URL: http://svn.freebsd.org/changeset/base/225051 Log: - Add the heuristic matching code. Currently * it only supports BRE * the heuristic construction is limited because the fast matching algorithm is also limited and only allows simple heuristic patterns * it has a bug and fills in incorrectly the matching offsets - Even with the above limitations, some searches only require half of the time with BSD grep. Added: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c (contents, props changed) user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h (contents, props changed) Modified: user/gabor/tre-integration/contrib/tre/lib/regcomp.c user/gabor/tre-integration/contrib/tre/lib/regexec.c user/gabor/tre-integration/contrib/tre/lib/tre-compile.c user/gabor/tre-integration/include/regex.h user/gabor/tre-integration/lib/libc/regex/Makefile.inc Modified: user/gabor/tre-integration/contrib/tre/lib/regcomp.c ============================================================================== --- user/gabor/tre-integration/contrib/tre/lib/regcomp.c Sun Aug 21 00:48:03 2011 (r225050) +++ user/gabor/tre-integration/contrib/tre/lib/regcomp.c Sun Aug 21 00:51:42 2011 (r225051) @@ -16,6 +16,7 @@ #include #include "tre-fastmatch.h" +#include "tre-heuristic.h" #include "tre-internal.h" #include "xmalloc.h" @@ -143,7 +144,15 @@ void tre_regfree(regex_t *preg) { if (preg->shortcut != NULL) - tre_free_fast(preg->shortcut); + { + tre_free_fast(preg->shortcut); + xfree(preg->shortcut); + } + if (preg->heur != NULL) + { + tre_free_heur(preg->heur); + xfree(preg->heur); + } tre_free(preg); } Modified: user/gabor/tre-integration/contrib/tre/lib/regexec.c ============================================================================== --- user/gabor/tre-integration/contrib/tre/lib/regexec.c Sun Aug 21 00:48:03 2011 (r225050) +++ user/gabor/tre-integration/contrib/tre/lib/regexec.c Sun Aug 21 00:51:42 2011 (r225051) @@ -46,6 +46,7 @@ char *alloca (); #include #include "tre-fastmatch.h" +#include "tre-heuristic.h" #include "tre-internal.h" #include "xmalloc.h" @@ -151,14 +152,53 @@ tre_have_approx(const regex_t *preg) static int tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, tre_str_type_t type, size_t nmatch, regmatch_t pmatch[], - int eflags, fastmatch_t *shortcut) + int eflags, fastmatch_t *shortcut, heur_t *heur) { reg_errcode_t status; int *tags = NULL, eo; - /* Check if we can cheat with a faster algorithm */ + /* Check if we can cheat with a faster algorithm. */ if (shortcut != NULL) - return tre_match_fast(shortcut, string, len, type, nmatch, pmatch, eflags); + return tre_match_fast(shortcut, string, len, type, nmatch, + pmatch, eflags); + + /* Check if we have a heuristic to speed up the search. */ + if (heur != NULL) + { + int ret; + size_t st = 0, j; + const char *data_byte = string; + const tre_char_t *data_wide = string; + const void *tmp; + + /* Look for the beginning of possibly matching text. */ + ret = tre_match_fast(heur->start, string, len, type, nmatch, + pmatch, eflags); + if (ret != REG_OK) + return ret; + st = pmatch[0].rm_so; + j = pmatch[0].rm_eo; + string = (type == STR_WIDE) ? (void *)&data_wide[st] : + (void *)&data_byte[st]; + + /* When having a fixed-length pattern there is only one heuristic. */ + if (heur->end == NULL) + return tre_match(tnfa, string, + heur->prefix ? (len - st) : (j - st), + type, nmatch, pmatch, eflags, NULL, NULL); + + tmp = (type == STR_WIDE) ? (void *)&data_wide[j] : + (void *)&data_byte[j]; + + /* Look for the end of possibly matching text. */ + ret = tre_match_fast(heur->end, tmp, len - j, type, nmatch, + pmatch, eflags); + if (ret != REG_OK) + return ret; + + return tre_match(tnfa, string, pmatch[0].rm_eo + j - st, + type, nmatch, pmatch, eflags, NULL, NULL); + } if (tnfa->num_tags > 0 && nmatch > 0) { @@ -227,7 +267,7 @@ tre_match(const tre_tnfa_t *tnfa, const if ((long long)pmatch[0].rm_eo - pmatch[0].rm_so < 0) \ return REG_NOMATCH; \ ret = tre_match(tnfa, &str[offset], slen, type, nmatch, \ - pmatch, eflags, preg->shortcut); \ + pmatch, eflags, preg->shortcut, preg->heur); \ for (unsigned i = 0; (i == 0) || (!(eflags & REG_NOSUB) && \ (i < nmatch)); i++) \ { \ @@ -248,7 +288,7 @@ tre_regnexec(const regex_t *preg, const ADJUST_OFFSETS else return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags, - preg->shortcut); + preg->shortcut, preg->heur); } int @@ -272,7 +312,7 @@ tre_regwnexec(const regex_t *preg, const ADJUST_OFFSETS else return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags, - preg->shortcut); + preg->shortcut, preg->heur); } int @@ -290,7 +330,7 @@ tre_reguexec(const regex_t *preg, const { tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags, - preg->shortcut); + preg->shortcut, preg->heur); } @@ -314,7 +354,7 @@ tre_match_approx(const tre_tnfa_t *tnfa, if (params.max_cost == 0 && !tnfa->have_approx && !(eflags & REG_APPROX_MATCHER)) return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch, - eflags, NULL); + eflags, NULL, NULL); /* Back references are not supported by the approximate matcher. */ if (tnfa->have_backrefs) Modified: user/gabor/tre-integration/contrib/tre/lib/tre-compile.c ============================================================================== --- user/gabor/tre-integration/contrib/tre/lib/tre-compile.c Sun Aug 21 00:48:03 2011 (r225050) +++ user/gabor/tre-integration/contrib/tre/lib/tre-compile.c Sun Aug 21 00:51:42 2011 (r225051) @@ -22,6 +22,7 @@ #include #include "tre-fastmatch.h" +#include "tre-heuristic.h" #include "tre-internal.h" #include "tre-mem.h" #include "tre-stack.h" @@ -1867,6 +1868,7 @@ tre_compile(regex_t *preg, const tre_cha reg_errcode_t errcode; tre_mem_t mem; fastmatch_t *shortcut; + heur_t *heur; /* Parse context. */ tre_parse_ctx_t parse_ctx; @@ -2179,6 +2181,28 @@ tre_compile(regex_t *preg, const tre_cha xfree(offs); preg->TRE_REGEX_T_FIELD = (void *)tnfa; + + /* + * If we reach here, the regex is parsed and legal. Now we try to construct + * a heuristic to speed up matching. + */ + + heur = xmalloc(sizeof(heur_t)); + if (!heur) + { + errcode = REG_ESPACE; + goto error_exit; + } + + ret = tre_compile_heur(heur, regex, n, cflags); + if (ret != REG_OK) + { + xfree(heur); + preg->heur = NULL; + } + else + preg->heur = heur; + return REG_OK; error_exit: Added: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c Sun Aug 21 00:51:42 2011 (r225051) @@ -0,0 +1,271 @@ +/*- + * Copyright (C) 2011 Gabor Kovesdan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ +#include +#include +#include +#ifdef TRE_WCHAR +#include +#endif + +#include "tre-fastmatch.h" +#include "tre-heuristic.h" +#include "tre-internal.h" +#include "xmalloc.h" + +/* + * Parses bracket expression seeking to the end of the enclosed text. + * The parameters are the opening (oe) and closing elements (ce). + * Can handle nested bracket expressions. + */ +#define PARSE_UNIT(oe, ce) \ + { \ + int level = 0; \ + \ + while (i < len) \ + { \ + if (regex[i] == TRE_CHAR(oe)) \ + level++; \ + else if (regex[i] == TRE_CHAR(ce)) \ + level--; \ + if (level == 0) \ + break; \ + i++; \ + } \ + } + +/* + * Finishes a segment (fixed-length text fragment). + */ +#define END_SEGMENT \ + do \ + { \ + st = i + 1; \ + escaped = false; \ + goto end_segment; \ + } while (0); + +/* + * Parses a regular expression and constructs a heuristic in heur_t and + * returns REG_OK if successful or the corresponding error code if a + * heuristic cannot be constructed. + */ +int +tre_compile_heur(heur_t *h, const tre_char_t *regex, size_t len, int cflags) +{ + tre_char_t *heur; + int st = 0, pos = 0; + bool escaped = false; + int errcode, ret; + + /* XXX: only basic regexes are supported. */ + if (cflags & REG_EXTENDED) + return REG_BADPAT; + + /* Temporary space, len will be enough. */ + heur = xmalloc(len); + if (!heur) + return REG_ESPACE; + + memset(h, 0, sizeof(*h)); + + while (true) + { + + /* + * Process the pattern char-by-char. + * + * i: position in regex + * st: start offset of current segment (fixed-length fragment) + * to be processed + * pos: current position (and length) in the temporary space where + * we copy the current segment + */ + for (int i = st; i < len; i++) + { + switch (regex[i]) + { + + /* Bracketed expression is substituted with a dot. */ + case TRE_CHAR('['): + PARSE_UNIT('[', ']'); + heur[pos++] = TRE_CHAR('.'); + continue; + + /* + * If a repetition marker, erases the repeting character + * and terminates the segment. + * Otherwise just terminates the segment (XXX). + */ + case TRE_CHAR('{'): + PARSE_UNIT('{', '}'); + if (escaped) + pos--; + END_SEGMENT; + break; + + /* + * Terminates the current segment whether a subexpression + * marker or not. (XXX) + */ + case TRE_CHAR('('): + PARSE_UNIT('(', ')'); + END_SEGMENT; + break; + + /* + * Sets escaped flag. + * Escaped escape terminates current segment. (XXX) + */ + case TRE_CHAR('\\'): + if (escaped) + END_SEGMENT; + escaped = !escaped; + continue; + + /* + * If not the first character, erases the last character + * and terminates the segment. + * Otherwise heuristic construction fails. (XXX) + */ + case TRE_CHAR('*'): + if (i != 0) + { + pos--; + END_SEGMENT; + } + else + goto badpat1; + break; + + /* + * If a backreference (escaped digit), terminates segment. + * Otherwise adds current character to the current segment + * by copying it to the temporary space. + */ + default: + if (escaped && tre_isdigit(regex[i])) + END_SEGMENT; + heur[pos++] = regex[i]; + continue; + } + } + + /* We are done with the pattern if we got here. */ + st = len; + +end_segment: + + /* If it is not initialized yet, then we just got the first segment. */ + if (h->start == NULL) + { + + /* + * An empty or a one-char prefix segment is useless, + * better to just fail. + */ + if (pos <= 1) + { + errcode = REG_BADPAT; + goto badpat1; + } + + h->start = xmalloc(sizeof(fastmatch_t)); + if (!h->start) + { + errcode = REG_ESPACE; + goto space1; + } + + ret = tre_compile_fast(h->start, heur, pos, 0); + if (ret != REG_OK) + { + errcode = REG_BADPAT; + goto badpat2; + } + } + + /* + * If true, this is the last segment. We do not care about the + * middle ones. + */ + else if (st == len) + { + + /* If empty, we only have a prefix heuristic. */ + if (pos == 0) + { + h->prefix = true; + errcode = REG_OK; + goto ok; + } + + h->end = xmalloc(sizeof(fastmatch_t)); + if (!h->end) + { + errcode = REG_ESPACE; + goto space2; + } + + ret = tre_compile_fast(h->end, heur, pos, 0); + if (ret != REG_OK) + { + xfree(h->end); + h->prefix = true; + } + errcode = REG_OK; + goto ok; + } + + /* Just drop middle segments by overwriting the temporary space. */ + pos = 0; + } + +badpat2: +space2: + if (h->start != NULL) + xfree(h->start); +badpat1: +space1: +ok: + xfree(heur); + return errcode; +} + +/* + * Frees a heuristic. + */ +void +tre_free_heur(heur_t *h) +{ + if (h->start != NULL) + xfree(h->start); + if (h->end != NULL) + xfree(h->end); +} Added: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h Sun Aug 21 00:51:42 2011 (r225051) @@ -0,0 +1,21 @@ +#ifndef TRE_HEURISTIC_H +#define TRE_HEURISTIC_H 1 + +#include +#include + +#include "tre-fastmatch.h" +#include "tre-internal.h" + +typedef struct { + fastmatch_t *start; + fastmatch_t *end; + bool prefix; +} heur_t; + + +extern int tre_compile_heur(heur_t *h, const tre_char_t *regex, + size_t len, int cflags); +extern void tre_free_heur(heur_t *h); + +#endif /* TRE_HEURISTIC_H */ Modified: user/gabor/tre-integration/include/regex.h ============================================================================== --- user/gabor/tre-integration/include/regex.h Sun Aug 21 00:48:03 2011 (r225050) +++ user/gabor/tre-integration/include/regex.h Sun Aug 21 00:51:42 2011 (r225051) @@ -67,6 +67,7 @@ typedef struct { size_t re_nsub; /* Number of parenthesized subexpressions. */ void *value; /* For internal use only. */ void *shortcut; /* For internal use only. */ + void *heur; /* For internal use only. */ const char *re_endp; } regex_t; Modified: user/gabor/tre-integration/lib/libc/regex/Makefile.inc ============================================================================== --- user/gabor/tre-integration/lib/libc/regex/Makefile.inc Sun Aug 21 00:48:03 2011 (r225050) +++ user/gabor/tre-integration/lib/libc/regex/Makefile.inc Sun Aug 21 00:51:42 2011 (r225051) @@ -6,7 +6,7 @@ CFLAGS+=-DHAVE_CONFIG_H -DTRE_LIBC_BUILD -I${.CURDIR}/../../contrib/tre SRCS+= fastmatch.c hashtable.c regcomp.c regerror.c regexec.c tre-ast.c \ - tre-compile.c tre-fastmatch.c tre-match-approx.c \ + tre-compile.c tre-fastmatch.c tre-heuristic.c tre-match-approx.c \ tre-match-backtrack.c tre-match-parallel.c tre-mem.c tre-parse.c \ tre-stack.c xmalloc.c