From owner-svn-src-all@FreeBSD.ORG Wed Aug 18 17:40:10 2010 Return-Path: Delivered-To: svn-src-all@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 77F991065706; Wed, 18 Aug 2010 17:40:10 +0000 (UTC) (envelope-from gabor@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id 5C9E68FC0A; Wed, 18 Aug 2010 17:40:10 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.3/8.14.3) with ESMTP id o7IHeAlw075989; Wed, 18 Aug 2010 17:40:10 GMT (envelope-from gabor@svn.freebsd.org) Received: (from gabor@localhost) by svn.freebsd.org (8.14.3/8.14.3/Submit) id o7IHeA4c075984; Wed, 18 Aug 2010 17:40:10 GMT (envelope-from gabor@svn.freebsd.org) Message-Id: <201008181740.o7IHeA4c075984@svn.freebsd.org> From: Gabor Kovesdan Date: Wed, 18 Aug 2010 17:40:10 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r211463 - head/usr.bin/grep X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 18 Aug 2010 17:40:10 -0000 Author: gabor Date: Wed Aug 18 17:40:10 2010 New Revision: 211463 URL: http://svn.freebsd.org/changeset/base/211463 Log: - Refactor file reading code to use pure syscalls and an internal buffer instead of stdio. This gives BSD grep a very big performance boost, its speed is now almost comparable to GNU grep. Submitted by: Dimitry Andric Approved by: delphij (mentor) Modified: head/usr.bin/grep/fastgrep.c head/usr.bin/grep/file.c head/usr.bin/grep/grep.h head/usr.bin/grep/util.c Modified: head/usr.bin/grep/fastgrep.c ============================================================================== --- head/usr.bin/grep/fastgrep.c Wed Aug 18 17:39:47 2010 (r211462) +++ head/usr.bin/grep/fastgrep.c Wed Aug 18 17:40:10 2010 (r211463) @@ -198,7 +198,7 @@ fastcomp(fastgrep_t *fg, const char *pat } int -grep_search(fastgrep_t *fg, unsigned char *data, size_t len, regmatch_t *pmatch) +grep_search(fastgrep_t *fg, const unsigned char *data, size_t len, regmatch_t *pmatch) { unsigned int j; int ret = REG_NOMATCH; Modified: head/usr.bin/grep/file.c ============================================================================== --- head/usr.bin/grep/file.c Wed Aug 18 17:39:47 2010 (r211462) +++ head/usr.bin/grep/file.c Wed Aug 18 17:40:10 2010 (r211463) @@ -2,7 +2,8 @@ /*- * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav - * Copyright (C) 2008-2009 Gabor Kovesdan + * Copyright (C) 2008-2010 Gabor Kovesdan + * Copyright (C) 2010 Dimitry Andric * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -37,7 +38,8 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include +#include +#include #include #include #include @@ -47,222 +49,204 @@ __FBSDID("$FreeBSD$"); #include "grep.h" -static char fname[MAXPATHLEN]; /* file name */ +#define MAXBUFSIZ (32 * 1024) +#define LNBUFBUMP 80 -#define MAXBUFSIZ (16 * 1024) -#define PREREAD_M 0.2 +static gzFile gzbufdesc; +static BZFILE* bzbufdesc; -/* Some global variables for the buffering and reading. */ -static char *lnbuf; -static size_t lnbuflen; -static unsigned char *binbuf; -static int binbufsiz; -unsigned char *binbufptr; -static int bzerr; +static unsigned char buffer[MAXBUFSIZ]; +static unsigned char *bufpos; +static size_t bufrem; -#define iswbinary(ch) (!iswspace((ch)) && iswcntrl((ch)) && \ - (ch != L'\b') && (ch != L'\0')) +static unsigned char *lnbuf; +static size_t lnbuflen; -/* - * Returns a single character according to the file type. - * Returns -1 on failure. - */ static inline int -grep_fgetc(struct file *f) +grep_refill(struct file *f) { - unsigned char c; + ssize_t nr; + int bzerr; - switch (filebehave) { - case FILE_STDIO: - return (getc_unlocked(f->f)); - case FILE_GZIP: - return (gzgetc(f->gzf)); - case FILE_BZIP: - BZ2_bzRead(&bzerr, f->bzf, &c, 1); - if (bzerr == BZ_STREAM_END) - return (-1); - else if (bzerr != BZ_SEQUENCE_ERROR && bzerr != BZ_OK) - errx(2, "%s", getstr(2)); - return (c); - } - return (-1); + bufpos = buffer; + bufrem = 0; + + if (filebehave == FILE_GZIP) + nr = gzread(gzbufdesc, buffer, MAXBUFSIZ); + else if (filebehave == FILE_BZIP && bzbufdesc != NULL) { + nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ); + switch (bzerr) { + case BZ_OK: + case BZ_STREAM_END: + /* No problem, nr will be okay */ + break; + case BZ_DATA_ERROR_MAGIC: + /* + * As opposed to gzread(), which simply returns the + * plain file data, if it is not in the correct + * compressed format, BZ2_bzRead() instead aborts. + * + * So, just restart at the beginning of the file again, + * and use plain reads from now on. + */ + BZ2_bzReadClose(&bzerr, bzbufdesc); + bzbufdesc = NULL; + if (lseek(f->fd, 0, SEEK_SET) == -1) + return (-1); + nr = read(f->fd, buffer, MAXBUFSIZ); + break; + default: + /* Make sure we exit with an error */ + nr = -1; + } + } else + nr = read(f->fd, buffer, MAXBUFSIZ); + + if (nr < 0) + return (-1); + + bufrem = nr; + return (0); } -/* - * Returns true if the file position is a EOF, returns false - * otherwise. - */ static inline int -grep_feof(struct file *f) +grep_lnbufgrow(size_t newlen) { - switch (filebehave) { - case FILE_STDIO: - return (feof_unlocked(f->f)); - case FILE_GZIP: - return (gzeof(f->gzf)); - case FILE_BZIP: - return (bzerr == BZ_STREAM_END); + if (lnbuflen < newlen) { + lnbuf = grep_realloc(lnbuf, newlen); + lnbuflen = newlen; } - return (1); + + return (0); } -/* - * At the first call, fills in an internal buffer and checks if the given - * file is a binary file and sets the binary flag accordingly. Then returns - * a single line and sets len to the length of the returned line. - * At any other call returns a single line either from the internal buffer - * or from the file if the buffer is exhausted and sets len to the length - * of the line. - */ char * -grep_fgetln(struct file *f, size_t *len) +grep_fgetln(struct file *f, size_t *lenp) { - struct stat st; - size_t bufsiz, i = 0; - int ch = 0; - - /* Fill in the buffer if it is empty. */ - if (binbufptr == NULL) { - - /* Only pre-read to the buffer if we need the binary check. */ - if (binbehave != BINFILE_TEXT) { - if (f->stdin) - st.st_size = MAXBUFSIZ; - else if (stat(fname, &st) != 0) - err(2, NULL); - /* no need to allocate buffer. */ - if (st.st_size == 0) - return (NULL); - - bufsiz = (MAXBUFSIZ > (st.st_size * PREREAD_M)) ? - (st.st_size / 2) : MAXBUFSIZ; - - binbuf = grep_malloc(sizeof(char) * bufsiz); - - while (i < bufsiz) { - ch = grep_fgetc(f); - if (ch == EOF) - break; - binbuf[i++] = ch; - if ((ch == '\n') && lbflag) - break; - } - - f->binary = memchr(binbuf, (filebehave != FILE_GZIP) ? - '\0' : '\200', i - 1) != NULL; - } - binbufsiz = i; - binbufptr = binbuf; - } - - /* Read a line whether from the buffer or from the file itself. */ - for (i = 0; !(grep_feof(f) && - (binbufptr == &binbuf[binbufsiz])); i++) { - if (binbufptr == &binbuf[binbufsiz]) { - ch = grep_fgetc(f); - } else { - ch = binbufptr[0]; - binbufptr++; - } - if (i >= lnbuflen) { - lnbuflen *= 2; - lnbuf = grep_realloc(lnbuf, ++lnbuflen); - } - if ((ch == '\n') || (ch == EOF)) { - lnbuf[i] = '\0'; + unsigned char *p; + char *ret; + size_t len; + size_t off; + ptrdiff_t diff; + + /* Fill the buffer, if necessary */ + if (bufrem == 0 && grep_refill(f) != 0) + goto error; + + if (bufrem == 0) { + /* Return zero length to indicate EOF */ + *lenp = 0; + return (bufpos); + } + + /* Look for a newline in the remaining part of the buffer */ + if ((p = memchr(bufpos, '\n', bufrem)) != NULL) { + ++p; /* advance over newline */ + ret = bufpos; + len = p - bufpos; + bufrem -= len; + bufpos = p; + *lenp = len; + return (ret); + } + + /* We have to copy the current buffered data to the line buffer */ + for (len = bufrem, off = 0; ; len += bufrem) { + /* Make sure there is room for more data */ + if (grep_lnbufgrow(len + LNBUFBUMP)) + goto error; + memcpy(lnbuf + off, bufpos, len - off); + off = len; + if (grep_refill(f) != 0) + goto error; + if (bufrem == 0) + /* EOF: return partial line */ break; - } else - lnbuf[i] = ch; + if ((p = memchr(bufpos, '\n', bufrem)) == NULL) + continue; + /* got it: finish up the line (like code above) */ + ++p; + diff = p - bufpos; + len += diff; + if (grep_lnbufgrow(len)) + goto error; + memcpy(lnbuf + off, bufpos, diff); + bufrem -= diff; + bufpos = p; + break; } - if (grep_feof(f) && (i == 0) && (ch != '\n')) - return (NULL); - *len = i; + *lenp = len; return (lnbuf); + +error: + *lenp = 0; + return (NULL); } -/* - * Opens the standard input for processing. - */ -struct file * -grep_stdin_open(void) +static inline struct file * +grep_file_init(struct file *f) { - struct file *f; - /* Processing stdin implies --line-buffered for tail -f to work. */ - lbflag = true; + if (filebehave == FILE_GZIP && + (gzbufdesc = gzdopen(f->fd, "r")) == NULL) + goto error; - snprintf(fname, sizeof fname, "%s", getstr(1)); + if (filebehave == FILE_BZIP && + (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL) + goto error; - f = grep_malloc(sizeof *f); + /* Fill read buffer, also catches errors early */ + if (grep_refill(f) != 0) + goto error; - binbuf = NULL; - if ((f->f = fdopen(STDIN_FILENO, "r")) != NULL) { - flockfile(f->f); - f->stdin = true; - return (f); - } + /* Check for binary stuff, if necessary */ + if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL) + f->binary = true; + return (f); +error: + close(f->fd); free(f); return (NULL); } /* - * Opens a normal, a gzipped or a bzip2 compressed file for processing. + * Opens a file for processing. */ struct file * grep_open(const char *path) { struct file *f; - snprintf(fname, sizeof fname, "%s", path); - f = grep_malloc(sizeof *f); - - binbuf = NULL; - f->stdin = false; - switch (filebehave) { - case FILE_STDIO: - if ((f->f = fopen(path, "r")) != NULL) { - flockfile(f->f); - return (f); - } - break; - case FILE_GZIP: - if ((f->gzf = gzopen(fname, "r")) != NULL) - return (f); - break; - case FILE_BZIP: - if ((f->bzf = BZ2_bzopen(fname, "r")) != NULL) - return (f); - break; + memset(f, 0, sizeof *f); + if (path == NULL) { + /* Processing stdin implies --line-buffered. */ + lbflag = true; + f->fd = STDIN_FILENO; + } else if ((f->fd = open(path, O_RDONLY)) == -1) { + free(f); + return (NULL); } - free(f); - return (NULL); + return (grep_file_init(f)); } /* - * Closes a normal, a gzipped or a bzip2 compressed file. + * Closes a file. */ void grep_close(struct file *f) { - switch (filebehave) { - case FILE_STDIO: - funlockfile(f->f); - fclose(f->f); - break; - case FILE_GZIP: - gzclose(f->gzf); - break; - case FILE_BZIP: - BZ2_bzclose(f->bzf); - break; - } + close(f->fd); - /* Reset read buffer for the file we are closing */ - binbufptr = NULL; - free(binbuf); + /* Reset read buffer and line buffer */ + bufpos = buffer; + bufrem = 0; + + free(lnbuf); + lnbuf = NULL; + lnbuflen = 0; } Modified: head/usr.bin/grep/grep.h ============================================================================== --- head/usr.bin/grep/grep.h Wed Aug 18 17:39:47 2010 (r211462) +++ head/usr.bin/grep/grep.h Wed Aug 18 17:40:10 2010 (r211463) @@ -77,12 +77,8 @@ extern const char *errstr[]; #define MAX_LINE_MATCHES 32 struct file { - struct mmfile *mmf; - BZFILE *bzf; - FILE *f; - gzFile *gzf; + int fd; bool binary; - bool stdin; }; struct str { @@ -150,11 +146,10 @@ void clearqueue(void); /* file.c */ void grep_close(struct file *f); -struct file *grep_stdin_open(void); struct file *grep_open(const char *path); char *grep_fgetln(struct file *f, size_t *len); /* fastgrep.c */ int fastcomp(fastgrep_t *, const char *); void fgrepcomp(fastgrep_t *, const char *); -int grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t *); +int grep_search(fastgrep_t *, const unsigned char *, size_t, regmatch_t *); Modified: head/usr.bin/grep/util.c ============================================================================== --- head/usr.bin/grep/util.c Wed Aug 18 17:39:47 2010 (r211462) +++ head/usr.bin/grep/util.c Wed Aug 18 17:40:10 2010 (r211463) @@ -184,7 +184,7 @@ procfile(const char *fn) if (strcmp(fn, "-") == 0) { fn = label != NULL ? label : getstr(1); - f = grep_stdin_open(); + f = grep_open(NULL); } else { if (!stat(fn, &sb)) { /* Check if we need to process the file */ @@ -215,7 +215,7 @@ procfile(const char *fn) for (c = 0; c == 0 || !(lflag || qflag); ) { ln.off += ln.len + 1; - if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) { + if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL || ln.len == 0) { if (ln.line_no == 0 && matchall) exit(0); else