From owner-freebsd-current Tue Sep 12 14:36:09 1995 Return-Path: current-owner Received: (from majordom@localhost) by freefall.freebsd.org (8.6.12/8.6.6) id OAA01405 for current-outgoing; Tue, 12 Sep 1995 14:36:09 -0700 Received: from mail.cs.tu-berlin.de (mail.cs.tu-berlin.de [130.149.17.13]) by freefall.freebsd.org (8.6.12/8.6.6) with ESMTP id OAA01346 for ; Tue, 12 Sep 1995 14:35:50 -0700 Received: from caramba.cs.tu-berlin.de (wosch@caramba.cs.tu-berlin.de [130.149.144.4]) by mail.cs.tu-berlin.de (8.6.12/8.6.12) with ESMTP id XAA06455 for ; Tue, 12 Sep 1995 23:24:59 +0200 From: Wolfram Schneider Received: (wosch@localhost) by caramba.cs.tu-berlin.de (8.6.12/8.6.9) id XAA08914; Tue, 12 Sep 1995 23:24:54 +0200 Date: Tue, 12 Sep 1995 23:24:54 +0200 Message-Id: <199509122124.XAA08914@caramba.cs.tu-berlin.de> To: current@freebsd.org Subject: /usr/libexec/locate.* MIME-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 8bit Sender: current-owner@freebsd.org Precedence: bulk o locate.bigram.c Bigram does not remove newline at end of filename. This break particulary the bigram algorithm and /var/db/locate.database grow up 15 %. It's really a bug!!! The bigram output is silly and need ~1/2 CPU time of database rebuilding. old: locate.bigram < $filelist | sort | uniq -c | sort -T /$TMPDIR -nr ^^^^^^^^^^^^^^ this can easy made bigram new: locate.bigram < $filelist | sort -T /$TMPDIR -nr o locate.code.c Use a lookup array instead a function. 3 x faster (GNU-code is now 6 x slower) o local.updatedb.csh # search locally or everything # find ${SRCHPATHS} -print | \ find ${SRCHPATHS} \! -fstype local -prune -or -print | \ tr '/' '\001' | \ ^^^^^^^^^^^^^^^^^^ Superfluously. Nobody need it. (sort -T $TMPDIR -f; echo $status > $errs) | tr '\001' '/' > $filelist ^^ wrong, made database 0.5% bigger ^^^^^^^^^^^^^^^^^^^^^^^^ Superfluously, see above. It double the disk space for filenames. The filenames are in a temp sort file and at the same time in $filelist. sort -T $TMPDIR -o $filelist avoid this by renaming temp sort file to $filelist. My database is 115MB big ... $LIBDIR/locate.bigram < $filelist | \ (sort -T /$TMPDIR; echo $status >> $errs) | uniq -c | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ see locate.bigram.c --- /usr/src/usr.bin/locate/bigram/locate.bigram.c Fri May 27 14:32:02 1994 +++ locate.bigram.c Tue Sep 12 14:24:56 1995 @@ -53,32 +53,51 @@ #include #include /* for MAXPATHLEN */ +#include /* memchr */ -char buf1[MAXPATHLEN] = " "; -char buf2[MAXPATHLEN]; +u_char buf1[MAXPATHLEN] = " "; +u_char buf2[MAXPATHLEN]; +unsigned int bigram[UCHAR_MAX][UCHAR_MAX]; -main ( ) + +void main ( ) { - register char *cp; - register char *oldpath = buf1, *path = buf2; + register u_char *cp; + register u_char *oldpath = buf1, *path = buf2; + register int i, j; + + /* init bigram buffer */ + for (i = 0; i < UCHAR_MAX; i++) + for (j = 0; j < UCHAR_MAX; j++) + bigram[i][j] = 0; while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) { + /* chop newline */ + if ((cp = memchr(path, '\n', sizeof(buf2))) != NULL) + *cp = NULL; + /* skip longest common prefix */ - for ( cp = path; *cp == *oldpath; cp++, oldpath++ ) - if ( *oldpath == NULL ) - break; + for (cp = path; (*cp == *oldpath) != NULL; cp++, oldpath++); + /* * output post-residue bigrams only */ + while ( *cp != NULL && *(cp + 1) != NULL ) { - putchar ( *cp++ ); - putchar ( *cp++ ); - putchar ( '\n' ); + bigram[*cp][*(cp+1)]++; + cp += 2; } + if ( path == buf1 ) /* swap pointers */ path = buf2, oldpath = buf1; else path = buf1, oldpath = buf2; } + + /* output */ + for (i = 0; i < UCHAR_MAX; i++) + for (j = 0; j < UCHAR_MAX; j++) + if (bigram[i][j] != 0) + fprintf(stdout, "%4d %c%c\n", bigram[i][j], i, j); } --- 1.1 1995/09/12 16:40:45 +++ locate.code.c 1995/09/12 18:07:57 @@ -93,8 +93,20 @@ char buf2[MAXPATHLEN]; char bigrams[BGBUFSIZE + 1] = { 0 }; +#if 1 +#define BGINDEX(x) (big[(int)*x][(int)*(x+1)]) +typedef u_char bg_t; +bg_t big[UCHAR_MAX][UCHAR_MAX]; + +#else +#define BGINDEX(x) bgindex(x) +typedef int bg_t; +#endif + int bgindex __P((char *)); void usage __P((void)); +extern int optind; +extern int optopt; int main(argc, argv) @@ -104,6 +116,7 @@ register char *cp, *oldpath, *path; int ch, code, count, diffcount, oldcount; FILE *fp; + register int i, j; while ((ch = getopt(argc, argv, "")) != EOF) switch(ch) { @@ -126,14 +139,22 @@ err(1, "stdout"); (void)fclose(fp); + /* init lookup table */ + for (i = 0; i < UCHAR_MAX; i++) + for (j = 0; j < UCHAR_MAX; j++) + big[i][j] = (bg_t)-1; + + for (cp = bigrams, i = 0; *cp != NULL; i += 2, cp += 2) + big[(int)*cp][(int)*(cp + 1)] = (bg_t)i; + oldpath = buf1; path = buf2; oldcount = 0; while (fgets(path, sizeof(buf2), stdin) != NULL) { - /* Truncate newline. */ - cp = path + strlen(path) - 1; - if (cp > path && *cp == '\n') - *cp = '\0'; + + /* chop newline */ + if ((cp = memchr(path, '\n', sizeof(buf2))) != NULL) + *cp = NULL; /* Squelch characters that would botch the decoding. */ for (cp = path; *cp != NULL; cp++) { @@ -144,9 +165,9 @@ } /* Skip longest common prefix. */ - for (cp = path; *cp == *oldpath; cp++, oldpath++) - if (*oldpath == NULL) - break; + for (cp = path; (*cp == *oldpath) != NULL; cp++, oldpath++) + ; + count = cp - path; diffcount = count - oldcount + OFFSET; oldcount = count; @@ -164,7 +185,7 @@ err(1, "stdout"); break; } - if ((code = bgindex(cp)) < 0) { + if ((code = BGINDEX(cp)) == (bg_t)-1) { if (putchar(*cp++) == EOF || putchar(*cp++) == EOF) err(1, "stdout"); --- /usr/libexec/locate.updatedb.old Sun Jan 1 05:36:58 1995 +++ /usr/libexec/locate.updatedb Tue Sep 12 14:33:32 1995 @@ -58,12 +58,10 @@ # search locally or everything # find ${SRCHPATHS} -print | \ find ${SRCHPATHS} \! -fstype local -prune -or -print | \ - tr '/' '\001' | \ - (sort -T $TMPDIR -f; echo $status > $errs) | tr '\001' '/' > $filelist + sort -T $TMPDIR -o $filelist; echo $status > $errs -$LIBDIR/locate.bigram < $filelist | \ - (sort -T /$TMPDIR; echo $status >> $errs) | \ - uniq -c | sort -T /$TMPDIR -nr | \ +$LIBDIR/locate.bigram.new < $filelist | \ + sort -T $TMPDIR -nr | \ awk '{ if (NR <= 128) print $2 }' | tr -d '\012' > $bigrams # code the file list