Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 12 Sep 1995 23:24:54 +0200
From:      Wolfram Schneider <wosch@cs.tu-berlin.de>
To:        current@freebsd.org
Subject:   /usr/libexec/locate.*
Message-ID:  <199509122124.XAA08914@caramba.cs.tu-berlin.de>

next in thread | raw e-mail | index | archive | help

o locate.bigram.c
	Bigram does not remove the newline at the end of each filename.
	This breaks the bigram algorithm in particular, and makes
	/var/db/locate.database grow by 15%. It's really a bug!!!

	The bigram output is silly and accounts for ~1/2 of the CPU time
	of a database rebuild.

	old:
	locate.bigram < $filelist | sort | uniq -c | sort -T /$TMPDIR -nr 
                                    ^^^^^^^^^^^^^^
				    bigram can easily do this itself

	new:
        locate.bigram < $filelist | sort -T /$TMPDIR -nr 


o locate.code.c
	Use a lookup array instead of a function. 3x faster (the GNU code
        is now 6x slower)

o locate.updatedb.csh
	# search locally or everything
	# find ${SRCHPATHS} -print | \
	find ${SRCHPATHS} \! -fstype local -prune -or -print | \
	tr '/' '\001' | \
        ^^^^^^^^^^^^^^^^^^
Superfluous. Nobody needs it.

	(sort -T $TMPDIR -f; echo $status > $errs) | tr '\001' '/' > $filelist
                         ^^
Wrong; it makes the database 0.5% bigger.
                                                     ^^^^^^^^^^^^^^^^^^^^^^^^

Superfluous, see above. It doubles the disk space needed for the
filenames: they are in a temporary sort file and, at the same time, in
$filelist. "sort -T $TMPDIR -o $filelist" avoids this by renaming the
temporary sort file to $filelist. My database is 115 MB ...


	$LIBDIR/locate.bigram < $filelist | \
	(sort -T /$TMPDIR; echo $status >> $errs) | uniq -c | 
	^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
see locate.bigram.c


--- /usr/src/usr.bin/locate/bigram/locate.bigram.c	Fri May 27 14:32:02 1994
+++ locate.bigram.c	Tue Sep 12 14:24:56 1995
@@ -53,32 +53,51 @@
 
 #include <stdio.h>
 #include <sys/param.h>			/* for MAXPATHLEN */
+#include <string.h>			/* memchr */
 
-char buf1[MAXPATHLEN] = " ";	
-char buf2[MAXPATHLEN];
+u_char buf1[MAXPATHLEN] = " ";	
+u_char buf2[MAXPATHLEN];
+unsigned int bigram[UCHAR_MAX][UCHAR_MAX];
 
-main ( )
+
+void main ( )
 {
-  	register char *cp;
-	register char *oldpath = buf1, *path = buf2;
+  	register u_char *cp;
+	register u_char *oldpath = buf1, *path = buf2;
+	register int i, j;
+
+	/* init bigram buffer */
+	for (i = 0; i < UCHAR_MAX; i++)
+	    	for (j = 0; j < UCHAR_MAX; j++) 
+			bigram[i][j] = 0;
 
      	while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) {
 
+		/* chop newline */
+	        if ((cp = memchr(path, '\n', sizeof(buf2))) != NULL) 
+			*cp = NULL;
+
 		/* skip longest common prefix */
-		for ( cp = path; *cp == *oldpath; cp++, oldpath++ )
-			if ( *oldpath == NULL )
-				break;
+		for (cp = path; (*cp == *oldpath) != NULL; cp++, oldpath++);
+
 		/*
 		 * output post-residue bigrams only
 		 */
+
 		while ( *cp != NULL && *(cp + 1) != NULL ) {
-			putchar ( *cp++ );
-			putchar ( *cp++ );
-			putchar ( '\n' );
+		    bigram[*cp][*(cp+1)]++;
+		    cp += 2;
 		}
+
 		if ( path == buf1 )		/* swap pointers */
 			path = buf2, oldpath = buf1;
 		else
 			path = buf1, oldpath = buf2;
    	}
+
+	/* output */
+	for (i = 0; i < UCHAR_MAX; i++)
+	    for (j = 0; j < UCHAR_MAX; j++)
+		if (bigram[i][j] != 0) 
+		    fprintf(stdout, "%4d %c%c\n", bigram[i][j], i, j);
 }
--- 1.1	1995/09/12 16:40:45
+++ locate.code.c	1995/09/12 18:07:57
@@ -93,8 +93,20 @@
 char buf2[MAXPATHLEN];
 char bigrams[BGBUFSIZE + 1] = { 0 };
 
+#if 1
+#define BGINDEX(x) (big[(int)*x][(int)*(x+1)])
+typedef u_char bg_t;
+bg_t big[UCHAR_MAX][UCHAR_MAX];
+
+#else
+#define BGINDEX(x) bgindex(x)
+typedef int bg_t;
+#endif
+
 int	bgindex __P((char *));
 void	usage __P((void));
+extern int optind;
+extern int optopt;
 
 int
 main(argc, argv)
@@ -104,6 +116,7 @@
 	register char *cp, *oldpath, *path;
 	int ch, code, count, diffcount, oldcount;
 	FILE *fp;
+	register int i, j;
 
 	while ((ch = getopt(argc, argv, "")) != EOF)
 		switch(ch) {
@@ -126,14 +139,22 @@
 		err(1, "stdout");
 	(void)fclose(fp);
 
+	/* init lookup table */
+	for (i = 0; i < UCHAR_MAX; i++)
+	    	for (j = 0; j < UCHAR_MAX; j++) 
+			big[i][j] = (bg_t)-1;
+
+	for (cp = bigrams, i = 0; *cp != NULL; i += 2, cp += 2)
+	        big[(int)*cp][(int)*(cp + 1)] = (bg_t)i;
+
 	oldpath = buf1;
 	path = buf2;
 	oldcount = 0;
 	while (fgets(path, sizeof(buf2), stdin) != NULL) {
-		/* Truncate newline. */
-		cp = path + strlen(path) - 1;
-		if (cp > path && *cp == '\n')
-			*cp = '\0';
+
+		/* chop newline */
+	        if ((cp = memchr(path, '\n', sizeof(buf2))) != NULL) 
+			*cp = NULL;
 
 		/* Squelch characters that would botch the decoding. */
 		for (cp = path; *cp != NULL; cp++) {
@@ -144,9 +165,9 @@
 		}
 
 		/* Skip longest common prefix. */
-		for (cp = path; *cp == *oldpath; cp++, oldpath++)
-			if (*oldpath == NULL)
-				break;
+		for (cp = path; (*cp == *oldpath) != NULL; cp++, oldpath++)
+		    ;
+
 		count = cp - path;
 		diffcount = count - oldcount + OFFSET;
 		oldcount = count;
@@ -164,7 +185,7 @@
 					err(1, "stdout");
 				break;
 			}
-			if ((code = bgindex(cp)) < 0) {
+			if ((code = BGINDEX(cp)) == (bg_t)-1) {
 				if (putchar(*cp++) == EOF ||
 				    putchar(*cp++) == EOF)
 					err(1, "stdout");
--- /usr/libexec/locate.updatedb.old	Sun Jan  1 05:36:58 1995
+++ /usr/libexec/locate.updatedb	Tue Sep 12 14:33:32 1995
@@ -58,12 +58,10 @@
 # search locally or everything
 # find ${SRCHPATHS} -print | \
 find ${SRCHPATHS} \! -fstype local -prune -or -print | \
-	tr '/' '\001' | \
-	(sort -T $TMPDIR -f; echo $status > $errs) | tr '\001' '/' > $filelist
+	sort -T $TMPDIR -o $filelist; echo $status > $errs
 
-$LIBDIR/locate.bigram < $filelist | \
-	(sort -T /$TMPDIR; echo $status >> $errs) | \
-	uniq -c | sort -T /$TMPDIR -nr | \
+$LIBDIR/locate.bigram.new < $filelist | \
+	sort -T $TMPDIR -nr | \
 	awk '{ if (NR <= 128) print $2 }' | tr -d '\012' > $bigrams
 
 # code the file list



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?199509122124.XAA08914>