Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 23 Sep 2008 14:53:38 GMT
From:      "Pedro F. Giffuni" <giffunip@tutopia.com>
To:        freebsd-gnats-submit@FreeBSD.org
Subject:   ports/127580: New port: textproc/amberfish
Message-ID:  <200809231453.m8NErcGg071285@www.freebsd.org>
Resent-Message-ID: <200809231500.m8NF01nh068007@freefall.freebsd.org>

next in thread | raw e-mail | index | archive | help

>Number:         127580
>Category:       ports
>Synopsis:       New port: textproc/amberfish
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    freebsd-ports-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          change-request
>Submitter-Id:   current-users
>Arrival-Date:   Tue Sep 23 15:00:01 UTC 2008
>Closed-Date:
>Last-Modified:
>Originator:     Pedro F. Giffuni
>Release:        7.0-Release
>Organization:
>Environment:
FreeBSD kakumen.cable.net.co 7.0-RELEASE-p3 FreeBSD 7.0-RELEASE-p3 #0: Fri Aug  8 16:42:27 COT 2008     root@kakumen.cable.net.co:/usr/src/sys/amd64/compile/GENERIC  amd64
>Description:
A new search engine. This is an improved evolution over the classic Isearch software.
>How-To-Repeat:

>Fix:
# This is a shell archive.  Save it in a file, remove anything before
# this line, and then unpack it by entering "sh file".  Note, it may
# create directories; files and directories will be owned by you and
# have default permissions.
#
# This archive contains:
#
#	./amberfish
#	./amberfish/Makefile
#	./amberfish/distinfo
#	./amberfish/pkg-descr
#	./amberfish/pkg-plist
#	./amberfish/files
#	./amberfish/files/porter.cc
#	./amberfish/files/patch-Makefile.in
#	./amberfish/amberfish.shar
#
echo c - ./amberfish
mkdir -p ./amberfish > /dev/null 2>&1
echo x - ./amberfish/Makefile
sed 's/^X//' >./amberfish/Makefile << 'END-of-./amberfish/Makefile'
X# New ports collection makefile for:   amberfish
X# Date created:			21 Aug 2008
X# Whom:				Pedro Giffuni
X#
X# $FreeBSD$
X#
X
XPORTNAME=	amberfish
XPORTVERSION=	1.6.4
XCATEGORIES=	textproc databases
XMASTER_SITES=	SF	\
X		http://etymon.com/software/amberfish/stable/
X
XMAINTAINER=	giffunip@tutopia.com
XCOMMENT=	General purpose text retrieval Software
X
XLIB_DEPENDS=	xerces-c.27:${PORTSDIR}/textproc/xerces-c2
X
XGNU_CONFIGURE=	yes
XUSE_GMAKE=	yes
X
XMAN1=	af.1
X
X# Copy porter.cc from the port's files directory into ${WRKSRC}/src
X# once the distribution has been extracted.
Xpost-extract:
X	${INSTALL_DATA} ${FILESDIR}/porter.cc ${WRKSRC}/src
X
X# The two targets below are only defined when NOPORTDOCS is not set.
X.if !defined(NOPORTDOCS)
X# Run the "html" target with gmake from the top of ${WRKSRC}.
Xpost-build:
X	@(cd ${WRKSRC} && ${SETENV} ${MAKE_ENV} ${GMAKE} html)
X
X# Create ${DOCSDIR} and install amberfish.png plus the HTML pages found
X# in ${WRKSRC}/doc/html into it.
Xpost-install:
X	${MKDIR} ${DOCSDIR}
X	${INSTALL_DATA} ${WRKSRC}/amberfish.png ${DOCSDIR}
X	${INSTALL_MAN} ${WRKSRC}/doc/html/*.html ${DOCSDIR}
X.endif
X
X.include <bsd.port.mk>
END-of-./amberfish/Makefile
echo x - ./amberfish/distinfo
sed 's/^X//' >./amberfish/distinfo << 'END-of-./amberfish/distinfo'
XMD5 (amberfish-1.6.4.tar.gz) = 8eb3f1e26da9d0317719822539c3b932
XSHA256 (amberfish-1.6.4.tar.gz) = 155ac6e6b9b76fb7cbd94952548f718ab6add72c3b4fd2482d89abb39d96ce76
XSIZE (amberfish-1.6.4.tar.gz) = 127198
END-of-./amberfish/distinfo
echo x - ./amberfish/pkg-descr
sed 's/^X//' >./amberfish/pkg-descr << 'END-of-./amberfish/pkg-descr'
XAmberfish(R)
X
XAmberfish is general purpose text retrieval software, developed at Etymon 
Xby Nassib Nassar and distributed as open source software under the terms 
Xof version 2 of the GNU General Public License (GPL). Its distinguishing 
Xfeatures are indexing/search of semi-structured text (i.e. both free text 
Xand multiply nested fields), built-in support for XML documents using the 
XXerces library, structured queries allowing generalized field/tag paths, 
Xhierarchical result sets (XML only), automatic searching across multiple 
Xdatabases (allowing modular indexing), TREC format results, efficient 
Xindexing, and relatively low memory requirements during indexing (and the 
Xability to index documents larger than available memory). Z39.50 support
Xis available. Other features include Boolean queries, right truncation,  
Xphrase searching, relevance ranking, support for multiple documents per 
Xfile, incremental indexing, and easy integration with other UNIX tools. 
XThe architecture is also designed to permit proximity queries; however, 
Xthey are not fully implemented at present.
X
XWWW: http://www.etymon.com/tr.html
X
XThis port also includes the Porter stemming algorithm for suffix 
Xstripping, available at:
X     http://www.tartarus.org/~martin/PorterStemmer
END-of-./amberfish/pkg-descr
echo x - ./amberfish/pkg-plist
sed 's/^X//' >./amberfish/pkg-plist << 'END-of-./amberfish/pkg-plist'
Xbin/af
X%%PORTDOCS%%%%DOCSDIR%%/Fields-and-XML.html
X%%PORTDOCS%%%%DOCSDIR%%/Fields.html
X%%PORTDOCS%%%%DOCSDIR%%/Indexing.html
X%%PORTDOCS%%%%DOCSDIR%%/Introduction.html
X%%PORTDOCS%%%%DOCSDIR%%/Listing-database-information.html
X%%PORTDOCS%%%%DOCSDIR%%/More-about-XML.html
X%%PORTDOCS%%%%DOCSDIR%%/Multiple-databases.html
X%%PORTDOCS%%%%DOCSDIR%%/Multiple-documents-in-a-file.html
X%%PORTDOCS%%%%DOCSDIR%%/Phrases.html
X%%PORTDOCS%%%%DOCSDIR%%/Relevance-ranking.html
X%%PORTDOCS%%%%DOCSDIR%%/Right-truncation.html
X%%PORTDOCS%%%%DOCSDIR%%/Searching.html
X%%PORTDOCS%%%%DOCSDIR%%/Tutorial.html
X%%PORTDOCS%%%%DOCSDIR%%/amberfish.png
X%%PORTDOCS%%%%DOCSDIR%%/index.html
X%%PORTDOCS%%@dirrm %%DOCSDIR%%
END-of-./amberfish/pkg-plist
echo c - ./amberfish/files
mkdir -p ./amberfish/files > /dev/null 2>&1
echo x - ./amberfish/files/porter.cc
sed 's/^X//' >./amberfish/files/porter.cc << 'END-of-./amberfish/files/porter.cc'
X
X/* This is the Porter stemming algorithm, coded up in ANSI C by the
X   author. It may be regarded as canonical, in that it follows the
X   algorithm presented in
X
X   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
X   no. 3, pp 130-137,
X
X   only differing from it at the points marked --DEPARTURE-- below.
X
X   See also http://www.tartarus.org/~martin/PorterStemmer
X
XThe algorithm as described in the paper could be exactly replicated
Xby adjusting the points of DEPARTURE, but this is barely necessary,
Xbecause (a) the points of DEPARTURE are definitely improvements, and
X(b) no encoding of the Porter stemmer I have seen is anything like
Xas exact as this version, even with the points of DEPARTURE!
X
XYou can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
X'stem' takes a list of inputs and sends the stemmed equivalent to
Xstdout.
X
XThe algorithm as encoded here is particularly fast.
X
XRelease 1
X*/
X
X#include <string.h>                               /* for memmove */
X
X#define TRUE 1
X#define FALSE 0
X
X/* The main part of the stemming algorithm starts here. b is a buffer
X   holding a word to be stemmed. The letters are in b[k0], b[k0+1] ...
X   ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted
X   downwards as the stemming progresses. Zero termination is not in fact
X   used in the algorithm.
X
X   Note that only lower case sequences are stemmed. Forcing to lower case
X   should be done before stem(...) is called.
X*/
X
Xstatic char * b;                                  /* buffer for word to be stemmed */
Xstatic int k,k0,j;                                /* j is a general offset into the string */
X
X/* cons(i) reports whether b[i] is a consonant (TRUE) or a vowel (FALSE).
X   a, e, i, o and u are always vowels. 'y' is a consonant at the start of
X   the word (i == k0); anywhere else it is a consonant exactly when the
X   letter before it is a vowel. Every other character is a consonant. */
X
Xstatic int cons(int i)
X{
X    char ch = b[i];
X
X    if (ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u')
X        return FALSE;
X    if (ch != 'y')
X        return TRUE;
X    /* 'y': decided by its position and by the preceding letter */
X    if (i == k0)
X        return TRUE;
X    return !cons(i-1);
X}
X
X
X/* m() counts the vowel-consonant transitions in b[k0] ... b[j]. Writing c
X   for a run of consonants and v for a run of vowels, with <..> marking an
X   optional run:
X
X      <c><v>       gives 0
X      <c>vc<v>     gives 1
X      <c>vcvc<v>   gives 2
X      <c>vcvcvc<v> gives 3
X      ....
X*/
X
Xstatic int m()
X{
X    int count = 0;
X    int pos = k0;
X
X    /* skip the optional leading consonant run */
X    while (pos <= j && cons(pos)) pos++;
X    if (pos > j) return count;
X    pos++;                                        /* step over the first vowel */
X
X    for (;;)
X    {
X        /* remainder of the vowel run */
X        while (pos <= j && !cons(pos)) pos++;
X        if (pos > j) return count;
X        pos++;                                    /* first consonant of the run */
X        count++;                                  /* one more "vc" pair seen */
X
X        /* remainder of the consonant run */
X        while (pos <= j && cons(pos)) pos++;
X        if (pos > j) return count;
X        pos++;                                    /* first vowel of the next run */
X    }
X}
X
X
X/* vowelinstem() returns TRUE when at least one of b[k0] ... b[j] is a
X   vowel according to cons(), and FALSE otherwise. */
X
Xstatic int vowelinstem()
X{
X    int pos = k0;
X    while (pos <= j)
X    {
X        if (!cons(pos)) return TRUE;
X        pos++;
X    }
X    return FALSE;
X}
X
X
X/* doublec(j) is TRUE when b[j-1] and b[j] both lie inside the word, hold
X   the same character, and that character is a consonant. */
X
Xstatic int doublec(int j)
X{
X    return j >= k0+1 && b[j] == b[j-1] && cons(j);
X}
X
X
X/* cvc(i) is TRUE when b[i-2], b[i-1], b[i] read consonant - vowel -
X   consonant and the final consonant is none of w, x, y. It is used when
X   deciding whether to restore an e at the end of a short word, e.g.
X
X      cav(e), lov(e), hop(e), crim(e), but
X      snow, box, tray.
X
X*/
X
Xstatic int cvc(int i)
X{
X    if (i < k0+2) return FALSE;                   /* fewer than three letters */
X    if (!(cons(i) && !cons(i-1) && cons(i-2))) return FALSE;
X    switch (b[i])
X    {
X        case 'w': case 'x': case 'y': return FALSE;
X        default: return TRUE;
X    }
X}
X
X
X/* ends(s) is TRUE <=> b[k0] ... b[k] ends with the suffix s.
X
X   s is length-prefixed: s[0] holds the number of suffix characters, which
X   follow in s[1] ... s[s[0]]. On a match j is set to k-length, the index
X   of the last character before the suffix. When there is no match j is
X   left untouched and FALSE is returned.
X
X   s is never written to, so it is taken as const char *. Every caller
X   passes a string literal, and binding a literal to a plain char * is
X   deprecated in C++. */
X
Xstatic int ends(const char * s)
X{
X    int length = s[0];
X    if (s[length] != b[k]) return FALSE;          /* tiny speed-up: compare last letters first */
X    if (length > k-k0+1) return FALSE;            /* suffix longer than the word */
X    if (memcmp(b+k-length+1,s+1,length) != 0) return FALSE;
X    j = k-length;
X    return TRUE;
X}
X
X
X/* setto(s) overwrites b[j+1] ... with the characters of s and sets k to
X   the new last index, j+length.
X
X   s is length-prefixed: s[0] is the character count and the characters
X   follow from s[1]. A count of zero copies nothing and simply truncates
X   the word at j. s is only read, so it is taken as const char *; callers
X   pass string literals. */
X
Xstatic void setto(const char * s)
X{
X    int length = s[0];
X    memmove(b+j+1,s+1,length);
X    k = j+length;
X}
X
X
X/* r(s) replaces the current suffix (everything after j) with s, but only
X   when the stem b[k0] ... b[j] has m() > 0. Used by step2() and step3(). */
X
Xstatic void r(const char * s) { if (m() > 0) setto(s); }
X
X/* step1ab() gets rid of plurals and -ed or -ing. e.g.
X
X	caresses  ->  caress
X	ponies    ->  poni
X	ties      ->  ti
X	caress    ->  caress
X	cats      ->  cat
X
X	feed      ->  feed
X	agreed    ->  agree
X	disabled  ->  disable
X
X	matting   ->  mat
X	mating    ->  mate
X	meeting   ->  meet
X	milling   ->  mill
X	messing   ->  mess
X
X	meetings  ->  meet
X
X*/
X
Xstatic void step1ab()
X{
X    /* Plurals: -sses -> -ss, -ies -> -i, and a final s that is not
X       preceded by another s is dropped. */
X    if (b[k] == 's')
X    {
X        if (ends("\04" "sses"))
X        {
X            k -= 2;
X        }
X        else if (ends("\03" "ies"))
X        {
X            setto("\01" "i");
X        }
X        else if (b[k-1] != 's')
X        {
X            k--;
X        }
X    }
X
X    /* -eed loses its d only when the stem in front of it has m() > 0;
X       either way nothing else is tried for such a word. */
X    if (ends("\03" "eed"))
X    {
X        if (m() > 0) k--;
X        return;
X    }
X
X    /* -ed and -ing are removed only when the remaining stem has a vowel. */
X    if (!((ends("\02" "ed") || ends("\03" "ing")) && vowelinstem())) return;
X
X    k = j;                                        /* cut the suffix off */
X    if (ends("\02" "at"))
X    {
X        setto("\03" "ate");
X    }
X    else if (ends("\02" "bl"))
X    {
X        setto("\03" "ble");
X    }
X    else if (ends("\02" "iz"))
X    {
X        setto("\03" "ize");
X    }
X    else if (doublec(k))
X    {
X        /* undouble the final consonant, except for l, s and z */
X        int last;
X        k--;
X        last = b[k];
X        if (last == 'l' || last == 's' || last == 'z') k++;
X    }
X    else if (m() == 1 && cvc(k))
X    {
X        setto("\01" "e");
X    }
X}
X
X
X/* step1c() replaces a final y with i, provided the part of the word in
X   front of that y contains a vowel. */
X
Xstatic void step1c()
X{
X    if (!ends("\01" "y")) return;
X    if (vowelinstem()) b[k] = 'i';
X}
X
X/* step2() maps double suffices to single ones. so -ization ( = -ize plus
X   -ation) maps to -ize etc. note that the string before the suffix must give
X   m() > 0. */
X
Xstatic void step2()
X{
X    /* Dispatch on the penultimate letter b[k-1]: each case lists only the
X       suffixes that have that letter in second-to-last position. When
X       ends() matches it records the end of the stem in j, and r() then
X       performs the replacement only if m() > 0 for that stem. At most one
X       suffix is handled per call. */
X    switch (b[k-1])
X    {
X        case 'a': if (ends("\07" "ational")) { r("\03" "ate"); break; }
X        if (ends("\06" "tional")) { r("\04" "tion"); break; }
X        break;
X        case 'c': if (ends("\04" "enci")) { r("\04" "ence"); break; }
X        if (ends("\04" "anci")) { r("\04" "ance"); break; }
X        break;
X        case 'e': if (ends("\04" "izer")) { r("\03" "ize"); break; }
X        break;
X        case 'l': if (ends("\03" "bli"))          /*-DEPARTURE-*/
X        {
X            r("\03" "ble"); break;
X        }
X
X/* To match the published algorithm, replace this line with
X   case 'l': if (ends("\04" "abli")) { r("\04" "able"); break; } */
X
X        if (ends("\04" "alli")) { r("\02" "al"); break; }
X        if (ends("\05" "entli")) { r("\03" "ent"); break; }
X        if (ends("\03" "eli")) { r("\01" "e"); break; }
X        if (ends("\05" "ousli")) { r("\03" "ous"); break; }
X        break;
X        case 'o': if (ends("\07" "ization")) { r("\03" "ize"); break; }
X        if (ends("\05" "ation")) { r("\03" "ate"); break; }
X        if (ends("\04" "ator")) { r("\03" "ate"); break; }
X        break;
X        case 's': if (ends("\05" "alism")) { r("\02" "al"); break; }
X        if (ends("\07" "iveness")) { r("\03" "ive"); break; }
X        if (ends("\07" "fulness")) { r("\03" "ful"); break; }
X        if (ends("\07" "ousness")) { r("\03" "ous"); break; }
X        break;
X        case 't': if (ends("\05" "aliti")) { r("\02" "al"); break; }
X        if (ends("\05" "iviti")) { r("\03" "ive"); break; }
X        if (ends("\06" "biliti")) { r("\03" "ble"); break; }
X        break;
X        case 'g': if (ends("\04" "logi"))         /*-DEPARTURE-*/
X        {
X            r("\03" "log"); break;
X        }
X
X/* To match the published algorithm, delete this line */
X
X    }
X}
X
X
X/* step3() handles -icate, -ative, -alize, -iciti, -ical, -ful and -ness.
X   The last letter b[k] selects which suffixes are tried; the first one
X   that matches is replaced through r(), i.e. only when m() > 0 for the
X   stem in front of it. */
X
Xstatic void step3()
X{
X    char last = b[k];
X
X    if (last == 'e')
X    {
X        if (ends("\05" "icate")) r("\02" "ic");
X        else if (ends("\05" "ative")) r("\00" "");
X        else if (ends("\05" "alize")) r("\02" "al");
X    }
X    else if (last == 'i')
X    {
X        if (ends("\05" "iciti")) r("\02" "ic");
X    }
X    else if (last == 'l')
X    {
X        if (ends("\04" "ical")) r("\02" "ic");
X        else if (ends("\03" "ful")) r("\00" "");
X    }
X    else if (last == 's')
X    {
X        if (ends("\04" "ness")) r("\00" "");
X    }
X}
X
X
X/* step4() takes off -ant, -ence etc., in context <c>vcvc<v>. */
X
Xstatic void step4()
X{
X    /* Each case tests the suffixes whose penultimate letter is b[k-1].
X       A successful ends() leaves j at the end of the stem and breaks out
X       of the switch; when no suffix in the case matches, the function
X       returns without changing k. */
X    switch (b[k-1])
X    {
X        case 'a': if (ends("\02" "al")) break; return;
X        case 'c': if (ends("\04" "ance")) break;
X        if (ends("\04" "ence")) break; return;
X        case 'e': if (ends("\02" "er")) break; return;
X        case 'i': if (ends("\02" "ic")) break; return;
X        case 'l': if (ends("\04" "able")) break;
X        if (ends("\04" "ible")) break; return;
X        case 'n': if (ends("\03" "ant")) break;
X        if (ends("\05" "ement")) break;
X        if (ends("\04" "ment")) break;
X        if (ends("\03" "ent")) break; return;
X        /* -ion only counts when the letter before it (b[j]) is s or t */
X        case 'o': if (ends("\03" "ion") && (b[j] == 's' || b[j] == 't')) break;
X        if (ends("\02" "ou")) break; return;
X/* takes care of -ous */
X        case 's': if (ends("\03" "ism")) break; return;
X        case 't': if (ends("\03" "ate")) break;
X        if (ends("\03" "iti")) break; return;
X        case 'u': if (ends("\03" "ous")) break; return;
X        case 'v': if (ends("\03" "ive")) break; return;
X        case 'z': if (ends("\03" "ize")) break; return;
X        default: return;
X    }
X    /* a suffix matched: drop it only when the stem before it has m() > 1 */
X    if (m() > 1) k = j;
X}
X
X
X/* step5() tidies up the end of the word. A final -e is removed when m()
X   is greater than 1, or when m() is exactly 1 and the letters before the
X   e do not form a cvc() pattern. After that, a final -ll is shortened to
X   -l when m() > 1. */
X
Xstatic void step5()
X{
X    j = k;
X    if (b[k] == 'e')
X    {
X        int measure = m();
X        if (measure > 1)
X        {
X            k--;
X        }
X        else if (measure == 1 && !cvc(k-1))
X        {
X            k--;
X        }
X    }
X    if (b[k] == 'l' && doublec(k) && m() > 1) k--;
X}
X
X
X/* In stem(p,i,j), p is a char pointer, and the string to be stemmed is from
X   p[i] to p[j] inclusive. Typically i is zero and j is the offset to the last
X   character of a string, (p[j+1] == '\0'). The stemmer adjusts the
X   characters p[i] ... p[j] and returns the new end-point of the string, k.
X   Stemming never increases word length, so i <= k <= j. To turn the stemmer
X   into a module, declare 'stem' as extern, and delete the remainder of this
X   file.
X
X   The function works through the file-level statics b, k, k0 and j, so it
X   is not reentrant.
X*/
X
Xint stem(char * p, int i, int j)
X{                                                 /* copy the parameters into statics */
X    b = p; k = j; k0 = i;
X    if (k <= k0+1) return k;                      /*-DEPARTURE-*/
X
X/* With this line, strings of length 1 or 2 don't go through the
X   stemming process, although no mention is made of this in the
X   published algorithm. Remove the line to match the published
X   algorithm. */
X
X    step1ab();
X
X    /* step1ab() can reduce the word to a single letter (k == k0), e.g.
X       "ies" -> "i" or "ied" -> "i". step2() and step4() inspect b[k-1],
X       which would then lie in front of the word, so the remaining steps
X       are only run while at least two letters are left. */
X    if (k > k0)
X    {
X        step1c(); step2(); step3(); step4(); step5();
X    }
X    return k;
X}
X
X
X/*--------------------stemmer definition ends here------------------------*/
X
X#include <stdio.h>
X#include <stdlib.h>                               /* for malloc, free */
X#include <ctype.h>                                /* for isupper, islower, tolower */
X
Xstatic char * s;                                  /* a char * (=string) pointer; passed into b above */
X
X#define INC 50                                    /* size units in which s is increased */
Xstatic int i_max = INC;                           /* maximum offset in s */
X
Xvoid increase_s()
X{
X    i_max += INC;
X    {
X        char * new_s = (char *) malloc(i_max+1);
X        {                                         /* copy across */
X            int i; for (i = 0; i < i_max; i++) new_s[i] = s[i];
X        }
X        free(s); s = new_s;
X    }
X}
X
X
X#define LETTER(ch) (isupper(ch) || islower(ch))
X
Xstatic void stemfile(FILE * f)
X{
X    while(TRUE)
X    {
X        int ch = getc(f);
X        if (ch == EOF) return;
X        if (LETTER(ch))
X        {
X            int i = 0;
X            while(TRUE)
X            {
X                if (i == i_max) increase_s();
X
X                ch = tolower(ch);                 /* forces lower case */
X
X                s[i] = ch; i++;
X                ch = getc(f);
X                if (!LETTER(ch)) { ungetc(ch,f); break; }
X            }
X            s[stem(s,0,i-1)+1] = 0;
X/* the previous line calls the stemmer and uses its result to
X   zero-terminate the string in s */
X            printf("%s",s);
X        }
X        else putchar(ch);
X    }
X}
X
X/*
X * Commented out as required by amberfish's INSTALL file
X *
X	int main(int argc, char * argv[])
X	{
X	    int i;
X	    s = (char *) malloc(i_max+1);
X	    for (i = 1; i < argc; i++)
X	    {
X	        FILE * f = fopen(argv[i],"r");
X	        if (f == 0) { fprintf(stderr,"File %s not found\n",argv[i]); exit(1); }
X	        stemfile(f);
X	    }
X	    free(s);
X	    return 0;
X	}
X*/
END-of-./amberfish/files/porter.cc
echo x - ./amberfish/files/patch-Makefile.in
sed 's/^X//' >./amberfish/files/patch-Makefile.in << 'END-of-./amberfish/files/patch-Makefile.in'
X--- src/Makefile.in.orig	2008-08-12 11:45:08.000000000 -0500
X+++ src/Makefile.in	2008-08-12 11:46:07.000000000 -0500
X@@ -66,7 +66,7 @@
X 	strip af
X 
X install: all
X-	make strip
X+	${MAKE} strip
X 	mkdir -p ${PREFIXBIN}
X 	cp ${BIN} ${PREFIXBIN}/.
X 
END-of-./amberfish/files/patch-Makefile.in
echo x - ./amberfish/amberfish.shar
sed 's/^X//' >./amberfish/amberfish.shar << 'END-of-./amberfish/amberfish.shar'
END-of-./amberfish/amberfish.shar
exit



>Release-Note:
>Audit-Trail:
>Unformatted:



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200809231453.m8NErcGg071285>