[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/porter - output.txt:1.1.2.1 porterstem.c:1.1.2.1 stem.h:1.1.2.1 stem.sbl:1.1.2.1 stemmer.html:1.1.2.1 voc.txt:1.1.2.1

Andreas Jung andreas@digicool.com
Wed, 13 Feb 2002 11:26:25 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/porter
In directory cvs.zope.org:/tmp/cvs-serv30556/PyStemmer/porter

Added Files:
      Tag: ajung-textindexng-branch
	output.txt porterstem.c stem.h stem.sbl stemmer.html voc.txt 
Log Message:
added PyStemmer


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/porter/output.txt === (30328/30428 lines abridged)
a
aaron
aback
abaissiez
abandon
abandon
abas
abash
abash
abat
abat
abat
abat
abat
abatt
abb
abbess
abbei
abbei
abbomin
abbot
abbot
abbrevi
abdic
abduct
ab
abel
aberga
abergavenni
abet
abet
abhomin
abhor
abhorr
abhor
abhorr
abhor
abhor
abhorson
abid
abid
abil
abil
abingdon
abject
abjectli
abject
abjur
abjur
abl

[-=- -=- -=- 30328 lines omitted -=- -=- -=-]

yorick
york
yorkist
york
yorkshir
you
young
younger
youngest
youngl
youngl
youngli
youngster
youngster
younker
younker
your
your
yourself
yourselv
youth
youth
youth
youtli
zani
zani
zatusfi
zeal
zealou
zeal
zed
zee
zenana
zenelophon
zenith
zephyr
zermon
zero
zest
zettlement
zir
zo
zodiac
zodiac
zoggi
zone
zound
zuleikah
zwagger
zwieback


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/porter/porterstem.c === (415/515 lines abridged)

#include "header.h"

/* Forward declarations: the exported entry point porter_stem, followed by the
 * file-local routines.  The r_* names mirror the routines declared in
 * stem.sbl (Step_1a .. Step_5b, R1, R2, shortv).  Every routine takes the
 * stemmer environment z and returns an int; the visible call sites test that
 * result with '!' to detect failure. */
extern int porter_stem(struct SN_env * z);
static int r_Step_5b(struct SN_env * z);
static int r_Step_5a(struct SN_env * z);
static int r_Step_4(struct SN_env * z);
static int r_Step_3(struct SN_env * z);
static int r_Step_2(struct SN_env * z);
static int r_Step_1c(struct SN_env * z);
static int r_Step_1b(struct SN_env * z);
static int r_Step_1a(struct SN_env * z);
static int r_R2(struct SN_env * z);
static int r_R1(struct SN_env * z);
static int r_shortv(struct SN_env * z);

/* Suffix table whose strings are the endings listed in Step_1a of stem.sbl
 * ("s", "ies", "sses", "ss").  In every entry the first field equals the
 * length of the string in the second field.
 * NOTE(review): struct among is declared outside this view (header.h).  The
 * third field looks like the index of a shorter entry that is a suffix of
 * this one (-1 when there is none), the fourth like the action code reported
 * on a match, and the fifth like an unused callback slot -- confirm against
 * header.h. */
static struct among a_0[4] =
{
/*  0 */ { 1, (byte *)"s", -1, 3, 0},
/*  1 */ { 3, (byte *)"ies", 0, 2, 0},
/*  2 */ { 4, (byte *)"sses", 0, 1, 0},
/*  3 */ { 2, (byte *)"ss", 0, -1, 0}
};

/* Suffix table whose strings are those of the inner among in Step_1b of
 * stem.sbl: the empty string, the doubled consonants, and "bl" / "at" / "iz".
 * The fourth field groups the entries the same way stem.sbl does: 1 for
 * "bl", "at" and "iz", 2 for the doubled consonants, 3 for the empty string.
 * Every non-empty entry has 0 in its third field, i.e. the index of the
 * empty-string entry.
 * NOTE(review): field meanings are inferred from the values; struct among is
 * declared in header.h, which is not visible here -- confirm. */
static struct among a_1[13] =
{
/*  0 */ { 0, (byte *)"", -1, 3, 0},
/*  1 */ { 2, (byte *)"bb", 0, 2, 0},
/*  2 */ { 2, (byte *)"dd", 0, 2, 0},
/*  3 */ { 2, (byte *)"ff", 0, 2, 0},
/*  4 */ { 2, (byte *)"gg", 0, 2, 0},
/*  5 */ { 2, (byte *)"bl", 0, 1, 0},
/*  6 */ { 2, (byte *)"mm", 0, 2, 0},
/*  7 */ { 2, (byte *)"nn", 0, 2, 0},
/*  8 */ { 2, (byte *)"pp", 0, 2, 0},
/*  9 */ { 2, (byte *)"rr", 0, 2, 0},
/* 10 */ { 2, (byte *)"at", 0, 1, 0},
/* 11 */ { 2, (byte *)"tt", 0, 2, 0},
/* 12 */ { 2, (byte *)"iz", 0, 1, 0}
};

/* Suffix table whose strings are those of the outer among in Step_1b of
 * stem.sbl ("ed", "eed", "ing").  The fourth field is 1 for "eed" and 2 for
 * both "ed" and "ing"; in stem.sbl 'ed' has no action of its own and is
 * listed directly before 'ing'.  "eed" carries 0 in its third field, the
 * index of "ed", which is a suffix of it.
 * NOTE(review): field meanings are inferred from the values; struct among is
 * declared in header.h, which is not visible here -- confirm. */
static struct among a_2[3] =
{
/*  0 */ { 2, (byte *)"ed", -1, 2, 0},
/*  1 */ { 3, (byte *)"eed", 0, 1, 0},
/*  2 */ { 3, (byte *)"ing", -1, 2, 0}
};

static struct among a_3[20] =
{

[-=- -=- -=- 415 lines omitted -=- -=- -=-]

    lab13:
        z->c = z->l - m;
    }
    {   int m = z->l - z->c; /* do, line 132 */
        if (!r_Step_4(z)) goto lab14; /* call Step_4, line 132 */
    lab14:
        z->c = z->l - m;
    }
    {   int m = z->l - z->c; /* do, line 133 */
        if (!r_Step_5a(z)) goto lab15; /* call Step_5a, line 133 */
    lab15:
        z->c = z->l - m;
    }
    {   int m = z->l - z->c; /* do, line 134 */
        if (!r_Step_5b(z)) goto lab16; /* call Step_5b, line 134 */
    lab16:
        z->c = z->l - m;
    }
    z->c = z->lb;    {   int c = z->c; /* do, line 137 */
        if (!(z->B[0])) goto lab17; /* Boolean test Y_found, line 137 */
        while(1) { /* repeat, line 137 */
            int c = z->c;
            while(1) { /* goto, line 137 */
                int c = z->c;
                z->bra = z->c; /* [, line 137 */
                if (!(eq_s(z, 1, "Y"))) goto lab19;
                z->ket = z->c; /* ], line 137 */
                z->c = c;
                break;
            lab19:
                z->c = c;
                if (z->c >= z->l) goto lab18;
                z->c++;
            }
            slice_from_s(z, 1, "y"); /* <-, line 137 */
            continue;
        lab18:
            z->c = c;
            break;
        }
    lab17:
        z->c = c;
    }
    return 1;
}

/* Create a stemmer environment by delegating to SN_create_env.
 * NOTE(review): the arguments (0, 2, 1) presumably size the string, integer
 * and boolean slots -- stem.sbl declares two integers (p1, p2) and one
 * boolean (Y_found); confirm against SN_create_env's declaration.
 * Presumably paired with porter_close_env() for teardown. */
extern struct SN_env * porter_create_env(void)
{
    return SN_create_env(0, 2, 1);
}

/* Dispose of a stemmer environment by handing z to SN_close_env.
 * NOTE(review): z is presumably a value obtained from porter_create_env();
 * whether a null z is tolerated depends on SN_close_env -- confirm. */
extern void porter_close_env(struct SN_env * z)
{
    SN_close_env(z);
}



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/porter/stem.h ===

#ifndef PORTER_STEM_H
#define PORTER_STEM_H

/* Public interface of the Porter stemmer implemented in porterstem.c. */

/* Only pointers to the environment are used in this interface, so an
 * incomplete declaration is enough to make the header self-contained.
 * NOTE(review): the full definition presumably comes from header.h, which
 * porterstem.c includes -- confirm. */
struct SN_env;

/* Create a stemmer environment (porterstem.c delegates to SN_create_env). */
extern struct SN_env * porter_create_env(void);

/* Dispose of a stemmer environment (porterstem.c delegates to SN_close_env). */
extern void porter_close_env(struct SN_env * z);

/* Run the stemmer on the environment z.  The visible end of the
 * implementation returns 1; other return paths are not visible here. */
extern int porter_stem(struct SN_env * z);

#endif /* PORTER_STEM_H */



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/porter/stem.sbl ===
// Declarations for the Porter stemmer.

// p1 and p2 are integer marks: 'stem' sets them and R1 / R2 test them.
integers ( p1 p2 )
// Y_found is unset at the start of 'stem' and set whenever a 'y' is
// rewritten as 'Y'.
booleans ( Y_found )

// Internal routines; each one is defined in the backwardmode section.
routines (
   shortv
   R1 R2
   Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b
)

// 'stem' is the only routine listed as external.
externals ( stem )

// Character groupings used by the routines.
groupings ( v v_WXY )

// v: the vowel letters.  Lower-case 'y' is listed, upper-case 'Y' is not.
define v        'aeiouy'
// v_WXY: v plus 'w', 'x' and 'Y'; referenced only by shortv.
define v_WXY    v + 'wxY'

// The routines in this section are the ones run (directly or indirectly)
// from the 'backwards ( ... )' part of 'stem'.
// NOTE(review): in backward mode the cursor presumably moves from the end of
// the word towards its start, so patterns are matched right to left --
// confirm with the Snowball manual.
backwardmode (

    // shortv: matches, in the order written, a character outside v_WXY,
    // then a character in v, then a character outside v.
    define shortv as ( non-v_WXY v non-v )

    // R1 / R2: succeed when mark p1 (respectively p2) is <= the cursor.
    define R1 as $p1 <= cursor
    define R2 as $p2 <= cursor

    // Step_1a: plural endings.  'sses' becomes 'ss', 'ies' becomes 'i',
    // 'ss' has an empty action, and any other final 's' is deleted
    // (e.g. voc.txt 'abbess' stays 'abbess' in output.txt).
    define Step_1a as (
        [substring] among (
            'sses' (<-'ss')
            'ies'  (<-'i')
            'ss'   ()
            's'    (delete)
        )
    )

    // Step_1b: 'eed' becomes 'ee' when R1 holds.  'ed' has no action of its
    // own and shares the one written after 'ing': the ending is deleted
    // provided 'gopast v' succeeds, and then the new ending is examined:
    //   - 'at', 'bl', 'iz'      : 'e' is inserted (<+)
    //   - listed double letters : one character is bracketed and deleted
    //   - otherwise ('')        : 'e' is inserted when the cursor is at mark
    //                             p1 and shortv matches
    define Step_1b as (
        [substring] among (
            'eed'  (R1 <-'ee')
            'ed'
            'ing' (
                test gopast v  delete
                test substring among(
                    'at' 'bl' 'iz'
                         (<+ 'e')
                    'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
                    // ignoring double c, h, j, k, q, v, w, and x
                         ([next]  delete)
                    ''   (atmark p1  test shortv  <+ 'e')
                )
            )
        )
    )

    // Step_1c: a final 'y' or 'Y' is replaced by 'i' provided 'gopast v'
    // succeeds (e.g. voc.txt 'zany' appears as 'zani' in output.txt).
    define Step_1c as (
        ['y' or 'Y']
        gopast v
        <-'i'
    )

    // Step_2: when R1 holds, each listed suffix is replaced by the shorter
    // form given in its action; strings without an action of their own share
    // the action written after them.
    define Step_2 as (
        [substring] R1 among (
            'tional'  (<-'tion')
            'enci'    (<-'ence')
            'anci'    (<-'ance')
            'abli'    (<-'able')
            'entli'   (<-'ent')
            'eli'     (<-'e')
            'izer' 'ization'
                      (<-'ize')
            'ational' 'ation' 'ator'
                      (<-'ate')
            'alli'    (<-'al')
            'alism' 'aliti'
                      (<-'al')
            'fulness' (<-'ful')
            'ousli' 'ousness'
                      (<-'ous')
            'iveness' 'iviti'
                      (<-'ive')
            'biliti'  (<-'ble')
        )
    )

    // Step_3: when R1 holds, 'alize' becomes 'al'; 'icate', 'iciti' and
    // 'ical' become 'ic'; 'ative', 'ful' and 'ness' are deleted
    // (e.g. voc.txt 'youthful' appears as 'youth' in output.txt).
    define Step_3 as (
        [substring] R1 among (
            'alize'   (<-'al')
            'icate' 'iciti' 'ical'
                      (<-'ic')
            'ative' 'ful' 'ness'
                      (delete)
        )
    )

    // Step_4: when R2 holds, the listed suffixes are deleted; 'ion' is
    // deleted only if 's' or 't' also matches next to it.
    define Step_4 as (
        [substring] R2 among (
            'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement'
            'ment' 'ent' 'ou' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize'
                      (delete)
            'ion'     ('s' or 't' delete)
        )
    )

    // Step_5a: delete a final 'e' if R2 holds, or if R1 holds and shortv
    // does not match (e.g. voc.txt 'abide' appears as 'abid' in output.txt).
    define Step_5a as (
        ['e']
        R2 or (R1 not shortv)
        delete
    )

    // Step_5b: delete the bracketed final 'l' when R2 holds and a further
    // 'l' matches next to it.
    define Step_5b as (
        ['l']
        R2 'l'
        delete
    )
)

// stem: the external entry point.  It marks certain 'y' letters as 'Y',
// computes the marks p1 and p2, runs the suffix steps in backward mode and
// finally turns every 'Y' back into 'y'.
define stem as (

    // Rewrite 'y' as 'Y' at the initial cursor position and after any
    // character in v, recording in Y_found that a rewrite happened.
    unset Y_found
    do ( ['y'] <-'Y' set Y_found)
    do repeat(goto (v ['y']) <-'Y' set Y_found)

    // Both marks start at limit.  p1 is then set after passing a v character
    // followed by a non-v character; p2 after doing the same once more.
    // Because this sits inside 'do', a mark keeps the value limit when its
    // gopast sequence fails.
    $p1 = limit
    $p2 = limit
    do(
        gopast v  gopast non-v  setmark p1
        gopast v  gopast non-v  setmark p2
    )

    // Run the steps in this fixed order.  Each is wrapped in 'do'; the
    // generated C (porterstem.c) restores the cursor after every step
    // regardless of the step's result.
    backwards (
        do Step_1a
        do Step_1b
        do Step_1c
        do Step_2
        do Step_3
        do Step_4
        do Step_5a
        do Step_5b
    )

    // Undo the marking: only when Y_found is set, replace each 'Y' by 'y'.
    do(Y_found  repeat(goto (['Y']) <-'y'))

)


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/porter/stemmer.html === (541/641 lines abridged)

<HTML>
<HEAD>

<TITLE>The Porter stemming algorithm</TITLE></HEAD>
<BODY BGCOLOR=WHITE>
<TABLE WIDTH=75% ALIGN=CENTER COLS=1>
<H1 ALIGN=CENTER>The Porter stemming algorithm</H1>

<TR><TD>
<BR>&nbsp;<H2>Links to resources</H2>

<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="stem.sbl">    The stemmer in Snowball</A>
<TR><TD><A HREF="stem.c">      The ANSI C stemmer</A>
<TR><TD><A HREF="stem.h">      - and its header</A>
<TR><TD><A HREF="voc.txt">     Sample English vocabulary</A>
<TR><TD><A HREF="output.txt">  Its stemmed equivalent</A>
<TR><TD><A HREF="diffs.txt">   Vocabulary + stemmed equivalent</A>
<TR><TD><A HREF="tarball.tgz"> Tar-gzipped file of all of the above</A>
<BR><BR>
<TR><TD><A HREF="http://www.tartarus.org/~martin/PorterStemmer/index.html">
The &#8216;official&#8217; home page of the Porter stemming algorithm</A>
</TABLE></DL>

<BR><BR>

<B>Here is a case study on how to code up a stemming algorithm in Snowball. First,
the definition of the Porter stemmer, as it appeared in <B><I>Program</I></B>, Vol 14 no. 3 pp
130-137, July 1980.</B>

<BR><BR>

<TR><TD BGCOLOR="silver">

<BR>&nbsp;<H2>THE ALGORITHM</H2>

A <I>consonant</I> in a word is a letter other than A, E, I, O or U, and other
than Y preceded by a consonant. (The fact that the term &#8216;consonant&#8217; is
defined to some extent in terms of itself does not make it ambiguous.) So in
TOY the consonants are T and Y, and in SYZYGY they are S, Z and G. If a
letter is not a consonant it is a <I>vowel</I>.
<BR><BR>
A consonant will be denoted by c, a vowel by v. A list ccc... of length
greater than 0 will be denoted by C, and a list vvv... of length greater
than 0 will be denoted by V. Any word, or part of a word, therefore has one
of the four forms:
<DL>
    <DT>CVCV ... C
    <DT>CVCV ... V

[-=- -=- -=- 541 lines omitted -=- -=- -=-]

        )
    )

    define Step_5a as (
        ['e']
        R2 or (R1 not shortv)
        delete
    )

    define Step_5b as (
        ['l']
        R2 'l'
        delete
    )
)

define stem as (

    unset Y_found
    do ( ['y'] <-'Y' set Y_found)
    do repeat(goto (v ['y']) <-'Y' set Y_found)

    $p1 = limit
    $p2 = limit
    do(
        gopast v  gopast non-v  setmark p1
        gopast v  gopast non-v  setmark p2
    )

    backwards (
        do Step_1a
        do Step_1b
        do Step_1c
        do Step_2
        do Step_3
        do Step_4
        do Step_5a
        do Step_5b
    )

    do(Y_found  repeat(goto (['Y']) <-'y'))

)
</DL>
</PRE></FONT>
</TR>

</TABLE>
</BODY>
</HTML>


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/porter/voc.txt === (30328/30428 lines abridged)
a
aaron
aback
abaissiez
abandon
abandoned
abase
abash
abashed
abate
abated
abatement
abatements
abates
abattement
abbe
abbess
abbey
abbeys
abbominable
abbot
abbots
abbreviated
abdication
abduction
abed
abel
aberga
abergavenny
abet
abetting
abhominable
abhor
abhorr
abhorred
abhorrence
abhorring
abhors
abhorson
abide
abides
abilities
ability
abingdon
abject
abjectly
abjects
abjur
abjure
able

[-=- -=- -=- 30328 lines omitted -=- -=- -=-]

yorick
york
yorkists
yorks
yorkshire
you
young
younger
youngest
youngling
younglings
youngly
youngster
youngsters
younker
younkers
your
yours
yourself
yourselves
youth
youthful
youths
youtli
zanies
zany
zatusfy
zeal
zealous
zeals
zed
zee
zenanas
zenelophon
zenith
zephyrs
zermons
zero
zest
zettlement
zir
zo
zodiac
zodiacs
zoggy
zone
zounds
zuleikah
zwagger
zwieback