[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/spanish - output.txt:1.1.2.1 spanishstem.c:1.1.2.1 stem.h:1.1.2.1 stem.sbl:1.1.2.1 stemmer.html:1.1.2.1 voc.txt:1.1.2.1

Andreas Jung andreas@digicool.com
Wed, 13 Feb 2002 11:26:29 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/spanish
In directory cvs.zope.org:/tmp/cvs-serv30556/PyStemmer/spanish

Added Files:
      Tag: ajung-textindexng-branch
	output.txt spanishstem.c stem.h stem.sbl stemmer.html voc.txt 
Log Message:
added PyStemmer


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/spanish/output.txt === (28290/28390 lines abridged)
a
aaron
abac
abaj
abander
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abarat
abarc
abarc
abarc
abarc
abarc
abarc
abarc
abarc
abaro
abarroter
abarrot
abastec
abastecedor
abastec
abastec
abastec
abast
abatar
abat
abat
abat
abat
abat
abat
abat
abat
abba
abbud
abc
abdic
abdic
abeb

[-=- -=- -=- 28290 lines omitted -=- -=- -=-]

zanj
zapalinam
zapat
zapater
zapatill
zapatit
zapat
zapotec
zar
zaragoz
zarap
zarap
zarat
zarazu
zarc
zarp
ze
zedill
zeland
zeltun
zenith
zenon
zep
zeped
zertuch
zig
zimmerm
zins
zins
zon
zon
zoolog
zoolog
zorkin
zorrill
zorr
zotoluc
zotoluc
zotoluc
zuazu
zubillag
zubizarret
zuli
zu¤ig
zurc
zurd
zurd
zurd
zurit
zutan


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/spanish/spanishstem.c === (590/690 lines abridged)

#include "header.h"

/*
 * Forward declarations.
 *
 * spanish_stem is the public entry point. The ten static r_* functions
 * carry the same names as the entries of the `routines ( ... )` list in
 * the Snowball source (stem.sbl), each with an r_ prefix. All of them take
 * the stemmer environment z and return an int that spanish_stem tests with
 * `!` (zero is treated as "did not succeed").
 */
extern int spanish_stem(struct SN_env * z);
static int r_residual_suffix(struct SN_env * z);
static int r_verb_suffix(struct SN_env * z);
static int r_y_verb_suffix(struct SN_env * z);
static int r_standard_suffix(struct SN_env * z);
static int r_attached_pronoun(struct SN_env * z);
static int r_R2(struct SN_env * z);
static int r_R1(struct SN_env * z);
static int r_RV(struct SN_env * z);
static int r_mark_regions(struct SN_env * z);
static int r_postlude(struct SN_env * z);

/*
 * Lookup table with six entries: the empty string plus the single bytes
 * 0x82, 0xA0, 0xA1, 0xA2 and 0xA3. stem.sbl defines those bytes as e-acute,
 * a-acute, i-acute, o-acute and u-acute and replaces each with the plain
 * vowel in `postlude`; this table is presumably the compiled form of that
 * among(...) list. The code that consults the table is in the abridged
 * part of this file -- confirm there.
 *
 * NOTE(review): `struct among` is declared in header.h (not shown). Judging
 * only from the data, the fields look like: string length, string bytes,
 * index of another entry whose string is a suffix of this one (-1 for
 * none), a result/case number, and a fifth slot that is always 0 here.
 */
static struct among a_0[6] =
{
/*  0 */ { 0, (byte *)"", -1, 6, 0},
/*  1 */ { 1, (byte *)"\x82" "", 0, 2, 0},
/*  2 */ { 1, (byte *)"\xA0" "", 0, 1, 0},
/*  3 */ { 1, (byte *)"\xA1" "", 0, 3, 0},
/*  4 */ { 1, (byte *)"\xA2" "", 0, 4, 0},
/*  5 */ { 1, (byte *)"\xA3" "", 0, 5, 0}
};

/*
 * Lookup table with the thirteen pronoun endings that stem.sbl lists in the
 * first among(...) of `attached_pronoun` (la, sela, le, me, se, lo, selo,
 * las, selas, les, los, selos, nos).
 *
 * The third field of the "se..." entries holds the index of the shorter
 * entry that is a suffix of it (sela -> la, selo -> lo, selas -> las,
 * selos -> los); all other entries have -1 there. Every entry has -1 in the
 * fourth field and 0 in the fifth.
 *
 * NOTE(review): `struct among` is declared in header.h (not shown), so the
 * exact meaning of the fourth and fifth fields should be confirmed there.
 */
static struct among a_1[13] =
{
/*  0 */ { 2, (byte *)"la", -1, -1, 0},
/*  1 */ { 4, (byte *)"sela", 0, -1, 0},
/*  2 */ { 2, (byte *)"le", -1, -1, 0},
/*  3 */ { 2, (byte *)"me", -1, -1, 0},
/*  4 */ { 2, (byte *)"se", -1, -1, 0},
/*  5 */ { 2, (byte *)"lo", -1, -1, 0},
/*  6 */ { 4, (byte *)"selo", 5, -1, 0},
/*  7 */ { 3, (byte *)"las", -1, -1, 0},
/*  8 */ { 5, (byte *)"selas", 7, -1, 0},
/*  9 */ { 3, (byte *)"les", -1, -1, 0},
/* 10 */ { 3, (byte *)"los", -1, -1, 0},
/* 11 */ { 5, (byte *)"selos", 10, -1, 0},
/* 12 */ { 3, (byte *)"nos", -1, -1, 0}
};

static struct among a_2[11] =
{
/*  0 */ { 4, (byte *)"ando", -1, 6, 0},
/*  1 */ { 5, (byte *)"iendo", -1, 6, 0},
/*  2 */ { 5, (byte *)"yendo", -1, 7, 0},
/*  3 */ { 5, (byte *)"i\x82" "ndo", -1, 1, 0},
/*  4 */ { 4, (byte *)"\xA0" "ndo", -1, 2, 0},
/*  5 */ { 2, (byte *)"ar", -1, 6, 0},

[-=- -=- -=- 590 lines omitted -=- -=- -=-]

    }
    return 1;
}

/*
 * Stem the word held in the environment z.
 *
 * The steps follow `define stem` in stem.sbl:
 *   1. mark_regions, with the cursor restored afterwards;
 *   2. switch to backward processing (z->lb takes the current cursor,
 *      the cursor moves to z->l);
 *   3. attached_pronoun;
 *   4. the first of standard_suffix, y_verb_suffix, verb_suffix that
 *      succeeds;
 *   5. residual_suffix;
 *   6. back to forward processing, then postlude.
 *
 * The result of each step is deliberately ignored (Snowball `do`), so the
 * function always returns 1.
 *
 * While working backwards the cursor is remembered as a distance from z->l
 * rather than as an absolute position -- presumably so that it stays
 * meaningful if a routine changes z->l; confirm against the runtime.
 */
extern int spanish_stem(struct SN_env * z) {
    int saved_cursor;
    int outer_offset;
    int inner_offset;

    /* do mark_regions */
    saved_cursor = z->c;
    (void) r_mark_regions(z);
    z->c = saved_cursor;

    /* backwards */
    z->lb = z->c;
    z->c = z->l;

    /* do attached_pronoun */
    outer_offset = z->l - z->c;
    (void) r_attached_pronoun(z);
    z->c = z->l - outer_offset;

    /* do ( standard_suffix or y_verb_suffix or verb_suffix ) */
    outer_offset = z->l - z->c;
    inner_offset = outer_offset;
    if (!r_standard_suffix(z)) {
        /* first alternative failed: rewind and try the next one */
        z->c = z->l - inner_offset;
        if (!r_y_verb_suffix(z)) {
            z->c = z->l - inner_offset;
            (void) r_verb_suffix(z);
        }
    }
    z->c = z->l - outer_offset;

    /* do residual_suffix */
    outer_offset = z->l - z->c;
    (void) r_residual_suffix(z);
    z->c = z->l - outer_offset;

    /* end of the backwards section: cursor returns to the backward limit */
    z->c = z->lb;

    /* do postlude */
    saved_cursor = z->c;
    (void) r_postlude(z);
    z->c = saved_cursor;

    return 1;
}

/*
 * Create a stemmer environment for the Spanish stemmer by delegating to
 * SN_create_env(0, 3, 0) and returning its result unchanged.
 * NOTE(review): the 3 presumably corresponds to the three integers
 * (pV p1 p2) declared in stem.sbl -- confirm against SN_create_env.
 */
extern struct SN_env * spanish_create_env(void)
{
    return SN_create_env(0, 3, 0);
}

/*
 * Dispose of an environment obtained from spanish_create_env by handing it
 * to SN_close_env.
 */
extern void spanish_close_env(struct SN_env * z)
{
    SN_close_env(z);
}



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/spanish/stem.h ===

/* Public interface of the Spanish stemmer (implemented in spanishstem.c). */

/* Returns the environment produced by SN_create_env(0, 3, 0). */
extern struct SN_env * spanish_create_env(void);
/* Passes z to SN_close_env. */
extern void spanish_close_env(struct SN_env * z);

/* Runs the stemming steps on the environment z; always returns 1. */
extern int spanish_stem(struct SN_env * z);



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/spanish/stem.sbl ===
// Spanish stemmer in Snowball.

// Every routine defined further down in this file.
routines (
           postlude mark_regions
           RV R1 R2
           attached_pronoun
           standard_suffix
           y_verb_suffix
           verb_suffix
           residual_suffix
)

// The one routine visible to callers (see `define stem` at the end).
externals ( stem )

// Region marks: set by mark_regions, tested by RV, R1 and R2.
integers ( pV p1 p2 )

// v is the vowel grouping, defined after the string definitions.
groupings ( v )

// Inside string literals, {name} inserts the string defined under that name.
stringescapes {}

/* special characters -- the byte values below are the MS-DOS code page
   437/850 codes for these letters, not ISO Latin-1 (where 0xA0 is a
   no-break space and a-acute is 0xE1) */

stringdef a'   hex 'A0'  // a-acute
stringdef e'   hex '82'  // e-acute
stringdef i'   hex 'A1'  // i-acute
stringdef o'   hex 'A2'  // o-acute
stringdef u'   hex 'A3'  // u-acute
stringdef u"   hex '81'  // u-diaeresis
stringdef n~   hex 'A4'  // n-tilde

// Vowels: the five plain vowels, their acute forms and u-diaeresis.
// n-tilde is defined above but is not part of this grouping.
define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}'

// Sets the three marks pV, p1 and p2 used by RV, R1 and R2.
// All three start at `limit`, so they keep that value whenever the
// corresponding search below fails.
define mark_regions as (

    $pV = limit
    $p1 = limit
    $p2 = limit  // defaults

    // pV depends on the first two letters:
    //   vowel, non-vowel     -> position after the next vowel
    //   vowel, vowel         -> position after the next non-vowel
    //   non-vowel, non-vowel -> position after the next vowel
    //   non-vowel, vowel     -> one character further on
    do (
        ( v (non-v gopast v) or (v gopast non-v) )
        or
        ( non-v (non-v gopast v) or (v next) )
        setmark pV
    )
    // p1: after the first non-vowel that follows a vowel.
    // p2: the same search repeated, starting from p1.
    do (
        gopast v gopast non-v setmark p1
        gopast v gopast non-v setmark p2
    )
)

// Removes acute accents: walking through the word, each acute-accented
// vowel is replaced by the plain vowel; any other character is stepped
// over with `next`. u-diaeresis is left unchanged (see the note inside).
define postlude as repeat (
    [substring] among(
        '{a'}' (<- 'a')
        '{e'}' (<- 'e')
        '{i'}' (<- 'i')
        '{o'}' (<- 'o')
        '{u'}' (<- 'u')
        // and possibly {u"}->u here, or in prelude
    ) or next
)

backwardmode (

    // Region tests: each one succeeds when its mark (set by mark_regions)
    // is less than or equal to the current cursor.
    define RV as $pV <= cursor
    define R1 as $p1 <= cursor
    define R2 as $p2 <= cursor

    // Handles a pronoun attached to the end of a gerund or infinitive.
    // The first among brackets the pronoun as the slice. The second among
    // looks at the verb ending in front of it, which must satisfy RV:
    //   - accented endings: `]` re-marks the slice end after the ending has
    //     been matched, and the slice is replaced by the unaccented ending
    //     (NOTE(review): this appears to swallow pronoun and ending
    //     together -- confirm against the Snowball manual);
    //   - plain endings: the slice (the pronoun) is deleted;
    //   - 'yendo': the slice is deleted only if 'u' comes before it.
    define attached_pronoun as (
        [substring] among(
            'me' 'se'  'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo'
            'las' 'les' 'los' 'nos'
        )
        substring RV among(
            'i{e'}ndo' (] <- 'iendo')
            '{a'}ndo'  (] <- 'ando')
            '{a'}r'    (] <- 'ar')
            '{e'}r'    (] <- 'er')
            '{i'}r'    (] <- 'ir')
            'ando'
            'iendo'
            'ar' 'er' 'ir'
                       (delete)
            'yendo'    ('u' delete)
        )
    )

    // Removes or rewrites a standard (mostly noun/adjective/adverb) suffix.
    // The matched suffix is bracketed as the slice; the parenthesised
    // action after each group of strings applies to that whole group.
    // Almost every action first requires R2 (only 'amente' requires R1),
    // then deletes the slice or replaces it with a shorter string. Several
    // groups then try to strip one more ending that is exposed by the
    // deletion; that second step is wrapped in `try`, so its failure does
    // not undo the first one.
    define standard_suffix as (
        [substring] among(

            'anza' 'anzas'
            'ico' 'ica' 'icos' 'icas'
            'ismo' 'ismos'
            'able' 'ables'
            'ible' 'ibles'
            'ista' 'istas'
            'oso' 'osa' 'osos' 'osas'
            'amiento' 'amientos'
            'imiento' 'imientos'
            (
                R2 delete
            )
            'adora' 'ador' 'aci{o'}n'
            'adoras' 'adores' 'aciones'
            (
                R2 delete
                try ( ['ic'] R2 delete )
            )
            'log{i'}a'
            'log{i'}as'
            (
                R2 <- 'log'
            )
            'uci{o'}n' 'uciones'
            (
                R2 <- 'u'
            )
            'encia' 'encias'
            (
                R2 <- 'ente'
            )
            'amente'
            (
                R1 delete
                try (
                    [substring] R2 delete among(
                        'iv' (['at'] R2 delete)
                        'os'
                        'ic'
                        'ad'
                    )
                )
            )
            'mente'
            (
                R2 delete
                try (
                    [substring] among(
                        'able'
                        'ible' (R2 delete)
                    )
                )
            )
            'idad'
            'idades'
            (
                R2 delete
                try (
                    [substring] among(
                        'abil'
                        'ic'
                        'iv'   (R2 delete)
                    )
                )
            )
            'iva' 'ivo'
            'ivas' 'ivos'
            (
                R2 delete
                try (
                    ['at'] R2 delete // but not a further   ['ic'] R2 delete
                )
            )
        )
    )

    // Verb endings that begin with y. The search for the ending is limited
    // to the part of the word from mark pV onwards (setlimit tomark pV);
    // the ending is deleted only when the letter in front of it is 'u'.
    define y_verb_suffix as (
        setlimit tomark pV for ([substring]) among(
            'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}'
            'yas' 'yes' 'yais' 'yamos'
                ('u' delete)
        )
    )

    // Remaining verb endings. As in y_verb_suffix, the search is limited to
    // the part of the word from mark pV onwards.
    //   - 'en' 'es' '{e'}is' 'emos': if the ending follows 'gu', the 'u' is
    //     taken into the slice as well (the `]` after the try re-marks the
    //     slice end), then the slice is deleted;
    //   - every other ending listed is simply deleted.
    define verb_suffix as (
        setlimit tomark pV for ([substring]) among(

            'en' 'es' '{e'}is' 'emos'
                (try ('u' test 'g') ] delete)

            'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
            'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}'
            'ar{e'}'
            'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
            'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
            'er{e'}'
            'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
            'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
            'ir{e'}'

            'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed'
            'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
            'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
            'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as'
            'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
            'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
            'ierais'  'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
            'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos'
            '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
                (delete)
        )
    )

    // Final clean-up of a trailing vowel ending.
    //   - 'os' and the a/o/accented endings: deleted if RV holds;
    //   - 'e' and e-acute: deleted if RV holds; afterwards a preceding 'u'
    //     is deleted too when it follows 'g' and RV still holds.
    define residual_suffix as (
        [substring] among(
            'os'
            'a' 'o' '{a'}' '{i'}' '{o'}'
                ( RV delete )
            'e' '{e'}'
                ( RV delete try( ['u'] test 'g' RV delete ) )
        )
    )
)

// Entry point (the only name listed under `externals`).
// Each step is wrapped in `do`, so a step that fails does not stop the ones
// after it. The suffix steps run inside `backwards`; of the three
// alternatives joined by `or`, later ones are tried only if the earlier
// ones fail.
define stem as (
    do mark_regions
    backwards (
        do attached_pronoun
        do ( standard_suffix or
             y_verb_suffix or
             verb_suffix
           )
        do residual_suffix
    )
    do postlude
)


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/spanish/stemmer.html === (514/614 lines abridged)

<HTML>
<HEAD>
<TITLE>Spanish stemming algorithm</TITLE></HEAD>
<BODY BGCOLOR=WHITE>
<TABLE WIDTH=75% ALIGN=CENTER COLS=1>
<H1 ALIGN=CENTER>Spanish stemming algorithm</H1>

<TR><TD>
<BR>&nbsp;<H2>Links to resources</H2>

<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="stem.sbl">    The stemmer in Snowball</A>
<TR><TD><A HREF="stem.c">      The ANSI C stemmer</A>
<TR><TD><A HREF="stem.h">      - and its header</A>
<TR><TD><A HREF="voc.txt">     Sample Spanish vocabulary (ISO Latin codings)</A>
<TR><TD><A HREF="output.txt">  Its stemmed equivalent</A>
<TR><TD><A HREF="diffs.txt">   Vocabulary + stemmed equivalent in pure ASCII</A>
<TR><TD><A HREF="tarball.tgz"> Tar-gzipped file of all of the above</A>
</TABLE></DL>

<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="../texts/romance.html">
                  Romance language stemmers</A>
</TABLE></DL>

</TR>

<TR><TD BGCOLOR="lightpink">

<BR><BR>

Here is a sample of Spanish vocabulary, with the stemmed forms that will
be generated with this algorithm.

<BR><BR>



<DL><DD><TABLE CELLPADDING=0>
<TR><TD>  <B>word</B> </TD>
 <TD></TD><TD> </TD>
 <TD></TD><TD> <B>stem</B> </TD>
 <TD></TD><TD>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</TD>
 <TD></TD><TD> <B>word</B> </TD>
 <TD></TD><TD> </TD>
 <TD></TD><TD> <B>stem</B> </TD>
</TR>

<TR><TD>

[-=- -=- -=- 514 lines omitted -=- -=- -=-]

            'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}'
            'ar{e'}'
            'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
            'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
            'er{e'}'
            'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
            'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
            'ir{e'}'

            'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed'
            'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
            'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
            'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as'
            'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
            'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
            'ierais'  'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
            'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos'
            '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
                (delete)
        )
    )

    define residual_suffix as (
        [substring] among(
            'os'
            'a' 'o' '{a'}' '{i'}' '{o'}'
                ( RV delete )
            'e' '{e'}'
                ( RV delete try( ['u'] test 'g' RV delete ) )
        )
    )
)

define stem as (
    do mark_regions
    backwards (
        do attached_pronoun
        do ( standard_suffix or
             y_verb_suffix or
             verb_suffix
           )
        do residual_suffix
    )
    do postlude
)
</DL>
</PRE></FONT>
</TABLE>
</BODY>
</HTML>


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/spanish/voc.txt === (28290/28390 lines abridged)
a
aar˘n
abaco
abajo
abandera
abandona
abandonada
abandonadas
abandonado
abandonados
abandonamos
abandonan
abandonar
abandonarlo
abandonaron
abandono
abandon˘
abaratar
abarca
abarcamos
abarcan
abarcar
abarcar 
abarcar n
abarcarˇa
abarc˘
abaroa
abarroteros
abarrot˘
abastece
abastecedora
abastecer
abastecimiento
abastecimientos
abasto
abatares
abatida
abatido
abatidos
abatimiento
abati˘
abatir
abatirnos
abatirse
abba
abbud
abc
abdicaci˘n
abdicar
abeba

[-=- -=- -=- 28290 lines omitted -=- -=- -=-]

zanjas
zapalinam‚
zapata
zapatera
zapatilla
zapatitos
zapatos
zapoteco
zar
zaragoza
zarape
zarapeados
z rate
zarazŁa
zarco
zarp˘
ze
zedillo
zelanda
zeltŁn
zenith
zen˘n
zepa
zepeda
zertuche
zig
zimmerman
zinser
zˇnser
zona
zonas
zool˘gico
zool˘gicos
zorkin
zorrilla
zorros
zotoluca
zotoluco
zotolucos
zuazua
zubillaga
zubizarreta
zulia
zŁ¤iga
zurcos
zurda
zurdo
zurdos
zurita
zutano