[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/italian - italianstem.c:1.1.2.1 output.txt:1.1.2.1 stem.h:1.1.2.1 stem.sbl:1.1.2.1 stemmer.html:1.1.2.1 voc.txt:1.1.2.1

Andreas Jung andreas@digicool.com
Wed, 13 Feb 2002 11:26:24 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/italian
In directory cvs.zope.org:/tmp/cvs-serv30556/PyStemmer/italian

Added Files:
      Tag: ajung-textindexng-branch
	italianstem.c output.txt stem.h stem.sbl stemmer.html voc.txt 
Log Message:
added PyStemmer


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/italian/italianstem.c === (580/680 lines abridged)

#include "header.h"

/* Public entry point; corresponds to `define stem` in stem.sbl. */
extern int italian_stem(struct SN_env * z);
/* Forward declarations of the file-local routines.  Each r_<name> corresponds
 * to a routine of the same name listed in the `routines ( ... )` section of
 * stem.sbl. */
static int r_vowel_suffix(struct SN_env * z);
static int r_verb_suffix(struct SN_env * z);
static int r_standard_suffix(struct SN_env * z);
static int r_attached_pronoun(struct SN_env * z);
static int r_R2(struct SN_env * z);
static int r_R1(struct SN_env * z);
static int r_RV(struct SN_env * z);
static int r_mark_regions(struct SN_env * z);
static int r_postlude(struct SN_env * z);
static int r_prelude(struct SN_env * z);

/* String table whose entries match the `among` of the prelude routine in
 * stem.sbl: the five acute-accented vowels (bytes A0, 82, A1, A2, A3) and "qu",
 * plus an empty-string entry.  The fourth field of each entry equals the
 * position (1..6) of that string in the stem.sbl list; the empty string
 * carries 7.
 * NOTE(review): presumably consumed by r_prelude (its body is not visible in
 * this excerpt), and the field layout is presumably { length, string, index
 * of a shorter entry or -1, result code, routine } -- confirm against the
 * definition of struct among in header.h. */
static struct among a_0[7] =
{
/*  0 */ { 0, (byte *)"", -1, 7, 0},
/*  1 */ { 2, (byte *)"qu", 0, 6, 0},
/*  2 */ { 1, (byte *)"\x82" "", 0, 2, 0},
/*  3 */ { 1, (byte *)"\xA0" "", 0, 1, 0},
/*  4 */ { 1, (byte *)"\xA1" "", 0, 3, 0},
/*  5 */ { 1, (byte *)"\xA2" "", 0, 4, 0},
/*  6 */ { 1, (byte *)"\xA3" "", 0, 5, 0}
};

/* String table whose entries match the `among` of the postlude routine in
 * stem.sbl: "I" (fourth field 1) and "U" (fourth field 2), plus an
 * empty-string entry carrying 3.
 * NOTE(review): presumably consumed by r_postlude (body not visible in this
 * excerpt); confirm the field layout against struct among in header.h. */
static struct among a_1[3] =
{
/*  0 */ { 0, (byte *)"", -1, 3, 0},
/*  1 */ { 1, (byte *)"I", 0, 1, 0},
/*  2 */ { 1, (byte *)"U", 0, 2, 0}
};

static struct among a_2[37] =
{
/*  0 */ { 2, (byte *)"la", -1, -1, 0},
/*  1 */ { 4, (byte *)"cela", 0, -1, 0},
/*  2 */ { 6, (byte *)"gliela", 0, -1, 0},
/*  3 */ { 4, (byte *)"mela", 0, -1, 0},
/*  4 */ { 4, (byte *)"tela", 0, -1, 0},
/*  5 */ { 4, (byte *)"vela", 0, -1, 0},
/*  6 */ { 2, (byte *)"le", -1, -1, 0},
/*  7 */ { 4, (byte *)"cele", 6, -1, 0},
/*  8 */ { 6, (byte *)"gliele", 6, -1, 0},
/*  9 */ { 4, (byte *)"mele", 6, -1, 0},
/* 10 */ { 4, (byte *)"tele", 6, -1, 0},
/* 11 */ { 4, (byte *)"vele", 6, -1, 0},
/* 12 */ { 2, (byte *)"ne", -1, -1, 0},
/* 13 */ { 4, (byte *)"cene", 12, -1, 0},
/* 14 */ { 6, (byte *)"gliene", 12, -1, 0},

[-=- -=- -=- 580 lines omitted -=- -=- -=-]

    return 1;
}

/*
 * Run the Italian stemmer stages on the environment z, in the order given by
 * `define stem` in stem.sbl:
 *
 *   prelude, mark_regions                      (scanning forwards)
 *   attached_pronoun,
 *   standard_suffix or else verb_suffix,
 *   vowel_suffix                               (scanning backwards)
 *   postlude                                   (scanning forwards)
 *
 * Every stage is optional: its return value never aborts the sequence, and
 * the cursor z->c is put back to where it was before the stage ran.  In the
 * backward section the cursor is remembered as a distance from z->l and
 * recomputed from z->l after the call.
 *
 * Always returns 1.
 */
extern int italian_stem(struct SN_env * z) {
    int fwd_pos;      /* cursor saved as an absolute position */
    int dist_to_end;  /* cursor saved as z->l - z->c */

    /* do prelude */
    fwd_pos = z->c;
    (void) r_prelude(z);
    z->c = fwd_pos;

    /* do mark_regions */
    fwd_pos = z->c;
    (void) r_mark_regions(z);
    z->c = fwd_pos;

    /* backwards: the current cursor becomes z->lb, the cursor moves to z->l */
    z->lb = z->c;
    z->c = z->l;

    /* do attached_pronoun */
    dist_to_end = z->l - z->c;
    (void) r_attached_pronoun(z);
    z->c = z->l - dist_to_end;

    /* do (standard_suffix or verb_suffix): the verb step is attempted only
     * when the standard step reports failure, after restoring the cursor */
    dist_to_end = z->l - z->c;
    if (!r_standard_suffix(z)) {
        z->c = z->l - dist_to_end;
        (void) r_verb_suffix(z);
    }
    z->c = z->l - dist_to_end;

    /* do vowel_suffix */
    dist_to_end = z->l - z->c;
    (void) r_vowel_suffix(z);
    z->c = z->l - dist_to_end;

    /* leave the backward section: cursor goes back to z->lb */
    z->c = z->lb;

    /* do postlude */
    fwd_pos = z->c;
    (void) r_postlude(z);
    z->c = fwd_pos;

    return 1;
}

/*
 * Return a new environment obtained from SN_create_env(0, 3, 0).
 * NOTE(review): the 3 presumably matches the three integers (pV, p1, p2)
 * declared in stem.sbl -- confirm against SN_create_env's parameter list.
 */
extern struct SN_env * italian_create_env(void)
{
    return SN_create_env(0, 3, 0);
}

/* Hand the environment z to SN_close_env. */
extern void italian_close_env(struct SN_env * z)
{
    SN_close_env(z);
}



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/italian/output.txt === (35394/35494 lines abridged)
a
à
aa
aalst
ab
abakoumov
aban
abat
abat
abbacin
abbacin
abbad
abbad
abbagliaron
abbagl
abbai
abbai
abbai
abbain
abbandon
abbandon
abbandon
abbandon
abbandon
abbandon
abbandon
abbandon
abbandon
abbandon
abbandon
abbandon
abbandon
abbandon
abbandon
abbandon
abbandon
abbandon
abband
abbandon
abbaruff
abbass
abbass
abbass
abbass
abbass
abbass
abbass
abbass
abbass
abbass

[-=- -=- -=- 35394 lines omitted -=- -=- -=-]

zimarr
zimarr
zimbell
zimmermann
zin
zincon
zinell
zingar
zing
zio
zippor
zironell
zitt
zitt
zitt
zitt
ziuganov
zo
zobr
zocal
zocc
zoccol
zohr
zol
zolfanell
zoll
zomb
zon
zon
zonz
zoppas
zopp
zoran
zoratt
zorr
zorz
zotic
zou
zucc
zucc
zuccher
zuccherin
zuccher
zucc
zuccon
zuff
zurig
zwe
zwerver
zwickel


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/italian/stem.h ===

/* Returns the environment produced by SN_create_env(0, 3, 0)
 * (see italianstem.c). */
extern struct SN_env * italian_create_env(void);
/* Passes z to SN_close_env (see italianstem.c). */
extern void italian_close_env(struct SN_env * z);

/* Runs the stemmer stages of stem.sbl's `define stem` on z; the
 * implementation in italianstem.c always returns 1. */
extern int italian_stem(struct SN_env * z);



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/italian/stem.sbl ===

// Routines defined further down in this script.
routines (
           prelude postlude mark_regions
           RV R1 R2
           attached_pronoun
           standard_suffix
           verb_suffix
           vowel_suffix
)

// The one routine exported to the calling program.
externals ( stem )

// Position marks set by mark_regions and tested by RV, R1 and R2.
integers ( pV p1 p2 )

// Character sets: v is defined below, AEIO and CG inside backwardmode.
groupings ( v AEIO CG )

// { and } delimit stringdef names inside string literals.
stringescapes {}

/* special characters: x' is x with an acute accent, x` is x with a grave
   accent.  The byte values below are the MS-DOS (code page 437/850) codes
   for these letters, not ISO Latin codes. */

stringdef a'   hex 'A0'
stringdef a`   hex '85'
stringdef e'   hex '82'
stringdef e`   hex '8A'
stringdef i'   hex 'A1'
stringdef i`   hex '8D'
stringdef o'   hex 'A2'
stringdef o`   hex '95'
stringdef u'   hex 'A3'
stringdef u`   hex '97'

// The vowel set: plain vowels plus the grave-accented ones.  Acute-accented
// vowels are not listed; prelude rewrites them to grave before v is used.
define v 'aeiou{a`}{e`}{i`}{o`}{u`}'

// Normalise the word before the regions are marked.
//  1. Scan the whole word (under `test`, so the cursor returns to where it
//     started): replace every acute-accented vowel by the grave-accented
//     one and rewrite 'qu' as 'qU'; where nothing matches, step one
//     character forward.
//  2. Wherever 'u' or 'i' stands between two members of v, replace it by
//     upper-case 'U' or 'I'.  The upper-case letters are not in v, so later
//     routines do not treat them as vowels; postlude lowers them again.
define prelude as (
    test repeat (
        [substring] among(
            '{a'}' (<- '{a`}')
            '{e'}' (<- '{e`}')
            '{i'}' (<- '{i`}')
            '{o'}' (<- '{o`}')
            '{u'}' (<- '{u`}')
            'qu'   (<- 'qU')
        ) or next
    )
    repeat goto (
        v [ ('u' ] v <- 'U') or
            ('i' ] v <- 'I')
    )
)

// Set the marks pV, p1 and p2.  All three start at the end of the word and
// keep that value when the patterns below cannot be matched.
//
// pV: if the word starts with a vowel, then either (second letter is a
//     non-vowel) pV is just past the next vowel, or (second letter is a
//     vowel) pV is just past the next non-vowel.  If the word starts with a
//     non-vowel, then either (second letter is a non-vowel) pV is just past
//     the next vowel, or (second letter is a vowel) pV is after the third
//     letter.
// p1: just past the first non-vowel that follows a vowel.
// p2: the same rule applied again starting from p1.
define mark_regions as (

    $pV = limit
    $p1 = limit
    $p2 = limit // defaults

    do (
        ( v (non-v gopast v) or (v gopast non-v) )
        or
        ( non-v (non-v gopast v) or (v next) )
        setmark pV
    )
    do (
        gopast v gopast non-v setmark p1
        gopast v gopast non-v setmark p2
    )
)

// Undo the marking done by prelude: scan the word, turning every 'I' back
// into 'i' and every 'U' back into 'u', stepping one character forward
// where neither matches.
define postlude as repeat (

    [substring] among(
        'I'  (<- 'i')
        'U'  (<- 'u')
    ) or next

)

backwardmode (

    // Region tests.  Each succeeds when the cursor is at or to the right of
    // the corresponding mark set by mark_regions.
    define RV as $pV <= cursor
    define R1 as $p1 <= cursor
    define R2 as $p2 <= cursor

    // Handle a pronoun attached to the end of a verb form.  The first among
    // brackets the longest matching pronoun as the current slice.  The second
    // among then looks at what precedes it, which must lie in RV:
    //   after 'ando' / 'endo'     the pronoun is deleted;
    //   after 'ar' / 'er' / 'ir'  the pronoun is replaced by 'e'.
    // If neither precedes the pronoun the routine fails and nothing changes.
    define attached_pronoun as (
        [substring] among(
            'ci' 'gli' 'la' 'le' 'li' 'lo'
            'mi' 'ne' 'si'  'ti' 'vi'
            // the compound forms are:
            'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene'
            'mela' 'mele' 'meli' 'melo' 'mene'
            'tela' 'tele' 'teli' 'telo' 'tene'
            'cela' 'cele' 'celi' 'celo' 'cene'
            'vela' 'vele' 'veli' 'velo' 'vene'
        )
        among( (RV)
            'ando' 'endo'   (delete)
            'ar' 'er' 'ir'  (<- 'e')
        )
    )

    // Find the longest suffix from the list below and apply the action that
    // follows its group.  The routine fails (leaving verb_suffix to be tried
    // by `stem`) when no suffix matches or the region test of the matched
    // group fails.
    //
    //   'anza' ... 'atrici'       delete if in R2
    //   'azione' ... 'atori'      delete if in R2, then also delete a
    //                             preceding 'ic' if that is in R2
    //   'logia' 'logie'           replace by 'log' if in R2
    //   'uzione' ... 'usioni'     replace by 'u' if in R2
    //   'enza' 'enze'             replace by 'ente' if in R2
    //   'amento' ... 'imenti'     delete if in RV
    //   'amente'                  delete if in R1, then delete a preceding
    //                             'iv', 'os', 'ic' or 'abil' if in R2, and
    //                             after 'iv' a further 'at' if in R2
    //   'it{a`}'                  delete if in R2, then delete a preceding
    //                             'abil', 'ic' or 'iv' if in R2
    //   'ivo' 'ivi' 'iva' 'ive'   delete if in R2, then delete a preceding
    //                             'at' if in R2 and after that 'ic' if in R2
    define standard_suffix as (
        [substring] among(

            'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo'
            'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
            'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
            'atrice' 'atrici'
               ( R2 delete )
            'azione' 'azioni' 'atore' 'atori'
               ( R2 delete
                 try ( ['ic'] R2 delete )
               )
            'logia' 'logie'
               ( R2 <- 'log' )
            'uzione' 'uzioni' 'usione' 'usioni'
               ( R2 <- 'u' )
            'enza' 'enze'
               ( R2 <- 'ente' )
            'amento' 'amenti' 'imento' 'imenti'
               ( RV delete )
            'amente' (
                R1 delete
                try (
                    [substring] R2 delete among(
                        'iv' ( ['at'] R2 delete )
                        'os' 'ic' 'abil'
                    )
                )
            )
            'it{a`}' (
                R2 delete
                try (
                    [substring] among(
                        'abil' 'ic' 'iv' (R2 delete)
                    )
                )
            )
            'ivo' 'ivi' 'iva' 'ive' (
                R2 delete
                try ( ['at'] R2 delete ['ic'] R2 delete )
            )
        )
    )

    // Delete the longest verb ending from the list below.  `setlimit tomark
    // pV` confines the search to the part of the word from pV onwards, so an
    // ending is only found if it lies wholly inside RV.
    define verb_suffix as setlimit tomark pV for (
        [substring] among(
            'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi'
            'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate'
            'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai'
            'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo'
            'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete'
            'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo'
            'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei'
            'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono'
            'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita'
            'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo'
            'ono' 'uta' 'ute' 'uti' 'uto'

            'ar' 'ir' // but 'er' is problematical
                (delete)
        )
    )

    // Character sets used by vowel_suffix: the final vowels that may be
    // dropped (note: no 'u'), and the consonants that may precede a final 'h'.
    define AEIO 'aeio{a`}{e`}{i`}{o`}'
    define CG 'cg'

    // Final tidy-up; both parts are optional.
    //  1. If the word ends in a letter from AEIO that lies in RV, delete it;
    //     if an 'i' in RV then ends the word, delete that as well.
    //  2. If the word ends in 'ch' or 'gh' lying in RV, delete the 'h'.
    define vowel_suffix as (
        try (
            [AEIO] RV delete
            ['i'] RV delete
        )
        try (
            ['h'] CG RV delete
        )
    )
)

// Entry point.  Every stage runs under `do`, so a stage that fails does not
// stop the ones after it.  The three suffix stages work backwards from the
// end of the word; verb_suffix is tried only when standard_suffix fails.
define stem as (
    do prelude
    do mark_regions
    backwards (
        do attached_pronoun
        do (standard_suffix or verb_suffix)
        do vowel_suffix
    )
    do postlude
)


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/italian/stemmer.html === (447/547 lines abridged)

<HTML>
<HEAD>
<TITLE>Italian stemming algorithm</TITLE></HEAD>
<BODY BGCOLOR=WHITE>
<TABLE WIDTH=75% ALIGN=CENTER COLS=1>
<H1 ALIGN=CENTER>Italian stemming algorithm</H1>

<TR><TD>
<BR>&nbsp;<H2>Links to resources</H2>

<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="stem.sbl">    The stemmer in Snowball</A>
<TR><TD><A HREF="stem.c">      The ANSI C stemmer</A>
<TR><TD><A HREF="stem.h">      - and its header</A>
<TR><TD><A HREF="voc.txt">     Sample Italian vocabulary (ISO Latin codings)</A>
<TR><TD><A HREF="output.txt">  Its stemmed equivalent</A>
<TR><TD><A HREF="diffs.txt">   Vocabulary + stemmed equivalent in pure ASCII</A>
<TR><TD><A HREF="tarball.tgz"> Tar-gzipped file of all of the above</A>
</TABLE></DL>

<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="../texts/romance.html">
                  Romance language stemmers</A>
</TABLE></DL>

</TR>

<TR><TD BGCOLOR="lightpink">

<BR><BR>

Here is a sample of Italian vocabulary, with the stemmed forms that will
be generated with this algorithm.

<BR><BR>



<DL><DD><TABLE CELLPADDING=0>
<TR><TD>  <B>word</B> </TD>
 <TD></TD><TD> </TD>
 <TD></TD><TD> <B>stem</B> </TD>
 <TD></TD><TD>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</TD>
 <TD></TD><TD> <B>word</B> </TD>
 <TD></TD><TD> </TD>
 <TD></TD><TD> <B>stem</B> </TD>
</TR>

<TR><TD>

[-=- -=- -=- 447 lines omitted -=- -=- -=-]

    )

    define verb_suffix as setlimit tomark pV for (
        [substring] among(
            'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi'
            'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate'
            'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai'
            'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo'
            'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete'
            'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo'
            'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei'
            'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono'
            'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita'
            'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo'
            'ono' 'uta' 'ute' 'uti' 'uto'

            'ar' 'ir' // but 'er' is problematical
                (delete)
        )
    )

    define AEIO 'aeio{a`}{e`}{i`}{o`}'
    define CG 'cg'

    define vowel_suffix as (
        try (
            [AEIO] RV delete
            ['i'] RV delete
        )
        try (
            ['h'] CG RV delete
        )
    )
)

define stem as (
    do prelude
    do mark_regions
    backwards (
        do attached_pronoun
        do (standard_suffix or verb_suffix)
        do vowel_suffix
    )
    do postlude
)
</DL>
</PRE></FONT>
</TABLE>
</BODY>
</HTML>


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/italian/voc.txt === (35394/35494 lines abridged)
a
à
aa
aalst
ab
abakoumova
abano
abate
abati
abbacinare
abbacinati
abbadia
abbado
abbagliaron
abbagliato
abbaia
abbaiano
abbaiar
abbaini
abbandona
abbandonando
abbandonano
abbandonar
abbandonare
abbandonarla
abbandonarlo
abbandonarsi
abbandonarvi
abbandonasse
abbandonata
abbandonate
abbandonati
abbandonato
abbandonava
abbandonerà
abbandoneranno
abbandonerò
abbandono
abbandonò
abbaruffato
abbassamento
abbassando
abbassandola
abbassandole
abbassar
abbassare
abbassarono
abbassarsi
abbassassero
abbassato

[-=- -=- -=- 35394 lines omitted -=- -=- -=-]

zimarra
zimarre
zimbello
zimmermann
zina
zincone
zinelli
zingari
zingerle
zio
zippora
zironelli
zitta
zitte
zitti
zitto
ziuganov
zo
zobra
zocalo
zocchi
zoccolo
zohra
zola
zolfanelli
zolla
zombi
zona
zone
zonzo
zoppas
zoppo
zoran
zoratto
zorro
zorzi
zotici
zou
zucca
zucche
zuccheri
zuccherino
zucchero
zucchi
zucconi
zuffi
zurigo
zweite
zwerver
zwickel