[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese - output.txt:1.1.2.1 portuguesestem.c:1.1.2.1 stem.h:1.1.2.1 stem.sbl:1.1.2.1 stemmer.html:1.1.2.1 voc.txt:1.1.2.1
Andreas Jung
andreas@digicool.com
Wed, 13 Feb 2002 11:26:26 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese
In directory cvs.zope.org:/tmp/cvs-serv30556/PyStemmer/portuguese
Added Files:
Tag: ajung-textindexng-branch
output.txt portuguesestem.c stem.h stem.sbl stemmer.html
voc.txt
Log Message:
added PyStemmer
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese/output.txt === (31916/32016 lines abridged)
a
Æ
aach
aacut
abacax
abad
abaet
abaf
abaf
abaix
abaix
abaix
abaix
abaix
abal
abal
abal
abal
abal
abalro
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abarrot
abarrot
abarrot
abast
abast
abast
abastec
abastec
abastec
abastec
abat
abat
abatedour
abat
[-=- -=- -=- 31916 lines omitted -=- -=- -=-]
zapping
zar
zaragoz
zarin
zaz
z
zebr
zebr
zebu
zec
zed
zeferin
zehnd
zelador
zelnd
zel
zen
zenild
zenild
zentel
zepellin
zequinh
zer
zerinh
zer
zer
zez
zhiling
zic
zilberman
zimb bu
zinc
zinh
zĄp
zirald
zit
zoar
zodĄac
zol
zoli
zon
zon
zoneament
zonz
zoobor
zooląg
zoomp
zul
zumb
zumb
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese/portuguesestem.c === (553/653 lines abridged)
#include "header.h"
/* Exported stemmer entry point; it works on the environment z. */
extern int portuguese_stem(struct SN_env * z);
/* File-local routines. Each r_<name> carries the name of a routine listed in
 * the `routines ( ... )` declaration of the accompanying stem.sbl; all of
 * them take the environment z and return an int that callers test with `!`. */
static int r_residual_form(struct SN_env * z);
static int r_residual_suffix(struct SN_env * z);
static int r_verb_suffix(struct SN_env * z);
static int r_standard_suffix(struct SN_env * z);
static int r_R2(struct SN_env * z);
static int r_R1(struct SN_env * z);
static int r_RV(struct SN_env * z);
static int r_mark_regions(struct SN_env * z);
static int r_postlude(struct SN_env * z);
static int r_prelude(struct SN_env * z);
/* Among table holding the empty string plus the single bytes 0xC6 and 0xE4,
 * which stem.sbl defines as a-tilde and o-tilde. The strings match the
 * among() of the Snowball `prelude` routine.
 * NOTE(review): struct among is declared in header.h (not shown here). The
 * first field equals the byte length of the string; the remaining fields are
 * presumably the index of a shorter entry that is also a match (-1 = none),
 * the result code, and an optional routine pointer -- confirm in header.h. */
static struct among a_0[3] =
{
/* 0 */ { 0, (byte *)"", -1, 3, 0},
/* 1 */ { 1, (byte *)"\xC6" "", 0, 1, 0},
/* 2 */ { 1, (byte *)"\xE4" "", 0, 2, 0}
};
/* Among table holding the empty string plus the two-character ASCII forms
 * "a~" and "o~". The strings match the among() of the Snowball `postlude`
 * routine, i.e. the reverse direction of table a_0.
 * NOTE(review): field meanings are defined by struct among in header.h
 * (not shown); the first field equals the string length. */
static struct among a_1[3] =
{
/* 0 */ { 0, (byte *)"", -1, 3, 0},
/* 1 */ { 2, (byte *)"a~", 0, 1, 0},
/* 2 */ { 2, (byte *)"o~", 0, 2, 0}
};
/* Among table with the strings "ic", "ad", "os" and "iv". These are the same
 * strings as the inner among() under 'amente' in stem.sbl's standard_suffix,
 * where only 'iv' has an attached action.
 * NOTE(review): only the "iv" entry has a fourth field of 1 (the others are
 * -1); presumably -1 means "no action" -- confirm against header.h. */
static struct among a_2[4] =
{
/* 0 */ { 2, (byte *)"ic", -1, -1, 0},
/* 1 */ { 2, (byte *)"ad", -1, -1, 0},
/* 2 */ { 2, (byte *)"os", -1, -1, 0},
/* 3 */ { 2, (byte *)"iv", -1, 1, 0}
};
/* Among table with "avel" and 0xA1 followed by "vel" (0xA1 is i-acute in
 * stem.sbl). These are the same strings as the inner among() under 'mente'
 * in stem.sbl's standard_suffix; both entries share the fourth-field value 1.
 * NOTE(review): field meanings are defined by struct among in header.h
 * (not shown). */
static struct among a_3[2] =
{
/* 0 */ { 4, (byte *)"avel", -1, 1, 0},
/* 1 */ { 4, (byte *)"\xA1" "vel", -1, 1, 0}
};
/* Among table with "ic", "abil" and "iv". These are the same strings as the
 * inner among() under 'idade'/'idades' in stem.sbl's standard_suffix; all
 * three entries share the fourth-field value 1.
 * NOTE(review): field meanings are defined by struct among in header.h
 * (not shown). */
static struct among a_4[3] =
{
/* 0 */ { 2, (byte *)"ic", -1, 1, 0},
/* 1 */ { 4, (byte *)"abil", -1, 1, 0},
/* 2 */ { 2, (byte *)"iv", -1, 1, 0}
};
[-=- -=- -=- 553 lines omitted -=- -=- -=-]
{ int m = z->l - z->c; /* do, line 203 */
{ int m = z->l - z->c; /* or, line 207 */
{ int m = z->l - z->c; /* or, line 204 */
if (!r_standard_suffix(z)) goto lab6; /* call standard_suffix, line 204 */
goto lab5;
lab6:
z->c = z->l - m;
if (!r_verb_suffix(z)) goto lab4; /* call verb_suffix, line 204 */
}
lab5:
{ int m = z->l - z->c; /* do, line 205 */
z->ket = z->c; /* [, line 205 */
if (!(eq_s_b(z, 1, "i"))) goto lab7;
z->bra = z->c; /* ], line 205 */
{ int m_test = z->l - z->c; /* test, line 205 */
if (!(eq_s_b(z, 1, "c"))) goto lab7;
z->c = z->l - m_test;
}
if (!r_RV(z)) goto lab7; /* call RV, line 205 */
slice_del(z); /* delete, line 205 */
lab7:
z->c = z->l - m;
}
goto lab3;
lab4:
z->c = z->l - m;
if (!r_residual_suffix(z)) goto lab2; /* call residual_suffix, line 207 */
}
lab3:
lab2:
z->c = z->l - m;
}
{ int m = z->l - z->c; /* do, line 209 */
if (!r_residual_form(z)) goto lab8; /* call residual_form, line 209 */
lab8:
z->c = z->l - m;
}
z->c = z->lb; { int c = z->c; /* do, line 211 */
if (!r_postlude(z)) goto lab9; /* call postlude, line 211 */
lab9:
z->c = c;
}
return 1;
}
/*
 * Create an environment for the Portuguese stemmer.
 *
 * Forwards to SN_create_env(0, 3, 0) and hands the result back unchanged;
 * no NULL check is performed here.
 * NOTE(review): the 3 presumably reserves the three integers (pV, p1, p2)
 * declared in stem.sbl -- confirm against SN_create_env's parameters.
 */
extern struct SN_env * portuguese_create_env(void)
{
    struct SN_env * env = SN_create_env(0, 3, 0);

    return env;
}
/*
 * Dispose of a stemmer environment by passing z straight to SN_close_env.
 */
extern void portuguese_close_env(struct SN_env * z)
{
    SN_close_env(z);
}
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese/stem.h ===
/* Returns the environment obtained from SN_create_env(0, 3, 0). */
extern struct SN_env * portuguese_create_env(void);
/* Hands z to SN_close_env. */
extern void portuguese_close_env(struct SN_env * z);
/* Runs the Portuguese stemmer on the environment z.
 * NOTE(review): the visible end of the implementation returns 1; whether
 * other return values are possible cannot be told from the abridged source. */
extern int portuguese_stem(struct SN_env * z);
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese/stem.sbl ===
// Names of the routines defined further down in this file.
routines (
prelude postlude mark_regions
RV R1 R2
standard_suffix
verb_suffix
residual_suffix
residual_form
)
// The only routine exported to callers.
externals ( stem )
// Marks set by mark_regions and tested by RV, R1 and R2.
integers ( pV p1 p2 )
// v is the vowel grouping defined below.
groupings ( v )
// Inside string literals, {name} refers to a stringdef.
stringescapes {}
/* special characters (in ISO Latin) */
// NOTE(review): the byte values below (e.g. A0 for a-acute, 87 for
// c-cedilla, C6 for a-tilde) are the MS-DOS code page 850 positions of these
// letters, not their ISO 8859-1 positions (E1, E7, E3, ...). Confirm which
// encoding callers are expected to supply.
stringdef a' hex 'A0' // a-acute
stringdef a^ hex '83' // a-circumflex e.g. 'bota^nico'
stringdef e' hex '82' // e-acute
stringdef e^ hex '88' // e-circumflex
stringdef i' hex 'A1' // i-acute
stringdef o^ hex '93' // o-circumflex
stringdef o' hex 'A2' // o-acute
stringdef u' hex 'A3' // u-acute
stringdef c, hex '87' // c-cedilla
stringdef a~ hex 'C6' // a-tilde
stringdef o~ hex 'E4' // o-tilde
// Vowel grouping: the five plain vowels plus the acute and circumflex forms
// defined above. The tilde forms {a~} and {o~} are not included.
define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}'
// Walks the word: wherever the single character {a~} or {o~} occurs it is
// replaced by the two-character form 'a~' / 'o~'; otherwise the cursor
// advances by one character. postlude performs the reverse substitution.
define prelude as repeat (
[substring] among(
'{a~}' (<- 'a~')
'{o~}' (<- 'o~')
) or next
)
// Sets the marks pV, p1 and p2 that RV, R1 and R2 test against. All three
// default to `limit`, so a mark that cannot be placed stays at that value.
define mark_regions as (
$pV = limit
$p1 = limit
$p2 = limit // defaults
// pV: depends on whether the first two characters are vowels or non-vowels.
do (
// first character is a vowel: a non-vowel next -> go past the following
// vowel; a vowel next -> go past the following non-vowel
( v (non-v gopast v) or (v gopast non-v) )
or
// first character is a non-vowel: a non-vowel next -> go past the
// following vowel; a vowel next -> step over one more character
( non-v (non-v gopast v) or (v next) )
setmark pV
)
// p1: just past the first non-vowel that follows a vowel;
// p2: the same search repeated starting from p1.
do (
gopast v gopast non-v setmark p1
gopast v gopast non-v setmark p2
)
)
// Reverse of prelude: wherever the two-character form 'a~' or 'o~' occurs it
// is replaced by the single character {a~} / {o~}; otherwise the cursor
// advances by one character.
define postlude as repeat (
[substring] among(
'a~' (<- '{a~}')
'o~' (<- '{o~}')
) or next
)
backwardmode (
// Region tests used in backward mode: each succeeds when the cursor is at or
// beyond the corresponding mark set by mark_regions.
define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
// Matches one of the listed suffixes at the end of the word and applies the
// action of its group. Each group's action is guarded by a region test
// (R2, R1 or RV), so the routine fails when the matched suffix does not lie
// in the required region.
define standard_suffix as (
[substring] among(
// plain deletion when the suffix is in R2
'eza' 'ezas'
'ico' 'ica' 'icos' 'icas'
'ismo' 'ismos'
'{a'}vel'
'{i'}vel'
'ista' 'istas'
'oso' 'osa' 'osos' 'osas'
'amento' 'amentos'
'imento' 'imentos'
'adora' 'ador' 'a{c,}a~o'
'adoras' 'adores' 'a{c,}o~es' // no -ic test
(
R2 delete
)
// replaced by 'log' when in R2
'log{i'}a'
'log{i'}as'
(
R2 <- 'log'
)
// replaced by 'u' when in R2
'uci{o'}n' 'uciones'
(
R2 <- 'u'
)
// replaced by 'ente' when in R2
'{e^}ncia' '{e^}ncias'
(
R2 <- 'ente'
)
// deleted when in R1; then optionally a preceding iv/os/ic/ad in R2 is
// deleted, and after 'iv' a further 'at' in R2
'amente'
(
R1 delete
try (
[substring] R2 delete among(
'iv' (['at'] R2 delete)
'os'
'ic'
'ad'
)
)
)
// deleted when in R2; then optionally a preceding avel / {i'}vel in R2
'mente'
(
R2 delete
try (
[substring] among(
'avel'
'{i'}vel' (R2 delete)
)
)
)
// deleted when in R2; then optionally a preceding abil/ic/iv in R2
'idade'
'idades'
(
R2 delete
try (
[substring] among(
'abil'
'ic'
'iv' (R2 delete)
)
)
)
// deleted when in R2; then optionally a preceding 'at' in R2
'iva' 'ivo'
'ivas' 'ivos'
(
R2 delete
try (
['at'] R2 delete // but not a further ['ic'] R2 delete
)
)
// replaced by 'ir' when in RV and preceded by 'e'
'ira' 'iras'
(
RV 'e' // -eira -eiras usually non-verbal
<- 'ir'
)
)
)
// Deletes one of the listed verb endings. The match is performed with the
// limit moved to the pV mark (setlimit tomark pV), so the search for the
// ending cannot go beyond that mark.
define verb_suffix as setlimit tomark pV for (
[substring] among(
'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}'
'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste'
'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam'
'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem'
'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o'
'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias'
'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras'
'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres'
'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is'
'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
'{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
'{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
'{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
'{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'
'ira' 'iras'
(delete)
)
)
// Deletes a final 'os' or a final a/i/o (plain or acute) when it lies in RV.
// In `stem` this is tried only when neither standard_suffix nor verb_suffix
// succeeded.
define residual_suffix as (
[substring] among(
'os'
'a' 'i' 'o' '{a'}' '{i'}' '{o'}'
( RV delete )
)
)
// Final clean-up of the word ending:
//  - a final e / e-acute / e-circumflex in RV is deleted; afterwards a 'u'
//    preceded by 'g', or an 'i' preceded by 'c', is also deleted if in RV;
//  - a final c-cedilla is replaced by plain 'c'.
define residual_form as (
[substring] among(
'e' '{e'}' '{e^}'
( RV delete [('u'] test 'g') or
('i'] test 'c') RV delete )
'{c,}' (<-'c')
)
)
)
// Exported entry point. Runs prelude and mark_regions forwards, then works
// backwards from the end of the word: if standard_suffix or, failing that,
// verb_suffix succeeds, a final 'i' preceded by 'c' is deleted when in RV;
// if neither succeeds, residual_suffix is tried instead. residual_form and
// finally postlude are then attempted. Every step is wrapped in `do`, so a
// failing step does not make the routine as a whole fail.
define stem as (
do prelude
do mark_regions
backwards (
do (
( standard_suffix or verb_suffix
do ( ['i'] test 'c' RV delete )
)
or residual_suffix
)
do residual_form
)
do postlude
)
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese/stemmer.html === (478/578 lines abridged)
<HTML>
<HEAD>
<TITLE>Portuguese stemming algorithm</TITLE></HEAD>
<BODY BGCOLOR=WHITE>
<TABLE WIDTH=75% ALIGN=CENTER COLS=1>
<H1 ALIGN=CENTER>Portuguese stemming algorithm</H1>
<TR><TD>
<BR> <H2>Links to resources</H2>
<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="stem.sbl"> The stemmer in Snowball</A>
<TR><TD><A HREF="stem.c"> The ANSI C stemmer</A>
<TR><TD><A HREF="stem.h"> - and its header</A>
<TR><TD><A HREF="voc.txt"> Sample Portuguese vocabulary (ISO Latin codings)</A>
<TR><TD><A HREF="output.txt"> Its stemmed equivalent</A>
<TR><TD><A HREF="diffs.txt"> Vocabulary + stemmed equivalent in pure ASCII</A>
<TR><TD><A HREF="tarball.tgz"> Tar-gzipped file of all of the above</A>
</TABLE></DL>
<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="../texts/romance.html">
Romance language stemmers</A>
</TABLE></DL>
</TR>
<TR><TD BGCOLOR="lightpink">
<BR><BR>
Here is a sample of Portuguese vocabulary, with the stemmed forms that will
be generated with this algorithm.
<BR><BR>
<DL><DD><TABLE CELLPADDING=0>
<TR><TD> <B>word</B> </TD>
<TD></TD><TD> </TD>
<TD></TD><TD> <B>stem</B> </TD>
<TD></TD><TD> </TD>
<TD></TD><TD> <B>word</B> </TD>
<TD></TD><TD> </TD>
<TD></TD><TD> <B>stem</B> </TD>
</TR>
<TR><TD>
[-=- -=- -=- 478 lines omitted -=- -=- -=-]
'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
'{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
'{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
'{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
'{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'
'ira' 'iras'
(delete)
)
)
define residual_suffix as (
[substring] among(
'os'
'a' 'i' 'o' '{a'}' '{i'}' '{o'}'
( RV delete )
)
)
define residual_form as (
[substring] among(
'e' '{e'}' '{e^}'
( RV delete [('u'] test 'g') or
('i'] test 'c') RV delete )
'{c,}' (<-'c')
)
)
)
define stem as (
do prelude
do mark_regions
backwards (
do (
( standard_suffix or verb_suffix
do ( ['i'] test 'c' RV delete )
)
or residual_suffix
)
do residual_form
)
do postlude
)
</DL>
</PRE></FONT>
</TABLE>
</BODY>
</HTML>
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese/voc.txt === (31916/32016 lines abridged)
a
Æ
aacho
aacute
abacaxi
abade
abaet
abafar
abafaram
abaixa
abaixe
abaixei
abaixo
abaixou
abalada
abalado
abalaram
abalos
abalou
abalroado
abandona
abandon
abandonada
abandonadas
abandonado
abandonados
abandonam
abandonando
abandonar
abandonara
abandonaram
abandonasse
abandono
abandonou
abarrotado
abarrotados
abarrotou
abastada
abastado
abastados
abastecem
abastecer
abastecida
abastecimento
abata
abate
abatedouro
abatem
[-=- -=- -=- 31916 lines omitted -=- -=- -=-]
zapping
zara
zaragoza
zarin
zaz
z
zebra
zebras
zebu
zeca
zedias
zeferina
zehnder
zelador
zelndia
zelar
zen
zenilda
zenildo
zentel
zepellin
zequinha
zerados
zerinho
zero
zerou
zez
zhiling
zico
zilberman
zimb bue
zinco
zinhos
zĄper
ziraldo
zita
zoar
zodĄaco
zola
zolio
zona
zonas
zoneamento
zonzo
zooboros
zoolągico
zoomp
zul
zumbi
zumbido