[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french - frenchstem.c:1.1.2.1 output.txt:1.1.2.1 stem.h:1.1.2.1 stem.sbl:1.1.2.1 stemmer.html:1.1.2.1 voc.txt:1.1.2.1
Andreas Jung
andreas@digicool.com
Wed, 13 Feb 2002 11:26:20 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french
In directory cvs.zope.org:/tmp/cvs-serv30556/PyStemmer/french
Added Files:
Tag: ajung-textindexng-branch
frenchstem.c output.txt stem.h stem.sbl stemmer.html voc.txt
Log Message:
added PyStemmer
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french/frenchstem.c === (688/788 lines abridged)
#include "header.h"
extern int french_stem(struct SN_env * z);
static int r_un_accent(struct SN_env * z);
static int r_un_double(struct SN_env * z);
static int r_residual_suffix(struct SN_env * z);
static int r_verb_suffix(struct SN_env * z);
static int r_i_verb_suffix(struct SN_env * z);
static int r_standard_suffix(struct SN_env * z);
static int r_R2(struct SN_env * z);
static int r_R1(struct SN_env * z);
static int r_RV(struct SN_env * z);
static int r_mark_regions(struct SN_env * z);
static int r_postlude(struct SN_env * z);
static int r_prelude(struct SN_env * z);
/*
 * a_0: generated lookup table with four entries: the empty string and the
 * single-character markers "I", "U" and "Y".
 *
 * struct among is declared in header.h (not visible here), so the field
 * meanings below are inferred from the values and should be confirmed
 * against that header:
 *   field 1 - length of the string in field 2 (0 for "", 1 for the rest)
 *   field 2 - the string to match, cast to (byte *)
 *   field 3 - -1 for entry 0, and 0 for entries 1-3; presumably the index of
 *             a shorter entry (here the empty string) to fall back to
 *   field 4 - result code; "I"=1, "U"=2, "Y"=3 follow the order of the
 *             among() in the postlude routine of stem.sbl, and "" = 4
 *   field 5 - always 0 in this table
 *
 * NOTE(review): the code that consumes this table is in the omitted part of
 * the file; the link to postlude is based on the matching strings only.
 */
static struct among a_0[4] =
{
/* 0 */ { 0, (byte *)"", -1, 4, 0},
/* 1 */ { 1, (byte *)"I", 0, 1, 0},
/* 2 */ { 1, (byte *)"U", 0, 2, 0},
/* 3 */ { 1, (byte *)"Y", 0, 3, 0}
};
/*
 * a_1: generated lookup table for the endings "iqU", "abl", "eus" and "iv".
 *
 * The first field of each entry is the length of its string, the fourth is
 * a result code. The codes (iv=1, eus=2, abl/iqU=3) follow the order of the
 * inner among() under 'ement'/'ements' in standard_suffix of stem.sbl.
 * The entries are ordered by their last character (U, l, s, v).
 *
 * NOTE(review): struct among is declared in header.h (not visible here) and
 * the consuming code is in the omitted part of this file; confirm the field
 * meanings and the ordering requirement there before editing by hand.
 */
static struct among a_1[4] =
{
/* 0 */ { 3, (byte *)"iqU", -1, 3, 0},
/* 1 */ { 3, (byte *)"abl", -1, 3, 0},
/* 2 */ { 3, (byte *)"eus", -1, 2, 0},
/* 3 */ { 2, (byte *)"iv", -1, 1, 0}
};
/*
 * a_2: generated lookup table for the endings "ic", "abil" and "iv".
 *
 * The first field of each entry is the length of its string, the fourth is
 * a result code. The codes (abil=1, ic=2, iv=3) follow the order of the
 * inner among() under the 'it{e'}' / 'it{e'}s' rule in standard_suffix of
 * stem.sbl. The entries are ordered by their last character (c, l, v).
 *
 * NOTE(review): struct among is declared in header.h (not visible here) and
 * the consuming code is in the omitted part of this file; confirm the field
 * meanings and the ordering requirement there before editing by hand.
 */
static struct among a_2[3] =
{
/* 0 */ { 2, (byte *)"ic", -1, 2, 0},
/* 1 */ { 4, (byte *)"abil", -1, 1, 0},
/* 2 */ { 2, (byte *)"iv", -1, 3, 0}
};
static struct among a_3[43] =
{
/* 0 */ { 4, (byte *)"iqUe", -1, 1, 0},
/* 1 */ { 6, (byte *)"atrice", -1, 2, 0},
/* 2 */ { 4, (byte *)"ance", -1, 1, 0},
/* 3 */ { 4, (byte *)"ence", -1, 5, 0},
/* 4 */ { 5, (byte *)"logie", -1, 3, 0},
/* 5 */ { 4, (byte *)"able", -1, 1, 0},
/* 6 */ { 4, (byte *)"isme", -1, 1, 0},
/* 7 */ { 4, (byte *)"euse", -1, 11, 0},
[-=- -=- -=- 688 lines omitted -=- -=- -=-]
lab5:
z->c = z->l - m;
{ int m = z->l - z->c; /* try, line 223 */
z->ket = z->c; /* [, line 223 */
{ int m = z->l - z->c; /* or, line 223 */
if (!(eq_s_b(z, 1, "Y"))) goto lab10;
z->bra = z->c; /* ], line 223 */
slice_from_s(z, 1, "i"); /* <-, line 223 */
goto lab9;
lab10:
z->c = z->l - m;
if (!(eq_s_b(z, 1, "\x87" ""))) { z->c = z->l - m; goto lab8; }
z->bra = z->c; /* ], line 224 */
slice_from_s(z, 1, "c"); /* <-, line 224 */
}
lab9:
lab8:
}
}
goto lab3;
lab4:
z->c = z->l - m;
if (!r_residual_suffix(z)) goto lab2; /* call residual_suffix, line 227 */
}
lab3:
lab2:
z->c = z->l - m;
}
{ int m = z->l - z->c; /* do, line 232 */
if (!r_un_double(z)) goto lab11; /* call un_double, line 232 */
lab11:
z->c = z->l - m;
}
{ int m = z->l - z->c; /* do, line 233 */
if (!r_un_accent(z)) goto lab12; /* call un_accent, line 233 */
lab12:
z->c = z->l - m;
}
z->c = z->lb; { int c = z->c; /* do, line 235 */
if (!r_postlude(z)) goto lab13; /* call postlude, line 235 */
lab13:
z->c = c;
}
return 1;
}
/*
 * Create a stemmer environment for the French stemmer.
 *
 * Returns whatever SN_create_env(0, 3, 0) returns; that result is passed
 * through unchecked.
 * NOTE(review): the 3 presumably sizes storage for the three integers
 * (pV, p1, p2) declared in stem.sbl -- confirm against SN_create_env.
 */
extern struct SN_env * french_create_env(void)
{
    struct SN_env * env = SN_create_env(0, 3, 0);
    return env;
}
/*
 * Dispose of a French stemmer environment by handing z to SN_close_env.
 * z is passed through as-is; no checks are made here.
 */
extern void french_close_env(struct SN_env * z)
{
    SN_close_env(z);
}
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french/output.txt === (20307/20407 lines abridged)
a
…
abailard
abaiss
abaiss
abaiss
abaiss
abaissement
abaissent
abaiss
abaiss
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abasourd
abat
abatt
abatt
abatt
abattr
abbay
abb‚
abb‚
abbess
abeil
abhorr
abhorr
abhorr
abŒm
abŒm
abŒm
abŒm
abject
abjur
ablut
abneg
aboi
aboi
abol
abomin
abomin
[-=- -=- -=- 20307 lines omitted -=- -=- -=-]
xii
xii
xiv
xix
xv
xvi
xvii
xvii
xx
xxi
xxii
xxii
xxiv
xxix
xxv
xxvi
xxvii
xxvii
xxx
xxxi
xxxii
xxxii
xxxiv
xxxv
xxxvi
xxxvii
y
yacht
yacht
yakounin
yanke
yeddo
yert
yet
yeux
yokoham
york
young
zambajon
zeb
zebr
z‚bus
zel
zel
z‚nith
zigzag
zingarel
zonder
zoroastr
zurl
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french/stem.h ===
/*
 * Public interface of the French stemmer (implemented in frenchstem.c).
 * struct SN_env is not defined in this header; frenchstem.c gets it from
 * "header.h" -- presumably callers need that header as well (confirm).
 */

/* Returns the result of SN_create_env(0, 3, 0). */
extern struct SN_env * french_create_env(void);
/* Passes z to SN_close_env. */
extern void french_close_env(struct SN_env * z);
/* Runs the stemmer on z. The visible end of the implementation returns 1;
   other return paths, if any, are not shown in this excerpt. */
extern int french_stem(struct SN_env * z);
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french/stem.sbl ===
routines (
prelude postlude mark_regions
RV R1 R2
standard_suffix
i_verb_suffix
verb_suffix
residual_suffix
un_double
un_accent
)
externals ( stem )
integers ( pV p1 p2 )
groupings ( v keep_with_s )
stringescapes {}
/* special characters (byte values are MS-DOS code page 437/850, not ISO Latin-1) */
stringdef a^ hex '83' // a-circumflex
stringdef a` hex '85' // a-grave
stringdef c, hex '87' // c-cedilla
stringdef e" hex '89' // e-diaeresis (rare)
stringdef e' hex '82' // e-acute
stringdef e^ hex '88' // e-circumflex
stringdef e` hex '8A' // e-grave
stringdef i" hex '8B' // i-diaeresis
stringdef i^ hex '8C' // i-circumflex
stringdef o^ hex '93' // o-circumflex
stringdef u^ hex '96' // u-circumflex
stringdef u` hex '97' // u-grave
define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}'
// prelude: scan forward through the word, repeatedly finding and marking
// letters that should not count as vowels. The marked forms are upper case
// (U, I, Y), which are not members of the grouping v.
//   - 'u' or 'i' between two vowels  -> 'U' / 'I'
//   - 'y' preceded by a vowel        -> 'Y'
//   - 'y' followed by a vowel        -> 'Y'
//   - 'u' preceded by 'q'            -> 'U'
// The repeat stops when goto can find no further position where one of
// these patterns matches.
define prelude as repeat goto (
( v [ ('u' ] v <- 'U') or
('i' ] v <- 'I') or
('y' ] <- 'Y')
)
or
( ['y'] v <- 'Y' )
or
( 'q' ['u'] <- 'U' )
)
// mark_regions: compute the three region start positions pV, p1 and p2
// that the RV, R1 and R2 tests compare against the cursor.
define mark_regions as (
// All three default to the end of the word, i.e. an empty region.
$pV = limit
$p1 = limit
$p2 = limit // defaults
// pV: if the word begins with two vowels, the position after the third
// letter; otherwise skip the first letter and take the position just past
// the next vowel. If neither alternative succeeds, pV keeps its default.
do (
( v v next ) or ( next gopast v )
setmark pV
)
// p1: position after the first non-vowel that follows a vowel.
// p2: the same rule applied again, starting from p1.
// If the word runs out part-way, the marks not yet set keep their default.
do (
gopast v gopast non-v setmark p1
gopast v gopast non-v setmark p2
)
)
// postlude: walk forward through the word and turn the marker letters
// 'I', 'U' and 'Y' (introduced by prelude) back into lower case. At any
// position where none of them matches, just advance one character. The
// repeat ends when next fails at the end of the word.
define postlude as repeat (
[substring] among(
'I' (<- 'i')
'U' (<- 'u')
'Y' (<- 'y')
) or next
)
backwardmode (
define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
define standard_suffix as (
[substring] among(
'ance' 'iqUe' 'isme' 'able' 'iste' 'eux'
'ances' 'iqUes' 'ismes' 'ables' 'istes'
( R2 delete )
'atrice' 'ateur' 'ation'
'atrices' 'ateurs' 'ations'
( R2 delete
try ( ['ic'] (R2 delete) or <-'iqU' )
)
'logie'
'logies'
( R2 <- 'log' )
'usion' 'ution'
'usions' 'utions'
( R2 <- 'u' )
'ence'
'ences'
( R2 <- 'ent' )
'ement'
'ements'
(
RV delete
try (
[substring] among(
'iv' (R2 delete ['at'] R2 delete)
'eus' ((R2 delete) or (R1<-'eux'))
'abl' 'iqU' (R2 delete)
)
)
)
'it{e'}'
'it{e'}s'
(
R2 delete
try (
[substring] among(
'abil' ((R2 delete) or <-'abl')
'ic' ((R2 delete) or <-'iqU')
'iv' (R2 delete)
)
)
)
'if' 'ive'
'ifs' 'ives'
(
R2 delete
try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' )
)
'eaux' (<- 'eau')
'aux' (R1 <- 'al')
'euse'
'euses'((R2 delete) or (R1<-'eux'))
'issement'
'issements'(R1 non-v delete) // verbal
// fail(...) below forces entry to verb_suffix. -ment typically
// follows the p.p., e.g 'confus{e'}ment'.
'amment' (RV fail(<- 'ant'))
'emment' (RV fail(<- 'ent'))
'ment'
'ments' (test(v RV) fail(delete))
// v is e,i,u,{e'},I or U
)
)
define i_verb_suffix as setlimit tomark pV for (
[substring] among (
'{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai'
'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez'
'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait'
'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses'
'issez' 'issiez' 'issions' 'issons' 'it'
(non-v delete)
)
)
define verb_suffix as setlimit tomark pV for (
[substring] among (
'ions'
(R2 delete)
'{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai'
'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions'
'erons' 'eront' 'ez' 'iez'
// 'ons' //-best omitted
(delete)
'{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant'
'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez'
'assions'
(delete
try(['e'] delete)
)
)
)
define keep_with_s 'aiou{e`}s'
define residual_suffix as (
try(['s'] test non-keep_with_s delete)
setlimit tomark pV for (
[substring] among(
'ion' (R2 's' or 't' delete)
'ier' 'i{e`}re'
'Ier' 'I{e`}re' (<-'i')
'e' (delete)
'{e"}' ('gu' delete)
)
)
)
// un_double (backward mode): if the word ends in one of 'enn', 'onn',
// 'ett', 'ell' or 'eill', remove its last letter. The among is wrapped in
// test so the cursor stays at the end; [next] then brackets just the final
// character for deletion.
define un_double as (
test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete
)
// un_accent (backward mode): working from the end of the word, skip one or
// more non-vowels; if the letter before them is e-acute or e-grave, replace
// it with a plain 'e'. Fails (leaving the word unchanged) if the word does
// not end in a non-vowel or the letter found is not one of those two.
define un_accent as (
atleast 1 non-v
[ '{e'}' or '{e`}' ] <-'e'
)
)
// stem: the external entry point. Every step is wrapped in 'do', so a step
// that fails leaves the cursor where it was and the next step still runs.
define stem as (
// Forward pass: mark non-vowel u/i/y as U/I/Y, then compute pV, p1, p2.
do prelude
do mark_regions
// Everything inside 'backwards' works from the end of the word.
backwards (
do (
(
// Try the suffix-removal routines in order; the first to succeed wins.
( standard_suffix or
i_verb_suffix or
verb_suffix
)
and
// If one of them succeeded, go back to the end of the word and turn a
// final 'Y' into 'i', or a final c-cedilla into 'c'. 'try' makes this
// clean-up optional.
try( [ ('Y' ] <- 'i' ) or
('{c,}'] <- 'c' )
)
) or
// Reached only when none of the three routines above succeeded.
residual_suffix
)
// try(['ent'] RV delete) // is best omitted
do un_double
do un_accent
)
// Forward pass: restore I/U/Y to lower case.
do postlude
)
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french/stemmer.html === (590/690 lines abridged)
<HTML>
<HEAD>
<TITLE>French stemming algorithm</TITLE></HEAD>
<BODY BGCOLOR=WHITE>
<TABLE WIDTH=75% ALIGN=CENTER COLS=1>
<H1 ALIGN=CENTER>French stemming algorithm</H1>
<TR><TD>
<BR> <H2>Links to resources</H2>
<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="stem.sbl"> The stemmer in Snowball</A>
<TR><TD><A HREF="stem.c"> The ANSI C stemmer</A>
<TR><TD><A HREF="stem.h"> - and its header</A>
<TR><TD><A HREF="voc.txt"> Sample French vocabulary (ISO Latin codings)</A>
<TR><TD><A HREF="output.txt"> Its stemmed equivalent</A>
<TR><TD><A HREF="diffs.txt"> Vocabulary + stemmed equivalent in pure ASCII</A>
<TR><TD><A HREF="tarball.tgz"> Tar-gzipped file of all of the above</A>
</TABLE></DL>
<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="../texts/romance.html">
Romance language stemmers</A>
</TABLE></DL>
</TR>
<TR><TD BGCOLOR="lightpink">
<BR><BR>
Here is a sample of French vocabulary, with the stemmed forms that will
be generated with this algorithm.
<BR><BR>
<DL><DD><TABLE CELLPADDING=0>
<TR><TD> <B>word</B> </TD>
<TD></TD><TD> </TD>
<TD></TD><TD> <B>stem</B> </TD>
<TD></TD><TD> </TD>
<TD></TD><TD> <B>word</B> </TD>
<TD></TD><TD> </TD>
<TD></TD><TD> <B>stem</B> </TD>
</TR>
<TR><TD>
[-=- -=- -=- 590 lines omitted -=- -=- -=-]
'ier' 'i{e`}re'
'Ier' 'I{e`}re' (<-'i')
'e' (delete)
'{e"}' ('gu' delete)
)
)
)
define un_double as (
test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete
)
define un_accent as (
atleast 1 non-v
[ '{e'}' or '{e`}' ] <-'e'
)
)
define stem as (
do prelude
do mark_regions
backwards (
do (
(
( standard_suffix or
i_verb_suffix or
verb_suffix
)
and
try( [ ('Y' ] <- 'i' ) or
('{c,}'] <- 'c' )
)
) or
residual_suffix
)
// try(['ent'] RV delete) // is best omitted
do un_double
do un_accent
)
do postlude
)
</DL>
</PRE></FONT>
</TABLE>
</BODY>
</HTML>
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french/voc.txt === (20307/20407 lines abridged)
a
…
abailard
abaissait
abaissant
abaisse
abaiss‚
abaissement
abaissent
abaisser
abaisserai
abandon
abandonna
abandonnait
abandonnant
abandonne
abandonn‚
abandonn‚e
abandonner
abandonnera
abandonnerait
abandonn‚s
abandonnez
abasourdi
abat
abattant
abattement
abattit
abattre
abbaye
abb‚
abb‚s
abbesse
abeille
abhorrait
abhorre
abhorr‚
abŒmait
abŒme
abŒm‚
abŒm‚e
abject
abjurant
ablutions
abn‚gation
aboiements
aboiera
abolir
abominable
abominablement
[-=- -=- -=- 20307 lines omitted -=- -=- -=-]
xii
xiii
xiv
xix
xv
xvi
xvii
xviii
xx
xxi
xxii
xxiii
xxiv
xxix
xxv
xxvi
xxvii
xxviii
xxx
xxxi
xxxii
xxxiii
xxxiv
xxxv
xxxvi
xxxvii
y
yacht
yachts
yakounines
yankee
yeddo
yert
yet
yeux
yokohama
york
young
zambajon
zeb
z‚br‚s
z‚bus
zŠle
z‚l‚s
z‚nith
zigzags
zingarelli
zonders
zoroastre
zurla