[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/russian - output.txt:1.1.2.1 russianstem.c:1.1.2.1 stem.h:1.1.2.1 stem.sbl:1.1.2.1 stemmer.html:1.1.2.1 voc.txt:1.1.2.1
Andreas Jung
andreas@digicool.com
Wed, 13 Feb 2002 11:26:29 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/russian
In directory cvs.zope.org:/tmp/cvs-serv30556/PyStemmer/russian
Added Files:
Tag: ajung-textindexng-branch
output.txt russianstem.c stem.h stem.sbl stemmer.html voc.txt
Log Message:
added PyStemmer
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/russian/output.txt ===
<Binary-ish file>
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/russian/russianstem.c ===
#include "header.h"
/* Public entry point; defined at the end of this file. */
extern int russian_stem(struct SN_env * z);
/* Forward declarations of the file-local routines. Each corresponds to a
 * routine of the same name (without the r_ prefix) in stem.sbl. */
static int r_tidy_up(struct SN_env * z);
static int r_derivational(struct SN_env * z);
static int r_noun(struct SN_env * z);
static int r_verb(struct SN_env * z);
static int r_reflexive(struct SN_env * z);
static int r_adjectival(struct SN_env * z);
static int r_adjective(struct SN_env * z);
static int r_perfective_gerund(struct SN_env * z);
static int r_R2(struct SN_env * z);
static int r_mark_regions(struct SN_env * z);
/* a_0: ending table searched by r_perfective_gerund via
 * find_among_b(z, a_0, 9).  The bytes are the one-byte letter codes given as
 * stringdefs in stem.sbl: \xD7 = {v}, \xDB = {sh}, \xC9 = {i}, \xD9 = {y},
 * \xD3 = {s}, \xD8 = {'}.
 * NOTE(review): struct among is declared in header.h (not visible here).  The
 * first field equals the byte length of the string, and the fourth looks like
 * the code find_among_b hands back (r_perfective_gerund switches on 1 and 2);
 * confirm the third and fifth fields against header.h before editing entries.
 */
static struct among a_0[9] =
{
/* 0 */ { 3, (byte *)"\xD7" "\xDB" "\xC9" "", -1, 1, 0},
/* 1 */ { 4, (byte *)"\xC9" "\xD7" "\xDB" "\xC9" "", 0, 2, 0},
/* 2 */ { 4, (byte *)"\xD9" "\xD7" "\xDB" "\xC9" "", 0, 2, 0},
/* 3 */ { 1, (byte *)"\xD7" "", -1, 1, 0},
/* 4 */ { 2, (byte *)"\xC9" "\xD7" "", 3, 2, 0},
/* 5 */ { 2, (byte *)"\xD9" "\xD7" "", 3, 2, 0},
/* 6 */ { 5, (byte *)"\xD7" "\xDB" "\xC9" "\xD3" "\xD8" "", -1, 1, 0},
/* 7 */ { 6, (byte *)"\xC9" "\xD7" "\xDB" "\xC9" "\xD3" "\xD8" "", 6, 2, 0},
/* 8 */ { 6, (byte *)"\xD9" "\xD7" "\xDB" "\xC9" "\xD3" "\xD8" "", 6, 2, 0}
};
/* a_1: the 26 adjective endings searched by r_adjective via
 * find_among_b(z, a_1, 26).  Every entry carries the same fourth field (1),
 * matching the single `case 1` (delete) in r_adjective.  Bytes are the letter
 * codes from the stringdefs in stem.sbl.
 * NOTE(review): field meanings of struct among come from header.h, which is
 * not visible here; the first field equals the string's byte length.
 */
static struct among a_1[26] =
{
/* 0 */ { 2, (byte *)"\xC0" "\xC0" "", -1, 1, 0},
/* 1 */ { 2, (byte *)"\xC5" "\xC0" "", -1, 1, 0},
/* 2 */ { 2, (byte *)"\xCF" "\xC0" "", -1, 1, 0},
/* 3 */ { 2, (byte *)"\xD5" "\xC0" "", -1, 1, 0},
/* 4 */ { 2, (byte *)"\xC5" "\xC5" "", -1, 1, 0},
/* 5 */ { 2, (byte *)"\xC9" "\xC5" "", -1, 1, 0},
/* 6 */ { 2, (byte *)"\xCF" "\xC5" "", -1, 1, 0},
/* 7 */ { 2, (byte *)"\xD9" "\xC5" "", -1, 1, 0},
/* 8 */ { 2, (byte *)"\xC9" "\xC8" "", -1, 1, 0},
/* 9 */ { 2, (byte *)"\xD9" "\xC8" "", -1, 1, 0},
/* 10 */ { 3, (byte *)"\xC9" "\xCD" "\xC9" "", -1, 1, 0},
/* 11 */ { 3, (byte *)"\xD9" "\xCD" "\xC9" "", -1, 1, 0},
/* 12 */ { 2, (byte *)"\xC5" "\xCA" "", -1, 1, 0},
/* 13 */ { 2, (byte *)"\xC9" "\xCA" "", -1, 1, 0},
/* 14 */ { 2, (byte *)"\xCF" "\xCA" "", -1, 1, 0},
/* 15 */ { 2, (byte *)"\xD9" "\xCA" "", -1, 1, 0},
/* 16 */ { 2, (byte *)"\xC5" "\xCD" "", -1, 1, 0},
/* 17 */ { 2, (byte *)"\xC9" "\xCD" "", -1, 1, 0},
/* 18 */ { 2, (byte *)"\xCF" "\xCD" "", -1, 1, 0},
/* 19 */ { 2, (byte *)"\xD9" "\xCD" "", -1, 1, 0},
/* 20 */ { 3, (byte *)"\xC5" "\xC7" "\xCF" "", -1, 1, 0},
/* 21 */ { 3, (byte *)"\xCF" "\xC7" "\xCF" "", -1, 1, 0},
/* 22 */ { 2, (byte *)"\xC1" "\xD1" "", -1, 1, 0},
/* 23 */ { 2, (byte *)"\xD1" "\xD1" "", -1, 1, 0},
/* 24 */ { 3, (byte *)"\xC5" "\xCD" "\xD5" "", -1, 1, 0},
/* 25 */ { 3, (byte *)"\xCF" "\xCD" "\xD5" "", -1, 1, 0}
};
/* a_2: participle endings searched by r_adjectival via
 * find_among_b(z, a_2, 8), after an adjective ending has been removed.
 * Letter codes (see the stringdefs in stem.sbl): \xC5 = {e}, \xCD = {m},
 * \xCE = {n}, \xD7 = {v}, \xDB = {sh}, \xDD = {shch}, \xC0 = {iu},
 * \xC9 = {i}, \xD9 = {y}, \xD5 = {u}.
 * The fourth field is 1 for endings r_adjectival deletes only after {a} or
 * {ia}, and 2 for endings it deletes unconditionally.
 * NOTE(review): that reading assumes the fourth field is the value
 * find_among_b returns; struct among itself is declared in header.h.
 */
static struct among a_2[8] =
{
/* 0 */ { 2, (byte *)"\xC5" "\xCD" "", -1, 1, 0},
/* 1 */ { 2, (byte *)"\xCE" "\xCE" "", -1, 1, 0},
/* 2 */ { 2, (byte *)"\xD7" "\xDB" "", -1, 1, 0},
/* 3 */ { 3, (byte *)"\xC9" "\xD7" "\xDB" "", 2, 2, 0},
/* 4 */ { 3, (byte *)"\xD9" "\xD7" "\xDB" "", 2, 2, 0},
/* 5 */ { 1, (byte *)"\xDD" "", -1, 1, 0},
/* 6 */ { 2, (byte *)"\xC0" "\xDD" "", 5, 1, 0},
/* 7 */ { 3, (byte *)"\xD5" "\xC0" "\xDD" "", 6, 2, 0}
};
/* a_3: the two reflexive endings searched by r_reflexive via
 * find_among_b(z, a_3, 2): \xD3\xD1 = {s}{ia} and \xD3\xD8 = {s}{'}
 * (letter codes per the stringdefs in stem.sbl).
 */
static struct among a_3[2] =
{
/* 0 */ { 2, (byte *)"\xD3" "\xD1" "", -1, 1, 0},
/* 1 */ { 2, (byte *)"\xD3" "\xD8" "", -1, 1, 0}
};
/* a_4: the 46 verb endings searched by r_verb via find_among_b(z, a_4, 46).
 * Bytes are the letter codes from the stringdefs in stem.sbl.
 * The fourth field is 1 for endings r_verb deletes only when preceded by {a}
 * or {ia}, and 2 for endings it deletes unconditionally.
 * NOTE(review): that reading assumes the fourth field is the value
 * find_among_b returns; struct among is declared in header.h (not visible
 * here), so confirm the third and fifth fields there before editing entries.
 */
static struct among a_4[46] =
{
/* 0 */ { 1, (byte *)"\xC0" "", -1, 2, 0},
/* 1 */ { 2, (byte *)"\xD5" "\xC0" "", 0, 2, 0},
/* 2 */ { 2, (byte *)"\xCC" "\xC1" "", -1, 1, 0},
/* 3 */ { 3, (byte *)"\xC9" "\xCC" "\xC1" "", 2, 2, 0},
/* 4 */ { 3, (byte *)"\xD9" "\xCC" "\xC1" "", 2, 2, 0},
/* 5 */ { 2, (byte *)"\xCE" "\xC1" "", -1, 1, 0},
/* 6 */ { 3, (byte *)"\xC5" "\xCE" "\xC1" "", 5, 2, 0},
/* 7 */ { 3, (byte *)"\xC5" "\xD4" "\xC5" "", -1, 1, 0},
/* 8 */ { 3, (byte *)"\xC9" "\xD4" "\xC5" "", -1, 2, 0},
/* 9 */ { 3, (byte *)"\xCA" "\xD4" "\xC5" "", -1, 1, 0},
/* 10 */ { 4, (byte *)"\xC5" "\xCA" "\xD4" "\xC5" "", 9, 2, 0},
/* 11 */ { 4, (byte *)"\xD5" "\xCA" "\xD4" "\xC5" "", 9, 2, 0},
/* 12 */ { 2, (byte *)"\xCC" "\xC9" "", -1, 1, 0},
/* 13 */ { 3, (byte *)"\xC9" "\xCC" "\xC9" "", 12, 2, 0},
/* 14 */ { 3, (byte *)"\xD9" "\xCC" "\xC9" "", 12, 2, 0},
/* 15 */ { 1, (byte *)"\xCA" "", -1, 1, 0},
/* 16 */ { 2, (byte *)"\xC5" "\xCA" "", 15, 2, 0},
/* 17 */ { 2, (byte *)"\xD5" "\xCA" "", 15, 2, 0},
/* 18 */ { 1, (byte *)"\xCC" "", -1, 1, 0},
/* 19 */ { 2, (byte *)"\xC9" "\xCC" "", 18, 2, 0},
/* 20 */ { 2, (byte *)"\xD9" "\xCC" "", 18, 2, 0},
/* 21 */ { 2, (byte *)"\xC5" "\xCD" "", -1, 1, 0},
/* 22 */ { 2, (byte *)"\xC9" "\xCD" "", -1, 2, 0},
/* 23 */ { 2, (byte *)"\xD9" "\xCD" "", -1, 2, 0},
/* 24 */ { 1, (byte *)"\xCE" "", -1, 1, 0},
/* 25 */ { 2, (byte *)"\xC5" "\xCE" "", 24, 2, 0},
/* 26 */ { 2, (byte *)"\xCC" "\xCF" "", -1, 1, 0},
/* 27 */ { 3, (byte *)"\xC9" "\xCC" "\xCF" "", 26, 2, 0},
/* 28 */ { 3, (byte *)"\xD9" "\xCC" "\xCF" "", 26, 2, 0},
/* 29 */ { 2, (byte *)"\xCE" "\xCF" "", -1, 1, 0},
/* 30 */ { 3, (byte *)"\xC5" "\xCE" "\xCF" "", 29, 2, 0},
/* 31 */ { 3, (byte *)"\xCE" "\xCE" "\xCF" "", 29, 1, 0},
/* 32 */ { 2, (byte *)"\xC0" "\xD4" "", -1, 1, 0},
/* 33 */ { 3, (byte *)"\xD5" "\xC0" "\xD4" "", 32, 2, 0},
/* 34 */ { 2, (byte *)"\xC5" "\xD4" "", -1, 1, 0},
/* 35 */ { 3, (byte *)"\xD5" "\xC5" "\xD4" "", 34, 2, 0},
/* 36 */ { 2, (byte *)"\xC9" "\xD4" "", -1, 2, 0},
/* 37 */ { 2, (byte *)"\xD1" "\xD4" "", -1, 2, 0},
/* 38 */ { 2, (byte *)"\xD9" "\xD4" "", -1, 2, 0},
/* 39 */ { 2, (byte *)"\xD4" "\xD8" "", -1, 1, 0},
/* 40 */ { 3, (byte *)"\xC9" "\xD4" "\xD8" "", 39, 2, 0},
/* 41 */ { 3, (byte *)"\xD9" "\xD4" "\xD8" "", 39, 2, 0},
/* 42 */ { 3, (byte *)"\xC5" "\xDB" "\xD8" "", -1, 1, 0},
/* 43 */ { 3, (byte *)"\xC9" "\xDB" "\xD8" "", -1, 2, 0},
/* 44 */ { 2, (byte *)"\xCE" "\xD9" "", -1, 1, 0},
/* 45 */ { 3, (byte *)"\xC5" "\xCE" "\xD9" "", 44, 2, 0}
};
/* a_5: the 36 noun endings searched by r_noun via find_among_b(z, a_5, 36).
 * Every entry carries the same fourth field (1), matching the single `case 1`
 * (delete) in r_noun.  Bytes are the letter codes from the stringdefs in
 * stem.sbl.
 * NOTE(review): field meanings of struct among come from header.h, which is
 * not visible here; the first field equals the string's byte length.
 */
static struct among a_5[36] =
{
/* 0 */ { 1, (byte *)"\xC0" "", -1, 1, 0},
/* 1 */ { 2, (byte *)"\xC9" "\xC0" "", 0, 1, 0},
/* 2 */ { 2, (byte *)"\xD8" "\xC0" "", 0, 1, 0},
/* 3 */ { 1, (byte *)"\xC1" "", -1, 1, 0},
/* 4 */ { 1, (byte *)"\xC5" "", -1, 1, 0},
/* 5 */ { 2, (byte *)"\xC9" "\xC5" "", 4, 1, 0},
/* 6 */ { 2, (byte *)"\xD8" "\xC5" "", 4, 1, 0},
/* 7 */ { 2, (byte *)"\xC1" "\xC8" "", -1, 1, 0},
/* 8 */ { 2, (byte *)"\xD1" "\xC8" "", -1, 1, 0},
/* 9 */ { 3, (byte *)"\xC9" "\xD1" "\xC8" "", 8, 1, 0},
/* 10 */ { 1, (byte *)"\xC9" "", -1, 1, 0},
/* 11 */ { 2, (byte *)"\xC5" "\xC9" "", 10, 1, 0},
/* 12 */ { 2, (byte *)"\xC9" "\xC9" "", 10, 1, 0},
/* 13 */ { 3, (byte *)"\xC1" "\xCD" "\xC9" "", 10, 1, 0},
/* 14 */ { 3, (byte *)"\xD1" "\xCD" "\xC9" "", 10, 1, 0},
/* 15 */ { 4, (byte *)"\xC9" "\xD1" "\xCD" "\xC9" "", 14, 1, 0},
/* 16 */ { 1, (byte *)"\xCA" "", -1, 1, 0},
/* 17 */ { 2, (byte *)"\xC5" "\xCA" "", 16, 1, 0},
/* 18 */ { 3, (byte *)"\xC9" "\xC5" "\xCA" "", 17, 1, 0},
/* 19 */ { 2, (byte *)"\xC9" "\xCA" "", 16, 1, 0},
/* 20 */ { 2, (byte *)"\xCF" "\xCA" "", 16, 1, 0},
/* 21 */ { 2, (byte *)"\xC1" "\xCD" "", -1, 1, 0},
/* 22 */ { 2, (byte *)"\xC5" "\xCD" "", -1, 1, 0},
/* 23 */ { 3, (byte *)"\xC9" "\xC5" "\xCD" "", 22, 1, 0},
/* 24 */ { 2, (byte *)"\xCF" "\xCD" "", -1, 1, 0},
/* 25 */ { 2, (byte *)"\xD1" "\xCD" "", -1, 1, 0},
/* 26 */ { 3, (byte *)"\xC9" "\xD1" "\xCD" "", 25, 1, 0},
/* 27 */ { 1, (byte *)"\xCF" "", -1, 1, 0},
/* 28 */ { 1, (byte *)"\xD1" "", -1, 1, 0},
/* 29 */ { 2, (byte *)"\xC9" "\xD1" "", 28, 1, 0},
/* 30 */ { 2, (byte *)"\xD8" "\xD1" "", 28, 1, 0},
/* 31 */ { 1, (byte *)"\xD5" "", -1, 1, 0},
/* 32 */ { 2, (byte *)"\xC5" "\xD7" "", -1, 1, 0},
/* 33 */ { 2, (byte *)"\xCF" "\xD7" "", -1, 1, 0},
/* 34 */ { 1, (byte *)"\xD8" "", -1, 1, 0},
/* 35 */ { 1, (byte *)"\xD9" "", -1, 1, 0}
};
/* a_6: the two derivational endings searched by r_derivational via
 * find_among_b(z, a_6, 2): \xCF\xD3\xD4 = {o}{s}{t} and
 * \xCF\xD3\xD4\xD8 = {o}{s}{t}{'} (letter codes per the stringdefs in
 * stem.sbl).
 */
static struct among a_6[2] =
{
/* 0 */ { 3, (byte *)"\xCF" "\xD3" "\xD4" "", -1, 1, 0},
/* 1 */ { 4, (byte *)"\xCF" "\xD3" "\xD4" "\xD8" "", -1, 1, 0}
};
/* a_7: endings searched by r_tidy_up via find_among_b(z, a_7, 4).
 * Letter codes (stringdefs in stem.sbl): \xC5 = {e}, \xCA = {i`},
 * \xDB = {sh}, \xCE = {n}, \xD8 = {'}.
 * The fourth field selects the r_tidy_up case: 1 = superlative ending,
 * 2 = final {n}, 3 = final soft sign.
 * NOTE(review): that reading assumes the fourth field is the value
 * find_among_b returns; struct among is declared in header.h.
 */
static struct among a_7[4] =
{
/* 0 */ { 4, (byte *)"\xC5" "\xCA" "\xDB" "\xC5" "", -1, 1, 0},
/* 1 */ { 1, (byte *)"\xCE" "", -1, 2, 0},
/* 2 */ { 1, (byte *)"\xD8" "", -1, 3, 0},
/* 3 */ { 3, (byte *)"\xC5" "\xCA" "\xDB" "", -1, 1, 0}
};
/* g_v: bitmap handed to in_grouping/out_grouping together with the code range
 * 192..220.  Reading it as bit ((ch - 192) & 7) of byte ((ch - 192) >> 3)
 * gives exactly 0xC0, 0xC1, 0xC5, 0xC9, 0xCF, 0xD1, 0xD5, 0xD9 and 0xDC, i.e.
 * the nine letters of grouping v in stem.sbl ({iu} {a} {e} {i} {o} {ia} {u}
 * {y} {e`}).
 * NOTE(review): the bit layout is inferred from that match; the grouping
 * helpers themselves are declared in header.h.
 */
static byte g_v[] = { 35, 130, 34, 18 };
/* Helper for r_mark_regions (Snowball `gopast`): repeat the grouping test,
 * stepping the cursor forward by one after each failed test, until the test
 * succeeds.  want_vowel selects in_grouping (non-zero) or out_grouping (zero)
 * against g_v over the code range 192..220.
 * Returns 1 once the test succeeds, or 0 if the cursor reaches the limit
 * z->l first.
 * NOTE(review): in_grouping/out_grouping come from header.h; presumably they
 * also advance the cursor past the matched character - confirm there.
 */
static int r_scan_past(struct SN_env * z, int want_vowel) {
    for (;;) {
        if (want_vowel ? in_grouping(z, g_v, 192, 220)
                       : out_grouping(z, g_v, 192, 220)) {
            return 1;
        }
        if (z->c >= z->l) return 0;
        z->c++;
    }
}

/* mark_regions (stem.sbl lines 61-63): compute the marks pV (z->I[0]) and
 * p2 (z->I[1]).
 * Both marks start at the limit z->l.  pV is then set to the cursor position
 * reached by the first `gopast v`; p2 to the position reached after the
 * further sequence `gopast non-v`, `gopast v`, `gopast non-v`.  If any scan
 * runs into the limit, the marks not yet assigned keep the value z->l.
 * The cursor is restored to its entry value and the routine always returns 1.
 */
static int r_mark_regions(struct SN_env * z) {
    int entry_cursor;

    z->I[0] = z->l;
    z->I[1] = z->l;
    entry_cursor = z->c;

    if (r_scan_past(z, 1)) {
        z->I[0] = z->c;                       /* setmark pV */
        if (r_scan_past(z, 0) && r_scan_past(z, 1) && r_scan_past(z, 0)) {
            z->I[1] = z->c;                   /* setmark p2 */
        }
    }

    z->c = entry_cursor;
    return 1;
}
/* R2 test (stem.sbl: `$p2 <= cursor`): returns 1 when the mark p2, stored in
 * z->I[1], is at or before the cursor z->c, and 0 otherwise. */
static int r_R2(struct SN_env * z) {
    return z->I[1] <= z->c;
}
/* perfective_gerund (stem.sbl line 72 onwards): look up an ending from a_0
 * behind the cursor and delete it.
 * Code 1 endings are deleted only if the text before them is \xC1 ({a}) or,
 * failing that, \xD1 ({ia}); code 2 endings are deleted unconditionally.
 * Returns 0 when no ending matches or the {a}/{ia} condition fails, else 1.
 * z->ket and z->bra are set around the matched ending.
 */
static int r_perfective_gerund(struct SN_env * z) {
    int ending;

    z->ket = z->c;                              /* [ */
    ending = find_among_b(z, a_0, 9);           /* substring */
    if (ending == 0) return 0;
    z->bra = z->c;                              /* ] */

    if (ending == 1) {
        int before_test = z->l - z->c;
        if (!eq_s_b(z, 1, "\xC1" "")) {
            /* first alternative failed: rewind and try the second one */
            z->c = z->l - before_test;
            if (!eq_s_b(z, 1, "\xD1" "")) return 0;
        }
        slice_del(z);                           /* delete */
    } else if (ending == 2) {
        slice_del(z);                           /* delete */
    }
    return 1;
}
/* adjective (stem.sbl line 88 onwards): look up an adjective ending from a_1
 * behind the cursor and delete it.
 * Returns 0 if find_among_b reports no match, otherwise 1.  z->ket and z->bra
 * are set around the matched ending.
 */
static int r_adjective(struct SN_env * z) {
    int ending;

    z->ket = z->c;                              /* [ */
    ending = find_among_b(z, a_1, 26);          /* substring */
    if (ending == 0) return 0;
    z->bra = z->c;                              /* ] */

    if (ending == 1) {
        slice_del(z);                           /* delete */
    }
    return 1;
}
/* adjectival (stem.sbl line 102 onwards): remove an adjective ending and
 * then, optionally, a participle ending from a_2 that precedes it.
 *
 * Returns 0 if r_adjective fails; otherwise 1, whether or not a participle
 * ending was also removed (the second step is a Snowball `try`).
 *
 * Code 1 participle endings are deleted only if preceded by \xC1 ({a}) or,
 * failing that, \xD1 ({ia}); code 2 endings are deleted unconditionally.
 *
 * Changes from the generated original:
 *  - the exit label used to sit directly before a closing brace, which is not
 *    valid C before C23; it now labels the return statement;
 *  - the two cursor marks have distinct names.  Previously the inner `m`
 *    shadowed the outer one, so a failed {a}/{ia} test rewound to the inner
 *    mark and not to the position saved at the start of the `try`.
 */
static int r_adjectival(struct SN_env * z) {
    int among_var;

    if (!r_adjective(z)) return 0;              /* call adjective */

    {   int try_mark = z->l - z->c;             /* try */
        z->ket = z->c;                          /* [ */
        among_var = find_among_b(z, a_2, 8);    /* substring */
        if (!among_var) { z->c = z->l - try_mark; goto done; }
        z->bra = z->c;                          /* ] */
        switch (among_var) {
            case 1:
                {   int or_mark = z->l - z->c;  /* or */
                    if (!eq_s_b(z, 1, "\xC1" "")) {
                        z->c = z->l - or_mark;
                        if (!eq_s_b(z, 1, "\xD1" "")) {
                            /* the whole `try` failed: undo its cursor movement */
                            z->c = z->l - try_mark;
                            goto done;
                        }
                    }
                }
                slice_del(z);                   /* delete */
                break;
            case 2:
                slice_del(z);                   /* delete */
                break;
        }
    }
done:
    return 1;
}
/* reflexive (stem.sbl line 129 onwards): look up one of the two endings in
 * a_3 behind the cursor and delete it.
 * Returns 0 if find_among_b reports no match, otherwise 1.  z->ket and z->bra
 * are set around the matched ending.
 */
static int r_reflexive(struct SN_env * z) {
    int ending;

    z->ket = z->c;                              /* [ */
    ending = find_among_b(z, a_3, 2);           /* substring */
    if (ending == 0) return 0;
    z->bra = z->c;                              /* ] */

    if (ending == 1) {
        slice_del(z);                           /* delete */
    }
    return 1;
}
/* verb (stem.sbl line 137 onwards): look up a verb ending from a_4 behind the
 * cursor and delete it.
 * Code 1 endings are deleted only if the text before them is \xC1 ({a}) or,
 * failing that, \xD1 ({ia}); code 2 endings are deleted unconditionally.
 * Returns 0 when no ending matches or the {a}/{ia} condition fails, else 1.
 * z->ket and z->bra are set around the matched ending.
 */
static int r_verb(struct SN_env * z) {
    int ending;

    z->ket = z->c;                              /* [ */
    ending = find_among_b(z, a_4, 46);          /* substring */
    if (ending == 0) return 0;
    z->bra = z->c;                              /* ] */

    if (ending == 1) {
        int before_test = z->l - z->c;
        if (!eq_s_b(z, 1, "\xC1" "")) {
            /* first alternative failed: rewind and try the second one */
            z->c = z->l - before_test;
            if (!eq_s_b(z, 1, "\xD1" "")) return 0;
        }
        slice_del(z);                           /* delete */
    } else if (ending == 2) {
        slice_del(z);                           /* delete */
    }
    return 1;
}
/* noun (stem.sbl line 160 onwards): look up a noun ending from a_5 behind the
 * cursor and delete it.
 * Returns 0 if find_among_b reports no match, otherwise 1.  z->ket and z->bra
 * are set around the matched ending.
 */
static int r_noun(struct SN_env * z) {
    int ending;

    z->ket = z->c;                              /* [ */
    ending = find_among_b(z, a_5, 36);          /* substring */
    if (ending == 0) return 0;
    z->bra = z->c;                              /* ] */

    if (ending == 1) {
        slice_del(z);                           /* delete */
    }
    return 1;
}
/* derivational (stem.sbl line 176 onwards): look up one of the two endings in
 * a_6 behind the cursor and delete it, provided r_R2 succeeds with the cursor
 * at the start of the matched ending.
 * Returns 0 if nothing matches or the R2 test fails, otherwise 1.  z->ket and
 * z->bra are set around the matched ending even when the R2 test fails.
 */
static int r_derivational(struct SN_env * z) {
    int ending;

    z->ket = z->c;                              /* [ */
    ending = find_among_b(z, a_6, 2);           /* substring */
    if (ending == 0) return 0;
    z->bra = z->c;                              /* ] */

    if (!r_R2(z)) return 0;                     /* call R2 */

    if (ending == 1) {
        slice_del(z);                           /* delete */
    }
    return 1;
}
/* tidy_up (stem.sbl line 184 onwards): final clean-up driven by table a_7.
 *   code 1 (superlative ending): delete it, then require \xCE\xCE ({n}{n})
 *           before it and delete the last of the two {n};
 *   code 2 (final {n}): delete it only if another \xCE ({n}) precedes it;
 *   code 3 (final soft sign): delete it.
 * Returns 0 if no entry matches or a required {n} is missing, otherwise 1.
 * Deletions already performed are not undone when 0 is returned.
 */
static int r_tidy_up(struct SN_env * z) {
    int action;

    z->ket = z->c;                              /* [ */
    action = find_among_b(z, a_7, 4);           /* substring */
    if (action == 0) return 0;
    z->bra = z->c;                              /* ] */

    if (action == 1) {
        slice_del(z);                           /* delete */
        z->ket = z->c;                          /* [ */
        if (!eq_s_b(z, 1, "\xCE" "")) return 0;
        z->bra = z->c;                          /* ] */
        if (!eq_s_b(z, 1, "\xCE" "")) return 0;
        slice_del(z);                           /* delete */
    } else if (action == 2) {
        if (!eq_s_b(z, 1, "\xCE" "")) return 0;
        slice_del(z);                           /* delete */
    } else if (action == 3) {
        slice_del(z);                           /* delete */
    }
    return 1;
}
/* stem (stem.sbl line 201 onwards): public entry point of the stemmer.
 *
 * Steps, in order:
 *   1. r_mark_regions computes the marks pV (z->I[0]) and p2 (z->I[1]); the
 *      cursor is restored afterwards.
 *   2. The text is then processed backwards from the limit z->l, with the
 *      backward limit z->lb temporarily moved to pV (`setlimit tomark pV`).
 *   3. Either r_perfective_gerund succeeds, or: r_reflexive is tried, then
 *      the first of r_adjectival, r_verb, r_noun that succeeds is applied.
 *   4. A final \xC9 ({i}) is deleted if present.
 *   5. r_derivational and r_tidy_up are each attempted.
 *
 * Returns 0 if the cursor lies before pV when the limit is to be set (in that
 * case z->lb and z->c are left as modified so far); otherwise 1, with
 * z->lb restored and the cursor placed at z->lb.
 *
 * Cursor positions are saved as distances from z->l because slice_del may
 * change z->l.
 * NOTE(review): that rationale is inferred from the generated pattern;
 * slice_del is declared in header.h.
 *
 * Change from the generated original: three goto labels sat directly before a
 * closing brace, which is not valid C before C23.  The routine is now written
 * as structured code with the same order of calls and cursor restores, and
 * every saved mark has its own name; the original reused `m` in five nested
 * scopes.
 */
extern int russian_stem(struct SN_env * z) {
    {   int start = z->c;                       /* do */
        (void) r_mark_regions(z);               /* result ignored, as before */
        z->c = start;
    }

    z->lb = z->c; z->c = z->l;                  /* backwards */

    {   int limit_mark = z->l - z->c;           /* setlimit */
        int saved_lb;

        if (z->c < z->I[0]) return 0;
        z->c = z->I[0];                         /* tomark pV */
        saved_lb = z->lb; z->lb = z->c;
        z->c = z->l - limit_mark;

        {   int do_mark = z->l - z->c;          /* do */
            if (!r_perfective_gerund(z)) {      /* or */
                z->c = z->l - do_mark;

                {   int try_mark = z->l - z->c; /* try reflexive */
                    if (!r_reflexive(z)) z->c = z->l - try_mark;
                }

                {   int or_mark = z->l - z->c;  /* adjectival or verb or noun */
                    if (!r_adjectival(z)) {
                        z->c = z->l - or_mark;
                        if (!r_verb(z)) {
                            z->c = z->l - or_mark;
                            /* last alternative: success and failure both
                             * continue at the restore below */
                            (void) r_noun(z);
                        }
                    }
                }
            }
            z->c = z->l - do_mark;
        }

        {   int try_mark = z->l - z->c;         /* try([ '{i}' ] delete) */
            z->ket = z->c;                      /* [ */
            if (eq_s_b(z, 1, "\xC9" "")) {
                z->bra = z->c;                  /* ] */
                slice_del(z);                   /* delete */
            } else {
                z->c = z->l - try_mark;
            }
        }

        {   int do_mark = z->l - z->c;          /* do derivational */
            (void) r_derivational(z);
            z->c = z->l - do_mark;
        }

        {   int do_mark = z->l - z->c;          /* do tidy_up */
            (void) r_tidy_up(z);
            z->c = z->l - do_mark;
        }

        z->lb = saved_lb;
    }

    z->c = z->lb;
    return 1;
}
/* Create the environment used by russian_stem by calling
 * SN_create_env(0, 2, 0) and returning its result unchanged.
 * NOTE(review): SN_create_env is declared in header.h; the middle argument 2
 * presumably reserves the two integers z->I[0] and z->I[1] used by this
 * stemmer - confirm against the runtime.
 */
extern struct SN_env * russian_create_env(void)
{
    struct SN_env * env = SN_create_env(0, 2, 0);
    return env;
}
/* Hand the environment z to SN_close_env; counterpart of
 * russian_create_env. */
extern void russian_close_env(struct SN_env * z)
{
    SN_close_env(z);
}
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/russian/stem.h ===
/* Public interface of the Russian stemmer; the three functions are defined
 * in russianstem.c. */

/* Returns a new environment obtained from SN_create_env. */
extern struct SN_env * russian_create_env(void);
/* Passes z to SN_close_env. */
extern void russian_close_env(struct SN_env * z);
/* Runs the stemmer on the environment z; returns 1, or 0 on its early-exit
 * path. */
extern int russian_stem(struct SN_env * z);
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/russian/stem.sbl ===
stringescapes {}
// the 32 Cyrillic letters:
// Each name below can be written as {name} inside a string and stands for
// the single byte given in hex.
// NOTE(review): the byte values look like KOI8-R codes - confirm.
stringdef a hex 'C1'
stringdef b hex 'C2'
stringdef v hex 'D7'
stringdef g hex 'C7'
stringdef d hex 'C4'
stringdef e hex 'C5'
stringdef zh hex 'D6'
stringdef z hex 'DA'
stringdef i hex 'C9'
stringdef i` hex 'CA'
stringdef k hex 'CB'
stringdef l hex 'CC'
stringdef m hex 'CD'
stringdef n hex 'CE'
stringdef o hex 'CF'
stringdef p hex 'D0'
stringdef r hex 'D2'
stringdef s hex 'D3'
stringdef t hex 'D4'
stringdef u hex 'D5'
stringdef f hex 'C6'
stringdef kh hex 'C8'
stringdef ts hex 'C3'
stringdef ch hex 'DE'
stringdef sh hex 'DB'
stringdef shch hex 'DD'
stringdef " hex 'DF'
stringdef y hex 'D9'
stringdef ' hex 'D8'
stringdef e` hex 'DC'
stringdef iu hex 'C0'
stringdef ia hex 'D1'
// Internal routines; each is defined further down in this file.
routines ( mark_regions R2
perfective_gerund
adjective
adjectival
reflexive
verb
noun
derivational
tidy_up
)
// The only routine visible to callers.
externals ( stem )
// pV and p2 are the two position marks set by mark_regions.
integers ( pV p2 )
groupings ( v )
// v: the nine vowel letters.
define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}'
// mark_regions: pV and p2 both start at limit.  pV is then marked just past
// the first vowel; p2 is marked after a further non-vowel, vowel, non-vowel
// have each been passed.  The scans sit inside 'do', so if one of them fails
// the marks not yet set stay at limit.
define mark_regions as (
$pV = limit
$p2 = limit
do (
gopast v setmark pV gopast non-v
gopast v gopast non-v setmark p2
)
)
backwardmode (
// R2: succeeds when the cursor is at or after the mark p2.
define R2 as $p2 <= cursor
// perfective_gerund: delete a perfective gerund ending.  Endings in the first
// group are deleted only when preceded by {a} or {ia}; endings in the second
// group are deleted unconditionally.
define perfective_gerund as (
[substring] among (
'{v}'
'{v}{sh}{i}'
'{v}{sh}{i}{s}{'}'
('{a}' or '{ia}' delete)
'{i}{v}'
'{i}{v}{sh}{i}'
'{i}{v}{sh}{i}{s}{'}'
'{y}{v}'
'{y}{v}{sh}{i}'
'{y}{v}{sh}{i}{s}{'}'
(delete)
)
)
// adjective: delete one of the adjective endings listed below; fails if none
// of them matches.
define adjective as (
[substring] among (
'{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}'
'{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}'
'{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}'
'{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}'
'{ia}{ia}'
// and -
'{o}{iu}' // - which is somewhat archaic
'{e}{iu}' // - soft form of {o}{iu}
(delete)
)
)
// adjectival: an adjective ending, optionally preceded by a participle
// ending.  The participle step is wrapped in 'try', so adjectival succeeds
// whenever adjective does.
define adjectival as (
adjective
/* of the participle forms, em, vsh, ivsh, yvsh are readily removable.
nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of
errors. Removing im, uem, enn creates too many errors.
*/
try (
[substring] among (
'{e}{m}' // present passive participle
'{n}{n}' // adjective from past passive participle
'{v}{sh}' // past active participle
'{iu}{shch}' '{shch}' // present active participle
('{a}' or '{ia}' delete)
//but not '{i}{m}' '{u}{e}{m}' // present passive participle
//or '{e}{n}{n}' // adjective from past passive participle
'{i}{v}{sh}' '{y}{v}{sh}'// past active participle
'{u}{iu}{shch}' // present active participle
(delete)
)
)
)
// reflexive: delete a reflexive ending, {s}{ia} or {s}{'}.
define reflexive as (
[substring] among (
'{s}{ia}'
'{s}{'}'
(delete)
)
)
// verb: delete a verb ending.  Endings in the first group are deleted only
// when preceded by {a} or {ia}; endings in the second group are deleted
// unconditionally.
define verb as (
[substring] among (
'{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}'
'{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}'
'{n}{y}' '{t}{'}' '{e}{sh}{'}'
'{n}{n}{o}'
('{a}' or '{ia}' delete)
'{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}'
'{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}'
'{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}'
'{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}'
'{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}'
'{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}'
(delete)
/* note the short passive participle tests:
'{n}{a}' '{n}' '{n}{o}' '{n}{y}'
'{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}'
*/
)
)
// noun: delete one of the noun endings listed below; fails if none of them
// matches.
define noun as (
[substring] among (
'{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}'
'{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}'
'{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}'
'{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}'
'{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}'
'{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}'
(delete)
/* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}'
'{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{kh}'
omitted - they only occur on 12 words.
*/
)
)
// derivational: delete {o}{s}{t} or {o}{s}{t}{'}, but only when R2 holds
// with the cursor at the start of the matched ending.
define derivational as (
[substring] R2 among (
'{o}{s}{t}'
'{o}{s}{t}{'}'
(delete)
)
)
// tidy_up: final clean-up with three cases -
//   superlative ending: delete it, then require {n}{n} before it and delete
//   the last {n};
//   final {n}: delete it only when another {n} precedes it;
//   final {'}: delete it.
define tidy_up as (
[substring] among (
'{e}{i`}{sh}'
'{e}{i`}{sh}{e}' // superlative forms
(delete
['{n}'] '{n}' delete
)
'{n}'
('{n}' delete) // e.g. -nno endings
'{'}'
(delete) // with some slight false conflations
)
)
)
// stem: the externally visible routine.  After mark_regions, the word is
// processed backwards with the limit set at pV: first perfective_gerund, or
// else an optional reflexive ending followed by the first of adjectival,
// verb, noun that succeeds; then a final {i}, derivational and tidy_up.
define stem as (
do mark_regions
backwards setlimit tomark pV for (
do (
perfective_gerund or
( try reflexive
adjectival or verb or noun
)
)
try([ '{i}' ] delete)
// because noun ending -i{iu} is being treated as verb ending -{iu}
do derivational
do tidy_up
)
)
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/russian/stemmer.html === (645/745 lines abridged)
<HTML>
<HEAD><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Russian stemming algorithm</TITLE></HEAD>
<BODY BGCOLOR=WHITE>
<TABLE WIDTH=75% ALIGN=CENTER COLS=1>
<H1 ALIGN=CENTER>Russian stemming algorithm</H1>
<TR><TD>
<BR> <H2>Links to resources</H2>
<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="stem.sbl"> The stemmer in Snowball</A>
<TR><TD><A HREF="stem.c"> The ANSI C stemmer</A>
<TR><TD><A HREF="stem.h"> - and its header</A>
<TR><TD><A HREF="voc.txt"> Sample Russian vocabulary (codings as in the Snowball stemmer)</A>
<TR><TD><A HREF="output.txt"> Its stemmed equivalent</A>
<TR><TD><A HREF="diffs.txt"> Vocabulary + stemmed equivalent in pure ASCII</A>
<TR><TD><A HREF="tarball.tgz"> Tar-gzipped file of all of the above</A>
</TABLE></DL>
<BR><BR>
This page should display in Unicode. If you have problems, switch to a different
browser, update the browser you have, or, failing all else, download this page, edit out
the HTML tag
<FONT SIZE=-1><PRE>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
</PRE></FONT>
and view the modified page.<BR><BR>
The sample Russian vocabulary and its stemmed equivalent should be viewable in your
browser if you select the Cyrillic KOI8-R character set. You can get to this in Microsoft's
Internet Explorer via <B>View/Encoding/More</B>, and in Netscape via <B>View/Character Set</B>.
</TR>
<TR><TD BGCOLOR="lightpink">
<BR><BR>
Here is a sample of Russian vocabulary, with the stemmed forms that will
be generated with this algorithm.
<BR><BR>
<DL><DD><TABLE CELLPADDING=0>
[-=- -=- -=- 645 lines omitted -=- -=- -=-]
)
)
define derivational as (
[substring] R2 among (
'{o}{s}{t}'
'{o}{s}{t}{'}'
(delete)
)
)
define tidy_up as (
[substring] among (
'{e}{i`}{sh}'
'{e}{i`}{sh}{e}' // superlative forms
(delete
['{n}'] '{n}' delete
)
'{n}'
('{n}' delete) // e.g. -nno endings
'{'}'
(delete) // with some slight false conflations
)
)
)
define stem as (
do mark_regions
backwards setlimit tomark pV for (
do (
perfective_gerund or
( try reflexive
adjectival or verb or noun
)
)
try([ '{i}' ] delete)
// because noun ending -i{iu} is being treated as verb ending -{iu}
do derivational
do tidy_up
)
)
</DL>
</PRE></FONT>
</TR>
</TABLE>
</BODY>
</HTML>
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/russian/voc.txt ===
<Binary-ish file>