[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/q - api.c:1.1.2.1 api.h:1.1.2.1 debug.c:1.1.2.1 driver.c:1.1.2.1 german.c:1.1.2.1 german.h:1.1.2.1 header.h:1.1.2.1 make:1.1.2.1 stem.c:1.1.2.1 stem.h:1.1.2.1 test.c:1.1.2.1 use.html:1.1.2.1 utilities.c:1.1.2.1
Andreas Jung
andreas@digicool.com
Wed, 13 Feb 2002 11:41:34 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/q
In directory cvs.zope.org:/tmp/cvs-serv2695/q
Added Files:
Tag: ajung-textindexng-branch
api.c api.h debug.c driver.c german.c german.h header.h make
stem.c stem.h test.c use.html utilities.c
Log Message:
added
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/q/api.c ===
#include "header.h"
/* Allocate and zero-initialise a stemmer environment.

   S_size, I_size and B_size give the number of string, integer and
   boolean variables to reserve; a count of 0 leaves the corresponding
   array unallocated. The current string z->p and every z->S[i] are
   created with create_s().

   Returns the new environment, or 0 if any allocation fails (everything
   allocated so far is released first). */
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
{
    struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
    if (z == 0) return 0;

    z->p = create_s();
    if (z->p == 0) goto fail;

    if (S_size)
    {
        int i;
        z->S = (byte * *) calloc(S_size, sizeof(byte *));
        if (z->S == 0) goto fail;
        /* Record the size first so the failure path knows how far to walk. */
        z->S_size = S_size;
        for (i = 0; i < S_size; i++)
        {
            z->S[i] = create_s();
            if (z->S[i] == 0) goto fail;
        }
    }
    if (I_size)
    {
        z->I = (int *) calloc(I_size, sizeof(int));
        if (z->I == 0) goto fail;
        z->I_size = I_size;
    }
    if (B_size)
    {
        z->B = (byte *) calloc(B_size, sizeof(byte));
        if (z->B == 0) goto fail;
        z->B_size = B_size;
    }
    return z;

fail:
    /* Unwind whatever was allocated; untouched members are still zero
       from the initial calloc. */
    if (z->S)
    {
        int i;
        for (i = 0; i < z->S_size; i++)
        {
            if (z->S[i]) lose_s(z->S[i]);
        }
        free(z->S);
    }
    if (z->I) free(z->I);
    if (z->B) free(z->B);
    if (z->p) lose_s(z->p);
    free(z);
    return 0;
}
/* Release an environment obtained from SN_create_env().

   Frees the string variables, the integer and boolean arrays, the
   current string z->p (which the previous version leaked) and finally
   the environment itself. A null z is ignored. */
extern void SN_close_env(struct SN_env * z)
{
    if (z == 0) return;
    if (z->S_size)
    {
        int i;
        for (i = 0; i < z->S_size; i++) lose_s(z->S[i]);
        free(z->S);
    }
    if (z->I_size) free(z->I);
    if (z->B_size) free(z->B);
    /* z->p was allocated by create_s() in SN_create_env. */
    if (z->p) lose_s(z->p);
    free(z);
}
extern void SN_set_current(struct SN_env * z, int size, const char * s)
{
replace_s(z, 0, z->l, size, (byte *) s);
z->c = 0;
}
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/q/api.h ===
/* Run-time state of one stemmer instance. */
struct SN_env {
    unsigned char * p;      /* the current string (z->l bytes, not zero-terminated) */

    int c;                  /* cursor */
    int a;                  /* result of the last among lookup */
    int l;                  /* forward limit (end of the current string) */
    int lb;                 /* backward limit */
    int bra;                /* start of the slice */
    int ket;                /* end of the slice */

    int S_size;             /* number of entries in S */
    int I_size;             /* number of entries in I */
    int B_size;             /* number of entries in B */

    unsigned char * * S;    /* string variables */
    int * I;                /* integer variables */
    unsigned char * B;      /* boolean variables */
};
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
extern void SN_close_env(struct SN_env * z);
extern void SN_set_current(struct SN_env * z, int size, const char * s);
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/q/debug.c ===
/* Print the tag n, then the characters of z->p from index z->chead + HL
   up to (not including) LOF(z->p, z->chead), enclosed in angle brackets. */
static void debug(struct env * z, int n)
{
    int pos = z->chead + HL;

    printf("%d <", n);
    while (pos < LOF(z->p, z->chead))
    {
        printf("%c",z->p[pos]);
        pos++;
    }
    printf(">\n");
}
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/q/driver.c ===
#include <ctype.h> /* for isupper, tolower */
#include <limits.h> /* for INT_MAX */
#include <stdio.h>
#include <stdlib.h> /* for malloc, realloc, free, exit */
#include <string.h> /* for strlen, memcmp */
#include "api.h"
#include "stem.h"
/* Number of times each word is stemmed (set with -r[epetitions]). */
static int repetitions = 1;
/* Running total of calls made to stem(). */
static int stem_count;
/* Read f_in one line at a time, lower-case each line, stem it with
   stem() (`repetitions` times, counting every call in stem_count) and
   write the resulting current string of z to f_out followed by a
   newline. A final line without a trailing newline is processed too.

   Exits with status 1 if the line buffer cannot be allocated. */
void stemfile(struct SN_env * z, FILE * f_in, FILE * f_out)
{
    enum { INC = 10 };          /* growth step of the line buffer */
    int lim = INC;
    char * b = (char *) malloc(lim);

    if (b == 0) { fprintf(stderr, "out of memory\n"); exit(1); }

    while (1)
    {
        int ch = getc(f_in);
        int i = 0;
        int j;

        if (ch == EOF) break;

        while (ch != '\n' && ch != EOF)
        {
            if (i == lim)
            {
                char * q = (char *) realloc(b, lim + INC);
                if (q == 0)
                {
                    free(b);
                    fprintf(stderr, "out of memory\n");
                    exit(1);
                }
                b = q;
                lim += INC;
            }
            /* force lower case: */
            if (isupper(ch)) ch = tolower(ch);
            b[i++] = (char) ch;
            ch = getc(f_in);
        }

        for (j = 1; j <= repetitions; j++)
        {
            SN_set_current(z, i, b);
            stem(z); stem_count++;
        }

        /* The current string is not zero-terminated; terminate it for %s. */
        z->p[z->l] = 0;
        fprintf(f_out, "%s%c", (char *) z->p, '\n');
    }
    free(b);
}
/* Convert the decimal digit string s to an int.

   An empty string yields 0. If s contains a non-digit, or the value does
   not fit in an int, a message is written to stderr and the program
   exits with status 1. */
static int intof(char * s)
{
    int n = 0;
    const char * t;

    for (t = s; *t != '\0'; t++)
    {
        int d = *t - '0';
        if (d < 0 || d > 9) { fprintf(stderr, "%s not a number\n", s); exit(1); }
        /* 10*n + d would overflow exactly when n > (INT_MAX - d) / 10 */
        if (n > (INT_MAX - d) / 10) { fprintf(stderr, "%s is too large\n", s); exit(1); }
        n = 10*n + d;
    }
    return n;
}
/* Return 1 if the strings s1 and s2 are identical, 0 otherwise. */
static int eq(char * s1, char * s2)
{
    int len1 = strlen(s1);
    int len2 = strlen(s2);

    if (len1 != len2) return 0;
    return memcmp(s1, s2, len1) == 0;
}
int main(int argc, char * argv[])
{ char * in;
char * out = 0;
if (argc == 1)
{ printf("options are: file [-o[utput] file] [-r[epetitions] number]\n");
exit(1);
}
if (argc % 2 == 1)
{ printf("number of options must be odd\n");
exit(1);
}
{ char * s;
int i = 1;
while(1)
{ if (i >= argc) break;
s = argv[i++];
if (s[0] == '-')
{ if (eq(s, "-output") || eq(s, "-o")) out = argv[i++]; else
if (eq(s, "-repetitions") || eq(s, "-r")) repetitions = intof(argv[i++]); else
{ fprintf(stderr, "%s unknown\n", s); exit(1);
}
}
else in = s;
}
}
/* initialise the stemming process: */
{ struct SN_env * z = create_env();
FILE * f_in;
FILE * f_out;
f_in = fopen(in, "r");
if (f_in == 0) { fprintf(stderr, "file %s not found\n", in); exit(1); }
f_out = out == 0 ? stdout : fopen(out, "w");
if (f_out == 0) { fprintf(stderr, "file %s cannot be opened\n", out); exit(1); }
stemfile(z, f_in, f_out);
close_env(z);
}
printf("%d calls to stem\n", stem_count);
return 0;
}
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/q/german.c ===
#include "header.h"
extern int stem(struct SN_env * z);
static int r_standard_suffix(struct SN_env * z);
static int r_R2(struct SN_env * z);
static int r_R1(struct SN_env * z);
static int r_mark_regions(struct SN_env * z);
static int r_postlude(struct SN_env * z);
static int r_prelude(struct SN_env * z);
/* Lookup tables for the German stemmer. Each entry is
   { length, string, index of longest matching substring entry or -1,
   result }; the trailing function pointer is left zero.

   NOTE(review): the bytes \x81, \x84, \x94 (and \xE1 in r_prelude) look
   like u-umlaut, a-umlaut, o-umlaut and sharp s in a DOS code page -
   confirm the expected input encoding. */

/* a_0: searched forwards by r_postlude; results select the replacement
   letter there. */
static struct among a_0[5] =
{
/* 0 */ { 1, (byte *)"U", -1, 2},
/* 1 */ { 1, (byte *)"Y", -1, 1},
/* 2 */ { 1, (byte *)"\x81" "", -1, 5},
/* 3 */ { 1, (byte *)"\x84" "", -1, 3},
/* 4 */ { 1, (byte *)"\x94" "", -1, 4}
};
/* a_1: first suffix group of r_standard_suffix, searched backwards, so
   the entries are ordered by their reversed spelling. */
static struct among a_1[7] =
{
/* 0 */ { 1, (byte *)"e", -1, 1},
/* 1 */ { 2, (byte *)"em", -1, 1},
/* 2 */ { 2, (byte *)"en", -1, 1},
/* 3 */ { 3, (byte *)"ern", -1, 1},
/* 4 */ { 2, (byte *)"er", -1, 1},
/* 5 */ { 1, (byte *)"s", -1, 2},
/* 6 */ { 2, (byte *)"es", 5, 1}
};
/* a_2: second suffix group of r_standard_suffix (searched backwards). */
static struct among a_2[4] =
{
/* 0 */ { 2, (byte *)"en", -1, 1},
/* 1 */ { 2, (byte *)"er", -1, 1},
/* 2 */ { 2, (byte *)"st", -1, 2},
/* 3 */ { 3, (byte *)"est", 2, 1}
};
/* a_3: suffixes tried after a "keit" deletion in r_standard_suffix. */
static struct among a_3[2] =
{
/* 0 */ { 2, (byte *)"ig", -1, 1},
/* 1 */ { 4, (byte *)"lich", -1, 1}
};
/* a_4: third suffix group of r_standard_suffix (searched backwards). */
static struct among a_4[8] =
{
/* 0 */ { 3, (byte *)"end", -1, 1},
/* 1 */ { 2, (byte *)"ig", -1, 2},
/* 2 */ { 3, (byte *)"ung", -1, 1},
/* 3 */ { 4, (byte *)"lich", -1, 3},
/* 4 */ { 4, (byte *)"isch", -1, 2},
/* 5 */ { 2, (byte *)"ik", -1, 2},
/* 6 */ { 4, (byte *)"heit", -1, 3},
/* 7 */ { 4, (byte *)"keit", -1, 4}
};
/* Grouping bitmaps: bit (ch - min) is set when ch belongs to the group.
   g_v is used with min 97: a e i o u y \x81 \x84 \x94. */
static byte g_v[] = { 17, 65, 16, 1, 9, 0, 8 };
/* Used with min 98: b d f g h k l m n r t. */
static byte g_s_ending[] = { 117, 30, 5 };
/* Used with min 98: b d f g h k l m n t. */
static byte g_st_ending[] = { 117, 30, 4 };
/* Prelude: two forward passes over the current string.

   Pass 1 (a "test", so the cursor is restored afterwards): at every
   position, a byte \xE1 is replaced by "ss"; otherwise the cursor moves
   on one character, until the end of the string.

   Pass 2: repeatedly look for a g_v character followed by 'u' or 'y'
   and another g_v character, and replace that 'u'/'y' by 'U'/'Y'.
   (r_postlude turns them back into lower case.)

   Always returns 1. */
static int r_prelude(struct SN_env * z) {
{ int c_test = z->c; /* test, line 30 */
while(1) { /* repeat, line 30 */
int c = z->c;
{ int c = z->c; /* or, line 33 */
z->bra = z->c; /* [, line 32 */
if (!(eq_s(z, 1, "\xE1" ""))) goto lab2;
z->ket = z->c; /* ], line 32 */
slice_from_s(z, 2, "ss"); /* <-, line 32 */
goto lab1;
lab2:
z->c = c;
/* no match here: step one character, or stop at the end of the string */
if (z->c >= z->l) goto lab0;
z->c++; /* next, line 33 */
}
lab1:
continue;
lab0:
z->c = c;
break;
}
z->c = c_test;
}
while(1) { /* repeat, line 36 */
int c = z->c;
while(1) { /* goto, line 36 */
int c = z->c;
if (!(in_grouping(z, g_v, 97, 148))) goto lab4;
z->bra = z->c; /* [, line 37 */
{ int c = z->c; /* or, line 37 */
if (!(eq_s(z, 1, "u"))) goto lab6;
z->ket = z->c; /* ], line 37 */
if (!(in_grouping(z, g_v, 97, 148))) goto lab6;
slice_from_s(z, 1, "U"); /* <-, line 37 */
goto lab5;
lab6:
z->c = c;
if (!(eq_s(z, 1, "y"))) goto lab4;
z->ket = z->c; /* ], line 38 */
if (!(in_grouping(z, g_v, 97, 148))) goto lab4;
slice_from_s(z, 1, "Y"); /* <-, line 38 */
}
lab5:
/* a replacement was made: return to where this goto iteration started */
z->c = c;
break;
lab4:
z->c = c;
if (z->c >= z->l) goto lab3;
z->c++;
}
continue;
lab3:
z->c = c;
break;
}
return 1;
}
/* Compute the region marks z->I[0] (p1) and z->I[1] (p2).

   Both start at z->l. Scanning forward from the cursor, p1 is set just
   after the first character outside g_v that follows a character in
   g_v, and is raised to 3 if smaller. p2 is found the same way, starting
   from where the scan for p1 stopped.

   Returns 1 when both marks were set, 0 if the end of the string was
   reached first (marks not yet assigned keep the value z->l). */
static int r_mark_regions(struct SN_env * z) {
z->I[0] = z->l;
z->I[1] = z->l;
while(1) { /* gopast, line 47 */
if (!(in_grouping(z, g_v, 97, 148))) goto lab0;
break;
lab0:
if (z->c >= z->l) return 0;
z->c++;
}
while(1) { /* gopast, line 47 */
if (!(out_grouping(z, g_v, 97, 148))) goto lab1;
break;
lab1:
if (z->c >= z->l) return 0;
z->c++;
}
z->I[0] = z->c; /* setmark p1, line 47 */
/* try, line 48 */
/* p1 is never allowed to be less than 3 */
if (!(z->I[0] < 3)) goto lab2;
z->I[0] = 3;
lab2:
while(1) { /* gopast, line 49 */
if (!(in_grouping(z, g_v, 97, 148))) goto lab3;
break;
lab3:
if (z->c >= z->l) return 0;
z->c++;
}
while(1) { /* gopast, line 49 */
if (!(out_grouping(z, g_v, 97, 148))) goto lab4;
break;
lab4:
if (z->c >= z->l) return 0;
z->c++;
}
z->I[1] = z->c; /* setmark p2, line 49 */
return 1;
}
/* Postlude: scan forward from the cursor and, wherever an entry of a_0
   matches, replace it: "Y" -> "y", "U" -> "u", \x84 -> "a", \x94 -> "o",
   \x81 -> "u". Where nothing matches the cursor advances one character.
   The loop ends at the end of the string. Always returns 1. */
static int r_postlude(struct SN_env * z) {
while(1) { /* repeat, line 53 */
int c = z->c;
z->bra = z->c; /* [, line 55 */
z->a = find_among(z, a_0, 5); /* substring, line 55 */
z->ket = z->c; /* ], line 55 */
{ int c = z->c; /* or, line 61 */
switch(z->a) {
/* 0 means no entry of a_0 matched at this position */
case 0: goto lab2;
case 1:
slice_from_s(z, 1, "y"); /* <-, line 56 */
break;
case 2:
slice_from_s(z, 1, "u"); /* <-, line 57 */
break;
case 3:
slice_from_s(z, 1, "a"); /* <-, line 58 */
break;
case 4:
slice_from_s(z, 1, "o"); /* <-, line 59 */
break;
case 5:
slice_from_s(z, 1, "u"); /* <-, line 60 */
break;
}
goto lab1;
lab2:
z->c = c;
if (z->c >= z->l) goto lab0;
z->c++; /* next, line 61 */
}
lab1:
continue;
lab0:
z->c = c;
break;
}
return 1;
}
/* Return 1 if the cursor is at or beyond the mark p1 (z->I[0]), else 0. */
static int r_R1(struct SN_env * z) {
    return z->I[0] <= z->c;
}
/* Return 1 if the cursor is at or beyond the mark p2 (z->I[1]), else 0. */
static int r_R2(struct SN_env * z) {
    return z->I[1] <= z->c;
}
/* Remove standard suffixes, working backwards from the cursor.

   Three independent "do" steps are run; each restores the cursor
   (relative to the end of the string) when it finishes:
     1. a_1 suffixes, which must lie in R1: deleted outright, except "s",
        which is deleted only when preceded by a g_s_ending character;
     2. a_2 suffixes in R1: "st" additionally needs a preceding
        g_st_ending character and at least three characters before that;
     3. a_4 suffixes in R2, each followed by an optional secondary
        deletion ("ig", "er"/"en", or an a_3 suffix).

   The labels lab3, lab6 and lab9 end a block; each is followed by a null
   statement because a label must be attached to a statement.

   Always returns 1. */
static int r_standard_suffix(struct SN_env * z) {
    {   int m = z->l - z->c; /* do, line 71 */
        z->ket = z->c; /* [, line 72 */
        z->a = find_among_b(z, a_1, 7); /* substring, line 72 */
        z->bra = z->c; /* ], line 72 */
        if (!r_R1(z)) goto lab0; /* call R1, line 72 */
        switch(z->a) {
            case 0: goto lab0;
            case 1:
                slice_del(z); /* delete, line 74 */
                break;
            case 2:
                if (!(in_grouping_b(z, g_s_ending, 98, 116))) goto lab0;
                slice_del(z); /* delete, line 77 */
                break;
        }
    lab0:
        z->c = z->l - m;
    }
    {   int m = z->l - z->c; /* do, line 81 */
        z->ket = z->c; /* [, line 82 */
        z->a = find_among_b(z, a_2, 4); /* substring, line 82 */
        z->bra = z->c; /* ], line 82 */
        if (!r_R1(z)) goto lab1; /* call R1, line 82 */
        switch(z->a) {
            case 0: goto lab1;
            case 1:
                slice_del(z); /* delete, line 84 */
                break;
            case 2:
                if (!(in_grouping_b(z, g_st_ending, 98, 116))) goto lab1;
                {   int c = z->c - 3;
                    if (z->lb > c || c > z->l) goto lab1;
                    z->c = c; /* hop, line 87 */
                }
                slice_del(z); /* delete, line 87 */
                break;
        }
    lab1:
        z->c = z->l - m;
    }
    {   int m = z->l - z->c; /* do, line 91 */
        z->ket = z->c; /* [, line 92 */
        z->a = find_among_b(z, a_4, 8); /* substring, line 92 */
        z->bra = z->c; /* ], line 92 */
        if (!r_R2(z)) goto lab2; /* call R2, line 92 */
        switch(z->a) {
            case 0: goto lab2;
            case 1:
                slice_del(z); /* delete, line 94 */
                {   int m = z->l - z->c; /* try, line 95 */
                    z->ket = z->c; /* [, line 95 */
                    if (!(eq_s_b(z, 2, "ig"))) { z->c = z->l - m; goto lab3; }
                    z->bra = z->c; /* ], line 95 */
                    {   int m = z->l - z->c; /* not, line 95 */
                        if (!(eq_s_b(z, 1, "e"))) goto lab4;
                        { z->c = z->l - m; goto lab3; }
                    lab4:
                        z->c = z->l - m;
                    }
                    if (!r_R2(z)) { z->c = z->l - m; goto lab3; } /* call R2, line 95 */
                    slice_del(z); /* delete, line 95 */
                lab3:
                    ;
                }
                break;
            case 2:
                {   int m = z->l - z->c; /* not, line 98 */
                    if (!(eq_s_b(z, 1, "e"))) goto lab5;
                    goto lab2;
                lab5:
                    z->c = z->l - m;
                }
                slice_del(z); /* delete, line 98 */
                break;
            case 3:
                slice_del(z); /* delete, line 101 */
                {   int m = z->l - z->c; /* try, line 102 */
                    z->ket = z->c; /* [, line 103 */
                    {   int m = z->l - z->c; /* or, line 103 */
                        if (!(eq_s_b(z, 2, "er"))) goto lab8;
                        goto lab7;
                    lab8:
                        z->c = z->l - m;
                        if (!(eq_s_b(z, 2, "en"))) { z->c = z->l - m; goto lab6; }
                    }
                lab7:
                    z->bra = z->c; /* ], line 103 */
                    if (!r_R1(z)) { z->c = z->l - m; goto lab6; } /* call R1, line 103 */
                    slice_del(z); /* delete, line 103 */
                lab6:
                    ;
                }
                break;
            case 4:
                slice_del(z); /* delete, line 107 */
                {   int m = z->l - z->c; /* try, line 108 */
                    z->ket = z->c; /* [, line 109 */
                    z->a = find_among_b(z, a_3, 2); /* substring, line 109 */
                    z->bra = z->c; /* ], line 109 */
                    if (!r_R2(z)) { z->c = z->l - m; goto lab9; } /* call R2, line 109 */
                    switch(z->a) {
                        case 0: { z->c = z->l - m; goto lab9; }
                        case 1:
                            slice_del(z); /* delete, line 111 */
                            break;
                    }
                lab9:
                    ;
                }
                break;
        }
    lab2:
        z->c = z->l - m;
    }
    return 1;
}
/* Stem the current string of z in place.

   Runs the prelude and the region marking forwards, the suffix removal
   backwards from the end of the string, and finally the postlude
   forwards again. Each phase is a "do": its result is ignored and the
   cursor is restored afterwards. Always returns 1. */
extern int stem(struct SN_env * z) {
    int saved;

    saved = z->c;
    r_prelude(z);                   /* do, line 122 */
    z->c = saved;

    saved = z->c;
    r_mark_regions(z);              /* do, line 123 */
    z->c = saved;

    /* backwards, line 124: the cursor becomes the backward limit and
       processing restarts from the end of the string. */
    z->lb = z->c;
    z->c = z->l;
    {
        int from_end = z->l - z->c;
        r_standard_suffix(z);       /* do, line 125 */
        z->c = z->l - from_end;
    }

    /* forwards again from the backward limit */
    z->c = z->lb;
    saved = z->c;
    r_postlude(z);                  /* do, line 126 */
    z->c = saved;

    return 1;
}
/* Create the environment for this stemmer: no string variables, two
   integer variables (I[0] and I[1]) and no boolean variables. */
extern struct SN_env * create_env(void)
{
    struct SN_env * env = SN_create_env(0, 2, 0);
    return env;
}
/* Release an environment obtained from create_env(). */
extern void close_env(struct SN_env * z)
{
    SN_close_env(z);
    return;
}
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/q/german.h ===
extern struct SN_env * create_env(void);
extern void close_env(struct SN_env * z);
extern int stem(struct SN_env * z);
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/q/header.h ===
#include <limits.h>
#include "api.h"
#define MAXINT INT_MAX
#define MININT INT_MIN
/* Every string buffer is preceded by a two-int header: its capacity at
   index -2 and its current size at index -1, both viewed as int relative
   to the data pointer. HEAD is the header size in bytes.

   All expansions are fully parenthesised so the macros can be used
   safely inside larger expressions. SIZE and CAPACITY remain usable as
   assignment targets. */
#define HEAD (2*sizeof(int))
#define SIZE(p) (((int *)(p))[-1])
#define SET_SIZE(p, n) (((int *)(p))[-1] = (n))
#define CAPACITY(p) (((int *)(p))[-2])
/* The unit in which strings are stored. */
typedef unsigned char byte;

/* One entry of an "among" lookup table. */
struct among
{
    /* number of chars in string */
    int s_size;
    /* search string */
    byte * s;
    /* index to longest matching substring */
    int substring_i;
    /* result of the lookup */
    int result;
    /* optional routine called when the entry matches; the entry is only
       accepted if it returns non-zero (0 = no routine) */
    int (* function)(struct SN_env *);
};
extern byte * create_s(void);
extern void lose_s(byte * p);
extern int in_grouping(struct SN_env * z, byte * s, int min, int max);
extern int in_grouping_b(struct SN_env * z, char * s, int min, int max);
extern int out_grouping(struct SN_env * z, byte * s, int min, int max);
extern int out_grouping_b(struct SN_env * z, char * s, int min, int max);
extern int in_range(struct SN_env * z, int min, int max);
extern int in_range_b(struct SN_env * z, int min, int max);
extern int out_range(struct SN_env * z, int min, int max);
extern int out_range_b(struct SN_env * z, int min, int max);
extern int eq_s(struct SN_env * z, int s_size, char * s);
extern int eq_s_b(struct SN_env * z, int s_size, char * s);
extern int eq_v(struct SN_env * z, byte * p);
extern int eq_v_b(struct SN_env * z, byte * p);
extern int find_among(struct SN_env * z, struct among * v, int v_size);
extern int find_among_b(struct SN_env * z, struct among * v, int v_size);
extern byte * increase_size(byte * p, int n);
extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const byte * s);
extern void slice_from_s(struct SN_env * z, int s_size, char * s);
extern void slice_from_v(struct SN_env * z, byte * p);
extern void slice_del(struct SN_env * z);
extern void insert_s(struct SN_env * z, int bra, int ket, int s_size, char * s);
extern void insert_v(struct SN_env * z, int bra, int ket, byte * p);
extern byte * slice_to(struct SN_env * z, byte * p);
extern byte * assign_to(struct SN_env * z, byte * p);
extern void debug(struct SN_env * z, int number, int line_count);
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/q/make ===
# Compile the run-time support modules to object files.
# The paths are relative to the directory that contains q/.
gcc -O4 -c -o q/utilities.o q/utilities.c
gcc -O4 -c -o q/api.o q/api.c
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/q/stem.c === (404/504 lines abridged)
#include "header.h"
extern int stem(struct SN_env * z);
static int r_Step_5b(struct SN_env * z);
static int r_Step_5a(struct SN_env * z);
static int r_Step_4(struct SN_env * z);
static int r_Step_3(struct SN_env * z);
static int r_Step_2(struct SN_env * z);
static int r_Step_1c(struct SN_env * z);
static int r_Step_1b(struct SN_env * z);
static int r_Step_1a(struct SN_env * z);
static int r_R2(struct SN_env * z);
static int r_R1(struct SN_env * z);
static int r_shortv(struct SN_env * z);
/* Among tables. Each entry is { length, string, index of the longest
   entry that is a substring of this one (or -1), result }; the trailing
   function pointer is left zero. */

/* a_0: "s" and the longer endings "ies", "sses" and "ss", each of which
   falls back to entry 0 ("s"). */
static struct among a_0[4] =
{
/* 0 */ { 1, (byte *)"s", -1, 3},
/* 1 */ { 3, (byte *)"ies", 0, 2},
/* 2 */ { 4, (byte *)"sses", 0, 1},
/* 3 */ { 2, (byte *)"ss", 0, -1}
};
/* a_1: the empty string (entry 0, result 3) plus two-letter endings that
   all fall back to it; doubled consonants give result 2, "bl", "at" and
   "iz" give result 1. */
static struct among a_1[13] =
{
/* 0 */ { 0, (byte *)"", -1, 3},
/* 1 */ { 2, (byte *)"bb", 0, 2},
/* 2 */ { 2, (byte *)"dd", 0, 2},
/* 3 */ { 2, (byte *)"ff", 0, 2},
/* 4 */ { 2, (byte *)"gg", 0, 2},
/* 5 */ { 2, (byte *)"bl", 0, 1},
/* 6 */ { 2, (byte *)"mm", 0, 2},
/* 7 */ { 2, (byte *)"nn", 0, 2},
/* 8 */ { 2, (byte *)"pp", 0, 2},
/* 9 */ { 2, (byte *)"rr", 0, 2},
/* 10 */ { 2, (byte *)"at", 0, 1},
/* 11 */ { 2, (byte *)"tt", 0, 2},
/* 12 */ { 2, (byte *)"iz", 0, 1}
};
/* a_2: "ed" and "ing" (result 2) and "eed" (result 1, falls back to "ed"). */
static struct among a_2[3] =
{
/* 0 */ { 2, (byte *)"ed", -1, 2},
/* 1 */ { 3, (byte *)"eed", 0, 1},
/* 2 */ { 3, (byte *)"ing", -1, 2}
};
static struct among a_3[20] =
{
[-=- -=- -=- 404 lines omitted -=- -=- -=-]
lab13:
z->c = z->l - m;
}
{ int m = z->l - z->c; /* do, line 132 */
if (!r_Step_4(z)) goto lab14; /* call Step_4, line 132 */
lab14:
z->c = z->l - m;
}
{ int m = z->l - z->c; /* do, line 133 */
if (!r_Step_5a(z)) goto lab15; /* call Step_5a, line 133 */
lab15:
z->c = z->l - m;
}
{ int m = z->l - z->c; /* do, line 134 */
if (!r_Step_5b(z)) goto lab16; /* call Step_5b, line 134 */
lab16:
z->c = z->l - m;
}
z->c = z->lb; { int c = z->c; /* do, line 137 */
if (!(z->B[0])) goto lab17; /* Boolean test Y_found, line 137 */
while(1) { /* repeat, line 137 */
int c = z->c;
while(1) { /* goto, line 137 */
int c = z->c;
z->bra = z->c; /* [, line 137 */
if (!(eq_s(z, 1, "Y"))) goto lab19;
z->ket = z->c; /* ], line 137 */
z->c = c;
break;
lab19:
z->c = c;
if (z->c >= z->l) goto lab18;
z->c++;
}
slice_from_s(z, 1, "y"); /* <-, line 137 */
continue;
lab18:
z->c = c;
break;
}
lab17:
z->c = c;
}
return 1;
}
/* Create the environment for this stemmer: no string variables, two
   integer variables and one boolean variable (B[0]). */
extern struct SN_env * create_env(void)
{
    struct SN_env * env = SN_create_env(0, 2, 1);
    return env;
}
/* Release an environment obtained from create_env(). */
extern void close_env(struct SN_env * z)
{
    SN_close_env(z);
    return;
}
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/q/stem.h ===
extern struct SN_env * create_env(void);
extern void close_env(struct SN_env * z);
extern int stem(struct SN_env * z);
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/q/test.c ===
#include "api.h"
/* Placeholder test program: does nothing and reports success.
   main must return int; `void main` is not a portable signature. */
int main(int argc,char **argv) {
    (void) argc;
    (void) argv;
    return 0;
}
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/q/use.html ===
<HTML>
<HEAD>
<TITLE>Using Snowball</TITLE></HEAD>
<BODY BGCOLOR=WHITE>
<TABLE WIDTH=75% ALIGN=CENTER COLS=1>
<H1 ALIGN=CENTER>Using Snowball</H1>
<TR><TD BGCOLOR="wheat">
<BR> <H2>Links to resources</H2>
<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="http://snowball.sourceforge.net"> Snowball main page</A>
<TR><TD><A HREF="../p/snowman.html"> Snowball manual</A>
</TABLE></DL>
</TR>
<TR><TD>
<BR><BR>
<BR> <H2>Compiling and running Snowball</H2>
When you download Snowball, it already has in place a make file that you can
call to build it. But in any case, Snowball has a very simple structure,
comprising the traditional tokeniser, syntax analyser and code generator modules,
with two extra modules for space management and an internal merge sort,
and a small driver module, all
sharing a common header file. If you put these sources into a directory
<TT>p/</TT>, you can compile Snowball at once (Linux or Unix) with
<BR><PRE>
gcc -O -o Snowball p/*.c
</PRE>
Snowball can then be called up with the following syntax,
<BR><PRE>
F1 [-o[utput] F2] [-s[yntax]] [-ep[refix] S1] [-vp[refix] S2]
</PRE>
For example,
<BR><PRE>
./Snowball danish/stem -o q/danish
./Snowball danish/stem -syntax
./Snowball danish/stem -output q/danish -ep danish_
</PRE>
The first argument, <TT>F1</TT>, is the name of the Snowball file to be compiled. It produces
two outputs, an ANSI C module in <TT>F2.c</TT> and a corresponding header file in <TT>F2.h</TT>.
In the absence of the <TT>-eprefix</TT> and <TT>-vprefix</TT> options, the list of declared externals in
the Snowball program, for example,
<BR><PRE>
externals ( stem_1 stem_2 moderate )
</PRE>
gives rise to a header file containing,
<BR><PRE>
extern struct SN_env * create_env(void);
extern void close_env(struct SN_env * z);
extern int moderate(struct SN_env * z);
extern int stem_2(struct SN_env * z);
extern int stem_1(struct SN_env * z);
</PRE>
If <TT>-eprefix</TT> is used, its string, <TT>S1</TT>, is prefixed to each external name, for
example
<BR><PRE>
-eprefix Khotanese_
</PRE>
would give rise to the header file,
<BR><PRE>
extern struct SN_env * Khotanese_create_env(void);
extern void Khotanese_close_env(struct SN_env * z);
extern int Khotanese_moderate(struct SN_env * z);
extern int Khotanese_stem_2(struct SN_env * z);
extern int Khotanese_stem_1(struct SN_env * z);
</PRE>
If <TT>-vprefix</TT> is used, all Snowball strings, integers and booleans give rise to a
<TT>#define</TT> line in the header file. For example
<BR><PRE>
-eprefix Khotanese_ -vprefix Khotanese_variable
</PRE>
would give rise to the header file,
<BR><PRE>
extern struct SN_env * Khotanese_create_env(void);
extern void Khotanese_close_env(struct SN_env * z);
#define Khotanese_variable_ch (S[0])
#define Khotanese_variable_Y_found (B[0])
#define Khotanese_variable_p2 (I[1])
#define Khotanese_variable_p1 (I[0])
extern int Khotanese_stem(struct SN_env * z);
</PRE>
If <TT>-syntax</TT> is used the other options are ignored, and the syntax tree of the Snowball
program is directed to <TT>stdout</TT>. This can be a handy way of checking that you have got
the bracketing right in the program you have written.
<BR> <H2>The Snowball API</H2>
To access Snowball from C, include the header <TT>api.h</TT>, and any headers generated from the
Snowball scripts you wish to use. <TT>api.h</TT> declares
<BR><PRE>
struct SN_env { ... };
extern void SN_set_current(struct SN_env * z, int size, const char * s);
</PRE>
Continuing the previous example, you set up an environment to call the resources of the
Khotanese module with
<BR><PRE>
struct SN_env * z;
z = Khotanese_create_env();
</PRE>
Snowball has the concept of a ‘current string’.
This can be set up by,
<BR><PRE>
SN_set_current(z, i, b);
</PRE>
This defines the current string as the <TT>i</TT> bytes of data starting at address <TT>b</TT>.
The externals can then be called,
<BR><PRE>
Khotanese_moderate(z);
...
Khotanese_stem_1(z);
</PRE>
They give a 1 or 0 result, corresponding to the <B><I>t</I></B> or <B><I>f</I></B> result of the Snowball
routine.
<BR><BR>
And later,
<BR><PRE>
Khotanese_close_env(z);
</PRE>
To release the space raised by z back to the system. You can do this for a number of
Snowball modules at the same time: you will need a separate
<TT>struct SN_env * z;</TT> for each module.
The current string is given by the <TT>z->l</TT> bytes of data starting at <TT>z->p</TT>.
The string is not zero-terminated, but you can zero terminate it yourself with
<BR><PRE>
z->p[z->l] = 0;
</PRE>
(There is always room for this last zero byte.) For example,
<BR><PRE>
SN_set_current(z, strlen(s), s);
Khotanese_stem_1(z);
z->p[z->l] = 0;
printf("Khotanese-1 stems '%s' to '%s'\n", s, z->p);
</PRE>
The values of the other variables can be accessed via the <TT>#define</TT> settings that result
from the <TT>-vprefix</TT> option, although this should not usually be necessary:
<BR><PRE>
printf("p1 is %d\n", z->Khotanese_variable_p1);
</PRE>
The stemming scripts on this Web site use Snowball very simply. <TT>-vprefix</TT> is left unset, and
<TT>-eprefix</TT>
is set to the name of the script (usually the language the script is for). All the programs are tested through a common
<A HREF="driver.c">driver</A>
program.
<BR> <H2>Debugging</H2>
In the rare event that your Snowball script does not run perfectly the first time:
<BR><BR>
Remember that the option <TT>-syntax</TT> prints out the syntax tree. A question mark can be
included in Snowball as a command, and it will cause the current string to be sent to
<TT>stdout</TT>, with square brackets marking the slice and vertical bar the position of <B><I>c</I></B>.
Curly brackets mark the end-limits of the string, which may be less than the whole
string because of the action of <TT>setlimit</TT>.
<BR><BR>
At present there is no way of reporting the value of an integer or boolean.
<BR><BR>
If desperate, you can put debugging lines into the generated C program. This is not so
hard, since running comments show the correspondence with the Snowball source.
<BR><BR>
<BR> <H2>Compiler bugs</H2>
There must be a few compiler bugs in such a young language. If you hit one, try to
capture it in a small script before notifying us.
<BR> <H2>Known problems in Snowball</H2>
The main one is that it is possible to ‘pull the rug from under your own feet’ in
constructions like this:
<BR><PRE>
[ do something ]
do something else
( C1 delete C2 ) or ( C3 )
</PRE>
Suppose <TT>C1</TT> gives <B><I>t</I></B>, the delete removes the slice established on the first
line, and <TT>C2</TT> gives <B><I>f</I></B>, so C3 is done with <B><I>c</I></B> set back to the value it had
before <TT>C1</TT> was obeyed - but this old value does not take account of the byte shift
caused by the delete. This problem was foreseen from the beginning when designing
Snowball, and recognised as a minor issue because it is an unnatural thing to want to
do. (<TT>C3</TT> should not be an alternative to something which has deletion as an
occasional side-effect.) It may be addressed in the future.
<BR><BR>
</TR>
</TABLE>
</BODY>
</HTML>
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/q/utilities.c ===
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "header.h"
#define unless(C) if(!(C))
#define CREATE_SIZE 1
extern byte * create_s(void)
{ byte * p = HEAD + (byte *) malloc(HEAD + CREATE_SIZE + 1);
CAPACITY(p) = CREATE_SIZE;
SET_SIZE(p, CREATE_SIZE);
return p;
}
/* Free a string buffer created by create_s() or increase_size().
   p addresses the data area, so the allocation starts HEAD bytes before
   it. A NULL p is ignored, like free(NULL). */
extern void lose_s(byte * p)
{
    if (p == NULL) return;
    free(p - HEAD);
}
/* Forward test: if the character at the cursor lies in min..max and its
   bit (ch - min) is set in the bitmap s, step the cursor forward and
   return 1. Otherwise, or at the forward limit, return 0 and leave the
   cursor alone. */
extern int in_grouping(struct SN_env * z, byte * s, int min, int max)
{
    int ch;
    int bit;

    if (z->c >= z->l) return 0;
    ch = z->p[z->c];
    if (ch > max) return 0;
    bit = ch - min;
    if (bit < 0) return 0;
    if ((s[bit >> 3] & (0X1 << (bit & 0X7))) == 0) return 0;
    z->c++;
    return 1;
}
/* Backward test: if the character before the cursor lies in min..max and
   its bit (ch - min) is set in the bitmap s, step the cursor back and
   return 1. Otherwise, or at the backward limit, return 0 and leave the
   cursor alone. */
extern int in_grouping_b(struct SN_env * z, char * s, int min, int max)
{
    int ch;
    int bit;

    if (z->c <= z->lb) return 0;
    ch = z->p[z->c - 1];
    if (ch > max) return 0;
    bit = ch - min;
    if (bit < 0) return 0;
    if ((s[bit >> 3] & (0X1 << (bit & 0X7))) == 0) return 0;
    z->c--;
    return 1;
}
/* Forward test, complement of in_grouping: if the character at the
   cursor is NOT in the group described by s/min/max, step the cursor
   forward and return 1. Return 0 (cursor unchanged) if it is in the
   group or the cursor is at the forward limit. */
extern int out_grouping(struct SN_env * z, byte * s, int min, int max)
{
    int ch;

    if (z->c >= z->l) return 0;
    ch = z->p[z->c];
    if (ch <= max)
    {
        int bit = ch - min;
        if (bit >= 0 && (s[bit >> 3] & (0X1 << (bit & 0X7))) != 0) return 0;
    }
    z->c++;
    return 1;
}
/* Backward test, complement of in_grouping_b: if the character before
   the cursor is NOT in the group described by s/min/max, step the cursor
   back and return 1. Return 0 (cursor unchanged) if it is in the group
   or the cursor is at the backward limit. */
extern int out_grouping_b(struct SN_env * z, char * s, int min, int max)
{
    int ch;

    if (z->c <= z->lb) return 0;
    ch = z->p[z->c - 1];
    if (ch <= max)
    {
        int bit = ch - min;
        if (bit >= 0 && (s[bit >> 3] & (0X1 << (bit & 0X7))) != 0) return 0;
    }
    z->c--;
    return 1;
}
/* Forward test: if the character at the cursor lies in min..max, step
   the cursor forward and return 1; otherwise (or at the forward limit)
   return 0. */
extern int in_range(struct SN_env * z, int min, int max)
{
    int ch;

    if (z->c >= z->l) return 0;
    ch = z->p[z->c];
    if (ch < min || ch > max) return 0;
    z->c++;
    return 1;
}
/* Backward test: if the character before the cursor lies in min..max,
   step the cursor back and return 1; otherwise (or at the backward
   limit) return 0. */
extern int in_range_b(struct SN_env * z, int min, int max)
{
    int ch;

    if (z->c <= z->lb) return 0;
    ch = z->p[z->c - 1];
    if (ch < min || ch > max) return 0;
    z->c--;
    return 1;
}
/* Forward test: if the character at the cursor lies outside min..max,
   step the cursor forward and return 1; otherwise (or at the forward
   limit) return 0. */
extern int out_range(struct SN_env * z, int min, int max)
{
    int ch;

    if (z->c >= z->l) return 0;
    ch = z->p[z->c];
    if (min <= ch && ch <= max) return 0;
    z->c++;
    return 1;
}
/* Backward test: if the character before the cursor lies outside
   min..max, step the cursor back and return 1; otherwise (or at the
   backward limit) return 0. */
extern int out_range_b(struct SN_env * z, int min, int max)
{
    int ch;

    if (z->c <= z->lb) return 0;
    ch = z->p[z->c - 1];
    if (min <= ch && ch <= max) return 0;
    z->c--;
    return 1;
}
/* Forward match: if the s_size bytes at s occur at the cursor, move the
   cursor past them and return 1; otherwise return 0. */
extern int eq_s(struct SN_env * z, int s_size, char * s)
{
    int remaining = z->l - z->c;

    if (remaining < s_size) return 0;
    if (memcmp(z->p + z->c, s, s_size) != 0) return 0;
    z->c += s_size;
    return 1;
}
/* Backward match: if the s_size bytes at s occur immediately before the
   cursor, move the cursor back over them and return 1; otherwise
   return 0. */
extern int eq_s_b(struct SN_env * z, int s_size, char * s)
{
    int available = z->c - z->lb;

    if (available < s_size) return 0;
    if (memcmp(z->p + z->c - s_size, s, s_size) != 0) return 0;
    z->c -= s_size;
    return 1;
}
/* Forward match against the string variable p (its length is SIZE(p)). */
extern int eq_v(struct SN_env * z, byte * p)
{
    int length = SIZE(p);
    return eq_s(z, length, (char *)p);
}
/* Backward match against the string variable p (its length is SIZE(p)). */
extern int eq_v_b(struct SN_env * z, byte * p)
{
    int length = SIZE(p);
    return eq_s_b(z, length, (char *)p);
}
/* Find the longest entry of the table v (v_size entries) that matches
   the text at the cursor, reading forwards.

   A binary search narrows the range [i, j); common_i and common_j record
   how many leading characters are already known to match at each end, so
   comparisons resume from the smaller of the two. The search relies on
   the table being sorted. Afterwards the substring_i chain is followed
   from v[i] until an entry is found that matched in full (and whose
   optional function, if any, returns non-zero).

   On success the cursor is placed just past the match and the entry's
   result is returned; 0 is returned when nothing matches. */
extern int find_among(struct SN_env * z, struct among * v, int v_size)
{
int i = 0;
int j = v_size;
int c = z->c; int l = z->l;
byte * q = z->p + c;
struct among * w;
int common_i = 0;
int common_j = 0;
int first_key_inspected = 0;
while(1)
{ int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j; /* smaller */
w = v + k;
{ int i; for (i = common; i < w->s_size; i++)
/* running out of input counts as "text sorts before the key" */
{ if (c + common == l) { diff = -1; break; }
diff = q[common] - w->s[i];
if (diff != 0) break;
common++;
}
}
if (diff < 0) { j = k; common_j = common; }
else { i = k; common_i = common; }
if (j - i <= 1)
{ if (i > 0) break; /* v->s has been inspected */
if (j == i) break; /* only one item in v */
/* - but now we need to go round once more to get
v->s inspected. This looks messy, but is actually
the optimal approach. */
if (first_key_inspected) break;
first_key_inspected = 1;
}
}
while(1)
{ w = v + i;
/* the entry matched in full when all s_size characters were common */
if (common_i >= w->s_size)
{ z->c = c + w->s_size;
if (w->function == 0) return w->result;
{ int res = w->function(z);
/* the function may have moved the cursor; put it back after the match */
z->c = c + w->s_size;
if (res) return w->result;
}
}
i = w->substring_i;
if (i < 0) return 0;
}
}
/* find_among_b is for backwards processing. Same comments apply */
/* Backward counterpart of find_among: find the longest entry of v that
   matches the text ending at the cursor. Characters are compared from
   the end of each entry towards its start, stopping at the backward
   limit z->lb. On success the cursor is placed at the start of the match
   and the entry's result is returned; 0 is returned when nothing
   matches. */
extern int find_among_b(struct SN_env * z, struct among * v, int v_size)
{
int i = 0;
int j = v_size;
int c = z->c; int lb = z->lb;
/* q addresses the character just before the cursor */
byte * q = z->p + c - 1;
struct among * w;
int common_i = 0;
int common_j = 0;
int first_key_inspected = 0;
while(1)
{ int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j;
w = v + k;
{ int i; for (i = w->s_size - 1 - common; i >= 0; i--)
{ if (c - common == lb) { diff = -1; break; }
diff = q[- common] - w->s[i];
if (diff != 0) break;
common++;
}
}
if (diff < 0) { j = k; common_j = common; }
else { i = k; common_i = common; }
if (j - i <= 1)
{ if (i > 0) break;
if (j == i) break;
if (first_key_inspected) break;
first_key_inspected = 1;
}
}
while(1)
{ w = v + i;
if (common_i >= w->s_size)
{ z->c = c - w->s_size;
if (w->function == 0) return w->result;
{ int res = w->function(z);
z->c = c - w->s_size;
if (res) return w->result;
}
}
/* fall back to the longest entry that is a substring of this one */
i = w->substring_i;
if (i < 0) return 0;
}
}
/* Replace the buffer p by a larger one able to hold n bytes plus 20
   bytes of slack (and the usual spare byte).

   The first CAPACITY(p) bytes of the old data are copied across, the old
   buffer is freed and the new data pointer is returned. Only the
   capacity of the new buffer is set; callers set its size themselves.

   If memory cannot be obtained an error is reported on stderr and the
   program exits with status 1, as slice_check does for a faulty slice. */
extern byte * increase_size(byte * p, int n)
{
    int new_size = n + 20;
    byte * mem = (byte *) malloc(HEAD + new_size + 1);
    byte * q;

    if (mem == NULL)
    {
        fprintf(stderr, "out of memory\n");
        exit(1);
    }
    q = mem + HEAD;
    CAPACITY(q) = new_size;
    memmove(q, p, CAPACITY(p));
    lose_s(p);
    return q;
}
/* to replace chars between c_bra and c_ket in z->p by the
s_size chars at s
*/
/* Replace the characters of z->p between c_bra and c_ket by the s_size
   characters at s.

   When the length changes, the buffer is enlarged if necessary, the tail
   after c_ket is shifted, and the stored size and z->l are adjusted. A
   cursor at or beyond c_ket moves with the tail; a cursor strictly
   inside the replaced span is moved to c_bra.

   Returns the change in length (new length minus old length). */
extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const byte * s)
{
    int delta = s_size - (c_ket - c_bra);
    int old_len = SIZE(z->p);

    if (delta != 0)
    {
        int new_len = delta + old_len;

        if (new_len > CAPACITY(z->p)) z->p = increase_size(z->p, new_len);
        memmove(z->p + c_ket + delta, z->p + c_ket, old_len - c_ket);
        SET_SIZE(z->p, new_len);
        z->l += delta;
        if (z->c >= c_ket)
        {
            z->c += delta;
        }
        else if (z->c > c_bra)
        {
            z->c = c_bra;
        }
    }
    if (s_size != 0) memmove(z->p + c_bra, s, s_size);
    return delta;
}
/* Verify 0 <= bra <= ket <= l <= SIZE(p). If the slice is inconsistent,
   report it on stderr, dump the state with debug() and exit with
   status 1. */
static void slice_check(struct SN_env * z)
{
    int faulty = z->bra < 0 ||
                 z->bra > z->ket ||
                 z->ket > z->l ||
                 z->l > SIZE(z->p); /* this last test could be removed */

    if (faulty)
    {
        fprintf(stderr, "faulty slice operation:\n");
        debug(z, -1, 0);
        exit(1);
    }
}
extern void slice_from_s(struct SN_env * z, int s_size, char * s)
{ slice_check(z);
replace_s(z, z->bra, z->ket, s_size, (byte *) s);
}
/* Replace the current slice by the contents of the string variable p. */
extern void slice_from_v(struct SN_env * z, byte * p)
{
    int length = SIZE(p);
    slice_from_s(z, length, (char *)p);
}
/* Delete the current slice, i.e. replace it by an empty string. */
extern void slice_del(struct SN_env * z)
{
    slice_from_s(z, 0, (char *) 0);
}
/* Replace the span bra..ket by the s_size characters at s, and shift the
   slice markers z->bra and z->ket by the change in length when they lie
   at or after bra. */
extern void insert_s(struct SN_env * z, int bra, int ket, int s_size, char * s)
{
    int delta = replace_s(z, bra, ket, s_size, (byte *) s);

    if (z->bra >= bra) z->bra += delta;
    if (z->ket >= bra) z->ket += delta;
}
/* As insert_s, but the inserted text is the string variable p. */
extern void insert_v(struct SN_env * z, int bra, int ket, byte * p)
{
    int length = SIZE(p);
    int delta = replace_s(z, bra, ket, length, p);

    if (z->bra >= bra) z->bra += delta;
    if (z->ket >= bra) z->ket += delta;
}
/* Copy the current slice into the string variable p, enlarging p if its
   capacity is too small. Returns p, which may have been reallocated. */
extern byte * slice_to(struct SN_env * z, byte * p)
{
    int len;

    slice_check(z);
    len = z->ket - z->bra;
    if (len > CAPACITY(p)) p = increase_size(p, len);
    memmove(p, z->p + z->bra, len);
    SET_SIZE(p, len);
    return p;
}
/* Copy the first z->l characters of the current string into the string
   variable p, enlarging p if its capacity is too small. Returns p, which
   may have been reallocated. */
extern byte * assign_to(struct SN_env * z, byte * p)
{
    int length = z->l;

    if (length > CAPACITY(p))
    {
        p = increase_size(p, length);
    }
    memmove(p, z->p, length);
    SET_SIZE(p, length);
    return p;
}
/* Print the current string between single quotes on stdout, with
   markers inserted at the interesting positions: '{' at lb, '[' at bra,
   '|' at the cursor, ']' at ket and '}' at l. Zero bytes are shown
   as '#'. When number is non-negative the line is prefixed with number,
   line_count and the stored size of the string. */
extern void debug(struct SN_env * z, int number, int line_count)
{
    int limit = SIZE(z->p);
    int pos = 0;

    if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
    while (pos <= limit)
    {
        if (pos == z->lb) printf("{");
        if (pos == z->bra) printf("[");
        if (pos == z->c) printf("|");
        if (pos == z->ket) printf("]");
        if (pos == z->l) printf("}");
        if (pos < limit)
        {
            int ch = z->p[pos];
            printf("%c", ch == 0 ? '#' : ch);
        }
        pos++;
    }
    printf("'\n");
}