[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/src - ISO_8859_1_Splitter.c:1.5.10.2
Andreas Jung
andreas@digicool.com
Wed, 9 Jan 2002 09:16:25 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/src
In directory cvs.zope.org:/tmp/cvs-serv25029/src
Modified Files:
Tag: ajung-textindexng-branch
ISO_8859_1_Splitter.c
Log Message:
Added 'maxlen', 'indexnumbers' and 'singlechar' parameters to the constructor.
=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/src/ISO_8859_1_Splitter.c 1.5.10.1 => 1.5.10.2 ===
char *here, *end;
int index;
+ int allow_single_chars;
+ int index_numbers;
+ int max_len;
}
Splitter;
@@ -181,7 +184,7 @@
len = PyString_Size(word);
- if(len < 2) /* Single-letter words are stop words! */
+ if(len < 2 && ! self->allow_single_chars) /* Single-letter words are stop words! */
{
Py_INCREF(Py_None);
return Py_None;
@@ -193,7 +196,7 @@
for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
;
- if (len < 0) {
+ if (len < 0 && ! self->index_numbers) {
Py_INCREF(Py_None);
return Py_None;
}
@@ -223,12 +226,11 @@
return value; /* Which must be None! */
}
-#define MAX_WORD 64 /* Words longer than MAX_WORD are stemmed */
static PyObject *
next_word(Splitter *self, char **startpos, char **endpos)
{
- char wbuf[MAX_WORD];
+ char wbuf[256];
char *end, *here, *b;
int i = 0, c;
PyObject *pyword, *res;
@@ -258,13 +260,13 @@
if(startpos && i==0)
*startpos=here;
- if(i++ < MAX_WORD)
+ if(i++ < self->max_len)
*b++ = c;
} else if (i != 0) { /* We've found the end of a word */
- if(i >= MAX_WORD)
- i=MAX_WORD; /* "stem" the long word */
+ if(i >= self->max_len)
+ i=self->max_len; /* "stem" the long word */
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
self->here=here;
@@ -308,8 +310,8 @@
/* We've reached the end of the string */
- if(i >= MAX_WORD)
- i=MAX_WORD; /* "stem" the long word */
+ if(i >= self->max_len)
+ i=self->max_len; /* "stem" the long word */
if (i == 0) {
/* No words */
@@ -488,7 +490,7 @@
SplitterType__doc__ /* Documentation string */
};
-static char *splitter_args[]={"doc","synstop","encoding",NULL};
+static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen",NULL};
static PyObject *
get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
@@ -496,8 +498,29 @@
Splitter *self;
PyObject *doc, *synstop = NULL;
char * encoding="latin1";
+ int single_char = 0;
+ int index_numbers = 0;
+ int max_len=64;
+
+ UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&single_char,&index_numbers,&max_len)) return NULL;
+
+
+ if (index_numbers<0 || index_numbers>1) {
+ PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
+ return NULL;
+ }
+
+ if (single_char<0 || single_char>1) {
+ PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
+ return NULL;
+ }
+
+ if (max_len<1 || max_len>128) {
+ PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
+ return NULL;
+ }
+
- UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args,&doc,&synstop,&encoding)) return NULL;
UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
@@ -513,6 +536,9 @@
UNLESS(self->here=PyString_AsString(self->text)) goto err;
self->end = self->here + PyString_Size(self->text);
+ self->allow_single_chars = single_char;
+ self->index_numbers = index_numbers;
+ self->max_len = max_len;
self->index = -1;
@@ -527,7 +553,7 @@
static struct PyMethodDef Splitter_module_methods[] =
{
{ "ISO_8859_1_Splitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
- "ISO_8859_1_Splitter(doc[,synstop]) -- Return a word splitter"
+ "ISO_8859_1_Splitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen]) -- Return a word splitter"
},
{ NULL, NULL }