[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src - ZopeSplitter.c:1.5.10.2
Andreas Jung
andreas@zope.com
Tue, 8 Jan 2002 14:09:34 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src
In directory cvs.zope.org:/tmp/cvs-serv16661/src
Modified Files:
Tag: ajung-textindexng-branch
ZopeSplitter.c
Log Message:
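introducing new constructor parameters:
'singlechar' -- if true, single-character words are no longer treated as stop words (default 0)
'indexnumbers' -- if true, words containing no alphabetic characters are no longer discarded (default 0)
'maxwords' -- maximum word length; longer words are truncated ("stemmed") to this many characters (default 64)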
=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c 1.5.10.1 => 1.5.10.2 ===
****************************************************************************/
+
+
#include "Python.h"
#include <ctype.h>
@@ -23,6 +25,9 @@
PyObject *text, *synstop;
char *here, *end;
int index;
+ int allow_single_chars;
+ int index_numbers;
+ int max_words;
}
Splitter;
@@ -98,7 +103,7 @@
cword = PyString_AsString(word);
len = PyString_Size(word);
- if(len < 2) /* Single-letter words are stop words! */
+ if(len < 2 && ! self->allow_single_chars) /* Single-letter words are stop words! */
{
Py_INCREF(Py_None);
return Py_None;
@@ -110,7 +115,7 @@
for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
;
- if (len < 0) {
+ if (len < 0 && ! self->index_numbers) {
Py_INCREF(Py_None);
return Py_None;
}
@@ -140,12 +145,11 @@
return value; /* Which must be None! */
}
-#define MAX_WORD 64 /* Words longer than MAX_WORD are stemmed */
static PyObject *
next_word(Splitter *self, char **startpos, char **endpos)
{
- char wbuf[MAX_WORD];
+ char wbuf[256];
char *end, *here, *b;
int i = 0, c;
PyObject *pyword, *res;
@@ -175,13 +179,13 @@
if(startpos && i==0)
*startpos=here;
- if(i++ < MAX_WORD)
+ if(i++ < self->max_words)
*b++ = c;
} else if (i != 0) { /* We've found the end of a word */
- if(i >= MAX_WORD)
- i=MAX_WORD; /* "stem" the long word */
+ if(i >= self->max_words)
+ i=self->max_words; /* "stem" the long word */
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
self->here=here;
@@ -225,8 +229,8 @@
/* We've reached the end of the string */
- if(i >= MAX_WORD)
- i=MAX_WORD; /* "stem" the long word */
+ if(i >= self->max_words)
+ i=self->max_words; /* "stem" the long word */
if (i == 0) {
/* No words */
@@ -307,14 +311,14 @@
}
static PySequenceMethods Splitter_as_sequence = {
- (inquiry)Splitter_length, /*sq_length*/
- (binaryfunc)Splitter_concat, /*sq_concat*/
- (intargfunc)Splitter_repeat, /*sq_repeat*/
- (intargfunc)Splitter_item, /*sq_item*/
- (intintargfunc)Splitter_slice, /*sq_slice*/
- (intobjargproc)0, /*sq_ass_item*/
- (intintobjargproc)0, /*sq_ass_slice*/
- };
+ (inquiry)Splitter_length, /*sq_length*/
+ (binaryfunc)Splitter_concat, /*sq_concat*/
+ (intargfunc)Splitter_repeat, /*sq_repeat*/
+ (intargfunc)Splitter_item, /*sq_item*/
+ (intintargfunc)Splitter_slice, /*sq_slice*/
+ (intobjargproc)0, /*sq_ass_item*/
+ (intintobjargproc)0, /*sq_ass_slice*/
+};
static PyObject *
Splitter_pos(Splitter *self, PyObject *args)
@@ -407,31 +411,31 @@
static char SplitterType__doc__[] = "";
static PyTypeObject SplitterType = {
- PyObject_HEAD_INIT(NULL)
- 0, /*ob_size*/
- "Splitter", /*tp_name*/
- sizeof(Splitter), /*tp_basicsize*/
- 0, /*tp_itemsize*/
- /* methods */
- (destructor)Splitter_dealloc, /*tp_dealloc*/
- (printfunc)0, /*tp_print*/
- (getattrfunc)Splitter_getattr, /*tp_getattr*/
- (setattrfunc)0, /*tp_setattr*/
- (cmpfunc)0, /*tp_compare*/
- (reprfunc)0, /*tp_repr*/
- 0, /*tp_as_number*/
- &Splitter_as_sequence, /*tp_as_sequence*/
- 0, /*tp_as_mapping*/
- (hashfunc)0, /*tp_hash*/
- (ternaryfunc)0, /*tp_call*/
- (reprfunc)0, /*tp_str*/
-
- /* Space for future expansion */
- 0L,0L,0L,0L,
- SplitterType__doc__ /* Documentation string */
- };
+ PyObject_HEAD_INIT(NULL)
+ 0, /*ob_size*/
+ "Splitter", /*tp_name*/
+ sizeof(Splitter), /*tp_basicsize*/
+ 0, /*tp_itemsize*/
+ /* methods */
+ (destructor)Splitter_dealloc, /*tp_dealloc*/
+ (printfunc)0, /*tp_print*/
+ (getattrfunc)Splitter_getattr, /*tp_getattr*/
+ (setattrfunc)0, /*tp_setattr*/
+ (cmpfunc)0, /*tp_compare*/
+ (reprfunc)0, /*tp_repr*/
+ 0, /*tp_as_number*/
+ &Splitter_as_sequence, /*tp_as_sequence*/
+ 0, /*tp_as_mapping*/
+ (hashfunc)0, /*tp_hash*/
+ (ternaryfunc)0, /*tp_call*/
+ (reprfunc)0, /*tp_str*/
+
+ /* Space for future expansion */
+ 0L,0L,0L,0L,
+ SplitterType__doc__ /* Documentation string */
+};
-static char *splitter_args[]={"doc","synstop","encoding",NULL};
+static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxwords",NULL};
static PyObject *
@@ -440,8 +444,12 @@
Splitter *self;
PyObject *doc, *synstop = NULL;
char *encoding = "latin1";
+ int single_char = 0;
+ int index_numbers = 0;
+ int max_words= 64;
- UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args, &doc,&synstop,&encoding)) return NULL;
+ UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args, \
+ &doc,&synstop,&encoding,&single_char,&index_numbers,&max_words)) return NULL;
UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
@@ -459,6 +467,9 @@
self->end = self->here + PyString_Size(self->text);
self->index = -1;
+ self->allow_single_chars = single_char;
+ self->index_numbers = index_numbers;
+ self->max_words = max_words;
return (PyObject*)self;
@@ -471,7 +482,7 @@
static struct PyMethodDef Splitter_module_methods[] =
{
{ "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
- "ZopeSplitter(doc[,synstop]) -- Return a word splitter"
+ "ZopeSplitter(doc[,synstop][,encoding][,singlechar][,indexnumbers]) -- Return a word splitter"
},
{ NULL, NULL }