[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src - ZopeSplitter.c:1.6
Andreas Jung
andreas@digicool.com
Wed, 9 Jan 2002 10:17:35 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src
In directory cvs.zope.org:/tmp/cvs-serv7963/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src
Modified Files:
ZopeSplitter.c
Log Message:
added 3 new parameters for all zope splitters
=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c 1.5 => 1.6 ===
-
+
Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
This software is subject to the provisions of the Zope Public License,
@@ -10,6 +10,8 @@
FOR A PARTICULAR PURPOSE
****************************************************************************/
+
+
#include "Python.h"
#include <ctype.h>
@@ -23,6 +25,9 @@
PyObject *text, *synstop;
char *here, *end;
int index;
+ int allow_single_chars;
+ int index_numbers;
+ int max_len;
}
Splitter;
@@ -98,7 +103,7 @@
cword = PyString_AsString(word);
len = PyString_Size(word);
- if(len < 2) /* Single-letter words are stop words! */
+ if(len < 2 && ! self->allow_single_chars) /* Single-letter words are stop words! */
{
Py_INCREF(Py_None);
return Py_None;
@@ -110,7 +115,7 @@
for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
;
- if (len < 0) {
+ if (len < 0 && ! self->index_numbers) {
Py_INCREF(Py_None);
return Py_None;
}
@@ -140,12 +145,11 @@
return value; /* Which must be None! */
}
-#define MAX_WORD 64 /* Words longer than MAX_WORD are stemmed */
static PyObject *
next_word(Splitter *self, char **startpos, char **endpos)
{
- char wbuf[MAX_WORD];
+ char wbuf[256];
char *end, *here, *b;
int i = 0, c;
PyObject *pyword, *res;
@@ -175,13 +179,13 @@
if(startpos && i==0)
*startpos=here;
- if(i++ < MAX_WORD)
+ if(i++ < self->max_len)
*b++ = c;
} else if (i != 0) { /* We've found the end of a word */
- if(i >= MAX_WORD)
- i=MAX_WORD; /* "stem" the long word */
+ if(i >= self->max_len)
+ i=self->max_len; /* "stem" the long word */
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
self->here=here;
@@ -225,8 +229,8 @@
/* We've reached the end of the string */
- if(i >= MAX_WORD)
- i=MAX_WORD; /* "stem" the long word */
+ if(i >= self->max_len)
+ i=self->max_len; /* "stem" the long word */
if (i == 0) {
/* No words */
@@ -274,6 +278,31 @@
return word;
}
+
+/* split() -- run the splitter over the whole text in one pass.
+ *
+ * Rewinds the splitter with Splitter_reset(), then pulls words from
+ * next_word() until it hands back Py_None, which marks the end of the
+ * text.  Returns a new list holding the words in order, or NULL with an
+ * exception set if the list cannot be created, next_word() fails, or an
+ * append fails.
+ *
+ * Reference handling: every object obtained from next_word() is treated
+ * as an owned reference (including the Py_None sentinel) and is released
+ * here; the list takes its own reference in PyList_Append().  On every
+ * error path the partially built list is released as well.
+ */
+static PyObject *
+Splitter_split(Splitter*self)
+{
+    PyObject *list, *word;
+
+    UNLESS(list = PyList_New(0)) return NULL;
+
+    Splitter_reset(self);
+
+    for (;;) {
+        UNLESS(word = next_word(self,NULL,NULL)) {
+            /* Do not leak the partially built result on failure. */
+            Py_DECREF(list);
+            return NULL;
+        }
+
+        if (word == Py_None) {
+            /* End of text: drop our reference to the sentinel. */
+            Py_DECREF(word);
+            return list;
+        }
+
+        if (PyList_Append(list,word) < 0) {
+            Py_DECREF(word);
+            Py_DECREF(list);
+            return NULL;
+        }
+
+        /* The list now holds its own reference to the word. */
+        Py_DECREF(word);
+    }
+}
+
static PyObject *
Splitter_slice(Splitter *self, int i, int j)
{
@@ -282,14 +311,14 @@
}
static PySequenceMethods Splitter_as_sequence = {
- (inquiry)Splitter_length, /*sq_length*/
- (binaryfunc)Splitter_concat, /*sq_concat*/
- (intargfunc)Splitter_repeat, /*sq_repeat*/
- (intargfunc)Splitter_item, /*sq_item*/
- (intintargfunc)Splitter_slice, /*sq_slice*/
- (intobjargproc)0, /*sq_ass_item*/
- (intintobjargproc)0, /*sq_ass_slice*/
- };
+ (inquiry)Splitter_length, /*sq_length*/
+ (binaryfunc)Splitter_concat, /*sq_concat*/
+ (intargfunc)Splitter_repeat, /*sq_repeat*/
+ (intargfunc)Splitter_item, /*sq_item*/
+ (intintargfunc)Splitter_slice, /*sq_slice*/
+ (intobjargproc)0, /*sq_ass_item*/
+ (intintobjargproc)0, /*sq_ass_slice*/
+};
static PyObject *
Splitter_pos(Splitter *self, PyObject *args)
@@ -359,8 +388,12 @@
static struct PyMethodDef Splitter_methods[] =
{
+ { "split", (PyCFunction)Splitter_split, 0,
+ "split() -- Split complete string in one run"
+ },
+
{ "pos", (PyCFunction)Splitter_pos, 0,
- "pos(index) -- Return the starting and ending position of a token"
+ "pos(index) -- Return the starting and ending position of a token"
},
{ "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
@@ -378,31 +411,31 @@
static char SplitterType__doc__[] = "";
static PyTypeObject SplitterType = {
- PyObject_HEAD_INIT(NULL)
- 0, /*ob_size*/
- "Splitter", /*tp_name*/
- sizeof(Splitter), /*tp_basicsize*/
- 0, /*tp_itemsize*/
- /* methods */
- (destructor)Splitter_dealloc, /*tp_dealloc*/
- (printfunc)0, /*tp_print*/
- (getattrfunc)Splitter_getattr, /*tp_getattr*/
- (setattrfunc)0, /*tp_setattr*/
- (cmpfunc)0, /*tp_compare*/
- (reprfunc)0, /*tp_repr*/
- 0, /*tp_as_number*/
- &Splitter_as_sequence, /*tp_as_sequence*/
- 0, /*tp_as_mapping*/
- (hashfunc)0, /*tp_hash*/
- (ternaryfunc)0, /*tp_call*/
- (reprfunc)0, /*tp_str*/
-
- /* Space for future expansion */
- 0L,0L,0L,0L,
- SplitterType__doc__ /* Documentation string */
- };
+ PyObject_HEAD_INIT(NULL)
+ 0, /*ob_size*/
+ "Splitter", /*tp_name*/
+ sizeof(Splitter), /*tp_basicsize*/
+ 0, /*tp_itemsize*/
+ /* methods */
+ (destructor)Splitter_dealloc, /*tp_dealloc*/
+ (printfunc)0, /*tp_print*/
+ (getattrfunc)Splitter_getattr, /*tp_getattr*/
+ (setattrfunc)0, /*tp_setattr*/
+ (cmpfunc)0, /*tp_compare*/
+ (reprfunc)0, /*tp_repr*/
+ 0, /*tp_as_number*/
+ &Splitter_as_sequence, /*tp_as_sequence*/
+ 0, /*tp_as_mapping*/
+ (hashfunc)0, /*tp_hash*/
+ (ternaryfunc)0, /*tp_call*/
+ (reprfunc)0, /*tp_str*/
+
+ /* Space for future expansion */
+ 0L,0L,0L,0L,
+ SplitterType__doc__ /* Documentation string */
+};
-static char *splitter_args[]={"doc","synstop","encoding",NULL};
+static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen",NULL};
static PyObject *
@@ -411,8 +444,28 @@
Splitter *self;
PyObject *doc, *synstop = NULL;
char *encoding = "latin1";
+ int single_char = 0;
+ int index_numbers = 0;
+ int max_len= 64;
+
+ UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args, \
+ &doc,&synstop,&encoding,&single_char,&index_numbers,&max_len)) return NULL;
+
- UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args, &doc,&synstop,&encoding)) return NULL;
+ if (index_numbers<0 || index_numbers>1) {
+ PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
+ return NULL;
+ }
+
+ if (single_char<0 || single_char>1) {
+ PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
+ return NULL;
+ }
+
+ if (max_len<1 || max_len>128) {
+ PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
+ return NULL;
+ }
UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
@@ -430,6 +483,9 @@
self->end = self->here + PyString_Size(self->text);
self->index = -1;
+ self->allow_single_chars = single_char;
+ self->index_numbers = index_numbers;
+ self->max_len = max_len;
return (PyObject*)self;
@@ -442,7 +498,7 @@
static struct PyMethodDef Splitter_module_methods[] =
{
{ "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
- "ZopeSplitter(doc[,synstop]) -- Return a word splitter"
+ "ZopeSplitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen]) -- Return a word splitter"
},
{ NULL, NULL }