[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src - UnicodeSplitter.c:1.13
Andreas Jung
andreas@digicool.com
Wed, 9 Jan 2002 10:17:35 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src
In directory cvs.zope.org:/tmp/cvs-serv7963/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src
Modified Files:
UnicodeSplitter.c
Log Message:
Added three new parameters (indexnumbers, singlechar, maxlen) for all Zope splitters.
=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src/UnicodeSplitter.c 1.12 => 1.13 ===
#include "Python.h"
-#define MAX_WORD 64 /* Words longer than MAX_WORD are stemmed */
-
#ifndef min
#define min(a,b) ((a)<(b)?(a):(b))
#endif
@@ -24,8 +22,12 @@
PyObject_HEAD
PyObject *list;
PyObject *synstop;
+ int max_len;
+ int allow_single_chars;
+ int index_numbers;
}
Splitter;
+
static
PyUnicodeObject *prepareString(PyUnicodeObject *o);
@@ -34,6 +36,9 @@
/* Always returns a borrowed reference */
PyObject *value;
+ if (PyUnicode_GetSize(word)==1 && ! self->allow_single_chars)
+ return Py_None;
+
if (self->synstop) {
value = PyDict_GetItem(self->synstop,word);
if (value != NULL) {
@@ -82,6 +87,14 @@
return item;
}
+static PyObject *
+Splitter_split(Splitter *self) {
+ /* Implements the split() method: hands back the list stored in self->list.
+  * The Py_INCREF below gives the caller its own reference to that list. */
+ Py_INCREF(self->list);
+
+ return self->list; /* the stored list object itself, not a copy */
+}
+
static PyObject *
Splitter_indexes(Splitter *self, PyObject *args)
@@ -133,6 +146,8 @@
static struct PyMethodDef Splitter_methods[] =
{
+ { "split", (PyCFunction) Splitter_split, 0,
+ "split() -- Split string in one run" },
{ "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
"indexes(word) -- Return a list of the indexes of word in the sequence",
},
@@ -198,14 +213,19 @@
register Py_UNICODE ch;
ch = *s;
-#ifdef DEBUG
- printf("%d %c %d\n",i,ch,ch);
- fflush(stdout);
-#endif
+
if (!inside_word) {
- if (Py_UNICODE_ISALPHA(ch)) {
- inside_word=1;
- start = i;
+ if (self->index_numbers) {
+ if (Py_UNICODE_ISALNUM(ch)) {
+ inside_word=1;
+ start = i;
+ }
+
+ } else {
+ if (Py_UNICODE_ISALPHA(ch)) {
+ inside_word=1;
+ start = i;
+ }
}
} else {
@@ -213,7 +233,7 @@
inside_word = 0;
word = PySequence_GetSlice((PyObject *)doc1,start,
- min(i, start + MAX_WORD));
+ min(i, start + self->max_len));
if (word==NULL)
goto err;
@@ -234,7 +254,7 @@
if (inside_word) {
word = PySequence_GetSlice((PyObject *)doc1,start,
- min(len, start + MAX_WORD));
+ min(len, start + self->max_len));
if (word==NULL)
goto err;
@@ -288,7 +308,7 @@
return u;
}
-static char *splitter_args[]={"doc","synstop","encoding",NULL};
+static char *splitter_args[]={"doc","synstop","encoding","indexnumbers","singlechar","maxlen",NULL};
static PyObject *
@@ -297,9 +317,11 @@
Splitter *self=NULL;
PyObject *doc=NULL, *unicodedoc=NULL,*synstop=NULL;
char *encoding = "latin1";
+ int index_numbers = 0;
+ int max_len=64;
+ int single_char = 0;
- if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL;
- if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args,&doc,&synstop,&encoding))) return NULL;
+ if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&index_numbers,&single_char,&max_len))) return NULL;
#ifdef DEBUG
puts("got text");
@@ -307,6 +329,21 @@
fflush(stdout);
#endif
+ if (index_numbers<0 || index_numbers>1) {
+ PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
+ return NULL;
+ }
+
+ if (single_char<0 || single_char>1) {
+ PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
+ return NULL;
+ }
+
+ if (max_len<1 || max_len>128) {
+ PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
+ return NULL;
+ }
+
if (PyString_Check(doc)) {
unicodedoc = PyUnicode_FromEncodedObject(doc,encoding,"strict");
@@ -324,11 +361,17 @@
return NULL;
}
+ if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL;
+
if (synstop) {
self->synstop = synstop;
Py_INCREF(synstop);
} else self->synstop=NULL;
+ self->index_numbers = index_numbers;
+ self->max_len = max_len;
+ self->allow_single_chars = single_char;
+
if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0)
goto err;
@@ -344,11 +387,6 @@
static struct PyMethodDef Splitter_module_methods[] =
{
- { "pos", (PyCFunction) Splitter_pos, 0,
- "pos(index) -- Return the starting and ending position of a token" },
- { "indexes", (PyCFunction) Splitter_indexes, METH_VARARGS,
- "indexes(word) -- Return a list of the indexes of word in sequence" },
-
{ "UnicodeSplitter", (PyCFunction)newSplitter,
METH_VARARGS|METH_KEYWORDS,
"UnicodeSplitter(doc[,synstop][,encoding='latin1']) "