[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src - UnicodeSplitter.c:1.12.10.2
Andreas Jung
andreas@digicool.com
Wed, 9 Jan 2002 10:04:08 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src
In directory cvs.zope.org:/tmp/cvs-serv4880/src
Modified Files:
Tag: ajung-textindexng-branch
UnicodeSplitter.c
Log Message:
Added 'maxlen', 'indexnumbers', and 'singlechar' parameters to the Splitter constructor.
=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src/UnicodeSplitter.c 1.12.10.1 => 1.12.10.2 ===
#include "Python.h"
-#define MAX_WORD 64 /* Words longer than MAX_WORD are stemmed */
-
#ifndef min
#define min(a,b) ((a)<(b)?(a):(b))
#endif
@@ -24,8 +22,12 @@
PyObject_HEAD
PyObject *list;
PyObject *synstop;
+ int max_len;
+ int allow_single_chars;
+ int index_numbers;
}
Splitter;
+
static
PyUnicodeObject *prepareString(PyUnicodeObject *o);
@@ -34,6 +36,9 @@
/* Always returns a borrowed reference */
PyObject *value;
+ if (PyUnicode_GetSize(word)==1 && ! self->allow_single_chars)
+ return Py_None;
+
if (self->synstop) {
value = PyDict_GetItem(self->synstop,word);
if (value != NULL) {
@@ -208,14 +213,19 @@
register Py_UNICODE ch;
ch = *s;
-#ifdef DEBUG
- printf("%d %c %d\n",i,ch,ch);
- fflush(stdout);
-#endif
+
if (!inside_word) {
- if (Py_UNICODE_ISALPHA(ch)) {
- inside_word=1;
- start = i;
+ if (self->index_numbers) {
+ if (Py_UNICODE_ISALNUM(ch)) {
+ inside_word=1;
+ start = i;
+ }
+
+ } else {
+ if (Py_UNICODE_ISALPHA(ch)) {
+ inside_word=1;
+ start = i;
+ }
}
} else {
@@ -223,7 +233,7 @@
inside_word = 0;
word = PySequence_GetSlice((PyObject *)doc1,start,
- min(i, start + MAX_WORD));
+ min(i, start + self->max_len));
if (word==NULL)
goto err;
@@ -244,7 +254,7 @@
if (inside_word) {
word = PySequence_GetSlice((PyObject *)doc1,start,
- min(len, start + MAX_WORD));
+ min(len, start + self->max_len));
if (word==NULL)
goto err;
@@ -298,7 +308,7 @@
return u;
}
-static char *splitter_args[]={"doc","synstop","encoding",NULL};
+static char *splitter_args[]={"doc","synstop","encoding","indexnumbers","singlechar","maxlen",NULL};
static PyObject *
@@ -307,9 +317,11 @@
Splitter *self=NULL;
PyObject *doc=NULL, *unicodedoc=NULL,*synstop=NULL;
char *encoding = "latin1";
+ int index_numbers = 0;
+ int max_len=64;
+ int single_char = 0;
- if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL;
- if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args,&doc,&synstop,&encoding))) return NULL;
+ if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&index_numbers,&single_char,&max_len))) return NULL;
#ifdef DEBUG
puts("got text");
@@ -317,6 +329,21 @@
fflush(stdout);
#endif
+ if (index_numbers<0 || index_numbers>1) {
+ PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
+ return NULL;
+ }
+
+ if (single_char<0 || single_char>1) {
+ PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
+ return NULL;
+ }
+
+ if (max_len<1 || max_len>128) {
+ PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
+ return NULL;
+ }
+
if (PyString_Check(doc)) {
unicodedoc = PyUnicode_FromEncodedObject(doc,encoding,"strict");
@@ -334,10 +361,16 @@
return NULL;
}
+ if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL;
+
if (synstop) {
self->synstop = synstop;
Py_INCREF(synstop);
} else self->synstop=NULL;
+
+ self->index_numbers = index_numbers;
+ self->max_len = max_len;
+ self->allow_single_chars = single_char;
if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0)
goto err;