[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src - UnicodeSplitter.c:1.13

Andreas Jung andreas@digicool.com
Wed, 9 Jan 2002 10:17:35 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src
In directory cvs.zope.org:/tmp/cvs-serv7963/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src

Modified Files:
	UnicodeSplitter.c 
Log Message:
Added 3 new parameters for all Zope splitters.


=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src/UnicodeSplitter.c 1.12 => 1.13 ===
 #include "Python.h"
 
-#define MAX_WORD 64		/* Words longer than MAX_WORD are stemmed */
-
 #ifndef min
 #define min(a,b) ((a)<(b)?(a):(b))
 #endif
@@ -24,8 +22,12 @@
     PyObject_HEAD
     PyObject *list;
     PyObject *synstop;
+    int max_len;
+    int allow_single_chars;
+    int index_numbers;
 }
 Splitter;
+
 static
 PyUnicodeObject *prepareString(PyUnicodeObject *o);
 
@@ -34,6 +36,9 @@
     /* Always returns a borrowed reference */
     PyObject *value;
 
+    if (PyUnicode_GetSize(word)==1 && ! self->allow_single_chars)
+        return Py_None;
+
     if (self->synstop) {
         value = PyDict_GetItem(self->synstop,word);
         if (value != NULL) {
@@ -82,6 +87,14 @@
   return item;
 }
 
+static PyObject * 
+Splitter_split(Splitter *self) {
+
+    Py_INCREF(self->list);
+
+    return self->list;
+}
+
 
 static PyObject *
 Splitter_indexes(Splitter *self, PyObject *args)
@@ -133,6 +146,8 @@
 
 static struct PyMethodDef Splitter_methods[] =
     {
+        { "split", (PyCFunction) Splitter_split, 0,
+          "split() -- Split string in one run" },
         { "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
           "indexes(word) -- Return a list of the indexes of word in the sequence",
         },
@@ -198,14 +213,19 @@
         register Py_UNICODE ch;
 
         ch = *s;
-#ifdef DEBUG
-        printf("%d %c %d\n",i,ch,ch);
-        fflush(stdout);
-#endif
+
         if (!inside_word) {
-            if (Py_UNICODE_ISALPHA(ch)) {
-                inside_word=1;
-                start = i;
+            if (self->index_numbers) {
+                if (Py_UNICODE_ISALNUM(ch)) {
+                    inside_word=1;
+                    start = i;
+                }
+
+            } else {
+                if (Py_UNICODE_ISALPHA(ch)) {
+                    inside_word=1;
+                    start = i;
+                }
             }
         } else {
 
@@ -213,7 +233,7 @@
                 inside_word = 0;
 
                 word = PySequence_GetSlice((PyObject *)doc1,start,
-                                           min(i, start + MAX_WORD));
+                                           min(i, start + self->max_len));
                 if (word==NULL)
                   goto err;
 
@@ -234,7 +254,7 @@
 
     if (inside_word) {
         word = PySequence_GetSlice((PyObject *)doc1,start,
-                                   min(len, start + MAX_WORD));
+                                   min(len, start + self->max_len));
         if (word==NULL)
           goto err;
 
@@ -288,7 +308,7 @@
     return  u;
 }
 
-static char *splitter_args[]={"doc","synstop","encoding",NULL};
+static char *splitter_args[]={"doc","synstop","encoding","indexnumbers","singlechar","maxlen",NULL};
 
 
 static PyObject *
@@ -297,9 +317,11 @@
     Splitter *self=NULL;
     PyObject *doc=NULL, *unicodedoc=NULL,*synstop=NULL;
     char *encoding = "latin1";
+    int index_numbers = 0;
+    int max_len=64;
+    int single_char = 0;
 
-    if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL;
-    if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args,&doc,&synstop,&encoding))) return NULL;
+    if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&index_numbers,&single_char,&max_len))) return NULL;
 
 #ifdef DEBUG
     puts("got text");
@@ -307,6 +329,21 @@
     fflush(stdout);
 #endif
 
+    if (index_numbers<0 || index_numbers>1) {
+        PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
+        return NULL;
+    }
+
+    if (single_char<0 || single_char>1) {
+        PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
+        return NULL;
+    }
+
+    if (max_len<1 || max_len>128) {
+        PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
+        return NULL;
+    }
+
     if (PyString_Check(doc)) {
 
         unicodedoc = PyUnicode_FromEncodedObject(doc,encoding,"strict");
@@ -324,11 +361,17 @@
         return NULL;
     }
 
+    if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL;
+
     if (synstop) {
         self->synstop = synstop;
         Py_INCREF(synstop);
     } else  self->synstop=NULL;
 
+    self->index_numbers      = index_numbers;
+    self->max_len            = max_len;
+    self->allow_single_chars = single_char;
+
     if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0)
       goto err;
 
@@ -344,11 +387,6 @@
 
 static struct PyMethodDef Splitter_module_methods[] =
     {
-        { "pos", (PyCFunction) Splitter_pos, 0,
-          "pos(index) -- Return the starting and ending position of a token" },
-        { "indexes", (PyCFunction) Splitter_indexes, METH_VARARGS,
-          "indexes(word) -- Return a list of the indexes of word in sequence" },
-   
         { "UnicodeSplitter", (PyCFunction)newSplitter,
           METH_VARARGS|METH_KEYWORDS,
           "UnicodeSplitter(doc[,synstop][,encoding='latin1']) "