[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src - UnicodeSplitter.c:1.12.10.2

Wed, 9 Jan 2002 10:04:08 -0500

Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src
In directory cvs.zope.org:/tmp/cvs-serv4880/src

Modified Files:
      Tag: ajung-textindexng-branch
	UnicodeSplitter.c 
Log Message:
Added 'maxlen', 'indexnumbers', and 'singlechar' keyword parameters to the Splitter constructor.


=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src/UnicodeSplitter.c 1.12.10.1 => 1.12.10.2 ===
 #include "Python.h"
 
-#define MAX_WORD 64		/* Words longer than MAX_WORD are stemmed */
-
 #ifndef min
 #define min(a,b) ((a)<(b)?(a):(b))
 #endif
@@ -24,8 +22,12 @@
     PyObject_HEAD
     PyObject *list;
     PyObject *synstop;
+    int max_len;
+    int allow_single_chars;
+    int index_numbers;
 }
 Splitter;
+
 static
 PyUnicodeObject *prepareString(PyUnicodeObject *o);
 
@@ -34,6 +36,9 @@
     /* Always returns a borrowed reference */
     PyObject *value;
 
+    if (PyUnicode_GetSize(word)==1 && ! self->allow_single_chars)
+        return Py_None;
+
     if (self->synstop) {
         value = PyDict_GetItem(self->synstop,word);
         if (value != NULL) {
@@ -208,14 +213,19 @@
         register Py_UNICODE ch;
 
         ch = *s;
-#ifdef DEBUG
-        printf("%d %c %d\n",i,ch,ch);
-        fflush(stdout);
-#endif
+
         if (!inside_word) {
-            if (Py_UNICODE_ISALPHA(ch)) {
-                inside_word=1;
-                start = i;
+            if (self->index_numbers) {
+                if (Py_UNICODE_ISALNUM(ch)) {
+                    inside_word=1;
+                    start = i;
+                }
+
+            } else {
+                if (Py_UNICODE_ISALPHA(ch)) {
+                    inside_word=1;
+                    start = i;
+                }
             }
         } else {
 
@@ -223,7 +233,7 @@
                 inside_word = 0;
 
                 word = PySequence_GetSlice((PyObject *)doc1,start,
-                                           min(i, start + MAX_WORD));
+                                           min(i, start + self->max_len));
                 if (word==NULL)
                   goto err;
 
@@ -244,7 +254,7 @@
 
     if (inside_word) {
         word = PySequence_GetSlice((PyObject *)doc1,start,
-                                   min(len, start + MAX_WORD));
+                                   min(len, start + self->max_len));
         if (word==NULL)
           goto err;
 
@@ -298,7 +308,7 @@
     return  u;
 }
 
-static char *splitter_args[]={"doc","synstop","encoding",NULL};
+static char *splitter_args[]={"doc","synstop","encoding","indexnumbers","singlechar","maxlen",NULL};
 
 
 static PyObject *
@@ -307,9 +317,11 @@
     Splitter *self=NULL;
     PyObject *doc=NULL, *unicodedoc=NULL,*synstop=NULL;
     char *encoding = "latin1";
+    int index_numbers = 0;
+    int max_len=64;
+    int single_char = 0;
 
-    if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL;
-    if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args,&doc,&synstop,&encoding))) return NULL;
+    if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&index_numbers,&single_char,&max_len))) return NULL;
 
 #ifdef DEBUG
     puts("got text");
@@ -317,6 +329,21 @@
     fflush(stdout);
 #endif
 
+    if (index_numbers<0 || index_numbers>1) {
+        PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
+        return NULL;
+    }
+
+    if (single_char<0 || single_char>1) {
+        PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
+        return NULL;
+    }
+
+    if (max_len<1 || max_len>128) {
+        PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
+        return NULL;
+    }
+
     if (PyString_Check(doc)) {
 
         unicodedoc = PyUnicode_FromEncodedObject(doc,encoding,"strict");
@@ -334,10 +361,16 @@
         return NULL;
     }
 
+    if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL;
+
     if (synstop) {
         self->synstop = synstop;
         Py_INCREF(synstop);
     } else  self->synstop=NULL;
+
+    self->index_numbers      = index_numbers;
+    self->max_len            = max_len;
+    self->allow_single_chars = single_char;
 
     if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0)
       goto err;