[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src - ZopeSplitter.c:1.2.10.4

Andreas Jung andreas@zope.com
Tue, 9 Oct 2001 16:08:39 -0400


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src
In directory cvs.zope.org:/tmp/cvs-serv3621/src

Modified Files:
      Tag: ajung-unicode
	ZopeSplitter.c 
Log Message:
final version of fully unicode-aware splitter


=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c 1.2.10.3 => 1.2.10.4 ===
-#include <ctype.h>
+
+#define MAX_WORD 64		/* Words longer than MAX_WORD are truncated to MAX_WORD characters */
 
 #define ASSIGN(V,E) {PyObject *__e; __e=(E); Py_XDECREF(V); (V)=__e;}
 #define UNLESS(E) if(!(E))
@@ -11,7 +12,6 @@
         PyObject *text ;
         PyObject *list;
         PyObject *synstop;
-        int index;
 }
 Splitter;
 
@@ -19,7 +19,7 @@
 Splitter_dealloc(Splitter *self)
 {
         Py_XDECREF(self->list);
-
+        Py_XDECREF(self->text);
         PyMem_DEL(self);
 }
 
@@ -43,18 +43,21 @@
         return NULL;
 }
 
-#define MAX_WORD 64		/* Words longer than MAX_WORD are stemmed */
 
 
 static PyObject *
 Splitter_item(Splitter *self, int i)
 {
         PyObject *item=NULL;
-        if (i >= PyList_Size(self->list)) return NULL;
+        if (i >= PyList_Size(self->list)) {
+            PyErr_SetString(PyExc_IndexError,"Splitter index out of range");
+            return NULL;
+        }
 
         ASSIGN(item,PyList_GetItem(self->list , i));
         Py_XINCREF(item);
 
+
 #ifdef DEBUG
         printf("\n\tItem %d",i);
         PyObject_Print(item,stdout,0);
@@ -160,22 +163,71 @@
         SplitterType__doc__ /* Documentation string */
 };
 
-void consolidateList(Splitter *self)
+
+void splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
 {
-        int i;
-        PyObject * word;
-        Py_UNICODE * buf;
-
-        for (i=PyList_Size(self->list)-1;i>=0; i--) {
-                word = PyList_GetItem(self->list,i);
-
-                buf  = PyUnicode_AsUnicode(word);
-                if (Py_UNICODE_ISNUMERIC(buf[0])) {
-                        PyList_SetSlice(self->list,i,i+1,NULL);
+        PyObject *word;
+        Py_UNICODE *s = doc->str;
+        int len = doc->length;
+        int inside_word=0;
+        int i=0;
+        int start=0;
+
+        self->list = PyList_New(0);
+
+        do {
+                register Py_UNICODE ch;
+
+                ch = *s;
+
+#ifdef DEBUG
+                printf("%d %c %d\n",i,ch,ch);
+                fflush(stdout);
+#endif
+                if (!inside_word) {
+                        if (Py_UNICODE_ISALPHA(ch)) {
+                                inside_word=1;
+                                start = i;
+                        }
+                } else {
+
+                        if (!(Py_UNICODE_ISALNUM(ch) || ch=='/' || ch=='_' || ch=='-')) 
+                        {
+                                inside_word = 0;
+
+                                word = PySequence_GetSlice((PyObject *)doc,start,i);
+                                Py_INCREF(word);
+                                // Truncate overlong word to MAX_WORD characters (no real stemming)
+                                if (PyUnicode_GET_SIZE(word)>MAX_WORD)
+                                    word = PySequence_GetSlice(word,0,MAX_WORD);
+
+                                PyList_Append(self->list,word);
+                                start =  0;
+#ifdef DEBUG
+                                PyObject_Print(word,stdout,0);
+                                fflush(stdout);
+#endif
+                        }
                 }
 
+                s++;
+        } while(++i < len);
+
+        if (inside_word) {
+            word = PySequence_GetSlice((PyObject *)doc,start,i);
+
+            // Truncate overlong word to MAX_WORD characters (no real stemming)
+            if (PyUnicode_GET_SIZE(word)>MAX_WORD)
+                word = PySequence_GetSlice(word,0,MAX_WORD);
+
+            Py_INCREF(word);
+            PyList_Append(self->list,word);
         }
 
+#ifdef DEBUG
+        PyObject_Print(self->list,stdout,0);
+        fflush(stdout);
+#endif
 }
 
 static PyObject *
@@ -193,18 +245,24 @@
         fflush(stdout);
 #endif
 
-        UNLESS(self->list = PyUnicode_Split(doc,NULL,-1)) goto err;
+        if (PyString_Check(doc)) {
+                doc = PyUnicode_FromObject(doc);
+
+        } else if( PyUnicode_Check(doc)) {}
+        else {
+                PyErr_SetString(PyExc_TypeError, "first argument is neither string nor unicode.");
+                return NULL;
+        }
+
+
         UNLESS(self->text = doc) goto err;
+        splitUnicodeString(self,(PyUnicodeObject *)doc);
 
         if (synstop) {
                 self->synstop = synstop;
                 Py_INCREF(synstop);
         }
 
-        consolidateList(self);
-
-        self->index = -1;
-
         return (PyObject*)self;
 
 err:
@@ -221,7 +279,7 @@
         };
 
 static char Splitter_module_documentation[] =
-        "Parse source strings into sequences of words\n"
+        "Parse source (unicode) string into sequences of words\n"
         "\n"
         "for use in an inverted index\n"
         "\n"