[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src - ZopeSplitter.c:1.2.10.4
Andreas Jung
andreas@zope.com
Tue, 9 Oct 2001 16:08:39 -0400
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src
In directory cvs.zope.org:/tmp/cvs-serv3621/src
Modified Files:
Tag: ajung-unicode
ZopeSplitter.c
Log Message:
final version of fully unicode-aware splitter
=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c 1.2.10.3 => 1.2.10.4 ===
-#include <ctype.h>
+
+#define MAX_WORD 64 /* Words longer than MAX_WORD are "stemmed": only their first MAX_WORD characters are kept */
#define ASSIGN(V,E) {PyObject *__e; __e=(E); Py_XDECREF(V); (V)=__e;} /* evaluate E first, then release the old value of V and store the result in V */
#define UNLESS(E) if(!(E)) /* reads as "unless E": runs the following statement only when E is false */
@@ -11,7 +12,6 @@
PyObject *text ;
PyObject *list;
PyObject *synstop;
- int index;
}
Splitter;
@@ -19,7 +19,7 @@
Splitter_dealloc(Splitter *self)
{
Py_XDECREF(self->list);
-
+ Py_XDECREF(self->text);
PyMem_DEL(self);
}
@@ -43,18 +43,21 @@
return NULL;
}
-#define MAX_WORD 64 /* Words longer than MAX_WORD are stemmed */
static PyObject *
Splitter_item(Splitter *self, int i)
{
PyObject *item=NULL;
- if (i >= PyList_Size(self->list)) return NULL;
+ if (i >= PyList_Size(self->list)) {
+ PyErr_SetString(PyExc_IndexError,"Splitter index out of range");
+ return NULL;
+ }
ASSIGN(item,PyList_GetItem(self->list , i));
Py_XINCREF(item);
+
#ifdef DEBUG
printf("\n\tItem %d",i);
PyObject_Print(item,stdout,0);
@@ -160,22 +163,71 @@
SplitterType__doc__ /* Documentation string */
};
-void consolidateList(Splitter *self)
+
+void splitUnicodeString(Splitter *self,PyUnicodeObject *doc) /* Scan doc once and fill self->list with the words found: a word starts at an alphabetic character and runs until the first character that is not alphanumeric, '/', '_' or '-'. */
{
- int i;
- PyObject * word;
- Py_UNICODE * buf;
-
- for (i=PyList_Size(self->list)-1;i>=0; i--) {
- word = PyList_GetItem(self->list,i);
-
- buf = PyUnicode_AsUnicode(word);
- if (Py_UNICODE_ISNUMERIC(buf[0])) {
- PyList_SetSlice(self->list,i,i+1,NULL);
+ PyObject *word;
+ Py_UNICODE *s = doc->str; /* cursor over the character buffer of doc */
+ int len = doc->length; /* number of characters to scan */
+ int inside_word=0; /* 1 while the cursor is inside a word */
+ int i=0; /* index of the character s points at */
+ int start=0; /* index at which the current word began */
+
+ self->list = PyList_New(0); /* NOTE(review): result is not checked for NULL, and any previous self->list is overwritten without a DECREF -- confirm against callers */
+
+ do { /* NOTE(review): do/while reads *s once even when len is 0 -- confirm an empty doc still has a readable buffer */
+ register Py_UNICODE ch;
+
+ ch = *s;
+
+#ifdef DEBUG
+ printf("%d %c %d\n",i,ch,ch);
+ fflush(stdout);
+#endif
+ if (!inside_word) {
+ if (Py_UNICODE_ISALPHA(ch)) { /* a word may only start with an alphabetic character */
+ inside_word=1;
+ start = i;
+ }
+ } else {
+
+ if (!(Py_UNICODE_ISALNUM(ch) || ch=='/' || ch=='_' || ch=='-')) /* any other character ends the current word */
+ {
+ inside_word = 0;
+
+ word = PySequence_GetSlice((PyObject *)doc,start,i); /* word = doc[start:i]; NOTE(review): result not checked for NULL */
+ Py_INCREF(word); /* NOTE(review): PySequence_GetSlice is documented to return a new reference, so this extra INCREF looks like a leak -- confirm */
+ // Stem word: only the first MAX_WORD characters are kept
+ if (PyUnicode_GET_SIZE(word)>MAX_WORD)
+ word = PySequence_GetSlice(word,0,MAX_WORD); /* NOTE(review): the longer slice is not released before word is rebound -- looks like a leak, confirm */
+
+ PyList_Append(self->list,word); /* NOTE(review): return value ignored; PyList_Append is documented to take its own reference, so word presumably needs a DECREF afterwards -- confirm */
+ start = 0; /* reset only; start is assigned again at the next word start before it is read */
+#ifdef DEBUG
+ PyObject_Print(word,stdout,0);
+ fflush(stdout);
+#endif
+ }
}
+ s++; /* advance the cursor in step with i */
+ } while(++i < len); /* on exit i is the end index used for a trailing word */
+
+ if (inside_word) { /* the text ended in the middle of a word: emit that last word too */
+ word = PySequence_GetSlice((PyObject *)doc,start,i); /* word = doc[start:i]; NOTE(review): result not checked for NULL */
+
+ // Stem word: only the first MAX_WORD characters are kept
+ if (PyUnicode_GET_SIZE(word)>MAX_WORD)
+ word = PySequence_GetSlice(word,0,MAX_WORD); /* NOTE(review): the longer slice is not released before word is rebound -- looks like a leak, confirm */
+
+ Py_INCREF(word); /* NOTE(review): same extra INCREF on a new reference as in the loop above -- confirm */
+ PyList_Append(self->list,word); /* NOTE(review): return value ignored, and word is not released after the append -- confirm */
}
+#ifdef DEBUG
+ PyObject_Print(self->list,stdout,0);
+ fflush(stdout);
+#endif
}
static PyObject *
@@ -193,18 +245,24 @@
fflush(stdout);
#endif
- UNLESS(self->list = PyUnicode_Split(doc,NULL,-1)) goto err;
+ if (PyString_Check(doc)) {
+ doc = PyUnicode_FromObject(doc);
+
+ } else if( PyUnicode_Check(doc)) {}
+ else {
+ PyErr_SetString(PyExc_TypeError, "first argument is neither string nor unicode.");
+ return NULL;
+ }
+
+
UNLESS(self->text = doc) goto err;
+ splitUnicodeString(self,(PyUnicodeObject *)doc);
if (synstop) {
self->synstop = synstop;
Py_INCREF(synstop);
}
- consolidateList(self);
-
- self->index = -1;
-
return (PyObject*)self;
err:
@@ -221,7 +279,7 @@
};
static char Splitter_module_documentation[] =
- "Parse source strings into sequences of words\n"
+ "Parse source (unicode) string into sequences of words\n"
"\n"
"for use in an inverted index\n"
"\n"