[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src - ZopeSplitter.c:1.2.10.5
Andreas Jung
andreas@zope.com
Wed, 10 Oct 2001 15:38:27 -0400
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src
In directory cvs.zope.org:/tmp/cvs-serv12808/src
Modified Files:
Tag: ajung-unicode
ZopeSplitter.c
Log Message:
This version is known to fail. Checkin for debugging purposes.
=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c 1.2.10.4 => 1.2.10.5 ===
#define UNLESS_ASSIGN(V,E) ASSIGN(V,E) UNLESS(V)
+#define PO(x) { PyObject_Print(x,stdout,0); fflush(stdout); }
+
typedef struct
{
PyObject_HEAD
- PyObject *text ;
PyObject *list;
PyObject *synstop;
}
Splitter;
+/*
+ * checkSynword -- pass a word through the optional synonym/stop-word mapping.
+ *
+ * Looks `word` up in self->synstop.  If the mapping has an entry for it, that
+ * entry is returned; otherwise `word` itself is returned.
+ *
+ * Reference ownership (asymmetric; the callers in splitUnicodeString are
+ * written against it):
+ *   - when `word` is returned, it is the caller's own pointer and no new
+ *     reference has been created;
+ *   - when a mapped value is returned, it is the new reference produced by
+ *     PyObject_GetItem().
+ *
+ * The result is non-NULL whenever `word` is non-NULL, and no Python exception
+ * is left pending on return.
+ */
+static PyObject * checkSynword(Splitter *self,PyObject *word)
+{
+    PyObject *value;
+
+#ifdef DEBUG
+    PO(word);
+    PO(self->synstop);
+#endif
+
+    /* No mapping was supplied: every word passes through unchanged. */
+    if (self->synstop == NULL)
+        return word;
+
+    /* PyObject_GetItem() already hands back a new reference, so the value
+       must not be INCREF'ed a second time. */
+    value = PyObject_GetItem(self->synstop,word);
+    if (value == NULL) {
+        /* A failed lookup just means "no synonym"; since we report success
+           by returning `word`, the exception must not stay set. */
+        PyErr_Clear();
+        return word;
+    }
+
+    return value;
+}
+
/* Deallocator for Splitter objects: drops the references held in the
   object's fields (Py_XDECREF tolerates NULL fields) and then frees the
   object itself. */
static void
Splitter_dealloc(Splitter *self)
{
Py_XDECREF(self->list);
- Py_XDECREF(self->text);
+ Py_XDECREF(self->synstop);
PyMem_DEL(self);
}
@@ -50,8 +72,8 @@
{
PyObject *item=NULL;
if (i >= PyList_Size(self->list)) {
- PyErr_SetString(PyExc_IndexError,"Splitter index out of range");
- return NULL;
+ PyErr_SetString(PyExc_IndexError,"Splitter index out of range");
+ return NULL;
}
ASSIGN(item,PyList_GetItem(self->list , i));
@@ -166,19 +188,32 @@
void splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
{
- PyObject *word;
+
+ PyObject *word,*synword;
Py_UNICODE *s = doc->str;
int len = doc->length;
int inside_word=0;
int i=0;
int start=0;
+#ifdef DEBUG
+ puts("before List_New");
+ fflush(stdout);
+#endif
+
self->list = PyList_New(0);
+#ifdef DEBUG
+ puts("after List_New");
+ fflush(stdout);
+#endif
+
+
do {
register Py_UNICODE ch;
ch = *s;
+ *s = Py_UNICODE_TOLOWER(ch);
#ifdef DEBUG
printf("%d %c %d\n",i,ch,ch);
@@ -191,17 +226,21 @@
}
} else {
- if (!(Py_UNICODE_ISALNUM(ch) || ch=='/' || ch=='_' || ch=='-'))
- {
+ if (!(Py_UNICODE_ISALNUM(ch) || ch=='/' || ch=='_' || ch=='-')) {
inside_word = 0;
word = PySequence_GetSlice((PyObject *)doc,start,i);
- Py_INCREF(word);
// Stem word
if (PyUnicode_GET_SIZE(word)>MAX_WORD)
- word = PySequence_GetSlice(word,0,MAX_WORD);
+ word = PySequence_GetSlice(word,0,MAX_WORD);
+
+ synword = checkSynword(self,word);
+ if (synword != Py_None) {
+ PyList_Append(self->list,synword);
+ } else Py_DECREF(synword);
+
+ Py_DECREF(word);
- PyList_Append(self->list,word);
start = 0;
#ifdef DEBUG
PyObject_Print(word,stdout,0);
@@ -211,30 +250,36 @@
}
s++;
+
} while(++i < len);
if (inside_word) {
- word = PySequence_GetSlice((PyObject *)doc,start,i);
+ word = PySequence_GetSlice((PyObject *)doc,start,i);
- // Stem word
- if (PyUnicode_GET_SIZE(word)>MAX_WORD)
- word = PySequence_GetSlice(word,0,MAX_WORD);
+ // Stem word
+ if (PyUnicode_GET_SIZE(word)>MAX_WORD)
+ word = PySequence_GetSlice(word,0,MAX_WORD);
+
+ synword = checkSynword(self,word);
+ if (synword != Py_None) {
+ PyList_Append(self->list,synword);
+ } else Py_DECREF(synword);
- Py_INCREF(word);
- PyList_Append(self->list,word);
+ Py_DECREF(word);
}
#ifdef DEBUG
PyObject_Print(self->list,stdout,0);
fflush(stdout);
#endif
+
}
static PyObject *
get_Splitter(PyObject *modinfo, PyObject *args)
{
Splitter *self;
- PyObject *doc, *synstop=NULL;
+ PyObject *doc, *unicodedoc,*synstop=NULL;
UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
UNLESS(PyArg_ParseTuple(args,"O|O",&doc,&synstop)) return NULL;
@@ -246,22 +291,42 @@
#endif
if (PyString_Check(doc)) {
- doc = PyUnicode_FromObject(doc);
- } else if( PyUnicode_Check(doc)) {}
- else {
+#ifdef DEBUG
+ puts("got a string object");
+ fflush(stdout);
+#endif
+ unicodedoc = PyUnicode_FromObject(doc);
+
+ } else if( PyUnicode_Check(doc)) {
+#ifdef DEBUG
+ puts("got a unicode object");
+ fflush(stdout);
+
+#endif
+ unicodedoc = doc;
+ } else {
PyErr_SetString(PyExc_TypeError, "first argument is neither string nor unicode.");
return NULL;
}
-
- UNLESS(self->text = doc) goto err;
- splitUnicodeString(self,(PyUnicodeObject *)doc);
-
if (synstop) {
self->synstop = synstop;
Py_INCREF(synstop);
+ } else {
+ self->synstop=NULL;
}
+
+#ifdef DEBUG
+ puts("before splitUnicodeString");
+ PyObject_Print(unicodedoc,stdout,0);
+#endif
+
+ splitUnicodeString(self,(PyUnicodeObject *)unicodedoc);
+
+#ifdef DEBUG
+ puts("after splitUnicodeString");
+#endif
return (PyObject*)self;