[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src - ZopeSplitter.c:1.2.10.1
Andreas Jung
andreas@zope.com
Thu, 27 Sep 2001 11:37:12 -0400
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src
In directory cvs.zope.org:/tmp/cvs-serv32342
Modified Files:
Tag: ajung-unicode
ZopeSplitter.c
Log Message:
Very rough prototype of a Unicode-aware splitter.
=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c 1.2 => 1.2.10.1 ===
PyObject_HEAD
PyObject *text, *synstop;
+ PyObject *list;
char *here, *end;
int index;
} Splitter;
@@ -117,6 +118,10 @@
static int
Splitter_length(Splitter *self)
{
+
+ puts("inside Splitter_length()");
+
+ return PyList_Size(self->list);
PyObject *res=0;
Splitter_reset(self);
@@ -211,113 +216,29 @@
static PyObject *
next_word(Splitter *self, char **startpos, char **endpos)
{
- char wbuf[MAX_WORD];
- char *end, *here, *b;
- int i = 0, c;
- PyObject *pyword, *res;
-
- here=self->here;
- end=self->end;
- b=wbuf;
- while (here < end)
- {
- /* skip hyphens */
- if ((i > 0) && (*here == '-'))
- {
- here++;
- while (isspace((unsigned char) *here) && (here < end)) here++;
- continue;
- }
+ PyObject *list=NULL, *word=NULL;
- c=tolower((unsigned char) *here);
-
- /* Check to see if this character is part of a word */
- if(isalnum((unsigned char)c) || c=='/' || c=='_')
- { /* Found a word character */
- if(startpos && i==0) *startpos=here;
- if(i++ < MAX_WORD) *b++ = c;
- }
- else if (i != 0)
- { /* We've found the end of a word */
- if(i >= MAX_WORD) i=MAX_WORD; /* "stem" the long word */
-
- UNLESS(pyword = PyString_FromStringAndSize(wbuf, i))
- {
- self->here=here;
- return NULL;
- }
-
- UNLESS(res = check_synstop(self, pyword))
- {
- self->here=here;
- Py_DECREF(pyword);
- return NULL;
- }
-
- if (res != Py_None)
- {
- if(endpos) *endpos=here;
- self->here=here;
- Py_DECREF(pyword);
- self->index++;
- return res;
- }
-
- /* The word is a stopword, so ignore it */
-
- Py_DECREF(res);
- Py_DECREF(pyword);
- i = 0;
- b=wbuf;
- }
-
- here++;
+ if (self->text == NULL) {
+ Py_INCREF(Py_None);
+ return Py_None;
}
- self->here=here;
-
- /* We've reached the end of the string */
+ list = PyUnicode_Split(self->text,NULL,1);
+ word = PyList_GetItem(list,0);
- if(i >= MAX_WORD) i=MAX_WORD; /* "stem" the long word */
- if (i == 0)
- {
- /* No words */
- self->here=here;
- Py_INCREF(Py_None);
- return Py_None;
- }
-
- UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) return NULL;
-
- if(endpos) *endpos=here;
- res = check_synstop(self, pyword);
- Py_DECREF(pyword);
- if(PyString_Check(res)) self->index++;
- return res;
+ if (PyList_Size(list)>1) self->text= PyList_GetItem(list,1);
+ else self->text=NULL;
+
+ PyObject_Print(word,stdout,0);
+ fflush(stdout);
+
+ return word;
}
static PyObject *
Splitter_item(Splitter *self, int i)
{
- PyObject *word = NULL;
-
- if (i <= self->index) Splitter_reset(self);
-
- while(self->index < i)
- {
- Py_XDECREF(word);
-
- UNLESS(word = next_word(self,NULL,NULL)) return NULL;
- if (word == Py_None)
- {
- Py_DECREF(word);
- PyErr_SetString(PyExc_IndexError,
- "Splitter index out of range");
- return NULL;
- }
- }
-
- return word;
+ return PyList_GetItem(self->list , i);
}
static PyObject *
@@ -444,11 +365,12 @@
get_Splitter(PyObject *modinfo, PyObject *args)
{
Splitter *self;
- PyObject *doc, *synstop = NULL;
+ PyObject *synstop = NULL;
+ PyObject *doc=NULL;
+ UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
UNLESS(PyArg_ParseTuple(args,"O|O",&doc,&synstop)) return NULL;
- UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
if(synstop)
{
@@ -457,11 +379,12 @@
}
else self->synstop=NULL;
- UNLESS(self->text = PyObject_Str(doc)) goto err;
- UNLESS(self->here=PyString_AsString(self->text)) goto err;
- self->end = self->here + PyString_Size(self->text);
+ UNLESS(self->list = PyUnicode_Split(doc,NULL,-1)) goto err;
+ UNLESS(self->text = doc) goto err;
+
self->index = -1;
return (PyObject*)self;
+
err:
Py_DECREF(self);
return NULL;