[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/src - normalizer.c:1.1.2.3

Andreas Jung andreas@digicool.com
Sat, 9 Feb 2002 10:28:36 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/src
In directory cvs.zope.org:/tmp/cvs-serv1404/src

Modified Files:
      Tag: ajung-textindexng-branch
	normalizer.c 
Log Message:
Yet another rewrite:
- added an optional 'encoding' parameter to override the default encoding
- all strings are now converted to Unicode according to the encoding parameter
  (default: latin1).


=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/normalizer.c 1.1.2.2 => 1.1.2.3 ===
 
-
 typedef struct
 {
     PyObject_HEAD
     PyObject * table;
+    char *encoding;
 }
 normalizer;
 
@@ -16,36 +16,53 @@
     PyMem_DEL(self);
 }
 
-static PyUnicodeObject *NormalizeWord(normalizer *self,PyUnicodeObject *word) {
+static PyObject *NormalizeWord(normalizer *self,PyObject *word)
+{
 
-    PyUnicodeObject *temp;
     int i;
+    PyObject *temp;
 
-    temp = (PyUnicodeObject *) PyUnicode_FromUnicode(word->str,word->length);
+    if (PyString_Check(word)) {
+        if (! (temp = PyUnicode_FromEncodedObject(word,self->encoding,"strict"))) {
+            PyErr_SetString(PyExc_UnicodeError,"unicode conversion failed");
+            return NULL;
+        }
+    }  else  {
+        temp = PyUnicode_FromObject(word);
+    }
 
     for (i=0; i<PyList_GET_SIZE(self->table); i++) {
-        PyUnicodeObject *s;
-        PyObject *item, *key, *value;
+        PyObject *s, *item, *key, *value;
 
         item = PyList_GetItem(self->table, i);
 
         key   = PyTuple_GetItem(item,0);
         value = PyTuple_GetItem(item,1);
 
-        s = (PyUnicodeObject *) PyUnicode_Replace((PyObject *) temp, key, value, -1);
+        if (PyString_Check(key))
+            key = PyUnicode_FromEncodedObject(key, self->encoding,"strict");
+
+        if (PyString_Check(value))
+            value = PyUnicode_FromEncodedObject(value, self->encoding,"strict");
+
+        if (! (s = PyUnicode_Replace( temp, key, value, -1)))
+            return NULL;
+
         Py_DECREF(temp);
+
         temp = s;
     }
 
     return temp;
 }
 
-static PyObject *normalize(normalizer *self, PyObject*args)
+static PyObject *normalize(normalizer *self, PyObject *args)
 {
     int j;
     PyObject * data=NULL ;
 
-    if (! (PyArg_ParseTuple(args,"O", &data))) return NULL;
+    if (! (PyArg_ParseTuple(args,"O", &data)))
+        return NULL;
 
     if (PyList_Check(data)) {
 
@@ -54,47 +71,25 @@
         list = PyList_New(0);
 
         for (j=0; j<PyList_Size(data); j++) {
-            PyUnicodeObject *word=NULL,*item=NULL;
-
-            item = (PyUnicodeObject *) PyList_GetItem(data,j);
-
-            if (PyUnicode_Check(item)) {
-                word = NormalizeWord(self,item);
-            } else if (PyString_Check(item)) {
-                PyUnicodeObject *unicodeword;
-
-                unicodeword = (PyUnicodeObject *) PyUnicode_FromObject((PyObject *) item);
+            PyObject *word=NULL,*item=NULL;
 
-                word = NormalizeWord(self,unicodeword);
+            item = PyList_GetItem(data,j);
 
-                Py_DECREF(unicodeword);
-            }
+            word = NormalizeWord(self, item);
 
             PyList_Append(list, (PyObject *) word);
-            Py_DECREF(word);
         }
 
         Py_DECREF(data);
 
         return list;
 
-    } else if (PyUnicode_Check(data)) {
+    } else if (PyUnicode_Check(data) || PyString_Check(data) ) {
 
-        PyUnicodeObject *word;
+        PyObject *word;
 
-        word = NormalizeWord(self, (PyUnicodeObject *) data);
-
-        return (PyObject *) word;
-
-    } else if (PyString_Check(data)) {
-
-        PyUnicodeObject *word,*unicodeword;
-
-        unicodeword = (PyUnicodeObject *) PyUnicode_FromObject(data);
-
-        word = NormalizeWord(self,unicodeword);
-
-        Py_DECREF(unicodeword);
+        if (! (word = NormalizeWord(self,data)))
+            return NULL;
 
         return (PyObject *) word;
 
@@ -125,7 +120,6 @@
         }
 
         if (PyTuple_Size(item) != 2) {
-
             PyErr_SetString(PyExc_TypeError,"nested arguments must be 2-tuples of strings/unicode strings");
             goto err;
         }
@@ -171,40 +165,43 @@
 static char normalizerType__doc__[] = "normalizer object";
 
 static PyTypeObject normalizerType = {
-    PyObject_HEAD_INIT(NULL)
-    0,                            /*ob_size*/
-    "normalizer",                 /*tp_name*/
-    sizeof(normalizer),           /*tp_basicsize*/
-    0,                            /*tp_itemsize*/
-    /* methods */
-    (destructor)normalizer_dealloc,  /*tp_dealloc*/
-    (printfunc)0,                 /*tp_print*/
-    (getattrfunc)normalizer_getattr, /*tp_getattr*/
-    (setattrfunc)0,               /*tp_setattr*/
-    (cmpfunc)0,                   /*tp_compare*/
-    (reprfunc)0,                  /*tp_repr*/
-    0,                            /*tp_as_number*/
-    0,                            /*tp_as_sequence*/
-    0,                            /*tp_as_mapping*/
-    (hashfunc)0,                  /*tp_hash*/
-    (ternaryfunc)0,               /*tp_call*/
-    (reprfunc)0,                  /*tp_str*/
-
-    /* Space for future expansion */
-    0L,0L,0L,0L,
-    normalizerType__doc__            /* Documentation string */
-};
+                                         PyObject_HEAD_INIT(NULL)
+                                         0,                            /*ob_size*/
+                                         "normalizer",                 /*tp_name*/
+                                         sizeof(normalizer),           /*tp_basicsize*/
+                                         0,                            /*tp_itemsize*/
+                                         /* methods */
+                                         (destructor)normalizer_dealloc,  /*tp_dealloc*/
+                                         (printfunc)0,                 /*tp_print*/
+                                         (getattrfunc)normalizer_getattr, /*tp_getattr*/
+                                         (setattrfunc)0,               /*tp_setattr*/
+                                         (cmpfunc)0,                   /*tp_compare*/
+                                         (reprfunc)0,                  /*tp_repr*/
+                                         0,                            /*tp_as_number*/
+                                         0,                            /*tp_as_sequence*/
+                                         0,                            /*tp_as_mapping*/
+                                         (hashfunc)0,                  /*tp_hash*/
+                                         (ternaryfunc)0,               /*tp_call*/
+                                         (reprfunc)0,                  /*tp_str*/
+
+                                         /* Space for future expansion */
+                                         0L,0L,0L,0L,
+                                         normalizerType__doc__            /* Documentation string */
+                                     };
+
 
+static char *normalizer_args[]={"translation","encoding",NULL};
 
 static PyObject *
 newnormalizer(PyObject *modinfo, PyObject *args, PyObject *keywds)
 {
     normalizer *self=NULL;
     PyObject *table;
+    static char * encoding = "latin1";
 
-
-    if (! (PyArg_ParseTuple(args,"O", &table)))
+    if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|s",normalizer_args,&table,&encoding)))
         return NULL;
+
     if (! checkList(table))
         return NULL;
 
@@ -212,6 +209,8 @@
         return NULL;
 
     self->table = table;
+    self->encoding = encoding;
+
     Py_INCREF(self->table);
 
     return (PyObject*)self;
@@ -219,7 +218,7 @@
 
 static struct PyMethodDef normalizer_module_methods[] =
     {
-        { "Normalizer", (PyCFunction)newnormalizer, METH_VARARGS,
+        { "Normalizer", (PyCFunction)newnormalizer, METH_VARARGS|METH_KEYWORDS,
             "Normalizer(list) " "-- Normalizer module - takes a list of 2-tuples of strings/unicode strings"
         },
         { NULL, NULL }