[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - BaseNormalizer.py:1.1.2.1 Setup:1.1.2.1 Normalizer.py:1.1.2.2
Andreas Jung
andreas@digicool.com
Sun, 3 Feb 2002 14:09:18 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv626
Modified Files:
Tag: ajung-textindexng-branch
Normalizer.py
Added Files:
Tag: ajung-textindexng-branch
BaseNormalizer.py Setup
Log Message:
added Normalizer
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/BaseNormalizer.py ===
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
__doc__ ="""
Word normalizer. A normalizer takes a word and translates its characters
according to a translation table. The functionality is similar to
string.translate() but allows translating multiple characters.
A normalizer is typically used to translate accented characters to ASCII.
"""
from BTrees.OOBTree import OOBTree
from types import ListType, StringType
import normalizer
import re, os
# Directory containing this module; readTable() looks for bundled
# translation tables in its 'normalizers' subdirectory.
_basedir = os.path.dirname(__file__)
class NormalizerException(Exception):
    """ raised when a normalizer is constructed from an unsupported type """
class BaseNormalizer:
    """ word normalizer

    Thin wrapper around normalizer.Normalizer. The translation table is
    either supplied directly as a list or read from a table file.
    """

    def __init__(self, arg):
        """ create a normalizer

        arg -- a list, handed unchanged to normalizer.Normalizer
               (presumably (source, replacement) pairs like the ones
               readTable() returns), or a string naming a table file
               (see readTable()).

        Raises NormalizerException for any other argument type.
        """

        if isinstance(arg, ListType):
            table = arg
        elif isinstance(arg, StringType):
            table = self.readTable(arg)
        else:
            # Call form instead of "raise X, 'msg'": same exception and
            # message, but also valid syntax on newer Python versions.
            raise NormalizerException(
                'Unknown type for normalizer constructor')

        self._normalizer = normalizer.Normalizer(table)

    def normalize(self, arg):
        """ normalize the string/sequence of strings """

        return self._normalizer.normalize(arg)

    __call__ = normalize

    def readTable(self, fname):
        """ read a translation table

        fname is first looked up in the 'normalizers' directory next to
        this module; if it cannot be opened there it is opened as given.
        The error of the second attempt propagates to the caller.

        File format: one mapping per line, whitespace-separated; the
        first field is the text to replace, the second its replacement,
        further fields are ignored. Lines starting with '#' and blank
        lines are skipped. A line with only one field raises IndexError.

        Returns a list of (source, replacement) tuples in file order.
        """

        try:
            f = open(os.path.join(_basedir, 'normalizers', fname))
        except IOError:
            # Not a bundled table: treat fname as a regular path.
            f = open(fname)

        # Always release the file handle, even if reading fails.
        try:
            lines = f.readlines()
        finally:
            f.close()

        table = []

        for line in lines:
            if line.startswith('#'):
                continue

            fields = line.split()
            if not fields:
                # Blank/whitespace-only line (e.g. trailing newline at
                # the end of the file) carries no mapping.
                continue

            table.append((fields[0], fields[1]))

        return table
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/Setup ===
*shared*
normalizer src/normalizer.c
=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/Normalizer.py 1.1.2.1 => 1.1.2.2 ===
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
-#
+#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
-#
+#
##############################################################################
-__doc__ ="""
-Word normalizer. A normalizer takes a word and translates its characters
-according to a translation table. The functionality is similiar to
-string.translate() but allows to translate multiple characters.
-A normalizer is typically used to translate accented characters to ASCII.
-"""
+__doc__=""" same as BaseStopWords but to be used in Zope """
-from BTrees.OOBTree import OOBTree
-from types import DictType, StringType
-import re, os
+from Persistence import Persistent
+from BaseNormalizer import BaseNormalizer
-_basedir = os.path.dirname(__file__)
-
-
-class Normalizer:
- """ word normalizer """
-
- def __init__(self, arg):
- self.clear()
-
- if isinstance(arg,DictType):
- self._trans.update(arg)
-
- elif isinstance(arg,StringType):
- self._trans.update (self.readTranslationTable(arg) )
-
- else:
- raise ValueError
-
- self.keys = self._trans.keys
- self.values = self._trans.values
- self.items = self._trans.items
- self.has_key = self._trans.has_key
- self.get = self._trans.get
-
- def __len__(self): return len(self._trans)
-
-
- def clear(self):
- self._trans = OOBTree()
- self._order = []
-
-
- def normalize(self, word):
- """ normalize the word using the given translation table. This
- functionality *MUST* go into a C extension for performance
- reasons !!!
- """
-
- for token in self._order:
- word = word.replace(token, self._trans[token])
-
- return word
-
- __call__ = normalize
-
-
- def readTranslationTable(self, fname):
- """ read a translation table """
-
- def __ordersort(a,b): return cmp(len(a),len(b))
-
- d = {}
-
- try:
- f = os.path.join(_basedir,'normalizers',fname)
- print f
- lines = open(f).readlines()
- except:
- try: lines = open(fname).readlines()
- except: raise
-
- for l in lines:
- l = l.strip()
- fields = l.split()
-
- d[fields[0]] = fields[1]
-
-
- self._order = d.keys()
- self._order.sort(__ordersort)
- self._order.reverse()
-
- return d
+class Normalizer(Persistent, BaseNormalizer):
+ pass