[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - Normalizer.py:1.1.2.1
Andreas Jung
andreas@digicool.com
Sun, 13 Jan 2002 13:21:32 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv12409
Added Files:
Tag: ajung-textindexng-branch
Normalizer.py
Log Message:
added
=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/Normalizer.py ===
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
__doc__ ="""
Word normalizer. A normalizer takes a word and translates its characters
according to a translation table. The functionality is similar to
string.translate() but also handles multi-character sequences.
A normalizer is typically used to translate accented characters to ASCII.
"""
from BTrees.OOBTree import OOBTree
from types import DictType, StringType
import re, os
# Directory containing this module. Translation table names are first looked
# up in its 'normalizers' subdirectory (see Normalizer.readTranslationTable).
_basedir = os.path.dirname(__file__)
class Normalizer:
    """Word normalizer.

    Holds a translation table mapping tokens (strings of one or more
    characters) to replacement strings and applies it to words.  Tokens
    are applied longest first, so a long token is replaced before any
    shorter token that may be contained in it.
    """

    def __init__(self, arg):
        """Create a normalizer.

        arg -- either a dictionary mapping tokens to replacements, or a
               string naming a translation table file (see
               readTranslationTable()).

        Raises ValueError if arg is neither a dictionary nor a string.
        """
        self.clear()

        if isinstance(arg, DictType):
            self._trans.update(arg)
        elif isinstance(arg, StringType):
            self._trans.update(self.readTranslationTable(arg))
        else:
            raise ValueError('Normalizer argument must be a dictionary or '
                             'the name of a translation table file, '
                             'got %r' % (arg,))

        # Always derive the replacement order from the final table.
        # Otherwise a normalizer built from a dictionary would have an
        # empty order and normalize() would silently do nothing.
        self._order = self._orderTokens(self._trans.keys())

    def __len__(self):
        """Return the number of entries in the translation table."""
        return len(self._trans)

    # The mapping accessors are real methods (rather than bound methods
    # captured once in __init__) so that they keep working on the current
    # table after clear() has replaced it.

    def keys(self):
        """Return the tokens of the translation table."""
        return self._trans.keys()

    def values(self):
        """Return the replacement strings of the translation table."""
        return self._trans.values()

    def items(self):
        """Return the (token, replacement) pairs of the translation table."""
        return self._trans.items()

    def has_key(self, key):
        """Return true if key is a token of the translation table."""
        return self._trans.has_key(key)

    def get(self, key, default=None):
        """Return the replacement for key, or default if key is unknown."""
        return self._trans.get(key, default)

    def clear(self):
        """Drop all translations and the replacement order."""
        self._trans = OOBTree()
        self._order = []

    def _orderTokens(self, tokens):
        """Return tokens as a list sorted longest first.

        Tokens of equal length are ordered by plain comparison so that
        the result does not depend on the order of the input.
        """
        # Decorate-sort-undecorate: sorts by descending length without
        # needing a comparison function.
        decorated = [(-len(token), token) for token in tokens]
        decorated.sort()
        return [token for (weight, token) in decorated]

    def normalize(self, word):
        """Return word with every known token replaced.

        Tokens are applied one after another in self._order (longest
        first); each one is replaced everywhere it occurs in the word.

        NOTE: this functionality *MUST* go into a C extension for
        performance reasons.
        """
        trans = self._trans
        for token in self._order:
            word = word.replace(token, trans[token])
        return word

    __call__ = normalize

    def readTranslationTable(self, fname):
        """Read a translation table and return it as a dictionary.

        fname is first looked up in the 'normalizers' directory next to
        this module; if it cannot be opened there, fname is opened as
        given.  Every non-blank line must consist of at least two
        whitespace separated fields: the token and its replacement
        (additional fields are ignored).

        As a side effect self._order is set to the tokens of the table
        that was read, longest first.

        Raises IOError if the file cannot be opened and ValueError if a
        non-blank line has fewer than two fields.
        """
        path = os.path.join(_basedir, 'normalizers', fname)
        try:
            f = open(path)
        except IOError:
            # Not available as a bundled table: use fname as it was given
            # and let a failure propagate to the caller.
            path = fname
            f = open(path)

        try:
            lines = f.readlines()
        finally:
            f.close()

        d = {}
        lineno = 0
        for line in lines:
            lineno = lineno + 1
            fields = line.split()
            if not fields:
                # Tolerate blank lines, e.g. a trailing newline.
                continue
            if len(fields) < 2:
                raise ValueError('%s, line %d: expected "<token> '
                                 '<replacement>", got %r'
                                 % (path, lineno, line))
            d[fields[0]] = fields[1]

        self._order = self._orderTokens(d.keys())
        return d