[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - TextIndexNG.py:1.2.2.8
Andreas Jung
andreas@zope.com
Tue, 8 Jan 2002 11:56:40 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv15264
Modified Files:
Tag: ajung-textindexng-branch
TextIndexNG.py
Log Message:
added support for proximity indexing
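Background: proximity indexing here means indexing a phonetic normal form of each word next to its exact form, so that queries can also match similar-sounding words. The commit wires this up through the compiled Proximity extension; as a rough illustration of the idea only, a simplified pure-Python soundex (not the extension's implementation) could look like this:

    # Simplified soundex sketch, illustrative only -- the commit uses
    # the compiled Proximity extension, not this pure-Python variant.
    SOUNDEX_CODES = {}
    for letters, digit in (('bfpv', '1'), ('cgjkqsxz', '2'), ('dt', '3'),
                           ('l', '4'), ('mn', '5'), ('r', '6')):
        for ch in letters:
            SOUNDEX_CODES[ch] = digit

    def soundex(word):
        """Return a 4-character soundex code for `word`."""
        word = word.lower()
        digits = [SOUNDEX_CODES.get(ch, '') for ch in word]
        code, prev = [], digits[0]
        for d in digits[1:]:
            if d and d != prev:
                code.append(d)
            prev = d
        return (word[0].upper() + ''.join(code) + '000')[:4]

    # 'Robert' and 'Rupert' share a code, so indexing the code lets
    # a query for either word find documents containing the other.
    assert soundex('Robert') == soundex('Rupert') == 'R163'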
=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexNG.py 1.2.2.7 => 1.2.2.8 ===
from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
from Products.PluginIndexes.TextIndex.GlobbingLexicon import GlobbingLexicon
-
from Products.PluginIndexes.TextIndex import Splitter
+from ProximityLexicon import ProximityLexicon
from types import IntType, StringType, UnicodeType, InstanceType
@@ -43,6 +43,7 @@
from TextIndexCommon import *
import Stemmer
+import Proximity
import Thesaurus, StopWords
@@ -71,6 +72,8 @@
self.id = id
+ debug(extra)
+
# reference to catalog (can we get rid of that ?)
self.catalog = caller
@@ -78,7 +81,7 @@
self.useSplitter = getattr(extra,'useSplitter', 'ZopeSplitter')
# name of stemmer or None
- self.useStemmer = getattr(extra,'useStemmer', None)
+ self.useStemmer = getattr(extra,'useStemmer', None) or None
# default operator to combine queries
self.useOperator = getattr(extra,'useOperator', 'and')
@@ -87,7 +90,7 @@
self.useGlobbing = getattr(extra,'useGlobbing', 1)
# lexicon to be used (name, reference or None(internal))
- self.lexicon = getattr(extra,'lexicon', None)
+ self.lexicon = getattr(extra,'lexicon', None) or None
# support near search: 1/0 (requires more storage)
self.useNearSearch = getattr(extra,'useNearSearch', 1)
@@ -96,19 +99,17 @@
self.nearDistance = getattr(extra,'nearDistance', 5)
# use proximity algorithm
- self.useProximity = getattr(extra,'useProximity', None)
+ self.useProximity = getattr(extra,'useProximity', None) or None
# storage of positions for near search ('internal','documentLookup')
self.nearStorage = getattr(extra,'nearStorage', 'internal')
# Stopwords: either filename or StopWord object
- self.stopWords = getattr(extra,'stopWords', None)
+ self.stopWords = getattr(extra,'stopWords', None) or None
# Thesaurus: either filename or StopWord object
- self.thesaurus = getattr(extra,'thesaurus', None)
+ self.thesaurus = getattr(extra,'thesaurus', None) or None
- if self.lexicon == 'None': self.lexicon = None
- if self.useStemmer == 'None': self.useStemmer = None
if not self.nearStorage in ('internal','documentLookup'):
raise ValueError,'nearStorage must be either "internal"'\
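All of these options come in on the `extra` record handed to the constructor. A hedged sketch of what a caller enabling proximity support might pass (the attribute names are taken from the getattr calls above; the record class and the exact constructor signature are assumptions, since neither appears in this hunk):

    # Illustrative only: a bare attribute holder standing in for the
    # 'extra' record a ZCatalog passes when adding an index.
    class Extra:
        pass

    extra = Extra()
    extra.useSplitter  = 'ZopeSplitter'  # splitter name
    extra.useProximity = 'soundex'       # only algorithm this revision accepts
    extra.nearStorage  = 'internal'      # keep word positions in the index
    extra.stopWords    = None            # filename, StopWords instance, or None

    # hypothetical call; the real signature is not shown in this diff
    index = TextIndexNG('SearchableText', extra=extra)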
@@ -122,7 +123,7 @@
self._IDX = IOBTree()
self._invIDX = IOBTree()
self._thesaurus = None
- self._stopwords = StopWords.StopWords()
+ self._stopwords = None
# Thesaurus
@@ -131,31 +132,54 @@
elif isinstance(self.thesaurus, InstanceType):
self._thesaurus = self.thesaurus
+
# StopWords
- if isinstance(self.stopWords, StringType):
- self._stopwords = StopWords.StopWords(self.stopWords)
- elif isinstance(self.stopWords, InstanceType):
- self._stopwords = self.stopWords
+ if self.stopWords:
+ if isinstance(self.stopWords, StringType):
+ self._stopwords = StopWords.StopWords(self.stopWords)
+ elif isinstance(self.stopWords, InstanceType):
+ self._stopwords = self.stopWords
+ else:
+ self._stopwords = {}
+
+
+ # Proximity
+
+ if self.useProximity:
+ self._PROX_LEX = ProximityLexicon(algorithm=self.useProximity)
+ self._PROX_IDX = IOBTree()
+ self._invPROX_IDX = IOBTree()
+
+ # The selection of the proximity function must be made more
+ # general in the future. This requires some more work on the
+ # Python Proximity extension.
+
+ if self.useProximity=='soundex':
+ self._v_proximityfunc = Proximity.soundex
+ else:
+ raise RuntimeError,'unsupported proximity algorithm'
+
# near Search
if self.nearStorage == 'internal':
- self.positions = self.positionsFromInternalStorage
- self.insertForwardEntry = self.insertForwardEntryInternal
+ self._v_positions = self.positionsFromInternalStorage
+ self._v_insertForwardEntry = self.insertForwardEntryInternal
else:
- self.positions = self.positionsFromDocumentLookup
- self.insertForwardEntry = self.insertForwardEntryDocumentLookup
+ self._v_positions = self.positionsFromDocumentLookup
+ self._v_insertForwardEntry = self.insertForwardEntryDocumentLookup
-
# get splitter function
- self._splitterfunc = self._stemmerfunc = None
+ self._v_splitterfunc = self._v_stemmerfunc = None
if self.useSplitter:
- self._splitterfunc = Splitter.getSplitter(self.useSplitter)
+ self._v_splitterfunc = Splitter.getSplitter(self.useSplitter)
+
# stemmer function
+
if self.useStemmer:
- self._stemmerfunc = Stemmer.Stemmer(self.useStemmer).stem
+ self._v_stemmerfunc = Stemmer.Stemmer(self.useStemmer).stem
if self.lexicon:
@@ -169,15 +193,16 @@
self._LEXICON = GlobbingLexicon()
debug('created new globbing lexicon')
- if self._stemmerfunc:
+ if self._v_stemmerfunc:
debug('stemming disabled because globbing enabled')
- self._stemmerfunc = None
+ self._v_stemmerfunc = None
else:
self._LEXICON = Lexicon()
debug('created new lexicon')
+ self._v_getWordIdList = self._LEXICON.getWordIdList
self._v_getWordId = self._LEXICON.getWordId
self._v_getWordById = self._LEXICON.getWord
self._v_getIdByWord = self._LEXICON.get
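The renames from `_splitterfunc`, `positions` and friends to `_v_`-prefixed names follow the ZODB convention: on a `Persistent` object, attributes whose names start with `_v_` are volatile -- they are never written to the database and disappear when the object is ghosted, which is what you want for unpicklable things like bound splitter, stemmer and lexicon methods. A minimal sketch of the convention (Zope 2 era import path):

    from Persistence import Persistent  # Zope 2 import path

    class Example(Persistent):
        def __init__(self):
            self.useSplitter = 'ZopeSplitter'  # persisted with the object
            self._v_splitterfunc = None        # volatile: dropped on ghosting,
                                               # rebuilt after each load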
@@ -206,41 +231,83 @@
idx[documentId].update(widLst)
- def insertForwardEntryInternal(self,wordId,pos,documentId):
+ def insertForwardEntryInternal(self,wordIds,pos,documentId):
""" insert entries for forward index. This function stores
the word positions internally.
+
+ wordIds is either a single integer or a list of integers
"""
# self._IDX is a mapping:
# wordId -> documentId -> [positions]
-
+
idx = self._IDX
- if idx.has_key(wordId) == 0:
- idx[wordId] = IOBTree()
+ _single = 0
+ if isinstance(wordIds,IntType):
+ wordIds = [wordIds]
+ _single = 1
- tree = idx[wordId]
- if tree.has_key(documentId) == 0:
- tree[documentId] = IISet()
+ for i in range(len(wordIds)):
+ wordId = wordIds[i]
+
+ if idx.has_key(wordId) == 0:
+ idx[wordId] = IOBTree()
- tree[documentId].insert(pos)
+ tree = idx[wordId]
+ if tree.has_key(documentId) == 0:
+ tree[documentId] = IISet()
+ if _single:
+ tree[documentId].insert(pos)
+ else:
+ tree[documentId].insert(i)
- def insertForwardEntryDocumentLookup(self,wordId,pos,documentId):
+
+ def insertForwardEntryDocumentLookup(self,wordIds,pos,documentId):
""" insert entries for forward index. This function does not store
word positions. Word positions are calculated when document is in the
hitlist.
+
+ wordIds is either a single integer or a list of integers
"""
# self._IDX is a mapping:
- # wordId -> documentId -> [positions]
+ # wordId -> documentId
idx = self._IDX
- if idx.has_key(wordId) == 0:
- idx[wordId] = IISet()
+ if isinstance(wordIds,IntType): wordIds = [wordIds]
+
+ for wordId in wordIds:
+
+ if idx.has_key(wordId) == 0:
+ idx[wordId] = IISet()
- idx[wordId].insert(documentId)
+ idx[wordId].insert(documentId)
+
+
+ def insertProximityEntries(self,wordIds,documentId):
+ """ insert forward *and* backword entries for proximity indexes """
+
+ idx = self._PROX_IDX
+ invidx = self._invPROX_IDX
+
+ if isinstance(wordIds,IntType): wordIds = [wordIds]
+
+ for wordId in wordIds:
+
+ if idx.has_key(wordId) == 0:
+ idx[wordId] = IISet()
+
+ idx[wordId].insert(documentId)
+
+
+ if invidx.has_key(documentId)==0:
+ invidx[documentId] = IISet(wordIds)
+ else:
+ invidx[documentId].update(wordIds)
+
def _printIndex(self):
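insertProximityEntries keeps two mirrored mappings: `_PROX_IDX` (proximity wordId -> documentIds) for querying and `_invPROX_IDX` (documentId -> proximity wordIds) for unindexing. A plain-dict sketch of the same behaviour (the real code uses IOBTree/IISet from the BTrees package for ZODB-friendly storage):

    # Plain-dict sketch of the two proximity mappings above.
    prox_idx = {}      # wordId     -> set of documentIds (forward)
    inv_prox_idx = {}  # documentId -> set of wordIds     (backward)

    def insert_proximity_entries(word_ids, document_id):
        for word_id in word_ids:
            prox_idx.setdefault(word_id, set()).add(document_id)
        inv_prox_idx.setdefault(document_id, set()).update(word_ids)

    insert_proximity_entries([10, 42], 7)
    assert prox_idx[42] == set([7])
    assert inv_prox_idx[7] == set([10, 42])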
@@ -279,34 +346,43 @@
# For performance reasons it might be better if the
# splitter returned the list of split words.
- words = self._splitterfunc(source,encoding=encoding)
+ words = self._v_splitterfunc(source,encoding=encoding).split()
- # we collect all wordIds for performance reasons in a list
- # and update the backward index once instead of inserting
- # every single wordId
+ # apply stopwords list
+ # Maybe this should go into a C extension for performance reasons
- widLst = []
isStopWord = self._stopwords.has_key
+ words = filter(lambda x,f=isStopWord: f(x)==0, words)
- pos = 0
- for word in words:
- if isStopWord(word): continue
+ # Check if we want proximity searches. If so, we need to create
+ # a list containing the proximity representations of the words
- # stem the single word
- if self._stemmerfunc:
- word = self._stemmerfunc(word)
+ if self.useProximity:
+ proximity_words = self._v_proximityfunc(words)
+ proximity_widList = self._PROX_LEX.getWordIdList(proximity_words)
+ assert len(proximity_words)==len(proximity_widList)
- # get (new) wordId for word
- wid = self._v_getWordId(word)
- widLst.append(wid)
+ self.insertProximityEntries(proximity_widList,documentId)
+
+ # Stem all words in one run
- # and insert the wordId, its position and the documentId
- # in the index
- self.insertForwardEntry(wid,pos,documentId)
+ if self._v_stemmerfunc:
+ words = self._v_stemmerfunc(words)
- pos+=1
+ # we collect all wordIds for performance reasons in a list
+ # and update the backward index once instead of inserting
+ # every single wordId
+
+
+ widLst = self._v_getWordIdList(words)
+ assert len(widLst)==len(words)
+ # insert forward entries
+ self._v_insertForwardEntry(widLst,None,documentId)
+
+ # insert backward entries
self.insertBackwardEntries(widLst,documentId)
return len(widLst)
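The rewritten indexing body now batches every stage instead of looping word by word: split, filter stopwords, stem the whole list in one call, resolve all wordIds at once via `getWordIdList`, then do the forward and backward inserts. A condensed sketch of that flow (the splitter, stemmer and lexicon arguments stand in for the objects wired up in `__init__`):

    def index_words(source, split, stopwords, stem, lexicon):
        words = split(source)                             # 1. split source text
        words = [w for w in words if w not in stopwords]  # 2. drop stopwords
        if stem is not None:
            words = stem(words)                           # 3. stem in one run
        wids = lexicon.getWordIdList(words)               # 4. batch wordId lookup
        assert len(wids) == len(words)
        return wids                            # 5. feed forward/backward inserts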
@@ -353,8 +429,8 @@
# We need to stem
- if self._stemmerfunc:
- word = self._stemmerfunc(word)
+ if self._v_stemmerfunc:
+ word = self._v_stemmerfunc(word)
wids = self._v_getIdByWord(word)
@@ -371,7 +447,6 @@
for docId in res:
r[docId] = IISet()
-
else:
r={}
@@ -445,7 +520,7 @@
# Split retrieved document and obtain list of word positions
- SP = self._splitterfunc(data)
+ SP = self._v_splitterfunc(data)
for word in words:
posLst = SP.indexes(word)
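For the 'documentLookup' storage mode, positions are reconstructed at query time: the stored document text is re-split and each query word's positions are collected from the splitter result, which is roughly what `SP.indexes(word)` does here. A sketch of that lookup:

    # Sketch of a position lookup over a split word list; assumes the
    # splitter result behaves like a sequence of words.
    def indexes(words, word):
        """Return every position of `word` in the split word list."""
        return [i for i, w in enumerate(words) if w == word]

    assert indexes(['the', 'quick', 'fox', 'the'], 'the') == [0, 3]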