[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - TextIndexNG.py:1.2.2.27
Andreas Jung
andreas@digicool.com
Tue, 12 Feb 2002 14:29:30 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv27494
Modified Files:
Tag: ajung-textindexng-branch
TextIndexNG.py
Log Message:
- additional checks for parameters
- minor API cleanup
- preliminary support for NEAR search
=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexNG.py 1.2.2.26 => 1.2.2.27 ===
-class QueryException(Exception): pass
+class TextIndexNGException(Exception): pass
class TextIndexNG(PluggableIndex.PluggableIndex, Persistent,
@@ -91,25 +91,36 @@
'help': ('TextIndex','TextIndex_Settings.stx')},
)
+ _all_options = ('useSplitter','splitterMaxLen','splitterIndexNumbers',
+ 'splitterSingleChars','splitterCasefolding','useStemmer','useOperator',
+ 'useGlobbing','lexicon','nearDistance','useProximity','nearSearch',
+ 'stopWords','thesaurus','characterMapping'
+ )
+
query_options = ("query","operator")
_valid_default_operators = ('and','or')
def __init__(self
, id
- , extra= None
- , caller = None
- , verbose = 0
- , timed_statistics = 0
+ , extra= None # record with all parameters
+ , caller = None # ZCatalog instance (old crap)
+ , verbose = 0 # verbose mode
+ , timed_statistics = 0 # timed statistics
):
- self.id = id
-
debug(extra)
+ # check parameters
+ for k in dir(extra):
+ if not k in self._all_options:
+ raise TextIndexNGException,'unknown parameter "%s"' % k
# reference to catalog (can we get rid of that ?)
self.catalog = caller
+
+ # our own Id
+ self.id = id
# name of splitter
self.useSplitter = getattr(extra,'useSplitter', 'ZopeSplitter')
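
The new parameter check walks dir(extra) and rejects any attribute that is not listed in _all_options. A minimal sketch of how that plays out, using a hypothetical Extra record (the real record comes from the ZMI add form):

    class Extra:
        pass                          # plain record, attributes set ad hoc

    extra = Extra()
    extra.useSplitter = 'ZopeSplitter'
    extra.nearDistnce = 5             # misspelled option name

    # same check as in __init__ above
    for k in dir(extra):
        if not k in TextIndexNG._all_options:
            raise TextIndexNGException, 'unknown parameter "%s"' % k
    # -> TextIndexNGException: unknown parameter "nearDistnce"
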
@@ -138,9 +149,6 @@
# lexicon to be used (name, reference or None(internal))
self.lexicon = getattr(extra,'lexicon', None) or None
- # default maximum distance for words with near search
- self.nearDistance = getattr(extra,'nearDistance', 5)
-
# use proximity algorithm
self.useProximity = getattr(extra,'useProximity', None) or None
@@ -148,34 +156,40 @@
self.nearSearch = getattr(extra,'nearSearch', None)
if self.nearSearch=='None': self.nearSearch = None
+ # default maximum distance for words with near search
+ self.nearDistance = getattr(extra,'nearDistance', 5)
+
# Stopwords: either filename or StopWord object
self.stopWords = getattr(extra,'stopWords', None) or None
# Thesaurus: either filename or StopWord object
self.thesaurus = getattr(extra,'thesaurus', None) or None
- # Thesaurus: either filename or StopWord object
- self.thesaurus = getattr(extra,'thesaurus', None) or None
-
# Normalizer: characterMapping
self.charMapping = getattr(extra,'characterMapping', None) or None
-
if not self.nearSearch in (None,'internal','documentLookup'):
raise ValueError,'nearSearch must be either None, "internal"'\
' or "documentLookup"'
-
- if verbose: TextIndexCommon.debug_on()
- else: TextIndexCommon.debug_off()
+ if verbose: self.debugOn()
+ else: self.debugOff()
self.timed_statistics = timed_statistics
self.clear()
-
+
+
+ def debugOn(self): TextIndexCommon.debug_on()
+ def debugOff(self): TextIndexCommon.debug_off()
+
+ def timedStatsOn(self): self.timed_statistics = 1
+ def timedStatsOff(self): self.timed_statistics = 0
def clear(self):
+ self.__OBJECTS = IOBTree() # mapping RID->object reference
+
self._IDX = IOBTree()
self._invIDX = IOBTree()
self._thesaurus = None
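
clear() now also resets __OBJECTS, an IOBTree keyed by integer record id. For reference, a quick sketch of the standard Zope BTrees types used in this class:

    from BTrees.IOBTree import IOBTree   # int key -> arbitrary object
    from BTrees.IIBTree import IISet     # set of ints (docIds, positions)

    objects = IOBTree()
    objects[42] = 'some document'        # RID -> object reference

    positions = IISet([3, 17, 54])       # word positions in one document
    positions.insert(60)
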
@@ -387,6 +401,11 @@
def index_object(self, documentId, obj, threshold=None):
+ # HACK !
+ # We store references to the object for testing purposes
+ # only. A later implementation must be more clever
+
+ self.__OBJECTS[documentId] = obj
T = Timer(self.timed_statistics)
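
The stored reference is read back later by getDataFromObject(), which fetches the indexed attribute via getattr(obj, self.id). A hypothetical round trip (the Doc class is made up for illustration):

    class Doc:
        def __init__(self, text):
            self.text = text             # attribute name == index id

    # assuming an index instance created with id 'text'
    index.index_object(42, Doc('the quick brown fox'))
    index.getDataFromObject(42)          # -> 'the quick brown fox'
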
@@ -529,7 +548,14 @@
res = self.query( q )
- return res
+ # this should be fixed
+ bucket = IIBucket()
+
+ for docId in res[0]:
+ bucket[ docId ] = 1
+
+ print bucket
+ return res[0], (self.id, )
def query(self, q):
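
For reference, _apply_index() may hand ZCatalog either an IISet of docIds or an IIBucket mapping docId to a relevance score; the bucket built above simply scores every hit with 1. The same conversion in isolation:

    from BTrees.IIBTree import IISet, IIBucket

    def docids_to_bucket(docIds):
        # constant dummy score for every matching document
        bucket = IIBucket()
        for docId in docIds:
            bucket[docId] = 1
        return bucket

    hits = docids_to_bucket(IISet([3, 8, 15]))
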
@@ -552,6 +578,7 @@
debug('Result:',res)
# Bah....this sucks
+
return res.docIds(), (self.id,)
@@ -601,7 +628,7 @@
debug("ProximityLexionLookup: ",word)
if not self.useProximity:
- raise QueryException, 'proximity search is not enabled'
+ raise TextIndexNGException, 'proximity search is not enabled'
# Lookup list of wordIds (usually should contain only *one*)
@@ -668,80 +695,91 @@
def txNear(self, *sets):
- """ perform near intersection of two ResultSets """
+ """ perform near search on results sets """
+
+ # One resultset consists of an IISet() of documentIds and a
+ # tuple whose first element is the word (from LexiconLookup())
+ #
- # first we perform an intersection
+ # First we perform an intersection to get the documentIds of
+ # those documents that contain all the words
r = self.txIntersection(*sets)
+ docIds = r.docIds()
- # XXX: Near search ...
+ # Now we determine for every document the positions of all
+ # the words inside the document. Then we compare all the
+ # positions to determine whether the words are near each other
- return
+ words = []
+ for set in sets: words.append(set.words()[0])
-
+ res_docIds = IISet()
- def positionsFromDocumentLookup(self,docId, words):
- """ search all positions for a list of words for
- a given document given by its documentId.
- positions() returns a mapping word to
- list of positions of the word inside the document.
- """
+ for docId in docIds:
- debug('searching positions docid: %s, words: %s' % (docId,words))
+ posMap = self.positionsFromDocumentLookup(docId, words)
- res = OOBTree()
+ debug("Position Map for NEAR:")
+ for k,v in posMap.items(): debug("\t'%s'" % k,":","'%s'" % v)
- # obtain object from ZCatalog
- # this implementation must be changed for ZCatalog usage
- # (for testing purposes we read from the filesystem)
+ if self.checkPositionMap( posMap):
+ res_docIds.insert(docId)
- uid = self.catalog.paths[docId]
- data = open(uid).read() # CHANGE THAT !!!
+ r = ResultSet( docIds, words)
+ debug("\treturn: ",r)
- # Split retrieved document and obtain list of word positions
+ return r
+
+ def checkPositionMap(self, posMap):
+ """ check if a PositionMap represents a valid match for
+ a near search
+ """
- SP = self._v_splitterfunc(data)
+ # a posMap is a mapping from word to an IISet() of positions
+ # of that word inside one document
- for word in words:
- posLst = SP.indexes(word)
+ # to be written
- res[word] = IISet(posLst)
-
- for k,v in res.items():
- debug(k,v)
+ return 1
- return res
+ # THIS IS A BAD BAD HACK !
+ def getDataFromObject(self, docId):
+ obj = self.__OBJECTS[docId]
+ data = getattr(obj,self.id)
+ return data
- def positionsFromInternalStorage(self,docId, words):
+
+ def positionsFromDocumentLookup(self,docId, words):
""" search all positions for a list of words for
a given document given by its documentId.
positions() returns a mapping word to
list of positions of the word inside the document.
"""
-
- debug('searching positions docid: %s, words: %s' % (docId,words))
- res = OOBTree()
+ debug('searching positions docid: %s, words: %s' % (docId,words))
- for w in words:
+ res = OOBTree()
- if isinstance(w,IntType):
- wid = w
- word = self._v_getWordById(w)
- else:
- wid = self._v_getIdByWord(w)[0]
- word = w
+ # obtain object from ZCatalog
+ # This is a bad hack !
+ data = self.getDataFromObject( docId )
- if self._IDX[wid].has_key(docId):
- res[w] = self._IDX[wid][docId]
+ # Split retrieved document and obtain list of word positions
+ SP = self._v_splitterfunc(data)
- for k,v in res.items():
- debug(k,v)
+ for word in words:
+
+ posLst = SP.indexes(word)
+ res[word] = IISet(posLst)
+
+ for k,v in res.items(): debug(k,':',v)
return res
+
def numObjects(self):
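
checkPositionMap() above is still a stub that accepts everything. One possible sketch of the missing position test, assuming posMap maps each query word to an IISet of its positions and that consecutive query words must occur within nearDistance of each other (an illustration only, not the branch's implementation):

    def checkPositionMap(posMap, nearDistance=5):
        words = posMap.keys()
        for i in range(len(words) - 1):
            # look for at least one pair of positions no more
            # than nearDistance apart for each pair of words
            found = 0
            for p1 in posMap[words[i]]:
                for p2 in posMap[words[i+1]]:
                    if abs(p1 - p2) <= nearDistance:
                        found = 1
                        break
                if found:
                    break
            if not found:
                return 0
        return 1

Note that an OOBTree returns its keys in sorted order, not in query order, so a real implementation would need to carry the original word order along.
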