The Zope 2.1.6 ZCatalog was not bad but had a set of glitches. The Zope 2.2.1b1 ZCatalog (CVS from 2000-8-19) is much better but still has a set of glitches: * UnTextIndex(349) breaks if the lexicon lookup returns the empty tuple --> "index out of range". * UnTextIndex(349) only looks at the first wordindex returned by the lexicon lookup -- surely not what one expects if the lexicon is globbing. * GlobbingLexicon(178) breaks if the second pattern character is a wildcard. * GlobbingLexicon.GlobbingLexicon.translate anchors the resulting pattern at the end but not the start. This implies that a word matches the pattern (as defined by the lexicon) if it has a suffix that matches (as defined by "re.match"). These are quite strange semantics. Attached is a patch fixing these glitches. Dieter --- lib/python/SearchIndex/:GlobbingLexicon.py Sat Mar 25 18:42:32 2000 +++ lib/python/SearchIndex/GlobbingLexicon.py Sun Aug 20 12:46:42 2000 @@ -164,28 +164,42 @@ def get(self, pattern): """ Query the lexicon for words matching a pattern. + DM: Note: the complete word must mach *pattern*. + Formerly, a word was matched when it ended + in a match of *pattern*. This, probably, was + a bug. 
""" + # import pdb; pdb.set_trace() + wc_set = [self.multi_wc, self.single_wc] digrams = [] + no_wc= 1 # DM: optimization for i in range(len(pattern)): if pattern[i] in wc_set: + no_wc= 0 continue if i == 0: digrams.insert(i, (self.eow + pattern[i]) ) - digrams.append((pattern[i] + pattern[i+1])) - else: - try: - if pattern[i+1] not in wc_set: - digrams.append( pattern[i] + pattern[i+1] ) - - except IndexError: - digrams.append( (pattern[i] + self.eow) ) + try: + if pattern[i+1] not in wc_set: + digrams.append( pattern[i] + pattern[i+1] ) + else: no_wc= 0 + + except IndexError: + digrams.append( (pattern[i] + self.eow) ) + # DM: optimization + if no_wc and self.anchored: + # we have no wildcard; therefore we may look up the + # word directly + if self._lexicon.has_key(pattern): return (self._lexicon[pattern],) + else: return () + ## now get all of the intsets that contain the result digrams result = None @@ -248,6 +262,10 @@ return Splitter(astring) + # DM: optimization + # anchored: true means, the pattern must match the complete word + anchored= 1 + def translate(self, pat): """Translate a PATTERN to a regular expression. @@ -255,7 +273,7 @@ """ i, n = 0, len(pat) - res = '' + res = self.anchored and '^' or '' while i < n: c = pat[i] i = i+1 @@ -265,7 +283,7 @@ res = res + '.' 
else: res = res + re.escape(c) - return res + "$" + return self.anchored and res + "$" or res --- lib/python/SearchIndex/:UnTextIndex.py Sat Aug 19 19:27:01 2000 +++ lib/python/SearchIndex/UnTextIndex.py Sun Aug 20 12:43:22 2000 @@ -346,10 +346,15 @@ if len(src) == 1: src=src[0] if src[:1]=='"' and src[-1:]=='"': return self[src] - r = self._index.get(self.getLexicon(self._lexicon).get(src)[0], - None) - if r is None: r = {} - return ResultList(r, (src,), self) + r= None + for wordindex in self.getLexicon(self._lexicon).get(src): + rr = self._index.get(wordindex, None) + if rr is not None: + rr= ResultList(rr, (src,), self) + if r is None: r= rr + else: r= r.__or__(rr) + if r is None: r = ResultList({}, (src,), self) + return r r = None for word in src: