[Zope-dev] ZCatalog glitches
Dieter Maurer
dieter@handshake.de
Sun, 20 Aug 2000 13:06:46 +0200
The Zope 2.1.6 ZCatalog was not bad but had a set of glitches.
The Zope 2.2.1b1 ZCatalog (CVS from 2000-8-19) is much
better but still has a set of glitches:
* UnTextIndex(349) breaks if the lexicon lookup returns the
empty tuple --> "index out of range".
* UnTextIndex(349) only looks at the first wordindex returned
by the lexicon lookup -- surely not what one expects
if the lexicon is globbing.
* GlobbingLexicon(178) breaks if the second pattern character
is a wildcard.
* GlobbingLexicon.GlobbingLexicon.translate
anchors the resulting pattern at the end but not
the start.
This implies that a word matches the pattern (as defined by the lexicon) if
it has a suffix that matches (as defined by "re.match").
These are quite strange semantics.
Attached is a patch fixing these glitches.
Dieter
--- lib/python/SearchIndex/:GlobbingLexicon.py Sat Mar 25 18:42:32 2000
+++ lib/python/SearchIndex/GlobbingLexicon.py Sun Aug 20 12:46:42 2000
@@ -164,28 +164,42 @@
def get(self, pattern):
""" Query the lexicon for words matching a pattern.
+ DM: Note: the complete word must match *pattern*.
+ Formerly, a word was matched when it ended
+ in a match of *pattern*. This, probably, was
+ a bug.
"""
+ # import pdb; pdb.set_trace()
+
wc_set = [self.multi_wc, self.single_wc]
digrams = []
+ no_wc= 1 # DM: optimization
for i in range(len(pattern)):
if pattern[i] in wc_set:
+ no_wc= 0
continue
if i == 0:
digrams.insert(i, (self.eow + pattern[i]) )
- digrams.append((pattern[i] + pattern[i+1]))
- else:
- try:
- if pattern[i+1] not in wc_set:
- digrams.append( pattern[i] + pattern[i+1] )
-
- except IndexError:
- digrams.append( (pattern[i] + self.eow) )
+ try:
+ if pattern[i+1] not in wc_set:
+ digrams.append( pattern[i] + pattern[i+1] )
+ else: no_wc= 0
+
+ except IndexError:
+ digrams.append( (pattern[i] + self.eow) )
+ # DM: optimization
+ if no_wc and self.anchored:
+ # we have no wildcard; therefore we may look up the
+ # word directly
+ if self._lexicon.has_key(pattern): return (self._lexicon[pattern],)
+ else: return ()
+
## now get all of the intsets that contain the result digrams
result = None
@@ -248,6 +262,10 @@
return Splitter(astring)
+ # DM: optimization
+ # anchored: true means, the pattern must match the complete word
+ anchored= 1
+
def translate(self, pat):
"""Translate a PATTERN to a regular expression.
@@ -255,7 +273,7 @@
"""
i, n = 0, len(pat)
- res = ''
+ res = self.anchored and '^' or ''
while i < n:
c = pat[i]
i = i+1
@@ -265,7 +283,7 @@
res = res + '.'
else:
res = res + re.escape(c)
- return res + "$"
+ return self.anchored and res + "$" or res
--- lib/python/SearchIndex/:UnTextIndex.py Sat Aug 19 19:27:01 2000
+++ lib/python/SearchIndex/UnTextIndex.py Sun Aug 20 12:43:22 2000
@@ -346,10 +346,15 @@
if len(src) == 1:
src=src[0]
if src[:1]=='"' and src[-1:]=='"': return self[src]
- r = self._index.get(self.getLexicon(self._lexicon).get(src)[0],
- None)
- if r is None: r = {}
- return ResultList(r, (src,), self)
+ r= None
+ for wordindex in self.getLexicon(self._lexicon).get(src):
+ rr = self._index.get(wordindex, None)
+ if rr is not None:
+ rr= ResultList(rr, (src,), self)
+ if r is None: r= rr
+ else: r= r.__or__(rr)
+ if r is None: r = ResultList({}, (src,), self)
+ return r
r = None
for word in src: