[Zope-CVS] CVS: Products/ZCTextIndex - Lexicon.py:1.14

Guido van Rossum guido@python.org
Wed, 22 May 2002 15:34:45 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv614

Modified Files:
	Lexicon.py 
Log Message:
Add full globbing.  This implements * and ? like in the shell,
but the pattern may not begin with a glob character (else
someone specifying "*" as the pattern can tie up the CPU for
a long time).


=== Products/ZCTextIndex/Lexicon.py 1.13 => 1.14 ===
 from BTrees.IOBTree import IOBTree
 from BTrees.OIBTree import OIBTree
+
 from Products.ZCTextIndex.ILexicon import ILexicon
 from Products.ZCTextIndex.StopDict import get_stopdict
-from PipelineFactory import splitter_factory, element_factory
+from Products.ZCTextIndex.ParseTree import QueryError
+from Products.ZCTextIndex.PipelineFactory import \
+     splitter_factory, element_factory
 
 class Lexicon:
 
@@ -78,7 +81,7 @@
         return last
 
     def isGlob(self, word):
-        return "*" in word
+        return "*" in word or "?" in word
 
     def get_word(self, wid):
         return self._words[wid]
@@ -87,17 +90,41 @@
         return self._wids.get(word, 0)
 
     def globToWordIds(self, pattern):
-        # This currently only knows about trailing *;
-        # whatever splitter you use should match this
-        assert pattern.endswith("*")
-        prefix = pattern[:-1]
-        assert prefix and not prefix.endswith("*")
+        # Implement * and ? just as in the shell, except the pattern
+        # must not start with either of these
+        prefix = ""
+        while pattern and pattern[0] not in "*?":
+            prefix += pattern[0]
+            pattern = pattern[1:]
+        if not pattern:
+            # There were no globbing characters in the pattern
+            wid = self._wids.get(prefix, 0)
+            if wid:
+                return [wid]
+            else:
+                return []
+        if not prefix:
+            # The pattern starts with a globbing character.
+            # This is too efficient, so we raise an exception.
+            raise QueryError(
+                "pattern %r shouldn't start with glob character" % pattern)
+        pat = prefix
+        for c in pattern:
+            if c == "*":
+                pat += ".*"
+            elif c == "?":
+                pat += "."
+            else:
+                pat += re.escape(c)
+        pat += "$"
+        prog = re.compile(pat)
         keys = self._wids.keys(prefix) # Keys starting at prefix
         wids = []
         for key in keys:
             if not key.startswith(prefix):
                 break
-            wids.append(self._wids[key])
+            if prog.match(key):
+                wids.append(self._wids[key])
         return wids
 
     def _getWordIdCreate(self, word):
@@ -128,7 +155,7 @@
 
     import re
     rx = re.compile(r"\w+")
-    rxGlob = re.compile(r"\w+\*?") # See globToWordIds() above
+    rxGlob = re.compile(r"\w+[\w*?]*") # See globToWordIds() above
 
     def process(self, lst):
         result = []