[Zope-CVS] CVS: Products/ZCTextIndex - IIndex.py:1.1.2.7 ILexicon.py:1.1.2.6 IQueryParser.py:1.1.2.6 Index.py:1.1.2.40 Lexicon.py:1.1.2.10 ParseTree.py:1.1.2.4 QueryParser.py:1.1.2.19

Guido van Rossum guido@python.org
Mon, 13 May 2002 18:17:12 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv7094

Modified Files:
      Tag: TextIndexDS9-branch
	IIndex.py ILexicon.py IQueryParser.py Index.py Lexicon.py 
	ParseTree.py QueryParser.py 
Log Message:
Added globbing or pattern searches.  Currently only a single trailing
star is supported (e.g. ``foo*'').  There are some assumptions on case
insensitivity hidden in the glob code.  There are no unit tests for
the globbing search execution currently.  (There are none for phrase
searching either.  This is all my fault.  But it does work.)


=== Products/ZCTextIndex/IIndex.py 1.1.2.6 => 1.1.2.7 ===
         """
 
+    def search_glob(pattern):
+        """Execute a pattern search.
+
+        The pattern represents a set of words by using * and ?.  For
+        example, "foo*" represents the set of all words in the lexicon
+        starting with "foo".
+
+        NOTE: Currently only a single trailing * is supported.
+
+        Return an IIBucket.
+        """
+
     def query_weight(terms):
         """Return the weight for a set of query terms.
 


=== Products/ZCTextIndex/ILexicon.py 1.1.2.5 => 1.1.2.6 ===
         """
 
+    def globToWordIds(pattern):
+        """Return a sequence of ids of words matching the pattern.
+
+        The argument should be a single word using globbing syntax,
+        e.g. 'foo*' meaning anything starting with 'foo'.
+
+        NOTE: Currently only a single trailing * is supported.
+
+        Returns the wids for all words in the lexicon that match the
+        pattern.
+        """
+
     def length():
         """Return the number of unique term in the lexicon."""


=== Products/ZCTextIndex/IQueryParser.py 1.1.2.5 => 1.1.2.6 ===
         """Return the node type.
 
-        This is one of 'AND', 'OR', 'NOT', 'ATOM' or 'PHRASE'.
+        This is one of 'AND', 'OR', 'NOT', 'ATOM', 'PHRASE' or 'GLOB'.
         """
 
     def getValue():
@@ -45,6 +45,7 @@
         'NOT'             a parse tree
         'ATOM'            a string (representing a single search term)
         'PHRASE'          a string (representing a search phrase)
+        'GLOB'            a string (representing a pattern, e.g. "foo*")
         """
 
     def terms():


=== Products/ZCTextIndex/Index.py 1.1.2.39 => 1.1.2.40 ===
     def search(self, term):
         wids = self._lexicon.termToWordIds(term)
-        return self.search_wids(wids)
+        return self._union(self._search_wids(wids))
 
-    def search_wids(self, wids):
+    def search_glob(self, pattern):
+        wids = self._lexicon.globToWordIds(pattern)
+        return self._union(self._search_wids(wids))
+
+    def search_phrase(self, phrase):
+        wids = self._lexicon.termToWordIds(phrase)
+        hits = self._intersection(self._search_wids(wids))
+        if not hits:
+            return hits
+        code = WidCode.encode(wids)
+        result = IIBTree()
+        for docid, weight in hits.items():
+            docwords = self._docwords[docid]
+            if docwords.find(code) >= 0:
+                result[docid] = weight
+        return result
+
+    def _search_wids(self, wids):
         if not wids:
-            return IIBucket()
+            return []
         N = float(len(self._docweight))
         L = []
         DictType = type({})
@@ -122,23 +139,20 @@
                 d2w = IIBucket(d2w)
             L.append((d2w, scaled_int(idf)))
         L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
+        return L
+
+    def _intersection(self, L):
         d2w, weight = L[0]
         dummy, result = weightedUnion(IIBTree(), d2w, 1, weight)
         for d2w, weight in L[1:]:
             dummy, result = weightedIntersection(result, d2w, 1, weight)
         return result
 
-    def search_phrase(self, phrase):
-        wids = self._lexicon.termToWordIds(phrase)
-        hits = self.search_wids(wids)
-        if not hits:
-            return hits
-        code = WidCode.encode(wids)
+    def _union(self, L):
+        # XXX This can be optimized, see ParseTree.OrNode.executeQuery()
         result = IIBTree()
-        for docid, weight in hits.items():
-            docwords = self._docwords[docid]
-            if docwords.find(code) >= 0:
-                result[docid] = weight
+        for d2w, weight in L:
+            dummy, result = weightedUnion(result, d2w, 1, weight)
         return result
 
     def query_weight(self, terms):


=== Products/ZCTextIndex/Lexicon.py 1.1.2.9 => 1.1.2.10 ===
 ##############################################################################
 
+import re
+
 from BTrees.IOBTree import IOBTree
 from BTrees.OIBTree import OIBTree
 from Products.ZCTextIndex.ILexicon import ILexicon
@@ -59,6 +61,23 @@
             wid = self.__wids.get(word)
             if wid is not None:
                 wids.append(wid)
+        return wids
+
+    def globToWordIds(self, pattern):
+        if not re.match("^\w+\*$", pattern):
+            return []
+        pattern = pattern.lower()
+        assert pattern.endswith("*")
+        prefix = pattern[:-1]
+        assert prefix and not prefix.endswith("*")
+        keys = self.__wids.keys(prefix) # Keys starting at prefix
+        wids = []
+        words = []
+        for key in keys:
+            if not key.startswith(prefix):
+                break
+            wids.append(self.__wids[key])
+            words.append(key)
         return wids
 
     def _getWordIdCreate(self, word):


=== Products/ZCTextIndex/ParseTree.py 1.1.2.3 => 1.1.2.4 ===
     def executeQuery(self, index):
         return index.search_phrase(self.getValue())
+
+class GlobNode(AtomNode):
+
+    _nodeType = "GLOB"
+
+    def executeQuery(self, index):
+        return index.search_glob(self.getValue())


=== Products/ZCTextIndex/QueryParser.py 1.1.2.18 => 1.1.2.19 ===
   can contain whitespace, parentheses and key words.
 
-In addtion, an ATOM may optionally be preceded by a hyphen, meaning that it
-must not be present.
+In addition, an ATOM may optionally be preceded by a hyphen, meaning
+that it must not be present.
+
+An unquoted ATOM may also end in a star.  This is a primitive
+"globbing" function, meaning to search for any word with a given
+prefix.
 
 When multiple consecutive ATOMs are found at the leaf level, they are
 connected by an implied AND operator, and an unquoted leading hyphen
@@ -46,6 +50,7 @@
 - words connected by punctuation implies phrase search, e.g. ``foo-bar''
 - a leading hyphen implies NOT, e.g. ``foo -bar''
 - these can be combined, e.g. ``foo -"foo bar"'' or ``foo -foo-bar''
+- a trailing * means globbing (i.e. prefix search), e.g. ``foo*''
 """
 
 import re
@@ -167,13 +172,15 @@
             nodes = []
             nots = []
             for a in atoms:
-                words = re.findall(r"\w+", a)
+                words = re.findall(r"\w+\*?", a)
                 if not words:
                     continue
-                if len(words) == 1:
-                    n = ParseTree.AtomNode(words[0])
-                else:
+                if len(words) > 1:
                     n = ParseTree.PhraseNode(" ".join(words))
+                elif words[0].endswith("*"):
+                    n = ParseTree.GlobNode(words[0])
+                else:
+                    n = ParseTree.AtomNode(words[0])
                 if a[0] == "-":
                     n = ParseTree.NotNode(n)
                     nots.append(n)