[Zope-Checkins] CVS: Zope/lib/python/SearchIndex - GlobbingLexicon.py:1.9.36.1 UnTextIndex.py:1.49.8.1

Thu, 9 Aug 2001 13:34:12 -0400

Update of /cvs-repository/Zope/lib/python/SearchIndex
In directory cvs.zope.org:/tmp/cvs-serv29115/lib/python/SearchIndex

Modified Files:
      Tag: NR-branch
	GlobbingLexicon.py UnTextIndex.py 
Log Message:
Sync NR-branch with trunk.  Sorry about so many checkin messages...


=== Zope/lib/python/SearchIndex/GlobbingLexicon.py 1.9 => 1.9.36.1 ===
     def createDigrams(self, word):
         """Returns a list with the set of digrams in the word."""
-        digrams = []
-
-        digrams.append(self.eow + word[0])    # Mark the beginning
-
-        for i in range(1,len(word)):
-            digrams.append(word[i-1:i+1])
+        digrams = list(word)
+        digrams.append(self.eow)
+        last = self.eow
 
-        digrams[-1] = digrams[-1] + self.eow  # Mark the end
+        for i in range(len(digrams)):
+            last, digrams[i] = digrams[i], last + digrams[i]
 
         return digrams
 
@@ -267,21 +265,28 @@
 
     def query_hook(self, q):
         """expand wildcards"""
-        words = []
-        for w in q:
-            if ( (self.multi_wc in w) or
-                 (self.single_wc in w) ):
-                wids = self.get(w)
+        ListType = type([])
+        i = len(q) - 1
+        while i >= 0:
+            e = q[i]
+            if isinstance(e, ListType):
+                self.query_hook(e)
+            elif ( (self.multi_wc in e) or
+                   (self.single_wc in e) ):
+                wids = self.get(e)
+                words = []
                 for wid in wids:
                     if words:
                         words.append(Or)
                     words.append(wid)
-            else:
-                words.append(w)
+                if not words:
+                    # if words is empty, return something that will make
+                    # textindex's __getitem__ return an empty result list
+                    words.append('')
+                q[i] = words
+            i = i - 1
 
-        # if words is empty, return something that will make textindex's
-        # __getitem__ return an empty result list
-        return words or ['']
+        return q
 
     def Splitter(self, astring, words=None):
         """ wrap the splitter """
@@ -298,19 +303,16 @@
         There is no way to quote meta-characters.
         """
 
+        # Remove characters that are meaningful in a regex
         transTable = string.maketrans("", "")
+        result = string.translate(pat, transTable,
+                                  r'()&|!@#$%^{}\<>.')
         
-        # First, deal with mutli-character globbing
-        result = string.replace(pat, '*', '.*')
+        # First, deal with multi-character globbing
+        result = string.replace(result, '*', '.*')
 
         # Next, we need to deal with single-character globbing
-        result = string.replace(result, '?', '.?')
-
-        # Now, we need to remove all of the characters that
-        # are forbidden.
-        result = string.translate(result, transTable,
-                                  r'()&|!@#$%^{}\<>')
+        result = string.replace(result, '?', '.')
 
         return "%s$" % result 
-
 


=== Zope/lib/python/SearchIndex/UnTextIndex.py 1.49 => 1.49.8.1 ===
         Integers are assumed to be resolved word ids. """
         
-        if type(word) is IntType:
+        if isinstance(word, IntType):
             # We have a word ID
             result = self._index.get(word, {})
             return ResultList(result, (word,), self)
@@ -440,7 +440,7 @@
         
             if len(splitSource) == 1:
                 splitSource = splitSource[0]
-                if splitSource[:1] == '"' and splitSource[-1:] == '"':
+                if splitSource[:1] == splitSource[-1:] == '"':
                     return self[splitSource]
 
                 wids=self.getLexicon(self._lexicon).get(splitSource)
@@ -551,28 +551,37 @@
 
 
 
-    def query(self, s, default_operator=Or, ws=(string.whitespace,)):
-        """ This is called by TextIndexes.  A 'query term' which is a
-        string 's' is passed in, along with an index object.  s is
-        parsed, then the wildcards are parsed, then something is
-        parsed again, then the whole thing is 'evaluated'. """
+    def query(self, s, default_operator=Or):
+        """ Evaluate a query string.
+        
+        Convert the query string into a data structure of nested lists
+        and strings, based on the grouping of whitespace-separated
+        strings by parentheses and quotes.  The 'Near' operator is
+        inserted between the strings of a quoted group.
+
+        The Lexicon is given the opportunity to transform the
+        data structure.  Stemming, wildcards, and translation are
+        possible Lexicon services.
+
+        Finally, the query list is normalized so that it and every
+        sub-list consist of non-operator strings or lists separated
+        by operators. This list is evaluated.
+        """
 
         # First replace any occurences of " and not " with " andnot "
-        s = re.sub(
-            '[%s]+[aA][nN][dD][%s]*[nN][oO][tT][%s]+' % (ws * 3),
-            ' andnot ', s)
+        s = re.sub('(?i)\s+and\s*not\s+', ' andnot ', s)
 
-        # do some parsing
+        # Parse parentheses and quotes
         q = parse(s)
 
-        ## here, we give lexicons a chance to transform the query.
-        ## For example, substitute wildcards, or translate words into
-        ## various languages.
+        # Allow the Lexicon to process the query
         q = self.getLexicon(self._lexicon).query_hook(q)
-        # do some more parsing
+
+        # Insert the default operator between any two search terms not
+        # already joined by an operator.
         q = parse2(q, default_operator)
 
-        ## evalute the final 'expression'
+        # evaluate the final 'expression'
         return self.evaluate(q)
 
 
@@ -605,19 +614,17 @@
 
     def evaluate(self, query):
         """Evaluate a parsed query"""
-        # There are two options if the query passed in is only one
-        # item. It means either it's an embedded query, in which case
-        # we'll recursively evaluate, other wise it's nothing for us
-        # to evaluate, and we just get the results and return them.
-        if (len(query) == 1):
-            if (type(query[0]) is ListType):
-                return self.evaluate(query[0])
+        # Strip off meaningless layers
+        while isinstance(query, ListType) and len(query) == 1:
+            query = query[0]
+
+        # If it's not a list, assume a string or number
+        if not isinstance(query, ListType):
+            return self[query]
 
-            return self[query[0]]       # __getitem__
-
-        # Now we need to loop through the query and expand out
+        # Now we need to loop through the query and reduce
         # operators.  They are currently evaluated in the following
-        # order: AndNote -> And -> Or -> Near
+        # order: AndNot -> And -> Or -> Near
         i = 0
         while (i < len(query)):
             if query[i] is AndNot:
@@ -660,98 +667,91 @@
     l = []
     tmp = string.lower(s)
 
-    while (1):
-        p = parens(tmp)
+    p = parens(tmp)
+    while p is not None:
+        # Look for quotes in the section of the string before
+        # the parentheses, then parse the string inside the parens
+        l = l + quotes(p[0])
+        l.append(parse(p[1]))
 
-        if (p is None):
-            # No parentheses found.  Look for quotes then exit.
-            l = l + quotes(tmp)
-            break
-        else:
-            # Look for quotes in the section of the string before
-            # the parentheses, then parse the string inside the parens
-            l = l + quotes(tmp[:(p[0] - 1)])
-            l.append(parse(tmp[p[0] : p[1]]))
-
-            # continue looking through the rest of the string
-            tmp = tmp[(p[1] + 1):]
+        # continue looking through the rest of the string
+        tmp = p[2]
+        p = parens(tmp)
 
-    return l
+    return l + quotes(tmp)
 
 def parse2(q, default_operator,
            operator_dict={AndNot: AndNot, And: And, Or: Or, Near: Near}):
     """Find operators and operands"""
-    i = 0
     isop = operator_dict.has_key
-    while (i < len(q)):
-        if (type(q[i]) is ListType): q[i] = parse2(q[i], default_operator)
-
-        # every other item, starting with the first, should be an operand
-        if ((i % 2) != 0):
-            # This word should be an operator; if it is not, splice in
-            # the default operator.
-            
-            if type(q[i]) is not ListType and isop(q[i]):
-                q[i] = operator_dict[q[i]]
-            else: q[i : i] = [ default_operator ]
-
-        i = i + 1
+    i = len(q) - 1
+    while i >= 0:
+        e = q[i]
+        if isinstance(e, ListType):
+            q[i] = parse2(e, default_operator)
+            if i % 2:
+                q.insert(i, default_operator)
+        elif i % 2:
+            # This element should be an operator
+            if isop(e):
+                # Ensure that it is identical, not merely equal.
+                q[i] = operator_dict[e]
+            else:
+                # Insert the default operator.
+                q.insert(i, default_operator)
+        i = i - 1
 
     return q
 
 
-def parens(s, parens_re=re.compile('[\(\)]').search):
-
-    index = open_index = paren_count = 0
-
-    while 1:
-
-        mo = parens_re(s, index)
-        if mo is None : break
-
+def parens(s, parens_re=re.compile('[()]').search):
+    mo = parens_re(s)
+    if mo is None:
+        return
+    
+    open_index = mo.start(0) + 1
+    paren_count = 0
+    while mo is not None:
         index = mo.start(0)
     
         if s[index] == '(':
             paren_count = paren_count + 1
-            if open_index == 0 : open_index = index + 1
         else:
             paren_count = paren_count - 1
+            if paren_count == 0:
+                return (s[:open_index - 1], s[open_index:index],
+                        s[index + 1:])
+            if paren_count < 0:
+                break
+        mo = parens_re(s, index + 1)
+
+    raise QueryError, "Mismatched parentheses"      
 
-        if paren_count == 0:
-            return open_index, index
-        else:
-            index = index + 1
 
-    if paren_count == 0: # No parentheses Found
-        return None
-    else:
-        raise QueryError, "Mismatched parentheses"      
-
-
-def quotes(s, ws=(string.whitespace,)):
-     # split up quoted regions
-     splitted = re.split( '[%s]*\"[%s]*' % (ws * 2),s)
-     split=string.split
-
-     if (len(splitted) > 1):
-         if ((len(splitted) % 2) == 0): raise QueryError, "Mismatched quotes"
+def quotes(s):
+    split=string.split
+    if '"' not in s:
+        return split(s)
     
-         for i in range(1,len(splitted),2):
-             # split the quoted region into words
-             splitted[i] = filter(None, split(splitted[i]))
-
-             # put the Proxmity operator in between quoted words
-             for j in range(1, len(splitted[i])):
-                 splitted[i][j : j] = [ Near ]
-
-         for i in range(len(splitted)-1,-1,-2):
-             # split the non-quoted region into words
-             splitted[i:i+1] = filter(None, split(splitted[i]))
-
-         splitted = filter(None, splitted)
-     else:
-         # No quotes, so just split the string into words
-         splitted = filter(None, split(s))
+    # split up quoted regions
+    splitted = re.split('\s*\"\s*', s)
 
-     return splitted
+    if (len(splitted) % 2) == 0: raise QueryError, "Mismatched quotes"
+    
+    for i in range(1,len(splitted),2):
+        # split the quoted region into words
+        words = splitted[i] = split(splitted[i])
+        
+        # put the Proximity operator in between quoted words
+        j = len(words) - 1
+        while j > 0:
+            words.insert(j, Near)
+            j = j - 1
+
+    i = len(splitted) - 1
+    while i >= 0:
+        # split the non-quoted region into words
+        splitted[i:i+1] = split(splitted[i])
+        i = i - 2
 
+    return filter(None, splitted)