[Zope-Checkins] CVS: Zope/lib/python/SearchIndex - GlobbingLexicon.py:1.9.36.1 UnTextIndex.py:1.49.8.1
Shane Hathaway
shane@digicool.com
Thu, 9 Aug 2001 13:34:12 -0400
Update of /cvs-repository/Zope/lib/python/SearchIndex
In directory cvs.zope.org:/tmp/cvs-serv29115/lib/python/SearchIndex
Modified Files:
Tag: NR-branch
GlobbingLexicon.py UnTextIndex.py
Log Message:
Sync NR-branch with trunk. Sorry about so many checkin messages...
=== Zope/lib/python/SearchIndex/GlobbingLexicon.py 1.9 => 1.9.36.1 ===
def createDigrams(self, word):
"""Returns a list with the set of digrams in the word."""
- digrams = []
-
- digrams.append(self.eow + word[0]) # Mark the beginning
-
- for i in range(1,len(word)):
- digrams.append(word[i-1:i+1])
+ digrams = list(word)
+ digrams.append(self.eow)
+ last = self.eow
- digrams[-1] = digrams[-1] + self.eow # Mark the end
+ for i in range(len(digrams)):
+ last, digrams[i] = digrams[i], last + digrams[i]
return digrams
@@ -267,21 +265,28 @@
def query_hook(self, q):
"""expand wildcards"""
- words = []
- for w in q:
- if ( (self.multi_wc in w) or
- (self.single_wc in w) ):
- wids = self.get(w)
+ ListType = type([])
+ i = len(q) - 1
+ while i >= 0:
+ e = q[i]
+ if isinstance(e, ListType):
+ self.query_hook(e)
+ elif ( (self.multi_wc in e) or
+ (self.single_wc in e) ):
+ wids = self.get(e)
+ words = []
for wid in wids:
if words:
words.append(Or)
words.append(wid)
- else:
- words.append(w)
+ if not words:
+ # if words is empty, return something that will make
+ # textindex's __getitem__ return an empty result list
+ words.append('')
+ q[i] = words
+ i = i - 1
- # if words is empty, return something that will make textindex's
- # __getitem__ return an empty result list
- return words or ['']
+ return q
def Splitter(self, astring, words=None):
""" wrap the splitter """
@@ -298,19 +303,16 @@
There is no way to quote meta-characters.
"""
+ # Remove characters that are meaningful in a regex
transTable = string.maketrans("", "")
+ result = string.translate(pat, transTable,
+ r'()&|!@#$%^{}\<>.')
- # First, deal with mutli-character globbing
- result = string.replace(pat, '*', '.*')
+ # First, deal with multi-character globbing
+ result = string.replace(result, '*', '.*')
# Next, we need to deal with single-character globbing
- result = string.replace(result, '?', '.?')
-
- # Now, we need to remove all of the characters that
- # are forbidden.
- result = string.translate(result, transTable,
- r'()&|!@#$%^{}\<>')
+ result = string.replace(result, '?', '.')
return "%s$" % result
-
=== Zope/lib/python/SearchIndex/UnTextIndex.py 1.49 => 1.49.8.1 ===
Integers are assumed to be resolved word ids. """
- if type(word) is IntType:
+ if isinstance(word, IntType):
# We have a word ID
result = self._index.get(word, {})
return ResultList(result, (word,), self)
@@ -440,7 +440,7 @@
if len(splitSource) == 1:
splitSource = splitSource[0]
- if splitSource[:1] == '"' and splitSource[-1:] == '"':
+ if splitSource[:1] == splitSource[-1:] == '"':
return self[splitSource]
wids=self.getLexicon(self._lexicon).get(splitSource)
@@ -551,28 +551,37 @@
- def query(self, s, default_operator=Or, ws=(string.whitespace,)):
- """ This is called by TextIndexes. A 'query term' which is a
- string 's' is passed in, along with an index object. s is
- parsed, then the wildcards are parsed, then something is
- parsed again, then the whole thing is 'evaluated'. """
+ def query(self, s, default_operator=Or):
+ """ Evaluate a query string.
+
+ Convert the query string into a data structure of nested lists
+ and strings, based on the grouping of whitespace-separated
+ strings by parentheses and quotes. The 'Near' operator is
+ inserted between the strings of a quoted group.
+
+ The Lexicon is given the opportunity to transform the
+ data structure. Stemming, wildcards, and translation are
+ possible Lexicon services.
+
+ Finally, the query list is normalized so that it and every
+ sub-list consist of non-operator strings or lists separated
+ by operators. This list is evaluated.
+ """
# First replace any occurrences of " and not " with " andnot "
- s = re.sub(
- '[%s]+[aA][nN][dD][%s]*[nN][oO][tT][%s]+' % (ws * 3),
- ' andnot ', s)
+ s = re.sub('(?i)\s+and\s*not\s+', ' andnot ', s)
- # do some parsing
+ # Parse parentheses and quotes
q = parse(s)
- ## here, we give lexicons a chance to transform the query.
- ## For example, substitute wildcards, or translate words into
- ## various languages.
+ # Allow the Lexicon to process the query
q = self.getLexicon(self._lexicon).query_hook(q)
- # do some more parsing
+
+ # Insert the default operator between any two search terms not
+ # already joined by an operator.
q = parse2(q, default_operator)
- ## evalute the final 'expression'
+ # evaluate the final 'expression'
return self.evaluate(q)
@@ -605,19 +614,17 @@
def evaluate(self, query):
"""Evaluate a parsed query"""
- # There are two options if the query passed in is only one
- # item. It means either it's an embedded query, in which case
- # we'll recursively evaluate, other wise it's nothing for us
- # to evaluate, and we just get the results and return them.
- if (len(query) == 1):
- if (type(query[0]) is ListType):
- return self.evaluate(query[0])
+ # Strip off meaningless layers
+ while isinstance(query, ListType) and len(query) == 1:
+ query = query[0]
+
+ # If it's not a list, assume a string or number
+ if not isinstance(query, ListType):
+ return self[query]
- return self[query[0]] # __getitem__
-
- # Now we need to loop through the query and expand out
+ # Now we need to loop through the query and reduce
# operators. They are currently evaluated in the following
- # order: AndNote -> And -> Or -> Near
+ # order: AndNot -> And -> Or -> Near
i = 0
while (i < len(query)):
if query[i] is AndNot:
@@ -660,98 +667,91 @@
l = []
tmp = string.lower(s)
- while (1):
- p = parens(tmp)
+ p = parens(tmp)
+ while p is not None:
+ # Look for quotes in the section of the string before
+ # the parentheses, then parse the string inside the parens
+ l = l + quotes(p[0])
+ l.append(parse(p[1]))
- if (p is None):
- # No parentheses found. Look for quotes then exit.
- l = l + quotes(tmp)
- break
- else:
- # Look for quotes in the section of the string before
- # the parentheses, then parse the string inside the parens
- l = l + quotes(tmp[:(p[0] - 1)])
- l.append(parse(tmp[p[0] : p[1]]))
-
- # continue looking through the rest of the string
- tmp = tmp[(p[1] + 1):]
+ # continue looking through the rest of the string
+ tmp = p[2]
+ p = parens(tmp)
- return l
+ return l + quotes(tmp)
def parse2(q, default_operator,
operator_dict={AndNot: AndNot, And: And, Or: Or, Near: Near}):
"""Find operators and operands"""
- i = 0
isop = operator_dict.has_key
- while (i < len(q)):
- if (type(q[i]) is ListType): q[i] = parse2(q[i], default_operator)
-
- # every other item, starting with the first, should be an operand
- if ((i % 2) != 0):
- # This word should be an operator; if it is not, splice in
- # the default operator.
-
- if type(q[i]) is not ListType and isop(q[i]):
- q[i] = operator_dict[q[i]]
- else: q[i : i] = [ default_operator ]
-
- i = i + 1
+ i = len(q) - 1
+ while i >= 0:
+ e = q[i]
+ if isinstance(e, ListType):
+ q[i] = parse2(e, default_operator)
+ if i % 2:
+ q.insert(i, default_operator)
+ elif i % 2:
+ # This element should be an operator
+ if isop(e):
+ # Ensure that it is identical, not merely equal.
+ q[i] = operator_dict[e]
+ else:
+ # Insert the default operator.
+ q.insert(i, default_operator)
+ i = i - 1
return q
-def parens(s, parens_re=re.compile('[\(\)]').search):
-
- index = open_index = paren_count = 0
-
- while 1:
-
- mo = parens_re(s, index)
- if mo is None : break
-
+def parens(s, parens_re=re.compile('[()]').search):
+ mo = parens_re(s)
+ if mo is None:
+ return
+
+ open_index = mo.start(0) + 1
+ paren_count = 0
+ while mo is not None:
index = mo.start(0)
if s[index] == '(':
paren_count = paren_count + 1
- if open_index == 0 : open_index = index + 1
else:
paren_count = paren_count - 1
+ if paren_count == 0:
+ return (s[:open_index - 1], s[open_index:index],
+ s[index + 1:])
+ if paren_count < 0:
+ break
+ mo = parens_re(s, index + 1)
+
+ raise QueryError, "Mismatched parentheses"
- if paren_count == 0:
- return open_index, index
- else:
- index = index + 1
- if paren_count == 0: # No parentheses Found
- return None
- else:
- raise QueryError, "Mismatched parentheses"
-
-
-def quotes(s, ws=(string.whitespace,)):
- # split up quoted regions
- splitted = re.split( '[%s]*\"[%s]*' % (ws * 2),s)
- split=string.split
-
- if (len(splitted) > 1):
- if ((len(splitted) % 2) == 0): raise QueryError, "Mismatched quotes"
+def quotes(s):
+ split=string.split
+ if '"' not in s:
+ return split(s)
- for i in range(1,len(splitted),2):
- # split the quoted region into words
- splitted[i] = filter(None, split(splitted[i]))
-
- # put the Proxmity operator in between quoted words
- for j in range(1, len(splitted[i])):
- splitted[i][j : j] = [ Near ]
-
- for i in range(len(splitted)-1,-1,-2):
- # split the non-quoted region into words
- splitted[i:i+1] = filter(None, split(splitted[i]))
-
- splitted = filter(None, splitted)
- else:
- # No quotes, so just split the string into words
- splitted = filter(None, split(s))
+ # split up quoted regions
+ splitted = re.split('\s*\"\s*', s)
- return splitted
+ if (len(splitted) % 2) == 0: raise QueryError, "Mismatched quotes"
+
+ for i in range(1,len(splitted),2):
+ # split the quoted region into words
+ words = splitted[i] = split(splitted[i])
+
+ # put the Proximity operator in between quoted words
+ j = len(words) - 1
+ while j > 0:
+ words.insert(j, Near)
+ j = j - 1
+
+ i = len(splitted) - 1
+ while i >= 0:
+ # split the non-quoted region into words
+ splitted[i:i+1] = split(splitted[i])
+ i = i - 2
+ return filter(None, splitted)