[Zope-CVS] CVS: Products/ZCTextIndex - ILexicon.py:1.3 IPipelineElement.py:1.3 IQueryParser.py:1.3 Lexicon.py:1.10 ParseTree.py:1.5 QueryParser.py:1.4
Guido van Rossum
guido@python.org
Mon, 20 May 2002 09:55:39 -0400
Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv7771
Modified Files:
ILexicon.py IPipelineElement.py IQueryParser.py Lexicon.py
ParseTree.py QueryParser.py
Log Message:
Refactor the query parser to rely on the lexicon for parsing terms.
ILexicon.py:
- Added parseTerms() and isGlob().
- Added get_word(), get_wid() (get_word() is old; get_wid() for symmetry).
- Reflowed some text.
IQueryParser.py:
- Expanded docs for parseQuery().
- Added getIgnored() and parseQueryEx().
IPipelineElement.py:
- Added processGlob().
Lexicon.py:
- Added parseTerms() and isGlob().
- Added get_wid().
- Some pipeline elements now support processGlob().
ParseTree.py:
- Clarified the error message for calling executeQuery() on a
NotNode.
QueryParser.py (lots of changes):
- Change private names __tokens etc. into protected _tokens etc.
- Add getIgnored() and parseQueryEx() methods.
- The atom parser now uses the lexicon's parseTerms() and isGlob()
methods.
- Query parts that consist only of stopwords (as determined by the
lexicon), or of stopwords and negated terms, yield None instead of
a parse tree node; the ignored term is added to self._ignored.
None is ignored when combining terms for AND/OR/NOT operators, and
when an operator has no non-None operands, the operator itself
returns None. When this None percolates all the way to the top,
the parser raises a ParseError exception.
tests/testQueryParser.py:
- Changed test expressions of the form "a AND b AND c" to "aa AND bb
AND cc" so that the terms won't be considered stopwords.
- The test for "and/" can only work for the base class.
tests/testZCTextIndex.py:
- Added copyright notice.
- Refactor testStopWords() to have two helpers, one for success, one
for failures.
- Change testStopWords() to require parser failure for those queries
that have only stopwords or stopwords plus negated terms.
- Improve compareSet() to sort the sets of keys, and use a more
direct way of extracting the keys. This wasn't strictly needed
(nothing fails without this), but the old approach of copying the
keys into a dict in a loop depends on the dict hashing to always
return keys in the same order.
=== Products/ZCTextIndex/ILexicon.py 1.2 => 1.3 ===
The input text may be either a string or a list of strings.
- Parses the text as if they are search terms, and skips words that
- aren't in the lexicon.
+ Parse the text as if they are search terms, and skips words
+ that aren't in the lexicon.
"""
def sourceToWordIds(text):
@@ -31,8 +31,9 @@
The input text may be either a string or a list of strings.
- Parses the text as if they come from a source document, and creates
- new word ids for words that aren't (yet) in the lexicon.
+ Parse the text as if they come from a source document, and
+ creates new word ids for words that aren't (yet) in the
+ lexicon.
"""
def globToWordIds(pattern):
@@ -43,9 +44,34 @@
NOTE: Currently only a single trailing * is supported.
- Returns the wids for all words in the lexicon that match the
+ Return the wids for all words in the lexicon that match the
pattern.
"""
def length():
"""Return the number of unique term in the lexicon."""
+
+ def get_word(wid):
+ """Return the word for the given word id.
+
+ Raise KeyError if the word id is not in the lexicon.
+ """
+
+ def get_wid(word):
+ """Return the word id for the given word.
+
+ Return 0 if the word is not in the lexicon.
+ """
+
+ def parseTerms(text):
+ """Pass the text through the pipeline.
+
+ Return a list of words, normalized by the pipeline
+ (e.g. stopwords removed, case normalized etc.).
+ """
+
+ def isGlob(word):
+ """Return true if the word is a globbing pattern.
+
+ The word should be one of the words returned by parseTerms().
+ """
=== Products/ZCTextIndex/IPipelineElement.py 1.2 => 1.3 ===
Process a source sequence of words into a result sequence.
"""
+
+ def processGlob(source):
+ """Process, passing through globbing metacharacters.
+
+ This is an optional method; if it is not used, process() is used.
+ """
=== Products/ZCTextIndex/IQueryParser.py 1.2 => 1.3 ===
Return a parse tree (which implements IQueryParseTree).
+ Some of the query terms may be ignored because they are
+ stopwords; use getIgnored() to find out which terms were
+ ignored. But if the entire query consists only of stop words,
+ or of stopwords and one or more negated terms, an exception is
+ raised.
+
+ May raise ParseTree.ParseError.
+ """
+
+ def getIgnored():
+ """Return the list of ignored terms.
+
+ Return the list of terms that were ignored by the most recent
+ call to parseQuery() because they were stopwords.
+
+ If parseQuery() was never called this returns None.
+ """
+
+ def parseQueryEx(query):
+ """Parse a query string.
+
+ Return a tuple (tree, ignored) where 'tree' is the parse tree
+ as returned by parseQuery(), and 'ignored' is a list of
+ ignored terms as returned by getIgnored().
+
May raise ParseTree.ParseError.
"""
=== Products/ZCTextIndex/Lexicon.py 1.9 => 1.10 ===
return wids
+ def parseTerms(self, text):
+ last = _text2list(text)
+ for element in self._pipeline:
+ process = getattr(element, "processGlob", element.process)
+ last = process(last)
+ return last
+
+ def isGlob(self, word):
+ return "*" in word
+
def get_word(self, wid):
- """Return the word for the given word id"""
return self._words[wid]
+ def get_wid(self, word):
+ return self._wids.get(word, 0)
+
def globToWordIds(self, pattern):
if not re.match("^\w+\*$", pattern):
return []
@@ -116,11 +128,18 @@
import re
rx = re.compile(r"\w+")
+ rxGlob = re.compile(r"\w+\*?")
def process(self, lst):
result = []
for s in lst:
result += self.rx.findall(s)
+ return result
+
+ def processGlob(self, lst):
+ result = []
+ for s in lst:
+ result += self.rxGlob.findall(s)
return result
class CaseNormalizer:
=== Products/ZCTextIndex/ParseTree.py 1.4 => 1.5 ===
def executeQuery(self, index):
- raise QueryError, "NOT operator must occur right after AND"
+ raise QueryError, "NOT parse tree node cannot be executed directly"
class AndNode(ParseTreeNode):
=== Products/ZCTextIndex/QueryParser.py 1.3 => 1.4 ===
# followed by
(?:
- # a string
+ # a string inside double quotes (and not containing these)
" [^"]* "
# or a non-empty stretch w/o whitespace, parens or double quotes
| [^()\s"]+
@@ -92,46 +92,64 @@
class QueryParser:
+ # This class is not thread-safe;
+ # each thread should have its own instance
+
def __init__(self, lexicon):
self._lexicon = lexicon
+ self._ignored = None
+
+ # Public API methods
def parseQuery(self, query):
# Lexical analysis.
tokens = _tokenizer_regex.findall(query)
- self.__tokens = tokens
+ self._tokens = tokens
# classify tokens
- self.__tokentypes = [_keywords.get(token.upper(), _ATOM)
- for token in tokens]
+ self._tokentypes = [_keywords.get(token.upper(), _ATOM)
+ for token in tokens]
# add _EOF
- self.__tokens.append(_EOF)
- self.__tokentypes.append(_EOF)
- self.__index = 0
+ self._tokens.append(_EOF)
+ self._tokentypes.append(_EOF)
+ self._index = 0
# Syntactical analysis.
+ self._ignored = [] # Ignored words in the query, for parseQueryEx
tree = self._parseOrExpr()
self._require(_EOF)
+ if tree is None:
+ raise ParseTree.ParseError(
+ "Query contains only common words: %s" % repr(query))
return tree
+ def getIgnored(self):
+ return self._ignored
+
+ def parseQueryEx(self, query):
+ tree = self.parseQuery(query)
+ ignored = self.getIgnored()
+ return tree, ignored
+
# Recursive descent parser
def _require(self, tokentype):
if not self._check(tokentype):
- t = self.__tokens[self.__index]
+ t = self._tokens[self._index]
msg = "Token %r required, %r found" % (tokentype, t)
raise ParseTree.ParseError, msg
def _check(self, tokentype):
- if self.__tokentypes[self.__index] is tokentype:
- self.__index += 1
+ if self._tokentypes[self._index] is tokentype:
+ self._index += 1
return 1
else:
return 0
def _peek(self, tokentype):
- return self.__tokentypes[self.__index] is tokentype
+ return self._tokentypes[self._index] is tokentype
def _get(self, tokentype):
- t = self.__tokens[self.__index]
+ t = self._tokens[self._index]
self._require(tokentype)
return t
@@ -140,16 +158,31 @@
L.append(self._parseAndExpr())
while self._check(_OR):
L.append(self._parseAndExpr())
- if len(L) == 1:
+ L = filter(None, L)
+ if not L:
+ return None # Only stopwords
+ elif len(L) == 1:
return L[0]
else:
return ParseTree.OrNode(L)
def _parseAndExpr(self):
L = []
- L.append(self._parseTerm())
+ t = self._parseTerm()
+ if t is not None:
+ L.append(t)
+ Nots = []
while self._check(_AND):
- L.append(self._parseNotExpr())
+ t = self._parseNotExpr()
+ if t is None:
+ continue
+ if isinstance(t, ParseTree.NotNode):
+ Nots.append(t)
+ else:
+ L.append(t)
+ if not L:
+ return None # Only stopwords
+ L.extend(Nots)
if len(L) == 1:
return L[0]
else:
@@ -157,7 +190,10 @@
def _parseNotExpr(self):
if self._check(_NOT):
- return ParseTree.NotNode(self._parseTerm())
+ t = self._parseTerm()
+ if t is None:
+ return None # Only stopwords
+ return ParseTree.NotNode(t)
else:
return self._parseTerm()
@@ -172,12 +208,13 @@
nodes = []
nots = []
for a in atoms:
- words = re.findall(r"\w+\*?", a)
+ words = self._lexicon.parseTerms(a)
if not words:
- continue
+ self._ignored.append(a)
+ continue # Only stopwords
if len(words) > 1:
n = ParseTree.PhraseNode(" ".join(words))
- elif words[0].endswith("*"):
+ elif self._lexicon.isGlob(words[0]):
n = ParseTree.GlobNode(words[0])
else:
n = ParseTree.AtomNode(words[0])
@@ -187,9 +224,7 @@
else:
nodes.append(n)
if not nodes:
- text = " ".join(atoms)
- msg = "At least one positive term required: %r" % text
- raise ParseTree.ParseError, msg
+ return None # Only stopwords
nodes.extend(nots)
if len(nodes) == 1:
tree = nodes[0]