[Zope-CVS] CVS: Products/ZCTextIndex - WidCode.py:1.1.2.1 IQueryParser.py:1.1.2.5 Index.py:1.1.2.38 ParseTree.py:1.1.2.3 QueryParser.py:1.1.2.14
Guido van Rossum
guido@python.org
Fri, 10 May 2002 21:00:36 -0400
Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv16638
Modified Files:
Tag: TextIndexDS9-branch
IQueryParser.py Index.py ParseTree.py QueryParser.py
Added Files:
Tag: TextIndexDS9-branch
WidCode.py
Log Message:
Add primitive phrase search.
This uses a "compression" scheme for lists of ints that I dubbed
"WidCode", and which uses an encoding somewhat similar to UTF-8. It
only saves about 20 percent in pickle size over the (binary) pickle of
the list, but has the special property that you can use the string's
find() method to verify if a phrase occurs in a document. Because
docwords now records *all* wids, in document order, rather than only a
list of unique wids, the WidCode-encoded list is probably *longer*
than what was stored in docwords before, but i hope it's not that much
longer. The performance of WidCode could be improved by doing a
pre-scan over part of the corpus and assigning wids by frequency of
occurrence (the most frequent word gets wid 1, and so on).
Also still to do: change the query parser to recognize "words in
quotes" for phrase search. It currently takes any sequence of words
without operators as a phrase search, i.e. the default operator is now
phrase search; but that's probably not what you want! I did add a new
parse tree node, PhraseNode, which encodes a phrase search; there's
also a new Index method search_phrase().
=== Added File Products/ZCTextIndex/WidCode.py ===
# An efficient(?) encoding for lists of positive ints with many small ones
import re
def encode(wids):
# Encode a list of wids as a string
get = _encoding.get
return "".join([get(w) or _encode(w) for w in wids])
_encoding = {} # Filled later
def _encode(w):
assert 0x1000 <= w < 0x1000000
b, c = divmod(w, 0x80)
a, b = divmod(b, 0x80)
s = chr(b) + chr(c)
if a < 0x10:
return chr(a + 0xE0) + s
a, b = divmod(a, 0x80)
assert a < 0x4, (w, a, b, s)
return (chr(a + 0xF0) + chr(b)) + s
_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")
def decode(code):
# Decode a string into a list of wids
get = _decoding.get
_prog.findall(code)
return [get(p) or _decode(p) for p in _prog.findall(code)]
_decoding = {} # Filled later
def _decode(s):
if s == '\x80':
return 0
if len(s) == 3:
a = ord(s[0])
b = ord(s[1])
c = ord(s[2])
assert a&0xF0 == 0xE0 and not b&0x80 and not c&0x80
return ((a&0xF) << 14) | (b<<7) | c
assert len(s) == 4, `s`
a = ord(s[0])
b = ord(s[1])
c = ord(s[2])
d = ord(s[3])
assert a&0xF8 == 0xF0 and not b&0x80 and not c&0x80 and not d&0x80
return ((a&0x7) << 21) | (b<<14) | (c<<7) | d
def _fill():
for i in range(0x40):
s = chr(i + 0x80)
_encoding[i] = s
_decoding[s] = i
for i in range(0x40, 0x1000):
hi, lo = divmod(i, 0x80)
s = chr(hi + 0xC0) + chr(lo)
_encoding[i] = s
_decoding[s] = i
_fill()
def test():
for i in range(2**20):
if i % 1000 == 0: print i
wids = [i]
code = encode(wids)
assert decode(code) == wids, (wids, code, decode(code))
if __name__ == "__main__":
test()
=== Products/ZCTextIndex/IQueryParser.py 1.1.2.4 => 1.1.2.5 ===
"""Return the node type.
- This is one of 'AND', 'OR', 'NOT', or 'ATOM'.
+ This is one of 'AND', 'OR', 'NOT', 'ATOM' or 'PHRASE'.
"""
def getValue():
@@ -44,6 +44,7 @@
'OR' a list of parse trees
'NOT' a parse tree
'ATOM' a string (representing a single search term)
+ 'PHRASE' a string (representing a search phrase)
"""
def terms():
=== Products/ZCTextIndex/Index.py 1.1.2.37 => 1.1.2.38 ===
from BTrees.IOBTree import IOBTree
-from BTrees.IIBTree import IIBTree, IIBucket, IISet, weightedUnion
+from BTrees.IIBTree import IIBTree, IIBucket, IISet
+from BTrees.IIBTree import weightedIntersection, weightedUnion
from Products.ZCTextIndex.IIndex import IIndex
+from Products.ZCTextIndex import WidCode
# Instead of storing floats, we generally store scaled ints. Binary pickles
# can store those more efficiently. The default SCALE_FACTOR of 1024
@@ -94,7 +96,7 @@
for i in range(len(uniqwids)):
self._add_wordinfo(uniqwids[i], freqs[i], docid)
self._docweight[docid] = docweight
- self._add_undoinfo(docid, uniqwids)
+ self._add_undoinfo(docid, wids)
def unindex_doc(self, docid):
for wid in self._get_undoinfo(docid):
@@ -104,6 +106,9 @@
def search(self, term):
wids = self._lexicon.termToWordIds(term)
+ return self.search_wids(wids)
+
+ def search_wids(self, wids):
if not wids:
return IIBucket()
N = float(len(self._docweight))
@@ -117,9 +122,23 @@
d2w = IIBucket(d2w)
L.append((d2w, scaled_int(idf)))
L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
+ d2w, weight = L[0]
+ dummy, result = weightedUnion(IIBTree(), d2w, 1, weight)
+ for d2w, weight in L[1:]:
+ dummy, result = weightedIntersection(result, d2w, 1, weight)
+ return result
+
+ def search_phrase(self, phrase):
+ wids = self._lexicon.termToWordIds(phrase)
+ hits = self.search_wids(wids)
+ if not hits:
+ return hits
+ code = WidCode.encode(wids)
result = IIBTree()
- for d2w, weight in L:
- dummy, result = weightedUnion(result, d2w, 1, weight)
+ for docid, weight in hits.items():
+ docwords = self._docwords[docid]
+ if docwords.find(code) >= 0:
+ result[docid] = weight
return result
def query_weight(self, terms):
@@ -196,8 +215,11 @@
self._wordinfo[wid] = map # Not redundant, because of Persistency!
def _del_wordinfo(self, wid, docid):
- map = self._wordinfo[wid]
- del map[docid]
+ try:
+ map = self._wordinfo[wid]
+ del map[docid]
+ except KeyError:
+ return
if len(map) == 0:
del self._wordinfo[wid]
return
@@ -209,10 +231,10 @@
self._wordinfo[wid] = map # Not redundant, because of Persistency!
def _add_undoinfo(self, docid, wids):
- self._docwords[docid] = wids
+ self._docwords[docid] = WidCode.encode(wids)
def _get_undoinfo(self, docid):
- return self._docwords[docid]
+ return WidCode.decode(self._docwords[docid])
# The rest are helper methods to support unit tests
=== Products/ZCTextIndex/ParseTree.py 1.1.2.2 => 1.1.2.3 ===
def executeQuery(self, index):
- return index.search(self.getValue())
+ return index.search(self.getValue())
+
+class PhraseNode(AtomNode):
+
+ _nodeType = "PHRASE"
+
+ def executeQuery(self, index):
+ return index.search_phrase(self.getValue())
=== Products/ZCTextIndex/QueryParser.py 1.1.2.13 => 1.1.2.14 ===
tree = ParseTree.AtomNode(atoms[0])
else:
- tree = ParseTree.OrNode([ParseTree.AtomNode(t) for t in atoms])
+ tree = ParseTree.PhraseNode(" ".join(atoms))
return tree