[Zope3-checkins] CVS: Zope3/lib/python/Zope/TextIndex - BaseIndex.py:1.1 CosineIndex.py:1.1 HTMLSplitter.py:1.1 IIndex.py:1.1 ILexicon.py:1.1 INBest.py:1.1 IPipelineElement.py:1.1 IPipelineElementFactory.py:1.1 IQueryParseTree.py:1.1 IQueryParser.py:1.1 ISplitter.py:1.1 Lexicon.py:1.1 NBest.py:1.1 OkapiIndex.py:1.1 ParseTree.py:1.1 PipelineFactory.py:1.1 QueryParser.py:1.1 RiceCode.py:1.1 SetOps.py:1.1 StopDict.py:1.1 WidCode.py:1.1 __init__.py:1.1
Guido van Rossum
guido@python.org
Tue, 3 Dec 2002 09:28:18 -0500
Update of /cvs-repository/Zope3/lib/python/Zope/TextIndex
In directory cvs.zope.org:/tmp/cvs-serv25939
Added Files:
BaseIndex.py CosineIndex.py HTMLSplitter.py IIndex.py
ILexicon.py INBest.py IPipelineElement.py
IPipelineElementFactory.py IQueryParseTree.py IQueryParser.py
ISplitter.py Lexicon.py NBest.py OkapiIndex.py ParseTree.py
PipelineFactory.py QueryParser.py RiceCode.py SetOps.py
StopDict.py WidCode.py __init__.py
Log Message:
Baseline for TextIndex, copied from Zope2's ZCTextIndex with changes
necessitated by different locations of BTrees in Zope3, and C
extensions (temporarily) omitted. The test suite passes!
[Rotterdam sprint]
=== Added File Zope3/lib/python/Zope/TextIndex/BaseIndex.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Abstract base class for full text index with relevance ranking."""
import math
from Persistence.BTrees.IOBTree import IOBTree
from Persistence.BTrees.IIBTree import IIBTree, IIBucket, IITreeSet
from Persistence.BTrees.IIBTree import intersection, difference
from Persistence.BTrees import Length
from Zope.TextIndex.IIndex import IIndex
from Zope.TextIndex import WidCode
from Zope.TextIndex.SetOps import mass_weightedIntersection, \
mass_weightedUnion
import ZODB
from Persistence import Persistent
# Instead of storing floats, we generally store scaled ints. Binary pickles
# can store those more efficiently. The default SCALE_FACTOR of 1024
# is large enough to get about 3 decimal digits of fractional info, and
# small enough so that scaled values should almost always fit in a signed
# 16-bit int (we're generally storing logs, so a few bits before the radix
# point goes a long way; on the flip side, for reasonably small numbers x
# most of the info in log(x) is in the fractional bits, so we do want to
# save a lot of those).
SCALE_FACTOR = 1024.0
def scaled_int(f, scale=SCALE_FACTOR):
# We expect only positive inputs, so "add a half and chop" is the
# same as round(). Surprising, calling round() is significantly more
# expensive.
return int(f * scale + 0.5)
def unique(L):
"""Return a list of the unique elements in L."""
return IITreeSet(L).keys()
class BaseIndex(Persistent):
__implements__ = IIndex
def __init__(self, lexicon):
self._lexicon = lexicon
# wid -> {docid -> weight}; t -> D -> w(D, t)
# Different indexers have different notions of term weight, but we
# expect each indexer to use ._wordinfo to map wids to its notion
# of a docid-to-weight map.
# There are two kinds of OOV words: wid 0 is explicitly OOV,
# and it's possible that the lexicon will return a non-zero wid
# for a word we don't currently know about. For example, if we
# unindex the last doc containing a particular word, that wid
# remains in the lexicon, but is no longer in our _wordinfo map;
# lexicons can also be shared across indices, and some other index
# may introduce a lexicon word we've never seen.
# A word is in-vocabulary for this index if and only if
# _wordinfo.has_key(wid). Note that wid 0 must not be a key.
self._wordinfo = IOBTree()
# docid -> weight
# Different indexers have different notions of doc weight, but we
# expect each indexer to use ._docweight to map docids to its
# notion of what a doc weight is.
self._docweight = IIBTree()
# docid -> WidCode'd list of wids
# Used for un-indexing, and for phrase search.
self._docwords = IOBTree()
# Use a BTree length for efficient length computation w/o conflicts
self.length = Length.Length()
def length(self):
"""Return the number of words in the index."""
# This is overridden per instance
return len(self._wordinfo)
def get_words(self, docid):
"""Return a list of the wordids for a given docid."""
# Note this is overridden in the instance
return WidCode.decode(self._docwords[docid])
# A subclass may wish to extend or override this.
def index_doc(self, docid, text):
if self._docwords.has_key(docid):
return self._reindex_doc(docid, text)
wids = self._lexicon.sourceToWordIds(text)
wid2weight, docweight = self._get_frequencies(wids)
self._mass_add_wordinfo(wid2weight, docid)
self._docweight[docid] = docweight
self._docwords[docid] = WidCode.encode(wids)
return len(wids)
# A subclass may wish to extend or override this. This is for adjusting
# to a new version of a doc that already exists. The goal is to be
# faster than simply unindexing the old version in its entirety and then
# adding the new version in its entirety.
def _reindex_doc(self, docid, text):
# Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
old_wids = self.get_words(docid)
old_wid2w, old_docw = self._get_frequencies(old_wids)
new_wids = self._lexicon.sourceToWordIds(text)
new_wid2w, new_docw = self._get_frequencies(new_wids)
old_widset = IITreeSet(old_wid2w.keys())
new_widset = IITreeSet(new_wid2w.keys())
in_both_widset = intersection(old_widset, new_widset)
only_old_widset = difference(old_widset, in_both_widset)
only_new_widset = difference(new_widset, in_both_widset)
del old_widset, new_widset
for wid in only_old_widset.keys():
self._del_wordinfo(wid, docid)
for wid in only_new_widset.keys():
self._add_wordinfo(wid, new_wid2w[wid], docid)
for wid in in_both_widset.keys():
# For the Okapi indexer, the "if" will trigger only for words
# whose counts have changed. For the cosine indexer, the "if"
# may trigger for every wid, since W(d) probably changed and
# W(d) is divided into every score.
newscore = new_wid2w[wid]
if old_wid2w[wid] != newscore:
self._add_wordinfo(wid, newscore, docid)
self._docweight[docid] = new_docw
self._docwords[docid] = WidCode.encode(new_wids)
return len(new_wids)
# Subclass must override.
def _get_frequencies(self, wids):
# Compute term frequencies and a doc weight, whatever those mean
# to an indexer.
# Return pair:
# {wid0: w(d, wid0), wid1: w(d, wid1), ...],
# docweight
# The wid->weight mappings are fed into _add_wordinfo, and docweight
# becomes the value of _docweight[docid].
raise NotImplementedError
def has_doc(self, docid):
return self._docwords.has_key(docid)
# A subclass may wish to extend or override this.
def unindex_doc(self, docid):
for wid in unique(self.get_words(docid)):
self._del_wordinfo(wid, docid)
del self._docwords[docid]
del self._docweight[docid]
def search(self, term):
wids = self._lexicon.termToWordIds(term)
if not wids:
return None # All docs match
wids = self._remove_oov_wids(wids)
return mass_weightedUnion(self._search_wids(wids))
def search_glob(self, pattern):
wids = self._lexicon.globToWordIds(pattern)
wids = self._remove_oov_wids(wids)
return mass_weightedUnion(self._search_wids(wids))
def search_phrase(self, phrase):
wids = self._lexicon.termToWordIds(phrase)
cleaned_wids = self._remove_oov_wids(wids)
if len(wids) != len(cleaned_wids):
# At least one wid was OOV: can't possibly find it.
return IIBTree()
scores = self._search_wids(wids)
hits = mass_weightedIntersection(scores)
if not hits:
return hits
code = WidCode.encode(wids)
result = IIBTree()
for docid, weight in hits.items():
docwords = self._docwords[docid]
if docwords.find(code) >= 0:
result[docid] = weight
return result
def _remove_oov_wids(self, wids):
return filter(self._wordinfo.has_key, wids)
# Subclass must override.
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
# for each wid t in wids. The IIBucket, times the weight, maps D to
# TF(D,t) * IDF(t) for every docid D containing t. wids must not
# contain any OOV words.
def _search_wids(self, wids):
raise NotImplementedError
# Subclass must override.
# It's not clear what it should do. It must return an upper bound on
# document scores for the query. It would be nice if a document score
# divided by the query's query_weight gave the proabability that a
# document was relevant, but nobody knows how to do that. For
# CosineIndex, the ratio is the cosine of the angle between the document
# and query vectors. For OkapiIndex, the ratio is a (probably
# unachievable) upper bound with no "intuitive meaning" beyond that.
def query_weight(self, terms):
raise NotImplementedError
DICT_CUTOFF = 10
def _add_wordinfo(self, wid, f, docid):
# Store a wordinfo in a dict as long as there are less than
# DICT_CUTOFF docids in the dict. Otherwise use an IIBTree.
# The pickle of a dict is smaller than the pickle of an
# IIBTree, substantially so for small mappings. Thus, we use
# a dictionary until the mapping reaches DICT_CUTOFF elements.
# The cutoff is chosen based on the implementation
# characteristics of Python dictionaries. The dict hashtable
# always has 2**N slots and is resized whenever it is 2/3s
# full. A pickled dict with 10 elts is half the size of an
# IIBTree with 10 elts, and 10 happens to be 2/3s of 2**4. So
# choose 10 as the cutoff for now.
# The IIBTree has a smaller in-memory representation than a
# dictionary, so pickle size isn't the only consideration when
# choosing the threshold. The pickle of a 500-elt dict is 92%
# of the size of the same IIBTree, but the dict uses more
# space when it is live in memory. An IIBTree stores two C
# arrays of ints, one for the keys and one for the values. It
# holds up to 120 key-value pairs in a single bucket.
doc2score = self._wordinfo.get(wid)
if doc2score is None:
doc2score = {}
self.length.change(1)
else:
# _add_wordinfo() is called for each update. If the map
# size exceeds the DICT_CUTOFF, convert to an IIBTree.
# Obscure: First check the type. If it's not a dict, it
# can't need conversion, and then we can avoid an expensive
# len(IIBTree).
if (isinstance(doc2score, type({})) and
len(doc2score) == self.DICT_CUTOFF):
doc2score = IIBTree(doc2score)
doc2score[docid] = f
self._wordinfo[wid] = doc2score # not redundant: Persistency!
# self._mass_add_wordinfo(wid2weight, docid)
#
# is the same as
#
# for wid, weight in wid2weight.items():
# self._add_wordinfo(wid, weight, docid)
#
# except that _mass_add_wordinfo doesn't require so many function calls.
def _mass_add_wordinfo(self, wid2weight, docid):
dicttype = type({})
get_doc2score = self._wordinfo.get
new_word_count = 0
for wid, weight in wid2weight.items():
doc2score = get_doc2score(wid)
if doc2score is None:
doc2score = {}
new_word_count += 1
elif (isinstance(doc2score, dicttype) and
len(doc2score) == self.DICT_CUTOFF):
doc2score = IIBTree(doc2score)
doc2score[docid] = weight
self._wordinfo[wid] = doc2score # not redundant: Persistency!
self.length.change(new_word_count)
def _del_wordinfo(self, wid, docid):
doc2score = self._wordinfo[wid]
del doc2score[docid]
numdocs = len(doc2score)
if numdocs == 0:
del self._wordinfo[wid]
self.length.change(-1)
return
if numdocs == self.DICT_CUTOFF:
new = {}
for k, v in doc2score.items():
new[k] = v
doc2score = new
self._wordinfo[wid] = doc2score # not redundant: Persistency!
def inverse_doc_frequency(term_count, num_items):
"""Return the inverse doc frequency for a term,
that appears in term_count items in a collection with num_items
total items.
"""
# implements IDF(q, t) = log(1 + N/f(t))
return math.log(1.0 + float(num_items) / term_count)
=== Added File Zope3/lib/python/Zope/TextIndex/CosineIndex.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Full text index with relevance ranking, using a cosine measure."""
import math
from Persistence.BTrees.IIBTree import IIBucket
from Zope.TextIndex.IIndex import IIndex
from Zope.TextIndex.BaseIndex import BaseIndex, \
inverse_doc_frequency, \
scaled_int, SCALE_FACTOR
class CosineIndex(BaseIndex):
__implements__ = IIndex
def __init__(self, lexicon):
BaseIndex.__init__(self, lexicon)
# ._wordinfo for cosine is wid -> {docid -> weight};
# t -> D -> w(d, t)/W(d)
# ._docweight for cosine is
# docid -> W(docid)
# Most of the computation for computing a relevance score for the
# document occurs in the _search_wids() method. The code currently
# implements the cosine similarity function described in Managing
# Gigabytes, eq. 4.3, p. 187. The index_object() method
# precomputes some values that are independent of the particular
# query.
# The equation is
#
# sum(for t in I(d,q): w(d,t) * w(q,t))
# cosine(d, q) = -------------------------------------
# W(d) * W(q)
#
# where
# I(d, q) = the intersection of the terms in d and q.
#
# w(d, t) = 1 + log f(d, t)
# computed by doc_term_weight(); for a given word t,
# self._wordinfo[t] is a map from d to w(d, t).
#
# w(q, t) = log(1 + N/f(t))
# computed by inverse_doc_frequency()
#
# W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
# computed by _get_frequencies(), and remembered in
# self._docweight[d]
#
# W(q) = sqrt(sum(for t in q: w(q, t) ** 2))
# computed by self.query_weight()
def _search_wids(self, wids):
if not wids:
return []
N = float(len(self._docweight))
L = []
DictType = type({})
for wid in wids:
assert self._wordinfo.has_key(wid) # caller responsible for OOV
d2w = self._wordinfo[wid] # maps docid to w(docid, wid)
idf = inverse_doc_frequency(len(d2w), N) # an unscaled float
#print "idf = %.3f" % idf
if isinstance(d2w, DictType):
d2w = IIBucket(d2w)
L.append((d2w, scaled_int(idf)))
return L
def query_weight(self, terms):
wids = []
for term in terms:
wids += self._lexicon.termToWordIds(term)
N = float(len(self._docweight))
sum = 0.0
for wid in self._remove_oov_wids(wids):
wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
sum += wt ** 2.0
return scaled_int(math.sqrt(sum))
def _get_frequencies(self, wids):
d = {}
dget = d.get
for wid in wids:
d[wid] = dget(wid, 0) + 1
Wsquares = 0.0
for wid, count in d.items():
w = doc_term_weight(count)
Wsquares += w * w
d[wid] = w
W = math.sqrt(Wsquares)
#print "W = %.3f" % W
for wid, weight in d.items():
#print i, ":", "%.3f" % weight,
d[wid] = scaled_int(weight / W)
#print "->", d[wid]
return d, scaled_int(W)
# The rest are helper methods to support unit tests
def _get_wdt(self, d, t):
wid, = self._lexicon.termToWordIds(t)
map = self._wordinfo[wid]
return map.get(d, 0) * self._docweight[d] / SCALE_FACTOR
def _get_Wd(self, d):
return self._docweight[d]
def _get_ft(self, t):
wid, = self._lexicon.termToWordIds(t)
return len(self._wordinfo[wid])
def _get_wt(self, t):
wid, = self._lexicon.termToWordIds(t)
map = self._wordinfo[wid]
return scaled_int(math.log(1 + len(self._docweight) / float(len(map))))
def doc_term_weight(count):
"""Return the doc-term weight for a term that appears count times."""
# implements w(d, t) = 1 + log f(d, t)
return 1.0 + math.log(count)
=== Added File Zope3/lib/python/Zope/TextIndex/HTMLSplitter.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from Zope.TextIndex.ISplitter import ISplitter
from Zope.TextIndex.PipelineFactory import element_factory
import re
class HTMLWordSplitter:
__implements__ = ISplitter
def process(self, text, wordpat=r"(?L)\w+"):
splat = []
for t in text:
splat += self._split(t, wordpat)
return splat
def processGlob(self, text):
# see Lexicon.globToWordIds()
return self.process(text, r"(?L)\w+[\w*?]*")
def _split(self, text, wordpat):
text = text.lower()
remove = [r"<[^<>]*>",
r"&[A-Za-z]+;"]
for pat in remove:
text = re.sub(pat, " ", text)
return re.findall(wordpat, text)
element_factory.registerFactory('Word Splitter',
'HTML aware splitter',
HTMLWordSplitter)
if __name__ == "__main__":
import sys
splitter = HTMLWordSplitter()
for path in sys.argv[1:]:
f = open(path, "rb")
buf = f.read()
f.close()
print path
print splitter.process([buf])
=== Added File Zope3/lib/python/Zope/TextIndex/IIndex.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Index Interface."""
from Interface import Interface
class IIndex(Interface):
"""Interface for an Index."""
def length():
"""Return the number of documents in the index."""
def get_words(docid):
"""Return a list of wordids for the given docid."""
def search(term):
"""Execute a search on a single term given as a string.
Return an IIBTree mapping docid to score, or None if all docs
match due to the lexicon returning no wids for the term (e.g.,
if the term is entirely composed of stopwords).
"""
def search_phrase(phrase):
"""Execute a search on a phrase given as a string.
Return an IIBtree mapping docid to score.
"""
def search_glob(pattern):
"""Execute a pattern search.
The pattern represents a set of words by using * and ?. For
example, "foo*" represents the set of all words in the lexicon
starting with "foo".
Return an IIBTree mapping docid to score.
"""
def query_weight(terms):
"""Return the weight for a set of query terms.
'terms' is a sequence of all terms included in the query,
although not terms with a not. If a term appears more than
once in a query, it should appear more than once in terms.
Nothing is defined about what "weight" means, beyond that the
result is an upper bound on document scores returned for the
query.
"""
def index_doc(docid, text):
"XXX"
def unindex_doc(docid):
"XXX"
def has_doc(docid):
"""Returns true if docid is an id of a document in the index"""
=== Added File Zope3/lib/python/Zope/TextIndex/ILexicon.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Interface import Interface
class ILexicon(Interface):
"""Object responsible for converting text to word identifiers."""
def termToWordIds(text):
"""Return a sequence of ids of the words parsed from the text.
The input text may be either a string or a list of strings.
Parse the text as if they are search terms, and skips words
that aren't in the lexicon.
"""
def sourceToWordIds(text):
"""Return a sequence of ids of the words parsed from the text.
The input text may be either a string or a list of strings.
Parse the text as if they come from a source document, and
creates new word ids for words that aren't (yet) in the
lexicon.
"""
def globToWordIds(pattern):
"""Return a sequence of ids of words matching the pattern.
The argument should be a single word using globbing syntax,
e.g. 'foo*' meaning anything starting with 'foo'.
Return the wids for all words in the lexicon that match the
pattern.
"""
def length():
"""Return the number of unique term in the lexicon."""
def get_word(wid):
"""Return the word for the given word id.
Raise KeyError if the word id is not in the lexicon.
"""
def get_wid(word):
"""Return the wird id for the given word.
Return 0 of the word is not in the lexicon.
"""
def parseTerms(text):
"""Pass the text through the pipeline.
Return a list of words, normalized by the pipeline
(e.g. stopwords removed, case normalized etc.).
"""
def isGlob(word):
"""Return true if the word is a globbing pattern.
The word should be one of the words returned by parseTerm().
"""
=== Added File Zope3/lib/python/Zope/TextIndex/INBest.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""NBest Interface.
An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method. If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""
from Interface import Interface
class INBest(Interface):
"""Interface for an N-Best chooser."""
def add(item, score):
"""Record that item 'item' has score 'score'. No return value.
The N best-scoring items are remembered, where N was passed to
the constructor. 'item' can by anything. 'score' should be
a number, and larger numbers are considered better.
"""
def addmany(sequence):
"""Like "for item, score in sequence: self.add(item, score)".
This is simply faster than calling add() len(seq) times.
"""
def getbest():
"""Return the (at most) N best-scoring items as a sequence.
The return value is a sequence of 2-tuples, (item, score), with
the largest score first. If .add() has been called fewer than
N times, this sequence will contain fewer than N pairs.
"""
def pop_smallest():
"""Return and remove the (item, score) pair with lowest score.
If len(self) is 0, raise IndexError.
To be cleaer, this is the lowest score among the N best-scoring
seen so far. This is most useful if the capacity of the NBest
object is never exceeded, in which case pop_smallest() allows
using the object as an ordinary smallest-in-first-out priority
queue.
"""
def __len__():
"""Return the number of (item, score) pairs currently known.
This is N (the value passed to the constructor), unless .add()
has been called fewer than N times.
"""
def capacity():
"""Return the maximum number of (item, score) pairs.
This is N (the value passed to the constructor).
"""
=== Added File Zope3/lib/python/Zope/TextIndex/IPipelineElement.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Interface import Interface
class IPipelineElement(Interface):
def process(source):
"""Provide a text processing step.
Process a source sequence of words into a result sequence.
"""
def processGlob(source):
"""Process, passing through globbing metacharaters.
This is an optional method; if it is not used, process() is used.
"""
=== Added File Zope3/lib/python/Zope/TextIndex/IPipelineElementFactory.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Interface import Interface
class IPipelineElementFactory(Interface):
"""Class for creating pipeline elements by name"""
def registerFactory(group, name, factory):
"""Registers a pipeline factory by name and element group.
Each name can be registered only once for a given group. Duplicate
registrations will raise a ValueError
"""
def getFactoryGroups():
"""Returns a sorted list of element group names
"""
def getFactoryNames(group):
"""Returns a sorted list of registered pipeline factory names
in the specified element group
"""
def instantiate(group, name):
"""Instantiates a pipeline element by group and name. If name is not
registered raise a KeyError.
"""
=== Added File Zope3/lib/python/Zope/TextIndex/IQueryParseTree.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser Tree Interface."""
from Interface import Interface
class IQueryParseTree(Interface):
"""Interface for parse trees returned by parseQuery()."""
def nodeType():
"""Return the node type.
This is one of 'AND', 'OR', 'NOT', 'ATOM', 'PHRASE' or 'GLOB'.
"""
def getValue():
"""Return a node-type specific value.
For node type: Return:
'AND' a list of parse trees
'OR' a list of parse trees
'NOT' a parse tree
'ATOM' a string (representing a single search term)
'PHRASE' a string (representing a search phrase)
'GLOB' a string (representing a pattern, e.g. "foo*")
"""
def terms():
"""Return a list of all terms in this node, excluding NOT subtrees."""
def executeQuery(index):
"""Execute the query represented by this node against the index.
The index argument must implement the IIndex interface.
Return an IIBucket or IIBTree mapping document ids to scores
(higher scores mean better results).
May raise ParseTree.QueryError.
"""
=== Added File Zope3/lib/python/Zope/TextIndex/IQueryParser.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser Interface."""
from Interface import Interface
class IQueryParser(Interface):
"""Interface for Query Parsers."""
def parseQuery(query):
"""Parse a query string.
Return a parse tree (which implements IQueryParseTree).
Some of the query terms may be ignored because they are
stopwords; use getIgnored() to find out which terms were
ignored. But if the entire query consists only of stop words,
or of stopwords and one or more negated terms, an exception is
raised.
May raise ParseTree.ParseError.
"""
def getIgnored():
"""Return the list of ignored terms.
Return the list of terms that were ignored by the most recent
call to parseQuery() because they were stopwords.
If parseQuery() was never called this returns None.
"""
def parseQueryEx(query):
"""Parse a query string.
Return a tuple (tree, ignored) where 'tree' is the parse tree
as returned by parseQuery(), and 'ignored' is a list of
ignored terms as returned by getIgnored().
May raise ParseTree.ParseError.
"""
=== Added File Zope3/lib/python/Zope/TextIndex/ISplitter.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Interface import Interface
class ISplitter(Interface):
"""A splitter."""
def process(text):
"""Run the splitter over the input text, returning a list of terms."""
=== Added File Zope3/lib/python/Zope/TextIndex/Lexicon.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
import re
from Persistence.BTrees.IOBTree import IOBTree
from Persistence.BTrees.OIBTree import OIBTree
import ZODB
from Persistence import Persistent
from Zope.TextIndex.ILexicon import ILexicon
from Zope.TextIndex.StopDict import get_stopdict
from Zope.TextIndex.ParseTree import QueryError
from Zope.TextIndex.PipelineFactory import element_factory
class Lexicon(Persistent):
__implements__ = ILexicon
def __init__(self, *pipeline):
self._wids = OIBTree() # word -> wid
self._words = IOBTree() # wid -> word
# wid 0 is reserved for words that aren't in the lexicon (OOV -- out
# of vocabulary). This can happen, e.g., if a query contains a word
# we never saw before, and that isn't a known stopword (or otherwise
# filtered out). Returning a special wid value for OOV words is a
# way to let clients know when an OOV word appears.
self._nextwid = 1
self._pipeline = pipeline
# Keep some statistics about indexing
self._nbytes = 0 # Number of bytes indexed (at start of pipeline)
self._nwords = 0 # Number of words indexed (after pipeline)
def length(self):
"""Return the number of unique terms in the lexicon."""
return self._nextwid - 1
def words(self):
return self._wids.keys()
def wids(self):
return self._words.keys()
def items(self):
return self._wids.items()
def sourceToWordIds(self, text):
last = _text2list(text)
for t in last:
self._nbytes += len(t)
for element in self._pipeline:
last = element.process(last)
self._nwords += len(last)
return map(self._getWordIdCreate, last)
def termToWordIds(self, text):
last = _text2list(text)
for element in self._pipeline:
last = element.process(last)
wids = []
for word in last:
wids.append(self._wids.get(word, 0))
return wids
def parseTerms(self, text):
last = _text2list(text)
for element in self._pipeline:
process = getattr(element, "processGlob", element.process)
last = process(last)
return last
def isGlob(self, word):
return "*" in word or "?" in word
def get_word(self, wid):
return self._words[wid]
def get_wid(self, word):
return self._wids.get(word, 0)
def globToWordIds(self, pattern):
# Implement * and ? just as in the shell, except the pattern
# must not start with either of these
prefix = ""
while pattern and pattern[0] not in "*?":
prefix += pattern[0]
pattern = pattern[1:]
if not pattern:
# There were no globbing characters in the pattern
wid = self._wids.get(prefix, 0)
if wid:
return [wid]
else:
return []
if not prefix:
# The pattern starts with a globbing character.
# This is too efficient, so we raise an exception.
raise QueryError(
"pattern %r shouldn't start with glob character" % pattern)
pat = prefix
for c in pattern:
if c == "*":
pat += ".*"
elif c == "?":
pat += "."
else:
pat += re.escape(c)
pat += "$"
prog = re.compile(pat)
keys = self._wids.keys(prefix) # Keys starting at prefix
wids = []
for key in keys:
if not key.startswith(prefix):
break
if prog.match(key):
wids.append(self._wids[key])
return wids
def _getWordIdCreate(self, word):
wid = self._wids.get(word)
if wid is None:
wid = self._new_wid()
self._wids[word] = wid
self._words[wid] = word
return wid
def _new_wid(self):
wid = self._nextwid
self._nextwid += 1
return wid
def _text2list(text):
# Helper: splitter input may be a string or a list of strings
try:
text + ""
except:
return text
else:
return [text]
# Sample pipeline elements
class Splitter:
import re
rx = re.compile(r"(?L)\w+")
rxGlob = re.compile(r"(?L)\w+[\w*?]*") # See globToWordIds() above
def process(self, lst):
result = []
for s in lst:
result += self.rx.findall(s)
return result
def processGlob(self, lst):
result = []
for s in lst:
result += self.rxGlob.findall(s)
return result
element_factory.registerFactory('Word Splitter',
'Whitespace splitter',
Splitter)
class CaseNormalizer:
def process(self, lst):
return [w.lower() for w in lst]
element_factory.registerFactory('Case Normalizer',
'Case Normalizer',
CaseNormalizer)
element_factory.registerFactory('Stop Words',
' Don\'t remove stop words',
None)
class StopWordRemover:
dict = get_stopdict().copy()
try:
from Zope.TextIndex.stopper import process as _process
except ImportError:
def process(self, lst):
has_key = self.dict.has_key
return [w for w in lst if not has_key(w)]
else:
def process(self, lst):
return self._process(self.dict, lst)
element_factory.registerFactory('Stop Words',
'Remove listed stop words only',
StopWordRemover)
class StopWordAndSingleCharRemover(StopWordRemover):
dict = get_stopdict().copy()
for c in range(255):
dict[chr(c)] = None
element_factory.registerFactory('Stop Words',
'Remove listed and single char words',
StopWordAndSingleCharRemover)
=== Added File Zope3/lib/python/Zope/TextIndex/NBest.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""NBest
An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method. If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""
from bisect import bisect
from Zope.TextIndex.INBest import INBest
class NBest:
__implements__ = INBest
def __init__(self, N):
"Build an NBest object to remember the N best-scoring objects."
if N < 1:
raise ValueError("NBest() argument must be at least 1")
self._capacity = N
# This does a very simple thing with sorted lists. For large
# N, a min-heap can be unboundedly better in terms of data
# movement time.
self._scores = []
self._items = []
def __len__(self):
return len(self._scores)
def capacity(self):
return self._capacity
def add(self, item, score):
self.addmany([(item, score)])
def addmany(self, sequence):
scores, items, capacity = self._scores, self._items, self._capacity
n = len(scores)
for item, score in sequence:
# When we're in steady-state, the usual case is that we're filled
# to capacity, and that an incoming item is worse than any of
# the best-seen so far.
if n >= capacity and score <= scores[0]:
continue
i = bisect(scores, score)
scores.insert(i, score)
items.insert(i, item)
if n == capacity:
del items[0], scores[0]
else:
n += 1
assert n == len(scores)
def getbest(self):
result = zip(self._items, self._scores)
result.reverse()
return result
def pop_smallest(self):
if self._scores:
return self._items.pop(0), self._scores.pop(0)
raise IndexError("pop_smallest() called on empty NBest object")
=== Added File Zope3/lib/python/Zope/TextIndex/OkapiIndex.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Full text index with relevance ranking, using an Okapi BM25 rank."""
# Lots of comments are at the bottom of this file. Read them to
# understand what's going on.
from Persistence.BTrees.IIBTree import IIBucket
from Zope.TextIndex.IIndex import IIndex
from Zope.TextIndex.BaseIndex import \
BaseIndex, inverse_doc_frequency, scaled_int
##from Zope.TextIndex.okascore import score
class OkapiIndex(BaseIndex):
__implements__ = IIndex
# BM25 free parameters.
K1 = 1.2
B = 0.75
assert K1 >= 0.0
assert 0.0 <= B <= 1.0
def __init__(self, lexicon):
BaseIndex.__init__(self, lexicon)
# ._wordinfo for Okapi is
# wid -> {docid -> frequency}; t -> D -> f(D, t)
# ._docweight for Okapi is
# docid -> # of words in the doc
# This is just len(self._docwords[docid]), but _docwords is stored
# in compressed form, so uncompressing it just to count the list
# length would be ridiculously expensive.
# sum(self._docweight.values()), the total # of words in all docs
# This is a long for "better safe than sorry" reasons. It isn't
# used often enough that speed should matter.
self._totaldoclen = 0L
def index_doc(self, docid, text):
count = BaseIndex.index_doc(self, docid, text)
self._totaldoclen += count
return count
def _reindex_doc(self, docid, text):
self._totaldoclen -= self._docweight[docid]
return BaseIndex._reindex_doc(self, docid, text)
def unindex_doc(self, docid):
self._totaldoclen -= self._docweight[docid]
BaseIndex.unindex_doc(self, docid)
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
# for each wid t in wids. The IIBucket, times the weight, maps D to
# TF(D,t) * IDF(t) for every docid D containing t.
# As currently written, the weights are always 1, and the IIBucket maps
# D to TF(D,t)*IDF(t) directly, where the product is computed as a float
# but stored as a scaled_int.
# NOTE: This may be overridden below, by a function that computes the
# same thing but with the inner scoring loop in C.
def _search_wids(self, wids):
if not wids:
return []
N = float(len(self._docweight)) # total # of docs
meandoclen = self._totaldoclen / N
K1 = self.K1
B = self.B
K1_plus1 = K1 + 1.0
B_from1 = 1.0 - B
# f(D, t) * (k1 + 1)
# TF(D, t) = -------------------------------------------
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
L = []
docid2len = self._docweight
for t in wids:
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
result = IIBucket()
for docid, f in d2f.items():
lenweight = B_from1 + B * docid2len[docid] / meandoclen
tf = f * K1_plus1 / (f + K1 * lenweight)
result[docid] = scaled_int(tf * idf)
L.append((result, 1))
return L
# Note about the above: the result is tf * idf. tf is small -- it
# can't be larger than k1+1 = 2.2. idf is formally unbounded, but
# is less than 14 for a term that appears in only 1 of a million
# documents. So the product is probably less than 32, or 5 bits
# before the radix point. If we did the scaled-int business on
# both of them, we'd be up to 25 bits. Add 64 of those and we'd
# be in overflow territory. That's pretty unlikely, so we *could*
# just store scaled_int(tf) in result[docid], and use scaled_int(idf)
# as an invariant weight across the whole result. But besides
# skating near the edge, it's not a speed cure, since the computation
# of tf would still be done at Python speed, and it's a lot more
# work than just multiplying by idf.
# The same function as _search_wids above, but with the inner scoring
# loop written in C (module okascore, function score()).
# Cautions: okascore hardcodes the values of K, B1, and the scaled_int
# function.
def _search_wids_NOTYET(self, wids):
if not wids:
return []
N = float(len(self._docweight)) # total # of docs
meandoclen = self._totaldoclen / N
#K1 = self.K1
#B = self.B
#K1_plus1 = K1 + 1.0
#B_from1 = 1.0 - B
# f(D, t) * (k1 + 1)
# TF(D, t) = -------------------------------------------
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
L = []
docid2len = self._docweight
for t in wids:
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
result = IIBucket()
score(result, d2f.items(), docid2len, idf, meandoclen)
L.append((result, 1))
return L
def query_weight(self, terms):
# Get the wids.
wids = []
for term in terms:
termwids = self._lexicon.termToWordIds(term)
wids.extend(termwids)
# The max score for term t is the maximum value of
# TF(D, t) * IDF(Q, t)
# We can compute IDF directly, and as noted in the comments below
# TF(D, t) is bounded above by 1+K1.
N = float(len(self._docweight))
tfmax = 1.0 + self.K1
sum = 0
for t in self._remove_oov_wids(wids):
idf = inverse_doc_frequency(len(self._wordinfo[t]), N)
sum += scaled_int(idf * tfmax)
return sum
def _get_frequencies(self, wids):
d = {}
dget = d.get
for wid in wids:
d[wid] = dget(wid, 0) + 1
return d, len(wids)
"""
"Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.
It's based on probability arguments about how words are distributed in
documents, not on an abstract vector space model. A long paper by its
principal inventors gives an excellent overview of how it was derived:
A probabilistic model of information retrieval: development and status
K. Sparck Jones, S. Walker, S.E. Robertson
http://citeseer.nj.nec.com/jones98probabilistic.html
Spellings that ignore relevance information (which we don't have) are of this
high-level form:
score(D, Q) = sum(for t in D&Q: TF(D, t) * IDF(Q, t))
where
D a specific document
Q a specific query
t a term (word, atomic phrase, whatever)
D&Q the terms common to D and Q
TF(D, t) a measure of t's importance in D -- a kind of term frequency
weight
IDF(Q, t) a measure of t's importance in the query and in the set of
documents as a whole -- a kind of inverse document frequency
weight
The IDF(Q, t) here is identical to the one used for our cosine measure.
Since queries are expected to be short, it ignores Q entirely:
IDF(Q, t) = log(1.0 + N / f(t))
where
N the total number of documents
f(t) the number of documents in which t appears
Most Okapi literature seems to use log(N/f(t)) instead. We don't, because
that becomes 0 for a term that's in every document, and, e.g., if someone
is searching for "documentation" on python.org (a term that may well show
up on every page, due to the top navigation bar), we still want to find the
pages that use the word a lot (which is TF's job to find, not IDF's -- we
just want to stop IDF from considering this t to be irrelevant).
The TF(D, t) spellings are more interesting. With lots of variations, the
most basic spelling is of the form
f(D, t)
TF(D, t) = ---------------
f(D, t) + K(D)
where
f(D, t) the number of times t appears in D
K(D) a measure of the length of D, normalized to mean doc length
The functional *form* f/(f+K) is clever. It's a gross approximation to a
mixture of two distinct Poisson distributions, based on the idea that t
probably appears in D for one of two reasons:
1. More or less at random.
2. Because it's important to D's purpose in life ("eliteness" in papers).
Note that f/(f+K) is always between 0 and 1. If f is very large compared to
K, it approaches 1. If K is very large compared to f, it approaches 0. If
t appears in D more or less "for random reasons", f is likely to be small,
and so K will dominate unless it's a very small doc, and the ratio will be
small. OTOH, if t appears a lot in D, f will dominate unless it's a very
large doc, and the ratio will be close to 1.
We use a variation on that simple theme, a simplification of what's called
BM25 in the literature (it was the 25th stab at a Best Match function from
the Okapi group; "a simplification" means we're setting some of BM25's more
esoteric free parameters to 0):
f(D, t) * (k1 + 1)
TF(D, t) = --------------------
f(D, t) + k1 * K(D)
where
k1 a "tuning factor", typically between 1.0 and 2.0. We use 1.2,
the usual default value. This constant adjusts the curve to
look more like a theoretical 2-Poisson curve.
Note that as f(D, t) increases, TF(D, t) increases monotonically, approaching
an asymptote of k1+1 from below.
Finally, we use
K(D) = (1-b) + b * len(D)/E(len(D))
where
b is another free parameter, discussed below. We use 0.75.
len(D) the length of D in words
E(len(D)) the expected value of len(D) across the whole document set;
or, IOW, the average document length
b is a free parameter between 0.0 and 1.0, and adjusts for the expected effect
of the "Verbosity Hypothesis". Suppose b is 1, and some word t appears
10 times as often in document d2 than in document d1. If document d2 is
also 10 times as long as d1, TF(d1, t) and TF(d2, t) are identical:
f(d2, t) * (k1 + 1)
TF(d2, t) = --------------------------------- =
f(d2, t) + k1 * len(d2)/E(len(D))
10 * f(d1, t) * (k1 + 1)
----------------------------------------------- = TF(d1, t)
10 * f(d1, t) + k1 * (10 * len(d1))/E(len(D))
because the 10's cancel out. This is appropriate if we believe that a word
appearing 10x more often in a doc 10x as long is simply due to that the
longer doc is more verbose. If we do believe that, the longer doc and the
shorter doc are probably equally relevant. OTOH, it *could* be that the
longer doc is talking about t in greater depth too, in which case it's
probably more relevant than the shorter doc.
At the other extreme, if we set b to 0, the len(D)/E(len(D)) term vanishes
completely, and a doc scores higher for having more occurences of a word
regardless of the doc's length.
Reality is between these extremes, and probably varies by document and word
too. Reports in the literature suggest that b=0.75 is a good compromise "in
general", favoring the "verbosity hypothesis" end of the scale.
Putting it all together, the final TF function is
f(D, t) * (k1 + 1)
TF(D, t) = --------------------------------------------
f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
with k1=1.2 and b=0.75.
Query Term Weighting
--------------------
I'm ignoring the query adjustment part of Okapi BM25 because I expect our
queries are very short. Full BM25 takes them into account by adding the
following to every score(D, Q); it depends on the lengths of D and Q, but
not on the specific words in Q, or even on whether they appear in D(!):
E(len(D)) - len(D)
k2 * len(Q) * -------------------
E(len(D)) + len(D)
Here k2 is another "tuning constant", len(Q) is the number of words in Q, and
len(D) & E(len(D)) were defined above. The Okapi group set k2 to 0 in TREC-9,
so it apparently doesn't do much good (or may even hurt).
Full BM25 *also* multiplies the following factor into IDF(Q, t):
f(Q, t) * (k3 + 1)
------------------
f(Q, t) + k3
where k3 is yet another free parameter, and f(Q,t) is the number of times t
appears in Q. Since we're using short "web style" queries, I expect f(Q,t)
to always be 1, and then that quotient is
1 * (k3 + 1)
------------ = 1
1 + k3
regardless of k3's value. So, in a trivial sense, we are incorporating
this measure (and optimizing it by not bothering to multiply by 1 <wink>).
"""
=== Added File Zope3/lib/python/Zope/TextIndex/ParseTree.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Generic parser support: exception and parse tree nodes."""
from Persistence.BTrees.IIBTree import difference
from Zope.TextIndex.IQueryParseTree import IQueryParseTree
from Zope.TextIndex.SetOps import mass_weightedIntersection, \
mass_weightedUnion
class QueryError(Exception):
pass
class ParseError(Exception):
pass
class ParseTreeNode:
__implements__ = IQueryParseTree
_nodeType = None
def __init__(self, value):
self._value = value
def nodeType(self):
return self._nodeType
def getValue(self):
return self._value
def __repr__(self):
return "%s(%r)" % (self.__class__.__name__, self.getValue())
def terms(self):
t = []
for v in self.getValue():
t.extend(v.terms())
return t
def executeQuery(self, index):
raise NotImplementedError
class NotNode(ParseTreeNode):
_nodeType = "NOT"
def terms(self):
return []
def executeQuery(self, index):
raise QueryError, "NOT parse tree node cannot be executed directly"
class AndNode(ParseTreeNode):
_nodeType = "AND"
def executeQuery(self, index):
L = []
Nots = []
for subnode in self.getValue():
if subnode.nodeType() == "NOT":
r = subnode.getValue().executeQuery(index)
# If None, technically it matches every doc, but we treat
# it as if it matched none (we want
# real_word AND NOT stop_word
# to act like plain real_word).
if r is not None:
Nots.append((r, 1))
else:
r = subnode.executeQuery(index)
# If None, technically it matches every doc, so needn't be
# included.
if r is not None:
L.append((r, 1))
set = mass_weightedIntersection(L)
if Nots:
notset = mass_weightedUnion(Nots)
set = difference(set, notset)
return set
class OrNode(ParseTreeNode):
_nodeType = "OR"
def executeQuery(self, index):
weighted = []
for node in self.getValue():
r = node.executeQuery(index)
# If None, technically it matches every doc, but we treat
# it as if it matched none (we want
# real_word OR stop_word
# to act like plain real_word).
if r is not None:
weighted.append((r, 1))
return mass_weightedUnion(weighted)
class AtomNode(ParseTreeNode):
_nodeType = "ATOM"
def terms(self):
return [self.getValue()]
def executeQuery(self, index):
return index.search(self.getValue())
class PhraseNode(AtomNode):
_nodeType = "PHRASE"
def executeQuery(self, index):
return index.search_phrase(self.getValue())
class GlobNode(AtomNode):
_nodeType = "GLOB"
def executeQuery(self, index):
return index.search_glob(self.getValue())
=== Added File Zope3/lib/python/Zope/TextIndex/PipelineFactory.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Zope.TextIndex.IPipelineElementFactory \
import IPipelineElementFactory
class PipelineElementFactory:
__implements__ = IPipelineElementFactory
def __init__(self):
self._groups = {}
def registerFactory(self, group, name, factory):
if self._groups.has_key(group) and \
self._groups[group].has_key(name):
raise ValueError('ZCTextIndex lexicon element "%s" '
'already registered in group "%s"'
% (name, group))
elements = self._groups.get(group)
if elements is None:
elements = self._groups[group] = {}
elements[name] = factory
def getFactoryGroups(self):
groups = self._groups.keys()
groups.sort()
return groups
def getFactoryNames(self, group):
names = self._groups[group].keys()
names.sort()
return names
def instantiate(self, group, name):
factory = self._groups[group][name]
if factory is not None:
return factory()
element_factory = PipelineElementFactory()
=== Added File Zope3/lib/python/Zope/TextIndex/QueryParser.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser.
This particular parser recognizes the following syntax:
Start = OrExpr
OrExpr = AndExpr ('OR' AndExpr)*
AndExpr = Term ('AND' NotExpr)*
NotExpr = ['NOT'] Term
Term = '(' OrExpr ')' | ATOM+
The key words (AND, OR, NOT) are recognized in any mixture of case.
An ATOM is either:
+ A sequence of characters not containing whitespace or parentheses or
double quotes, and not equal (ignoring case) to one of the key words
'AND', 'OR', 'NOT'; or
+ A non-empty string enclosed in double quotes. The interior of the
string can contain whitespace, parentheses and key words, but not
quotes.
+ A hyphen followed by one of the two forms above, meaning that it
must not be present.
An unquoted ATOM may also contain globbing characters. Globbing
syntax is defined by the lexicon; for example "foo*" could mean any
word starting with "foo".
When multiple consecutive ATOMs are found at the leaf level, they are
connected by an implied AND operator, and an unquoted leading hyphen
is interpreted as a NOT operator.
Summarizing the default operator rules:
- a sequence of words without operators implies AND, e.g. ``foo bar''
- double-quoted text implies phrase search, e.g. ``"foo bar"''
- words connected by punctuation implies phrase search, e.g. ``foo-bar''
- a leading hyphen implies NOT, e.g. ``foo -bar''
- these can be combined, e.g. ``foo -"foo bar"'' or ``foo -foo-bar''
- * and ? are used for globbing (i.e. prefix search), e.g. ``foo*''
"""
import re
from Zope.TextIndex.IQueryParser import IQueryParser
from Zope.TextIndex import ParseTree
# Create unique symbols for token types.
_AND = intern("AND")
_OR = intern("OR")
_NOT = intern("NOT")
_LPAREN = intern("(")
_RPAREN = intern(")")
_ATOM = intern("ATOM")
_EOF = intern("EOF")
# Map keyword string to token type.
_keywords = {
_AND: _AND,
_OR: _OR,
_NOT: _NOT,
_LPAREN: _LPAREN,
_RPAREN: _RPAREN,
}
# Regular expression to tokenize.
_tokenizer_regex = re.compile(r"""
# a paren
[()]
# or an optional hyphen
| -?
# followed by
(?:
# a string inside double quotes (and not containing these)
" [^"]* "
# or a non-empty stretch w/o whitespace, parens or double quotes
| [^()\s"]+
)
""", re.VERBOSE)
class QueryParser:
__implements__ = IQueryParser
# This class is not thread-safe;
# each thread should have its own instance
def __init__(self, lexicon):
self._lexicon = lexicon
self._ignored = None
# Public API methods
def parseQuery(self, query):
# Lexical analysis.
tokens = _tokenizer_regex.findall(query)
self._tokens = tokens
# classify tokens
self._tokentypes = [_keywords.get(token.upper(), _ATOM)
for token in tokens]
# add _EOF
self._tokens.append(_EOF)
self._tokentypes.append(_EOF)
self._index = 0
# Syntactical analysis.
self._ignored = [] # Ignored words in the query, for parseQueryEx
tree = self._parseOrExpr()
self._require(_EOF)
if tree is None:
raise ParseTree.ParseError(
"Query contains only common words: %s" % repr(query))
return tree
def getIgnored(self):
return self._ignored
def parseQueryEx(self, query):
tree = self.parseQuery(query)
ignored = self.getIgnored()
return tree, ignored
# Recursive descent parser
def _require(self, tokentype):
if not self._check(tokentype):
t = self._tokens[self._index]
msg = "Token %r required, %r found" % (tokentype, t)
raise ParseTree.ParseError, msg
def _check(self, tokentype):
if self._tokentypes[self._index] is tokentype:
self._index += 1
return 1
else:
return 0
def _peek(self, tokentype):
return self._tokentypes[self._index] is tokentype
def _get(self, tokentype):
t = self._tokens[self._index]
self._require(tokentype)
return t
def _parseOrExpr(self):
L = []
L.append(self._parseAndExpr())
while self._check(_OR):
L.append(self._parseAndExpr())
L = filter(None, L)
if not L:
return None # Only stopwords
elif len(L) == 1:
return L[0]
else:
return ParseTree.OrNode(L)
def _parseAndExpr(self):
L = []
t = self._parseTerm()
if t is not None:
L.append(t)
Nots = []
while self._check(_AND):
t = self._parseNotExpr()
if t is None:
continue
if isinstance(t, ParseTree.NotNode):
Nots.append(t)
else:
L.append(t)
if not L:
return None # Only stopwords
L.extend(Nots)
if len(L) == 1:
return L[0]
else:
return ParseTree.AndNode(L)
def _parseNotExpr(self):
if self._check(_NOT):
t = self._parseTerm()
if t is None:
return None # Only stopwords
return ParseTree.NotNode(t)
else:
return self._parseTerm()
def _parseTerm(self):
if self._check(_LPAREN):
tree = self._parseOrExpr()
self._require(_RPAREN)
else:
nodes = []
nodes = [self._parseAtom()]
while self._peek(_ATOM):
nodes.append(self._parseAtom())
nodes = filter(None, nodes)
if not nodes:
return None # Only stopwords
structure = [(isinstance(nodes[i], ParseTree.NotNode), i, nodes[i])
for i in range(len(nodes))]
structure.sort()
nodes = [node for (bit, index, node) in structure]
if isinstance(nodes[0], ParseTree.NotNode):
raise ParseTree.ParseError(
"a term must have at least one positive word")
if len(nodes) == 1:
return nodes[0]
tree = ParseTree.AndNode(nodes)
return tree
def _parseAtom(self):
term = self._get(_ATOM)
words = self._lexicon.parseTerms(term)
if not words:
self._ignored.append(term)
return None
if len(words) > 1:
tree = ParseTree.PhraseNode(words)
elif self._lexicon.isGlob(words[0]):
tree = ParseTree.GlobNode(words[0])
else:
tree = ParseTree.AtomNode(words[0])
if term[0] == "-":
tree = ParseTree.NotNode(tree)
return tree
=== Added File Zope3/lib/python/Zope/TextIndex/RiceCode.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Rice coding (a variation of Golomb coding)
Based on a Java implementation by Glen McCluskey described in a Usenix
;login: article at
http://www.usenix.org/publications/login/2000-4/features/java.html
McCluskey's article explains the approach as follows. The encoding
for a value x is represented as a unary part and a binary part. The
unary part is a sequence of 1 bits followed by a 0 bit. The binary
part encodes some of the lower bits of x-1.
The encoding is parameterized by a value m that describes how many
bits to store in the binary part. If most of the values are smaller
than 2**m then they can be stored in only m+1 bits.
Compute the length of the unary part, q, where
q = math.floor((x-1)/ 2 ** m)
Emit q 1 bits followed by a 0 bit.
Emit the lower m bits of x-1, treating x-1 as a binary value.
"""
import array
class BitArray:
def __init__(self, buf=None):
self.bytes = array.array('B')
self.nbits = 0
self.bitsleft = 0
self.tostring = self.bytes.tostring
def __getitem__(self, i):
byte, offset = divmod(i, 8)
mask = 2 ** offset
if self.bytes[byte] & mask:
return 1
else:
return 0
def __setitem__(self, i, val):
byte, offset = divmod(i, 8)
mask = 2 ** offset
if val:
self.bytes[byte] |= mask
else:
self.bytes[byte] &= ~mask
def __len__(self):
return self.nbits
def append(self, bit):
"""Append a 1 if bit is true or 1 if it is false."""
if self.bitsleft == 0:
self.bytes.append(0)
self.bitsleft = 8
self.__setitem__(self.nbits, bit)
self.nbits += 1
self.bitsleft -= 1
def __getstate__(self):
return self.nbits, self.bitsleft, self.tostring()
def __setstate__(self, (nbits, bitsleft, s)):
self.bytes = array.array('B', s)
self.nbits = nbits
self.bitsleft = bitsleft
class RiceCode:
def __init__(self, m):
"""Constructor a RiceCode for m-bit values."""
if not (0 <= m <= 16):
raise ValueError, "m must be between 0 and 16"
self.init(m)
self.bits = BitArray()
self.len = 0
def init(self, m):
self.m = m
self.lower = (1 << m) - 1
self.mask = 1 << (m - 1)
def append(self, val):
"""Append an item to the list."""
if val < 1:
raise ValueError, "value >= 1 expected, got %s" % `val`
val -= 1
# emit the unary part of the code
q = val >> self.m
for i in range(q):
self.bits.append(1)
self.bits.append(0)
# emit the binary part
r = val & self.lower
mask = self.mask
while mask:
self.bits.append(r & mask)
mask >>= 1
self.len += 1
def __len__(self):
return self.len
def tolist(self):
"""Return the items as a list."""
l = []
i = 0 # bit offset
binary_range = range(self.m)
for j in range(self.len):
unary = 0
while self.bits[i] == 1:
unary += 1
i += 1
assert self.bits[i] == 0
i += 1
binary = 0
for k in binary_range:
binary = (binary << 1) | self.bits[i]
i += 1
l.append((unary << self.m) + (binary + 1))
return l
def tostring(self):
"""Return a binary string containing the encoded data.
The binary string may contain some extra zeros at the end.
"""
return self.bits.tostring()
def __getstate__(self):
return self.m, self.bits
def __setstate__(self, (m, bits)):
self.init(m)
self.bits = bits
def encode(m, l):
c = RiceCode(m)
for elt in l:
c.append(elt)
assert c.tolist() == l
return c
def encode_deltas(l):
if len(l) == 1:
return l[0], []
deltas = RiceCode(6)
deltas.append(l[1] - l[0])
for i in range(2, len(l)):
deltas.append(l[i] - l[i - 1])
return l[0], deltas
def decode_deltas(start, enc_deltas):
deltas = enc_deltas.tolist()
l = [start]
for i in range(1, len(deltas)):
l.append(l[i-1] + deltas[i])
l.append(l[-1] + deltas[-1])
return l
def test():
import random
for size in [10, 20, 50, 100, 200]:
l = [random.randint(1, size) for i in range(50)]
c = encode(random.randint(1, 16), l)
assert c.tolist() == l
for size in [10, 20, 50, 100, 200]:
l = range(random.randint(1, size), size + random.randint(1, size))
t = encode_deltas(l)
l2 = decode_deltas(*t)
assert l == l2
if l != l2:
print l
print l2
def pickle_efficiency():
import pickle
import random
for m in [4, 8, 12]:
for size in [10, 20, 50, 100, 200, 500, 1000, 2000, 5000]:
for elt_range in [10, 20, 50, 100, 200, 500, 1000]:
l = [random.randint(1, elt_range) for i in range(size)]
raw = pickle.dumps(l, 1)
enc = pickle.dumps(encode(m, l), 1)
print "m=%2d size=%4d range=%4d" % (m, size, elt_range),
print "%5d %5d" % (len(raw), len(enc)),
if len(raw) > len(enc):
print "win"
else:
print "lose"
if __name__ == "__main__":
test()
=== Added File Zope3/lib/python/Zope/TextIndex/SetOps.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""SetOps -- Weighted intersections and unions applied to many inputs."""
from Persistence.BTrees.IIBTree import \
IIBucket, weightedIntersection, weightedUnion
from Zope.TextIndex.NBest import NBest
def mass_weightedIntersection(L):
"A list of (mapping, weight) pairs -> their weightedIntersection IIBucket."
L = [(x, wx) for (x, wx) in L if x is not None]
if len(L) < 2:
return _trivial(L)
# Intersect with smallest first. We expect the input maps to be
# IIBuckets, so it doesn't hurt to get their lengths repeatedly
# (len(Bucket) is fast; len(BTree) is slow).
L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
(x, wx), (y, wy) = L[:2]
dummy, result = weightedIntersection(x, y, wx, wy)
for x, wx in L[2:]:
dummy, result = weightedIntersection(result, x, 1, wx)
return result
def mass_weightedUnion(L):
"A list of (mapping, weight) pairs -> their weightedUnion IIBucket."
if len(L) < 2:
return _trivial(L)
# Balance unions as closely as possible, smallest to largest.
merge = NBest(len(L))
for x, weight in L:
merge.add((x, weight), len(x))
while len(merge) > 1:
# Merge the two smallest so far, and add back to the queue.
(x, wx), dummy = merge.pop_smallest()
(y, wy), dummy = merge.pop_smallest()
dummy, z = weightedUnion(x, y, wx, wy)
merge.add((z, 1), len(z))
(result, weight), dummy = merge.pop_smallest()
return result
def _trivial(L):
# L is empty or has only one (mapping, weight) pair. If there is a
# pair, we may still need to multiply the mapping by its weight.
assert len(L) <= 1
if len(L) == 0:
return IIBucket()
[(result, weight)] = L
if weight != 1:
dummy, result = weightedUnion(IIBucket(), result, 0, weight)
return result
=== Added File Zope3/lib/python/Zope/TextIndex/StopDict.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Provide a default list of stop words for the index.
The specific splitter and lexicon are customizable, but the default
ZCTextIndex should do something useful.
"""
def get_stopdict():
"""Return a dictionary of stopwords."""
return _dict
# This list of English stopwords comes from Lucene
_words = [
"a", "and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "such",
"that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
]
_dict = {}
for w in _words:
_dict[w] = None
=== Added File Zope3/lib/python/Zope/TextIndex/WidCode.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
# A byte-aligned encoding for lists of non-negative ints, using fewer bytes
# for smaller ints. This is intended for lists of word ids (wids). The
# ordinary string .find() method can be used to find the encoded form of a
# desired wid-string in an encoded wid-string. As in UTF-8, the initial byte
# of an encoding can't appear in the interior of an encoding, so find() can't
# be fooled into starting a match "in the middle" of an encoding. Unlike
# UTF-8, the initial byte does not tell you how many continuation bytes
# follow; and there's no ASCII superset property.
# Details:
#
# + Only the first byte of an encoding has the sign bit set.
#
# + The first byte has 7 bits of data.
#
# + Bytes beyond the first in an encoding have the sign bit clear, followed
# by 7 bits of data.
#
# + The first byte doesn't tell you how many continuation bytes are
# following. You can tell by searching for the next byte with the
# high bit set (or the end of the string).
#
# The int to be encoded can contain no more than 28 bits.
#
# If it contains no more than 7 bits, 0abcdefg, the encoding is
# 1abcdefg
#
# If it contains 8 thru 14 bits,
# 00abcdef ghijkLmn
# the encoding is
# 1abcdefg 0hijkLmn
#
# Static tables _encoding and _decoding capture all encodes and decodes for
# 14 or fewer bits.
#
# If it contains 15 thru 21 bits,
# 000abcde fghijkLm nopqrstu
# the encoding is
# 1abcdefg 0hijkLmn 0opqrstu
#
# If it contains 22 thru 28 bits,
# 0000abcd efghijkL mnopqrst uvwxyzAB
# the encoding is
# 1abcdefg 0hijkLmn 0opqrstu 0vwxyzAB
assert 0x80**2 == 0x4000
assert 0x80**4 == 0x10000000
import re
def encode(wids):
# Encode a list of wids as a string.
wid2enc = _encoding
n = len(wid2enc)
return "".join([w < n and wid2enc[w] or _encode(w) for w in wids])
_encoding = [None] * 0x4000 # Filled later, and converted to a tuple
def _encode(w):
assert 0x4000 <= w < 0x10000000
b, c = divmod(w, 0x80)
a, b = divmod(b, 0x80)
s = chr(b) + chr(c)
if a < 0x80: # no more than 21 data bits
return chr(a + 0x80) + s
a, b = divmod(a, 0x80)
assert a < 0x80, (w, a, b, s) # else more than 28 data bits
return (chr(a + 0x80) + chr(b)) + s
_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")
def decode(code):
# Decode a string into a list of wids.
get = _decoding.get
# Obscure: while _decoding does have the key '\x80', its value is 0,
# so the "or" here calls _decode('\x80') anyway.
return [get(p) or _decode(p) for p in _prog.findall(code)]
_decoding = {} # Filled later
def _decode(s):
if s == '\x80':
# See comment in decode(). This is here to allow a trick to work.
return 0
if len(s) == 3:
a, b, c = map(ord, s)
assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80
return ((a & 0x7F) << 14) | (b << 7) | c
assert len(s) == 4, `s`
a, b, c, d = map(ord, s)
assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80 and not d & 0x80
return ((a & 0x7F) << 21) | (b << 14) | (c << 7) | d
def _fill():
global _encoding
for i in range(0x80):
s = chr(i + 0x80)
_encoding[i] = s
_decoding[s] = i
for i in range(0x80, 0x4000):
hi, lo = divmod(i, 0x80)
s = chr(hi + 0x80) + chr(lo)
_encoding[i] = s
_decoding[s] = i
_encoding = tuple(_encoding)
_fill()
def test():
for i in range(2**20):
if i % 1000 == 0: print i
wids = [i]
code = encode(wids)
assert decode(code) == wids, (wids, code, decode(code))
if __name__ == "__main__":
test()
=== Added File Zope3/lib/python/Zope/TextIndex/__init__.py ===
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""TextIndex for Zope3."""