[Zope-CVS] CVS: Products/ZCTextIndex - HTMLSplitter.py:1.2 IIndex.py:1.2 ILexicon.py:1.2 INBest.py:1.2 IPipelineElement.py:1.2 IQueryParser.py:1.2 ISplitter.py:1.2 Index.py:1.2 Lexicon.py:1.2 NBest.py:1.2 OkapiIndex.py:1.2 ParseTree.py:1.2 QueryParser.py:1.2 RiceCode.py:1.2 Setup:1.2 StopDict.py:1.2 WidCode.py:1.2 ZCTextIndex.py:1.2 __init__.py:1.2 stopper.c:1.2
Guido van Rossum
guido@python.org
Tue, 14 May 2002 11:13:05 -0400
Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv10099
Added Files:
HTMLSplitter.py IIndex.py ILexicon.py INBest.py
IPipelineElement.py IQueryParser.py ISplitter.py Index.py
Lexicon.py NBest.py OkapiIndex.py ParseTree.py QueryParser.py
RiceCode.py Setup StopDict.py WidCode.py ZCTextIndex.py
__init__.py stopper.c
Log Message:
Merged TextIndexDS9-branch into trunk.
=== Products/ZCTextIndex/HTMLSplitter.py 1.1 => 1.2 ===
+
+import re
+
+class HTMLSplitter:
+
+ __implements__ = ISplitter
+
+ def process(self, text):
+ return re.sub('<[^>]*>', ' ', text).split()
+
+class HTMLWordSplitter:
+
+ __implements__ = ISplitter
+
+ def process(self, text):
+ splat = []
+ for t in text:
+ splat += self.split(t)
+ return splat
+
+ def split(self, text):
+ text = text.lower()
+ remove = ["<[^>]*>",
+ "&[A-Za-z]+;",
+ "\W+"]
+ for pat in remove:
+ text = re.sub(pat, " ", text)
+ rx = re.compile("[A-Za-z]")
+ return [word for word in text.split()
+ if len(word) > 1 and rx.search(word)]
+
+if __name__ == "__main__":
+ import sys
+ splitter = HTMLWordSplitter()
+ for path in sys.argv[1:]:
+ f = open(path, "rb")
+ buf = f.read()
+ f.close()
+ print path
+ print splitter.process([buf])
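
For orientation, a minimal usage sketch of the word splitter above (illustrative
only, not part of the checked-in file; it assumes the product is importable):

    from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter

    splitter = HTMLWordSplitter()
    # Tags and entities are stripped, text is lowercased, and only words longer
    # than one character that contain a letter survive.
    print splitter.process(["<p>Hello, &amp; goodbye <b>World</b>!</p>"])
    # -> ['hello', 'goodbye', 'world']
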
=== Products/ZCTextIndex/IIndex.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""Index Interface."""
+
+import Interface
+
+class IIndex(Interface.Base):
+ """Interface for an Index."""
+
+ def search(term):
+ """Execute a search on a single term given as a string.
+
+ Return an IIBucket.
+ """
+
+ def search_phrase(phrase):
+ """Execute a search on a phrase given as a string.
+
+ Return an IIBucket.
+ """
+
+ def search_glob(pattern):
+ """Execute a pattern search.
+
+ The pattern represents a set of words by using * and ?. For
+ example, "foo*" represents the set of all words in the lexicon
+ starting with "foo".
+
+ NOTE: Currently only a single trailing * is supported.
+
+ Return an IIBucket.
+ """
+
+ def query_weight(terms):
+ """Return the weight for a set of query terms.
+
+ 'terms' is a sequence of all terms included in the query,
+ excluding terms governed by a NOT operator. If a term appears more
+ than once in the query, it should appear more than once in 'terms'.
+ """
+
+ def index_doc(docid, text):
+ "XXX"
+
+ def unindex_doc(docid):
+ "XXX"
=== Products/ZCTextIndex/ILexicon.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+from Interface import Base as Interface
+
+class ILexicon(Interface):
+ """Object responsible for converting text to word identifiers."""
+
+ def termToWordIds(text):
+ """Return a sequence of ids of the words parsed from the text.
+
+ The input text may be either a string or a list of strings.
+
+ Parses the text as search terms, skipping words that aren't in
+ the lexicon.
+ """
+
+ def sourceToWordIds(text):
+ """Return a sequence of ids of the words parsed from the text.
+
+ The input text may be either a string or a list of strings.
+
+ Parses the text as coming from a source document, creating new
+ word ids for words that aren't (yet) in the lexicon.
+ """
+
+ def globToWordIds(pattern):
+ """Return a sequence of ids of words matching the pattern.
+
+ The argument should be a single word using globbing syntax,
+ e.g. 'foo*' meaning anything starting with 'foo'.
+
+ NOTE: Currently only a single trailing * is supported.
+
+ Returns the wids for all words in the lexicon that match the
+ pattern.
+ """
+
+ def length():
+ """Return the number of unique term in the lexicon."""
=== Products/ZCTextIndex/INBest.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""NBest Interface.
+
+An NBest object remembers the N best-scoring items ever passed to its
+.add(item, score) method. If .add() is called M times, the worst-case
+number of comparisons performed overall is M * log2(N).
+"""
+
+
+import Interface
+
+class INBest(Interface.Base):
+ """Interface for an N-Best chooser."""
+
+ def add(item, score):
+ """Record that item 'item' has score 'score'. No return value.
+
+ The N best-scoring items are remembered, where N was passed to
+ the constructor. 'item' can be anything. 'score' should be
+ a number, and larger numbers are considered better.
+ """
+
+ def addmany(sequence):
+ """Like "for item, score in sequence: self.add(item, score)".
+
+ This is simply faster than calling add() len(sequence) times.
+ """
+
+ def getbest():
+ """Return the (at most) N best-scoring items as a sequence.
+
+ The return value is a sequence of 2-tuples, (item, score), with
+ the largest score first. If .add() has been called fewer than
+ N times, this sequence will contain fewer than N pairs.
+ """
+
+ def pop_smallest():
+ """Return and remove the (item, score) pair with lowest score.
+
+ If len(self) is 0, raise IndexError.
+
+ To be clearer, this is the lowest score among the N best-scoring
+ seen so far. This is most useful if the capacity of the NBest
+ object is never exceeded, in which case pop_smallest() allows
+ using the object as an ordinary smallest-in-first-out priority
+ queue.
+ """
+
+ def __len__():
+ """Return the number of (item, score) pairs currently known.
+
+ This is N (the value passed to the constructor), unless .add()
+ has been called fewer than N times.
+ """
+
+ def capacity():
+ """Return the maximum number of (item, score) pairs.
+
+ This is N (the value passed to the constructor).
+ """
=== Products/ZCTextIndex/IPipelineElement.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+from Interface import Base as Interface
+
+class IPipelineElement(Interface):
+
+ def process(source):
+ """Provide a text processing step.
+
+ Process a source sequence of words into a result sequence.
+ """
=== Products/ZCTextIndex/IQueryParser.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""Query Parser Interface."""
+
+import Interface
+
+class IQueryParser(Interface.Base):
+ """Interface for Query Parsers."""
+
+ def parseQuery(query):
+ """Parse a query string.
+
+ Return a parse tree (which implements IQueryParseTree).
+
+ May raise ParseTree.ParseError.
+ """
+
+class IQueryParseTree(Interface.Base):
+ """Interface for parse trees returned by parseQuery()."""
+
+ def nodeType():
+ """Return the node type.
+
+ This is one of 'AND', 'OR', 'NOT', 'ATOM', 'PHRASE' or 'GLOB'.
+ """
+
+ def getValue():
+ """Return a node-type specific value.
+
+ For node type: Return:
+ 'AND' a list of parse trees
+ 'OR' a list of parse trees
+ 'NOT' a parse tree
+ 'ATOM' a string (representing a single search term)
+ 'PHRASE' a string (representing a search phrase)
+ 'GLOB' a string (representing a pattern, e.g. "foo*")
+ """
+
+ def terms():
+ """Return a list of all terms in this node, excluding NOT subtrees."""
+
+ def executeQuery(index):
+ """Execute the query represented by this node against the index.
+
+ The index argument must implement the IIndex interface.
+
+ Return an IIBucket or IIBTree mapping document ids to scores
+ (higher scores mean better results).
+
+ May raise ParseTree.QueryError.
+ """
=== Products/ZCTextIndex/ISplitter.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+from Interface import Base as Interface
+
+class ISplitter(Interface):
+ """A splitter."""
+
+ def process(text):
+ """Run the splitter over the input text, returning a list of terms."""
=== Products/ZCTextIndex/Index.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+"""Full text index with relevance ranking."""
+
+import math
+
+from BTrees.IOBTree import IOBTree
+from BTrees.IIBTree import IIBTree, IIBucket, IISet
+from BTrees.IIBTree import weightedIntersection, weightedUnion
+
+from Products.ZCTextIndex.IIndex import IIndex
+from Products.ZCTextIndex import WidCode
+
+# Instead of storing floats, we generally store scaled ints. Binary pickles
+# can store those more efficiently. The default SCALE_FACTOR of 1024
+# is large enough to get about 3 decimal digits of fractional info, and
+# small enough so that scaled values should almost always fit in a signed
+# 16-bit int (we're generally storing logs, so a few bits before the radix
+# point goes a long way; on the flip side, for reasonably small numbers x
+# most of the info in log(x) is in the fractional bits, so we do want to
+# save a lot of those).
+SCALE_FACTOR = 1024.0
+
+def scaled_int(f, scale=SCALE_FACTOR):
+ # We expect only positive inputs, so "add a half and chop" is the
+ # same as round(). Surprisingly, calling round() is significantly more
+ # expensive.
+ return int(f * scale + 0.5)
+
+class Index:
+
+ __implements__ = IIndex
+
+ def __init__(self, lexicon):
+ self._lexicon = lexicon
+
+ # wid -> { docid -> frequency }
+ self._wordinfo = IOBTree()
+
+ # docid -> W(docid)
+ self._docweight = IIBTree()
+
+ # docid -> [ wid ]
+ # used for un-indexing
+ self._docwords = IOBTree()
+
+ def length(self):
+ """Return the number of documents in the index."""
+ return len(self._docwords)
+
+ # Most of the computation for computing a relevance score for the
+ # document occurs in the search() method. The code currently
+ # implements the cosine similarity function described in Managing
+ # Gigabytes, eq. 4.3, p. 187. The index_object() method
+ # precomputes some values that are independent of the particular
+ # query.
+
+ # The equation is
+ #
+ # sum(for t in I(d,q): w(d,t) * w(q,t))
+ # cosine(d, q) = -------------------------------------
+ # W(d) * W(q)
+ #
+ # where
+ # I(d, q) = the intersection of the terms in d and q.
+ #
+ # w(d, t) = 1 + log f(d, t)
+ # computed by doc_term_weight(); for a given word t,
+ # self._wordinfo[t] is a map from d to w(d, t).
+ #
+ # w(q, t) = log(1 + N/f(t))
+ # computed by query_term_weight()
+ #
+ # W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
+ # computed by _get_frequencies(), and remembered in
+ # self._docweight[d]
+ #
+ # W(q) = sqrt(sum(for t in q: w(q, t) ** 2))
+ # computed by self.query_weight()
+
+ def index_doc(self, docid, text):
+ wids = self._lexicon.sourceToWordIds(text)
+ uniqwids, freqs, docweight = self._get_frequencies(wids)
+ for i in range(len(uniqwids)):
+ self._add_wordinfo(uniqwids[i], freqs[i], docid)
+ self._docweight[docid] = docweight
+ self._add_undoinfo(docid, wids)
+
+ def unindex_doc(self, docid):
+ for wid in self._get_undoinfo(docid):
+ self._del_wordinfo(wid, docid)
+ del self._docwords[docid]
+ del self._docweight[docid]
+
+ def search(self, term):
+ wids = self._lexicon.termToWordIds(term)
+ return self._union(self._search_wids(wids))
+
+ def search_glob(self, pattern):
+ wids = self._lexicon.globToWordIds(pattern)
+ return self._union(self._search_wids(wids))
+
+ def search_phrase(self, phrase):
+ wids = self._lexicon.termToWordIds(phrase)
+ hits = self._intersection(self._search_wids(wids))
+ if not hits:
+ return hits
+ code = WidCode.encode(wids)
+ result = IIBTree()
+ for docid, weight in hits.items():
+ docwords = self._docwords[docid]
+ if docwords.find(code) >= 0:
+ result[docid] = weight
+ return result
+
+ def _search_wids(self, wids):
+ if not wids:
+ return []
+ N = float(len(self._docweight))
+ L = []
+ DictType = type({})
+ for wid in wids:
+ d2w = self._wordinfo[wid] # maps docid to w(docid, wid)
+ idf = query_term_weight(len(d2w), N) # this is an unscaled float
+ #print "idf = %.3f" % idf
+ if isinstance(d2w, DictType):
+ d2w = IIBucket(d2w)
+ L.append((d2w, scaled_int(idf)))
+ L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
+ return L
+
+ def _intersection(self, L):
+ if not L:
+ return IIBTree()
+ d2w, weight = L[0]
+ dummy, result = weightedUnion(IIBTree(), d2w, 1, weight)
+ for d2w, weight in L[1:]:
+ dummy, result = weightedIntersection(result, d2w, 1, weight)
+ return result
+
+ def _union(self, L):
+ # XXX This can be optimized, see OkapiIndex
+ result = IIBTree()
+ for d2w, weight in L:
+ dummy, result = weightedUnion(result, d2w, 1, weight)
+ return result
+
+ def query_weight(self, terms):
+ wids = []
+ for term in terms:
+ wids += self._lexicon.termToWordIds(term)
+ N = float(len(self._docweight))
+ sum = 0.0
+ for wid in wids:
+ wt = math.log(1.0 + N / len(self._wordinfo[wid]))
+ sum += wt ** 2.0
+ return scaled_int(math.sqrt(sum))
+
+ def _get_frequencies(self, wids):
+ """Return individual doc-term weights and docweight."""
+ # Computes w(d, t) for each term, and W(d).
+ # Return triple:
+ # [wid0, wid1, ...],
+ # [w(d, wid0)/W(d), w(d, wid1)/W(d), ...],
+ # W(d)
+ # The second list and W(d) are scaled_ints.
+ d = {}
+ for wid in wids:
+ d[wid] = d.get(wid, 0) + 1
+ Wsquares = 0.0
+ weights = []
+ push = weights.append
+ for count in d.values():
+ w = doc_term_weight(count)
+ Wsquares += w * w
+ push(w)
+ W = math.sqrt(Wsquares)
+ #print "W = %.3f" % W
+ for i in xrange(len(weights)):
+ #print i, ":", "%.3f" % weights[i],
+ weights[i] = scaled_int(weights[i] / W)
+ #print "->", weights[i]
+ return d.keys(), weights, scaled_int(W)
+
+ DICT_CUTOFF = 10
+
+ def _add_wordinfo(self, wid, f, docid):
+ # Store a wordinfo in a dict as long as there are less than
+ # DICT_CUTOFF docids in the dict. Otherwise use an IIBTree.
+
+ # The pickle of a dict is smaller than the pickle of an
+ # IIBTree, substantially so for small mappings. Thus, we use
+ # a dictionary until the mapping reaches DICT_CUTOFF elements.
+
+ # The cutoff is chosen based on the implementation
+ # characteristics of Python dictionaries. The dict hashtable
+ # always has 2**N slots and is resized whenever it is 2/3s
+ # full. A pickled dict with 10 elts is half the size of an
+ # IIBTree with 10 elts, and 10 happens to be 2/3s of 2**4. So
+ # choose 10 as the cutoff for now.
+
+ # The IIBTree has a smaller in-memory representation than a
+ # dictionary, so pickle size isn't the only consideration when
+ # choosing the threshold. The pickle of a 500-elt dict is 92%
+ # of the size of the same IIBTree, but the dict uses more
+ # space when it is live in memory. An IIBTree stores two C
+ # arrays of ints, one for the keys and one for the values. It
+ holds up to 120 key-value pairs in a single bucket.
+ try:
+ map = self._wordinfo[wid]
+ except KeyError:
+ map = {}
+ else:
+ # _add_wordinfo() is called for each update. If the map
+ # size exceeds the DICT_CUTOFF, convert to an IIBTree.
+ if len(map) == self.DICT_CUTOFF:
+ map = IIBTree(map)
+ map[docid] = f
+ self._wordinfo[wid] = map # Not redundant, because of Persistency!
+
+ def _del_wordinfo(self, wid, docid):
+ try:
+ map = self._wordinfo[wid]
+ del map[docid]
+ except KeyError:
+ return
+ if len(map) == 0:
+ del self._wordinfo[wid]
+ return
+ if len(map) == self.DICT_CUTOFF:
+ new = {}
+ for k, v in map.items():
+ new[k] = v
+ map = new
+ self._wordinfo[wid] = map # Not redundant, because of Persistency!
+
+ def _add_undoinfo(self, docid, wids):
+ self._docwords[docid] = WidCode.encode(wids)
+
+ def _get_undoinfo(self, docid):
+ return WidCode.decode(self._docwords[docid])
+
+ # The rest are helper methods to support unit tests
+
+ def _get_wdt(self, d, t):
+ wid, = self._lexicon.termToWordIds(t)
+ map = self._wordinfo[wid]
+ return map.get(d, 0) * self._docweight[d] / SCALE_FACTOR
+
+ def _get_Wd(self, d):
+ return self._docweight[d]
+
+ def _get_ft(self, t):
+ wid, = self._lexicon.termToWordIds(t)
+ return len(self._wordinfo[wid])
+
+ def _get_wt(self, t):
+ wid, = self._lexicon.termToWordIds(t)
+ map = self._wordinfo[wid]
+ return scaled_int(math.log(1 + len(self._docweight) / float(len(map))))
+
+def doc_term_weight(count):
+ """Return the doc-term weight for a term that appears count times."""
+ # implements w(d, t) = 1 + log f(d, t)
+ return 1.0 + math.log(count)
+
+def query_term_weight(term_count, num_items):
+ """Return the query-term weight for a term,
+
+ that appears in term_count items in a collection with num_items
+ total items.
+ """
+ # implements w(q, t) = log(1 + N/f(t))
+ return math.log(1.0 + float(num_items) / term_count)
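
To make the cosine-rule helpers above concrete, a small numeric check (an
illustrative sketch, not part of the checked-in file):

    import math
    from Products.ZCTextIndex.Index import doc_term_weight, query_term_weight, \
         scaled_int

    # w(d, t) = 1 + log f(d, t): a term occurring 3 times weighs 1 + ln(3) ~= 2.099
    print doc_term_weight(3)              # -> 2.0986...
    # w(q, t) = log(1 + N/f(t)): a term in 10 of 1000 docs weighs ln(101) ~= 4.615
    print query_term_weight(10, 1000)     # -> 4.6151...
    # Floats are stored as scaled ints: 2.0986 * 1024, rounded, is 2149
    print scaled_int(doc_term_weight(3))  # -> 2149
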
=== Products/ZCTextIndex/Lexicon.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+import re
+
+from BTrees.IOBTree import IOBTree
+from BTrees.OIBTree import OIBTree
+from Products.ZCTextIndex.ILexicon import ILexicon
+from Products.ZCTextIndex.StopDict import get_stopdict
+
+class Lexicon:
+
+ __implements__ = ILexicon
+
+ def __init__(self, *pipeline):
+ self.__wids = OIBTree()
+ self.__words = IOBTree()
+ # XXX we're reserving wid 0, but that might be yagni
+ self.__nextwid = 1
+ self.__pipeline = pipeline
+
+ def length(self):
+ """Return the number of unique terms in the lexicon."""
+ return self.__nextwid - 1
+
+ def words(self):
+ return self.__wids.keys()
+
+ def wids(self):
+ return self.__words.keys()
+
+ def items(self):
+ return self.__wids.items()
+
+ def sourceToWordIds(self, text):
+ last = _text2list(text)
+ for element in self.__pipeline:
+ last = element.process(last)
+ return map(self._getWordIdCreate, last)
+
+ def termToWordIds(self, text):
+ last = _text2list(text)
+ for element in self.__pipeline:
+ last = element.process(last)
+ wids = []
+ for word in last:
+ wid = self.__wids.get(word)
+ if wid is not None:
+ wids.append(wid)
+ return wids
+
+ def globToWordIds(self, pattern):
+ if not re.match("^\w+\*$", pattern):
+ return []
+ pattern = pattern.lower()
+ assert pattern.endswith("*")
+ prefix = pattern[:-1]
+ assert prefix and not prefix.endswith("*")
+ keys = self.__wids.keys(prefix) # Keys starting at prefix
+ wids = []
+ words = []
+ for key in keys:
+ if not key.startswith(prefix):
+ break
+ wids.append(self.__wids[key])
+ words.append(key)
+ return wids
+
+ def _getWordIdCreate(self, word):
+ wid = self.__wids.get(word)
+ if wid is None:
+ wid = self.__new_wid()
+ self.__wids[word] = wid
+ self.__words[wid] = word
+ return wid
+
+ def __new_wid(self):
+ wid = self.__nextwid
+ self.__nextwid += 1
+ return wid
+
+def _text2list(text):
+ # Helper: splitter input may be a string or a list of strings
+ try:
+ text + ""
+ except:
+ return text
+ else:
+ return [text]
+
+# Sample pipeline elements
+
+class Splitter:
+
+ import re
+ rx = re.compile(r"\w+")
+
+ def process(self, lst):
+ result = []
+ for s in lst:
+ result += self.rx.findall(s)
+ return result
+
+class CaseNormalizer:
+
+ def process(self, lst):
+ return [w.lower() for w in lst]
+
+class StopWordRemover:
+
+ dict = get_stopdict().copy()
+ for c in range(255):
+ dict[chr(c)] = None
+
+ def process(self, lst):
+ has_key = self.dict.has_key
+ return [w for w in lst if not has_key(w)]
+
+try:
+ from Products.ZCTextIndex import stopper as _stopper
+except ImportError:
+ pass
+else:
+ _stopwords = StopWordRemover.dict
+ def StopWordRemover():
+ swr = _stopper.new()
+ swr.dict.update(_stopwords)
+ return swr
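
A brief usage sketch of the lexicon and the sample pipeline elements above
(illustrative only, not part of the checked-in file):

    from Products.ZCTextIndex.Lexicon import \
         Lexicon, Splitter, CaseNormalizer, StopWordRemover

    lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
    # Source text allocates new wids; "The" is split, lowercased, and then
    # dropped by the stop-word remover.
    wids = lexicon.sourceToWordIds("The quick brown fox")   # e.g. [1, 2, 3]
    # Search terms reuse existing wids and silently skip unknown words.
    print lexicon.termToWordIds("QUICK dog fox")            # -> [1, 3]
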
=== Products/ZCTextIndex/NBest.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+"""NBest
+
+An NBest object remembers the N best-scoring items ever passed to its
+.add(item, score) method. If .add() is called M times, the worst-case
+number of comparisons performed overall is M * log2(N).
+"""
+
+from bisect import bisect
+
+from Products.ZCTextIndex.INBest import INBest
+
+class NBest:
+ __implements__ = INBest
+
+ def __init__(self, N):
+ "Build an NBest object to remember the N best-scoring objects."
+
+ if N < 1:
+ raise ValueError("NBest() argument must be at least 1")
+ self._capacity = N
+
+ # This does a very simple thing with sorted lists. For large
+ # N, a min-heap can be unboundedly better in terms of data
+ # movement time.
+ self.scores = []
+ self.items = []
+
+ def __len__(self):
+ return len(self.scores)
+
+ def capacity(self):
+ return self._capacity
+
+ def add(self, item, score):
+ self.addmany([(item, score)])
+
+ def addmany(self, sequence):
+ scores, items, capacity = self.scores, self.items, self._capacity
+ n = len(scores)
+ for item, score in sequence:
+ # When we're in steady-state, the usual case is that we're filled
+ # to capacity, and that an incoming item is worse than any of
+ # the best-seen so far.
+ if n >= capacity and score <= scores[0]:
+ continue
+ i = bisect(scores, score)
+ scores.insert(i, score)
+ items.insert(i, item)
+ if n == capacity:
+ del items[0], scores[0]
+ else:
+ n += 1
+ assert n == len(scores)
+
+ def getbest(self):
+ result = zip(self.items, self.scores)
+ result.reverse()
+ return result
+
+ def pop_smallest(self):
+ if self.scores:
+ return self.items.pop(0), self.scores.pop(0)
+ raise IndexError("pop_smallest() called on empty NBest object")
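
A usage sketch of the class above (illustrative only, not part of the
checked-in file): keep the three best-scoring items out of a larger stream.

    from Products.ZCTextIndex.NBest import NBest

    best = NBest(3)
    best.addmany([("a", 10), ("b", 50), ("c", 20), ("d", 40), ("e", 30)])
    print best.getbest()       # -> [('b', 50), ('d', 40), ('e', 30)]
    print best.pop_smallest()  # -> ('e', 30)
    print len(best)            # -> 2
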
=== Products/ZCTextIndex/OkapiIndex.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+"""Full text index with relevance ranking, using an Okapi BM25 rank."""
+
+# Lots of comments are at the bottom of this file. Read them to
+# understand what's going on.
+
+import math
+
+from BTrees.IOBTree import IOBTree
+from BTrees.IIBTree import IIBTree, IIBucket, IISet
+from BTrees.IIBTree import weightedIntersection, weightedUnion
+
+from Products.ZCTextIndex.IIndex import IIndex
+from Products.ZCTextIndex import WidCode
+from Products.ZCTextIndex.NBest import NBest
+
+# Instead of storing floats, we generally store scaled ints. Binary pickles
+# can store those more efficiently. The default SCALE_FACTOR of 1024
+# is large enough to get about 3 decimal digits of fractional info, and
+# small enough so that scaled values should almost always fit in a signed
+# 16-bit int (we're generally storing logs, so a few bits before the radix
+# point goes a long way; on the flip side, for reasonably small numbers x
+# most of the info in log(x) is in the fractional bits, so we do want to
+# save a lot of those).
+SCALE_FACTOR = 1024.0
+
+def scaled_int(f, scale=SCALE_FACTOR):
+ # We expect only positive inputs, so "add a half and chop" is the
+ # same as round(). Surprisingly, calling round() is significantly more
+ # expensive.
+ return int(f * scale + 0.5)
+
+class Index:
+
+ __implements__ = IIndex
+
+ # BM25 free parameters.
+ K1 = 1.2
+ B = 0.75
+ assert K1 >= 0.0
+ assert 0.0 <= B <= 1.0
+
+ def __init__(self, lexicon):
+ self._lexicon = lexicon
+
+ # wid -> { docid -> frequency }; t -> D -> f(D, t)
+ self._wordinfo = IOBTree()
+
+ # docid -> # of words in the doc
+ # XXX this is just len(self._docwords[docid]), but if _docwords
+ # XXX is stored in compressed form then uncompressing just to count
+ # XXX the list length would be ridiculously expensive.
+ self._doclen = IIBTree()
+
+ # docid -> [ wid ]
+ # used for un-indexing
+ self._docwords = IOBTree()
+
+ # sum(self._doclen.values()), the total # of words in all docs
+ self._totaldoclen = 0L
+
+ def length(self):
+ """Return the number of documents in the index."""
+ return len(self._docwords)
+
+ # Most of the computation for computing a relevance score for the
+ # document occurs in the search() method.
+
+ def index_doc(self, docid, text):
+ wids = self._lexicon.sourceToWordIds(text)
+ self._doclen[docid] = len(wids)
+ self._totaldoclen += len(wids)
+ wid2count = self._get_frequencies(wids)
+ for wid, count in wid2count.items():
+ self._add_wordinfo(wid, count, docid)
+ self._add_undoinfo(docid, wids)
+
+ def unindex_doc(self, docid):
+ for wid in self._get_undoinfo(docid):
+ self._del_wordinfo(wid, docid)
+ del self._docwords[docid]
+ count = self._doclen[docid]
+ del self._doclen[docid]
+ self._totaldoclen -= count
+
+ def search(self, term):
+ wids = self._lexicon.termToWordIds(term)
+ return self._union(self._search_wids(wids))
+
+ def search_glob(self, pattern):
+ wids = self._lexicon.globToWordIds(pattern)
+ return self._union(self._search_wids(wids))
+
+ def search_phrase(self, phrase):
+ wids = self._lexicon.termToWordIds(phrase)
+ hits = self._intersection(self._search_wids(wids))
+ if not hits:
+ return hits
+ code = WidCode.encode(wids)
+ result = IIBTree()
+ for docid, weight in hits.items():
+ docwords = self._docwords[docid]
+ if docwords.find(code) >= 0:
+ result[docid] = weight
+ return result
+
+ def _search_wids(self, wids):
+ if not wids:
+ return []
+ N = float(len(self._doclen))
+ L = []
+ K1 = self.K1
+ B = self.B
+ K1_plus1 = K1 + 1.0
+ B_from1 = 1.0 - B
+ meandoclen = self._totaldoclen / N
+
+ # f(D, t) * (k1 + 1)
+ # TF(D, t) = -------------------------------------------
+ # f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
+
+ for wid in wids:
+ d2f = self._wordinfo[wid] # map {docid -> f(docid, wid)}
+ idf = inverse_doc_frequency(len(d2f), N) # this is an unscaled float
+ result = IIBucket()
+ for docid, f in d2f.items():
+ lenweight = B_from1 + B * self._doclen[docid] / meandoclen
+ tf = f * K1_plus1 / (f + K1 * lenweight)
+ result[docid] = scaled_int(tf * idf)
+ L.append((result, 1))
+ return L
+
+ # Note about the above: the result is tf * idf. tf is small -- it
+ # can't be larger than k1+1 = 2.2. idf is formally unbounded, but
+ # is less than 14 for a term that appears in only 1 of a million
+ # documents. So the product is probably less than 32, or 5 bits
+ # before the radix point. If we did the scaled-int business on
+ # both of them, we'd be up to 25 bits. Add 64 of those and we'd
+ # be in overflow territory. That's pretty unlikely, so we *could*
+ # just store scaled_int(tf) in result[docid], and use scaled_int(idf)
+ # as an invariant weight across the whole result. But besides
+ # skating near the edge, it's not a speed cure, since the computation
+ # of tf would still be done at Python speed, and it's a lot more
+ # work than just multiplying by idf.
+
+ def _intersection(self, L):
+ if not L:
+ return IIBTree()
+ # Intersect with smallest first.
+ L = L[:] # don't mutate the caller's L
+ L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
+ d2w, weight = L[0]
+ dummy, result = weightedUnion(IIBTree(), d2w, 1, weight)
+ for d2w, weight in L[1:]:
+ dummy, result = weightedIntersection(result, d2w, 1, weight)
+ return result
+
+ def _union(self, L):
+ if not L:
+ return IIBTree()
+ # Balance unions as closely as possible, smallest to largest.
+ merge = NBest(len(L))
+ for x, weight in L:
+ merge.add((x, weight), len(x))
+ while len(merge) > 1:
+ # Merge the two smallest so far, and add back to the queue.
+ x, wx = merge.pop_smallest()
+ y, wy = merge.pop_smallest()
+ dummy, z = weightedUnion(x, y, wx, wy)
+ merge.add((z, 1), len(z))
+ (result, weight), score = merge.pop_smallest()
+ return result
+
+ def query_weight(self, terms):
+ # XXX I have no idea what to put here
+ return 10
+
+ def _get_frequencies(self, wids):
+ """Return individual term frequencies."""
+ # Computes f(d, t) for each term.
+ # Returns a dict mapping wid to the number of times wid appeared
+ # in wids, {t: f(d, t)}
+ d = {}
+ dget = d.get
+ for wid in wids:
+ d[wid] = dget(wid, 0) + 1
+ return d
+
+ DICT_CUTOFF = 10
+
+ def _add_wordinfo(self, wid, f, docid):
+ # Store a wordinfo in a dict as long as there are less than
+ # DICT_CUTOFF docids in the dict. Otherwise use an IIBTree.
+
+ # The pickle of a dict is smaller than the pickle of an
+ # IIBTree, substantially so for small mappings. Thus, we use
+ # a dictionary until the mapping reaches DICT_CUTOFF elements.
+
+ # The cutoff is chosen based on the implementation
+ # characteristics of Python dictionaries. The dict hashtable
+ # always has 2**N slots and is resized whenever it is 2/3s
+ # full. A pickled dict with 10 elts is half the size of an
+ # IIBTree with 10 elts, and 10 happens to be 2/3s of 2**4. So
+ # choose 10 as the cutoff for now.
+
+ # The IIBTree has a smaller in-memory representation than a
+ # dictionary, so pickle size isn't the only consideration when
+ # choosing the threshold. The pickle of a 500-elt dict is 92%
+ # of the size of the same IIBTree, but the dict uses more
+ # space when it is live in memory. An IIBTree stores two C
+ # arrays of ints, one for the keys and one for the values. It
+ holds up to 120 key-value pairs in a single bucket.
+ try:
+ map = self._wordinfo[wid]
+ except KeyError:
+ map = {}
+ else:
+ # _add_wordinfo() is called for each update. If the map
+ # size exceeds the DICT_CUTOFF, convert to an IIBTree.
+ if len(map) == self.DICT_CUTOFF:
+ map = IIBTree(map)
+ map[docid] = f
+ self._wordinfo[wid] = map # Not redundant, because of Persistency!
+
+ def _del_wordinfo(self, wid, docid):
+ try:
+ map = self._wordinfo[wid]
+ del map[docid]
+ except KeyError:
+ return
+ if len(map) == 0:
+ del self._wordinfo[wid]
+ return
+ if len(map) == self.DICT_CUTOFF:
+ new = {}
+ for k, v in map.items():
+ new[k] = v
+ map = new
+ self._wordinfo[wid] = map # Not redundant, because of Persistency!
+
+ def _add_undoinfo(self, docid, wids):
+ self._docwords[docid] = WidCode.encode(wids)
+
+ def _get_undoinfo(self, docid):
+ return WidCode.decode(self._docwords[docid])
+
+ # The rest are helper methods to support unit tests
+ # XXX These don't work for Okapi, I assume
+
+ def _get_wdt(self, d, t):
+ wid, = self._lexicon.termToWordIds(t)
+ map = self._wordinfo[wid]
+ return map.get(d, 0) * self._doclen[d] / SCALE_FACTOR
+
+ def _get_Wd(self, d):
+ return self._doclen[d]
+
+ def _get_ft(self, t):
+ wid, = self._lexicon.termToWordIds(t)
+ return len(self._wordinfo[wid])
+
+ def _get_wt(self, t):
+ wid, = self._lexicon.termToWordIds(t)
+ map = self._wordinfo[wid]
+ return scaled_int(math.log(1 + len(self._docweight) / float(len(map))))
+
+def inverse_doc_frequency(term_count, num_items):
+ """Return the inverse doc frequency for a term,
+
+ that appears in term_count items in a collection with num_items
+ total items.
+ """
+ # implements IDF(q, t) = log(1 + N/f(t))
+ return math.log(1.0 + float(num_items) / term_count)
+
+"""
+"Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.
+It's based on probability arguments about how words are distributed in
+documents, not on an abstract vector space model. A long paper by its
+principal inventors gives an excellent overview of how it was derived:
+
+ A probabilistic model of information retrieval: development and status
+ K. Sparck Jones, S. Walker, S.E. Robertson
+ http://citeseer.nj.nec.com/jones98probabilistic.html
+
+Spellings that ignore relevance information (which we don't have) are of this
+high-level form:
+
+ score(D, Q) = sum(for t in D&Q: TF(D, t) * IDF(Q, t))
+
+where
+
+ D a specific document
+
+ Q a specific query
+
+ t a term (word, atomic phrase, whatever)
+
+ D&Q the terms common to D and Q
+
+ TF(D, t) a measure of t's importance in D -- a kind of term frequency
+ weight
+
+ IDF(Q, t) a measure of t's importance in the query and in the set of
+ documents as a whole -- a kind of inverse document frequency
+ weight
+
+The IDF(Q, t) here is identical to the one used for our cosine measure.
+Since queries are expected to be short, it ignores Q entirely:
+
+ IDF(Q, t) = log(1.0 + N / f(t))
+
+where
+
+ N the total number of documents
+ f(t) the number of documents in which t appears
+
+Most Okapi literature seems to use log(N/f(t)) instead. We don't, because
+that becomes 0 for a term that's in every document, and, e.g., if someone
+is searching for "documentation" on python.org (a term that may well show
+up on every page, due to the top navigation bar), we still want to find the
+pages that use the word a lot (which is TF's job to find, not IDF's -- we
+just want to stop IDF from considering this t to be irrelevant).
+
+The TF(D, t) spellings are more interesting. With lots of variations, the
+most basic spelling is of the form
+
+ f(D, t)
+ TF(D, t) = ---------------
+ f(D, t) + K(D)
+
+where
+
+ f(D, t) the number of times t appears in D
+ K(D) a measure of the length of D, normalized to mean doc length
+
+The functional *form* f/(f+K) is clever. It's a gross approximation to a
+mixture of two distinct Poisson distributions, based on the idea that t
+probably appears in D for one of two reasons:
+
+1. More or less at random.
+
+2. Because it's important to D's purpose in life ("eliteness" in papers).
+
+Note that f/(f+K) is always between 0 and 1. If f is very large compared to
+K, it approaches 1. If K is very large compared to f, it approaches 0. If
+t appears in D more or less "for random reasons", f is likely to be small,
+and so K will dominate unless it's a very small doc, and the ratio will be
+small. OTOH, if t appears a lot in D, f will dominate unless it's a very
+large doc, and the ratio will be close to 1.
+
+We use a variation on that simple theme, a simplification of what's called
+BM25 in the literature (it was the 25th stab at a Best Match function from
+the Okapi group; "a simplification" means we're setting some of BM25's more
+esoteric free parameters to 0):
+
+ f(D, t) * (k1 + 1)
+ TF(D, t) = --------------------
+ f(D, t) + k1 * K(D)
+
+where
+
+ k1 a "tuning factor", typically between 1.0 and 2.0. We use 1.2,
+ the usual default value. This constant adjusts the curve to
+ look more like a theoretical 2-Poisson curve.
+
+Note that as f(D, t) increases, TF(D, t) increases monotonically, approaching
+an asymptote of k1+1 from below.
+
+Finally, we use
+
+ K(D) = (1-b) + b * len(D)/E(len(D))
+
+where
+
+ b is another free parameter, discussed below. We use 0.75.
+
+ len(D) the length of D in words
+
+ E(len(D)) the expected value of len(D) across the whole document set;
+ or, IOW, the average document length
+
+b is a free parameter between 0.0 and 1.0, and adjusts for the expected effect
+of the "Verbosity Hypothesis". Suppose b is 1, and some word t appears
+10 times as often in document d2 as in document d1. If document d2 is
+also 10 times as long as d1, TF(d1, t) and TF(d2, t) are identical:
+
+ f(d2, t) * (k1 + 1)
+ TF(d2, t) = --------------------------------- =
+ f(d2, t) + k1 * len(d2)/E(len(D))
+
+ 10 * f(d1, t) * (k1 + 1)
+ ----------------------------------------------- = TF(d1, t)
+ 10 * f(d1, t) + k1 * (10 * len(d1))/E(len(D))
+
+because the 10's cancel out. This is appropriate if we believe that a word
+appearing 10x more often in a doc 10x as long is simply due to the
+longer doc being more verbose. If we do believe that, the longer doc and the
+shorter doc are probably equally relevant. OTOH, it *could* be that the
+longer doc is talking about t in greater depth too, in which case it's
+probably more relevant than the shorter doc.
+
+At the other extreme, if we set b to 0, the len(D)/E(len(D)) term vanishes
+completely, and a doc scores higher for having more occurrences of a word
+regardless of the doc's length.
+
+Reality is between these extremes, and probably varies by document and word
+too. Reports in the literature suggest that b=0.75 is a good compromise "in
+general", favoring the "verbosity hypothesis" end of the scale.
+
+Putting it all together, the final TF function is
+
+ f(D, t) * (k1 + 1)
+ TF(D, t) = --------------------------------------------
+ f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
+
+with k1=1.2 and b=0.75.
+"""
=== Products/ZCTextIndex/ParseTree.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""Generic parser support: exception and parse tree nodes."""
+
+from BTrees.IIBTree import difference, weightedIntersection, weightedUnion
+from Products.ZCTextIndex.NBest import NBest
+
+class QueryError(Exception):
+ pass
+
+class ParseError(Exception):
+ pass
+
+class ParseTreeNode:
+
+ _nodeType = None
+
+ def __init__(self, value):
+ self._value = value
+
+ def nodeType(self):
+ return self._nodeType
+
+ def getValue(self):
+ return self._value
+
+ def __repr__(self):
+ return "%s(%r)" % (self.__class__.__name__, self.getValue())
+
+ def terms(self):
+ t = []
+ for v in self.getValue():
+ t.extend(v.terms())
+ return t
+
+ def executeQuery(self, index):
+ raise NotImplementedError
+
+class NotNode(ParseTreeNode):
+
+ _nodeType = "NOT"
+
+ def terms(self):
+ return []
+
+ def executeQuery(self, index):
+ raise QueryError, "NOT operator must occur right after AND"
+
+class AndNode(ParseTreeNode):
+
+ _nodeType = "AND"
+
+ def executeQuery(self, index):
+ L = []
+ Nots = []
+ for subnode in self.getValue():
+ if subnode.nodeType() == "NOT":
+ Nots.append(subnode.getValue().executeQuery(index))
+ else:
+ L.append(subnode.executeQuery(index))
+ assert L
+ L.sort(lambda x, y: cmp(len(x), len(y)))
+ set = L[0]
+ for x in L[1:]:
+ dummy, set = weightedIntersection(set, x)
+ if Nots:
+ Nots.sort(lambda x, y: cmp(len(x), len(y)))
+ notset = Nots[0]
+ for x in Nots[1:]:
+ dummy, notset = weightedUnion(notset, x)
+ set = difference(set, notset)
+ return set
+
+class OrNode(ParseTreeNode):
+
+ _nodeType = "OR"
+
+ def executeQuery(self, index):
+ # Balance unions as closely as possible, smallest to largest.
+ allofem = self.getValue()
+ merge = NBest(len(allofem))
+ for subnode in allofem:
+ result = subnode.executeQuery(index)
+ merge.add(result, len(result))
+ while len(merge) > 1:
+ # Merge the two smallest so far, and add back to the queue.
+ x, dummy = merge.pop_smallest()
+ y, dummy = merge.pop_smallest()
+ dummy, z = weightedUnion(x, y)
+ merge.add(z, len(z))
+ result, dummy = merge.pop_smallest()
+ return result
+
+class AtomNode(ParseTreeNode):
+
+ _nodeType = "ATOM"
+
+ def terms(self):
+ return [self.getValue()]
+
+ def executeQuery(self, index):
+ return index.search(self.getValue())
+
+class PhraseNode(AtomNode):
+
+ _nodeType = "PHRASE"
+
+ def executeQuery(self, index):
+ return index.search_phrase(self.getValue())
+
+class GlobNode(AtomNode):
+
+ _nodeType = "GLOB"
+
+ def executeQuery(self, index):
+ return index.search_glob(self.getValue())
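
As a small illustration (not part of the checked-in file), the nodes can be
built and inspected directly; note that terms() omits NOT subtrees, which is
what query_weight() relies on:

    from Products.ZCTextIndex import ParseTree

    tree = ParseTree.AndNode([ParseTree.AtomNode("foo"),
                              ParseTree.NotNode(ParseTree.AtomNode("bar"))])
    print tree.nodeType()   # -> 'AND'
    print tree.terms()      # -> ['foo']
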
=== Products/ZCTextIndex/QueryParser.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""Query Parser.
+
+This particular parser recognizes the following syntax:
+
+Start = OrExpr
+OrExpr = AndExpr ('OR' AndExpr)*
+AndExpr = Term ('AND' NotExpr)*
+NotExpr = ['NOT'] Term
+Term = '(' OrExpr ')' | ATOM+
+
+The key words (AND, OR, NOT) are recognized in any mixture of case.
+
+An ATOM is either:
+
++ A sequence of characters not containing whitespace or parentheses or
+ double quotes, and not equal to one of the key words 'AND', 'OR', 'NOT'; or
+
++ A non-empty string enclosed in double quotes. The interior of the string
+ can contain whitespace, parentheses and key words.
+
+In addition, an ATOM may optionally be preceded by a hyphen, meaning
+that it must not be present.
+
+An unquoted ATOM may also end in a star. This is a primitive
+"globbing" function, meaning to search for any word with a given
+prefix.
+
+When multiple consecutive ATOMs are found at the leaf level, they are
+connected by an implied AND operator, and an unquoted leading hyphen
+is interpreted as a NOT operator.
+
+Summarizing the default operator rules:
+
+- a sequence of words without operators implies AND, e.g. ``foo bar''
+- double-quoted text implies phrase search, e.g. ``"foo bar"''
+- words connected by punctuation implies phrase search, e.g. ``foo-bar''
+- a leading hyphen implies NOT, e.g. ``foo -bar''
+- these can be combined, e.g. ``foo -"foo bar"'' or ``foo -foo-bar''
+- a trailing * means globbing (i.e. prefix search), e.g. ``foo*''
+"""
+
+import re
+
+import ParseTree # relative import
+
+# Create unique symbols for token types.
+_AND = intern("AND")
+_OR = intern("OR")
+_NOT = intern("NOT")
+_LPAREN = intern("(")
+_RPAREN = intern(")")
+_ATOM = intern("ATOM")
+_EOF = intern("EOF")
+
+# Map keyword string to token type.
+_keywords = {
+ _AND: _AND,
+ _OR: _OR,
+ _NOT: _NOT,
+ _LPAREN: _LPAREN,
+ _RPAREN: _RPAREN,
+}
+
+# Regular expression to tokenize.
+_tokenizer_regex = re.compile(r"""
+ # a paren
+ [()]
+ # or an optional hyphen
+| -?
+ # followed by
+ (?:
+ # a string
+ " [^"]* "
+ # or a non-empty stretch w/o whitespace, parens or double quotes
+ | [^()\s"]+
+ )
+""", re.VERBOSE)
+
+class QueryParser:
+
+ def __init__(self):
+ pass # This parser has no persistent state
+
+ def parseQuery(self, query):
+ # Lexical analysis.
+ tokens = _tokenizer_regex.findall(query)
+ self.__tokens = tokens
+ # classify tokens
+ self.__tokentypes = [_keywords.get(token.upper(), _ATOM)
+ for token in tokens]
+ # add _EOF
+ self.__tokens.append(_EOF)
+ self.__tokentypes.append(_EOF)
+ self.__index = 0
+
+ # Syntactical analysis.
+ tree = self._parseOrExpr()
+ self._require(_EOF)
+ return tree
+
+ # Recursive descent parser
+
+ def _require(self, tokentype):
+ if not self._check(tokentype):
+ t = self.__tokens[self.__index]
+ msg = "Token %r required, %r found" % (tokentype, t)
+ raise ParseTree.ParseError, msg
+
+ def _check(self, tokentype):
+ if self.__tokentypes[self.__index] is tokentype:
+ self.__index += 1
+ return 1
+ else:
+ return 0
+
+ def _peek(self, tokentype):
+ return self.__tokentypes[self.__index] is tokentype
+
+ def _get(self, tokentype):
+ t = self.__tokens[self.__index]
+ self._require(tokentype)
+ return t
+
+ def _parseOrExpr(self):
+ L = []
+ L.append(self._parseAndExpr())
+ while self._check(_OR):
+ L.append(self._parseAndExpr())
+ if len(L) == 1:
+ return L[0]
+ else:
+ return ParseTree.OrNode(L)
+
+ def _parseAndExpr(self):
+ L = []
+ L.append(self._parseTerm())
+ while self._check(_AND):
+ L.append(self._parseNotExpr())
+ if len(L) == 1:
+ return L[0]
+ else:
+ return ParseTree.AndNode(L)
+
+ def _parseNotExpr(self):
+ if self._check(_NOT):
+ return ParseTree.NotNode(self._parseTerm())
+ else:
+ return self._parseTerm()
+
+ def _parseTerm(self):
+ if self._check(_LPAREN):
+ tree = self._parseOrExpr()
+ self._require(_RPAREN)
+ else:
+ atoms = [self._get(_ATOM)]
+ while self._peek(_ATOM):
+ atoms.append(self._get(_ATOM))
+ nodes = []
+ nots = []
+ for a in atoms:
+ words = re.findall(r"\w+\*?", a)
+ if not words:
+ continue
+ if len(words) > 1:
+ n = ParseTree.PhraseNode(" ".join(words))
+ elif words[0].endswith("*"):
+ n = ParseTree.GlobNode(words[0])
+ else:
+ n = ParseTree.AtomNode(words[0])
+ if a[0] == "-":
+ n = ParseTree.NotNode(n)
+ nots.append(n)
+ else:
+ nodes.append(n)
+ if not nodes:
+ text = " ".join(atoms)
+ msg = "At least one positive term required: %r" % text
+ raise ParseTree.ParseError, msg
+ nodes.extend(nots)
+ if len(nodes) == 1:
+ tree = nodes[0]
+ else:
+ tree = ParseTree.AndNode(nodes)
+ return tree
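
A usage sketch of the parser on a small query (illustrative only, not part of
the checked-in file):

    from Products.ZCTextIndex.QueryParser import QueryParser

    parser = QueryParser()
    tree = parser.parseQuery('foo bar OR "ham eggs" AND NOT spam*')
    print tree.nodeType()   # -> 'OR'
    print tree.terms()      # -> ['foo', 'bar', 'ham eggs']  (NOT subtree excluded)
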
=== Products/ZCTextIndex/RiceCode.py 1.1 => 1.2 ===
+
+Based on a Java implementation by Glen McCluskey described in a Usenix
+ ;login: article at
+http://www.usenix.org/publications/login/2000-4/features/java.html
+
+McCluskey's article explains the approach as follows. The encoding
+for a value x is represented as a unary part and a binary part. The
+unary part is a sequence of 1 bits followed by a 0 bit. The binary
+part encodes some of the lower bits of x-1.
+
+The encoding is parameterized by a value m that describes how many
+bits to store in the binary part. If most of the values are smaller
+than 2**m then they can be stored in only m+1 bits.
+
+Compute the length of the unary part, q, where
+ q = math.floor((x-1)/ 2 ** m)
+
+ Emit q 1 bits followed by a 0 bit.
+
+Emit the lower m bits of x-1, treating x-1 as a binary value.
+"""
+
+import array
+
+class BitArray:
+
+ def __init__(self, buf=None):
+ self.bytes = array.array('B')
+ self.nbits = 0
+ self.bitsleft = 0
+ self.tostring = self.bytes.tostring
+
+ def __getitem__(self, i):
+ byte, offset = divmod(i, 8)
+ mask = 2 ** offset
+ if self.bytes[byte] & mask:
+ return 1
+ else:
+ return 0
+
+ def __setitem__(self, i, val):
+ byte, offset = divmod(i, 8)
+ mask = 2 ** offset
+ if val:
+ self.bytes[byte] |= mask
+ else:
+ self.bytes[byte] &= ~mask
+
+ def __len__(self):
+ return self.nbits
+
+ def append(self, bit):
+ """Append a 1 if bit is true or 1 if it is false."""
+ if self.bitsleft == 0:
+ self.bytes.append(0)
+ self.bitsleft = 8
+ self.__setitem__(self.nbits, bit)
+ self.nbits += 1
+ self.bitsleft -= 1
+
+ def __getstate__(self):
+ return self.nbits, self.bitsleft, self.tostring()
+
+ def __setstate__(self, (nbits, bitsleft, s)):
+ self.bytes = array.array('B', s)
+ self.nbits = nbits
+ self.bitsleft = bitsleft
+ # Re-bind the bound method lost across pickling (__init__ is not re-run).
+ self.tostring = self.bytes.tostring
+
+class RiceCode:
+ def __init__(self, m):
+ """Constructor a RiceCode for m-bit values."""
+ if not (0 <= m <= 16):
+ raise ValueError, "m must be between 0 and 16"
+ self.init(m)
+ self.bits = BitArray()
+ self.len = 0
+
+ def init(self, m):
+ self.m = m
+ self.lower = (1 << m) - 1
+ self.mask = 1 << (m - 1)
+
+ def append(self, val):
+ """Append an item to the list."""
+ if val < 1:
+ raise ValueError, "value >= 1 expected, got %s" % `val`
+ val -= 1
+ # emit the unary part of the code
+ q = val >> self.m
+ for i in range(q):
+ self.bits.append(1)
+ self.bits.append(0)
+ # emit the binary part
+ r = val & self.lower
+ mask = self.mask
+ while mask:
+ self.bits.append(r & mask)
+ mask >>= 1
+ self.len += 1
+
+ def __len__(self):
+ return self.len
+
+ def tolist(self):
+ """Return the items as a list."""
+ l = []
+ i = 0 # bit offset
+ binary_range = range(self.m)
+ for j in range(self.len):
+ unary = 0
+ while self.bits[i] == 1:
+ unary += 1
+ i += 1
+ assert self.bits[i] == 0
+ i += 1
+ binary = 0
+ for k in binary_range:
+ binary = (binary << 1) | self.bits[i]
+ i += 1
+ l.append((unary << self.m) + (binary + 1))
+ return l
+
+ def tostring(self):
+ """Return a binary string containing the encoded data.
+
+ The binary string may contain some extra zeros at the end.
+ """
+ return self.bits.tostring()
+
+ def __getstate__(self):
+ return self.m, self.bits
+
+ def __setstate__(self, (m, bits)):
+ self.init(m)
+ self.bits = bits
+
+def encode(m, l):
+ c = RiceCode(m)
+ for elt in l:
+ c.append(elt)
+ assert c.tolist() == l
+ return c
+
+def encode_deltas(l):
+ if len(l) == 1:
+ return l[0], []
+ deltas = RiceCode(6)
+ deltas.append(l[1] - l[0])
+ for i in range(2, len(l)):
+ deltas.append(l[i] - l[i - 1])
+ return l[0], deltas
+
+def decode_deltas(start, enc_deltas):
+ deltas = enc_deltas.tolist()
+ l = [start]
+ # Each stored delta is l[i] - l[i-1]; rebuild by accumulating from start.
+ for d in deltas:
+ l.append(l[-1] + d)
+ return l
+
+def test():
+ import random
+ for size in [10, 20, 50, 100, 200]:
+ l = [random.randint(1, size) for i in range(50)]
+ c = encode(random.randint(1, 16), l)
+ assert c.tolist() == l
+ for size in [10, 20, 50, 100, 200]:
+ l = range(random.randint(1, size), size + random.randint(1, size))
+ t = encode_deltas(l)
+ l2 = decode_deltas(*t)
+ assert l == l2
+ if l != l2:
+ print l
+ print l2
+
+def pickle_efficiency():
+ import pickle
+ import random
+ for m in [4, 8, 12]:
+ for size in [10, 20, 50, 100, 200, 500, 1000, 2000, 5000]:
+ for elt_range in [10, 20, 50, 100, 200, 500, 1000]:
+ l = [random.randint(1, elt_range) for i in range(size)]
+ raw = pickle.dumps(l, 1)
+ enc = pickle.dumps(encode(m, l), 1)
+ print "m=%2d size=%4d range=%4d" % (m, size, elt_range),
+ print "%5d %5d" % (len(raw), len(enc)),
+ if len(raw) > len(enc):
+ print "win"
+ else:
+ print "lose"
+
+if __name__ == "__main__":
+ test()
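
A worked sketch of the encoding described in the docstring, with m = 2
(illustrative only, not part of the checked-in file):

    from Products.ZCTextIndex.RiceCode import RiceCode

    c = RiceCode(2)
    c.append(7)
    # x = 7, so x-1 = 6: the unary part is q = 6 >> 2 = 1 one-bit plus a stop
    # bit, and the binary part is the low 2 bits of 6 ("10") -- four bits total.
    print [c.bits[i] for i in range(len(c.bits))]   # -> [1, 0, 1, 0]
    print c.tolist()                                # -> [7]
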
=== Products/ZCTextIndex/Setup 1.1 => 1.2 ===
+stopper stopper.c
=== Products/ZCTextIndex/StopDict.py 1.1 => 1.2 ===
+
+The specific splitter and lexicon are customizable, but the default
+ZCTextIndex should do something useful.
+"""
+
+def get_stopdict():
+ """Return a dictionary of stopwords."""
+ return _dict
+
+# This list of English stopwords comes from Lucene
+_words = [
+ "a", "and", "are", "as", "at", "be", "but", "by",
+ "for", "if", "in", "into", "is", "it",
+ "no", "not", "of", "on", "or", "such",
+ "that", "the", "their", "then", "there", "these",
+ "they", "this", "to", "was", "will", "with"
+]
+
+_dict = {}
+for w in _words:
+ _dict[w] = None
=== Products/ZCTextIndex/WidCode.py 1.1 => 1.2 ===
+# for smaller ints. This is intended for lists of word ids (wids). The
+# ordinary string .find() method can be used to find the encoded form of a
+# desired wid-string in an encoded wid-string. As in UTF-8, the initial byte
+# of an encoding can't appear in the interior of an encoding, so find() can't
+# be fooled into starting a match "in the middle" of an encoding.
+
+# Details:
+#
+# + Only the first byte of an encoding has the sign bit set.
+#
+# + The number of bytes in the encoding is encoded in unary at the start of
+# the first byte (i.e., an encoding with n bytes begins with n 1-bits
+# followed by a 0 bit).
+#
+# + Bytes beyond the first in an encoding have the sign bit clear, followed
+# by 7 bits of data.
+#
+# + The number of data bits in the first byte of an encoding varies.
+#
+# The int to be encoded can contain no more than 24 bits.
+# XXX this could certainly be increased
+#
+# If it contains no more than 6 bits, 00abcdef, the encoding is
+# 10abcdef
+#
+# If it contains 7 thru 12 bits,
+# 0000abcd efghijkL
+# the encoding is
+# 110abcde 0fghijkL
+#
+# Static tables _encoding and _decoding capture all encodes and decodes for
+# 12 or fewer bits.
+#
+# If it contains 13 thru 18 bits,
+# 000000ab cdefghij kLmnopqr
+# the encoding is
+# 1110abcd 0efghijk 0Lmnopqr
+#
+# If it contains 19 thru 24 bits,
+# abcdefgh ijkLmnop qrstuvwx
+# the encoding is
+# 11110abc 0defghij 0kLmnopq 0rstuvwx
+
+
+import re
+
+def encode(wids):
+ # Encode a list of wids as a string.
+ wid2enc = _encoding
+ n = len(wid2enc)
+ return "".join([w < n and wid2enc[w] or _encode(w) for w in wids])
+
+_encoding = [None] * 0x1000 # Filled later, and converted to a tuple
+
+def _encode(w):
+ assert 0x1000 <= w < 0x1000000
+ b, c = divmod(w, 0x80)
+ a, b = divmod(b, 0x80)
+ s = chr(b) + chr(c)
+ if a < 0x10: # no more than 18 data bits
+ return chr(a + 0xE0) + s
+ a, b = divmod(a, 0x80)
+ assert a < 0x4, (w, a, b, s) # else more than 24 data bits
+ return (chr(a + 0xF0) + chr(b)) + s
+
+_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")
+
+def decode(code):
+ # Decode a string into a list of wids.
+ get = _decoding.get
+ # Obscure: while _decoding does have the key '\x80', its value is 0,
+ # so the "or" here calls _decode('\x80') anyway.
+ return [get(p) or _decode(p) for p in _prog.findall(code)]
+
+_decoding = {} # Filled later
+
+def _decode(s):
+ if s == '\x80':
+ # See comment in decode(). This is here to allow a trick to work.
+ return 0
+ if len(s) == 3:
+ a, b, c = map(ord, s)
+ assert a & 0xF0 == 0xE0 and not b & 0x80 and not c & 0x80
+ return ((a & 0xF) << 14) | (b << 7) | c
+ assert len(s) == 4, `s`
+ a, b, c, d = map(ord, s)
+ assert a & 0xF8 == 0xF0 and not b & 0x80 and not c & 0x80 and not d & 0x80
+ return ((a & 0x7) << 21) | (b << 14) | (c << 7) | d
+
+def _fill():
+ global _encoding
+ for i in range(0x40):
+ s = chr(i + 0x80)
+ _encoding[i] = s
+ _decoding[s] = i
+ for i in range(0x40, 0x1000):
+ hi, lo = divmod(i, 0x80)
+ s = chr(hi + 0xC0) + chr(lo)
+ _encoding[i] = s
+ _decoding[s] = i
+ _encoding = tuple(_encoding)
+
+_fill()
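+
+# Round-trip sketch (illustrative, not part of the committed code):
+# decode(encode([7, 0x123, 0x54321])) == [7, 0x123, 0x54321]. The three
+# wids take 1, 2 and 4 bytes respectively ('\x87', '\xc2\x23' and
+# '\xf0\x15\x06\x21'), and the encoded string is just their concatenation.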
+
+def test():
+ for i in range(2**20):
+ if i % 1000 == 0: print i
+ wids = [i]
+ code = encode(wids)
+ assert decode(code) == wids, (wids, code, decode(code))
+
+if __name__ == "__main__":
+ test()
=== Products/ZCTextIndex/ZCTextIndex.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""Plug in text index for ZCatalog with relevance ranking."""
+
+import ZODB
+from Persistence import Persistent
+import Acquisition
+from OFS.SimpleItem import SimpleItem
+
+from Products.PluginIndexes.common.PluggableIndex \
+ import PluggableIndexInterface
+
+from Products.ZCTextIndex.Index import Index
+from Products.ZCTextIndex.ILexicon import ILexicon
+from Products.ZCTextIndex.Lexicon import \
+ Lexicon, Splitter, CaseNormalizer, StopWordRemover
+from Products.ZCTextIndex.NBest import NBest
+from Products.ZCTextIndex.QueryParser import QueryParser
+from Globals import DTMLFile
+from Interface import verify_class_implementation
+
+class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
+ __implements__ = PluggableIndexInterface
+
+ meta_type = 'ZCTextIndex'
+
+ manage_options = (
+ {'label': 'Settings', 'action': 'manage_main'},
+ )
+
+ def __init__(self, id, extra, caller):
+ self.id = id
+ self._fieldname = extra.doc_attr
+ lexicon = getattr(caller, extra.lexicon_id, None)
+
+ if lexicon is None:
+ raise LookupError, 'Lexicon "%s" not found' % extra.lexicon_id
+
+ verify_class_implementation(ILexicon, lexicon.__class__)
+
+ self.lexicon = lexicon
+ self.index = Index(self.lexicon)
+ self.parser = QueryParser()
+
+ def index_object(self, docid, obj):
+ self.index.index_doc(docid, self._get_object_text(obj))
+ self._p_changed = 1 # XXX
+
+ def unindex_object(self, docid):
+ self.index.unindex_doc(docid)
+ self._p_changed = 1 # XXX
+
+ def _apply_index(self, req):
+ pass # XXX
+
+ def query(self, query, nbest=10):
+ # Returns a sequence of up to nbest (docid, score) pairs for the
+ # best-matching documents, highest score first.
+ tree = self.parser.parseQuery(query)
+ results = tree.executeQuery(self.index)
+ chooser = NBest(nbest)
+ chooser.addmany(results.items())
+ return chooser.getbest()
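+
+ # Illustrative call (a sketch, not part of the committed class): once a
+ # few documents have been indexed,
+ # index.query('zope AND catalog', nbest=5)
+ # returns up to five (docid, score) pairs, highest score first.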
+
+ def _get_object_text(self, obj):
+ x = getattr(obj, self._fieldname)
+ if callable(x):
+ return x()
+ else:
+ return x
+
+ ## User Interface Methods ##
+
+ manage_main = DTMLFile('dtml/manageZCTextIndex', globals())
+
+def manage_addZCTextIndex(self, id, extra=None, REQUEST=None,
+ RESPONSE=None):
+ """Add a text index"""
+ return self.manage_addIndex(id, 'ZCTextIndex', extra,
+ REQUEST, RESPONSE, REQUEST.URL3)
+
+manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals())
+
+manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals())
+
+def manage_addLexicon(self, id, title, splitter=None, normalizer=None,
+ stopword=None, REQUEST=None):
+ """Add a ZCTextIndex Lexicon object to the caller's namespace."""
+ elements = []
+ if splitter:
+ elements.append(Splitter())
+ if normalizer:
+ elements.append(CaseNormalizer())
+ if stopword:
+ elements.append(StopWordRemover())
+ lexicon = Lexicon(*elements)
+ self._setObject(id, lexicon)
+ if REQUEST is not None:
+ return self.manage_main(self, REQUEST, update_menu=1)
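+
+# Illustrative use from trusted code (a sketch; assumes "self" is a
+# folderish container such as a ZCatalog):
+#
+# manage_addLexicon(self, 'lexicon', 'Default lexicon',
+# splitter=1, normalizer=1, stopword=1)
+#
+# builds a Lexicon whose pipeline is a Splitter, a CaseNormalizer and a
+# StopWordRemover, and stores it in the container under the id 'lexicon'.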
=== Products/ZCTextIndex/__init__.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""ZCatalog Text Index
+
+Experimental plugin text index for ZCatalog.
+"""
+
+def initialize(context):
+ from Products.ZCTextIndex import ZCTextIndex
+
+ context.registerClass(
+ ZCTextIndex.ZCTextIndex,
+ permission='Add Pluggable Index',
+ constructors=(ZCTextIndex.manage_addZCTextIndexForm,
+ ZCTextIndex.manage_addZCTextIndex),
+ visibility=None
+ )
=== Products/ZCTextIndex/stopper.c 1.1 => 1.2 ===
+ *
+ * Fast version of the StopWordRemover object.
+ */
+
+#include "Python.h"
+#include "structmember.h"
+
+typedef struct {
+ PyObject_HEAD
+ PyObject *swr_dict;
+} StopWordRemover;
+
+static PyObject *
+swr_process(StopWordRemover *self, PyObject *args)
+{
+ PyObject *result = NULL;
+ PyObject *seq;
+ int len, i;
+
+ if (!PyArg_ParseTuple(args, "O:process", &seq))
+ return NULL;
+ seq = PySequence_Fast(seq,
+ "process() requires a sequence as the argument");
+ if (seq == NULL)
+ return NULL;
+ result = PyList_New(0);
+ if (result == NULL)
+ goto finally;
+#if PY_VERSION_HEX >= 0x02020000
+ /* Only available in Python 2.2 and newer. */
+ len = PySequence_Fast_GET_SIZE(seq);
+#else
+ len = PyObject_Length(seq);
+#endif
+ for (i = 0; i < len; ++i) {
+ PyObject *s = PySequence_Fast_GET_ITEM(seq, i);
+ /*
+ * PyDict_GetItem() returns NULL if there isn't a matching
+ * item, but without setting an exception, so this does what
+ * we want.
+ */
+ if (PyDict_GetItem(self->swr_dict, s) == NULL)
+ if (PyList_Append(result, s) < 0) {
+ Py_DECREF(result);
+ result = NULL;
+ goto finally;
+ }
+ }
+ finally:
+ Py_XDECREF(seq);
+ return result;
+}
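+
+/*
+ * Illustrative behaviour (a sketch, not part of the committed file): with
+ * swr_dict == {'and': None, 'the': None},
+ *
+ * swr.process(['the', 'quick', 'and', 'lazy']) -> ['quick', 'lazy']
+ *
+ * i.e. any word that appears as a key in the dict is dropped and the
+ * rest are copied, in order, into a new list.
+ */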
+
+static struct memberlist swr_members[] = {
+ {"dict", T_OBJECT, offsetof(StopWordRemover, swr_dict), READONLY},
+ {NULL}
+};
+
+static PyMethodDef swr_methods[] = {
+ {"process", (PyCFunction)swr_process, METH_VARARGS,
+ "process([str, ...]) --> [str, ...]\n"
+ "Remove stop words from the input list of strings to create a new list."},
+ {NULL}
+};
+
+static PyObject *
+swr_getattr(PyObject *self, char *name)
+{
+ PyObject *res;
+
+ res = Py_FindMethod(swr_methods, self, name);
+ if (res != NULL)
+ return res;
+ PyErr_Clear();
+ return PyMember_Get((char *)self, swr_members, name);
+}
+
+static void
+swr_dealloc(StopWordRemover *self)
+{
+ Py_XDECREF(self->swr_dict);
+ PyObject_Del(self);
+}
+
+static PyTypeObject StopWordRemover_Type = {
+ PyObject_HEAD_INIT(NULL) /* ob_type */
+ 0, /* ob_size */
+ "stopper.StopWordRemover", /* tp_name */
+ sizeof(StopWordRemover), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ (destructor)swr_dealloc, /* tp_dealloc */
+ 0, /* tp_print */
+ (getattrfunc)swr_getattr, /* tp_getattr */
+ 0, /* tp_setattr */
+};
+
+static PyObject *
+swr_new(PyObject *notused, PyObject *args)
+{
+ StopWordRemover *swr = NULL;
+ PyObject *dict = NULL;
+
+ if (PyArg_ParseTuple(args, "|O!:new", &PyDict_Type, &dict)) {
+ swr = PyObject_New(StopWordRemover, &StopWordRemover_Type);
+ if (swr != NULL) {
+ if (dict != NULL) {
+ Py_INCREF(dict);
+ swr->swr_dict = dict;
+ }
+ else {
+ swr->swr_dict = PyDict_New();
+ if (swr->swr_dict == NULL) {
+ Py_DECREF(swr);
+ swr = NULL;
+ }
+ }
+ }
+ }
+ return (PyObject *) swr;
+}
+
+static PyObject*
+pickle_constructor = NULL;
+
+PyObject *
+swr_pickler(PyObject *unused, PyObject *args)
+{
+ StopWordRemover *swr;
+ PyObject *result = NULL;
+
+ if (PyArg_ParseTuple(args, "O!:_pickler", &StopWordRemover_Type, &swr)) {
+ result = Py_BuildValue("O(O)", pickle_constructor, swr->swr_dict);
+ }
+ return result;
+}
+
+static PyMethodDef stopper_functions[] = {
+ {"new", swr_new, METH_VARARGS,
+ "new() -> StopWordRemover instance\n"
+ "Create & return a new stop-word remover."},
+ {"_pickler", swr_pickler, METH_VARARGS,
+ "_pickler(StopWordRemover instance) -> pickle magic\n"
+ "Internal magic used to make stop-word removers picklable."},
+ {NULL}
+};
+
+void
+initstopper(void)
+{
+ PyObject *m, *copy_reg;
+
+ StopWordRemover_Type.ob_type = &PyType_Type;
+ m = Py_InitModule3("stopper", stopper_functions,
+ "Fast StopWordRemover implementation.");
+ if (m == NULL)
+ return;
+ if (PyObject_SetAttrString(m, "StopWordRemoverType",
+ (PyObject *) &StopWordRemover_Type) < 0)
+ return;
+
+ /* register to support pickling */
+ copy_reg = PyImport_ImportModule("copy_reg");
+ if (copy_reg != NULL) {
+ PyObject *pickler;
+
+ if (pickle_constructor == NULL) {
+ pickle_constructor = PyObject_GetAttrString(m, "new");
+ Py_XINCREF(pickle_constructor);
+ }
+ pickler = PyObject_GetAttrString(m, "_pickler");
+ if ((pickle_constructor != NULL) && (pickler != NULL)) {
+ PyObject *res;
+
+ res = PyObject_CallMethod(
+ copy_reg, "pickle", "OOO", &StopWordRemover_Type,
+ pickler, pickle_constructor);
+ Py_XDECREF(res);
+ }
+ Py_DECREF(copy_reg);
+ }
+}