[Zope-CVS] CVS: Products/ZCTextIndex/tests - .cvsignore:1.2 __init__.py:1.2 hs-tool.py:1.2 indexhtml.py:1.2 mailtest.py:1.2 mhindex.py:1.2 testIndex.py:1.2 testLexicon.py:1.2 testNBest.py:1.2 testQueryEngine.py:1.2 testQueryParser.py:1.2 testZCTextIndex.py:1.2 wordstats.py:1.2
Guido van Rossum
guido@python.org
Tue, 14 May 2002 11:12:35 -0400
Update of /cvs-repository/Products/ZCTextIndex/tests
In directory cvs.zope.org:/tmp/cvs-serv10099/tests
Added Files:
.cvsignore __init__.py hs-tool.py indexhtml.py mailtest.py
mhindex.py testIndex.py testLexicon.py testNBest.py
testQueryEngine.py testQueryParser.py testZCTextIndex.py
wordstats.py
Log Message:
Merged TextIndexDS9-branch into trunk.
=== Products/ZCTextIndex/tests/.cvsignore 1.1 => 1.2 ===
=== Products/ZCTextIndex/tests/__init__.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+"""
+
+Revision information:
+$Id$
+"""
=== Products/ZCTextIndex/tests/hs-tool.py 1.1 => 1.2 ===
+
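+"""Print statistics from a hotshot profile log.
+
+usage: hs-tool.py [-A] [datafile]
+
+-A -- print each source file annotated with per-line hit counts and
+      times, instead of a flat per-line listing
+
+datafile defaults to profile.dat.  The log must have been recorded
+with line events enabled; mailtest.py's --profile option, for
+example, creates one via hotshot.Profile(file, lineevents=1,
+linetimings=1).
+"""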
+import cPickle
+import os.path
+import sys
+
+from hotshot.log import LogReader
+
+def load_line_info(log):
+ byline = {}
+ prevloc = None
+ for what, place, tdelta in log:
+ if tdelta > 0:
+ t, nhits = byline.get(prevloc, (0, 0))
+ byline[prevloc] = (tdelta + t), (nhits + 1)
+ prevloc = place
+ return byline
+
+def basename(path, cache={}):
+ try:
+ return cache[path]
+ except KeyError:
+ fn = os.path.split(path)[1]
+ cache[path] = fn
+ return fn
+
+def print_results(results):
+ for info, place in results:
+ if not place:
+ print 'Bad unpack:', info, place
+ continue
+ filename, line, funcname = place
+ print '%8d %8d' % info, basename(filename), line
+
+def annotate_results(results):
+ files = {}
+ for stats, place in results:
+ if not place:
+ continue
+ time, hits = stats
+ file, line, func = place
+ l = files.get(file)
+ if l is None:
+ l = files[file] = []
+ l.append((line, hits, time))
+ order = files.keys()
+ order.sort()
+ for k in order:
+ if os.path.exists(k):
+ v = files[k]
+ v.sort()
+ annotate(k, v)
+
+def annotate(file, lines):
+ print "-" * 60
+ print file
+ print "-" * 60
+ f = open(file)
+ i = 1
+ match = lines[0][0]
+ for line in f:
+ if match == i:
+ print "%6d %8d " % lines[0][1:], line,
+ del lines[0]
+ if lines:
+ match = lines[0][0]
+ else:
+ match = None
+ else:
+ print " " * 16, line,
+ i += 1
+ print
+
+def get_cache_name(filename):
+ d, fn = os.path.split(filename)
+ cache_dir = os.path.join(d, '.hs-tool')
+ cache_file = os.path.join(cache_dir, fn)
+ return cache_dir, cache_file
+
+def cache_results(filename, results):
+ cache_dir, cache_file = get_cache_name(filename)
+ if not os.path.exists(cache_dir):
+ os.mkdir(cache_dir)
+ fp = open(cache_file, 'wb')
+ try:
+ cPickle.dump(results, fp, 1)
+ finally:
+ fp.close()
+
+def main(filename, annotate):
+ cache_dir, cache_file = get_cache_name(filename)
+
+ if ( os.path.isfile(cache_file)
+ and os.path.getmtime(cache_file) > os.path.getmtime(filename)):
+ # cached data is up-to-date:
+ fp = open(cache_file, 'rb')
+ results = cPickle.load(fp)
+ fp.close()
+ else:
+ log = LogReader(filename)
+ byline = load_line_info(log)
+ # Sort
+ results = [(v, k) for k, v in byline.items()]
+ results.sort()
+ cache_results(filename, results)
+
+ if annotate:
+ annotate_results(results)
+ else:
+ print_results(results)
+
+
+if __name__ == "__main__":
+ import getopt
+
+ annotate_p = 0
+ opts, args = getopt.getopt(sys.argv[1:], 'A')
+ for o, v in opts:
+ if o == '-A':
+ annotate_p = 1
+ if args:
+ filename, = args
+ else:
+ filename = "profile.dat"
+
+ main(filename, annotate_p)
=== Products/ZCTextIndex/tests/indexhtml.py 1.1 => 1.2 ===
+
+"""Index a collection of HTML files on the filesystem.
+
+usage: indexhtml.py [options] dir
+
+Will create an index of all HTML files in dir or its subdirectories.
+
+options:
+-f data.fs -- the path to the filestorage datafile
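+
+After indexing, the index can be queried from an interactive session.
+A minimal sketch, assuming the layout this script writes (rt["index"]
+and rt["files"]) and the ZCTextIndex.query(query, nbest) call also
+used by mailtest.py:
+
+    import ZODB
+    from ZODB.FileStorage import FileStorage
+    db = ZODB.DB(FileStorage("Data.fs"))
+    rt = db.open().root()
+    index, paths = rt["index"], rt["files"]
+    for docid, score in index.query("python AND zope", 10):
+        print score, paths[docid]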
+"""
+
+import os
+
+import ZODB
+from ZODB.FileStorage import FileStorage
+from BTrees.IOBTree import IOBTree
+
+from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
+from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
+from Products.ZCTextIndex.Lexicon import Lexicon, StopWordRemover
+
+def make_index():
+ # there's an elaborate dance necessary to construct an index
+ class Struct:
+ pass
+ extra = Struct()
+ extra.doc_attr = "read"
+ extra.lexicon_id = "lexicon"
+ caller = Struct()
+ caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
+ return ZCTextIndex(extra, caller)
+
+def main(db, root, dir):
+    root["index"] = index = make_index()
+    root["files"] = paths = IOBTree()
+ get_transaction().commit()
+
+ files = [os.path.join(dir, file) for file in os.listdir(dir)]
+ docid = 0
+ for file in files:
+ if os.path.isdir(file):
+ files += [os.path.join(file, sub) for sub in os.listdir(file)]
+ else:
+ if not file.endswith(".html"):
+ continue
+ docid += 1
+ print "%5d" % docid, file
+ f = open(file, "rb")
+ paths[docid] = file
+ index.index_object(docid, f)
+ f.close()
+ if docid % TXN_INTERVAL == 0:
+ get_transaction().commit()
+ if docid % PACK_INTERVAL == 0:
+ db.pack()
+ get_transaction().commit()
+
+if __name__ == "__main__":
+ import sys
+ import getopt
+
+ VERBOSE = 0
+ FSPATH = "Data.fs"
+ TXN_INTERVAL = 100
+ PACK_INTERVAL = 500
+ try:
+ opts, args = getopt.getopt(sys.argv[1:], 'vf:')
+ except getopt.error, msg:
+ print msg
+ print __doc__
+ sys.exit(2)
+
+ for o, v in opts:
+ if o == '-v':
+ VERBOSE += 1
+ if o == '-f':
+ FSPATH = v
+
+ if len(args) != 1:
+ print "Expected on argument"
+ print __doc__
+ sys.exit(2)
+ dir = args[0]
+
+ fs = FileStorage(FSPATH)
+ db = ZODB.DB(fs)
+ cn = db.open()
+ rt = cn.root()
+ dir = os.path.join(os.getcwd(), dir)
+ print dir
+ main(db, rt, dir)
+ cn.close()
+ fs.close()
+
=== Products/ZCTextIndex/tests/mailtest.py 1.1 => 1.2 ===
+"""Index and query mail messages from a Unix mailbox.
+
+usage: python mailtest.py [options] <data.fs>
+
+options:
+ -v -- verbose
+ -n NNN -- max number of messages to read from mailbox
+ -q query
+ -i mailbox
+ -p NNN -- pack <data.fs> every NNN messages (default: 500), and at end
+ -p 0 -- don't pack at all
+ -b NNN -- return the NNN best matches (default: 10)
+ -x -- exclude the message text from the data.fs
+ -t NNN -- commit a transaction every NNN messages (default: 1)
+
+The script either indexes or queries depending on whether -q or -i is
+passed as an option.
+
+For -i mailbox, the script reads mail messages from the mailbox and
+indexes them. It indexes one message at a time, then commits the
+transaction.
+
+For -q query, it performs a query on an existing index.
+
+If both are specified, indexing is performed first.
+
+You can also interact with the index after indexing completes. Load
+the index from the database:
+
+ import ZODB
+ from ZODB.FileStorage import FileStorage
+    fs = FileStorage(<data.fs>)
+    db = ZODB.DB(fs)
+    index = db.open().root()["index"]
+ index.search("python AND unicode")
+"""
+
+import ZODB
+import ZODB.FileStorage
+from Products.ZCTextIndex.Lexicon import Lexicon, \
+ CaseNormalizer, Splitter, StopWordRemover
+from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
+from BTrees.IOBTree import IOBTree
+
+import sys
+import mailbox
+import time
+
+def usage(msg):
+ print msg
+ print __doc__
+ sys.exit(2)
+
+class Message:
+
+ total_bytes = 0
+
+ def __init__(self, msg):
+ subject = msg.getheader('subject', '')
+ author = msg.getheader('from', '')
+ if author:
+ summary = "%s (%s)\n" % (subject, author)
+ else:
+ summary = "%s\n" % subject
+ self.text = summary + msg.fp.read()
+ Message.total_bytes += len(self.text)
+
+class Extra:
+ pass
+
+def index(rt, mboxfile, db):
+ global NUM
+ idx_time = 0
+ pack_time = 0
+
+ lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
+ extra = Extra()
+ extra.lexicon_id = 'lexicon'
+ extra.doc_attr = 'text'
+ caller = Extra()
+ caller.lexicon = lexicon
+ rt["index"] = idx = ZCTextIndex("index", extra, caller)
+ if not EXCLUDE_TEXT:
+ rt["documents"] = docs = IOBTree()
+ get_transaction().commit()
+
+ mbox = mailbox.UnixMailbox(open(mboxfile))
+ if VERBOSE:
+ print "opened", mboxfile
+ if not NUM:
+ NUM = sys.maxint
+ i = 0
+ while i < NUM:
+ _msg = mbox.next()
+ if _msg is None:
+ break
+ i += 1
+ msg = Message(_msg)
+ if VERBOSE >= 2:
+ print "indexing msg", i
+ i0 = time.clock()
+ idx.index_object(i, msg)
+ if not EXCLUDE_TEXT:
+ docs[i] = msg
+ if i % TXN_SIZE == 0:
+ get_transaction().commit()
+ i1 = time.clock()
+ idx_time += i1 - i0
+ if VERBOSE and i % 50 == 0:
+ print i, "messages indexed"
+ print "cache size", db.cacheSize()
+ if PACK_INTERVAL and i % PACK_INTERVAL == 0:
+ if VERBOSE >= 2:
+ print "packing..."
+ p0 = time.clock()
+ db.pack(time.time())
+ p1 = time.clock()
+ if VERBOSE:
+ print "pack took %s sec" % (p1 - p0)
+ pack_time += p1 - p0
+
+ get_transaction().commit()
+
+ if PACK_INTERVAL and i % PACK_INTERVAL != 0:
+ if VERBOSE >= 2:
+ print "packing one last time..."
+ p0 = time.clock()
+ db.pack(time.time())
+ p1 = time.clock()
+ if VERBOSE:
+ print "pack took %s sec" % (p1 - p0)
+ pack_time += p1 - p0
+
+ if VERBOSE:
+ print "Index time", idx_time
+ print "Index bytes", Message.total_bytes
+ rate = (Message.total_bytes / idx_time) / 1024
+ print "Index rate %d KB/sec" % int(rate)
+
+def query(rt, query_str):
+ idx = rt["index"]
+ docs = rt["documents"]
+ results = idx.query(query_str, BEST)
+ print "query:", query_str
+ print "# results:", len(results)
+ for docid, score in results:
+ print "docid %4d score %2d" % (docid, score)
+ if VERBOSE:
+ msg = docs[docid]
+            # print the first CONTEXT lines of the message
+            CONTEXT = 5
+ ctx = msg.text.split("\n", CONTEXT)
+ del ctx[-1]
+ print "-" * 60
+ print "message:"
+ for l in ctx:
+ print l
+ print "-" * 60
+
+
+def main(fs_path, mbox_path, query_str):
+ f = ZODB.FileStorage.FileStorage(fs_path)
+ db = ZODB.DB(f, cache_size=CACHE_SIZE)
+ cn = db.open()
+ rt = cn.root()
+
+ if mbox_path is not None:
+ index(rt, mbox_path, db)
+ if query_str is not None:
+ query(rt, query_str)
+
+ cn.close()
+ db.close()
+ f.close()
+
+if __name__ == "__main__":
+ import getopt
+
+ NUM = 0
+ BEST = 10
+ VERBOSE = 0
+ PACK_INTERVAL = 500
+ EXCLUDE_TEXT = 0
+ CACHE_SIZE = 10000
+ TXN_SIZE = 1
+ query_str = None
+ mbox_path = None
+ profile = None
+ old_profile = None
+ try:
+ opts, args = getopt.getopt(sys.argv[1:], 'vn:p:i:q:b:xt:',
+ ['profile=', 'old-profile='])
+ except getopt.error, msg:
+ usage(msg)
+ if len(args) != 1:
+ usage("exactly 1 filename argument required")
+ for o, v in opts:
+ if o == '-n':
+ NUM = int(v)
+ elif o == '-v':
+ VERBOSE += 1
+ elif o == '-p':
+ PACK_INTERVAL = int(v)
+ elif o == '-q':
+ query_str = v
+ elif o == '-i':
+ mbox_path = v
+ elif o == '-b':
+ BEST = int(v)
+ elif o == '-x':
+ EXCLUDE_TEXT = 1
+ elif o == '-t':
+ TXN_SIZE = int(v)
+ elif o == '--profile':
+ profile = v
+ elif o == '--old-profile':
+ old_profile = v
+ fs_path, = args
+ if profile:
+ import hotshot
+ profiler = hotshot.Profile(profile, lineevents=1, linetimings=1)
+ profiler.runcall(main, fs_path, mbox_path, query_str)
+ profiler.close()
+ elif old_profile:
+ import profile, pstats
+ profiler = profile.Profile()
+ profiler.runcall(main, fs_path, mbox_path, query_str)
+ profiler.dump_stats(old_profile)
+ stats = pstats.Stats(old_profile)
+ stats.strip_dirs().sort_stats('time').print_stats(20)
+ else:
+ main(fs_path, mbox_path, query_str)
=== Products/ZCTextIndex/tests/mhindex.py 1.1 => 1.2 ===
+
+"""MH mail indexer."""
+
+import re
+import sys
+import time
+import mhlib
+import getopt
+import traceback
+from StringIO import StringIO
+
+DATAFS = "/home/guido/.Data.fs"
+ZOPECODE = "/home/guido/projects/ds9/lib/python"
+
+sys.path.append(ZOPECODE)
+
+from ZODB import DB
+from ZODB.FileStorage import FileStorage
+from Persistence import Persistent
+from BTrees.IOBTree import IOBTree
+from BTrees.OIBTree import OIBTree
+
+from Products.ZCTextIndex.NBest import NBest
+from Products.ZCTextIndex.OkapiIndex import Index
+from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
+from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
+from Products.ZCTextIndex.QueryParser import QueryParser
+from Products.ZCTextIndex.StopDict import get_stopdict
+
+NBEST = 3
+MAXLINES = 3
+
+def main():
+ try:
+ opts, args = getopt.getopt(sys.argv[1:], "bd:m:n:Opu")
+ except getopt.error, msg:
+ print msg
+ sys.exit(2)
+ update = 0
+ bulk = 0
+ optimize = 0
+ nbest = NBEST
+ maxlines = MAXLINES
+ datafs = DATAFS
+ pack = 0
+ for o, a in opts:
+ if o == "-b":
+ bulk = 1
+ if o == "-d":
+ datafs = a
+ if o == "-m":
+ maxlines = int(a)
+ if o == "-n":
+ nbest = int(a)
+ if o == "-O":
+ optimize = 1
+ if o == "-p":
+ pack = 1
+ if o == "-u":
+ update = 1
+ ix = Indexer(datafs, update or bulk)
+ if bulk:
+ if optimize:
+ ix.optimize(args)
+ ix.bulkupdate(args)
+ elif update:
+ ix.update(args)
+ if pack:
+ ix.pack()
+ elif args:
+ for i in range(len(args)):
+ a = args[i]
+ if " " in a:
+ if a[0] == "-":
+ args[i] = '-"' + a[1:] + '"'
+ else:
+ args[i] = '"' + a + '"'
+ ix.query(" ".join(args), nbest, maxlines)
+ else:
+ ix.interact(nbest)
+
+class Indexer:
+
+ filestorage = database = connection = root = None
+
+ def __init__(self, datafs, writable=0):
+ self.stopdict = get_stopdict()
+ self.mh = mhlib.MH()
+ self.filestorage = FileStorage(datafs, read_only=(not writable))
+ self.database = DB(self.filestorage)
+ self.connection = self.database.open()
+ self.root = self.connection.root()
+ try:
+ self.index = self.root["index"]
+ except KeyError:
+ self.index = self.root["index"] = TextIndex()
+ try:
+ self.docpaths = self.root["docpaths"]
+ except KeyError:
+ self.docpaths = self.root["docpaths"] = IOBTree()
+ self.path2docid = OIBTree()
+ for docid in self.docpaths.keys():
+ path = self.docpaths[docid]
+ self.path2docid[path] = docid
+ try:
+ self.maxdocid = max(self.docpaths.keys())
+ except ValueError:
+ self.maxdocid = 0
+ print len(self.docpaths), "Document ids"
+ print len(self.path2docid), "Pathnames"
+
+ def close(self):
+ self.root = None
+ if self.connection is not None:
+ self.connection.close()
+ self.connection = None
+ if self.database is not None:
+ self.database.close()
+ self.database = None
+ if self.filestorage is not None:
+ self.filestorage.close()
+ self.filestorage = None
+
+ def interact(self, nbest=NBEST, maxlines=MAXLINES):
+ try:
+ import readline
+ except ImportError:
+ pass
+ text = ""
+ top = 0
+ while 1:
+ try:
+ line = raw_input("Query: ")
+ except EOFError:
+ print "\nBye."
+ break
+ line = line.strip()
+ if line:
+ text = line
+ top = 0
+ else:
+ if not text:
+ continue
+ try:
+ n, results = self.timequery(text, top + nbest)
+ except:
+ reportexc()
+ text = ""
+ top = 0
+ continue
+ if len(results) <= top:
+ if not n:
+ print "No hits for %r." % text
+ else:
+ print "No more hits for %r." % text
+ text = ""
+ top = 0
+ continue
+ print "[Results %d-%d from %d" % (top+1, min(n, top+nbest), n),
+ print "for query %s]" % repr(text)
+ self.formatresults(text, results, maxlines, top, top+nbest)
+ top += nbest
+
+ def query(self, text, nbest=NBEST, maxlines=MAXLINES):
+ n, results = self.timequery(text, nbest)
+ if not n:
+ print "No hits for %r." % text
+ return
+ print "[Results 1-%d from %d]" % (len(results), n)
+ self.formatresults(text, results, maxlines)
+
+ def timequery(self, text, nbest):
+ t0 = time.time()
+ c0 = time.clock()
+ n, results = self.index.query(text, nbest)
+ t1 = time.time()
+ c1 = time.clock()
+ print "[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0)
+ return n, results
+
+ def formatresults(self, text, results, maxlines=MAXLINES,
+ lo=0, hi=sys.maxint):
+ stop = self.stopdict.has_key
+ words = [w for w in re.findall(r"\w+\*?", text.lower()) if not stop(w)]
+ pattern = r"\b(" + "|".join(words) + r")\b"
+ pattern = pattern.replace("*", ".*") # glob -> re syntax
+ prog = re.compile(pattern, re.IGNORECASE)
+ print '='*70
+ rank = lo
+ qw = max(1, self.index.query_weight(text))
+ factor = 100.0 / qw / 1024
+ for docid, score in results[lo:hi]:
+ rank += 1
+ path = self.docpaths[docid]
+ score = min(100, int(score * factor))
+ print "Rank: %d Score: %d%% File: %s" % (rank, score, path)
+ fp = open(path)
+ msg = mhlib.Message("<folder>", 0, fp)
+ for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
+ h = msg.getheader(header)
+ if h:
+ print "%-8s %s" % (header+":", h)
+ text = self.getmessagetext(msg)
+ if text:
+ print
+ nleft = maxlines
+ for part in text:
+ for line in part.splitlines():
+ if prog.search(line):
+ print line
+ nleft -= 1
+ if nleft <= 0:
+ break
+ if nleft <= 0:
+ break
+ print '-'*70
+
+ def update(self, args):
+ folder = None
+ seqs = []
+
+ for arg in args:
+ if arg.startswith("+"):
+ if folder is None:
+ folder = arg[1:]
+ else:
+ print "only one folder at a time"
+ return
+ else:
+ seqs.append(arg)
+
+ if not folder:
+ folder = self.mh.getcontext()
+ if not seqs:
+ seqs = ['all']
+
+ try:
+ f = self.mh.openfolder(folder)
+ except mhlib.Error, msg:
+ print msg
+ return
+
+ dict = {}
+ for seq in seqs:
+ try:
+ nums = f.parsesequence(seq)
+ except mhlib.Error, msg:
+ print msg or "unparsable message sequence: %s" % `seq`
+ return
+ for n in nums:
+ dict[n] = n
+ msgs = dict.keys()
+ msgs.sort()
+
+ self.updatefolder(f, msgs)
+
+ def optimize(self, args):
+ uniqwords = {}
+ for folder in args:
+ if folder.startswith("+"):
+ folder = folder[1:]
+ print "\nOPTIMIZE FOLDER", folder
+ try:
+ f = self.mh.openfolder(folder)
+ except mhlib.Error, msg:
+ print msg
+ continue
+ self.prescan(f, f.listmessages(), uniqwords)
+ L = [(uniqwords[word], word) for word in uniqwords.keys()]
+ L.sort()
+ L.reverse()
+        for i in range(min(100, len(L))):
+ print "%3d. %6d %s" % ((i+1,) + L[i])
+ self.index.lexicon.sourceToWordIds([word for (count, word) in L])
+
+ def prescan(self, f, msgs, uniqwords):
+ pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
+ for n in msgs:
+ print "prescanning", n
+ m = f.openmessage(n)
+ text = self.getmessagetext(m)
+ for p in pipeline:
+ text = p.process(text)
+ for word in text:
+ uniqwords[word] = uniqwords.get(word, 0) + 1
+
+ def bulkupdate(self, args):
+ chunk = 5000
+ target = len(self.docpaths) + chunk
+ for folder in args:
+ if len(self.docpaths) >= target:
+ self.pack()
+ target = len(self.docpaths) + chunk
+ if folder.startswith("+"):
+ folder = folder[1:]
+ print "\nFOLDER", folder
+ try:
+ f = self.mh.openfolder(folder)
+ except mhlib.Error, msg:
+ print msg
+ continue
+ self.updatefolder(f, f.listmessages())
+ print "Total", len(self.docpaths)
+ self.pack()
+
+ def updatefolder(self, f, msgs):
+ done = 0
+ new = 0
+ for n in msgs:
+ print "indexing", n
+ m = f.openmessage(n)
+ text = self.getmessagetext(m)
+ path = f.getmessagefilename(n)
+ self.unindexpath(path)
+ if not text:
+ continue
+ docid = self.newdocid(path)
+ self.index.index_text(docid, text)
+ done += 1
+ new = 1
+ if done%500 == 0:
+ self.commit()
+ new = 0
+ if new:
+ self.commit()
+ print "done."
+
+ def unindexpath(self, path):
+ if self.path2docid.has_key(path):
+ docid = self.path2docid[path]
+ print "unindexing", docid, path
+ del self.docpaths[docid]
+ del self.path2docid[path]
+ try:
+ self.index.unindex(docid)
+ except KeyError, msg:
+ print "KeyError", msg
+
+ def getmessagetext(self, m):
+ L = []
+ try:
+ self.getmsgparts(m, L, 0)
+ except:
+ print "(getmsgparts failed:)"
+ reportexc()
+ return L
+
+ def getmsgparts(self, m, L, level):
+ ctype = m.gettype()
+ if level or ctype != "text/plain":
+ print ". "*level + str(ctype)
+ if ctype == "text/plain":
+ L.append(m.getbodytext())
+ elif ctype in ("multipart/alternative", "multipart/mixed"):
+ for part in m.getbodyparts():
+ self.getmsgparts(part, L, level+1)
+ elif ctype == "message/rfc822":
+ f = StringIO(m.getbodytext())
+ m = mhlib.Message("<folder>", 0, f)
+ self.getmsgparts(m, L, level+1)
+
+ def newdocid(self, path):
+ docid = self.maxdocid + 1
+ self.maxdocid = docid
+ self.docpaths[docid] = path
+ self.path2docid[path] = docid
+ return docid
+
+ def commit(self):
+ print "committing..."
+ get_transaction().commit()
+
+ def pack(self):
+ print "packing..."
+ self.database.pack()
+
+class TextIndex(Persistent):
+
+ def __init__(self):
+ self.lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
+ self.index = Index(self.lexicon)
+
+ def index_text(self, docid, text):
+ self.index.index_doc(docid, text)
+ self._p_changed = 1 # XXX
+
+ def unindex(self, docid):
+ self.index.unindex_doc(docid)
+ self._p_changed = 1 # XXX
+
+ def query(self, query, nbest=10):
+        # returns a total hit count and a list of the nbest (docid, score) pairs
+ parser = QueryParser()
+ tree = parser.parseQuery(query)
+ results = tree.executeQuery(self.index)
+ chooser = NBest(nbest)
+ chooser.addmany(results.items())
+ return len(results), chooser.getbest()
+
+ def query_weight(self, query):
+ parser = QueryParser()
+ tree = parser.parseQuery(query)
+ terms = tree.terms()
+ return self.index.query_weight(terms)
+
+def reportexc():
+ traceback.print_exc()
+
+if __name__ == "__main__":
+ main()
=== Products/ZCTextIndex/tests/testIndex.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+from unittest import TestCase, TestSuite, main, makeSuite
+
+from Products.ZCTextIndex.Index import Index
+from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
+
+class IndexTest(TestCase):
+ def setUp(self):
+ self.lexicon = Lexicon(Splitter())
+ self.index = Index(self.lexicon)
+
+ def test_index_document(self, DOCID=1):
+ doc = "simple document contains five words"
+ self.index.index_doc(DOCID, doc)
+ self.assert_(self.index._docweight[DOCID])
+ self.assertEqual(len(self.index._wordinfo), 5)
+ self.assertEqual(len(self.index._docwords), 1)
+ self.assertEqual(len(self.index._get_undoinfo(DOCID)), 5)
+ for map in self.index._wordinfo.values():
+ self.assertEqual(len(map), 1)
+ self.assert_(map.has_key(DOCID))
+
+ def test_unindex_document(self):
+ DOCID = 1
+ self.test_index_document(DOCID)
+ self.index.unindex_doc(DOCID)
+ self.assertEqual(len(self.index._docweight), 0)
+ self.assertEqual(len(self.index._wordinfo), 0)
+ self.assertEqual(len(self.index._docwords), 0)
+
+ def test_index_two_documents(self):
+ self.test_index_document()
+ doc = "another document just four"
+ DOCID = 2
+ self.index.index_doc(DOCID, doc)
+ self.assert_(self.index._docweight[DOCID])
+ self.assertEqual(len(self.index._wordinfo), 8)
+ self.assertEqual(len(self.index._docwords), 2)
+ self.assertEqual(len(self.index._get_undoinfo(DOCID)), 4)
+ wids = self.lexicon.termToWordIds("document")
+ self.assertEqual(len(wids), 1)
+ document_wid = wids[0]
+ for wid, map in self.index._wordinfo.items():
+ if wid == document_wid:
+ self.assertEqual(len(map), 2)
+ self.assert_(map.has_key(1))
+ self.assert_(map.has_key(DOCID))
+ else:
+ self.assertEqual(len(map), 1)
+
+ def test_index_two_unindex_one(self):
+ # index two documents, unindex one, and test the results
+ self.test_index_two_documents()
+ self.index.unindex_doc(1)
+ DOCID = 2
+ self.assertEqual(len(self.index._docweight), 1)
+ self.assert_(self.index._docweight[DOCID])
+ self.assertEqual(len(self.index._wordinfo), 4)
+ self.assertEqual(len(self.index._docwords), 1)
+ self.assertEqual(len(self.index._get_undoinfo(DOCID)), 4)
+ for map in self.index._wordinfo.values():
+ self.assertEqual(len(map), 1)
+ self.assert_(map.has_key(DOCID))
+
+ def test_index_duplicated_words(self, DOCID=1):
+ doc = "very simple repeat repeat repeat document test"
+ self.index.index_doc(DOCID, doc)
+ self.assert_(self.index._docweight[DOCID])
+ self.assertEqual(len(self.index._wordinfo), 5)
+ self.assertEqual(len(self.index._docwords), 1)
+## self.assertEqual(len(self.index._get_undoinfo(DOCID)), 5)
+ wids = self.lexicon.termToWordIds("repeat")
+ self.assertEqual(len(wids), 1)
+        repetitive_wid = wids[0]
+ for wid, map in self.index._wordinfo.items():
+ self.assertEqual(len(map), 1)
+ self.assert_(map.has_key(DOCID))
+
+ def test_simple_query_oneresult(self):
+ self.index.index_doc(1, 'not the same document')
+ results = self.index.search("document")
+ self.assertEqual(list(results.keys()), [1])
+
+ def test_simple_query_noresults(self):
+ self.index.index_doc(1, 'not the same document')
+ results = self.index.search("frobnicate")
+ self.assertEqual(list(results.keys()), [])
+
+ def test_query_oneresult(self):
+ self.index.index_doc(1, 'not the same document')
+ self.index.index_doc(2, 'something about something else')
+ results = self.index.search("document")
+ self.assertEqual(list(results.keys()), [1])
+
+ def test_search_phrase(self):
+ self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
+ self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
+ results = self.index.search_phrase("quick brown fox")
+ self.assertEqual(list(results.keys()), [1])
+
+ def test_search_glob(self):
+ self.index.index_doc(1, "how now brown cow")
+ self.index.index_doc(2, "hough nough browne cough")
+ self.index.index_doc(3, "bar brawl")
+ results = self.index.search_glob("bro*")
+ self.assertEqual(list(results.keys()), [1, 2])
+ results = self.index.search_glob("b*")
+ self.assertEqual(list(results.keys()), [1, 2, 3])
+
+def test_suite():
+ return makeSuite(IndexTest)
+
+if __name__=='__main__':
+ main(defaultTest='test_suite')
=== Products/ZCTextIndex/tests/testLexicon.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+from unittest import TestCase, TestSuite, main, makeSuite
+
+from Products.ZCTextIndex.Lexicon import Lexicon
+from Products.ZCTextIndex.Lexicon import Splitter, CaseNormalizer
+
+class StupidPipelineElement:
+ def __init__(self, fromword, toword):
+ self.__fromword = fromword
+ self.__toword = toword
+
+ def process(self, seq):
+ res = []
+ for term in seq:
+ if term == self.__fromword:
+ res.append(self.__toword)
+ else:
+ res.append(term)
+ return res
+
+class WackyReversePipelineElement:
+ def __init__(self, revword):
+ self.__revword = revword
+
+ def process(self, seq):
+ res = []
+ for term in seq:
+ if term == self.__revword:
+ x = list(term)
+ x.reverse()
+ res.append(''.join(x))
+ else:
+ res.append(term)
+ return res
+
+class StopWordPipelineElement:
+ def __init__(self, stopdict={}):
+ self.__stopdict = stopdict
+
+ def process(self, seq):
+ res = []
+ for term in seq:
+ if self.__stopdict.get(term):
+ continue
+ else:
+ res.append(term)
+ return res
+
+
+class Test(TestCase):
+ def testSourceToWordIds(self):
+ lexicon = Lexicon(Splitter())
+ wids = lexicon.sourceToWordIds('cats and dogs')
+ self.assertEqual(wids, [1, 2, 3])
+
+ def testTermToWordIds(self):
+ lexicon = Lexicon(Splitter())
+ wids = lexicon.sourceToWordIds('cats and dogs')
+ wids = lexicon.termToWordIds('dogs')
+ self.assertEqual(wids, [3])
+
+ def testMissingTermToWordIds(self):
+ lexicon = Lexicon(Splitter())
+ wids = lexicon.sourceToWordIds('cats and dogs')
+ wids = lexicon.termToWordIds('boxes')
+ self.assertEqual(wids, [])
+
+ def testOnePipelineElement(self):
+ lexicon = Lexicon(Splitter(), StupidPipelineElement('dogs', 'fish'))
+ wids = lexicon.sourceToWordIds('cats and dogs')
+ wids = lexicon.termToWordIds('fish')
+ self.assertEqual(wids, [3])
+
+ def testSplitterAdaptorFold(self):
+ lexicon = Lexicon(Splitter(), CaseNormalizer())
+ wids = lexicon.sourceToWordIds('CATS and dogs')
+ wids = lexicon.termToWordIds('cats and dogs')
+ self.assertEqual(wids, [1, 2, 3])
+
+ def testSplitterAdaptorNofold(self):
+ lexicon = Lexicon(Splitter())
+ wids = lexicon.sourceToWordIds('CATS and dogs')
+ wids = lexicon.termToWordIds('cats and dogs')
+ self.assertEqual(wids, [2, 3])
+
+ def testTwoElementPipeline(self):
+ lexicon = Lexicon(Splitter(),
+ StupidPipelineElement('cats', 'fish'),
+ WackyReversePipelineElement('fish'))
+ wids = lexicon.sourceToWordIds('cats and dogs')
+ wids = lexicon.termToWordIds('hsif')
+ self.assertEqual(wids, [1])
+
+ def testThreeElementPipeline(self):
+ lexicon = Lexicon(Splitter(),
+ StopWordPipelineElement({'and':1}),
+ StupidPipelineElement('dogs', 'fish'),
+ WackyReversePipelineElement('fish'))
+ wids = lexicon.sourceToWordIds('cats and dogs')
+ wids = lexicon.termToWordIds('hsif')
+ self.assertEqual(wids, [2])
+
+
+def test_suite():
+ return makeSuite(Test)
+
+if __name__=='__main__':
+ main(defaultTest='test_suite')
=== Products/ZCTextIndex/tests/testNBest.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+from unittest import TestCase, TestSuite, main, makeSuite
+
+from Products.ZCTextIndex.NBest import NBest
+
+class NBestTest(TestCase):
+
+ def testConstructor(self):
+ self.assertRaises(ValueError, NBest, 0)
+ self.assertRaises(ValueError, NBest, -1)
+
+ for n in range(1, 11):
+ nb = NBest(n)
+ self.assertEqual(len(nb), 0)
+ self.assertEqual(nb.capacity(), n)
+
+ def testOne(self):
+ nb = NBest(1)
+ nb.add('a', 0)
+ self.assertEqual(nb.getbest(), [('a', 0)])
+
+ nb.add('b', 1)
+ self.assertEqual(len(nb), 1)
+ self.assertEqual(nb.capacity(), 1)
+ self.assertEqual(nb.getbest(), [('b', 1)])
+
+ nb.add('c', -1)
+ self.assertEqual(len(nb), 1)
+ self.assertEqual(nb.capacity(), 1)
+ self.assertEqual(nb.getbest(), [('b', 1)])
+
+ nb.addmany([('d', 3), ('e', -6), ('f', 5), ('g', 4)])
+ self.assertEqual(len(nb), 1)
+ self.assertEqual(nb.capacity(), 1)
+ self.assertEqual(nb.getbest(), [('f', 5)])
+
+ def testMany(self):
+ import random
+ inputs = [(-i, i) for i in range(50)]
+
+ reversed_inputs = inputs[:]
+ reversed_inputs.reverse()
+
+ # Test the N-best for a variety of n (1, 6, 11, ... 50).
+ for n in range(1, len(inputs)+1, 5):
+ expected = inputs[-n:]
+ expected.reverse()
+
+ random_inputs = inputs[:]
+ random.shuffle(random_inputs)
+
+ for source in inputs, reversed_inputs, random_inputs:
+ # Try feeding them one at a time.
+ nb = NBest(n)
+ for item, score in source:
+ nb.add(item, score)
+ self.assertEqual(len(nb), n)
+ self.assertEqual(nb.capacity(), n)
+ self.assertEqual(nb.getbest(), expected)
+
+ # And again in one gulp.
+ nb = NBest(n)
+ nb.addmany(source)
+ self.assertEqual(len(nb), n)
+ self.assertEqual(nb.capacity(), n)
+ self.assertEqual(nb.getbest(), expected)
+
+ for i in range(1, n+1):
+ self.assertEqual(nb.pop_smallest(), expected[-i])
+ self.assertRaises(IndexError, nb.pop_smallest)
+
+def test_suite():
+ return makeSuite(NBestTest)
+
+if __name__=='__main__':
+ main(defaultTest='test_suite')
=== Products/ZCTextIndex/tests/testQueryEngine.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+from unittest import TestCase, TestSuite, main, makeSuite
+
+from BTrees.IIBTree import IIBucket
+
+from Products.ZCTextIndex.QueryParser import QueryParser
+from Products.ZCTextIndex.ParseTree import ParseError, QueryError
+
+class FauxIndex:
+
+ def search(self, term):
+ b = IIBucket()
+ if term == "foo":
+ b[1] = b[3] = 1
+ elif term == "bar":
+ b[1] = b[2] = 1
+ elif term == "ham":
+ b[1] = b[2] = b[3] = b[4] = 1
+ return b
+
+class TestQueryEngine(TestCase):
+
+ def setUp(self):
+ self.parser = QueryParser()
+ self.index = FauxIndex()
+
+ def compareSet(self, set, dict):
+ d = {}
+ for k, v in set.items():
+ d[k] = v
+ self.assertEqual(d, dict)
+
+ def compareQuery(self, query, dict):
+ tree = self.parser.parseQuery(query)
+ set = tree.executeQuery(self.index)
+ self.compareSet(set, dict)
+
+ def testExecuteQuery(self):
+ self.compareQuery("foo AND bar", {1: 2})
+ self.compareQuery("foo OR bar", {1: 2, 2: 1, 3:1})
+ self.compareQuery("foo AND NOT bar", {3: 1})
+ self.compareQuery("foo AND foo AND foo", {1: 3, 3: 3})
+ self.compareQuery("foo OR foo OR foo", {1: 3, 3: 3})
+ self.compareQuery("ham AND NOT foo AND NOT bar", {4: 1})
+ self.compareQuery("ham OR foo OR bar", {1: 3, 2: 2, 3: 2, 4: 1})
+ self.compareQuery("ham AND foo AND bar", {1: 3})
+
+ def testInvalidQuery(self):
+ from Products.ZCTextIndex.ParseTree import NotNode, AtomNode
+ tree = NotNode(AtomNode("foo"))
+ self.assertRaises(QueryError, tree.executeQuery, self.index)
+
+def test_suite():
+ return makeSuite(TestQueryEngine)
+
+if __name__=='__main__':
+ main(defaultTest='test_suite')
=== Products/ZCTextIndex/tests/testQueryParser.py 1.1 => 1.2 ===
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+from unittest import TestCase, TestSuite, main, makeSuite
+
+from Products.ZCTextIndex.QueryParser import QueryParser
+
+from Products.ZCTextIndex.ParseTree import ParseError, ParseTreeNode
+from Products.ZCTextIndex.ParseTree import OrNode, AndNode, NotNode
+from Products.ZCTextIndex.ParseTree import AtomNode, PhraseNode, GlobNode
+
+class TestQueryParser(TestCase):
+
+ def compareParseTrees(self, got, expected):
+ self.assertEqual(isinstance(got, ParseTreeNode), 1)
+ self.assertEqual(got.__class__, expected.__class__)
+ if isinstance(got, PhraseNode):
+ self.assertEqual(got.nodeType(), "PHRASE")
+ self.assertEqual(got.getValue(), expected.getValue())
+ elif isinstance(got, GlobNode):
+ self.assertEqual(got.nodeType(), "GLOB")
+ self.assertEqual(got.getValue(), expected.getValue())
+ elif isinstance(got, AtomNode):
+ self.assertEqual(got.nodeType(), "ATOM")
+ self.assertEqual(got.getValue(), expected.getValue())
+ elif isinstance(got, NotNode):
+ self.assertEqual(got.nodeType(), "NOT")
+ self.compareParseTrees(got.getValue(), expected.getValue())
+ elif isinstance(got, AndNode) or isinstance(got, OrNode):
+ self.assertEqual(got.nodeType(),
+ isinstance(got, AndNode) and "AND" or "OR")
+ list1 = got.getValue()
+ list2 = expected.getValue()
+ self.assertEqual(len(list1), len(list2))
+ for i in range(len(list1)):
+ self.compareParseTrees(list1[i], list2[i])
+
+ def expect(self, input, output):
+ tree = self.p.parseQuery(input)
+ self.compareParseTrees(tree, output)
+
+ def failure(self, input):
+ self.assertRaises(ParseError, self.p.parseQuery, input)
+
+ def setUp(self):
+ self.p = QueryParser()
+
+ def testParseQuery(self):
+ self.expect("foo", AtomNode("foo"))
+ self.expect("note", AtomNode("note"))
+ self.expect("a and b AND c",
+ AndNode([AtomNode("a"), AtomNode("b"), AtomNode("c")]))
+ self.expect("a OR b or c",
+ OrNode([AtomNode("a"), AtomNode("b"), AtomNode("c")]))
+ self.expect("a AND b OR c AnD d",
+ OrNode([AndNode([AtomNode("a"), AtomNode("b")]),
+ AndNode([AtomNode("c"), AtomNode("d")])]))
+ self.expect("(a OR b) AND (c OR d)",
+ AndNode([OrNode([AtomNode("a"), AtomNode("b")]),
+ OrNode([AtomNode("c"), AtomNode("d")])]))
+ self.expect("a AND not b",
+ AndNode([AtomNode("a"), NotNode(AtomNode("b"))]))
+
+ self.expect('"foo bar"', PhraseNode("foo bar"))
+ self.expect("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")]))
+
+ self.expect('(("foo bar"))"', PhraseNode("foo bar"))
+ self.expect("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")]))
+
+ self.expect('and/', AtomNode("and"))
+
+ self.expect("foo-bar", PhraseNode("foo bar"))
+ self.expect("foo -bar", AndNode([AtomNode("foo"),
+ NotNode(AtomNode("bar"))]))
+ self.expect("-foo bar", AndNode([AtomNode("bar"),
+ NotNode(AtomNode("foo"))]))
+ self.expect("booh -foo-bar",
+ AndNode([AtomNode("booh"),
+ NotNode(PhraseNode("foo bar"))]))
+ self.expect('booh -"foo bar"',
+ AndNode([AtomNode("booh"),
+ NotNode(PhraseNode("foo bar"))]))
+ self.expect('foo"bar"',
+ AndNode([AtomNode("foo"), AtomNode("bar")]))
+ self.expect('"foo"bar',
+ AndNode([AtomNode("foo"), AtomNode("bar")]))
+ self.expect('foo"bar"blech',
+ AndNode([AtomNode("foo"), AtomNode("bar"),
+ AtomNode("blech")]))
+
+ self.expect("foo*", GlobNode("foo*"))
+ self.expect("foo* bar", AndNode([GlobNode("foo*"),
+ AtomNode("bar")]))
+
+ def testParseFailures(self):
+ self.failure("")
+ self.failure("not")
+ self.failure("OR")
+ self.failure("AND")
+ self.failure("not foo")
+ self.failure(")")
+ self.failure("(")
+ self.failure("foo OR")
+ self.failure("foo AND")
+ self.failure("OR foo")
+ self.failure("and foo")
+ self.failure("(foo) bar")
+ self.failure("(foo OR)")
+ self.failure("(foo AND)")
+ self.failure("(NOT foo)")
+ self.failure("-foo")
+ self.failure("-foo -bar")
+ self.failure('""')
+
+
+def test_suite():
+ return makeSuite(TestQueryParser)
+
+if __name__=="__main__":
+ main(defaultTest='test_suite')
=== Products/ZCTextIndex/tests/testZCTextIndex.py 1.1 => 1.2 ===
+from Products.ZCTextIndex.tests \
+ import testIndex, testQueryEngine, testQueryParser
+from Products.ZCTextIndex.Index import scaled_int, SCALE_FACTOR
+from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
+from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
+from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
+
+import unittest
+
+class Indexable:
+ def __init__(self, text):
+ self.text = text
+
+class LexiconHolder:
+ def __init__(self, lexicon):
+ self.lexicon = lexicon
+
+class Extra:
+ pass
+
+# The test classes below create a ZCTextIndex().  Then they create
+# instance variables that point to the internal components used by
+# ZCTextIndex. These tests run the individual module unit tests with
+# the fully integrated ZCTextIndex.
+
+def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
+ if abs(scaled1 - scaled2) > epsilon:
+ raise AssertionError, "%s != %s" % (scaled1, scaled2)
+
+class IndexTests(testIndex.IndexTest):
+
+ def setUp(self):
+ extra = Extra()
+ extra.doc_attr = 'text'
+ extra.lexicon_id = 'lexicon'
+ caller = LexiconHolder(Lexicon(Splitter(), CaseNormalizer(),
+ StopWordRemover()))
+ self.zc_index = ZCTextIndex('name', extra, caller)
+ self.index = self.zc_index.index
+ self.lexicon = self.zc_index.lexicon
+
+ def testStopWords(self):
+ # the only non-stopword is question
+ text = ("to be or not to be "
+ "that is the question")
+ doc = Indexable(text)
+ self.zc_index.index_object(1, doc)
+ for word in text.split():
+ if word != "question":
+ wids = self.lexicon.termToWordIds(word)
+ self.assertEqual(wids, [])
+ self.assertEqual(len(self.index._get_undoinfo(1)), 1)
+
+ def testRanking(self):
+        # A fairly involved test of the ranking calculations based on
+        # an example set of documents and queries from Managing
+        # Gigabytes, pp. 180-188.
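+        # The expected values below are consistent with the standard
+        # formulas (inferred from the numbers, not from the Index
+        # source): w_dt = 1 + ln(f_dt), so a word that occurs twice in
+        # a document weighs 1 + ln 2 ~= 1.7; w_t = ln(1 + N/f_t), so
+        # with N = 6 documents a word found in two of them weighs
+        # ln 4 ~= 1.39; and W_d = sqrt(sum(w_dt ** 2)).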
+ self.words = ["cold", "days", "eat", "hot", "lot", "nine", "old",
+ "pease", "porridge", "pot"]
+ self._ranking_index()
+ self._ranking_tf()
+ self._ranking_idf()
+ self._ranking_queries()
+
+ def _ranking_index(self):
+ docs = ["Pease porridge hot, pease porridge cold,",
+ "Pease porridge in the pot,",
+ "Nine days old.",
+ "In the pot cold, in the pot hot,",
+ "Pease porridge, pease porridge,",
+ "Eat the lot."]
+ for i in range(len(docs)):
+ self.zc_index.index_object(i + 1, Indexable(docs[i]))
+
+ def _ranking_tf(self):
+        # Matrix of term weights: rows are docids, columns are indexes
+        # into the self.words list:
+ l_wdt = [(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
+ (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0),
+ (0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0),
+ (1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7),
+ (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
+ (0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
+ l_Wd = [2.78, 1.73, 1.73, 2.21, 2.39, 1.41]
+
+ for i in range(len(l_Wd)):
+ docid = i + 1
+ scaled_Wd = scaled_int(l_Wd[i])
+ eq(scaled_Wd, self.index._get_Wd(docid))
+ wdts = [scaled_int(t) for t in l_wdt[i]]
+ for j in range(len(wdts)):
+ wdt = self.index._get_wdt(docid, self.words[j])
+ eq(wdts[j], wdt)
+
+ def _ranking_idf(self):
+ word_freqs = [2, 1, 1, 2, 1, 1, 1, 3, 3, 2]
+ idfs = [1.39, 1.95, 1.95, 1.39, 1.95, 1.95, 1.95, 1.10, 1.10, 1.39]
+ for i in range(len(self.words)):
+ word = self.words[i]
+ eq(word_freqs[i], self.index._get_ft(word))
+ eq(scaled_int(idfs[i]), self.index._get_wt(word))
+
+ def _ranking_queries(self):
+ queries = ["eat", "porridge", "hot OR porridge",
+ "eat OR nine OR day OR old OR porridge"]
+ wqs = [1.95, 1.10, 1.77, 3.55]
+ results = [[(6, 0.71)],
+ [(1, 0.61), (2, 0.58), (5, 0.71)],
+ [(1, 0.66), (2, 0.36), (4, 0.36), (5, 0.44)],
+ [(1, 0.19), (2, 0.18), (3, 0.63), (5, 0.22), (6, 0.39)]]
+ for i in range(len(queries)):
+ raw = queries[i]
+ q = self.zc_index.parser.parseQuery(raw)
+ wq = self.index.query_weight(q.terms())
+ eq(wq, scaled_int(wqs[i]))
+ r = self.zc_index.query(raw)
+ self.assertEqual(len(r), len(results[i]))
+            # convert the expected results to a dict for easier checking
+ d = {}
+ for doc, score in results[i]:
+ d[doc] = scaled_int(score)
+ for doc, score in r:
+                score = scaled_int(float(score) / SCALE_FACTOR / wq)
+ self.assert_(0 <= score <= SCALE_FACTOR)
+ eq(d[doc], score)
+
+class QueryTests(testQueryEngine.TestQueryEngine,
+ testQueryParser.TestQueryParser):
+
+ # The FauxIndex in testQueryEngine contains four documents.
+ # docid 1: foo, bar, ham
+ # docid 2: bar, ham
+ # docid 3: foo, ham
+ # docid 4: ham
+
+ docs = ["foo bar ham", "bar ham", "foo ham", "ham"]
+
+ def setUp(self):
+ extra = Extra()
+ extra.doc_attr = 'text'
+ extra.lexicon_id = 'lexicon'
+ caller = LexiconHolder(Lexicon(Splitter(), CaseNormalizer(),
+ StopWordRemover()))
+ self.zc_index = ZCTextIndex('name', extra, caller)
+ self.p = self.parser = self.zc_index.parser
+ self.index = self.zc_index.index
+ self.add_docs()
+
+ def add_docs(self):
+ for i in range(len(self.docs)):
+ text = self.docs[i]
+ obj = Indexable(text)
+ self.zc_index.index_object(i + 1, obj)
+
+ def compareSet(self, set, dict):
+ # XXX The FauxIndex and the real Index score documents very
+ # differently. The set comparison can't actually compare the
+ # items, but it can compare the keys. That will have to do for now.
+ d = {}
+ for k, v in set.items():
+ d[k] = v
+ self.assertEqual(d.keys(), dict.keys())
+
+
+def test_suite():
+ s = unittest.TestSuite()
+ for klass in IndexTests, QueryTests:
+ s.addTest(unittest.makeSuite(klass))
+ return s
+
+if __name__=='__main__':
+ unittest.main(defaultTest='test_suite')
=== Products/ZCTextIndex/tests/wordstats.py 1.1 => 1.2 ===
+"""Dump statistics about each word in the index.
+
+usage: wordstats.py data.fs [index key]
+"""
+
+import ZODB
+from ZODB.FileStorage import FileStorage
+
+def main(fspath, key):
+ fs = FileStorage(fspath, read_only=1)
+ db = ZODB.DB(fs)
+ rt = db.open().root()
+ index = rt[key]
+
+ lex = index.lexicon
+ idx = index.index
+ print "Words", lex.length()
+ print "Documents", idx.length()
+
+ print "Word frequencies: count, word, wid"
+ for word, wid in lex.items():
+ docs = idx._wordinfo[wid]
+ print len(docs), word, wid
+
+ print "Per-doc scores: wid, (doc, score,)+"
+ for wid in lex.wids():
+ print wid,
+ docs = idx._wordinfo[wid]
+ for docid, score in docs.items():
+ print docid, score,
+ print
+
+if __name__ == "__main__":
+ import sys
+
+ args = sys.argv[1:]
+ index_key = "index"
+ if len(args) == 1:
+ fspath = args[0]
+ elif len(args) == 2:
+ fspath, index_key = args
+    else:
+        print "Expected 1 or 2 args, got", len(args)
+        sys.exit(2)
+    main(fspath, index_key)