[Zope3-checkins] CVS: Zope3/src/zope/textindex/tests - __init__.py:1.2 hs-tool.py:1.2 indexhtml.py:1.2 mailtest.py:1.2 mhindex.py:1.2 queryhtml.py:1.2 test_index.py:1.2 test_lexicon.py:1.2 test_nbest.py:1.2 test_pipelinefactory.py:1.2 test_queryengine.py:1.2 test_queryparser.py:1.2 test_setops.py:1.2 test_textindexwrapper.py:1.2 wordstats.py:1.2
Jim Fulton
jim@zope.com
Wed, 25 Dec 2002 09:16:07 -0500
Update of /cvs-repository/Zope3/src/zope/textindex/tests
In directory cvs.zope.org:/tmp/cvs-serv20790/src/zope/textindex/tests
Added Files:
__init__.py hs-tool.py indexhtml.py mailtest.py mhindex.py
queryhtml.py test_index.py test_lexicon.py test_nbest.py
test_pipelinefactory.py test_queryengine.py
test_queryparser.py test_setops.py test_textindexwrapper.py
wordstats.py
Log Message:
Grand renaming:
- Renamed most files (especially python modules) to lower case.
- Moved views and interfaces into separate hierarchies within each
project, where each top-level directory under the zope package
is a separate project.
- Moved everything to src from lib/python.
lib/python will eventually go away. I need access to the cvs
repository to make this happen, however.
There are probably some bits that are broken. All tests pass
and zope runs, but I haven't tried everything. There are a number
of cleanups I'll work on tomorrow.
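For example, code that imported this package from the old mixed-case
lib/python tree now imports the lowercase modules from src (the old
dotted path below is a best-guess reconstruction of the pre-rename
layout, not taken from this checkin):

    # old layout (lib/python, mixed-case names):
    # from Zope.TextIndex.TextIndexWrapper import TextIndexWrapper

    # new layout (src, lowercase names):
    from zope.textindex.textindexwrapper import TextIndexWrapper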
=== Zope3/src/zope/textindex/tests/__init__.py 1.1 => 1.2 ===
--- /dev/null Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/__init__.py Wed Dec 25 09:15:35 2002
@@ -0,0 +1,2 @@
+#
+# This file is necessary to make this directory a package.
=== Zope3/src/zope/textindex/tests/hs-tool.py 1.1 => 1.2 ===
--- /dev/null Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/hs-tool.py Wed Dec 25 09:15:35 2002
@@ -0,0 +1,129 @@
+#! /usr/bin/env python
+"""Summarize a hotshot profiler log, line by line.
+
+usage: hs-tool.py [-A] [profile.dat]
+
+Prints the total time and hit count for each line recorded in the
+hotshot log (default: profile.dat).  With -A, annotates the source
+files with those numbers instead.  Parsed results are cached in a
+.hs-tool subdirectory next to the log file.
+"""
+
+import cPickle
+import os.path
+import sys
+
+from hotshot.log import LogReader
+
+def load_line_info(log):
+ byline = {}
+ prevloc = None
+ for what, place, tdelta in log:
+ if tdelta > 0:
+ t, nhits = byline.get(prevloc, (0, 0))
+ byline[prevloc] = (tdelta + t), (nhits + 1)
+ prevloc = place
+ return byline
+
+def basename(path, cache={}):
+ try:
+ return cache[path]
+ except KeyError:
+ fn = os.path.split(path)[1]
+ cache[path] = fn
+ return fn
+
+def print_results(results):
+ for info, place in results:
+ if place is None:
+ # This is the startup time for the profiler, and only
+ # occurs at the very beginning. Just ignore it, since it
+ # corresponds to frame setup of the outermost call, not
+ # anything that's actually interesting.
+ continue
+ filename, line, funcname = place
+ print '%8d %8d' % info, basename(filename), line
+
+def annotate_results(results):
+ files = {}
+ for stats, place in results:
+ if not place:
+ continue
+ time, hits = stats
+ file, line, func = place
+ l = files.get(file)
+ if l is None:
+ l = files[file] = []
+ l.append((line, hits, time))
+ order = files.keys()
+ order.sort()
+ for k in order:
+ if os.path.exists(k):
+ v = files[k]
+ v.sort()
+ annotate(k, v)
+
+def annotate(file, lines):
+ print "-" * 60
+ print file
+ print "-" * 60
+ f = open(file)
+ i = 1
+ match = lines[0][0]
+ for line in f:
+ if match == i:
+ print "%6d %8d " % lines[0][1:], line,
+ del lines[0]
+ if lines:
+ match = lines[0][0]
+ else:
+ match = None
+ else:
+ print " " * 16, line,
+ i += 1
+ print
+
+def get_cache_name(filename):
+ d, fn = os.path.split(filename)
+ cache_dir = os.path.join(d, '.hs-tool')
+ cache_file = os.path.join(cache_dir, fn)
+ return cache_dir, cache_file
+
+def cache_results(filename, results):
+ cache_dir, cache_file = get_cache_name(filename)
+ if not os.path.exists(cache_dir):
+ os.mkdir(cache_dir)
+ fp = open(cache_file, 'wb')
+ try:
+ cPickle.dump(results, fp, 1)
+ finally:
+ fp.close()
+
+def main(filename, annotate):
+ cache_dir, cache_file = get_cache_name(filename)
+
+ if ( os.path.isfile(cache_file)
+ and os.path.getmtime(cache_file) > os.path.getmtime(filename)):
+ # cached data is up-to-date:
+ fp = open(cache_file, 'rb')
+ results = cPickle.load(fp)
+ fp.close()
+ else:
+ log = LogReader(filename)
+ byline = load_line_info(log)
+ # Sort
+ results = [(v, k) for k, v in byline.items()]
+ results.sort()
+ cache_results(filename, results)
+
+ if annotate:
+ annotate_results(results)
+ else:
+ print_results(results)
+
+
+if __name__ == "__main__":
+ import getopt
+
+ annotate_p = 0
+ opts, args = getopt.getopt(sys.argv[1:], 'A')
+ for o, v in opts:
+ if o == '-A':
+ annotate_p = 1
+ if args:
+ filename, = args
+ else:
+ filename = "profile.dat"
+
+ main(filename, annotate_p)
=== Zope3/src/zope/textindex/tests/indexhtml.py 1.1 => 1.2 ===
--- /dev/null Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/indexhtml.py Wed Dec 25 09:15:35 2002
@@ -0,0 +1,154 @@
+#! /usr/bin/env python
+"""Index a collection of HTML files on the filesystem.
+
+usage: indexhtml.py [options] dir
+
+Will create an index of all files in dir or its subdirectories.
+
+options:
+-v         -- be more verbose (may be repeated)
+-f data.fs -- the path to the filestorage datafile
+-t NNN     -- commit a transaction every NNN documents (default: 100)
+-p NNN     -- pack the storage every NNN documents (default: 500)
+-n NNN     -- index at most NNN documents
+-T         -- use the old TextIndex instead of ZCTextIndex
+"""
+from __future__ import nested_scopes
+
+import os
+from time import clock
+
+import zodb
+from zodb.db import DB  # assumed home of DB after the zodb renaming
+from zodb.storage.file import FileStorage
+from zodb.btrees.IOBTree import IOBTree
+
+from zope.textindex.textindexwrapper import TextIndexWrapper
+from zope.textindex.htmlsplitter import HTMLWordSplitter
+from zope.textindex.lexicon import Lexicon, StopWordRemover
+# XXX this import is bad (see mailtest.py), but make_zc_index needs it
+from zope.textindex.zctextindex import ZCTextIndex
+
+def make_zc_index():
+ # there's an elaborate dance necessary to construct an index
+ class Struct:
+ pass
+ extra = Struct()
+ extra.doc_attr = "read"
+ extra.lexicon_id = "lexicon"
+ caller = Struct()
+ caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
+ return ZCTextIndex("read", extra, caller)
+
+# XXX make a splitter more like the HTMLSplitter for TextIndex
+# signature is
+# Splitter(string, stop_words, encoding,
+# singlechar, indexnumbers, casefolding)
+
+class MySplitter:
+ def __init__(self):
+ self._v_splitter = HTMLWordSplitter()
+ def __call__(self, text, stopdict, *args, **kwargs):
+ words = self._v_splitter._split(text)
+ def lookup(w):
+ return stopdict.get(w, w)
+ return filter(None, map(lookup, words))
+
+def make_old_index():
+ from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
+ from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
+ from zope.textindex.stopdict import get_stopdict
+
+ l = Lexicon(get_stopdict())
+ l.SplitterFunc = MySplitter()
+ return TextIndex("read", lexicon=l)
+
+def main(db, rt, dir):
+ rt["index"] = index = INDEX()
+ rt["files"] = paths = IOBTree()
+ get_transaction().commit()
+
+ zodb_time = 0.0
+ pack_time = 0.0
+
+ files = [os.path.join(dir, file) for file in os.listdir(dir)]
+ docid = 0
+ t0 = clock()
+ for file in files:
+ if os.path.isdir(file):
+ files += [os.path.join(file, sub) for sub in os.listdir(file)]
+ else:
+ if not file.endswith(".html"):
+ continue
+ docid += 1
+ if LIMIT is not None and docid > LIMIT:
+ break
+ if VERBOSE:
+ print "%5d" % docid, file
+ f = open(file, "rb")
+ paths[docid] = file
+ index.index_object(docid, f)
+ f.close()
+ if docid % TXN_INTERVAL == 0:
+ z0 = clock()
+ get_transaction().commit()
+ z1 = clock()
+ zodb_time += z1 - z0
+ if VERBOSE:
+ print "commit took", z1 - z0, zodb_time
+ if docid % PACK_INTERVAL == 0:
+ p0 = clock()
+ db.pack()
+ p1 = clock()
+ zodb_time += p1 - p0
+ pack_time += p1 - p0
+ if VERBOSE:
+ print "pack took", p1 - p0, pack_time
+ z0 = clock()
+ get_transaction().commit()
+ z1 = t1 = clock()
+ total_time = t1 - t0
+ zodb_time += z1 - z0
+ if VERBOSE:
+ print "Total index time", total_time
+ print "Non-pack time", total_time - pack_time
+ print "Non-ZODB time", total_time - zodb_time
+
+if __name__ == "__main__":
+ import sys
+ import getopt
+
+ VERBOSE = 0
+ FSPATH = "Data.fs"
+ TXN_INTERVAL = 100
+ PACK_INTERVAL = 500
+ LIMIT = None
+ INDEX = make_zc_index
+ try:
+ opts, args = getopt.getopt(sys.argv[1:], 'vf:t:p:n:T')
+ except getopt.error, msg:
+ print msg
+ print __doc__
+ sys.exit(2)
+
+ for o, v in opts:
+ if o == '-v':
+ VERBOSE += 1
+ if o == '-f':
+ FSPATH = v
+ if o == '-t':
+ TXN_INTERVAL = int(v)
+ if o == '-p':
+ PACK_INTERVAL = int(v)
+ if o == '-n':
+ LIMIT = int(v)
+ if o == '-T':
+ INDEX = make_old_index
+
+ if len(args) != 1:
+        print "Expected one argument"
+ print __doc__
+ sys.exit(2)
+ dir = args[0]
+
+ fs = FileStorage(FSPATH)
+    db = DB(fs)
+ cn = db.open()
+ rt = cn.root()
+ dir = os.path.join(os.getcwd(), dir)
+ print dir
+ main(db, rt, dir)
+ cn.close()
+ fs.close()
=== Zope3/src/zope/textindex/tests/mailtest.py 1.1 => 1.2 ===
--- /dev/null Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/mailtest.py Wed Dec 25 09:15:35 2002
@@ -0,0 +1,290 @@
+"""Test an index with a Unix mailbox file.
+
+usage: python mailtest.py [options] <data.fs>
+
+options:
+ -v -- verbose
+
+ Index Generation
+ -i mailbox
+ -n NNN -- max number of messages to read from mailbox
+ -t NNN -- commit a transaction every NNN messages (default: 1)
+ -p NNN -- pack <data.fs> every NNN messages (default: 500), and at end
+ -p 0 -- don't pack at all
+ -x -- exclude the message text from the data.fs
+
+ Queries
+ -q query
+ -b NNN -- return the NNN best matches (default: 10)
+ -c NNN -- context; if -v, show the first NNN lines of results (default: 5)
+
+The script either indexes or queries depending on whether -q or -i is
+passed as an option.
+
+For -i mailbox, the script reads mail messages from the mailbox and
+indexes them. It indexes one message at a time, then commits the
+transaction.
+
+For -q query, it performs a query on an existing index.
+
+If both are specified, indexing is performed first.
+
+You can also interact with the index after it is completed. Load the
+index from the database:
+
+    from zodb.db import DB
+    from zodb.storage.file import FileStorage
+    fs = FileStorage(<data.fs>)
+    db = DB(fs)
+    index = db.open().root()["index"]
+    index.search("python AND unicode")
+"""
+
+import zodb
+import zodb.storage.file
+from zodb.db import DB  # assumed home of DB after the zodb renaming
+from zope.textindex.lexicon import \
+ Lexicon, CaseNormalizer, Splitter, StopWordRemover
+
+# XXX This import is bad, and was so before the renaming
+from zope.textindex.zctextindex import ZCTextIndex
+
+from zodb.btrees.IOBTree import IOBTree
+from zope.textindex.queryparser import QueryParser
+
+import sys
+import mailbox
+import time
+
+def usage(msg):
+ print msg
+ print __doc__
+ sys.exit(2)
+
+class Message:
+
+ total_bytes = 0
+
+ def __init__(self, msg):
+ subject = msg.getheader('subject', '')
+ author = msg.getheader('from', '')
+ if author:
+ summary = "%s (%s)\n" % (subject, author)
+ else:
+ summary = "%s\n" % subject
+ self.text = summary + msg.fp.read()
+ Message.total_bytes += len(self.text)
+
+class Extra:
+ pass
+
+def index(rt, mboxfile, db, profiler):
+ global NUM
+ idx_time = 0
+ pack_time = 0
+ start_time = time.time()
+
+ lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
+ extra = Extra()
+ extra.lexicon_id = 'lexicon'
+ extra.doc_attr = 'text'
+ extra.index_type = 'Okapi BM25 Rank'
+ caller = Extra()
+ caller.lexicon = lexicon
+ rt["index"] = idx = ZCTextIndex("index", extra, caller)
+ if not EXCLUDE_TEXT:
+ rt["documents"] = docs = IOBTree()
+ else:
+ docs = None
+ get_transaction().commit()
+
+ mbox = mailbox.UnixMailbox(open(mboxfile, 'rb'))
+ if VERBOSE:
+ print "opened", mboxfile
+ if not NUM:
+ NUM = sys.maxint
+
+ if profiler:
+ itime, ptime, i = profiler.runcall(indexmbox, mbox, idx, docs, db)
+ else:
+ itime, ptime, i = indexmbox(mbox, idx, docs, db)
+ idx_time += itime
+ pack_time += ptime
+
+ get_transaction().commit()
+
+ if PACK_INTERVAL and i % PACK_INTERVAL != 0:
+ if VERBOSE >= 2:
+ print "packing one last time..."
+ p0 = time.clock()
+ db.pack(time.time())
+ p1 = time.clock()
+ if VERBOSE:
+ print "pack took %s sec" % (p1 - p0)
+ pack_time += p1 - p0
+
+ if VERBOSE:
+ finish_time = time.time()
+ print
+ print "Index time", round(idx_time / 60, 3), "minutes"
+ print "Pack time", round(pack_time / 60, 3), "minutes"
+ print "Index bytes", Message.total_bytes
+ rate = (Message.total_bytes / idx_time) / 1024
+ print "Index rate %.2f KB/sec" % rate
+ print "Indexing began", time.ctime(start_time)
+ print "Indexing ended", time.ctime(finish_time)
+ print "Wall clock minutes", round((finish_time - start_time)/60, 3)
+
+def indexmbox(mbox, idx, docs, db):
+ idx_time = 0
+ pack_time = 0
+ i = 0
+ while i < NUM:
+ _msg = mbox.next()
+ if _msg is None:
+ break
+ i += 1
+ msg = Message(_msg)
+ if VERBOSE >= 2:
+ print "indexing msg", i
+ i0 = time.clock()
+ idx.index_object(i, msg)
+ if not EXCLUDE_TEXT:
+ docs[i] = msg
+ if i % TXN_SIZE == 0:
+ get_transaction().commit()
+ i1 = time.clock()
+ idx_time += i1 - i0
+ if VERBOSE and i % 50 == 0:
+ print i, "messages indexed"
+ print "cache size", db.cacheSize()
+ if PACK_INTERVAL and i % PACK_INTERVAL == 0:
+ if VERBOSE >= 2:
+ print "packing..."
+ p0 = time.clock()
+ db.pack(time.time())
+ p1 = time.clock()
+ if VERBOSE:
+ print "pack took %s sec" % (p1 - p0)
+ pack_time += p1 - p0
+ return idx_time, pack_time, i
+
+
+def query(rt, query_str, profiler):
+ idx = rt["index"]
+    docs = rt.get("documents")  # absent when indexed with -x
+
+ start = time.clock()
+ if profiler is None:
+ results, num_results = idx.query(query_str, BEST)
+ else:
+ if WARM_CACHE:
+ print "Warming the cache..."
+ idx.query(query_str, BEST)
+ start = time.clock()
+ results, num_results = profiler.runcall(idx.query, query_str, BEST)
+ elapsed = time.clock() - start
+
+ print "query:", query_str
+ print "# results:", len(results), "of", num_results, \
+ "in %.2f ms" % (elapsed * 1000)
+
+ tree = QueryParser(idx.lexicon).parseQuery(query_str)
+ qw = idx.index.query_weight(tree.terms())
+
+ for docid, score in results:
+ scaled = 100.0 * score / qw
+ print "docid %7d score %6d scaled %5.2f%%" % (docid, score, scaled)
+ if VERBOSE:
+ msg = docs[docid]
+ ctx = msg.text.split("\n", CONTEXT)
+ del ctx[-1]
+ print "-" * 60
+ print "message:"
+ for l in ctx:
+ print l
+ print "-" * 60
+
+
+def main(fs_path, mbox_path, query_str, profiler):
+    f = zodb.storage.file.FileStorage(fs_path)
+    db = DB(f, cache_size=CACHE_SIZE)
+ cn = db.open()
+ rt = cn.root()
+
+ if mbox_path is not None:
+ index(rt, mbox_path, db, profiler)
+ if query_str is not None:
+ query(rt, query_str, profiler)
+
+ cn.close()
+ db.close()
+ f.close()
+
+if __name__ == "__main__":
+ import getopt
+
+ NUM = 0
+ VERBOSE = 0
+ PACK_INTERVAL = 500
+ EXCLUDE_TEXT = 0
+ CACHE_SIZE = 10000
+ TXN_SIZE = 1
+ BEST = 10
+ CONTEXT = 5
+ WARM_CACHE = 0
+ query_str = None
+ mbox_path = None
+ profile = None
+ old_profile = None
+ try:
+ opts, args = getopt.getopt(sys.argv[1:], 'vn:p:i:q:b:c:xt:w',
+ ['profile=', 'old-profile='])
+ except getopt.error, msg:
+ usage(msg)
+ if len(args) != 1:
+ usage("exactly 1 filename argument required")
+ for o, v in opts:
+ if o == '-n':
+ NUM = int(v)
+ elif o == '-v':
+ VERBOSE += 1
+ elif o == '-p':
+ PACK_INTERVAL = int(v)
+ elif o == '-q':
+ query_str = v
+ elif o == '-i':
+ mbox_path = v
+ elif o == '-b':
+ BEST = int(v)
+ elif o == '-x':
+ EXCLUDE_TEXT = 1
+ elif o == '-t':
+ TXN_SIZE = int(v)
+ elif o == '-c':
+ CONTEXT = int(v)
+ elif o == '-w':
+ WARM_CACHE = 1
+ elif o == '--profile':
+ profile = v
+ elif o == '--old-profile':
+ old_profile = v
+ fs_path, = args
+
+ if profile:
+ import hotshot
+ profiler = hotshot.Profile(profile, lineevents=1, linetimings=1)
+    elif old_profile:
+        # bind the module to a private name so it doesn't shadow the
+        # 'profile' option variable, which is tested again below
+        import profile as _profile
+        profiler = _profile.Profile()
+ else:
+ profiler = None
+
+ main(fs_path, mbox_path, query_str, profiler)
+
+ if profile:
+ profiler.close()
+ elif old_profile:
+ import pstats
+ profiler.dump_stats(old_profile)
+ stats = pstats.Stats(old_profile)
+ stats.strip_dirs().sort_stats('time').print_stats(20)
=== Zope3/src/zope/textindex/tests/mhindex.py 1.1 => 1.2 === (478/578 lines abridged)
--- /dev/null Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/mhindex.py Wed Dec 25 09:15:35 2002
@@ -0,0 +1,575 @@
+#! /usr/bin/env python2.2
+
+"""MH mail indexer.
+
+To index messages from a single folder (messages defaults to 'all'):
+ mhindex.py [options] -u +folder [messages ...]
+
+To bulk index all messages from several folders:
+ mhindex.py [options] -b folder ...; the folder name ALL means all folders.
+
+To execute a single query:
+ mhindex.py [options] query
+
+To enter interactive query mode:
+ mhindex.py [options]
+
+Common options:
+  -d FILE -- specify the Data.fs to use (default ~/.mhindex.fs)
+ -w -- dump the word list in alphabetical order and exit
+ -W -- dump the word list ordered by word id and exit
+
+Indexing options:
+ -O -- do a prescan on the data to compute optimal word id assignments;
+ this is only useful the first time the Data.fs is used
+ -t N -- commit a transaction after every N messages (default 20000)
+ -p N -- pack after every N commits (by default no packing is done)
+
+Querying options:
+ -m N -- show at most N matching lines from the message (default 3)
+ -n N -- show the N best matching messages (default 3)
+"""
+
+import os
+import re
+import sys
+import time
+import mhlib
+import getopt
+import traceback
+from StringIO import StringIO
+from stat import ST_MTIME
+
+DATAFS = "~/.mhindex.fs"
+ZOPECODE = "~/projects/Zope3/src"
+
+zopecode = os.path.expanduser(ZOPECODE)
+sys.path.insert(0, zopecode)
[-=- -=- -=- 478 lines omitted -=- -=- -=-]
+ if value:
+ H.append(value)
+ if H:
+ L.append("\n".join(H))
+
+ def newdocid(self, path):
+ docid = self.path2docid.get(path)
+ if docid is not None:
+ self.doctimes[docid] = self.getmtime(path)
+ return docid
+ docid = self.maxdocid + 1
+ self.maxdocid = docid
+ self.docpaths[docid] = path
+ self.doctimes[docid] = self.getmtime(path)
+ self.path2docid[path] = docid
+ return docid
+
+ def getmtime(self, path):
+ path = os.path.join(self.mh.getpath(), path)
+ try:
+ st = os.stat(path)
+ except os.error, msg:
+ return 0
+ return int(st[ST_MTIME])
+
+ def maycommit(self):
+ self.trans_count += 1
+ if self.trans_count >= self.trans_limit > 0:
+ self.commit()
+
+ def commit(self):
+ if self.trans_count > 0:
+ print "committing..."
+ get_transaction().commit()
+ self.trans_count = 0
+ self.pack_count += 1
+ if self.pack_count >= self.pack_limit > 0:
+ self.pack()
+
+ def pack(self):
+ if self.pack_count > 0:
+ print "packing..."
+ self.database.pack()
+ self.pack_count = 0
+
+def reportexc():
+ traceback.print_exc()
+
+if __name__ == "__main__":
+ sys.exit(main())
=== Zope3/src/zope/textindex/tests/queryhtml.py 1.1 => 1.2 ===
--- /dev/null Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/queryhtml.py Wed Dec 25 09:15:35 2002
@@ -0,0 +1,116 @@
+"""Run a set of canned queries against an index built by indexhtml.py.
+
+usage: queryhtml.py [options]
+
+options:
+-v         -- verbose
+-f data.fs -- the path to the filestorage datafile (default: Data.fs)
+-T         -- query an old TextIndex instead of ZCTextIndex
+"""
+import os
+from time import clock
+
+import zodb
+from zodb.db import DB  # assumed home of DB after the zodb renaming
+from zodb.storage.file import FileStorage
+
+QUERIES = ["nested recursive functions",
+ "explicit better than implicit",
+ "build hpux",
+ "cannot create 'method-wrapper' instances",
+ "extension module C++",
+ "class method",
+ "instance variable",
+ "articulate information",
+ "import default files",
+ "gopher ftp http",
+ "documentation",
+ ]
+
+def path2url(p):
+ # convert the paths to a python.org URL
+ # hack: only works for the way Jeremy indexed his copy of python.org
+ marker = "www.python.org/."
+ i = p.find(marker)
+ if i == -1:
+ return p
+ i += len(marker)
+ return "http://www.python.org" + p[i:]
+
+from Products.PluginIndexes.TextIndex.TextIndex import And, Or
+from zope.textindex.tests.indexhtml import MySplitter
+from zope.textindex.nbest import NBest
+
+def main(rt):
+ index = rt["index"]
+ files = rt["files"]
+ times = {}
+ ITERS = range(50)
+ for i in range(11):
+ for q in QUERIES:
+ terms = q.split()
+ for c in " OR ", " AND ":
+ query = c.join(terms)
+ t0 = clock()
+ if TEXTINDEX:
+ if c == " OR ":
+ op = Or
+ else:
+ op = And
+ _q = " ".join(terms)
+ for _ in ITERS:
+ b = index.query(_q, op).bucket()
+ num = len(b)
+ chooser = NBest(10)
+ chooser.addmany(b.items())
+ results = chooser.getbest()
+
+ else:
+ try:
+ for _ in ITERS:
+ results, num = index.query(query)
+ except:
+ continue
+ t1 = clock()
+ print "<p>Query: \"%s\"" % query
+ print "<br>Num results: %d" % num
+ print "<br>time.clock(): %s" % (t1 - t0)
+ key = query
+ if i == 0:
+ print "<ol>"
+ for docid, score in results:
+ url = path2url(files[docid])
+ fmt = '<li><a href="%s">%s</A> score = %s'
+ print fmt % (url, url, score)
+ print "</ol>"
+ continue
+ l = times.setdefault(key, [])
+ l.append(t1 - t0)
+
+ l = times.keys()
+ l.sort()
+ print "<hr>"
+ for k in l:
+ v = times[k]
+ print "<p>Query: \"%s\"" % k
+ print "<br>Min time: %s" % min(v)
+ print "<br>All times: %s" % " ".join(map(str, v))
+
+if __name__ == "__main__":
+ import sys
+ import getopt
+
+ VERBOSE = 0
+ FSPATH = "Data.fs"
+ TEXTINDEX = 0
+
+ try:
+ opts, args = getopt.getopt(sys.argv[1:], 'vf:T')
+ except getopt.error, msg:
+ print msg
+ print __doc__
+ sys.exit(2)
+
+ for o, v in opts:
+ if o == '-v':
+ VERBOSE += 1
+ if o == '-f':
+ FSPATH = v
+ if o == '-T':
+ TEXTINDEX = 1
+
+ fs = FileStorage(FSPATH, read_only=1)
+    db = DB(fs, cache_size=10000)
+ cn = db.open()
+ rt = cn.root()
+ main(rt)
=== Zope3/src/zope/textindex/tests/test_index.py 1.1 => 1.2 ===
--- /dev/null Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_index.py Wed Dec 25 09:15:35 2002
@@ -0,0 +1,157 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+from unittest import TestCase, TestSuite, main, makeSuite
+
+from zope.textindex.lexicon import Lexicon, Splitter
+from zope.textindex.cosineindex import CosineIndex
+from zope.textindex.okapiindex import OkapiIndex
+
+# Subclasses must set a class variable IndexFactory to the appropriate
+# index object constructor.
+
+class IndexTest(TestCase):
+
+ def setUp(self):
+ self.lexicon = Lexicon(Splitter())
+ self.index = self.IndexFactory(self.lexicon)
+
+ def test_index_document(self, DOCID=1):
+ doc = "simple document contains five words"
+ self.assert_(not self.index.has_doc(DOCID))
+ self.index.index_doc(DOCID, doc)
+ self.assertEqual(self.index.documentCount(), 1)
+ self.assertEqual(self.index.wordCount(), 5)
+ self.assertEqual(self.lexicon.wordCount(), 5)
+ self.assert_(self.index.has_doc(DOCID))
+ self.assert_(self.index._docweight[DOCID])
+ self.assertEqual(len(self.index._docweight), 1)
+ self.assertEqual(len(self.index._wordinfo), 5)
+ self.assertEqual(len(self.index._docwords), 1)
+ self.assertEqual(len(self.index.get_words(DOCID)), 5)
+ self.assertEqual(len(self.index._wordinfo),
+ self.index.wordCount())
+ for map in self.index._wordinfo.values():
+ self.assertEqual(len(map), 1)
+ self.assert_(map.has_key(DOCID))
+
+ def test_unindex_document(self):
+ DOCID = 1
+ self.test_index_document(DOCID)
+ self.index.unindex_doc(DOCID)
+ self.assertEqual(len(self.index._docweight), 0)
+ self.assertEqual(len(self.index._wordinfo), 0)
+ self.assertEqual(len(self.index._docwords), 0)
+ self.assertEqual(len(self.index._wordinfo),
+ self.index.wordCount())
+
+ def test_index_two_documents(self):
+ self.test_index_document()
+ doc = "another document just four"
+ DOCID = 2
+ self.index.index_doc(DOCID, doc)
+ self.assert_(self.index._docweight[DOCID])
+ self.assertEqual(len(self.index._docweight), 2)
+ self.assertEqual(len(self.index._wordinfo), 8)
+ self.assertEqual(len(self.index._docwords), 2)
+ self.assertEqual(len(self.index.get_words(DOCID)), 4)
+ self.assertEqual(len(self.index._wordinfo),
+ self.index.wordCount())
+ wids = self.lexicon.termToWordIds("document")
+ self.assertEqual(len(wids), 1)
+ document_wid = wids[0]
+ for wid, map in self.index._wordinfo.items():
+ if wid == document_wid:
+ self.assertEqual(len(map), 2)
+ self.assert_(map.has_key(1))
+ self.assert_(map.has_key(DOCID))
+ else:
+ self.assertEqual(len(map), 1)
+
+ def test_index_two_unindex_one(self):
+ # index two documents, unindex one, and test the results
+ self.test_index_two_documents()
+ self.index.unindex_doc(1)
+ DOCID = 2
+ self.assertEqual(len(self.index._docweight), 1)
+ self.assert_(self.index._docweight[DOCID])
+ self.assertEqual(len(self.index._wordinfo), 4)
+ self.assertEqual(len(self.index._docwords), 1)
+ self.assertEqual(len(self.index.get_words(DOCID)), 4)
+ self.assertEqual(len(self.index._wordinfo),
+ self.index.wordCount())
+ for map in self.index._wordinfo.values():
+ self.assertEqual(len(map), 1)
+ self.assert_(map.has_key(DOCID))
+
+ def test_index_duplicated_words(self, DOCID=1):
+ doc = "very simple repeat repeat repeat document test"
+ self.index.index_doc(DOCID, doc)
+ self.assert_(self.index._docweight[DOCID])
+ self.assertEqual(len(self.index._wordinfo), 5)
+ self.assertEqual(len(self.index._docwords), 1)
+ self.assertEqual(len(self.index.get_words(DOCID)), 7)
+ self.assertEqual(len(self.index._wordinfo),
+ self.index.wordCount())
+ wids = self.lexicon.termToWordIds("repeat")
+ self.assertEqual(len(wids), 1)
+        repetitive_wid = wids[0]
+ for wid, map in self.index._wordinfo.items():
+ self.assertEqual(len(map), 1)
+ self.assert_(map.has_key(DOCID))
+
+ def test_simple_query_oneresult(self):
+ self.index.index_doc(1, 'not the same document')
+ results = self.index.search("document")
+ self.assertEqual(list(results.keys()), [1])
+
+ def test_simple_query_noresults(self):
+ self.index.index_doc(1, 'not the same document')
+ results = self.index.search("frobnicate")
+ self.assertEqual(list(results.keys()), [])
+
+ def test_query_oneresult(self):
+ self.index.index_doc(1, 'not the same document')
+ self.index.index_doc(2, 'something about something else')
+ results = self.index.search("document")
+ self.assertEqual(list(results.keys()), [1])
+
+ def test_search_phrase(self):
+ self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
+ self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
+ results = self.index.search_phrase("quick brown fox")
+ self.assertEqual(list(results.keys()), [1])
+
+ def test_search_glob(self):
+ self.index.index_doc(1, "how now brown cow")
+ self.index.index_doc(2, "hough nough browne cough")
+ self.index.index_doc(3, "bar brawl")
+ results = self.index.search_glob("bro*")
+ self.assertEqual(list(results.keys()), [1, 2])
+ results = self.index.search_glob("b*")
+ self.assertEqual(list(results.keys()), [1, 2, 3])
+
+class CosineIndexTest(IndexTest):
+ IndexFactory = CosineIndex
+
+class OkapiIndexTest(IndexTest):
+ IndexFactory = OkapiIndex
+
+def test_suite():
+ return TestSuite((makeSuite(CosineIndexTest),
+ makeSuite(OkapiIndexTest),
+ ))
+
+if __name__=='__main__':
+ main(defaultTest='test_suite')
=== Zope3/src/zope/textindex/tests/test_lexicon.py 1.1 => 1.2 ===
--- /dev/null Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_lexicon.py Wed Dec 25 09:15:35 2002
@@ -0,0 +1,142 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+import sys
+from unittest import TestCase, TestSuite, main, makeSuite
+
+from zope.textindex.lexicon import Lexicon
+from zope.textindex.lexicon import Splitter, CaseNormalizer
+
+class StupidPipelineElement:
+ def __init__(self, fromword, toword):
+ self.__fromword = fromword
+ self.__toword = toword
+
+ def process(self, seq):
+ res = []
+ for term in seq:
+ if term == self.__fromword:
+ res.append(self.__toword)
+ else:
+ res.append(term)
+ return res
+
+class WackyReversePipelineElement:
+ def __init__(self, revword):
+ self.__revword = revword
+
+ def process(self, seq):
+ res = []
+ for term in seq:
+ if term == self.__revword:
+ x = list(term)
+ x.reverse()
+ res.append(''.join(x))
+ else:
+ res.append(term)
+ return res
+
+class StopWordPipelineElement:
+ def __init__(self, stopdict={}):
+ self.__stopdict = stopdict
+
+ def process(self, seq):
+ res = []
+ for term in seq:
+ if self.__stopdict.get(term):
+ continue
+ else:
+ res.append(term)
+ return res
+
+
+class Test(TestCase):
+ def testSourceToWordIds(self):
+ lexicon = Lexicon(Splitter())
+ wids = lexicon.sourceToWordIds('cats and dogs')
+ self.assertEqual(wids, [1, 2, 3])
+
+ def testTermToWordIds(self):
+ lexicon = Lexicon(Splitter())
+ wids = lexicon.sourceToWordIds('cats and dogs')
+ wids = lexicon.termToWordIds('dogs')
+ self.assertEqual(wids, [3])
+
+ def testMissingTermToWordIds(self):
+ lexicon = Lexicon(Splitter())
+ wids = lexicon.sourceToWordIds('cats and dogs')
+ wids = lexicon.termToWordIds('boxes')
+ self.assertEqual(wids, [0])
+
+ def testOnePipelineElement(self):
+ lexicon = Lexicon(Splitter(), StupidPipelineElement('dogs', 'fish'))
+ wids = lexicon.sourceToWordIds('cats and dogs')
+ wids = lexicon.termToWordIds('fish')
+ self.assertEqual(wids, [3])
+
+ def testSplitterAdaptorFold(self):
+ lexicon = Lexicon(Splitter(), CaseNormalizer())
+ wids = lexicon.sourceToWordIds('CATS and dogs')
+ wids = lexicon.termToWordIds('cats and dogs')
+ self.assertEqual(wids, [1, 2, 3])
+
+ def testSplitterAdaptorNofold(self):
+ lexicon = Lexicon(Splitter())
+ wids = lexicon.sourceToWordIds('CATS and dogs')
+ wids = lexicon.termToWordIds('cats and dogs')
+ self.assertEqual(wids, [0, 2, 3])
+
+ def testTwoElementPipeline(self):
+ lexicon = Lexicon(Splitter(),
+ StupidPipelineElement('cats', 'fish'),
+ WackyReversePipelineElement('fish'))
+ wids = lexicon.sourceToWordIds('cats and dogs')
+ wids = lexicon.termToWordIds('hsif')
+ self.assertEqual(wids, [1])
+
+ def testThreeElementPipeline(self):
+ lexicon = Lexicon(Splitter(),
+ StopWordPipelineElement({'and':1}),
+ StupidPipelineElement('dogs', 'fish'),
+ WackyReversePipelineElement('fish'))
+ wids = lexicon.sourceToWordIds('cats and dogs')
+ wids = lexicon.termToWordIds('hsif')
+ self.assertEqual(wids, [2])
+
+ def testSplitterLocaleAwareness(self):
+ from zope.textindex.htmlsplitter import HTMLWordSplitter
+ import locale
+ loc = locale.setlocale(locale.LC_ALL) # get current locale
+ # set German locale
+ try:
+ if sys.platform != 'win32':
+ locale.setlocale(locale.LC_ALL, 'de_DE.ISO8859-1')
+ else:
+ locale.setlocale(locale.LC_ALL, 'German_Germany.1252')
+ except locale.Error:
+ return # This test doesn't work here :-(
+ expected = ['m\xfclltonne', 'waschb\xe4r',
+ 'beh\xf6rde', '\xfcberflieger']
+ words = [" ".join(expected)]
+ words = Splitter().process(words)
+ self.assertEqual(words, expected)
+ words = HTMLWordSplitter().process(words)
+ self.assertEqual(words, expected)
+ locale.setlocale(locale.LC_ALL, loc) # restore saved locale
+
+def test_suite():
+ return makeSuite(Test)
+
+if __name__=='__main__':
+ main(defaultTest='test_suite')
=== Zope3/src/zope/textindex/tests/test_nbest.py 1.1 => 1.2 ===
--- /dev/null Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_nbest.py Wed Dec 25 09:15:35 2002
@@ -0,0 +1,97 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+from unittest import TestCase, TestSuite, main, makeSuite
+
+from zope.textindex.nbest import NBest
+
+class NBestTest(TestCase):
+
+ def testConstructor(self):
+ self.assertRaises(ValueError, NBest, 0)
+ self.assertRaises(ValueError, NBest, -1)
+
+ for n in range(1, 11):
+ nb = NBest(n)
+ self.assertEqual(len(nb), 0)
+ self.assertEqual(nb.capacity(), n)
+
+ def testOne(self):
+ nb = NBest(1)
+ nb.add('a', 0)
+ self.assertEqual(nb.getbest(), [('a', 0)])
+
+ nb.add('b', 1)
+ self.assertEqual(len(nb), 1)
+ self.assertEqual(nb.capacity(), 1)
+ self.assertEqual(nb.getbest(), [('b', 1)])
+
+ nb.add('c', -1)
+ self.assertEqual(len(nb), 1)
+ self.assertEqual(nb.capacity(), 1)
+ self.assertEqual(nb.getbest(), [('b', 1)])
+
+ nb.addmany([('d', 3), ('e', -6), ('f', 5), ('g', 4)])
+ self.assertEqual(len(nb), 1)
+ self.assertEqual(nb.capacity(), 1)
+ self.assertEqual(nb.getbest(), [('f', 5)])
+
+ def testMany(self):
+ import random
+ inputs = [(-i, i) for i in range(50)]
+
+ reversed_inputs = inputs[:]
+ reversed_inputs.reverse()
+
+ # Test the N-best for a variety of n (1, 6, 11, ... 50).
+ for n in range(1, len(inputs)+1, 5):
+ expected = inputs[-n:]
+ expected.reverse()
+
+ random_inputs = inputs[:]
+ random.shuffle(random_inputs)
+
+ for source in inputs, reversed_inputs, random_inputs:
+ # Try feeding them one at a time.
+ nb = NBest(n)
+ for item, score in source:
+ nb.add(item, score)
+ self.assertEqual(len(nb), n)
+ self.assertEqual(nb.capacity(), n)
+ self.assertEqual(nb.getbest(), expected)
+
+ # And again in one gulp.
+ nb = NBest(n)
+ nb.addmany(source)
+ self.assertEqual(len(nb), n)
+ self.assertEqual(nb.capacity(), n)
+ self.assertEqual(nb.getbest(), expected)
+
+ for i in range(1, n+1):
+ self.assertEqual(nb.pop_smallest(), expected[-i])
+ self.assertRaises(IndexError, nb.pop_smallest)
+
+ def testAllSameScore(self):
+ inputs = [(i, 0) for i in range(10)]
+ for n in range(1, 12):
+ nb = NBest(n)
+ nb.addmany(inputs)
+ outputs = nb.getbest()
+ self.assertEqual(outputs, inputs[:len(outputs)])
+
+def test_suite():
+ return makeSuite(NBestTest)
+
+if __name__=='__main__':
+ main(defaultTest='test_suite')
=== Zope3/src/zope/textindex/tests/test_pipelinefactory.py 1.1 => 1.2 ===
--- /dev/null Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_pipelinefactory.py Wed Dec 25 09:15:35 2002
@@ -0,0 +1,50 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+from unittest import TestCase, TestSuite, main, makeSuite
+from zope.textindex.ipipelineelement import IPipelineElement
+from zope.textindex.pipelinefactory import PipelineElementFactory
+
+class NullPipelineElement:
+
+ __implements__ = IPipelineElement
+
+    def process(self, source):
+ pass
+
+class PipelineFactoryTest(TestCase):
+
+ def setUp(self):
+ self.huey = NullPipelineElement()
+ self.dooey = NullPipelineElement()
+ self.louie = NullPipelineElement()
+ self.daffy = NullPipelineElement()
+
+ def testPipeline(self):
+ pf = PipelineElementFactory()
+ pf.registerFactory('donald', 'huey', self.huey)
+ pf.registerFactory('donald', 'dooey', self.dooey)
+ pf.registerFactory('donald', 'louie', self.louie)
+ pf.registerFactory('looney', 'daffy', self.daffy)
+ self.assertRaises(ValueError, pf.registerFactory,'donald', 'huey',
+ self.huey)
+ self.assertEqual(pf.getFactoryGroups(), ['donald', 'looney'])
+ self.assertEqual(pf.getFactoryNames('donald'),
+ ['dooey', 'huey', 'louie'])
+
+def test_suite():
+ return makeSuite(PipelineFactoryTest)
+
+if __name__=='__main__':
+ main(defaultTest='test_suite')
=== Zope3/src/zope/textindex/tests/test_queryengine.py 1.1 => 1.2 ===
--- /dev/null Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_queryengine.py Wed Dec 25 09:15:35 2002
@@ -0,0 +1,72 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+import unittest
+
+from zodb.btrees.IIBTree import IIBucket
+
+from zope.textindex.queryparser import QueryParser
+from zope.textindex.parsetree import ParseError, QueryError
+from zope.textindex.lexicon import Lexicon, Splitter
+
+class FauxIndex:
+
+ def search(self, term):
+ b = IIBucket()
+ if term == "foo":
+ b[1] = b[3] = 1
+ elif term == "bar":
+ b[1] = b[2] = 1
+ elif term == "ham":
+ b[1] = b[2] = b[3] = b[4] = 1
+ return b
+
+class TestQueryEngine(unittest.TestCase):
+
+ def setUp(self):
+ self.lexicon = Lexicon(Splitter())
+ self.parser = QueryParser(self.lexicon)
+ self.index = FauxIndex()
+
+ def compareSet(self, set, dict):
+ d = {}
+ for k, v in set.items():
+ d[k] = v
+ self.assertEqual(d, dict)
+
+ def compareQuery(self, query, dict):
+ tree = self.parser.parseQuery(query)
+ set = tree.executeQuery(self.index)
+ self.compareSet(set, dict)
+
+ def testExecuteQuery(self):
+ self.compareQuery("foo AND bar", {1: 2})
+        self.compareQuery("foo OR bar", {1: 2, 2: 1, 3: 1})
+ self.compareQuery("foo AND NOT bar", {3: 1})
+ self.compareQuery("foo AND foo AND foo", {1: 3, 3: 3})
+ self.compareQuery("foo OR foo OR foo", {1: 3, 3: 3})
+ self.compareQuery("ham AND NOT foo AND NOT bar", {4: 1})
+ self.compareQuery("ham OR foo OR bar", {1: 3, 2: 2, 3: 2, 4: 1})
+ self.compareQuery("ham AND foo AND bar", {1: 3})
+
+ def testInvalidQuery(self):
+ from zope.textindex.parsetree import NotNode, AtomNode
+ tree = NotNode(AtomNode("foo"))
+ self.assertRaises(QueryError, tree.executeQuery, self.index)
+
+def test_suite():
+ return unittest.makeSuite(TestQueryEngine)
+
+if __name__=='__main__':
+ unittest.main(defaultTest='test_suite')
=== Zope3/src/zope/textindex/tests/test_queryparser.py 1.1 => 1.2 ===
--- /dev/null Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_queryparser.py Wed Dec 25 09:15:35 2002
@@ -0,0 +1,297 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+from unittest import TestCase, TestSuite, main, makeSuite
+
+from zope.interface.verify import verifyClass
+
+from zope.textindex.iqueryparser import IQueryParser
+from zope.textindex.iqueryparsetree import IQueryParseTree
+
+from zope.textindex.queryparser import QueryParser
+from zope.textindex.parsetree import ParseError, ParseTreeNode
+from zope.textindex.parsetree import OrNode, AndNode, NotNode
+from zope.textindex.parsetree import AtomNode, PhraseNode, GlobNode
+from zope.textindex.lexicon import Lexicon, Splitter
+
+
+class TestInterfaces(TestCase):
+
+ def testInterfaces(self):
+ verifyClass(IQueryParser, QueryParser)
+ verifyClass(IQueryParseTree, ParseTreeNode)
+ verifyClass(IQueryParseTree, OrNode)
+ verifyClass(IQueryParseTree, AndNode)
+ verifyClass(IQueryParseTree, NotNode)
+ verifyClass(IQueryParseTree, AtomNode)
+ verifyClass(IQueryParseTree, PhraseNode)
+ verifyClass(IQueryParseTree, GlobNode)
+
+
+class TestQueryParserBase(TestCase):
+
+ def setUp(self):
+ self.lexicon = Lexicon(Splitter())
+ self.parser = QueryParser(self.lexicon)
+
+ def expect(self, input, output, expected_ignored=[]):
+ tree = self.parser.parseQuery(input)
+ ignored = self.parser.getIgnored()
+ self.compareParseTrees(tree, output)
+ self.assertEqual(ignored, expected_ignored)
+ # Check that parseQueryEx() == (parseQuery(), getIgnored())
+ ex_tree, ex_ignored = self.parser.parseQueryEx(input)
+ self.compareParseTrees(ex_tree, tree)
+ self.assertEqual(ex_ignored, expected_ignored)
+
+ def failure(self, input):
+ self.assertRaises(ParseError, self.parser.parseQuery, input)
+ self.assertRaises(ParseError, self.parser.parseQueryEx, input)
+
+ def compareParseTrees(self, got, expected, msg=None):
+ if msg is None:
+ msg = repr(got)
+ self.assertEqual(isinstance(got, ParseTreeNode), 1)
+ self.assertEqual(got.__class__, expected.__class__, msg)
+ if isinstance(got, PhraseNode):
+ self.assertEqual(got.nodeType(), "PHRASE", msg)
+ self.assertEqual(got.getValue(), expected.getValue(), msg)
+ elif isinstance(got, GlobNode):
+ self.assertEqual(got.nodeType(), "GLOB", msg)
+ self.assertEqual(got.getValue(), expected.getValue(), msg)
+ elif isinstance(got, AtomNode):
+ self.assertEqual(got.nodeType(), "ATOM", msg)
+ self.assertEqual(got.getValue(), expected.getValue(), msg)
+ elif isinstance(got, NotNode):
+ self.assertEqual(got.nodeType(), "NOT")
+ self.compareParseTrees(got.getValue(), expected.getValue(), msg)
+ elif isinstance(got, AndNode) or isinstance(got, OrNode):
+ self.assertEqual(got.nodeType(),
+ isinstance(got, AndNode) and "AND" or "OR", msg)
+ list1 = got.getValue()
+ list2 = expected.getValue()
+ self.assertEqual(len(list1), len(list2), msg)
+ for i in range(len(list1)):
+ self.compareParseTrees(list1[i], list2[i], msg)
+
+
+class TestQueryParser(TestQueryParserBase):
+
+ def test001(self):
+ self.expect("foo", AtomNode("foo"))
+
+ def test002(self):
+ self.expect("note", AtomNode("note"))
+
+ def test003(self):
+ self.expect("aa and bb AND cc",
+ AndNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
+
+ def test004(self):
+ self.expect("aa OR bb or cc",
+ OrNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
+
+ def test005(self):
+ self.expect("aa AND bb OR cc AnD dd",
+ OrNode([AndNode([AtomNode("aa"), AtomNode("bb")]),
+ AndNode([AtomNode("cc"), AtomNode("dd")])]))
+
+ def test006(self):
+ self.expect("(aa OR bb) AND (cc OR dd)",
+ AndNode([OrNode([AtomNode("aa"), AtomNode("bb")]),
+ OrNode([AtomNode("cc"), AtomNode("dd")])]))
+
+ def test007(self):
+ self.expect("aa AND NOT bb",
+ AndNode([AtomNode("aa"), NotNode(AtomNode("bb"))]))
+
+ def test010(self):
+ self.expect('"foo bar"', PhraseNode(["foo", "bar"]))
+
+ def test011(self):
+ self.expect("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")]))
+
+ def test012(self):
+ self.expect('(("foo bar"))"', PhraseNode(["foo", "bar"]))
+
+ def test013(self):
+ self.expect("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")]))
+
+ def test014(self):
+ self.expect("foo-bar", PhraseNode(["foo", "bar"]))
+
+ def test015(self):
+ self.expect("foo -bar", AndNode([AtomNode("foo"),
+ NotNode(AtomNode("bar"))]))
+
+ def test016(self):
+ self.expect("-foo bar", AndNode([AtomNode("bar"),
+ NotNode(AtomNode("foo"))]))
+
+ def test017(self):
+ self.expect("booh -foo-bar",
+ AndNode([AtomNode("booh"),
+ NotNode(PhraseNode(["foo", "bar"]))]))
+
+ def test018(self):
+ self.expect('booh -"foo bar"',
+ AndNode([AtomNode("booh"),
+ NotNode(PhraseNode(["foo", "bar"]))]))
+
+ def test019(self):
+ self.expect('foo"bar"',
+ AndNode([AtomNode("foo"), AtomNode("bar")]))
+
+ def test020(self):
+ self.expect('"foo"bar',
+ AndNode([AtomNode("foo"), AtomNode("bar")]))
+
+ def test021(self):
+ self.expect('foo"bar"blech',
+ AndNode([AtomNode("foo"), AtomNode("bar"),
+ AtomNode("blech")]))
+
+ def test022(self):
+ self.expect("foo*", GlobNode("foo*"))
+
+ def test023(self):
+ self.expect("foo* bar", AndNode([GlobNode("foo*"),
+ AtomNode("bar")]))
+
+ def test101(self):
+ self.failure("")
+
+ def test102(self):
+ self.failure("not")
+
+ def test103(self):
+ self.failure("or")
+
+ def test104(self):
+ self.failure("and")
+
+ def test105(self):
+ self.failure("NOT")
+
+ def test106(self):
+ self.failure("OR")
+
+ def test107(self):
+ self.failure("AND")
+
+ def test108(self):
+ self.failure("NOT foo")
+
+ def test109(self):
+ self.failure(")")
+
+ def test110(self):
+ self.failure("(")
+
+ def test111(self):
+ self.failure("foo OR")
+
+ def test112(self):
+ self.failure("foo AND")
+
+ def test113(self):
+ self.failure("OR foo")
+
+ def test114(self):
+ self.failure("AND foo")
+
+ def test115(self):
+ self.failure("(foo) bar")
+
+ def test116(self):
+ self.failure("(foo OR)")
+
+ def test117(self):
+ self.failure("(foo AND)")
+
+ def test118(self):
+ self.failure("(NOT foo)")
+
+ def test119(self):
+ self.failure("-foo")
+
+ def test120(self):
+ self.failure("-foo -bar")
+
+ def test121(self):
+ self.failure("foo OR -bar")
+
+ def test122(self):
+ self.failure("foo AND -bar")
+
+
+class StopWordTestQueryParser(TestQueryParserBase):
+
+ def setUp(self):
+ # Only 'stop' is a stopword (but 'and' is still an operator)
+ self.lexicon = Lexicon(Splitter(), FakeStopWordRemover())
+ self.parser = QueryParser(self.lexicon)
+
+ def test201(self):
+ self.expect('and/', AtomNode("and"))
+
+ def test202(self):
+ self.expect('foo AND stop', AtomNode("foo"), ["stop"])
+
+ def test203(self):
+ self.expect('foo AND NOT stop', AtomNode("foo"), ["stop"])
+
+ def test204(self):
+ self.expect('stop AND foo', AtomNode("foo"), ["stop"])
+
+ def test205(self):
+ self.expect('foo OR stop', AtomNode("foo"), ["stop"])
+
+ def test206(self):
+ self.expect('stop OR foo', AtomNode("foo"), ["stop"])
+
+ def test301(self):
+ self.failure('stop')
+
+ def test302(self):
+ self.failure('stop stop')
+
+ def test303(self):
+ self.failure('stop AND stop')
+
+ def test304(self):
+ self.failure('stop OR stop')
+
+ def test305(self):
+ self.failure('stop -foo')
+
+ def test306(self):
+ self.failure('stop AND NOT foo')
+
+
+class FakeStopWordRemover:
+
+ def process(self, list):
+ return [word for word in list if word != "stop"]
+
+
+def test_suite():
+ return TestSuite((makeSuite(TestQueryParser),
+ makeSuite(StopWordTestQueryParser),
+ makeSuite(TestInterfaces),
+ ))
+
+
+if __name__=="__main__":
+ main(defaultTest='test_suite')
=== Zope3/src/zope/textindex/tests/test_setops.py 1.1 => 1.2 ===
--- /dev/null Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_setops.py Wed Dec 25 09:15:35 2002
@@ -0,0 +1,135 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+from unittest import TestCase, TestSuite, main, makeSuite
+
+from zodb.btrees.IIBTree import IIBTree, IIBucket
+
+from zope.textindex.setops import mass_weightedIntersection
+from zope.textindex.setops import mass_weightedUnion
+
+class TestSetOps(TestCase):
+
+ def testEmptyLists(self):
+ self.assertEqual(len(mass_weightedIntersection([])), 0)
+ self.assertEqual(len(mass_weightedUnion([])), 0)
+
+ def testIdentity(self):
+ t = IIBTree([(1, 2)])
+ b = IIBucket([(1, 2)])
+ for x in t, b:
+ for func in mass_weightedUnion, mass_weightedIntersection:
+ result = func([(x, 1)])
+ self.assertEqual(len(result), 1)
+ self.assertEqual(list(result.items()), list(x.items()))
+
+ def testScalarMultiply(self):
+ t = IIBTree([(1, 2), (2, 3), (3, 4)])
+ allkeys = [1, 2, 3]
+ b = IIBucket(t)
+ for x in t, b:
+ self.assertEqual(list(x.keys()), allkeys)
+ for func in mass_weightedUnion, mass_weightedIntersection:
+ for factor in 0, 1, 5, 10:
+ result = func([(x, factor)])
+ self.assertEqual(allkeys, list(result.keys()))
+ for key in x.keys():
+ self.assertEqual(x[key] * factor, result[key])
+
+ def testPairs(self):
+ t1 = IIBTree([(1, 10), (3, 30), (7, 70)])
+ t2 = IIBTree([(3, 30), (5, 50), (7, 7), (9, 90)])
+ allkeys = [1, 3, 5, 7, 9]
+ b1 = IIBucket(t1)
+ b2 = IIBucket(t2)
+ for x in t1, t2, b1, b2:
+ for key in x.keys():
+ self.assertEqual(key in allkeys, 1)
+ for y in t1, t2, b1, b2:
+ for w1, w2 in (0, 0), (1, 10), (10, 1), (2, 3):
+ # Test the union.
+ expected = []
+ for key in allkeys:
+ if x.has_key(key) or y.has_key(key):
+ result = x.get(key, 0) * w1 + y.get(key, 0) * w2
+ expected.append((key, result))
+ expected.sort()
+ got = mass_weightedUnion([(x, w1), (y, w2)])
+ self.assertEqual(expected, list(got.items()))
+ got = mass_weightedUnion([(y, w2), (x, w1)])
+ self.assertEqual(expected, list(got.items()))
+
+ # Test the intersection.
+ expected = []
+ for key in allkeys:
+ if x.has_key(key) and y.has_key(key):
+ result = x[key] * w1 + y[key] * w2
+ expected.append((key, result))
+ expected.sort()
+ got = mass_weightedIntersection([(x, w1), (y, w2)])
+ self.assertEqual(expected, list(got.items()))
+ got = mass_weightedIntersection([(y, w2), (x, w1)])
+ self.assertEqual(expected, list(got.items()))
+
+ def testMany(self):
+ import random
+ N = 15 # number of IIBTrees to feed in
+ L = []
+ commonkey = N * 1000
+ allkeys = {commonkey: 1}
+ for i in range(N):
+ t = IIBTree()
+ t[commonkey] = i
+ for j in range(N-i):
+ key = i + j
+ allkeys[key] = 1
+ t[key] = N*i + j
+ L.append((t, i+1))
+ random.shuffle(L)
+ allkeys = allkeys.keys()
+ allkeys.sort()
+
+ # Test the union.
+ expected = []
+ for key in allkeys:
+ sum = 0
+ for t, w in L:
+ if t.has_key(key):
+ sum += t[key] * w
+ expected.append((key, sum))
+ # print 'union', expected
+ got = mass_weightedUnion(L)
+ self.assertEqual(expected, list(got.items()))
+
+ # Test the intersection.
+ expected = []
+ for key in allkeys:
+ sum = 0
+ for t, w in L:
+ if t.has_key(key):
+ sum += t[key] * w
+ else:
+ break
+ else:
+ # We didn't break out of the loop so it's in the intersection.
+ expected.append((key, sum))
+ # print 'intersection', expected
+ got = mass_weightedIntersection(L)
+ self.assertEqual(expected, list(got.items()))
+
+def test_suite():
+ return makeSuite(TestSetOps)
+
+if __name__=="__main__":
+ main(defaultTest='test_suite')
=== Zope3/src/zope/textindex/tests/test_textindexwrapper.py 1.1 => 1.2 ===
--- /dev/null Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_textindexwrapper.py Wed Dec 25 09:15:35 2002
@@ -0,0 +1,131 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Unit tests for TextIndexWrapper.
+
+$Id$
+"""
+
+import unittest
+
+from zope.textindex.textindexwrapper import TextIndexWrapper
+from zope.textindex import parsetree
+
+class TextIndexWrapperTest(unittest.TestCase):
+
+ def setUp(self):
+ w = TextIndexWrapper()
+ doc = u"the quick brown fox jumps over the lazy dog"
+ w.index_doc(1000, [doc])
+ doc = u"the brown fox and the yellow fox don't need the retriever"
+ w.index_doc(1001, [doc])
+ self.wrapper = w
+
+ def testCounts(self):
+ w = self.wrapper
+ self.assertEqual(self.wrapper.documentCount(), 2)
+ self.assertEqual(self.wrapper.wordCount(), 12)
+ doc = u"foo bar"
+ w.index_doc(1002, [doc])
+ self.assertEqual(self.wrapper.documentCount(), 3)
+ self.assertEqual(self.wrapper.wordCount(), 14)
+
+ def testOne(self):
+ matches, total = self.wrapper.query(u"quick fox", 0, 10)
+ self.assertEqual(total, 1)
+ [(docid, rank)] = matches # if this fails there's a problem
+ self.assertEqual(docid, 1000)
+
+ def testDefaultBatch(self):
+ matches, total = self.wrapper.query(u"fox", 0)
+ self.assertEqual(total, 2)
+ self.assertEqual(len(matches), 2)
+ matches, total = self.wrapper.query(u"fox")
+ self.assertEqual(total, 2)
+ self.assertEqual(len(matches), 2)
+ matches, total = self.wrapper.query(u" fox", 1)
+ self.assertEqual(total, 2)
+ self.assertEqual(len(matches), 1)
+
+ def testGlobbing(self):
+ matches, total = self.wrapper.query("fo*")
+ self.assertEqual(total, 2)
+ self.assertEqual(len(matches), 2)
+
+ def testLatin1(self):
+ w = self.wrapper
+ doc = u"Fran\xe7ois"
+ w.index_doc(1002, [doc])
+ matches, total = self.wrapper.query(doc, 0, 10)
+ self.assertEqual(total, 1)
+ [(docid, rank)] = matches # if this fails there's a problem
+ self.assertEqual(docid, 1002)
+
+ def testUnicode(self):
+ w = self.wrapper
+ # Verbose, but easy to debug
+ delta = u"\N{GREEK SMALL LETTER DELTA}"
+ delta += u"\N{GREEK SMALL LETTER EPSILON}"
+ delta += u"\N{GREEK SMALL LETTER LAMDA}"
+ delta += u"\N{GREEK SMALL LETTER TAU}"
+ delta += u"\N{GREEK SMALL LETTER ALPHA}"
+ assert delta.islower()
+ emdash = u"\N{EM DASH}"
+ assert not emdash.isalnum()
+ alpha = u"\N{GREEK SMALL LETTER ALPHA}"
+ assert alpha.islower()
+ lamda = u"\N{GREEK SMALL LETTER LAMDA}"
+ lamda += u"\N{GREEK SMALL LETTER ALPHA}"
+ assert lamda.islower()
+ doc = delta + emdash + alpha
+ w.index_doc(1002, [doc])
+ for word in delta, alpha:
+ matches, total = self.wrapper.query(word, 0, 10)
+ self.assertEqual(total, 1)
+ [(docid, rank)] = matches # if this fails there's a problem
+ self.assertEqual(docid, 1002)
+ self.assertRaises(parsetree.ParseError,
+ self.wrapper.query, emdash, 0, 10)
+ matches, total = self.wrapper.query(lamda, 0, 10)
+ self.assertEqual(total, 0)
+
+ def testNone(self):
+ matches, total = self.wrapper.query(u"dalmatian", 0, 10)
+ self.assertEqual(total, 0)
+ self.assertEqual(len(matches), 0)
+
+ def testAll(self):
+ matches, total = self.wrapper.query(u"brown fox", 0, 10)
+ self.assertEqual(total, 2)
+ self.assertEqual(len(matches), 2)
+ matches.sort()
+ self.assertEqual(matches[0][0], 1000)
+ self.assertEqual(matches[1][0], 1001)
+
+ def testBatching(self):
+ matches1, total = self.wrapper.query(u"brown fox", 0, 1)
+ self.assertEqual(total, 2)
+ self.assertEqual(len(matches1), 1)
+ matches2, total = self.wrapper.query(u"brown fox", 1, 1)
+ self.assertEqual(total, 2)
+ self.assertEqual(len(matches2), 1)
+ matches = matches1 + matches2
+ matches.sort()
+ self.assertEqual(matches[0][0], 1000)
+ self.assertEqual(matches[1][0], 1001)
+
+def test_suite():
+ return unittest.makeSuite(TextIndexWrapperTest)
+
+if __name__=='__main__':
+ unittest.main(defaultTest='test_suite')
=== Zope3/src/zope/textindex/tests/wordstats.py 1.1 => 1.2 ===
--- /dev/null Wed Dec 25 09:16:07 2002
+++ Zope3/src/zope/textindex/tests/wordstats.py Wed Dec 25 09:15:35 2002
@@ -0,0 +1,45 @@
+#! /usr/bin/env python
+"""Dump statistics about each word in the index.
+
+usage: wordstats.py data.fs [index key]
+"""
+
+import zodb
+from zodb.db import DB  # assumed home of DB after the zodb renaming
+from zodb.storage.file import FileStorage
+
+def main(fspath, key):
+ fs = FileStorage(fspath, read_only=1)
+    db = DB(fs)
+ rt = db.open().root()
+ index = rt[key]
+
+ lex = index.lexicon
+ idx = index.index
+ print "Words", lex.length()
+ print "Documents", idx.length()
+
+ print "Word frequencies: count, word, wid"
+ for word, wid in lex.items():
+ docs = idx._wordinfo[wid]
+ print len(docs), word, wid
+
+ print "Per-doc scores: wid, (doc, score,)+"
+ for wid in lex.wids():
+ print wid,
+ docs = idx._wordinfo[wid]
+ for docid, score in docs.items():
+ print docid, score,
+ print
+
+if __name__ == "__main__":
+ import sys
+
+ args = sys.argv[1:]
+ index_key = "index"
+ if len(args) == 1:
+ fspath = args[0]
+ elif len(args) == 2:
+ fspath, index_key = args
+    else:
+        print "Expected 1 or 2 args, got", len(args)
+        print __doc__
+        sys.exit(2)
+    main(fspath, index_key)