[Zope3-checkins] CVS: Zope3/src/zope/textindex/tests - __init__.py:1.2 hs-tool.py:1.2 indexhtml.py:1.2 mailtest.py:1.2 mhindex.py:1.2 queryhtml.py:1.2 test_index.py:1.2 test_lexicon.py:1.2 test_nbest.py:1.2 test_pipelinefactory.py:1.2 test_queryengine.py:1.2 test_queryparser.py:1.2 test_setops.py:1.2 test_textindexwrapper.py:1.2 wordstats.py:1.2

Jim Fulton jim@zope.com
Wed, 25 Dec 2002 09:16:07 -0500


Update of /cvs-repository/Zope3/src/zope/textindex/tests
In directory cvs.zope.org:/tmp/cvs-serv20790/src/zope/textindex/tests

Added Files:
	__init__.py hs-tool.py indexhtml.py mailtest.py mhindex.py 
	queryhtml.py test_index.py test_lexicon.py test_nbest.py 
	test_pipelinefactory.py test_queryengine.py 
	test_queryparser.py test_setops.py test_textindexwrapper.py 
	wordstats.py 
Log Message:
Grand renaming:

- Renamed most files (especially python modules) to lower case.

- Moved views and interfaces into separate hierarchies within each
  project, where each top-level directory under the zope package
  is a separate project.

- Moved everything to src from lib/python.

  lib/python will eventually go away. I need access to the cvs
  repository to make this happen, however.

There are probably some bits that are broken. All tests pass
and zope runs, but I haven't tried everything. There are a number
of cleanups I'll work on tomorrow.



=== Zope3/src/zope/textindex/tests/__init__.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/__init__.py	Wed Dec 25 09:15:35 2002
@@ -0,0 +1,2 @@
+#
+# This file is necessary to make this directory a package.


=== Zope3/src/zope/textindex/tests/hs-tool.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/hs-tool.py	Wed Dec 25 09:15:35 2002
@@ -0,0 +1,129 @@
+#! /usr/bin/env python
+
+import cPickle
+import os.path
+import sys
+
+from hotshot.log import LogReader
+
+def load_line_info(log):
+    byline = {}
+    prevloc = None
+    for what, place, tdelta in log:
+        if tdelta > 0:
+            t, nhits = byline.get(prevloc, (0, 0))
+            byline[prevloc] = (tdelta + t), (nhits + 1)
+            prevloc = place
+    return byline
+
+def basename(path, cache={}):
+    try:
+        return cache[path]
+    except KeyError:
+        fn = os.path.split(path)[1]
+        cache[path] = fn
+        return fn
+
+def print_results(results):
+    for info, place in results:
+        if place is None:
+            # This is the startup time for the profiler, and only
+            # occurs at the very beginning.  Just ignore it, since it
+            # corresponds to frame setup of the outermost call, not
+            # anything that's actually interesting.
+            continue
+        filename, line, funcname = place
+        print '%8d %8d' % info, basename(filename), line
+
+def annotate_results(results):
+    files = {}
+    for stats, place in results:
+        if not place:
+            continue
+        time, hits = stats
+        file, line, func = place
+        l = files.get(file)
+        if l is None:
+            l = files[file] = []
+        l.append((line, hits, time))
+    order = files.keys()
+    order.sort()
+    for k in order:
+        if os.path.exists(k):
+            v = files[k]
+            v.sort()
+            annotate(k, v)
+
+def annotate(file, lines):
+    print "-" * 60
+    print file
+    print "-" * 60
+    f = open(file)
+    i = 1
+    match = lines[0][0]
+    for line in f:
+        if match == i:
+            print "%6d %8d " % lines[0][1:], line,
+            del lines[0]
+            if lines:
+                match = lines[0][0]
+            else:
+                match = None
+        else:
+            print " " * 16, line,
+        i += 1
+    print
+
+def get_cache_name(filename):
+    d, fn = os.path.split(filename)
+    cache_dir = os.path.join(d, '.hs-tool')
+    cache_file = os.path.join(cache_dir, fn)
+    return cache_dir, cache_file
+
+def cache_results(filename, results):
+    cache_dir, cache_file = get_cache_name(filename)
+    if not os.path.exists(cache_dir):
+        os.mkdir(cache_dir)
+    fp = open(cache_file, 'wb')
+    try:
+        cPickle.dump(results, fp, 1)
+    finally:
+        fp.close()
+
+def main(filename, annotate):
+    cache_dir, cache_file = get_cache_name(filename)
+
+    if (  os.path.isfile(cache_file)
+          and os.path.getmtime(cache_file) > os.path.getmtime(filename)):
+        # cached data is up-to-date:
+        fp = open(cache_file, 'rb')
+        results = cPickle.load(fp)
+        fp.close()
+    else:
+        log = LogReader(filename)
+        byline = load_line_info(log)
+        # Sort
+        results = [(v, k) for k, v in byline.items()]
+        results.sort()
+        cache_results(filename, results)
+
+    if annotate:
+        annotate_results(results)
+    else:
+        print_results(results)
+
+
+if __name__ == "__main__":
+    import getopt
+
+    annotate_p = 0
+    opts, args = getopt.getopt(sys.argv[1:], 'A')
+    for o, v in opts:
+        if o == '-A':
+            annotate_p = 1
+    if args:
+        filename, = args
+    else:
+        filename = "profile.dat"
+
+    main(filename, annotate_p)


=== Zope3/src/zope/textindex/tests/indexhtml.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/indexhtml.py	Wed Dec 25 09:15:35 2002
@@ -0,0 +1,154 @@
+#! /usr/bin/env python
+"""Index a collection of HTML files on the filesystem.
+
+usage: indexhtml.py [options] dir
+
+Will create an index of all files in dir or its subdirectories.
+
+options:
+-f data.fs  -- the path to the filestorage datafile
+"""
+from __future__ import nested_scopes
+
+import os
+from time import clock
+
+import zodb
+from zodb.storage.file import FileStorage
+from zodb.btrees.IOBTree import IOBTree
+
+from zope.textindex.textindexwrapper import TextIndexWrapper
+from zope.textindex.htmlsplitter import HTMLWordSplitter
+from zope.textindex.lexicon import Lexicon, StopWordRemover
+
+def make_zc_index():
+    # there's an elaborate dance necessary to construct an index
+    class Struct:
+        pass
+    extra = Struct()
+    extra.doc_attr = "read"
+    extra.lexicon_id = "lexicon"
+    caller = Struct()
+    caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
+    return ZCTextIndex("read", extra, caller)
+
+# XXX make a splitter more like the HTMLSplitter for TextIndex
+# signature is
+# Splitter(string, stop_words, encoding,
+#          singlechar, indexnumbers, casefolding)
+
+class MySplitter:
+    def __init__(self):
+        self._v_splitter = HTMLWordSplitter()
+    def __call__(self, text, stopdict, *args, **kwargs):
+        words = self._v_splitter._split(text)
+        def lookup(w):
+            return stopdict.get(w, w)
+        return filter(None, map(lookup, words))
+
+def make_old_index():
+    from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
+    from Products.PluginIndexes.TextIndex.Lexicon  import Lexicon
+    from zope.textindex.stopdict import get_stopdict
+
+    l = Lexicon(get_stopdict())
+    l.SplitterFunc = MySplitter()
+    return TextIndex("read", lexicon=l)
+
+def main(db, root, dir):
+    rt["index"] = index = INDEX()
+    rt["files"] = paths = IOBTree()
+    get_transaction().commit()
+
+    zodb_time = 0.0
+    pack_time = 0.0
+
+    files = [os.path.join(dir, file) for file in os.listdir(dir)]
+    docid = 0
+    t0 = clock()
+    for file in files:
+        if os.path.isdir(file):
+            files += [os.path.join(file, sub) for sub in os.listdir(file)]
+        else:
+            if not file.endswith(".html"):
+                continue
+            docid += 1
+            if LIMIT is not None and docid > LIMIT:
+                break
+            if VERBOSE:
+                print "%5d" % docid, file
+            f = open(file, "rb")
+            paths[docid] = file
+            index.index_object(docid, f)
+            f.close()
+            if docid % TXN_INTERVAL == 0:
+                z0 = clock()
+                get_transaction().commit()
+                z1 = clock()
+                zodb_time += z1 - z0
+                if VERBOSE:
+                    print "commit took", z1 - z0, zodb_time
+            if docid % PACK_INTERVAL == 0:
+                p0 = clock()
+                db.pack()
+                p1 = clock()
+                zodb_time += p1 - p0
+                pack_time += p1 - p0
+                if VERBOSE:
+                    print "pack took", p1 - p0, pack_time
+    z0 = clock()
+    get_transaction().commit()
+    z1 = t1 = clock()
+    total_time = t1 - t0
+    zodb_time += z1 - z0
+    if VERBOSE:
+        print "Total index time", total_time
+        print "Non-pack time", total_time - pack_time
+        print "Non-ZODB time", total_time - zodb_time
+
+if __name__ == "__main__":
+    import sys
+    import getopt
+
+    VERBOSE = 0
+    FSPATH = "Data.fs"
+    TXN_INTERVAL = 100
+    PACK_INTERVAL = 500
+    LIMIT = None
+    INDEX = make_zc_index
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], 'vf:t:p:n:T')
+    except getopt.error, msg:
+        print msg
+        print __doc__
+        sys.exit(2)
+
+    for o, v in opts:
+        if o == '-v':
+            VERBOSE += 1
+        if o == '-f':
+            FSPATH = v
+        if o == '-t':
+            TXN_INTERVAL = int(v)
+        if o == '-p':
+            PACK_INTERVAL = int(v)
+        if o == '-n':
+            LIMIT = int(v)
+        if o == '-T':
+            INDEX = make_old_index
+
+    if len(args) != 1:
+        print "Expected one argument"
+        print __doc__
+        sys.exit(2)
+    dir = args[0]
+
+    fs = FileStorage(FSPATH)
+    db = ZODB.DB(fs)
+    cn = db.open()
+    rt = cn.root()
+    dir = os.path.join(os.getcwd(), dir)
+    print dir
+    main(db, rt, dir)
+    cn.close()
+    fs.close()


=== Zope3/src/zope/textindex/tests/mailtest.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/mailtest.py	Wed Dec 25 09:15:35 2002
@@ -0,0 +1,290 @@
+"""Test an index with a Unix mailbox file.
+
+usage: python mailtest.py [options] <data.fs>
+
+options:
+    -v     -- verbose
+
+    Index Generation
+    -i mailbox
+    -n NNN -- max number of messages to read from mailbox
+    -t NNN -- commit a transaction every NNN messages (default: 1)
+    -p NNN -- pack <data.fs> every NNN messages (default: 500), and at end
+    -p 0   -- don't pack at all
+    -x     -- exclude the message text from the data.fs
+
+    Queries
+    -q query
+    -b NNN -- return the NNN best matches (default: 10)
+    -c NNN -- context; if -v, show the first NNN lines of results (default: 5)
+
+The script either indexes or queries depending on whether -q or -i is
+passed as an option.
+
+For -i mailbox, the script reads mail messages from the mailbox and
+indexes them.  It indexes one message at a time, then commits the
+transaction.
+
+For -q query, it performs a query on an existing index.
+
+If both are specified, the indexing is performed first.
+
+You can also interact with the index after it is completed. Load the
+index from the database:
+
+    import zodb
+    from zodb.storage.file import FileStorage
+    fs = FileStorage(<data.fs>)
+    db = ZODB.DB(fs)
+    index = db.open().root()["index"]
+    index.search("python AND unicode")
+"""
+
+import zodb
+import zodb.storage.file
+from zope.textindex.lexicon import \
+     Lexicon, CaseNormalizer, Splitter, StopWordRemover
+
+# XXX This import is bad, and was so before the renaming
+from zope.textindex.zctextindex import ZCTextIndex
+
+from BTrees.IOBTree import IOBTree
+from zope.textindex.queryparser import QueryParser
+
+import sys
+import mailbox
+import time
+
+def usage(msg):
+    print msg
+    print __doc__
+    sys.exit(2)
+
+class Message:
+
+    total_bytes = 0
+
+    def __init__(self, msg):
+        subject = msg.getheader('subject', '')
+        author = msg.getheader('from', '')
+        if author:
+            summary = "%s (%s)\n" % (subject, author)
+        else:
+            summary = "%s\n" % subject
+        self.text = summary + msg.fp.read()
+        Message.total_bytes += len(self.text)
+
+class Extra:
+    pass
+
+def index(rt, mboxfile, db, profiler):
+    global NUM
+    idx_time = 0
+    pack_time = 0
+    start_time = time.time()
+
+    lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
+    extra = Extra()
+    extra.lexicon_id = 'lexicon'
+    extra.doc_attr = 'text'
+    extra.index_type = 'Okapi BM25 Rank'
+    caller = Extra()
+    caller.lexicon = lexicon
+    rt["index"] = idx = ZCTextIndex("index", extra, caller)
+    if not EXCLUDE_TEXT:
+        rt["documents"] = docs = IOBTree()
+    else:
+        docs = None
+    get_transaction().commit()
+
+    mbox = mailbox.UnixMailbox(open(mboxfile, 'rb'))
+    if VERBOSE:
+        print "opened", mboxfile
+    if not NUM:
+        NUM = sys.maxint
+
+    if profiler:
+        itime, ptime, i = profiler.runcall(indexmbox, mbox, idx, docs, db)
+    else:
+        itime, ptime, i = indexmbox(mbox, idx, docs, db)
+    idx_time += itime
+    pack_time += ptime
+
+    get_transaction().commit()
+
+    if PACK_INTERVAL and i % PACK_INTERVAL != 0:
+        if VERBOSE >= 2:
+            print "packing one last time..."
+        p0 = time.clock()
+        db.pack(time.time())
+        p1 = time.clock()
+        if VERBOSE:
+            print "pack took %s sec" % (p1 - p0)
+        pack_time += p1 - p0
+
+    if VERBOSE:
+        finish_time = time.time()
+        print
+        print "Index time", round(idx_time / 60, 3), "minutes"
+        print "Pack time", round(pack_time / 60, 3), "minutes"
+        print "Index bytes", Message.total_bytes
+        rate = (Message.total_bytes / idx_time) / 1024
+        print "Index rate %.2f KB/sec" % rate
+        print "Indexing began", time.ctime(start_time)
+        print "Indexing ended", time.ctime(finish_time)
+        print "Wall clock minutes", round((finish_time - start_time)/60, 3)
+
+def indexmbox(mbox, idx, docs, db):
+    idx_time = 0
+    pack_time = 0
+    i = 0
+    while i < NUM:
+        _msg = mbox.next()
+        if _msg is None:
+            break
+        i += 1
+        msg = Message(_msg)
+        if VERBOSE >= 2:
+            print "indexing msg", i
+        i0 = time.clock()
+        idx.index_object(i, msg)
+        if not EXCLUDE_TEXT:
+            docs[i] = msg
+        if i % TXN_SIZE == 0:
+            get_transaction().commit()
+        i1 = time.clock()
+        idx_time += i1 - i0
+        if VERBOSE and i % 50 == 0:
+            print i, "messages indexed"
+            print "cache size", db.cacheSize()
+        if PACK_INTERVAL and i % PACK_INTERVAL == 0:
+            if VERBOSE >= 2:
+                print "packing..."
+            p0 = time.clock()
+            db.pack(time.time())
+            p1 = time.clock()
+            if VERBOSE:
+                print "pack took %s sec" % (p1 - p0)
+            pack_time += p1 - p0
+    return idx_time, pack_time, i
+
+
+def query(rt, query_str, profiler):
+    idx = rt["index"]
+    docs = rt["documents"]
+
+    start = time.clock()
+    if profiler is None:
+        results, num_results = idx.query(query_str, BEST)
+    else:
+        if WARM_CACHE:
+            print "Warming the cache..."
+            idx.query(query_str, BEST)
+        start = time.clock()
+        results, num_results = profiler.runcall(idx.query, query_str, BEST)
+    elapsed = time.clock() - start
+
+    print "query:", query_str
+    print "# results:", len(results), "of", num_results, \
+          "in %.2f ms" % (elapsed * 1000)
+
+    tree = QueryParser(idx.lexicon).parseQuery(query_str)
+    qw = idx.index.query_weight(tree.terms())
+
+    for docid, score in results:
+        scaled = 100.0 * score / qw
+        print "docid %7d score %6d scaled %5.2f%%" % (docid, score, scaled)
+        if VERBOSE:
+            msg = docs[docid]
+            ctx = msg.text.split("\n", CONTEXT)
+            del ctx[-1]
+            print "-" * 60
+            print "message:"
+            for l in ctx:
+                print l
+            print "-" * 60
+
+
+def main(fs_path, mbox_path, query_str, profiler):
+    f = ZODB.FileStorage.FileStorage(fs_path)
+    db = ZODB.DB(f, cache_size=CACHE_SIZE)
+    cn = db.open()
+    rt = cn.root()
+
+    if mbox_path is not None:
+        index(rt, mbox_path, db, profiler)
+    if query_str is not None:
+        query(rt, query_str, profiler)
+
+    cn.close()
+    db.close()
+    f.close()
+
+if __name__ == "__main__":
+    import getopt
+
+    NUM = 0
+    VERBOSE = 0
+    PACK_INTERVAL = 500
+    EXCLUDE_TEXT = 0
+    CACHE_SIZE = 10000
+    TXN_SIZE = 1
+    BEST = 10
+    CONTEXT = 5
+    WARM_CACHE = 0
+    query_str = None
+    mbox_path = None
+    profile = None
+    old_profile = None
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], 'vn:p:i:q:b:c:xt:w',
+                                   ['profile=', 'old-profile='])
+    except getopt.error, msg:
+        usage(msg)
+    if len(args) != 1:
+        usage("exactly 1 filename argument required")
+    for o, v in opts:
+        if o == '-n':
+            NUM = int(v)
+        elif o == '-v':
+            VERBOSE += 1
+        elif o == '-p':
+            PACK_INTERVAL = int(v)
+        elif o == '-q':
+            query_str = v
+        elif o == '-i':
+            mbox_path = v
+        elif o == '-b':
+            BEST = int(v)
+        elif o == '-x':
+            EXCLUDE_TEXT = 1
+        elif o == '-t':
+            TXN_SIZE = int(v)
+        elif o == '-c':
+            CONTEXT = int(v)
+        elif o == '-w':
+            WARM_CACHE = 1
+        elif o == '--profile':
+            profile = v
+        elif o == '--old-profile':
+            old_profile = v
+    fs_path, = args
+
+    if profile:
+        import hotshot
+        profiler = hotshot.Profile(profile, lineevents=1, linetimings=1)
+    elif old_profile:
+        import profile
+        profiler = profile.Profile()
+    else:
+        profiler = None
+
+    main(fs_path, mbox_path, query_str, profiler)
+
+    if profile:
+        profiler.close()
+    elif old_profile:
+        import pstats
+        profiler.dump_stats(old_profile)
+        stats = pstats.Stats(old_profile)
+        stats.strip_dirs().sort_stats('time').print_stats(20)


=== Zope3/src/zope/textindex/tests/mhindex.py 1.1 => 1.2 === (478/578 lines abridged)
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/mhindex.py	Wed Dec 25 09:15:35 2002
@@ -0,0 +1,575 @@
+#! /usr/bin/env python2.2
+
+"""MH mail indexer.
+
+To index messages from a single folder (messages defaults to 'all'):
+  mhindex.py [options] -u +folder [messages ...]
+
+To bulk index all messages from several folders:
+  mhindex.py [options] -b folder ...; the folder name ALL means all folders.
+
+To execute a single query:
+  mhindex.py [options] query
+
+To enter interactive query mode:
+  mhindex.py [options]
+
+Common options:
+  -d FILE -- specify the Data.fs to use (default ~/.Data.fs)
+  -w -- dump the word list in alphabetical order and exit
+  -W -- dump the word list ordered by word id and exit
+
+Indexing options:
+  -O -- do a prescan on the data to compute optimal word id assignments;
+        this is only useful the first time the Data.fs is used
+  -t N -- commit a transaction after every N messages (default 20000)
+  -p N -- pack after every N commits (by default no packing is done)
+
+Querying options:
+  -m N -- show at most N matching lines from the message (default 3)
+  -n N -- show the N best matching messages (default 3)
+"""
+
+import os
+import re
+import sys
+import time
+import mhlib
+import getopt
+import traceback
+from StringIO import StringIO
+from stat import ST_MTIME
+
+DATAFS = "~/.mhindex.fs"
+ZOPECODE = "~/projects/Zope3/lib/python"
+
+zopecode = os.path.expanduser(ZOPECODE)
+sys.path.insert(0, zopecode)

[-=- -=- -=- 478 lines omitted -=- -=- -=-]

+            if value:
+                H.append(value)
+        if H:
+            L.append("\n".join(H))
+
+    def newdocid(self, path):
+        docid = self.path2docid.get(path)
+        if docid is not None:
+            self.doctimes[docid] = self.getmtime(path)
+            return docid
+        docid = self.maxdocid + 1
+        self.maxdocid = docid
+        self.docpaths[docid] = path
+        self.doctimes[docid] = self.getmtime(path)
+        self.path2docid[path] = docid
+        return docid
+
+    def getmtime(self, path):
+        path = os.path.join(self.mh.getpath(), path)
+        try:
+            st = os.stat(path)
+        except os.error, msg:
+            return 0
+        return int(st[ST_MTIME])
+
+    def maycommit(self):
+        self.trans_count += 1
+        if self.trans_count >= self.trans_limit > 0:
+            self.commit()
+
+    def commit(self):
+        if self.trans_count > 0:
+            print "committing..."
+            get_transaction().commit()
+            self.trans_count = 0
+            self.pack_count += 1
+            if self.pack_count >= self.pack_limit > 0:
+                self.pack()
+
+    def pack(self):
+        if self.pack_count > 0:
+            print "packing..."
+            self.database.pack()
+            self.pack_count = 0
+
+def reportexc():
+    traceback.print_exc()
+
+if __name__ == "__main__":
+    sys.exit(main())


=== Zope3/src/zope/textindex/tests/queryhtml.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/queryhtml.py	Wed Dec 25 09:15:35 2002
@@ -0,0 +1,116 @@
+import os
+from time import clock
+
+import zodb
+from zodb.storage.file import FileStorage
+
+QUERIES = ["nested recursive functions",
+           "explicit better than implicit",
+           "build hpux",
+           "cannot create 'method-wrapper' instances",
+            "extension module C++",
+           "class method",
+           "instance variable",
+           "articulate information",
+           "import default files",
+           "gopher ftp http",
+           "documentation",
+           ]
+
+def path2url(p):
+    # convert the paths to a python.org URL
+    # hack: only works for the way Jeremy indexed his copy of python.org
+    marker = "www.python.org/."
+    i = p.find(marker)
+    if i == -1:
+        return p
+    i += len(marker)
+    return "http://www.python.org" + p[i:]
+
+from Products.PluginIndexes.TextIndex.TextIndex import And, Or
+from zope.textindex.tests.indexhtml import MySplitter
+from zope.textindex.nbest import NBest
+
+def main(rt):
+    index = rt["index"]
+    files = rt["files"]
+    times = {}
+    ITERS = range(50)
+    for i in range(11):
+        for q in QUERIES:
+            terms = q.split()
+            for c in " OR ", " AND ":
+                query = c.join(terms)
+                t0 = clock()
+                if TEXTINDEX:
+                    if c == " OR ":
+                        op = Or
+                    else:
+                        op = And
+                    _q = " ".join(terms)
+                    for _ in ITERS:
+                        b = index.query(_q, op).bucket()
+                        num = len(b)
+                        chooser = NBest(10)
+                        chooser.addmany(b.items())
+                        results = chooser.getbest()
+
+                else:
+                    try:
+                        for _ in ITERS:
+                            results, num = index.query(query)
+                    except:
+                        continue
+                t1 = clock()
+                print "<p>Query: \"%s\"" % query
+                print "<br>Num results: %d" % num
+                print "<br>time.clock(): %s" % (t1 - t0)
+                key = query
+                if i == 0:
+                    print "<ol>"
+                    for docid, score in results:
+                        url = path2url(files[docid])
+                        fmt = '<li><a href="%s">%s</A> score = %s'
+                        print fmt % (url, url, score)
+                    print "</ol>"
+                    continue
+                l = times.setdefault(key, [])
+                l.append(t1 - t0)
+
+    l = times.keys()
+    l.sort()
+    print "<hr>"
+    for k in l:
+        v = times[k]
+        print "<p>Query: \"%s\"" % k
+        print "<br>Min time: %s" % min(v)
+        print "<br>All times: %s" % " ".join(map(str, v))
+
+if __name__ == "__main__":
+    import sys
+    import getopt
+
+    VERBOSE = 0
+    FSPATH = "Data.fs"
+    TEXTINDEX = 0
+
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], 'vf:T')
+    except getopt.error, msg:
+        print msg
+        print __doc__
+        sys.exit(2)
+
+    for o, v in opts:
+        if o == '-v':
+            VERBOSE += 1
+        if o == '-f':
+            FSPATH = v
+        if o == '-T':
+            TEXTINDEX = 1
+
+    fs = FileStorage(FSPATH, read_only=1)
+    db = ZODB.DB(fs, cache_size=10000)
+    cn = db.open()
+    rt = cn.root()
+    main(rt)


=== Zope3/src/zope/textindex/tests/test_index.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_index.py	Wed Dec 25 09:15:35 2002
@@ -0,0 +1,157 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+from unittest import TestCase, TestSuite, main, makeSuite
+
+from zope.textindex.lexicon import Lexicon, Splitter
+from zope.textindex.cosineindex import CosineIndex
+from zope.textindex.okapiindex import OkapiIndex
+
+# Subclasses must set a class variable IndexFactory to the appropriate
+# index object constructor.
+
+class IndexTest(TestCase):
+
+    def setUp(self):
+        self.lexicon = Lexicon(Splitter())
+        self.index = self.IndexFactory(self.lexicon)
+
+    def test_index_document(self, DOCID=1):
+        doc = "simple document contains five words"
+        self.assert_(not self.index.has_doc(DOCID))
+        self.index.index_doc(DOCID, doc)
+        self.assertEqual(self.index.documentCount(), 1)
+        self.assertEqual(self.index.wordCount(), 5)
+        self.assertEqual(self.lexicon.wordCount(), 5)
+        self.assert_(self.index.has_doc(DOCID))
+        self.assert_(self.index._docweight[DOCID])
+        self.assertEqual(len(self.index._docweight), 1)
+        self.assertEqual(len(self.index._wordinfo), 5)
+        self.assertEqual(len(self.index._docwords), 1)
+        self.assertEqual(len(self.index.get_words(DOCID)), 5)
+        self.assertEqual(len(self.index._wordinfo),
+                         self.index.wordCount())
+        for map in self.index._wordinfo.values():
+            self.assertEqual(len(map), 1)
+            self.assert_(map.has_key(DOCID))
+
+    def test_unindex_document(self):
+        DOCID = 1
+        self.test_index_document(DOCID)
+        self.index.unindex_doc(DOCID)
+        self.assertEqual(len(self.index._docweight), 0)
+        self.assertEqual(len(self.index._wordinfo), 0)
+        self.assertEqual(len(self.index._docwords), 0)
+        self.assertEqual(len(self.index._wordinfo),
+                         self.index.wordCount())
+
+    def test_index_two_documents(self):
+        self.test_index_document()
+        doc = "another document just four"
+        DOCID = 2
+        self.index.index_doc(DOCID, doc)
+        self.assert_(self.index._docweight[DOCID])
+        self.assertEqual(len(self.index._docweight), 2)
+        self.assertEqual(len(self.index._wordinfo), 8)
+        self.assertEqual(len(self.index._docwords), 2)
+        self.assertEqual(len(self.index.get_words(DOCID)), 4)
+        self.assertEqual(len(self.index._wordinfo),
+                         self.index.wordCount())
+        wids = self.lexicon.termToWordIds("document")
+        self.assertEqual(len(wids), 1)
+        document_wid = wids[0]
+        for wid, map in self.index._wordinfo.items():
+            if wid == document_wid:
+                self.assertEqual(len(map), 2)
+                self.assert_(map.has_key(1))
+                self.assert_(map.has_key(DOCID))
+            else:
+                self.assertEqual(len(map), 1)
+
+    def test_index_two_unindex_one(self):
+        # index two documents, unindex one, and test the results
+        self.test_index_two_documents()
+        self.index.unindex_doc(1)
+        DOCID = 2
+        self.assertEqual(len(self.index._docweight), 1)
+        self.assert_(self.index._docweight[DOCID])
+        self.assertEqual(len(self.index._wordinfo), 4)
+        self.assertEqual(len(self.index._docwords), 1)
+        self.assertEqual(len(self.index.get_words(DOCID)), 4)
+        self.assertEqual(len(self.index._wordinfo),
+                         self.index.wordCount())
+        for map in self.index._wordinfo.values():
+            self.assertEqual(len(map), 1)
+            self.assert_(map.has_key(DOCID))
+
+    def test_index_duplicated_words(self, DOCID=1):
+        doc = "very simple repeat repeat repeat document test"
+        self.index.index_doc(DOCID, doc)
+        self.assert_(self.index._docweight[DOCID])
+        self.assertEqual(len(self.index._wordinfo), 5)
+        self.assertEqual(len(self.index._docwords), 1)
+        self.assertEqual(len(self.index.get_words(DOCID)), 7)
+        self.assertEqual(len(self.index._wordinfo),
+                         self.index.wordCount())
+        wids = self.lexicon.termToWordIds("repeat")
+        self.assertEqual(len(wids), 1)
+        repetitive_wid = wids[0]
+        for wid, map in self.index._wordinfo.items():
+            self.assertEqual(len(map), 1)
+            self.assert_(map.has_key(DOCID))
+
+    def test_simple_query_oneresult(self):
+        self.index.index_doc(1, 'not the same document')
+        results = self.index.search("document")
+        self.assertEqual(list(results.keys()), [1])
+
+    def test_simple_query_noresults(self):
+        self.index.index_doc(1, 'not the same document')
+        results = self.index.search("frobnicate")
+        self.assertEqual(list(results.keys()), [])
+
+    def test_query_oneresult(self):
+        self.index.index_doc(1, 'not the same document')
+        self.index.index_doc(2, 'something about something else')
+        results = self.index.search("document")
+        self.assertEqual(list(results.keys()), [1])
+
+    def test_search_phrase(self):
+        self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
+        self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
+        results = self.index.search_phrase("quick brown fox")
+        self.assertEqual(list(results.keys()), [1])
+
+    def test_search_glob(self):
+        self.index.index_doc(1, "how now brown cow")
+        self.index.index_doc(2, "hough nough browne cough")
+        self.index.index_doc(3, "bar brawl")
+        results = self.index.search_glob("bro*")
+        self.assertEqual(list(results.keys()), [1, 2])
+        results = self.index.search_glob("b*")
+        self.assertEqual(list(results.keys()), [1, 2, 3])
+
+class CosineIndexTest(IndexTest):
+    IndexFactory = CosineIndex
+
+class OkapiIndexTest(IndexTest):
+    IndexFactory = OkapiIndex
+
+def test_suite():
+    return TestSuite((makeSuite(CosineIndexTest),
+                      makeSuite(OkapiIndexTest),
+                    ))
+
+if __name__=='__main__':
+    main(defaultTest='test_suite')


=== Zope3/src/zope/textindex/tests/test_lexicon.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_lexicon.py	Wed Dec 25 09:15:35 2002
@@ -0,0 +1,142 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+import sys
+from unittest import TestCase, TestSuite, main, makeSuite
+
+from zope.textindex.lexicon import Lexicon
+from zope.textindex.lexicon import Splitter, CaseNormalizer
+
+class StupidPipelineElement:
+    """Test pipeline element that maps one fixed word to another."""
+
+    def __init__(self, fromword, toword):
+        self.__fromword = fromword
+        self.__toword = toword
+
+    def process(self, seq):
+        # Replace every occurrence of fromword; pass other terms through.
+        res = []
+        for term in seq:
+            if term == self.__fromword:
+                res.append(self.__toword)
+            else:
+                res.append(term)
+        return res
+
+class WackyReversePipelineElement:
+    """Test pipeline element that reverses the characters of one word."""
+
+    def __init__(self, revword):
+        self.__revword = revword
+
+    def process(self, seq):
+        # Reverse only the designated word; pass other terms through.
+        res = []
+        for term in seq:
+            if term == self.__revword:
+                x = list(term)
+                x.reverse()
+                res.append(''.join(x))
+            else:
+                res.append(term)
+        return res
+
+class StopWordPipelineElement:
+    def __init__(self, stopdict={}):
+        self.__stopdict = stopdict
+
+    def process(self, seq):
+        res = []
+        for term in seq:
+            if self.__stopdict.get(term):
+                continue
+            else:
+                res.append(term)
+        return res
+
+
+class Test(TestCase):
+    """Tests for Lexicon word-id assignment and pipeline composition."""
+
+    def testSourceToWordIds(self):
+        # Word ids are assigned sequentially starting at 1.
+        lexicon = Lexicon(Splitter())
+        wids = lexicon.sourceToWordIds('cats and dogs')
+        self.assertEqual(wids, [1, 2, 3])
+
+    def testTermToWordIds(self):
+        lexicon = Lexicon(Splitter())
+        wids = lexicon.sourceToWordIds('cats and dogs')
+        wids = lexicon.termToWordIds('dogs')
+        self.assertEqual(wids, [3])
+
+    def testMissingTermToWordIds(self):
+        # Terms never seen by the lexicon map to wid 0.
+        lexicon = Lexicon(Splitter())
+        wids = lexicon.sourceToWordIds('cats and dogs')
+        wids = lexicon.termToWordIds('boxes')
+        self.assertEqual(wids, [0])
+
+    def testOnePipelineElement(self):
+        lexicon = Lexicon(Splitter(), StupidPipelineElement('dogs', 'fish'))
+        wids = lexicon.sourceToWordIds('cats and dogs')
+        wids = lexicon.termToWordIds('fish')
+        self.assertEqual(wids, [3])
+
+    def testSplitterAdaptorFold(self):
+        # With CaseNormalizer, 'CATS' and 'cats' share one word id.
+        lexicon = Lexicon(Splitter(), CaseNormalizer())
+        wids = lexicon.sourceToWordIds('CATS and dogs')
+        wids = lexicon.termToWordIds('cats and dogs')
+        self.assertEqual(wids, [1, 2, 3])
+
+    def testSplitterAdaptorNofold(self):
+        # Without case folding, lower-case 'cats' was never indexed -> 0.
+        lexicon = Lexicon(Splitter())
+        wids = lexicon.sourceToWordIds('CATS and dogs')
+        wids = lexicon.termToWordIds('cats and dogs')
+        self.assertEqual(wids, [0, 2, 3])
+
+    def testTwoElementPipeline(self):
+        # cats -> fish -> hsif; the stored wid is the original slot (1).
+        lexicon = Lexicon(Splitter(),
+                          StupidPipelineElement('cats', 'fish'),
+                          WackyReversePipelineElement('fish'))
+        wids = lexicon.sourceToWordIds('cats and dogs')
+        wids = lexicon.termToWordIds('hsif')
+        self.assertEqual(wids, [1])
+
+    def testThreeElementPipeline(self):
+        # 'and' is dropped first, then dogs -> fish -> hsif (wid 2).
+        lexicon = Lexicon(Splitter(),
+                          StopWordPipelineElement({'and':1}),
+                          StupidPipelineElement('dogs', 'fish'),
+                          WackyReversePipelineElement('fish'))
+        wids = lexicon.sourceToWordIds('cats and dogs')
+        wids = lexicon.termToWordIds('hsif')
+        self.assertEqual(wids, [2])
+
+    def testSplitterLocaleAwareness(self):
+        # Splitting of Latin-1 words must respect the active locale.
+        from zope.textindex.htmlsplitter import HTMLWordSplitter
+        import locale
+        loc = locale.setlocale(locale.LC_ALL) # get current locale
+        # set German locale
+        try:
+            if sys.platform != 'win32':
+                locale.setlocale(locale.LC_ALL, 'de_DE.ISO8859-1')
+            else:
+                locale.setlocale(locale.LC_ALL, 'German_Germany.1252')
+        except locale.Error:
+            return # This test doesn't work here :-(
+        expected = ['m\xfclltonne', 'waschb\xe4r',
+                    'beh\xf6rde', '\xfcberflieger']
+        words = [" ".join(expected)]
+        words = Splitter().process(words)
+        self.assertEqual(words, expected)
+        words = HTMLWordSplitter().process(words)
+        self.assertEqual(words, expected)
+        locale.setlocale(locale.LC_ALL, loc) # restore saved locale
+
+def test_suite():
+    # unittest boilerplate: entry point for the Zope test runner.
+    return makeSuite(Test)
+
+if __name__=='__main__':
+    main(defaultTest='test_suite')


=== Zope3/src/zope/textindex/tests/test_nbest.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_nbest.py	Wed Dec 25 09:15:35 2002
@@ -0,0 +1,97 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+from unittest import TestCase, TestSuite, main, makeSuite
+
+from zope.textindex.nbest import NBest
+
+class NBestTest(TestCase):
+    """Tests for the NBest "best n scores" accumulator."""
+
+    def testConstructor(self):
+        # Capacity must be at least 1.
+        self.assertRaises(ValueError, NBest, 0)
+        self.assertRaises(ValueError, NBest, -1)
+
+        for n in range(1, 11):
+            nb = NBest(n)
+            self.assertEqual(len(nb), 0)
+            self.assertEqual(nb.capacity(), n)
+
+    def testOne(self):
+        # With capacity 1, only the highest-scoring item survives.
+        nb = NBest(1)
+        nb.add('a', 0)
+        self.assertEqual(nb.getbest(), [('a', 0)])
+
+        nb.add('b', 1)
+        self.assertEqual(len(nb), 1)
+        self.assertEqual(nb.capacity(), 1)
+        self.assertEqual(nb.getbest(), [('b', 1)])
+
+        nb.add('c', -1)
+        self.assertEqual(len(nb), 1)
+        self.assertEqual(nb.capacity(), 1)
+        self.assertEqual(nb.getbest(), [('b', 1)])
+
+        nb.addmany([('d', 3), ('e', -6), ('f', 5), ('g', 4)])
+        self.assertEqual(len(nb), 1)
+        self.assertEqual(nb.capacity(), 1)
+        self.assertEqual(nb.getbest(), [('f', 5)])
+
+    def testMany(self):
+        import random
+        inputs = [(-i, i) for i in range(50)]
+
+        reversed_inputs = inputs[:]
+        reversed_inputs.reverse()
+
+        # Test the N-best for a variety of n (1, 6, 11, ... 50).
+        for n in range(1, len(inputs)+1, 5):
+            expected = inputs[-n:]
+            expected.reverse()
+
+            random_inputs = inputs[:]
+            random.shuffle(random_inputs)
+
+            # The result must not depend on insertion order.
+            for source in inputs, reversed_inputs, random_inputs:
+                # Try feeding them one at a time.
+                nb = NBest(n)
+                for item, score in source:
+                    nb.add(item, score)
+                self.assertEqual(len(nb), n)
+                self.assertEqual(nb.capacity(), n)
+                self.assertEqual(nb.getbest(), expected)
+
+                # And again in one gulp.
+                nb = NBest(n)
+                nb.addmany(source)
+                self.assertEqual(len(nb), n)
+                self.assertEqual(nb.capacity(), n)
+                self.assertEqual(nb.getbest(), expected)
+
+                # pop_smallest drains in ascending order, then raises.
+                for i in range(1, n+1):
+                    self.assertEqual(nb.pop_smallest(), expected[-i])
+                self.assertRaises(IndexError, nb.pop_smallest)
+
+    def testAllSameScore(self):
+        # On score ties the earliest-added items are the ones kept.
+        inputs = [(i, 0) for i in range(10)]
+        for n in range(1, 12):
+            nb = NBest(n)
+            nb.addmany(inputs)
+            outputs = nb.getbest()
+            self.assertEqual(outputs, inputs[:len(outputs)])
+
+def test_suite():
+    # unittest boilerplate: entry point for the Zope test runner.
+    return makeSuite(NBestTest)
+
+if __name__=='__main__':
+    main(defaultTest='test_suite')


=== Zope3/src/zope/textindex/tests/test_pipelinefactory.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_pipelinefactory.py	Wed Dec 25 09:15:35 2002
@@ -0,0 +1,50 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+from unittest import TestCase, TestSuite, main, makeSuite
+from zope.textindex.ipipelineelement import IPipelineElement
+from zope.textindex.pipelinefactory import PipelineElementFactory
+
+class NullPipelineElement:
+
+    __implements__ = IPipelineElement
+
+    def process(source):
+        pass
+
+class PipelineFactoryTest(TestCase):
+    """Tests for PipelineElementFactory registration and lookup."""
+
+    def setUp(self):
+        self.huey = NullPipelineElement()
+        self.dooey = NullPipelineElement()
+        self.louie = NullPipelineElement()
+        self.daffy = NullPipelineElement()
+
+    def testPipeline(self):
+        pf = PipelineElementFactory()
+        pf.registerFactory('donald', 'huey', self.huey)
+        pf.registerFactory('donald', 'dooey',  self.dooey)
+        pf.registerFactory('donald', 'louie', self.louie)
+        pf.registerFactory('looney', 'daffy', self.daffy)
+        # Registering the same (group, name) pair twice must fail.
+        self.assertRaises(ValueError, pf.registerFactory,'donald',  'huey',
+                          self.huey)
+        # Group and name listings come back sorted, not in
+        # registration order.
+        self.assertEqual(pf.getFactoryGroups(), ['donald', 'looney'])
+        self.assertEqual(pf.getFactoryNames('donald'),
+                         ['dooey', 'huey', 'louie'])
+
+def test_suite():
+    # unittest boilerplate: entry point for the Zope test runner.
+    return makeSuite(PipelineFactoryTest)
+
+if __name__=='__main__':
+    main(defaultTest='test_suite')


=== Zope3/src/zope/textindex/tests/test_queryengine.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_queryengine.py	Wed Dec 25 09:15:35 2002
@@ -0,0 +1,72 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+import unittest
+
+from zodb.btrees.IIBTree import IIBucket
+
+from zope.textindex.queryparser import QueryParser
+from zope.textindex.parsetree import ParseError, QueryError
+from zope.textindex.lexicon import Lexicon, Splitter
+
+class FauxIndex:
+    """Stub index with fixed postings for the terms foo, bar, and ham.
+
+    Unknown terms return an empty bucket.
+    """
+
+    def search(self, term):
+        b = IIBucket()
+        if term == "foo":
+            b[1] = b[3] = 1
+        elif term == "bar":
+            b[1] = b[2] = 1
+        elif term == "ham":
+            b[1] = b[2] = b[3] = b[4] = 1
+        return b
+
+class TestQueryEngine(unittest.TestCase):
+    """Execute parsed query trees against FauxIndex and verify the
+    combined docid -> weight mappings."""
+
+    def setUp(self):
+        self.lexicon = Lexicon(Splitter())
+        self.parser = QueryParser(self.lexicon)
+        self.index = FauxIndex()
+
+    def compareSet(self, set, dict):
+        # Convert the IIBucket-like result to a plain dict to compare.
+        d = {}
+        for k, v in set.items():
+            d[k] = v
+        self.assertEqual(d, dict)
+
+    def compareQuery(self, query, dict):
+        tree = self.parser.parseQuery(query)
+        set = tree.executeQuery(self.index)
+        self.compareSet(set, dict)
+
+    def testExecuteQuery(self):
+        # Weights accumulate across subterms: doc 1 matches both foo
+        # and bar, so "foo AND bar" gives it weight 2.
+        self.compareQuery("foo AND bar", {1: 2})
+        self.compareQuery("foo OR bar", {1: 2, 2: 1, 3:1})
+        self.compareQuery("foo AND NOT bar", {3: 1})
+        self.compareQuery("foo AND foo AND foo", {1: 3, 3: 3})
+        self.compareQuery("foo OR foo OR foo", {1: 3, 3: 3})
+        self.compareQuery("ham AND NOT foo AND NOT bar", {4: 1})
+        self.compareQuery("ham OR foo OR bar", {1: 3, 2: 2, 3: 2, 4: 1})
+        self.compareQuery("ham AND foo AND bar", {1: 3})
+
+    def testInvalidQuery(self):
+        # A bare NOT node cannot be executed on its own.
+        from zope.textindex.parsetree import NotNode, AtomNode
+        tree = NotNode(AtomNode("foo"))
+        self.assertRaises(QueryError, tree.executeQuery, self.index)
+
+def test_suite():
+    # unittest boilerplate: entry point for the Zope test runner.
+    return unittest.makeSuite(TestQueryEngine)
+
+if __name__=='__main__':
+    unittest.main(defaultTest='test_suite')


=== Zope3/src/zope/textindex/tests/test_queryparser.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_queryparser.py	Wed Dec 25 09:15:35 2002
@@ -0,0 +1,297 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+from unittest import TestCase, TestSuite, main, makeSuite
+
+from zope.interface.verify import verifyClass
+
+from zope.textindex.iqueryparser import IQueryParser
+from zope.textindex.iqueryparsetree import IQueryParseTree
+
+from zope.textindex.queryparser import QueryParser
+from zope.textindex.parsetree import ParseError, ParseTreeNode
+from zope.textindex.parsetree import OrNode, AndNode, NotNode
+from zope.textindex.parsetree import AtomNode, PhraseNode, GlobNode
+from zope.textindex.lexicon import Lexicon, Splitter
+
+
+class TestInterfaces(TestCase):
+    """The parser and every node class must satisfy their interfaces."""
+
+    def testInterfaces(self):
+        verifyClass(IQueryParser, QueryParser)
+        verifyClass(IQueryParseTree, ParseTreeNode)
+        verifyClass(IQueryParseTree, OrNode)
+        verifyClass(IQueryParseTree, AndNode)
+        verifyClass(IQueryParseTree, NotNode)
+        verifyClass(IQueryParseTree, AtomNode)
+        verifyClass(IQueryParseTree, PhraseNode)
+        verifyClass(IQueryParseTree, GlobNode)
+
+
+class TestQueryParserBase(TestCase):
+    """Shared helpers for the query parser test cases."""
+
+    def setUp(self):
+        self.lexicon = Lexicon(Splitter())
+        self.parser = QueryParser(self.lexicon)
+
+    def expect(self, input, output, expected_ignored=[]):
+        # Parse ``input`` and verify the resulting tree and the list of
+        # ignored (stop) words.  (The [] default is safe here: it is
+        # only compared against, never mutated.)
+        tree = self.parser.parseQuery(input)
+        ignored = self.parser.getIgnored()
+        self.compareParseTrees(tree, output)
+        self.assertEqual(ignored, expected_ignored)
+        # Check that parseQueryEx() == (parseQuery(), getIgnored())
+        ex_tree, ex_ignored = self.parser.parseQueryEx(input)
+        self.compareParseTrees(ex_tree, tree)
+        self.assertEqual(ex_ignored, expected_ignored)
+
+    def failure(self, input):
+        # ``input`` must be rejected by both parsing entry points.
+        self.assertRaises(ParseError, self.parser.parseQuery, input)
+        self.assertRaises(ParseError, self.parser.parseQueryEx, input)
+
+    def compareParseTrees(self, got, expected, msg=None):
+        # Recursively compare node class, nodeType(), and value.
+        if msg is None:
+            msg = repr(got)
+        self.assertEqual(isinstance(got, ParseTreeNode), 1)
+        self.assertEqual(got.__class__, expected.__class__, msg)
+        if isinstance(got, PhraseNode):
+            self.assertEqual(got.nodeType(), "PHRASE", msg)
+            self.assertEqual(got.getValue(), expected.getValue(), msg)
+        elif isinstance(got, GlobNode):
+            self.assertEqual(got.nodeType(), "GLOB", msg)
+            self.assertEqual(got.getValue(), expected.getValue(), msg)
+        elif isinstance(got, AtomNode):
+            self.assertEqual(got.nodeType(), "ATOM", msg)
+            self.assertEqual(got.getValue(), expected.getValue(), msg)
+        elif isinstance(got, NotNode):
+            self.assertEqual(got.nodeType(), "NOT")
+            self.compareParseTrees(got.getValue(), expected.getValue(), msg)
+        elif isinstance(got, AndNode) or isinstance(got, OrNode):
+            self.assertEqual(got.nodeType(),
+                             isinstance(got, AndNode) and "AND" or "OR", msg)
+            list1 = got.getValue()
+            list2 = expected.getValue()
+            self.assertEqual(len(list1), len(list2), msg)
+            for i in range(len(list1)):
+                self.compareParseTrees(list1[i], list2[i], msg)
+
+
+class TestQueryParser(TestQueryParserBase):
+    """Parser tests with no stop words configured.
+
+    test0xx cases must parse to the given tree; test1xx cases must
+    raise ParseError.
+    """
+
+    def test001(self):
+        self.expect("foo", AtomNode("foo"))
+
+    def test002(self):
+        # 'note' starts with "not" but is an ordinary atom.
+        self.expect("note", AtomNode("note"))
+
+    def test003(self):
+        # Operator keywords are case-insensitive.
+        self.expect("aa and bb AND cc",
+                    AndNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
+
+    def test004(self):
+        self.expect("aa OR bb or cc",
+                    OrNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
+
+    def test005(self):
+        # AND binds tighter than OR.
+        self.expect("aa AND bb OR cc AnD dd",
+                    OrNode([AndNode([AtomNode("aa"), AtomNode("bb")]),
+                            AndNode([AtomNode("cc"), AtomNode("dd")])]))
+
+    def test006(self):
+        self.expect("(aa OR bb) AND (cc OR dd)",
+                    AndNode([OrNode([AtomNode("aa"), AtomNode("bb")]),
+                             OrNode([AtomNode("cc"), AtomNode("dd")])]))
+
+    def test007(self):
+        self.expect("aa AND NOT bb",
+                    AndNode([AtomNode("aa"), NotNode(AtomNode("bb"))]))
+
+    def test010(self):
+        # Double quotes produce a phrase.
+        self.expect('"foo bar"', PhraseNode(["foo", "bar"]))
+
+    def test011(self):
+        # Juxtaposed words are an implicit AND.
+        self.expect("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")]))
+
+    def test012(self):
+        self.expect('(("foo bar"))"', PhraseNode(["foo", "bar"]))
+
+    def test013(self):
+        self.expect("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")]))
+
+    def test014(self):
+        # Hyphenated words are treated as a phrase.
+        self.expect("foo-bar", PhraseNode(["foo", "bar"]))
+
+    def test015(self):
+        # A leading '-' negates the term.
+        self.expect("foo -bar", AndNode([AtomNode("foo"),
+                                         NotNode(AtomNode("bar"))]))
+
+    def test016(self):
+        self.expect("-foo bar", AndNode([AtomNode("bar"),
+                                         NotNode(AtomNode("foo"))]))
+
+    def test017(self):
+        self.expect("booh -foo-bar",
+                    AndNode([AtomNode("booh"),
+                             NotNode(PhraseNode(["foo", "bar"]))]))
+
+    def test018(self):
+        self.expect('booh -"foo bar"',
+                    AndNode([AtomNode("booh"),
+                             NotNode(PhraseNode(["foo", "bar"]))]))
+
+    def test019(self):
+        self.expect('foo"bar"',
+                    AndNode([AtomNode("foo"), AtomNode("bar")]))
+
+    def test020(self):
+        self.expect('"foo"bar',
+                    AndNode([AtomNode("foo"), AtomNode("bar")]))
+
+    def test021(self):
+        self.expect('foo"bar"blech',
+                    AndNode([AtomNode("foo"), AtomNode("bar"),
+                             AtomNode("blech")]))
+
+    def test022(self):
+        self.expect("foo*", GlobNode("foo*"))
+
+    def test023(self):
+        self.expect("foo* bar", AndNode([GlobNode("foo*"),
+                                         AtomNode("bar")]))
+
+    # All of the following queries are syntax errors.
+
+    def test101(self):
+        self.failure("")
+
+    def test102(self):
+        self.failure("not")
+
+    def test103(self):
+        self.failure("or")
+
+    def test104(self):
+        self.failure("and")
+
+    def test105(self):
+        self.failure("NOT")
+
+    def test106(self):
+        self.failure("OR")
+
+    def test107(self):
+        self.failure("AND")
+
+    def test108(self):
+        # NOT must be attached to a positive term.
+        self.failure("NOT foo")
+
+    def test109(self):
+        self.failure(")")
+
+    def test110(self):
+        self.failure("(")
+
+    def test111(self):
+        self.failure("foo OR")
+
+    def test112(self):
+        self.failure("foo AND")
+
+    def test113(self):
+        self.failure("OR foo")
+
+    def test114(self):
+        self.failure("AND foo")
+
+    def test115(self):
+        self.failure("(foo) bar")
+
+    def test116(self):
+        self.failure("(foo OR)")
+
+    def test117(self):
+        self.failure("(foo AND)")
+
+    def test118(self):
+        self.failure("(NOT foo)")
+
+    def test119(self):
+        self.failure("-foo")
+
+    def test120(self):
+        self.failure("-foo -bar")
+
+    def test121(self):
+        self.failure("foo OR -bar")
+
+    def test122(self):
+        self.failure("foo AND -bar")
+
+
+class StopWordTestQueryParser(TestQueryParserBase):
+    """Parser tests with a lexicon that removes the word 'stop'.
+
+    test2xx cases parse with 'stop' reported as ignored; test3xx cases
+    fail because nothing indexable remains.
+    """
+
+    def setUp(self):
+        # Only 'stop' is a stopword (but 'and' is still an operator)
+        self.lexicon = Lexicon(Splitter(), FakeStopWordRemover())
+        self.parser = QueryParser(self.lexicon)
+
+    def test201(self):
+        self.expect('and/', AtomNode("and"))
+
+    def test202(self):
+        self.expect('foo AND stop', AtomNode("foo"), ["stop"])
+
+    def test203(self):
+        self.expect('foo AND NOT stop', AtomNode("foo"), ["stop"])
+
+    def test204(self):
+        self.expect('stop AND foo', AtomNode("foo"), ["stop"])
+
+    def test205(self):
+        self.expect('foo OR stop', AtomNode("foo"), ["stop"])
+
+    def test206(self):
+        self.expect('stop OR foo', AtomNode("foo"), ["stop"])
+
+    def test301(self):
+        self.failure('stop')
+
+    def test302(self):
+        self.failure('stop stop')
+
+    def test303(self):
+        self.failure('stop AND stop')
+
+    def test304(self):
+        self.failure('stop OR stop')
+
+    def test305(self):
+        self.failure('stop -foo')
+
+    def test306(self):
+        self.failure('stop AND NOT foo')
+
+
+class FakeStopWordRemover:
+    """Pipeline element that removes the literal word 'stop'."""
+
+    def process(self, list):
+        # NOTE(review): the parameter shadows the builtin ``list``;
+        # left unchanged to preserve the published signature.
+        return [word for word in list if word != "stop"]
+
+
+def test_suite():
+    # unittest boilerplate: entry point for the Zope test runner.
+    return TestSuite((makeSuite(TestQueryParser),
+                      makeSuite(StopWordTestQueryParser),
+                      makeSuite(TestInterfaces),
+                    ))
+
+
+if __name__=="__main__":
+    main(defaultTest='test_suite')


=== Zope3/src/zope/textindex/tests/test_setops.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_setops.py	Wed Dec 25 09:15:35 2002
@@ -0,0 +1,135 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+from unittest import TestCase, TestSuite, main, makeSuite
+
+from zodb.btrees.IIBTree import IIBTree, IIBucket
+
+from zope.textindex.setops import mass_weightedIntersection
+from zope.textindex.setops import mass_weightedUnion
+
+class TestSetOps(TestCase):
+    """Tests for the weighted union/intersection set operations."""
+
+    def testEmptyLists(self):
+        # No input sets -> empty result.
+        self.assertEqual(len(mass_weightedIntersection([])), 0)
+        self.assertEqual(len(mass_weightedUnion([])), 0)
+
+    def testIdentity(self):
+        # A single set with weight 1 passes through unchanged.
+        t = IIBTree([(1, 2)])
+        b = IIBucket([(1, 2)])
+        for x in t, b:
+            for func in mass_weightedUnion, mass_weightedIntersection:
+                result = func([(x, 1)])
+                self.assertEqual(len(result), 1)
+                self.assertEqual(list(result.items()), list(x.items()))
+
+    def testScalarMultiply(self):
+        # A single set with weight w scales every value by w.
+        t = IIBTree([(1, 2), (2, 3), (3, 4)])
+        allkeys = [1, 2, 3]
+        b = IIBucket(t)
+        for x in t, b:
+            self.assertEqual(list(x.keys()), allkeys)
+            for func in mass_weightedUnion, mass_weightedIntersection:
+                for factor in 0, 1, 5, 10:
+                    result = func([(x, factor)])
+                    self.assertEqual(allkeys, list(result.keys()))
+                    for key in x.keys():
+                        self.assertEqual(x[key] * factor, result[key])
+
+    def testPairs(self):
+        # Two-input combinations, checked against brute-force expected
+        # values for every container type, order, and weight mix.
+        t1 = IIBTree([(1, 10), (3, 30), (7, 70)])
+        t2 = IIBTree([(3, 30), (5, 50), (7, 7), (9, 90)])
+        allkeys = [1, 3, 5, 7, 9]
+        b1 = IIBucket(t1)
+        b2 = IIBucket(t2)
+        for x in t1, t2, b1, b2:
+            for key in x.keys():
+                self.assertEqual(key in allkeys, 1)
+            for y in t1, t2, b1, b2:
+                for w1, w2 in (0, 0), (1, 10), (10, 1), (2, 3):
+                    # Test the union.
+                    expected = []
+                    for key in allkeys:
+                        if x.has_key(key) or y.has_key(key):
+                            result = x.get(key, 0) * w1 + y.get(key, 0) * w2
+                            expected.append((key, result))
+                    expected.sort()
+                    got = mass_weightedUnion([(x, w1), (y, w2)])
+                    self.assertEqual(expected, list(got.items()))
+                    got = mass_weightedUnion([(y, w2), (x, w1)])
+                    self.assertEqual(expected, list(got.items()))
+
+                    # Test the intersection.
+                    expected = []
+                    for key in allkeys:
+                        if x.has_key(key) and y.has_key(key):
+                            result = x[key] * w1 + y[key] * w2
+                            expected.append((key, result))
+                    expected.sort()
+                    got = mass_weightedIntersection([(x, w1), (y, w2)])
+                    self.assertEqual(expected, list(got.items()))
+                    got = mass_weightedIntersection([(y, w2), (x, w1)])
+                    self.assertEqual(expected, list(got.items()))
+
+    def testMany(self):
+        # Many overlapping inputs, fed in shuffled order.
+        import random
+        N = 15  # number of IIBTrees to feed in
+        L = []
+        commonkey = N * 1000
+        allkeys = {commonkey: 1}
+        for i in range(N):
+            t = IIBTree()
+            t[commonkey] = i
+            for j in range(N-i):
+                key = i + j
+                allkeys[key] = 1
+                t[key] = N*i + j
+            L.append((t, i+1))
+        random.shuffle(L)
+        allkeys = allkeys.keys()
+        allkeys.sort()
+
+        # Test the union.
+        expected = []
+        for key in allkeys:
+            sum = 0
+            for t, w in L:
+                if t.has_key(key):
+                    sum += t[key] * w
+            expected.append((key, sum))
+        # print 'union', expected
+        got = mass_weightedUnion(L)
+        self.assertEqual(expected, list(got.items()))
+
+        # Test the intersection.
+        expected = []
+        for key in allkeys:
+            sum = 0
+            for t, w in L:
+                if t.has_key(key):
+                    sum += t[key] * w
+                else:
+                    break
+            else:
+                # We didn't break out of the loop so it's in the intersection.
+                expected.append((key, sum))
+        # print 'intersection', expected
+        got = mass_weightedIntersection(L)
+        self.assertEqual(expected, list(got.items()))
+
+def test_suite():
+    # unittest boilerplate: entry point for the Zope test runner.
+    return makeSuite(TestSetOps)
+
+if __name__=="__main__":
+    main(defaultTest='test_suite')


=== Zope3/src/zope/textindex/tests/test_textindexwrapper.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:06 2002
+++ Zope3/src/zope/textindex/tests/test_textindexwrapper.py	Wed Dec 25 09:15:35 2002
@@ -0,0 +1,131 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Unit tests for TextIndexWrapper.
+
+$Id$
+"""
+
+import unittest
+
+from zope.textindex.textindexwrapper import TextIndexWrapper
+from zope.textindex import parsetree
+
+class TextIndexWrapperTest(unittest.TestCase):
+    """Tests for TextIndexWrapper querying, batching, and Unicode."""
+
+    def setUp(self):
+        # Two documents, ids 1000 and 1001.
+        w = TextIndexWrapper()
+        doc = u"the quick brown fox jumps over the lazy dog"
+        w.index_doc(1000, [doc])
+        doc = u"the brown fox and the yellow fox don't need the retriever"
+        w.index_doc(1001, [doc])
+        self.wrapper = w
+
+    def testCounts(self):
+        # documentCount/wordCount track additional indexing.
+        w = self.wrapper
+        self.assertEqual(self.wrapper.documentCount(), 2)
+        self.assertEqual(self.wrapper.wordCount(), 12)
+        doc = u"foo bar"
+        w.index_doc(1002, [doc])
+        self.assertEqual(self.wrapper.documentCount(), 3)
+        self.assertEqual(self.wrapper.wordCount(), 14)
+
+    def testOne(self):
+        matches, total = self.wrapper.query(u"quick fox", 0, 10)
+        self.assertEqual(total, 1)
+        [(docid, rank)] = matches # if this fails there's a problem
+        self.assertEqual(docid, 1000)
+
+    def testDefaultBatch(self):
+        # An omitted batch size returns everything; a nonzero start
+        # skips that many hits.
+        matches, total = self.wrapper.query(u"fox", 0)
+        self.assertEqual(total, 2)
+        self.assertEqual(len(matches), 2)
+        matches, total = self.wrapper.query(u"fox")
+        self.assertEqual(total, 2)
+        self.assertEqual(len(matches), 2)
+        matches, total = self.wrapper.query(u" fox", 1)
+        self.assertEqual(total, 2)
+        self.assertEqual(len(matches), 1)
+
+    def testGlobbing(self):
+        matches, total = self.wrapper.query("fo*")
+        self.assertEqual(total, 2)
+        self.assertEqual(len(matches), 2)
+
+    def testLatin1(self):
+        # Non-ASCII Latin-1 text must round-trip through the index.
+        w = self.wrapper
+        doc = u"Fran\xe7ois"
+        w.index_doc(1002, [doc])
+        matches, total = self.wrapper.query(doc, 0, 10)
+        self.assertEqual(total, 1)
+        [(docid, rank)] = matches # if this fails there's a problem
+        self.assertEqual(docid, 1002)
+
+    def testUnicode(self):
+        w = self.wrapper
+        # Verbose, but easy to debug
+        delta  = u"\N{GREEK SMALL LETTER DELTA}"
+        delta += u"\N{GREEK SMALL LETTER EPSILON}"
+        delta += u"\N{GREEK SMALL LETTER LAMDA}"
+        delta += u"\N{GREEK SMALL LETTER TAU}"
+        delta += u"\N{GREEK SMALL LETTER ALPHA}"
+        assert delta.islower()
+        emdash = u"\N{EM DASH}"
+        assert not emdash.isalnum()
+        alpha  = u"\N{GREEK SMALL LETTER ALPHA}"
+        assert alpha.islower()
+        lamda  = u"\N{GREEK SMALL LETTER LAMDA}"
+        lamda += u"\N{GREEK SMALL LETTER ALPHA}"
+        assert lamda.islower()
+        doc = delta + emdash + alpha
+        w.index_doc(1002, [doc])
+        for word in delta, alpha:
+            matches, total = self.wrapper.query(word, 0, 10)
+            self.assertEqual(total, 1)
+            [(docid, rank)] = matches # if this fails there's a problem
+            self.assertEqual(docid, 1002)
+        # A query of nothing but punctuation cannot be parsed.
+        self.assertRaises(parsetree.ParseError,
+                          self.wrapper.query, emdash, 0, 10)
+        matches, total = self.wrapper.query(lamda, 0, 10)
+        self.assertEqual(total, 0)
+
+    def testNone(self):
+        matches, total = self.wrapper.query(u"dalmatian", 0, 10)
+        self.assertEqual(total, 0)
+        self.assertEqual(len(matches), 0)
+
+    def testAll(self):
+        matches, total = self.wrapper.query(u"brown fox", 0, 10)
+        self.assertEqual(total, 2)
+        self.assertEqual(len(matches), 2)
+        matches.sort()
+        self.assertEqual(matches[0][0], 1000)
+        self.assertEqual(matches[1][0], 1001)
+
+    def testBatching(self):
+        # Two batches of one must together cover both matches.
+        matches1, total = self.wrapper.query(u"brown fox", 0, 1)
+        self.assertEqual(total, 2)
+        self.assertEqual(len(matches1), 1)
+        matches2, total = self.wrapper.query(u"brown fox", 1, 1)
+        self.assertEqual(total, 2)
+        self.assertEqual(len(matches2), 1)
+        matches = matches1 + matches2
+        matches.sort()
+        self.assertEqual(matches[0][0], 1000)
+        self.assertEqual(matches[1][0], 1001)
+
+def test_suite():
+    # unittest boilerplate: entry point for the Zope test runner.
+    return unittest.makeSuite(TextIndexWrapperTest)
+
+if __name__=='__main__':
+    unittest.main(defaultTest='test_suite')


=== Zope3/src/zope/textindex/tests/wordstats.py 1.1 => 1.2 ===
--- /dev/null	Wed Dec 25 09:16:07 2002
+++ Zope3/src/zope/textindex/tests/wordstats.py	Wed Dec 25 09:15:35 2002
@@ -0,0 +1,45 @@
+#! /usr/bin/env python
+"""Dump statistics about each word in the index.
+
+usage: wordstats.py data.fs [index key]
+"""
+
+import zodb
+from zodb.storage.file import FileStorage
+
+def main(fspath, key):
+    fs = FileStorage(fspath, read_only=1)
+    db = ZODB.DB(fs)
+    rt = db.open().root()
+    index = rt[key]
+
+    lex = index.lexicon
+    idx = index.index
+    print "Words", lex.length()
+    print "Documents", idx.length()
+
+    print "Word frequencies: count, word, wid"
+    for word, wid in lex.items():
+        docs = idx._wordinfo[wid]
+        print len(docs), word, wid
+
+    print "Per-doc scores: wid, (doc, score,)+"
+    for wid in lex.wids():
+        print wid,
+        docs = idx._wordinfo[wid]
+        for docid, score in docs.items():
+            print docid, score,
+        print
+
+if __name__ == "__main__":
+    import sys
+
+    args = sys.argv[1:]
+    index_key = "index"
+    if len(args) == 1:
+        fspath = args[0]
+    elif len(args) == 2:
+        fspath, index_key = args
+    else:
+        print "Expected 1 or 2 args, got", len(args)
+    main(fspath, index_key)