[Zope3-checkins]
SVN: Zope3/trunk/src/zope/index/text/tests/mailtest.py
This utility was never properly ported and could not have worked. I [...]
Stephan Richter
srichter at cosmos.phy.tufts.edu
Tue Jul 13 12:33:41 EDT 2004
Log message for revision 26456:
This utility was never properly ported and could not have worked. I am
removing it now, until someone really needs it and therefore fixes it.
Changed:
D Zope3/trunk/src/zope/index/text/tests/mailtest.py
-=-
Deleted: Zope3/trunk/src/zope/index/text/tests/mailtest.py
===================================================================
--- Zope3/trunk/src/zope/index/text/tests/mailtest.py 2004-07-13 16:32:34 UTC (rev 26455)
+++ Zope3/trunk/src/zope/index/text/tests/mailtest.py 2004-07-13 16:33:41 UTC (rev 26456)
@@ -1,301 +0,0 @@
-##############################################################################
-#
-# Copyright (c) 2003 Zope Corporation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-"""Test an index with a Unix mailbox file.
-
-usage: python mailtest.py [options] <data.fs>
-
-options:
- -v -- verbose
-
- Index Generation
- -i mailbox
- -n NNN -- max number of messages to read from mailbox
- -t NNN -- commit a transaction every NNN messages (default: 1)
- -p NNN -- pack <data.fs> every NNN messages (default: 500), and at end
- -p 0 -- don't pack at all
- -x -- exclude the message text from the data.fs
-
- Queries
- -q query
- -b NNN -- return the NNN best matches (default: 10)
- -c NNN -- context; if -v, show the first NNN lines of results (default: 5)
-
-The script either indexes or queries depending on whether -q or -i is
-passed as an option.
-
-For -i mailbox, the script reads mail messages from the mailbox and
-indexes them. It indexes one message at a time, then commits the
-transaction.
-
-For -q query, it performs a query on an existing index.
-
-If both are specified, the index is performed first.
-
-You can also interact with the index after it is completed. Load the
-index from the database:
-
- import ZODB
- from ZODB.Storage.FileStorage import FileStorage
- fs = FileStorage(<data.fs>)
- db = ZODB.DB(fs)
- index = cn.open().root()["index"]
- index.search("python AND unicode")
-"""
-
-from zope.index.text.lexicon import \
- Lexicon, CaseNormalizer, Splitter, StopWordRemover
-
-# XXX This import is bad, and was so before the renaming
-from zope.index.text.zctextindex import ZCTextIndex
-
-from BTrees.IOBTree import IOBTree
-from zope.index.text.queryparser import QueryParser
-
-import sys
-import mailbox
-import time
-
-def usage(msg):
- print msg
- print __doc__
- sys.exit(2)
-
-class Message:
-
- total_bytes = 0
-
- def __init__(self, msg):
- subject = msg.getheader('subject', '')
- author = msg.getheader('from', '')
- if author:
- summary = "%s (%s)\n" % (subject, author)
- else:
- summary = "%s\n" % subject
- self.text = summary + msg.fp.read()
- Message.total_bytes += len(self.text)
-
-class Extra:
- pass
-
-def index(rt, mboxfile, db, profiler):
- global NUM
- idx_time = 0
- pack_time = 0
- start_time = time.time()
-
- lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
- extra = Extra()
- extra.lexicon_id = 'lexicon'
- extra.doc_attr = 'text'
- extra.index_type = 'Okapi BM25 Rank'
- caller = Extra()
- caller.lexicon = lexicon
- rt["index"] = idx = ZCTextIndex("index", extra, caller)
- if not EXCLUDE_TEXT:
- rt["documents"] = docs = IOBTree()
- else:
- docs = None
- get_transaction().commit()
-
- mbox = mailbox.UnixMailbox(open(mboxfile, 'rb'))
- if VERBOSE:
- print "opened", mboxfile
- if not NUM:
- NUM = sys.maxint
-
- if profiler:
- itime, ptime, i = profiler.runcall(indexmbox, mbox, idx, docs, db)
- else:
- itime, ptime, i = indexmbox(mbox, idx, docs, db)
- idx_time += itime
- pack_time += ptime
-
- get_transaction().commit()
-
- if PACK_INTERVAL and i % PACK_INTERVAL != 0:
- if VERBOSE >= 2:
- print "packing one last time..."
- p0 = time.clock()
- db.pack(time.time())
- p1 = time.clock()
- if VERBOSE:
- print "pack took %s sec" % (p1 - p0)
- pack_time += p1 - p0
-
- if VERBOSE:
- finish_time = time.time()
- print
- print "Index time", round(idx_time / 60, 3), "minutes"
- print "Pack time", round(pack_time / 60, 3), "minutes"
- print "Index bytes", Message.total_bytes
- rate = (Message.total_bytes / idx_time) / 1024
- print "Index rate %.2f KB/sec" % rate
- print "Indexing began", time.ctime(start_time)
- print "Indexing ended", time.ctime(finish_time)
- print "Wall clock minutes", round((finish_time - start_time)/60, 3)
-
-def indexmbox(mbox, idx, docs, db):
- idx_time = 0
- pack_time = 0
- i = 0
- while i < NUM:
- _msg = mbox.next()
- if _msg is None:
- break
- i += 1
- msg = Message(_msg)
- if VERBOSE >= 2:
- print "indexing msg", i
- i0 = time.clock()
- idx.index_object(i, msg)
- if not EXCLUDE_TEXT:
- docs[i] = msg
- if i % TXN_SIZE == 0:
- get_transaction().commit()
- i1 = time.clock()
- idx_time += i1 - i0
- if VERBOSE and i % 50 == 0:
- print i, "messages indexed"
- print "cache size", db.cacheSize()
- if PACK_INTERVAL and i % PACK_INTERVAL == 0:
- if VERBOSE >= 2:
- print "packing..."
- p0 = time.clock()
- db.pack(time.time())
- p1 = time.clock()
- if VERBOSE:
- print "pack took %s sec" % (p1 - p0)
- pack_time += p1 - p0
- return idx_time, pack_time, i
-
-
-def query(rt, query_str, profiler):
- idx = rt["index"]
- docs = rt["documents"]
-
- start = time.clock()
- if profiler is None:
- results, num_results = idx.query(query_str, BEST)
- else:
- if WARM_CACHE:
- print "Warming the cache..."
- idx.query(query_str, BEST)
- start = time.clock()
- results, num_results = profiler.runcall(idx.query, query_str, BEST)
- elapsed = time.clock() - start
-
- print "query:", query_str
- print "# results:", len(results), "of", num_results, \
- "in %.2f ms" % (elapsed * 1000)
-
- tree = QueryParser(idx.lexicon).parseQuery(query_str)
- qw = idx.index.query_weight(tree.terms())
-
- for docid, score in results:
- scaled = 100.0 * score / qw
- print "docid %7d score %6d scaled %5.2f%%" % (docid, score, scaled)
- if VERBOSE:
- msg = docs[docid]
- ctx = msg.text.split("\n", CONTEXT)
- del ctx[-1]
- print "-" * 60
- print "message:"
- for l in ctx:
- print l
- print "-" * 60
-
-
-def main(fs_path, mbox_path, query_str, profiler):
- f = ZODB.FileStorage.FileStorage(fs_path)
- db = ZODB.DB(f, cache_size=CACHE_SIZE)
- cn = db.open()
- rt = cn.root()
-
- if mbox_path is not None:
- index(rt, mbox_path, db, profiler)
- if query_str is not None:
- query(rt, query_str, profiler)
-
- cn.close()
- db.close()
- f.close()
-
-if __name__ == "__main__":
- import getopt
-
- NUM = 0
- VERBOSE = 0
- PACK_INTERVAL = 500
- EXCLUDE_TEXT = 0
- CACHE_SIZE = 10000
- TXN_SIZE = 1
- BEST = 10
- CONTEXT = 5
- WARM_CACHE = 0
- query_str = None
- mbox_path = None
- profile = None
- old_profile = None
- try:
- opts, args = getopt.getopt(sys.argv[1:], 'vn:p:i:q:b:c:xt:w',
- ['profile=', 'old-profile='])
- except getopt.error, msg:
- usage(msg)
- if len(args) != 1:
- usage("exactly 1 filename argument required")
- for o, v in opts:
- if o == '-n':
- NUM = int(v)
- elif o == '-v':
- VERBOSE += 1
- elif o == '-p':
- PACK_INTERVAL = int(v)
- elif o == '-q':
- query_str = v
- elif o == '-i':
- mbox_path = v
- elif o == '-b':
- BEST = int(v)
- elif o == '-x':
- EXCLUDE_TEXT = 1
- elif o == '-t':
- TXN_SIZE = int(v)
- elif o == '-c':
- CONTEXT = int(v)
- elif o == '-w':
- WARM_CACHE = 1
- elif o == '--profile':
- profile = v
- elif o == '--old-profile':
- old_profile = v
- fs_path, = args
-
- if profile:
- import hotshot
- profiler = hotshot.Profile(profile, lineevents=1, linetimings=1)
- elif old_profile:
- import profile
- profiler = profile.Profile()
- else:
- profiler = None
-
- main(fs_path, mbox_path, query_str, profiler)
-
- if profile:
- profiler.close()
- elif old_profile:
- import pstats
- profiler.dump_stats(old_profile)
- stats = pstats.Stats(old_profile)
- stats.strip_dirs().sort_stats('time').print_stats(20)
More information about the Zope3-Checkins
mailing list