[Zope-CVS] CVS: Products/ZCTextIndex/tests - indexhtml.py:1.1.2.1
Jeremy Hylton
jeremy@zope.com
Thu, 9 May 2002 19:19:02 -0400
Update of /cvs-repository/Products/ZCTextIndex/tests
In directory cvs.zope.org:/tmp/cvs-serv1524/tests
Added Files:
Tag: TextIndexDS9-branch
indexhtml.py
Log Message:
First cut at a driver script to index a collection of html pages.
=== Added File Products/ZCTextIndex/tests/indexhtml.py ===
#! /usr/bin/env python
"""Index a collection of HTML files on the filesystem.
usage: indexhtml.py [options] dir
Will create an index of all files in dir or its subdirectories.
options:
-f data.fs -- the path to the filestorage datafile (default: Data.fs)
-v -- accepted and counted, but does not change the output yet
"""
import os
import ZODB
from ZODB.FileStorage import FileStorage
from BTrees.IOBTree import IOBTree
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex, StopWordRemover
from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
from Products.ZCTextIndex.Lexicon import Lexicon
def make_index():
    """Build and return a new ZCTextIndex backed by an HTML-aware lexicon.

    ZCTextIndex is constructed from two plain attribute holders rather
    than keyword arguments, so this function assembles both of them:

    * a settings object carrying ``doc_attr`` and ``lexicon_id``
    * a container object exposing the lexicon as its ``lexicon`` attribute
    """
    class Namespace:
        # bare attribute bag; ZCTextIndex is handed instances of this
        pass

    # Word pipeline: split HTML into words, then drop stop words.
    lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())

    container = Namespace()
    container.lexicon = lexicon

    settings = Namespace()
    # presumably the name under which the lexicon is looked up on the
    # container (it matches the attribute set above) -- TODO confirm
    settings.lexicon_id = "lexicon"
    # presumably the attribute/method the index uses to obtain a
    # document's text; main() feeds it open file objects -- TODO confirm
    settings.doc_attr = "read"

    return ZCTextIndex(settings, container)
def main(root, dir):
    """Index every ``.html`` file found under ``dir``.

    A fresh text index is stored in ``root["index"]`` and a docid -> path
    mapping in ``root["files"]``.  The tree below ``dir`` is scanned
    breadth-first; each file whose name ends in ``.html`` is printed,
    given the next integer docid (starting at 1) and fed to the index as
    an open binary file.

    Arguments:
    root -- mapping that receives the "index" and "files" entries (the
            script passes the ZODB connection's root object)
    dir  -- directory to scan; the recorded paths are built by joining
            onto this value, so they can be opened from the same working
            directory the script was started in

    One transaction is committed after the empty structures are stored and
    another after all files have been indexed.
    """
    # Store into the mapping we were given, not into a module global that
    # only exists when this file is run as a script.
    root["index"] = index = make_index()
    root["files"] = paths = IOBTree()
    get_transaction().commit()

    # os.listdir() returns bare names, so every entry must be joined with
    # its parent directory before it can be tested or opened.
    pending = [os.path.join(dir, name) for name in os.listdir(dir)]
    docid = 0
    pos = 0
    # Explicit cursor over a growing work list: subdirectory contents are
    # appended at the end, without mutating a list that a for-loop is
    # iterating over.
    while pos < len(pending):
        path = pending[pos]
        pos += 1
        if os.path.isdir(path):
            pending.extend([os.path.join(path, sub)
                            for sub in os.listdir(path)])
            continue
        if not path.endswith(".html"):
            continue
        print(path)
        docid += 1
        f = open(path, "rb")
        try:
            paths[docid] = path
            index.index_object(docid, f)
        finally:
            # release the handle even if indexing this document fails
            f.close()
    get_transaction().commit()
if __name__ == "__main__":
    # Command-line driver: parse options, open the FileStorage-backed
    # database and index the directory given as the single argument.
    import sys
    import getopt

    VERBOSE = 0          # incremented once per -v on the command line
    FSPATH = "Data.fs"   # storage file used unless -f overrides it

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'vf:')
    except getopt.error:
        # Fetch the exception through sys.exc_info() so this clause does
        # not depend on version-specific "except X, name" syntax.
        msg = sys.exc_info()[1]
        print(msg)
        print(__doc__)
        sys.exit(2)

    for o, v in opts:
        if o == '-v':
            VERBOSE += 1
        elif o == '-f':
            FSPATH = v

    if len(args) != 1:
        print("Expected one argument")
        print(__doc__)
        sys.exit(2)
    dir = args[0]

    fs = FileStorage(FSPATH)
    db = ZODB.DB(fs)
    cn = db.open()
    rt = cn.root()
    try:
        main(rt, dir)
    finally:
        # close the connection even if indexing raised
        cn.close()