[Zope-CVS] CVS: Products/ZCTextIndex/tests - indexhtml.py:1.1.2.1
   
    Jeremy Hylton
     
    jeremy@zope.com
       
    Thu, 9 May 2002 19:19:02 -0400
    
    
  
Update of /cvs-repository/Products/ZCTextIndex/tests
In directory cvs.zope.org:/tmp/cvs-serv1524/tests
Added Files:
      Tag: TextIndexDS9-branch
	indexhtml.py 
Log Message:
First cut at a driver script to index a collection of html pages.
=== Added File Products/ZCTextIndex/tests/indexhtml.py ===
#! /usr/bin/env python
"""Index a collection of HTML files on the filesystem.
usage: indexhtml.py [options] dir
Will create an index of all files in dir or its subdirectories.
options:
-f data.fs  -- the path to the filestorage datafile
"""
import os
import ZODB
from ZODB.FileStorage import FileStorage
from BTrees.IOBTree import IOBTree
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex, StopWordRemover
from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
from Products.ZCTextIndex.Lexicon import Lexicon
def make_index():
    """Create and return a fresh ZCTextIndex for HTML documents.

    ZCTextIndex is constructed from two attribute-carrying objects rather
    than from plain arguments, so two throwaway instances are built here:

    * the "extra" object, carrying ``doc_attr`` ("read") and
      ``lexicon_id`` ("lexicon");
    * the "caller" object, carrying a ``lexicon`` attribute holding a
      Lexicon built from an HTMLWordSplitter and a StopWordRemover.

    NOTE(review): presumably ``lexicon_id`` names the attribute looked up
    on the caller object, and ``doc_attr`` names the attribute/method read
    from each indexed document -- confirm against ZCTextIndex.
    """
    # Minimal attribute bag; instances exist only to hold named values.
    class Struct:
        pass

    # Object that owns the lexicon.
    lexicon_owner = Struct()
    lexicon_owner.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())

    # Index configuration.
    settings = Struct()
    settings.lexicon_id = "lexicon"
    settings.doc_attr = "read"

    return ZCTextIndex(settings, lexicon_owner)
def main(root, dir):
    """Index every ``.html`` file under *dir* and store the result in *root*.

    Parameters:
        root -- mapping-like database root; receives two entries:
                ``"index"`` (the text index from ``make_index()``) and
                ``"files"`` (an IOBTree mapping document id -> file path).
        dir  -- directory to scan; sub-directories are scanned as well.

    Document ids are assigned sequentially starting at 1, in the order the
    files are visited (breadth-first).  Stored paths are joined with *dir*,
    so they remain valid regardless of the current working directory.

    The transaction is committed once after the empty containers are
    installed and once after all files have been indexed.
    """
    # Use the ``root`` argument, not a module-level global, so the function
    # also works when imported rather than run as a script.
    root["index"] = index = make_index()
    root["files"] = paths = IOBTree()
    get_transaction().commit()

    # Queue of paths still to visit.  os.listdir() returns bare names, so
    # each one is joined with its parent directory before use.
    pending = [os.path.join(dir, name) for name in os.listdir(dir)]
    position = 0
    docid = 0
    while position < len(pending):
        path = pending[position]
        position += 1

        if os.path.isdir(path):
            # Queue the directory's children; they are visited later.
            pending.extend([os.path.join(path, sub)
                            for sub in os.listdir(path)])
            continue
        if not path.endswith(".html"):
            continue

        print(path)
        docid += 1
        f = open(path, "rb")
        try:
            paths[docid] = path
            # The open file object itself is handed to the index.
            index.index_object(docid, f)
        finally:
            # Release the handle even if indexing this document fails.
            f.close()
    get_transaction().commit()
# Script entry point: parse the command line, open the FileStorage-backed
# database and index the directory given as the single positional argument.
#
# Options:
#   -v        may be repeated; each occurrence increments VERBOSE
#   -f path   path of the FileStorage data file (default "Data.fs")
#
# Exits with status 2 after printing the usage text when the options are
# invalid or the number of positional arguments is not exactly one.
if __name__ == "__main__":
    import sys
    import getopt

    VERBOSE = 0
    FSPATH = "Data.fs"

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'vf:')
    except getopt.error:
        # sys.exc_info() fetches the exception without relying on either
        # the old ("except E, e") or the new ("except E as e") syntax.
        print(sys.exc_info()[1])
        print(__doc__)
        sys.exit(2)

    for o, v in opts:
        if o == '-v':
            VERBOSE += 1
        if o == '-f':
            FSPATH = v

    if len(args) != 1:
        print("Expected one argument")
        print(__doc__)
        sys.exit(2)
    dir = args[0]

    fs = FileStorage(FSPATH)
    db = ZODB.DB(fs)
    try:
        cn = db.open()
        rt = cn.root()
        main(rt, dir)
        cn.close()
    finally:
        # Always close the database so the data file is released even
        # when indexing fails part-way through.
        db.close()