[Zope-CVS] CVS: Products/ZCTextIndex/tests - indexhtml.py:1.1.2.1

Jeremy Hylton jeremy@zope.com
Thu, 9 May 2002 19:19:02 -0400


Update of /cvs-repository/Products/ZCTextIndex/tests
In directory cvs.zope.org:/tmp/cvs-serv1524/tests

Added Files:
      Tag: TextIndexDS9-branch
	indexhtml.py 
Log Message:
First cut at a driver script to index a collection of html pages.



=== Added File Products/ZCTextIndex/tests/indexhtml.py ===
#! /usr/bin/env python

"""Index a collection of HTML files on the filesystem.

usage: indexhtml.py [options] dir

Will create an index of all files in dir or its subdirectories.

options:
-f data.fs  -- the path to the filestorage datafile
"""

import os

import ZODB
from ZODB.FileStorage import FileStorage
from BTrees.IOBTree import IOBTree

from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex, StopWordRemover
from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
from Products.ZCTextIndex.Lexicon import Lexicon

def make_index():
    """Build and return a ZCTextIndex set up for HTML documents.

    ZCTextIndex is constructed from two plain attribute holders:
    one naming the document attribute and the lexicon id, and one
    that actually carries the lexicon object.
    """
    class Struct:
        """Empty class used only as a bag of attributes."""

    # The lexicon pipeline: split HTML into words, then drop stop words.
    splitter = HTMLWordSplitter()
    stop_words = StopWordRemover()
    lexicon_holder = Struct()
    lexicon_holder.lexicon = Lexicon(splitter, stop_words)

    # Settings read by the index: the attribute name to fetch text from
    # and the attribute name under which the lexicon is found.
    settings = Struct()
    settings.lexicon_id = "lexicon"
    settings.doc_attr = "read"

    return ZCTextIndex(settings, lexicon_holder)

def main(root, dir):
    """Index every ".html" file found under dir and record it in root.

    root -- object supporting item assignment (the ZODB root when run as
            a script); receives the new index under "index" and an
            IOBTree mapping docid -> file path under "files"
    dir  -- directory to scan; subdirectories are scanned as well

    Document ids are assigned sequentially starting at 1.  One commit is
    done after the two objects are stored and another after all files
    have been indexed.
    """
    # Store into the mapping we were given rather than a module global.
    root["index"] = index = make_index()
    root["files"] = paths = IOBTree()
    get_transaction().commit()

    # os.listdir() returns bare names, so every entry is joined with its
    # parent directory; otherwise isdir() and open() would resolve the
    # names against the current working directory instead of dir.
    pending = [os.path.join(dir, name) for name in os.listdir(dir)]
    docid = 0
    pos = 0
    # pending is a work queue that grows as directories are discovered;
    # walk it by position instead of mutating a list under a for loop.
    while pos < len(pending):
        path = pending[pos]
        pos += 1
        if os.path.isdir(path):
            pending += [os.path.join(path, sub) for sub in os.listdir(path)]
            continue
        if not path.endswith(".html"):
            continue
        print(path)
        docid += 1
        f = open(path, "rb")
        try:
            paths[docid] = path
            # The open file itself is handed to the index as the document.
            index.index_object(docid, f)
        finally:
            # Release the handle even if indexing this document fails.
            f.close()
    get_transaction().commit()

# Command-line entry point: parse the options, open the FileStorage-backed
# database and index the directory given as the single positional argument.
if __name__ == "__main__":
    import sys
    import getopt

    # -v is accepted and counted, but nothing in this script reads VERBOSE.
    VERBOSE = 0
    # Default storage file; overridden with -f.
    FSPATH = "Data.fs"
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'vf:')
    except getopt.error:
        # sys.exc_info() fetches the exception in a way that parses on
        # every Python version; the printed text is the same message.
        print(sys.exc_info()[1])
        print(__doc__)
        sys.exit(2)

    for o, v in opts:
        if o == '-v':
            VERBOSE += 1
        if o == '-f':
            FSPATH = v

    if len(args) != 1:
        print("Expected one argument")
        print(__doc__)
        sys.exit(2)
    dir = args[0]

    fs = FileStorage(FSPATH)
    db = ZODB.DB(fs)
    try:
        cn = db.open()
        try:
            rt = cn.root()
            main(rt, dir)
        finally:
            # Close the connection even if indexing fails part-way.
            cn.close()
    finally:
        # Closing the database also closes the underlying storage.
        db.close()