[Zope3-checkins] SVN: Zope3/trunk/src/zope/index/text/tests/mailtest.py This utility was never properly ported and could have not worked. I

Stephan Richter srichter at cosmos.phy.tufts.edu
Tue Jul 13 12:33:41 EDT 2004


Log message for revision 26456:
  This utility was never properly ported and could not have worked. I
  am removing it for now, until someone really needs it and therefore fixes it.
  


Changed:
  D   Zope3/trunk/src/zope/index/text/tests/mailtest.py


-=-
Deleted: Zope3/trunk/src/zope/index/text/tests/mailtest.py
===================================================================
--- Zope3/trunk/src/zope/index/text/tests/mailtest.py	2004-07-13 16:32:34 UTC (rev 26455)
+++ Zope3/trunk/src/zope/index/text/tests/mailtest.py	2004-07-13 16:33:41 UTC (rev 26456)
@@ -1,301 +0,0 @@
-##############################################################################
-#
-# Copyright (c) 2003 Zope Corporation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-"""Test an index with a Unix mailbox file.
-
-usage: python mailtest.py [options] <data.fs>
-
-options:
-    -v     -- verbose
-
-    Index Generation
-    -i mailbox
-    -n NNN -- max number of messages to read from mailbox
-    -t NNN -- commit a transaction every NNN messages (default: 1)
-    -p NNN -- pack <data.fs> every NNN messages (default: 500), and at end
-    -p 0   -- don't pack at all
-    -x     -- exclude the message text from the data.fs
-
-    Queries
-    -q query
-    -b NNN -- return the NNN best matches (default: 10)
-    -c NNN -- context; if -v, show the first NNN lines of results (default: 5)
-
-The script either indexes or queries depending on whether -q or -i is
-passed as an option.
-
-For -i mailbox, the script reads mail messages from the mailbox and
-indexes them.  It indexes one message at a time, then commits the
-transaction.
-
-For -q query, it performs a query on an existing index.
-
-If both are specified, the index is performed first.
-
-You can also interact with the index after it is completed. Load the
-index from the database:
-
-    import ZODB
-    from ZODB.Storage.FileStorage import FileStorage
-    fs = FileStorage(<data.fs>)
-    db = ZODB.DB(fs)
-    index = cn.open().root()["index"]
-    index.search("python AND unicode")
-"""
-
-from zope.index.text.lexicon import \
-     Lexicon, CaseNormalizer, Splitter, StopWordRemover
-
-# XXX This import is bad, and was so before the renaming
-from zope.index.text.zctextindex import ZCTextIndex
-
-from BTrees.IOBTree import IOBTree
-from zope.index.text.queryparser import QueryParser
-
-import sys
-import mailbox
-import time
-
-def usage(msg):
-    print msg
-    print __doc__
-    sys.exit(2)
-
-class Message:
-
-    total_bytes = 0
-
-    def __init__(self, msg):
-        subject = msg.getheader('subject', '')
-        author = msg.getheader('from', '')
-        if author:
-            summary = "%s (%s)\n" % (subject, author)
-        else:
-            summary = "%s\n" % subject
-        self.text = summary + msg.fp.read()
-        Message.total_bytes += len(self.text)
-
-class Extra:
-    pass
-
-def index(rt, mboxfile, db, profiler):
-    global NUM
-    idx_time = 0
-    pack_time = 0
-    start_time = time.time()
-
-    lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
-    extra = Extra()
-    extra.lexicon_id = 'lexicon'
-    extra.doc_attr = 'text'
-    extra.index_type = 'Okapi BM25 Rank'
-    caller = Extra()
-    caller.lexicon = lexicon
-    rt["index"] = idx = ZCTextIndex("index", extra, caller)
-    if not EXCLUDE_TEXT:
-        rt["documents"] = docs = IOBTree()
-    else:
-        docs = None
-    get_transaction().commit()
-
-    mbox = mailbox.UnixMailbox(open(mboxfile, 'rb'))
-    if VERBOSE:
-        print "opened", mboxfile
-    if not NUM:
-        NUM = sys.maxint
-
-    if profiler:
-        itime, ptime, i = profiler.runcall(indexmbox, mbox, idx, docs, db)
-    else:
-        itime, ptime, i = indexmbox(mbox, idx, docs, db)
-    idx_time += itime
-    pack_time += ptime
-
-    get_transaction().commit()
-
-    if PACK_INTERVAL and i % PACK_INTERVAL != 0:
-        if VERBOSE >= 2:
-            print "packing one last time..."
-        p0 = time.clock()
-        db.pack(time.time())
-        p1 = time.clock()
-        if VERBOSE:
-            print "pack took %s sec" % (p1 - p0)
-        pack_time += p1 - p0
-
-    if VERBOSE:
-        finish_time = time.time()
-        print
-        print "Index time", round(idx_time / 60, 3), "minutes"
-        print "Pack time", round(pack_time / 60, 3), "minutes"
-        print "Index bytes", Message.total_bytes
-        rate = (Message.total_bytes / idx_time) / 1024
-        print "Index rate %.2f KB/sec" % rate
-        print "Indexing began", time.ctime(start_time)
-        print "Indexing ended", time.ctime(finish_time)
-        print "Wall clock minutes", round((finish_time - start_time)/60, 3)
-
-def indexmbox(mbox, idx, docs, db):
-    idx_time = 0
-    pack_time = 0
-    i = 0
-    while i < NUM:
-        _msg = mbox.next()
-        if _msg is None:
-            break
-        i += 1
-        msg = Message(_msg)
-        if VERBOSE >= 2:
-            print "indexing msg", i
-        i0 = time.clock()
-        idx.index_object(i, msg)
-        if not EXCLUDE_TEXT:
-            docs[i] = msg
-        if i % TXN_SIZE == 0:
-            get_transaction().commit()
-        i1 = time.clock()
-        idx_time += i1 - i0
-        if VERBOSE and i % 50 == 0:
-            print i, "messages indexed"
-            print "cache size", db.cacheSize()
-        if PACK_INTERVAL and i % PACK_INTERVAL == 0:
-            if VERBOSE >= 2:
-                print "packing..."
-            p0 = time.clock()
-            db.pack(time.time())
-            p1 = time.clock()
-            if VERBOSE:
-                print "pack took %s sec" % (p1 - p0)
-            pack_time += p1 - p0
-    return idx_time, pack_time, i
-
-
-def query(rt, query_str, profiler):
-    idx = rt["index"]
-    docs = rt["documents"]
-
-    start = time.clock()
-    if profiler is None:
-        results, num_results = idx.query(query_str, BEST)
-    else:
-        if WARM_CACHE:
-            print "Warming the cache..."
-            idx.query(query_str, BEST)
-        start = time.clock()
-        results, num_results = profiler.runcall(idx.query, query_str, BEST)
-    elapsed = time.clock() - start
-
-    print "query:", query_str
-    print "# results:", len(results), "of", num_results, \
-          "in %.2f ms" % (elapsed * 1000)
-
-    tree = QueryParser(idx.lexicon).parseQuery(query_str)
-    qw = idx.index.query_weight(tree.terms())
-
-    for docid, score in results:
-        scaled = 100.0 * score / qw
-        print "docid %7d score %6d scaled %5.2f%%" % (docid, score, scaled)
-        if VERBOSE:
-            msg = docs[docid]
-            ctx = msg.text.split("\n", CONTEXT)
-            del ctx[-1]
-            print "-" * 60
-            print "message:"
-            for l in ctx:
-                print l
-            print "-" * 60
-
-
-def main(fs_path, mbox_path, query_str, profiler):
-    f = ZODB.FileStorage.FileStorage(fs_path)
-    db = ZODB.DB(f, cache_size=CACHE_SIZE)
-    cn = db.open()
-    rt = cn.root()
-
-    if mbox_path is not None:
-        index(rt, mbox_path, db, profiler)
-    if query_str is not None:
-        query(rt, query_str, profiler)
-
-    cn.close()
-    db.close()
-    f.close()
-
-if __name__ == "__main__":
-    import getopt
-
-    NUM = 0
-    VERBOSE = 0
-    PACK_INTERVAL = 500
-    EXCLUDE_TEXT = 0
-    CACHE_SIZE = 10000
-    TXN_SIZE = 1
-    BEST = 10
-    CONTEXT = 5
-    WARM_CACHE = 0
-    query_str = None
-    mbox_path = None
-    profile = None
-    old_profile = None
-    try:
-        opts, args = getopt.getopt(sys.argv[1:], 'vn:p:i:q:b:c:xt:w',
-                                   ['profile=', 'old-profile='])
-    except getopt.error, msg:
-        usage(msg)
-    if len(args) != 1:
-        usage("exactly 1 filename argument required")
-    for o, v in opts:
-        if o == '-n':
-            NUM = int(v)
-        elif o == '-v':
-            VERBOSE += 1
-        elif o == '-p':
-            PACK_INTERVAL = int(v)
-        elif o == '-q':
-            query_str = v
-        elif o == '-i':
-            mbox_path = v
-        elif o == '-b':
-            BEST = int(v)
-        elif o == '-x':
-            EXCLUDE_TEXT = 1
-        elif o == '-t':
-            TXN_SIZE = int(v)
-        elif o == '-c':
-            CONTEXT = int(v)
-        elif o == '-w':
-            WARM_CACHE = 1
-        elif o == '--profile':
-            profile = v
-        elif o == '--old-profile':
-            old_profile = v
-    fs_path, = args
-
-    if profile:
-        import hotshot
-        profiler = hotshot.Profile(profile, lineevents=1, linetimings=1)
-    elif old_profile:
-        import profile
-        profiler = profile.Profile()
-    else:
-        profiler = None
-
-    main(fs_path, mbox_path, query_str, profiler)
-
-    if profile:
-        profiler.close()
-    elif old_profile:
-        import pstats
-        profiler.dump_stats(old_profile)
-        stats = pstats.Stats(old_profile)
-        stats.strip_dirs().sort_stats('time').print_stats(20)



More information about the Zope3-Checkins mailing list