[Zope-CVS] CVS: Products/ZCTextIndex - Index.py:1.1.2.1

Fred L. Drake, Jr. fdrake@acm.org
Tue, 30 Apr 2002 16:19:44 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv16959

Added Files:
      Tag: TextIndexDS9-branch
	Index.py 
Log Message:
First portion of new index.

=== Added File Products/ZCTextIndex/Index.py ===
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
# 
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
# 
##############################################################################
"""Text Index.

Plugin text index for ZCatalog, with relevance ranking.

Revision information:
$Id: Index.py,v 1.1.2.1 2002/04/30 20:19:43 fdrake Exp $
"""

import math

from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IISet


class Index:
    """Inverted index over the text of one field of the indexed objects.

    For every word id produced by the lexicon the index records which
    documents contain the word and with what frequency; for every document
    it records a weight and the set of word ids it contains so that the
    document can be removed again.
    """

    def __init__(self, lexicon, fieldname):
        """Create an empty index.

        lexicon -- object whose textToWordIDs(text) method returns the
                   sequence of word ids for a piece of text.
        fieldname -- name of the attribute (or zero-argument callable
                     attribute) of each indexed object that supplies its text.
        """
        self._lexicon = lexicon
        self._fieldname = fieldname

        # wid -> ( doc-frequency, { docid -> frequency } )
        # doc-frequency is the number of documents in the inner mapping.
        self._wordinfo = IOBTree()

        # docid -> W  (integer document weight, see _get_frequencies)
        self._docweight = IIBTree()

        # docid -> [ wid ]
        # used for un-indexing
        self._docwords = IOBTree()

    def index_object(self, docid, obj, threshold=None):
        """Index the text of obj under docid.

        If docid is already indexed, its previous entries are removed first
        so that re-indexing replaces the old postings instead of inflating
        document frequencies and leaving stale words behind.

        threshold is accepted but not used by this implementation.
        """
        # Extract and count the words before touching the index, so a failure
        # while reading the object leaves any existing entry intact.
        wids = self._lexicon.textToWordIDs(self._get_object_text(obj))
        freqs, docweight = self._get_frequencies(wids)

        if docid in self._docwords:
            self.unindex_object(docid)

        uniqwids = []
        for wid, f in freqs:
            self._add_wordinfo(wid, f, docid)
            uniqwids.append(wid)
        self._docweight[docid] = docweight
        self._docwords[docid] = IISet(uniqwids)

    def unindex_object(self, docid):
        """Remove every trace of docid from the index.

        Raises KeyError if docid is not currently indexed.
        """
        wids = self._docwords[docid]
        for wid in wids:
            self._del_wordinfo(wid, docid)
        del self._docwords[docid]
        del self._docweight[docid]

    def _get_object_text(self, obj):
        """Return the indexed field of obj, calling it if it is callable."""
        x = getattr(obj, self._fieldname)
        if callable(x):
            return x()
        else:
            return x

    def _get_frequencies(self, wids):
        """Return ([(wid, frequency), ...], docweight) for a word id sequence.

        Each distinct wid appears once in the list, with frequency() applied
        to its number of occurrences.  docweight is the square root of the
        sum of the squared frequencies, truncated to an integer.
        """
        counts = {}
        for wid in wids:
            counts[wid] = counts.get(wid, 0) + 1
        sum_of_squares = 0
        freqs = []
        for wid, count in counts.items():
            f = frequency(count)
            sum_of_squares += f ** 2
            freqs.append((wid, f))
        # The weight is stored in self._docweight, an integer-valued IIBTree,
        # hence the truncation.
        return freqs, int(math.sqrt(sum_of_squares))

    def _add_wordinfo(self, wid, f, docid):
        """Record that docid contains wid with frequency f.

        The document frequency of wid is only incremented when docid is new
        to the word's posting map, so it always equals the number of
        documents in that map.
        """
        try:
            docfreq, docmap = self._wordinfo[wid]
        except KeyError:
            docfreq = 0
            docmap = IIBTree()
        if docid not in docmap:
            docfreq += 1
        docmap[docid] = f
        # Store the pair back: the tuple itself is immutable, so the updated
        # count needs a fresh entry.
        self._wordinfo[wid] = docfreq, docmap

    def _del_wordinfo(self, wid, docid):
        """Remove docid from the postings of wid.

        When docid was the last document containing wid, the whole entry for
        wid is dropped.  Raises KeyError if wid is unknown, or if docid is
        not in the posting map of a word that has other documents.
        """
        docfreq, docmap = self._wordinfo[wid]
        if docfreq == 1:
            del self._wordinfo[wid]
            return
        del docmap[docid]
        self._wordinfo[wid] = docfreq - 1, docmap


def frequency(count):
    """Return the frequency value to record for a word seen count times.

    Currently this is the identity: the raw occurrence count is returned
    unchanged.  Index._get_frequencies applies it to the per-document
    occurrence count of each distinct word id.
    """
    return count