[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - TextIndexNG.py:1.2.2.51

Andreas Jung andreas@digicool.com
Mon, 11 Mar 2002 18:48:39 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv12730

Modified Files:
      Tag: ajung-textindexng-branch
	TextIndexNG.py 
Log Message:
refactored forward and reverse indexes into a separate module


=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexNG.py 1.2.2.50 => 1.2.2.51 ===
 from Products.PluginIndexes.common.util import parseIndexRequest
 from OFS.content_types import guess_content_type
-from BTrees.IOBTree import IOBTree
-from BTrees.OOBTree import OOBTree
-from BTrees.IIBTree import IIBTree, IIBucket, IISet
+from BTrees.IIBTree import IIBucket, IISet
 from BTrees.IIBTree import intersection as IntIntersection, union as IntUnion
 from BTrees.OOBTree import intersection as ObjIntersection, union as ObjUnion
 from BTrees.IIBTree import  weightedIntersection
@@ -44,6 +42,7 @@
 
 from queryparser.queryparser import QueryParser
 from DumbQueryParser import DumbQueryParser
+from WordIdDocumentIdStorage import WordIdDocumentIdStorage
 
 import Stemmer, Similarity
 import Thesaurus, StopWords, Normalizer
@@ -51,7 +50,6 @@
 import TextIndexCommon
 import ConverterRegistry
 
-
 class Timer:
 
     def __init__(self, verbose=1):
@@ -210,10 +208,9 @@
 
     def clear(self):
 
-        self.__OBJECTS   = IOBTree()  # mapping RID->object reference 
+        self.WordDocStorage           = WordIdDocumentIdStorage()
+        self.SimilarityWordDocStorage = WordIdDocumentIdStorage()
 
-        self._IDX        = IOBTree()
-        self._invIDX     = IOBTree()
         self._thesaurus  = None
         self._stopwords  = None
 
@@ -250,9 +247,7 @@
         # Similarity
 
         if self.useSimilarity:
-            self._PROX_LEX    = SimilarityLexicon(algorithm=self.useSimilarity)
-            self._PROX_IDX    = IOBTree()
-            self._invPROX_IDX = IOBTree()
+            self._SIMILARITY_LEXICON    = SimilarityLexicon(algorithm=self.useSimilarity)
 
             # the selection of the Similarity function must be more general
             # in the future. This requires some more work on the Python
@@ -302,75 +297,8 @@
         return Stemmer.availableStemmers()    
 
 
-    def insertBackwardEntries(self,widLst, documentId):
-        """ insert a list of wordIds for the given documentId
-            into the backward index 
-        """
-
-        idx = self._invIDX
-        
-        try:
-            idx[documentId].update(widLst)
-        except:
-            idx[documentId] = IISet(widLst)
-
-
-
-    def insertForwardEntries(self,wordIds,documentId):
-        """ insert entries for forward index. This function does not store
-            word positions. Word positions are calculated when document is in the 
-            hitlist.
-
-            wordId is either an integer or a list of integers 
-        """
-
-        # self._IDX is a mapping:
-        # wordId -> documentId 
-        
-        idx = self._IDX
-
-        if isinstance(wordIds,IntType): wordIds = [wordIds]
-
-        for wordId in wordIds:
-
-            try:
-                idx[wordId].insert(documentId)
-            except:
-                idx[wordId] = IISet()
-                idx[wordId].insert(documentId)
-
-
-    def insertSimilarityEntries(self,wordIds,documentId):
-        """ insert forward *and* backword entries for Similarity indexes """
-
-        idx = self._PROX_IDX
-        invidx = self._invPROX_IDX
-
-        if isinstance(wordIds,IntType): wordIds = [wordIds]
-
-        for wordId in wordIds:
-            
-            if idx.has_key(wordId) == 0:
-                idx[wordId] = IISet()
-
-            idx[wordId].insert(documentId)
-
-
-        if invidx.has_key(documentId)==0:
-            invidx[documentId] = IISet(wordIds)
-        else:
-            invidx[documentId].update(wordIds)
-        
-
-
     def _printIndex(self):
-
-        for wordId in self._IDX.keys():
-            print '-'*78
-            print wordId,self._v_getWordById(wordId),
-            print 
-            for k,v in self._IDX[wordId].items():
-                print k,v
+        print self.WordDocStorage
 
 
     def index_object(self, documentId, obj, threshold=None):
@@ -437,10 +365,10 @@
 
         if self.useSimilarity:
             Similarity_words =  self._v_Similarityfunc(words)
-            Similarity_widList = self._PROX_LEX.getWordIdList(Similarity_words)
+            Similarity_widList = self._SIMILARITY_LEXICON.getWordIdList(Similarity_words)
             assert len(Similarity_words)==len(Similarity_widList)
 
-            self.insertSimilarityEntries(Similarity_widList,documentId)
+            self.SimilarityWordDocStorage(Similarity_widList, documentId)
        
         T("Similarity")
 
@@ -470,12 +398,8 @@
         T("Widlist")
 
         # insert forward entries 
-        self.insertForwardEntries(widLst,documentId)  
-        T("ForwardEntries")
-
-        # insert backward entries
-        self.insertBackwardEntries(widLst,documentId)
-        T("BackwardEntries")
+        self.WordDocStorage.insert(widLst, documentId)
+        T("WordDocEntries")
 
         if self.verbose: T.printStats()
 
@@ -486,18 +410,9 @@
         """ carefully unindex document with Id 'documentId'
             index and do not fail if it does not exist 
         """
+        
+        self.WordDocStorage.removeDocument(documentId)
        
-        invIDX = self._invIDX
-        IDX    = self._IDX
-
-        for wid in invIDX[documentId]:
-            IDX[wid].remove( documentId ) 
-    
-            if len(IDX[wid])==0:
-                del IDX[wid]
-
-        del invIDX[documentID]
-            
 
     def getLexicon(self):
         return self._LEXICON
@@ -600,7 +515,7 @@
         docIds = IISet()
 
         for wid in wids:
-            docIds.update( self._IDX.get(wid) )
+            docIds.update( self.WordDocStorage.get(wid) )
 
         debug('\tDocIds: ', list(docIds.keys()))
 
@@ -620,7 +535,7 @@
             raise TextIndexNGException, 'Similarity search is not enabled'
 
         # Lookup list of wordIds (usually should contain only *one*)
-        wids = self._PROX_LEX.get(word)
+        wids = self._SIMILARITY_LEXICON.get(word)
         debug("\tWids: ", wids)
 
         # Retrieve list of docIds for that wordId 
@@ -628,10 +543,12 @@
         
         # docIds is an IOBTree and contains the mapping
         # (documentId, list of positions) for one word/wid
-        docIds = self._PROX_IDX.get(wids[0])
 
-        debug('\tDocIds: ', list(docIds.keys()))
-        debug('\tPositions: ', list(docIds.values()))
+        docIds = IISet()
+        for wid in wids:
+            docIds.update ( self._SIMILARITY_INDEX.get(wid) )
+
+        debug('\tDocIds: ', docIds)
 
         r = ResultSet( docIds, (word,))
 
@@ -821,7 +738,7 @@
 
     def numObjects(self):
         """ return number of index objects """
-        return len(self._IDX)
+        return len(self.WordDocStorage)
 
 
     def info(self):