[ZODB-Dev] ZODB memory problems (was: processing a Very Large file)

DJTB usenet at terabytemusic.cjb.net
Sat May 21 07:00:09 EDT 2005


[posted to comp.lang.python, mailed to zodb-dev at zope.org]

Hi,

I'm having problems storing large amounts of objects in a ZODB.
After committing changes to the database, elements are not cleared from
memory. Since the number of objects I'd like to store in the ZODB is too
large to fit in RAM, my program gets killed with signal 11 or signal 9...

Below is a minimal example (minimal, though it doesn't actually work
because of memory errors) with hopefully enough comments:



# This was suggested by Tim Peters in the comp.lang.python thread
# 'processing a Very Large file'.
# It is to make sure that no two copies of the same object
# reside in memory.
class ObjectInterning:
    def __init__(self):
        self.object_table = {}

    def object_intern(self,o):
        return self.object_table.setdefault(o, o)
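For readers unfamiliar with the trick: `dict.setdefault(o, o)` stores the object under itself on first sight, and every later lookup with an equal object returns the originally stored instance, so duplicates collapse into one shared copy. A minimal sketch in modern Python (the listing above uses pre-2.4 idioms):

```python
# Sketch of object interning via dict.setdefault: equal objects
# collapse to a single shared instance, saving memory.
class ObjectInterning:
    def __init__(self):
        self.object_table = {}

    def object_intern(self, o):
        # First call stores o under itself; later calls with an
        # equal object return the originally stored instance.
        return self.object_table.setdefault(o, o)

interning = ObjectInterning()
a = interning.object_intern((1, 2, 3))
b = interning.object_intern((1, 2, 3))  # equal but separately built tuple
print(a is b)  # True: both names refer to the first-stored tuple
```

After interning, identity comparison (`is`) can replace equality comparison, and only one copy of each distinct value stays alive.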


from sets import Set

# An ExtendedTuple is a tuple with some extra information
# (hence: 'Extended'). Furthermore, the elements of the tuple are
# unique.
# As you can see, ExtendedTuple does not inherit from Persistent.
# It will not be stored in the root of a database directly; it will
# be stored in a Persistent ExtendedTupleTable (see below).
class ExtendedTuple(tuple):

    def __init__(self, els):
        tuple.__init__(self,els)

        # This is a set containing other ExtendedTuple objects
        # which conflicts with self
        # e.g. if self = ExtendedTuple([1,2,3,4]) and
        # other = ExtendedTuple([3,4,5]) then self conflicts with
        # other, because they share one or more elements (in this
        # case: 3 and 4)
        # So, self.conflicts = Set([ExtendedTuple([3,4,5])])
        #    other.conflicts = Set([ExtendedTuple([1,2,3,4])])
        self.conflicts = Set()

    def __hash__(self):
        return hash(tuple(self))

    def __repr__(self):
        return 'ExtendedTuple(%s)' % str(list(self))
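To illustrate the conflict relation described in the comment above: two ExtendedTuples conflict exactly when they share at least one element, which boils down to a non-empty set intersection. A small sketch in modern Python (the `conflicts_with` helper is mine, not part of the original class, and the built-in `set` stands in for the old `sets.Set`):

```python
# Sketch of the 'conflict' relation between ExtendedTuples:
# two tuples conflict iff they share at least one element.
class ExtendedTuple(tuple):
    def __init__(self, els):
        super().__init__()
        self.conflicts = set()  # filled in externally, as in the post

    def __hash__(self):
        return hash(tuple(self))

    def conflicts_with(self, other):
        # Non-empty intersection means a shared element.
        return bool(set(self) & set(other))

a = ExtendedTuple([1, 2, 3, 4])
b = ExtendedTuple([3, 4, 5])
c = ExtendedTuple([7, 8])
print(a.conflicts_with(b))  # True: they share 3 and 4
print(a.conflicts_with(c))  # False: no shared elements
```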


import ZODB
from persistent import Persistent
import random

# The Persistent ExtendedTupleTable generates and stores a large
# amount of ExtendedTuple objects. Since ExtendedTuple contains a
# Set with other ExtendedTuple objects, each ExtendedTuple object
# may get very large.
class ExtendedTupleTable(Persistent):
    def __init__(self):
        self.interning = ObjectInterning()

        # This Set stores all generated ExtendedTuple objects.
        self.ets = Set() # et(s): ExtendedTuple object(s)
        # This dictionary stores a mapping of elements to Sets of
        # ExtendedTuples.
        # eg: self.el2ets[3] = Set([(1,2,3), (3,4,5), (1,3,9)])
        #     self.el2ets[4] = Set([(3,4,5), (2,4,9)])
        self.el2ets = {}  # el: element of an ExtendedTuple object

        # These dictionaries are performance optimizations.
        # They are used to prevent billions of hash()
        # calculations (relatively slow compared to dictionary
        # lookups).
        self._v_el2hs = {} # h(s): hash(es) of ExtendedTuple object(s)
        self._v_h2et = {}
        self._v_et2h = {}

        # The keys of el2ets (and thus the elements of the
        # ExtendedTuple objects) are all in a prespecified range.
        # In this example: range(200):
        self.__el_count = 200
        # Number of ExtendedTuple objects in this ExtendedTupleTable
        self.__et_count = 5000

    # Start generation of ExtendedTuple objects and calculation of
    # conflicts for each ExtendedTuple object
    def calculate_all(self):
        self.calc_ets()
        self.calc_el2ets()
        self.calc_conflicts()



    def add(self, et_uninterned):
        et = self.interning.object_intern(et_uninterned)
        h = self.interning.object_intern(hash(et))
        self.ets.add(et)
        self._v_h2et[h] = et
        self._v_et2h[et] = h
        self._p_changed = True

    def calc_ets(self):
        # Calculate a large amount of ExtendedTuple objects.
        # In this example, the tuples are random, the elements of
        # the tuples are within a prespecified range.
        # The elements of each tuple are unique.
        print 'generating %s random ExtendedTuple objects' % self.__et_count
        for i in xrange(self.__et_count):
            # Create random tuple with unique elements
            l = []
            for el in xrange(self.__el_count/3):
                l.append(random.randint(0,self.__el_count-1))
            et = ExtendedTuple(tuple(Set(l)))
            self.add(et)
        self.__et_count = len(self.ets)

    def calc_el2ets(self):
        '''For each el, calculate which et uses that el'''
        for el in xrange(self.__el_count):
            print 'calculating all ExtendedTuple objects using', el
            self.el2ets[el] = Set([ et for et in self.ets if el in et])
            self._v_el2hs[el] = Set([ self._v_et2h[et] for et in self.el2ets[el] ])
            self._p_changed = True

    def calc_conflicts(self):
        '''For each et, calculate the set of conflicting ets'''
        self.__et_count = len(self.ets)
        commit_interval = 100
        for i, et in enumerate(self.ets):
            print 'calculating conflicting ExtendedTuple %.2f%%' % ((i+1)*100./self.__et_count)
            # use the el2et dictionary (faster than 'Cartesian'
            # comparison of each ExtendedTuple objects) and an
            # optimization dictionary _v_el2hs to prevent billions
            # of hash() calculations later on
            conflicts_h = [ h for el in et for h in self._v_el2hs[el] ]
            # Make sure each element is unique and store the
            # result as conflicts Set in the current ExtendedTuple
            # object
            # Subtract et's own hash (not et itself: conflicts_h holds
            # hashes), since an ExtendedTuple never conflicts with itself
            conflicts_unique = Set(conflicts_h) - Set([ self._v_et2h[et] ])
            et.conflicts = Set([ self._v_h2et[h] for h in conflicts_unique ])
            self._p_changed = True
            if i % commit_interval == 0:
                print 'committing data to database...'
                # This does NOT seem to work, the memory usage will
                # increase until all memory + swap is used. Then the
                # process gets killed...
                get_transaction().commit(True)
        get_transaction().commit(True)
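The inverted-index idea behind calc_el2ets/calc_conflicts is worth spelling out: instead of comparing every pair of tuples (quadratic in the number of tuples), build the el -> tuples mapping once, and the conflicts of a tuple are then just the union of those sets over its elements, minus the tuple itself. A stand-alone sketch of that algorithm in modern Python (plain functions, not the Persistent class above):

```python
# Inverted-index conflict calculation: build el -> set-of-tuples once,
# then conflicts of t = union of el2ets[el] over t's elements, minus t.
def build_el2ets(tuples):
    el2ets = {}
    for t in tuples:
        for el in t:
            el2ets.setdefault(el, set()).add(t)
    return el2ets

def conflicts_of(t, el2ets):
    hits = set()
    for el in t:
        hits |= el2ets[el]
    hits.discard(t)  # a tuple does not conflict with itself
    return hits

ets = [(1, 2, 3, 4), (3, 4, 5), (2, 4, 9), (7, 8)]
el2ets = build_el2ets(ets)
print(sorted(conflicts_of((3, 4, 5), el2ets)))
# -> [(1, 2, 3, 4), (2, 4, 9)]: each shares an element with (3, 4, 5)
```

This touches only the tuples that can possibly conflict with t, which is what makes it faster than the 'Cartesian' comparison mentioned in the comments.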




from ZODB import FileStorage, DB

# Open Database
storage = FileStorage.FileStorage('/tmp/test_extendedtuples.fs')
db = DB(storage)
conn = db.open()
root = conn.root()

name = 'test'

if root.has_key(name):
    del root[name]

root[name] = ExtendedTupleTable()

rt = root[name]
rt.calculate_all()

# if needed, commit final changes and close the database
get_transaction().commit()
conn.close()
db.pack()
db.close()
storage.close()



What should I do to make sure RAM is no longer a limiting factor?
(In other words: the program should work with any (large) values of
self.__el_count and self.__et_count,
because in my case self.__et_count = 5000 is only a toy example...)
I'm now working on a PC with 2.5 GB RAM and even that's not enough!

If you think the design of this program is bad, please let me know what you
would do to solve this problem
(calculating and saving tuples + sets of conflicts). The only thing I can't
change is that ExtendedTuple inherits from tuple.

Thanks in advance,
Stan.






