[Checkins] SVN: mongopersist/trunk/ - Feature: Massively improved performance on all levels. This was mainly
Stephen Richter
cvs-admin at zope.org
Mon Apr 2 06:34:44 UTC 2012
Log message for revision 124872:
- Feature: Massively improved performance on all levels. This was mainly
accomplished by removing unnecessary database accesses, better caching and
more efficient algorithms. This results in speedups between 4-25 times.
Changed:
U mongopersist/trunk/CHANGES.txt
U mongopersist/trunk/src/mongopersist/performance.py
U mongopersist/trunk/src/mongopersist/serialize.py
U mongopersist/trunk/src/mongopersist/testing.py
U mongopersist/trunk/src/mongopersist/tests/test_serialize.py
U mongopersist/trunk/src/mongopersist/zope/tests/test_container.py
-=-
Modified: mongopersist/trunk/CHANGES.txt
===================================================================
--- mongopersist/trunk/CHANGES.txt 2012-04-02 06:30:34 UTC (rev 124871)
+++ mongopersist/trunk/CHANGES.txt 2012-04-02 06:34:41 UTC (rev 124872)
@@ -50,16 +50,42 @@
- Feature: Added a little script to test performance. It is not very
sophisticated, but it is sufficient for a first round of optimizations.
-- Performance: Drastically improved performance for collections that store
- only one type of objects and where the documents do not store the type
- (i.e. it is stored in the name map collection).
+- Feature: Massively improved performance on all levels. This was mainly
+ accomplished by removing unnecessary database accesses, better caching and
+ more efficient algorithms. This results in speedups between 4-25 times.
-- Performance: The Zope Container fast load via find() did not work correctly,
- since setstate() did not change the state from ghost to active and thus the
- state was loaded again from MongoDB and set on the object. Now we use the
- new ``_latest_states`` cache to lookup a document when ``setstate()`` is
- called through the proper channels.
+ - When resolving the path to a klass, the result is now cached. More
+ importantly, lookup failures are also cached, mapping path ->
+ ``None``. This is important, since an optimization in the ``resolve()``
+ method causes a lot of failing lookups.
+ - When resolving the dbref to a type, we try to resolve the dbref early
+ using the document, if we know that the documents within the collection
+ store their type path. This avoids frequent queries of the name map
+ collection when it is not needed.
+
+ - When getting the object document to read the class path, it will now read
+ the entire document and store it in the ``_latest_states`` dictionary, so
+ that other code may pick it up and use it. This should avoid superfluous
+ reads from MongoDB.
+
+ - Drastically improved performance for collections that store only one type
+ of object and where the documents do not store the type (i.e. it is
+ stored in the name map collection).
+
+ - The Zope Container fast load via find() did not work correctly, since
+ setstate() did not change the state from ghost to active and thus the
+ state was loaded again from MongoDB and set on the object. Now we use the
+ new ``_latest_states`` cache to lookup a document when ``setstate()`` is
+ called through the proper channels. Now this "fast load" method truly
+ causes O(1) database lookups.
+
+ - Whenever the Mongo Object Id is used as a hash key, use the hash of the id
+ instead. The ``__cmp__()`` method of the ``ObjectId`` class is way too
+ slow.
+
+ - Cache collection name lookup from objects in the ``ObjectWriter`` class.
+
- Bug: We have seen several occasions in production where we suddenly lost
some state in some documents, which prohibited the objects from being
loadable again. The cause was that the ``_original_states`` attribute did not
Modified: mongopersist/trunk/src/mongopersist/performance.py
===================================================================
--- mongopersist/trunk/src/mongopersist/performance.py 2012-04-02 06:30:34 UTC (rev 124871)
+++ mongopersist/trunk/src/mongopersist/performance.py 2012-04-02 06:34:41 UTC (rev 124872)
@@ -25,6 +25,8 @@
from mongopersist import conflict, datamanager
from mongopersist.zope import container
+MULTIPLE_CLASSES = True
+
class People(container.AllItemsMongoContainer):
_p_mongo_collection = 'people'
_m_database = 'performance'
@@ -32,7 +34,7 @@
class Person(persistent.Persistent, container.MongoContained):
_p_mongo_collection = 'person'
- #_p_mongo_store_type = True
+ _p_mongo_store_type = True
def __init__(self, name, age):
self.name = name
@@ -42,6 +44,10 @@
return '<%s %s @ %i [%s]>' %(
self.__class__.__name__, self.name, self.age, self.__name__)
+class Person2(Person):
+ pass
+
+
def run_basic_crud(options):
conn = pymongo.Connection('localhost', 27017, tz_aware=False)
dm = datamanager.MongoDataManager(
@@ -57,7 +63,8 @@
transaction.begin()
t1 = time.time()
for idx in xrange(options.size):
- people[None] = Person('Mr Number %.5i' %idx, random.randint(0, 100))
+ klass = Person if (MULTIPLE_CLASSES and idx % 2) else Person2
+ people[None] = klass('Mr Number %.5i' %idx, random.randint(0, 100))
transaction.commit()
t2 = time.time()
print 'Insert: %.4f secs' % (t2-t1)
@@ -80,16 +87,21 @@
transaction.begin()
t1 = time.time()
[person.name for person in people.find()]
- #cProfile.runctx('[person.name for person in people.find()]', globals(), locals())
+ #cProfile.runctx(
+ # '[person.name for person in people.find()]', globals(), locals())
t2 = time.time()
print 'Fast Read: %.4f secs' % (t2-t1)
# Profile modification
t1 = time.time()
- for person in people.find():
- person.name += 'X'
- person.age += 1
- transaction.commit()
+ def modify():
+ for person in list(people.find()):
+ person.name += 'X'
+ person.age += 1
+ transaction.commit()
+ modify()
+ #cProfile.runctx(
+ # 'modify()', globals(), locals())
t2 = time.time()
print 'Modification: %.4f secs' % (t2-t1)
Modified: mongopersist/trunk/src/mongopersist/serialize.py
===================================================================
--- mongopersist/trunk/src/mongopersist/serialize.py 2012-04-02 06:30:34 UTC (rev 124871)
+++ mongopersist/trunk/src/mongopersist/serialize.py 2012-04-02 06:34:41 UTC (rev 124872)
@@ -28,11 +28,15 @@
from mongopersist import interfaces
+IGNORE_IDENTICAL_DOCUMENTS = True
+ALWAYS_READ_FULL_DOC = True
+
SERIALIZERS = []
OID_CLASS_LRU = lru.LRUCache(20000)
+COLLECTIONS_WITH_TYPE = set()
+AVAILABLE_NAME_MAPPINGS = set()
+PATH_RESOLVE_CACHE = {}
-IGNORE_IDENTICAL_DOCUMENTS = True
-
def get_dotted_name(obj):
return obj.__module__+'.'+obj.__name__
@@ -73,11 +77,15 @@
except AttributeError:
return db_name, get_dotted_name(obj.__class__)
# Make sure that the coll_name to class path mapping is available.
+ # Let's make sure we do the lookup only once, since the info will
+ # never change.
+ path = get_dotted_name(obj.__class__)
+ map = {'collection': coll_name, 'database': db_name, 'path': path}
+ map_hash = (db_name, coll_name, path)
+ if map_hash in AVAILABLE_NAME_MAPPINGS:
+ return db_name, coll_name
db = self._jar._conn[self._jar.default_database]
coll = db[self._jar.name_map_collection]
- map = {'collection': coll_name,
- 'database': db_name,
- 'path': get_dotted_name(obj.__class__)}
result = coll.find_one(map)
if result is None:
# If there is already a map for this collection, the next map must
@@ -88,6 +96,7 @@
setattr(obj, '_p_mongo_store_type', True)
map['doc_has_type'] = getattr(obj, '_p_mongo_store_type', False)
coll.save(map)
+ AVAILABLE_NAME_MAPPINGS.add(map_hash)
return db_name, coll_name
def get_non_persistent_state(self, obj, seen):
@@ -281,13 +290,29 @@
self._single_map_cache = {}
def simple_resolve(self, path):
- return resolve(path)
+ # We try to look up the klass from a cache. The important part here is
+ # that we also cache lookup failures as None, since they actually
+ # happen more frequently than a hit due to an optimization in the
+ # resolve() function.
+ try:
+ klass = PATH_RESOLVE_CACHE[path]
+ except KeyError:
+ try:
+ klass = resolve(path)
+ except ImportError:
+ PATH_RESOLVE_CACHE[path] = klass = None
+ else:
+ PATH_RESOLVE_CACHE[path] = klass
+ if klass is None:
+ raise ImportError(path)
+ return klass
def resolve(self, dbref):
__traceback_info__ = dbref
- # 1. Check the global oid-based lookup cache.
+ # 1. Check the global oid-based lookup cache. Use the hash of the id,
+ # since otherwise the comparison is way too expensive.
try:
- return OID_CLASS_LRU[dbref.id]
+ return OID_CLASS_LRU[hash(dbref.id)]
except KeyError:
pass
# 2. Check the transient single map entry lookup cache.
@@ -295,19 +320,43 @@
return self._single_map_cache[(dbref.database, dbref.collection)]
except KeyError:
pass
- # 3. Try to resolve the path directly.
+ # 3. If we have found the type within the document for a collection
+ # before, let's try again. This will only hit, if we have more than
+ # one type for the collection, otherwise the single map entry
+ # lookup failed.
+ coll_key = (dbref.database, dbref.collection)
+ if coll_key in COLLECTIONS_WITH_TYPE:
+ if dbref in self._jar._latest_states:
+ obj_doc = self._jar._latest_states[dbref]
+ elif ALWAYS_READ_FULL_DOC:
+ obj_doc = self._jar.get_collection(
+ dbref.database, dbref.collection).find_one(dbref.id)
+ self._jar._latest_states[dbref] = obj_doc
+ else:
+ obj_doc = self._jar\
+ .get_collection(dbref.database, dbref.collection)\
+ .find_one(dbref.id, fields=('_py_persistent_type',))
+ if '_py_persistent_type' in obj_doc:
+ klass = self.simple_resolve(obj_doc['_py_persistent_type'])
+ OID_CLASS_LRU[hash(dbref.id)] = klass
+ return klass
+ # 4. Try to resolve the path directly. We want to do this optimization
+ # after all others, because trying it a lot is very expensive.
try:
return self.simple_resolve(dbref.collection)
except ImportError:
pass
- # 4. No simple hits, so we have to do some leg work.
+ # 5. No simple hits, so we have to do some leg work.
# Let's now try to look up the path from the collection to path
# mapping
db = self._jar._conn[self._jar.default_database]
coll = db[self._jar.name_map_collection]
- result = coll.find(
- {'collection': dbref.collection, 'database': dbref.database})
- count = result.count()
+ result = tuple(coll.find(
+ {'collection': dbref.collection, 'database': dbref.database}))
+ # Calling count() on a query result causes another database
+ # access. Since the result sets should be typically very small, let's
+ # load them all.
+ count = len(result)
if count == 0:
raise ImportError(dbref)
elif count == 1:
@@ -315,7 +364,7 @@
# change later. But storing it for the length of the transaction
# is fine, which is really useful if you load a lot of objects of
# the same type.
- klass = self.simple_resolve(result.next()['path'])
+ klass = self.simple_resolve(result[0]['path'])
self._single_map_cache[(dbref.database, dbref.collection)] = klass
return klass
else:
@@ -323,13 +372,31 @@
raise ImportError(dbref)
# Multiple object types are stored in the collection. We have to
# look at the object to find out the type.
- obj_doc = self._jar\
- .get_collection(dbref.database, dbref.collection).find_one(
- dbref.id, fields=('_py_persistent_type',))
+ if dbref in self._jar._latest_states:
+ # Optimization: If we have the latest state, then we just get
+ # this object document. This is used for fast loading or when
+ # resolving the same object path a second time. (The latter
+ # should never happen due to the object cache.)
+ obj_doc = self._jar._latest_states[dbref]
+ elif ALWAYS_READ_FULL_DOC:
+ # Optimization: Read the entire doc and stick it in the right
+ # place so that unghostifying the object later will not cause
+ # another database access.
+ obj_doc = self._jar\
+ .get_collection(dbref.database, dbref.collection)\
+ .find_one(dbref.id)
+ self._jar._latest_states[dbref] = obj_doc
+ else:
+ obj_doc = self._jar\
+ .get_collection(dbref.database, dbref.collection)\
+ .find_one(dbref.id, fields=('_py_persistent_type',))
if '_py_persistent_type' in obj_doc:
+ COLLECTIONS_WITH_TYPE.add(coll_key)
klass = self.simple_resolve(obj_doc['_py_persistent_type'])
else:
# Find the name-map entry where "doc_has_type" is False.
+ # Note: This case is really inefficient and does not allow any
+ # optimization. It should be avoided as much as possible.
for name_map_item in result:
if not name_map_item['doc_has_type']:
klass = self.simple_resolve(name_map_item['path'])
Modified: mongopersist/trunk/src/mongopersist/testing.py
===================================================================
--- mongopersist/trunk/src/mongopersist/testing.py 2012-04-02 06:30:34 UTC (rev 124871)
+++ mongopersist/trunk/src/mongopersist/testing.py 2012-04-02 06:34:41 UTC (rev 124872)
@@ -53,3 +53,7 @@
test.globs['conn'].drop_database(test.globs['DBNAME'])
test.globs['conn'].disconnect()
serialize.SERIALIZERS.__init__()
+ serialize.OID_CLASS_LRU.__init__(20000)
+ serialize.COLLECTIONS_WITH_TYPE.__init__()
+ serialize.AVAILABLE_NAME_MAPPINGS.__init__()
+ serialize.PATH_RESOLVE_CACHE = {}
Modified: mongopersist/trunk/src/mongopersist/tests/test_serialize.py
===================================================================
--- mongopersist/trunk/src/mongopersist/tests/test_serialize.py 2012-04-02 06:30:34 UTC (rev 124871)
+++ mongopersist/trunk/src/mongopersist/tests/test_serialize.py 2012-04-02 06:34:41 UTC (rev 124872)
@@ -41,6 +41,13 @@
class Anything(persistent.Persistent):
pass
+class StoreType(persistent.Persistent):
+ _p_mongo_collection = 'storetype'
+ _p_mongo_store_type = True
+
+class StoreType2(StoreType):
+ pass
+
class Simple(object):
pass
@@ -298,6 +305,37 @@
{'_py_persistent_type': 'mongopersist.tests.test_serialize.Top'}
"""
+def doctest_ObjectWriter_get_full_state():
+ """ObjectWriter: get_full_state()
+
+ >>> writer = serialize.ObjectWriter(dm)
+
+ Let's get the state of a regular object:
+
+ >>> any = Anything()
+ >>> any.name = 'anything'
+ >>> writer.get_full_state(any)
+ {'name': 'anything'}
+
+ >>> any_ref = dm.insert(any)
+ >>> writer.get_full_state(any)
+ {'_id': ObjectId('4f79368e37a08e1c91000000'), 'name': 'anything'}
+
+ Now an object that stores its type:
+
+ >>> st = StoreType()
+ >>> st.name = 'storetype'
+ >>> pprint.pprint(writer.get_full_state(st))
+ {'_py_persistent_type': 'mongopersist.tests.test_serialize.StoreType',
+ 'name': 'storetype'}
+
+ >>> st_ref = dm.insert(st)
+ >>> pprint.pprint(writer.get_full_state(st))
+ {'_id': ObjectId('4f79372637a08e1cdf000001'),
+ '_py_persistent_type': 'mongopersist.tests.test_serialize.StoreType',
+ 'name': 'storetype'}
+ """
+
def doctest_ObjectWriter_store():
"""ObjectWriter: store()
@@ -385,6 +423,34 @@
>>> reader = serialize.ObjectReader(dm)
>>> reader.simple_resolve('mongopersist.tests.test_serialize.Top')
<class 'mongopersist.tests.test_serialize.Top'>
+
+ After the original lookup, the result is cached:
+
+ >>> pprint.pprint(serialize.PATH_RESOLVE_CACHE)
+ {'mongopersist.tests.test_serialize.Top':
+ <class 'mongopersist.tests.test_serialize.Top'>}
+
+ Note that even lookup failures are cached.
+
+ >>> reader.simple_resolve('path.to.bad')
+ Traceback (most recent call last):
+ ...
+ ImportError: path.to.bad
+
+ >>> pprint.pprint(serialize.PATH_RESOLVE_CACHE)
+ {'mongopersist.tests.test_serialize.Top':
+ <class 'mongopersist.tests.test_serialize.Top'>,
+ 'path.to.bad': None}
+
+ Resolving the path the second time uses the cache:
+
+ >>> reader.simple_resolve('mongopersist.tests.test_serialize.Top')
+ <class 'mongopersist.tests.test_serialize.Top'>
+
+ >>> reader.simple_resolve('path.to.bad')
+ Traceback (most recent call last):
+ ...
+ ImportError: path.to.bad
"""
def doctest_ObjectReader_resolve_simple():
@@ -400,6 +466,50 @@
<class 'mongopersist.tests.test_serialize.Top'>
"""
+def doctest_ObjectReader_resolve_quick_when_type_in_doc():
+ """ObjectReader: resolve(): Quick lookup when type in document.
+
+ This methods resolves a collection name to its class. The collection name
+ can be either any arbitrary string or a Python path.
+
+ >>> st = StoreType()
+ >>> st_ref = dm.insert(st)
+ >>> st2 = StoreType2()
+ >>> st2_ref = dm.insert(st2)
+ >>> dm.reset()
+
+ Let's now resolve the references:
+
+ >>> reader = serialize.ObjectReader(dm)
+ >>> reader.resolve(st_ref)
+ <class 'mongopersist.tests.test_serialize.StoreType'>
+ >>> reader.resolve(st2_ref)
+ <class 'mongopersist.tests.test_serialize.StoreType2'>
+ >>> dm.reset()
+
+ The collection is now stored as one where objects save their type:
+
+ >>> serialize.COLLECTIONS_WITH_TYPE
+ set([('mongopersist_test', 'storetype')])
+
+ So here comes the trick. When fast-loading objects, the documents are made
+ immediately available in the ``_latest_states`` mapping. This allows our
+ quick resolve to utilize that document instead of looking it up in the
+ database:
+
+ >>> writer = serialize.ObjectWriter(dm)
+ >>> coll = dm._get_collection_from_object(st)
+ >>> dm._latest_states[st_ref] = writer.get_full_state(st)
+ >>> dm._latest_states[st2_ref] = writer.get_full_state(st2)
+
+ >>> reader = serialize.ObjectReader(dm)
+ >>> reader.resolve(st_ref)
+ <class 'mongopersist.tests.test_serialize.StoreType'>
+ >>> reader.resolve(st2_ref)
+ <class 'mongopersist.tests.test_serialize.StoreType2'>
+
+ """
+
def doctest_ObjectReader_resolve_lookup():
"""ObjectReader: resolve(): lookup
@@ -459,6 +569,51 @@
ImportError: DBRef('Top', None, 'mongopersist_test')
"""
+def doctest_ObjectReader_resolve_lookup_with_multiple_maps_dont_read_full():
+ """ObjectReader: resolve(): lookup with multiple maps entries
+
+ Multiple maps lookup with the ALWAYS_READ_FULL_DOC option set to False.
+
+ >>> serialize.ALWAYS_READ_FULL_DOC = False
+
+ >>> writer = serialize.ObjectWriter(dm)
+ >>> top = Top()
+ >>> writer.store(top)
+ DBRef('Top', ObjectId('4eb1e0f237a08e38dd000002'), 'mongopersist_test')
+ >>> top2 = Top2()
+ >>> writer.store(top2)
+ DBRef('Top', ObjectId('4eb1e10437a08e38e8000004'), 'mongopersist_test')
+
+ >>> reader = serialize.ObjectReader(dm)
+ >>> reader.resolve(top._p_oid)
+ <class 'mongopersist.tests.test_serialize.Top'>
+ >>> reader.resolve(top2._p_oid)
+ <class 'mongopersist.tests.test_serialize.Top2'>
+
+ Let's clear some caches and try again:
+
+ >>> dm.reset()
+ >>> serialize.COLLECTIONS_WITH_TYPE.__init__()
+
+ >>> reader = serialize.ObjectReader(dm)
+ >>> reader.resolve(top._p_oid)
+ <class 'mongopersist.tests.test_serialize.Top'>
+ >>> reader.resolve(top2._p_oid)
+ <class 'mongopersist.tests.test_serialize.Top2'>
+
+ If the DBRef does not have an object id, then an import error is raised:
+
+ >>> reader.resolve(dbref.DBRef('Top', None, 'mongopersist_test'))
+ Traceback (most recent call last):
+ ...
+ ImportError: DBRef('Top', None, 'mongopersist_test')
+
+ Cleanup:
+
+ >>> serialize.ALWAYS_READ_FULL_DOC = True
+
+ """
+
def doctest_ObjectReader_get_non_persistent_object_py_type():
"""ObjectReader: get_non_persistent_object(): _py_type
Modified: mongopersist/trunk/src/mongopersist/zope/tests/test_container.py
===================================================================
--- mongopersist/trunk/src/mongopersist/zope/tests/test_container.py 2012-04-02 06:30:34 UTC (rev 124871)
+++ mongopersist/trunk/src/mongopersist/zope/tests/test_container.py 2012-04-02 06:34:41 UTC (rev 124872)
@@ -633,6 +633,10 @@
module.tearDown(test)
test.globs['conn'].disconnect()
serialize.SERIALIZERS.__init__()
+ serialize.OID_CLASS_LRU.__init__(20000)
+ serialize.COLLECTIONS_WITH_TYPE.__init__()
+ serialize.AVAILABLE_NAME_MAPPINGS.__init__()
+ serialize.PATH_RESOLVE_CACHE.__init__()
exceptionformatter.DEBUG_EXCEPTION_FORMATTER = test.orig_DEBUG_EXCEPTION_FORMATTER
def test_suite():
More information about the checkins
mailing list