[Checkins] SVN: mongopersist/trunk/ - Feature: Massively improved performance on all levels. This was mainly

Stephen Richter cvs-admin at zope.org
Mon Apr 2 06:34:44 UTC 2012


Log message for revision 124872:
  - Feature: Massively improved performance on all levels. This was mainly
    accomplished by removing unnecessary database accesses, better caching and
    more efficient algorithms. This results in speedups between 4-25 times.
  

Changed:
  U   mongopersist/trunk/CHANGES.txt
  U   mongopersist/trunk/src/mongopersist/performance.py
  U   mongopersist/trunk/src/mongopersist/serialize.py
  U   mongopersist/trunk/src/mongopersist/testing.py
  U   mongopersist/trunk/src/mongopersist/tests/test_serialize.py
  U   mongopersist/trunk/src/mongopersist/zope/tests/test_container.py

-=-
Modified: mongopersist/trunk/CHANGES.txt
===================================================================
--- mongopersist/trunk/CHANGES.txt	2012-04-02 06:30:34 UTC (rev 124871)
+++ mongopersist/trunk/CHANGES.txt	2012-04-02 06:34:41 UTC (rev 124872)
@@ -50,16 +50,42 @@
 - Feature: Added a little script to test performance. It is not very
   sophisticated, but it is sufficient for a first round of optimizations.
 
-- Performance: Drastically improved performance for collections that store
-  only one type of objects and where the documents do not store the type
-  (i.e. it is stored in the name map collection).
+- Feature: Massively improved performance on all levels. This was mainly
+  accomplished by removing unnecessary database accesses, better caching and
+  more efficient algorithms. This results in speedups between 4-25 times.
 
-- Performance: The Zope Container fast load via find() did not work correctly,
-  since setstate() did not change the state from ghost to active and thus the
-  state was loaded again from MongoDB and set on the object. Now we use the
-  new ``_latest_states`` cache to lookup a document when ``setstate()`` is
-  called through the proper channels.
+  - When resolving the path to a klass, the result is now cached. More
+    importantly, lookup failures are also cached, mapping path ->
+    ``None``. This is important, since an optimization in the ``resolve()``
+    method causes a lot of failing lookups.
 
+  - When resolving the dbref to a type, we try to resolve the dbref early
+    using the document, if we know that the documents within the collection
+    store their type path. This avoids frequent queries of the name map
+    collection when it is not needed.
+
+  - When getting the object document to read the class path, it will now read
+    the entire document and store it in the ``_latest_states`` dictionary, so
+    that other code may pick it up and use it. This should avoid superfluous
+    reads from MongoDB.
+
+  - Drastically improved performance for collections that store only one type
+    of object and where the documents do not store the type (i.e. it is
+    stored in the name map collection).
+
+  - The Zope Container fast load via find() did not work correctly, since
+    setstate() did not change the state from ghost to active and thus the
+    state was loaded again from MongoDB and set on the object. Now we use the
+    new ``_latest_states`` cache to lookup a document when ``setstate()`` is
+    called through the proper channels. Now this "fast load" method truly
+    causes O(1) database lookups.
+
+  - Whenever the Mongo Object Id is used as a hash key, use the hash of the id
+    instead. The ``__cmp__()`` method of the ``ObjectId`` class is way too
+    slow.
+
+  - Cache collection name lookup from objects in the ``ObjectWriter`` class.
+
 - Bug: We have seen several occasions in production where we suddenly lost
   some state in some documents, which prohibited the objects from being
   loadable again. The cause was that the ``_original_states`` attribute did not

Modified: mongopersist/trunk/src/mongopersist/performance.py
===================================================================
--- mongopersist/trunk/src/mongopersist/performance.py	2012-04-02 06:30:34 UTC (rev 124871)
+++ mongopersist/trunk/src/mongopersist/performance.py	2012-04-02 06:34:41 UTC (rev 124872)
@@ -25,6 +25,8 @@
 from mongopersist import conflict, datamanager
 from mongopersist.zope import container
 
+MULTIPLE_CLASSES = True
+
 class People(container.AllItemsMongoContainer):
     _p_mongo_collection = 'people'
     _m_database = 'performance'
@@ -32,7 +34,7 @@
 
 class Person(persistent.Persistent, container.MongoContained):
     _p_mongo_collection = 'person'
-    #_p_mongo_store_type = True
+    _p_mongo_store_type = True
 
     def __init__(self, name, age):
         self.name = name
@@ -42,6 +44,10 @@
         return '<%s %s @ %i [%s]>' %(
             self.__class__.__name__, self.name, self.age, self.__name__)
 
+class Person2(Person):
+    pass
+
+
 def run_basic_crud(options):
     conn = pymongo.Connection('localhost', 27017, tz_aware=False)
     dm = datamanager.MongoDataManager(
@@ -57,7 +63,8 @@
         transaction.begin()
         t1 = time.time()
         for idx in xrange(options.size):
-            people[None] = Person('Mr Number %.5i' %idx, random.randint(0, 100))
+            klass = Person if (MULTIPLE_CLASSES and idx % 2) else Person2
+            people[None] = klass('Mr Number %.5i' %idx, random.randint(0, 100))
         transaction.commit()
         t2 = time.time()
         print 'Insert:       %.4f secs' % (t2-t1)
@@ -80,16 +87,21 @@
     transaction.begin()
     t1 = time.time()
     [person.name for person in people.find()]
-    #cProfile.runctx('[person.name for person in people.find()]', globals(), locals())
+    #cProfile.runctx(
+    #    '[person.name for person in people.find()]', globals(), locals())
     t2 = time.time()
     print 'Fast Read:    %.4f secs' % (t2-t1)
 
     # Profile modification
     t1 = time.time()
-    for person in people.find():
-        person.name += 'X'
-        person.age += 1
-    transaction.commit()
+    def modify():
+        for person in list(people.find()):
+            person.name += 'X'
+            person.age += 1
+        transaction.commit()
+    modify()
+    #cProfile.runctx(
+    #    'modify()', globals(), locals())
     t2 = time.time()
     print 'Modification: %.4f secs' % (t2-t1)
 

Modified: mongopersist/trunk/src/mongopersist/serialize.py
===================================================================
--- mongopersist/trunk/src/mongopersist/serialize.py	2012-04-02 06:30:34 UTC (rev 124871)
+++ mongopersist/trunk/src/mongopersist/serialize.py	2012-04-02 06:34:41 UTC (rev 124872)
@@ -28,11 +28,15 @@
 
 from mongopersist import interfaces
 
+IGNORE_IDENTICAL_DOCUMENTS = True
+ALWAYS_READ_FULL_DOC = True
+
 SERIALIZERS = []
 OID_CLASS_LRU = lru.LRUCache(20000)
+COLLECTIONS_WITH_TYPE = set()
+AVAILABLE_NAME_MAPPINGS = set()
+PATH_RESOLVE_CACHE = {}
 
-IGNORE_IDENTICAL_DOCUMENTS = True
-
 def get_dotted_name(obj):
     return obj.__module__+'.'+obj.__name__
 
@@ -73,11 +77,15 @@
         except AttributeError:
             return db_name, get_dotted_name(obj.__class__)
         # Make sure that the coll_name to class path mapping is available.
+        # Let's make sure we do the lookup only once, since the info will
+        # never change.
+        path = get_dotted_name(obj.__class__)
+        map = {'collection': coll_name, 'database': db_name, 'path': path}
+        map_hash = (db_name, coll_name, path)
+        if map_hash in AVAILABLE_NAME_MAPPINGS:
+            return db_name, coll_name
         db = self._jar._conn[self._jar.default_database]
         coll = db[self._jar.name_map_collection]
-        map = {'collection': coll_name,
-               'database': db_name,
-               'path': get_dotted_name(obj.__class__)}
         result = coll.find_one(map)
         if result is None:
             # If there is already a map for this collection, the next map must
@@ -88,6 +96,7 @@
                 setattr(obj, '_p_mongo_store_type', True)
             map['doc_has_type'] = getattr(obj, '_p_mongo_store_type', False)
             coll.save(map)
+        AVAILABLE_NAME_MAPPINGS.add(map_hash)
         return db_name, coll_name
 
     def get_non_persistent_state(self, obj, seen):
@@ -281,13 +290,29 @@
         self._single_map_cache = {}
 
     def simple_resolve(self, path):
-        return resolve(path)
+        # We try to look up the klass from a cache. The important part here is
+        # that we also cache lookup failures as None, since they actually
+        # happen more frequently than a hit due to an optimization in the
+        # resolve() function.
+        try:
+            klass = PATH_RESOLVE_CACHE[path]
+        except KeyError:
+            try:
+                klass = resolve(path)
+            except ImportError:
+                PATH_RESOLVE_CACHE[path] = klass = None
+            else:
+                PATH_RESOLVE_CACHE[path] = klass
+        if klass is None:
+            raise ImportError(path)
+        return klass
 
     def resolve(self, dbref):
         __traceback_info__ = dbref
-        # 1. Check the global oid-based lookup cache.
+        # 1. Check the global oid-based lookup cache. Use the hash of the id,
+        #    since otherwise the comparison is way too expensive.
         try:
-            return OID_CLASS_LRU[dbref.id]
+            return OID_CLASS_LRU[hash(dbref.id)]
         except KeyError:
             pass
         # 2. Check the transient single map entry lookup cache.
@@ -295,19 +320,43 @@
             return self._single_map_cache[(dbref.database, dbref.collection)]
         except KeyError:
             pass
-        # 3. Try to resolve the path directly.
+        # 3. If we have found the type within the document for a collection
+        #    before, let's try again. This will only hit, if we have more than
+        #    one type for the collection, otherwise the single map entry
+        #    lookup failed.
+        coll_key = (dbref.database, dbref.collection)
+        if coll_key in COLLECTIONS_WITH_TYPE:
+            if dbref in self._jar._latest_states:
+                obj_doc = self._jar._latest_states[dbref]
+            elif ALWAYS_READ_FULL_DOC:
+                obj_doc = self._jar.get_collection(
+                    dbref.database, dbref.collection).find_one(dbref.id)
+                self._jar._latest_states[dbref] = obj_doc
+            else:
+                obj_doc = self._jar\
+                    .get_collection(dbref.database, dbref.collection)\
+                    .find_one(dbref.id, fields=('_py_persistent_type',))
+            if '_py_persistent_type' in obj_doc:
+                klass = self.simple_resolve(obj_doc['_py_persistent_type'])
+                OID_CLASS_LRU[hash(dbref.id)] = klass
+                return klass
+        # 4. Try to resolve the path directly. We want to do this optimization
+        #    after all others, because trying it a lot is very expensive.
         try:
             return self.simple_resolve(dbref.collection)
         except ImportError:
             pass
-        # 4. No simple hits, so we have to do some leg work.
+        # 5. No simple hits, so we have to do some leg work.
         # Let's now try to look up the path from the collection to path
         # mapping
         db = self._jar._conn[self._jar.default_database]
         coll = db[self._jar.name_map_collection]
-        result = coll.find(
-            {'collection': dbref.collection, 'database': dbref.database})
-        count = result.count()
+        result = tuple(coll.find(
+            {'collection': dbref.collection, 'database': dbref.database}))
+        # Calling count() on a query result causes another database
+        # access. Since the result sets should be typically very small, let's
+        # load them all.
+        count = len(result)
         if count == 0:
             raise ImportError(dbref)
         elif count == 1:
@@ -315,7 +364,7 @@
             # change later. But storing it for the length of the transaction
             # is fine, which is really useful if you load a lot of objects of
             # the same type.
-            klass = self.simple_resolve(result.next()['path'])
+            klass = self.simple_resolve(result[0]['path'])
             self._single_map_cache[(dbref.database, dbref.collection)] = klass
             return klass
         else:
@@ -323,13 +372,31 @@
                 raise ImportError(dbref)
             # Multiple object types are stored in the collection. We have to
             # look at the object to find out the type.
-            obj_doc = self._jar\
-                .get_collection(dbref.database, dbref.collection).find_one(
-                    dbref.id, fields=('_py_persistent_type',))
+            if dbref in self._jar._latest_states:
+                # Optimization: If we have the latest state, then we just get
+                # this object document. This is used for fast loading or when
+                # resolving the same object path a second time. (The latter
+                # should never happen due to the object cache.)
+                obj_doc = self._jar._latest_states[dbref]
+            elif ALWAYS_READ_FULL_DOC:
+                # Optimization: Read the entire doc and stick it in the right
+                # place so that unghostifying the object later will not cause
+                # another database access.
+                obj_doc = self._jar\
+                    .get_collection(dbref.database, dbref.collection)\
+                    .find_one(dbref.id)
+                self._jar._latest_states[dbref] = obj_doc
+            else:
+                obj_doc = self._jar\
+                    .get_collection(dbref.database, dbref.collection)\
+                    .find_one(dbref.id, fields=('_py_persistent_type',))
             if '_py_persistent_type' in obj_doc:
+                COLLECTIONS_WITH_TYPE.add(coll_key)
                 klass = self.simple_resolve(obj_doc['_py_persistent_type'])
             else:
                 # Find the name-map entry where "doc_has_type" is False.
+                # Note: This case is really inefficient and does not allow any
+                # optimization. It should be avoided as much as possible.
                 for name_map_item in result:
                     if not name_map_item['doc_has_type']:
                         klass = self.simple_resolve(name_map_item['path'])

Modified: mongopersist/trunk/src/mongopersist/testing.py
===================================================================
--- mongopersist/trunk/src/mongopersist/testing.py	2012-04-02 06:30:34 UTC (rev 124871)
+++ mongopersist/trunk/src/mongopersist/testing.py	2012-04-02 06:34:41 UTC (rev 124872)
@@ -53,3 +53,7 @@
     test.globs['conn'].drop_database(test.globs['DBNAME'])
     test.globs['conn'].disconnect()
     serialize.SERIALIZERS.__init__()
+    serialize.OID_CLASS_LRU.__init__(20000)
+    serialize.COLLECTIONS_WITH_TYPE.__init__()
+    serialize.AVAILABLE_NAME_MAPPINGS.__init__()
+    serialize.PATH_RESOLVE_CACHE = {}

Modified: mongopersist/trunk/src/mongopersist/tests/test_serialize.py
===================================================================
--- mongopersist/trunk/src/mongopersist/tests/test_serialize.py	2012-04-02 06:30:34 UTC (rev 124871)
+++ mongopersist/trunk/src/mongopersist/tests/test_serialize.py	2012-04-02 06:34:41 UTC (rev 124872)
@@ -41,6 +41,13 @@
 class Anything(persistent.Persistent):
     pass
 
+class StoreType(persistent.Persistent):
+    _p_mongo_collection = 'storetype'
+    _p_mongo_store_type = True
+
+class StoreType2(StoreType):
+    pass
+
 class Simple(object):
     pass
 
@@ -298,6 +305,37 @@
       {'_py_persistent_type': 'mongopersist.tests.test_serialize.Top'}
     """
 
+def doctest_ObjectWriter_get_full_state():
+    """ObjectWriter: get_full_state()
+
+      >>> writer = serialize.ObjectWriter(dm)
+
+    Let's get the state of a regular object:
+
+      >>> any = Anything()
+      >>> any.name = 'anything'
+      >>> writer.get_full_state(any)
+      {'name': 'anything'}
+
+      >>> any_ref = dm.insert(any)
+      >>> writer.get_full_state(any)
+      {'_id': ObjectId('4f79368e37a08e1c91000000'), 'name': 'anything'}
+
+    Now an object that stores its type:
+
+      >>> st = StoreType()
+      >>> st.name = 'storetype'
+      >>> pprint.pprint(writer.get_full_state(st))
+      {'_py_persistent_type': 'mongopersist.tests.test_serialize.StoreType',
+       'name': 'storetype'}
+
+      >>> st_ref = dm.insert(st)
+      >>> pprint.pprint(writer.get_full_state(st))
+      {'_id': ObjectId('4f79372637a08e1cdf000001'),
+       '_py_persistent_type': 'mongopersist.tests.test_serialize.StoreType',
+       'name': 'storetype'}
+    """
+
 def doctest_ObjectWriter_store():
     """ObjectWriter: store()
 
@@ -385,6 +423,34 @@
       >>> reader = serialize.ObjectReader(dm)
       >>> reader.simple_resolve('mongopersist.tests.test_serialize.Top')
       <class 'mongopersist.tests.test_serialize.Top'>
+
+    After the original lookup, the result is cached:
+
+      >>> pprint.pprint(serialize.PATH_RESOLVE_CACHE)
+      {'mongopersist.tests.test_serialize.Top':
+          <class 'mongopersist.tests.test_serialize.Top'>}
+
+    Note that even lookup failures are cached.
+
+      >>> reader.simple_resolve('path.to.bad')
+      Traceback (most recent call last):
+      ...
+      ImportError: path.to.bad
+
+      >>> pprint.pprint(serialize.PATH_RESOLVE_CACHE)
+      {'mongopersist.tests.test_serialize.Top':
+          <class 'mongopersist.tests.test_serialize.Top'>,
+       'path.to.bad': None}
+
+    Resolving the path the second time uses the cache:
+
+      >>> reader.simple_resolve('mongopersist.tests.test_serialize.Top')
+      <class 'mongopersist.tests.test_serialize.Top'>
+
+      >>> reader.simple_resolve('path.to.bad')
+      Traceback (most recent call last):
+      ...
+      ImportError: path.to.bad
     """
 
 def doctest_ObjectReader_resolve_simple():
@@ -400,6 +466,50 @@
       <class 'mongopersist.tests.test_serialize.Top'>
     """
 
+def doctest_ObjectReader_resolve_quick_when_type_in_doc():
+    """ObjectReader: resolve(): Quick lookup when type in document.
+
+    This methods resolves a collection name to its class. The collection name
+    can be either any arbitrary string or a Python path.
+
+      >>> st = StoreType()
+      >>> st_ref = dm.insert(st)
+      >>> st2 = StoreType2()
+      >>> st2_ref = dm.insert(st2)
+      >>> dm.reset()
+
+    Let's now resolve the references:
+
+      >>> reader = serialize.ObjectReader(dm)
+      >>> reader.resolve(st_ref)
+      <class 'mongopersist.tests.test_serialize.StoreType'>
+      >>> reader.resolve(st2_ref)
+      <class 'mongopersist.tests.test_serialize.StoreType2'>
+      >>> dm.reset()
+
+    The collection is now stored as one where objects save their type:
+
+      >>> serialize.COLLECTIONS_WITH_TYPE
+      set([('mongopersist_test', 'storetype')])
+
+    So here comes the trick. When fast-loading objects, the documents are made
+    immediately available in the ``_latest_states`` mapping. This allows our
+    quick resolve to utilize that document instead of looking it up in the
+    database:
+
+      >>> writer = serialize.ObjectWriter(dm)
+      >>> coll = dm._get_collection_from_object(st)
+      >>> dm._latest_states[st_ref] = writer.get_full_state(st)
+      >>> dm._latest_states[st2_ref] = writer.get_full_state(st2)
+
+      >>> reader = serialize.ObjectReader(dm)
+      >>> reader.resolve(st_ref)
+      <class 'mongopersist.tests.test_serialize.StoreType'>
+      >>> reader.resolve(st2_ref)
+      <class 'mongopersist.tests.test_serialize.StoreType2'>
+
+  """
+
 def doctest_ObjectReader_resolve_lookup():
     """ObjectReader: resolve(): lookup
 
@@ -459,6 +569,51 @@
       ImportError: DBRef('Top', None, 'mongopersist_test')
     """
 
+def doctest_ObjectReader_resolve_lookup_with_multiple_maps_dont_read_full():
+    """ObjectReader: resolve(): lookup with multiple map entries
+
+    Multiple maps lookup with the ALWAYS_READ_FULL_DOC option set to False.
+
+      >>> serialize.ALWAYS_READ_FULL_DOC = False
+
+      >>> writer = serialize.ObjectWriter(dm)
+      >>> top = Top()
+      >>> writer.store(top)
+      DBRef('Top', ObjectId('4eb1e0f237a08e38dd000002'), 'mongopersist_test')
+      >>> top2 = Top2()
+      >>> writer.store(top2)
+      DBRef('Top', ObjectId('4eb1e10437a08e38e8000004'), 'mongopersist_test')
+
+      >>> reader = serialize.ObjectReader(dm)
+      >>> reader.resolve(top._p_oid)
+      <class 'mongopersist.tests.test_serialize.Top'>
+      >>> reader.resolve(top2._p_oid)
+      <class 'mongopersist.tests.test_serialize.Top2'>
+
+    Let's clear some caches and try again:
+
+      >>> dm.reset()
+      >>> serialize.COLLECTIONS_WITH_TYPE.__init__()
+
+      >>> reader = serialize.ObjectReader(dm)
+      >>> reader.resolve(top._p_oid)
+      <class 'mongopersist.tests.test_serialize.Top'>
+      >>> reader.resolve(top2._p_oid)
+      <class 'mongopersist.tests.test_serialize.Top2'>
+
+    If the DBRef does not have an object id, then an import error is raised:
+
+      >>> reader.resolve(dbref.DBRef('Top', None, 'mongopersist_test'))
+      Traceback (most recent call last):
+      ...
+      ImportError: DBRef('Top', None, 'mongopersist_test')
+
+    Cleanup:
+
+      >>> serialize.ALWAYS_READ_FULL_DOC = True
+
+    """
+
 def doctest_ObjectReader_get_non_persistent_object_py_type():
     """ObjectReader: get_non_persistent_object(): _py_type
 

Modified: mongopersist/trunk/src/mongopersist/zope/tests/test_container.py
===================================================================
--- mongopersist/trunk/src/mongopersist/zope/tests/test_container.py	2012-04-02 06:30:34 UTC (rev 124871)
+++ mongopersist/trunk/src/mongopersist/zope/tests/test_container.py	2012-04-02 06:34:41 UTC (rev 124872)
@@ -633,6 +633,10 @@
     module.tearDown(test)
     test.globs['conn'].disconnect()
     serialize.SERIALIZERS.__init__()
+    serialize.OID_CLASS_LRU.__init__(20000)
+    serialize.COLLECTIONS_WITH_TYPE.__init__()
+    serialize.AVAILABLE_NAME_MAPPINGS.__init__()
+    serialize.PATH_RESOLVE_CACHE.__init__()
     exceptionformatter.DEBUG_EXCEPTION_FORMATTER = test.orig_DEBUG_EXCEPTION_FORMATTER
 
 def test_suite():



More information about the checkins mailing list