[Checkins] SVN: relstorage/trunk/ Detect and handle backward time travel, which can happen after failover to an out-of-date asynchronous slave database

Wed Sep 30 04:23:21 EDT 2009

Log message for revision 104642:
  Detect and handle backward time travel, which can happen after
  failover to an out-of-date asynchronous slave database. For
  simplicity, invalidate the whole ZODB cache when this happens.
  

Changed:
  U   relstorage/trunk/CHANGES.txt
  U   relstorage/trunk/relstorage/adapters/poller.py
  U   relstorage/trunk/relstorage/tests/reltestbase.py

-=-
Modified: relstorage/trunk/CHANGES.txt
===================================================================

--- relstorage/trunk/CHANGES.txt	2009-09-30 07:55:35 UTC (rev 104641)
+++ relstorage/trunk/CHANGES.txt	2009-09-30 08:23:21 UTC (rev 104642)
@@ -34,6 +34,10 @@
 - Use the store connection rather than the load connection for OID
   allocation.
 
+- Detect and handle backward time travel, which can happen after
+  failover to an out-of-date asynchronous slave database. For
+  simplicity, invalidate the whole ZODB cache when this happens.
+
 1.3.0b1 (2009-09-04)
 --------------------
 

Modified: relstorage/trunk/relstorage/adapters/poller.py
===================================================================
--- relstorage/trunk/relstorage/adapters/poller.py	2009-09-30 07:55:35 UTC (rev 104641)
+++ relstorage/trunk/relstorage/adapters/poller.py	2009-09-30 08:23:21 UTC (rev 104642)
@@ -14,6 +14,8 @@
 
 from relstorage.adapters.interfaces import IPoller
 from zope.interface import implements
+import logging
+log = logging.getLogger(__name__)
 
 class Poller:
     """Database change notification poller"""
@@ -51,8 +53,12 @@
             # If the previously polled transaction no longer exists,
             # the cache is too old and needs to be cleared.
             # XXX Do we actually need to detect this condition? I think
-            # if we delete this block of code, all the reachable objects
-            # will be invalidated anyway.
+            # if we delete this block of code, all the unreachable
+            # objects will be invalidated anyway. So, as a test, I have
+            # not written the equivalent of this block of code for
+            # history-free storage. If something goes wrong, then we'll
+            # know there's some other edge condition we have to account
+            # for.
             stmt = "SELECT 1 FROM transaction WHERE tid = %(tid)s"
             cursor.execute(intern(stmt % self.runner.script_vars),
                 {'tid': prev_polled_tid})
@@ -63,23 +69,42 @@
                 return None, new_polled_tid
 
         # Get the list of changed OIDs and return it.
-        if self.keep_history:
-            stmt = """
-            SELECT zoid
-            FROM current_object
-            WHERE tid > %(tid)s
-            """
+        if new_polled_tid > prev_polled_tid:
+            if self.keep_history:
+                stmt = """
+                SELECT zoid
+                FROM current_object
+                WHERE tid > %(tid)s
+                """
+            else:
+                stmt = """
+                SELECT zoid
+                FROM object_state
+                WHERE tid > %(tid)s
+                """
+            params = {'tid': prev_polled_tid}
+            if ignore_tid is not None:
+                stmt += " AND tid != %(self_tid)s"
+                params['self_tid'] = ignore_tid
+            stmt = intern(stmt % self.runner.script_vars)
+
         else:
-            stmt = """
-            SELECT zoid
-            FROM object_state
-            WHERE tid > %(tid)s
-            """
-        params = {'tid': prev_polled_tid}
-        if ignore_tid is not None:
-            stmt += " AND tid != %(self_tid)s"
-            params['self_tid'] = ignore_tid
-        stmt = intern(stmt % self.runner.script_vars)
+            # We moved backward in time. This can happen after failover
+            # to an asynchronous slave that is not fully up to date. If
+            # this was not caused by failover, it suggests that
+            # transaction IDs are not being created in order, which can
+            # lead to consistency violations.
+            log.warning(
+                "Detected backward time travel (old tid %d, new tid %d). "
+                "This is acceptable if it was caused by failover to a "
+                "read-only asynchronous slave, but otherwise it may "
+                "indicate a problem.",
+                prev_polled_tid, new_polled_tid)
+            # Although we could handle this situation by looking at the
+            # whole cache and invalidating only certain objects,
+            # invalidating the whole cache is simpler.
+            return None, new_polled_tid
+
         cursor.execute(stmt, params)
         oids = [oid for (oid,) in cursor]
 

Modified: relstorage/trunk/relstorage/tests/reltestbase.py
===================================================================
--- relstorage/trunk/relstorage/tests/reltestbase.py	2009-09-30 07:55:35 UTC (rev 104641)
+++ relstorage/trunk/relstorage/tests/reltestbase.py	2009-09-30 08:23:21 UTC (rev 104642)
@@ -421,7 +421,60 @@
         self.assertRaises(UnpicklingError, self._storage.pack,
             time.time() + 10000, referencesf)
 
+    def checkBackwardTimeTravel(self):
+        # When a failover event causes the storage to switch to an
+        # asynchronous slave that is not fully up to date, the poller
+        # should notice that backward time travel has occurred and
+        # handle the situation by invalidating all objects that have
+        # changed in the interval. (Currently, we simply invalidate all
+        # objects when backward time travel occurs.)
+        import os
+        import shutil
+        import tempfile
+        from ZODB.FileStorage import FileStorage
+        db = DB(self._storage)
+        try:
+            c = db.open()
+            r = c.root()
+            r['alpha'] = PersistentMapping()
+            transaction.commit()
 
+            # To simulate failover to an out-of-date asynchronous slave, take
+            # a snapshot of the database at this point, change some
+            # object, then restore the database to its earlier state.
+
+            d = tempfile.mkdtemp()
+            try:
+                fs = FileStorage(os.path.join(d, 'Data.fs'))
+                fs.copyTransactionsFrom(c._storage)
+
+                r['beta'] = PersistentMapping()
+                transaction.commit()
+                self.assertTrue('beta' in r)
+
+                c._storage.zap_all()
+                c._storage.copyTransactionsFrom(fs)
+
+                fs.close()
+            finally:
+                shutil.rmtree(d)
+
+            # r is still cached, so the stale 'beta' key remains visible.
+            self.assertTrue('beta' in r)
+
+            # Now sync, which will call poll_invalidations().
+            c.sync()
+
+            # r should have been invalidated
+            self.assertEqual(r._p_changed, None)
+
+            # r should be reverted to its earlier state.
+            self.assertFalse('beta' in r)
+
+        finally:
+            db.close()
+
+
 class DoubleCommitter(Persistent):
     """A crazy persistent class that changes self in __getstate__"""
     def __getstate__(self):