[Checkins] SVN: gocept.zeoraid/trunk/ - #484921: Log reason why a backend is degraded. We also display that reason

Christian Theune ct at gocept.com
Sat Nov 21 06:44:34 EST 2009


Log message for revision 105944:
  - #484921: Log the reason why a backend is degraded. We also display that
    reason in the detailed status of the controller client now. If an
    exception occurred, we also log the traceback now.

  - Refactored the protocol for displaying detailed status information of
    backend storages to be simpler, less error-prone and easier to change in
    the future. (A sketch of the new protocol follows below.)

  - Fix a small bug in the detailed status display which would fail if the
    recovery status was missing the transaction information.
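
For illustration, here is a rough sketch of the refactored status protocol
as it now travels from storage.py to the controller client: raid_details()
answers a plain dict mapping backend storage names to human-readable status
strings instead of the old four-element list. The print_details() helper and
the example data below are invented for this sketch; only the dict shape and
the rendering loop are taken from the diff further down.

    # Sketch only (Python 2, mirroring cmd_details() in controller.py below).
    # raid_details() now returns {backend_name: status_string}.
    def print_details(storages, col1=25):
        for name in sorted(storages):
            print ' %s| %s' % (str(name).ljust(col1), storages[name])

    example_details = {
        '1': 'optimal',
        '2': 'failed: an error occurred opening the storage',
        '3': 'recovering: recover transaction 0x03bf12',  # invented tid
    }
    print_details(example_details)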
  
  

Changed:
  U   gocept.zeoraid/trunk/CHANGES.txt
  U   gocept.zeoraid/trunk/src/gocept/zeoraid/recovery.py
  U   gocept.zeoraid/trunk/src/gocept/zeoraid/scripts/controller.py
  U   gocept.zeoraid/trunk/src/gocept/zeoraid/storage.py
  U   gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_basics.py
  U   gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_recovery.py
  U   gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_zeo.py
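
A minimal sketch of the degrade bookkeeping added to storage.py follows. The
DegradeBookkeeping class and the logger name are invented stand-ins; the real
_degrade_storage() additionally closes the backend first and raises RAIDError
when no optimal storage remains and fail is true.

    import logging

    # Logger name assumed for the sketch; storage.py uses its own module logger.
    logger = logging.getLogger('gocept.zeoraid')

    class DegradeBookkeeping(object):
        """Invented illustration of how degrade reasons are now recorded."""

        def __init__(self):
            self.storages_degraded = []
            self.degrade_reasons = {}

        def degrade(self, name, reason):
            # Every degradation now records a human-readable reason ...
            self.storages_degraded.append(name)
            self.degrade_reasons[name] = reason
            # ... and logs it prominently. raid_details() later turns this
            # into a 'failed: <reason>' status line.
            logger.critical('Storage %s degraded. Reason: %s' % (name, reason))

Callers that degrade a backend because of an exception now call
logger.exception() first, so the traceback is logged in addition to the
reason string.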

-=-
Modified: gocept.zeoraid/trunk/CHANGES.txt
===================================================================
--- gocept.zeoraid/trunk/CHANGES.txt	2009-11-21 09:03:44 UTC (rev 105943)
+++ gocept.zeoraid/trunk/CHANGES.txt	2009-11-21 11:44:33 UTC (rev 105944)
@@ -5,9 +5,19 @@
 1.0b7 (unreleased)
 ------------------
 
-- Nothing changed yet.
+- #484921: Log the reason why a backend is degraded. We also display that
+  reason in the detailed status of the controller client now. If an exception
+  occurred, we also log the traceback now.
 
+- Refactored the protocol for displaying detailed status information of backend
+  storages to be simpler, less error-prone and easier to change in the future.
 
+- Fix a small bug in the detailed status display which would fail if the
+  recovery status was missing the transaction information.
+
+- Improve documentation about issues with recovery in a multi-ZEORaid setup.
+
+
 1.0b6 (2009-11-18)
 ------------------
 

Modified: gocept.zeoraid/trunk/src/gocept/zeoraid/recovery.py
===================================================================
--- gocept.zeoraid/trunk/src/gocept/zeoraid/recovery.py	2009-11-21 09:03:44 UTC (rev 105943)
+++ gocept.zeoraid/trunk/src/gocept/zeoraid/recovery.py	2009-11-21 11:44:33 UTC (rev 105944)
@@ -97,7 +97,7 @@
 
             yield ('verify', ZODB.utils.tid_repr(source_txn.tid))
 
-        yield ('verified',)
+        yield ('verified', '')
 
         # Recover from that point on until the target storage has all
         # transactions that exist in the source storage at the time of
@@ -152,4 +152,4 @@
 
             yield ('recover', ZODB.utils.tid_repr(txn_info.tid))
 
-        yield ('recovered',)
+        yield ('recovered', '')

Modified: gocept.zeoraid/trunk/src/gocept/zeoraid/scripts/controller.py
===================================================================
--- gocept.zeoraid/trunk/src/gocept/zeoraid/scripts/controller.py	2009-11-21 09:03:44 UTC (rev 105943)
+++ gocept.zeoraid/trunk/src/gocept/zeoraid/scripts/controller.py	2009-11-21 11:44:33 UTC (rev 105944)
@@ -80,25 +80,16 @@
     def cmd_details(self):
         col1 = 25
 
-        ok, recovering, failed, recovery_status = self.raid.raid_details()
+        storages = self.raid.raid_details()
         print " %s| Status" % ('%s:%s' % (self.host, self.port)).ljust(col1)
         print " %s+-------------------" % ('-' * col1)
         print " %s| %s" % (('RAID %s' % self.storage).ljust(col1),
                            self.raid.raid_status().ljust(col1))
         print " %s+-------------------" % ('-' * col1)
 
-        storages = {}
-        for storage in ok:
-            storages[storage] = 'optimal'
-        for storage in failed:
-            storages[storage] = 'failed'
-        if recovering:
-            storages[recovering] = ('recovering: %s transaction %s' %
-                                    recovery_status)
+        for storage in sorted(storages):
+            print ' %s| %s' % (str(storage).ljust(col1), storages[storage])
 
-        for id in sorted(storages):
-            print ' %s| %s' % (str(id).ljust(col1), storages[id])
-
         return STATUS_TO_NAGIOS[self.raid.raid_status()]
 
     def cmd_recover(self, storage):

Modified: gocept.zeoraid/trunk/src/gocept/zeoraid/storage.py
===================================================================
--- gocept.zeoraid/trunk/src/gocept/zeoraid/storage.py	2009-11-21 09:03:44 UTC (rev 105943)
+++ gocept.zeoraid/trunk/src/gocept/zeoraid/storage.py	2009-11-21 11:44:33 UTC (rev 105944)
@@ -123,6 +123,7 @@
         self._threads = set()
         self.storages_optimal = []
         self.storages_degraded = []
+        self.degrade_reasons = {}
 
         # Temporary files and directories that should be removed at the end of
         # the two-phase commit. The list must only be modified while holding
@@ -171,11 +172,11 @@
 
         # Degrade all remaining (non-optimal) storages
         for name in reduce(lambda x, y: x + y, tids.values(), []):
-            self._degrade_storage(name)
+            self._degrade_storage(name, reason='missing transactions')
 
         # No storage is recovering initially
         self.storage_recovering = None
-        self.recovery_status = ''
+        self.recovery_status = None
 
     # IStorage
 
@@ -273,7 +274,7 @@
         min_oid = sorted(oids)[0][0]
         for oid, storage in oids:
             if oid > min_oid:
-                self._degrade_storage(storage)
+                self._degrade_storage(storage, reason='inconsistent OIDs')
         return min_oid
 
     @ensure_writable
@@ -589,12 +590,25 @@
 
     @ensure_open_storage
     def raid_details(self):
-        return [self.storages_optimal, self.storage_recovering,
-                self.storages_degraded, self.recovery_status]
+        storages = {}
+        for storage in self.storages_optimal:
+            storages[storage] = 'optimal'
+        for storage in self.storages_degraded:
+            storages[storage] = 'failed: %s' % (
+                self.degrade_reasons.get(storage, 'n/a'))
+        if self.storage_recovering:
+            msg = 'recovering: '
+            if self.recovery_status:
+                msg += '%s transaction %s' % self.recovery_status
+            else:
+                msg += '???'
+            storages[self.storage_recovering] = msg
+        return storages
 
     @ensure_open_storage
     def raid_disable(self, name):
-        self._degrade_storage(name, fail=False)
+        self._degrade_storage(
+            name, reason='disabled by controller', fail=False)
         return 'disabled %r' % (name,)
 
     @ensure_open_storage
@@ -648,12 +662,15 @@
         for name in added:
             self.openers[name] = configured_storages[name]
             self.storages_degraded.append(name)
+            self.degrade_reasons[name] = 'added by controller'
 
         for name in removed:
             self._close_storage(name)
             del self.openers[name]
             if name in self.storages_degraded:
                 self.storages_degraded.remove(name)
+            if name in self.degrade_reasons:
+                del self.degrade_reasons[name]
 
     # internal
 
@@ -666,10 +683,12 @@
         except ZODB.POSException.POSKeyError, e:
             pass
         except Exception, e:
-            logger.critical('Could not open storage %s' % name, exc_info=True)
+            logger.exception('Could not open storage %s' % name)
             # We were trying to open a storage. Even if we fail we can't be
             # more broke than before, so don't ever fail due to this.
-            self._degrade_storage(name, fail=False)
+            self._degrade_storage(
+                name, reason='an error occurred opening the storage',
+                fail=False)
             return
         self.storages[name] = storage
 
@@ -683,9 +702,11 @@
             t.setDaemon(True)
             t.start()
 
-    def _degrade_storage(self, name, fail=True):
+    def _degrade_storage(self, name, reason, fail=True):
         self._close_storage(name)
         self.storages_degraded.append(name)
+        self.degrade_reasons[name] = reason
+        logger.critical('Storage %s degraded. Reason: %s' % (name, reason))
         if not self.storages_optimal and fail:
             raise gocept.zeoraid.interfaces.RAIDError("No storages remain.")
 
@@ -709,6 +730,8 @@
         except ZODB.POSException.StorageError:
             # Handle StorageErrors first, otherwise they would be swallowed
             # when POSErrors are handled.
+            logger.exception('Error calling %r' % method_name)
+            reason = 'an error occurred calling %r' % method_name
             reliable = False
         except (ZODB.POSException.POSError,
                 transaction.interfaces.TransactionError), e:
@@ -716,14 +739,17 @@
             # indicate storage failure.
             raise
         except Exception:
+            logger.exception('Error calling %r' % method_name)
+            reason = 'an error occurred calling %r' % method_name
             reliable = False
 
         if (isinstance(storage, ZEO.ClientStorage.ClientStorage) and
             expect_connected and not storage.is_connected()):
             reliable = False
+            reason = 'storage is not connected'
 
         if not reliable:
-            self._degrade_storage(storage_name)
+            self._degrade_storage(storage_name, reason=reason)
         return (reliable, result)
 
     @ensure_open_storage
@@ -793,7 +819,10 @@
             thread.join(self.timeout)
             if thread.isAlive():
                 # Storage timed out.
-                self._degrade_storage(thread.storage_name)
+                self._degrade_storage(
+                    thread.storage_name,
+                    reason='no response within %s seconds' %
+                        self.timeout)
                 self._threads.add(thread)
                 continue
             if thread.exception:
@@ -836,12 +865,16 @@
 
     def _recover_impl(self, name):
         self.storages_degraded.remove(name)
+        del self.degrade_reasons[name]
         self.storage_recovering = name
         try:
             target = self.openers[name].open()
         except Exception:
+            logger.exception('Error opening storage %s' % name)
             self.storage_recovering = None
             self.storages_degraded.append(name)
+            self.degrade_reasons[name] = (
+                'an error occurred opening the storage')
             raise
         self.storages[name] = target
         recovery = gocept.zeoraid.recovery.Recovery(
@@ -874,10 +907,9 @@
                     oid = catch_up()
 
                 if oid != target:
-                    logging.warn(
-                        'Degrading recovering storage %r due to '
-                        'new OID mismatch' % self.storage_recovering)
-                    self._degrade_storage(self.storage_recovering)
+                    self._degrade_storage(
+                        self.storage_recovering,
+                        reason='failed matching OIDs')
                     return
 
             self.storages_optimal.append(self.storage_recovering)

Modified: gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_basics.py
===================================================================
--- gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_basics.py	2009-11-21 09:03:44 UTC (rev 105943)
+++ gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_basics.py	2009-11-21 11:44:33 UTC (rev 105944)
@@ -1518,14 +1518,17 @@
         # configure the RAID to no longer use the removed backend
         self.update_config()
         self._storage.raid_reload()
-        self.assertEquals([['1', '0', '2', '4'], None, [], ''],
+        self.assertEquals({'1': 'optimal', '0': 'optimal', '2': 'optimal',
+                           '4': 'optimal'},
                           self._storage.raid_details())
 
         # Re-insert the storage
         self._storages.append(removed_storage)
         self.update_config()
         self._storage.raid_reload()
-        self.assertEquals([['1', '0', '2', '4'], None, ['3'], ''],
+        self.assertEquals({'1': 'optimal', '0': 'optimal',
+                           '3': 'failed: added by controller', '2': 'optimal',
+                           '4': 'optimal'},
                           self._storage.raid_details())
 
         self._storage.raid_recover('3')
@@ -1536,7 +1539,8 @@
         # new_oid causes storage 4 to give an inconsistent result thus being
         # dropped from the pool of good storages. Storage 3 however meets the
         # OID target then thus being picked up.
-        self.assertEquals([['1', '0', '2', '3'], None, ['4'], ('recovered',)],
+        self.assertEquals({'1': 'optimal', '0': 'optimal', '3': 'optimal',
+                           '2': 'optimal', '4': 'failed: inconsistent OIDs'},
                           self._storage.raid_details())
 
     def test_reload_remove_readd(self):
@@ -1545,7 +1549,8 @@
         # configure the RAID to no longer use the removed backend
         self.update_config()
         self._storage.raid_reload()
-        self.assertEquals([['1', '0', '3', '2'], None, [], ''],
+        self.assertEquals({'1': 'optimal', '0': 'optimal', '3': 'optimal',
+                           '2': 'optimal'},
                           self._storage.raid_details())
 
         # ensure that we can still write to the RAID
@@ -1561,7 +1566,8 @@
         self._storages.append(removed_storage)
         self.update_config()
         self._storage.raid_reload()
-        self.assertEquals([['1', '0', '3', '2'], None, ['4'], ''],
+        self.assertEquals({'1': 'optimal', '0': 'optimal', '3': 'optimal',
+                           '2': 'optimal', '4': 'failed: added by controller'},
                           self._storage.raid_details())
 
         self._storage.raid_recover('4')
@@ -1570,7 +1576,8 @@
             time.sleep(0.1)
 
         self.assertEquals(
-            [['1', '0', '3', '2', '4'], None, [], ('recovered',)],
+            {'1': 'optimal', '0': 'optimal', '3': 'optimal', '2': 'optimal',
+             '4': 'optimal'},
             self._storage.raid_details())
 
         # Now, after recovery, the OID should be visible
@@ -1582,14 +1589,17 @@
         removed_storage = self._storages.pop()
         self._storage.raid_disable(removed_storage.name)
 
-        self.assertEquals([['1', '0', '3', '2'], None, ['4'], ''],
-                          self._storage.raid_details())
+        self.assertEquals(
+            {'1': 'optimal', '0': 'optimal', '3': 'optimal', '2': 'optimal',
+             '4': 'failed: disabled by controller'},
+            self._storage.raid_details())
 
         # configure the RAID to no longer use the removed backend
         self.update_config()
 
         self._storage.raid_reload()
-        self.assertEquals([['1', '0', '3', '2'], None, [], ''],
+        self.assertEquals({'1': 'optimal', '0': 'optimal', '3': 'optimal',
+                           '2': 'optimal'},
                           self._storage.raid_details())
 
     def test_reload_broken_config(self):
@@ -1599,7 +1609,8 @@
         f.close()
 
         self.assertRaises(RuntimeError, self._storage.raid_reload)
-        self.assertEquals([['1', '0', '3', '2', '4'], None, [], ''],
+        self.assertEquals({'1': 'optimal', '0': 'optimal', '3': 'optimal',
+                           '2': 'optimal', '4': 'optimal'},
                           self._storage.raid_details())
 
     def test_reload_no_remaining_storages(self):
@@ -1609,7 +1620,9 @@
 
         self.update_config()
         self.assertRaises(RuntimeError, self._storage.raid_reload)
-        self.assertEquals([['1', '0', '3', '2'], None, ['4'], ''],
+        self.assertEquals({'1': 'optimal', '0': 'optimal', '3': 'optimal',
+                           '2': 'optimal',
+                           '4': 'failed: disabled by controller'},
                           self._storage.raid_details())
 
 

Modified: gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_recovery.py
===================================================================
--- gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_recovery.py	2009-11-21 09:03:44 UTC (rev 105943)
+++ gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_recovery.py	2009-11-21 11:44:33 UTC (rev 105944)
@@ -43,7 +43,7 @@
     recovery = gocept.zeoraid.recovery.Recovery(
         source, target, lambda target: None)
     protocol = list(recovery())
-    test.assertEquals([('verified',), ('recovered',)], protocol[-2:])
+    test.assertEquals([('verified', ''), ('recovered', '')], protocol[-2:])
     for source_txn, target_txn in itertools.izip(source.iterator(),
                                                  target.iterator()):
         # We need not compare the transaction metadata because that has
@@ -209,7 +209,7 @@
             blob_dir=blob_dir, shared_blob_dir=self.shared)
 
     def test_verify_both_empty(self):
-        self.assertEquals([('verified',), ('recovered',)],
+        self.assertEquals([('verified', ''), ('recovered', '')],
                           list(self.recovery()))
 
     def test_verify_empty_target(self):

Modified: gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_zeo.py
===================================================================
--- gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_zeo.py	2009-11-21 09:03:44 UTC (rev 105943)
+++ gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_zeo.py	2009-11-21 11:44:33 UTC (rev 105944)
@@ -88,19 +88,17 @@
             wait_timeout=60).open()
         self._storages.append(raid)
 
-        self.assertEquals([['1'], None, ['2'], ''], raid.raid_details())
+        self.assertEquals({'1': 'optimal',
+                           '2': 'failed: missing transactions'},
+                          raid.raid_details())
         self.assertEquals('degraded', raid.raid_status())
 
         raid.raid_recover('2')
 
         # Wait until the storage starts to recover
-        status = ''
-        while  status != 'recover':
-            status = raid.raid_details()[-1]
-            if isinstance(status, tuple):
-                status = status[0]
-            else:
-                status = ''
+        status = {'2': ''}
+        while 'recover ' not in status['2']:
+            status = raid.raid_details()
             time.sleep(0.5)
 
         # Now, hammer the storage with more transactions with a high chance of
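
For completeness, the polling pattern used by the adapted test in
test_zeo.py above, written out as a standalone sketch. It assumes raid is an
opened ClientStorage connected to the ZEORaid server and that backend '2' is
the one being recovered; the 0.5 second interval is an arbitrary choice.

    import time

    # raid: an opened ZEO.ClientStorage.ClientStorage (assumed, see above).
    status = raid.raid_details()
    # 'recover ' (with the trailing space) only matches the
    # 'recovering: recover transaction <tid>' phase, not the 'recovering:'
    # prefix itself and not the verify/verified/recovered phases.
    while 'recover ' not in status.get('2', ''):
        time.sleep(0.5)
        status = raid.raid_details()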


