[Checkins] SVN: gocept.zeoraid/trunk/ - #484921: Log reason why a backend is degraded. We also display that reason
Christian Theune
ct at gocept.com
Sat Nov 21 06:44:34 EST 2009
Log message for revision 105944:
- #484921: Log the reason why a backend is degraded. We also display that
reason in the detailed status of the controller client now. If an exception
occurred, we also log the traceback.
- Refactored the protocol for displaying detailed status information of
backend storages to be simpler, less error-prone, and easier to change in
the future.
- Fixed a small bug in the status detail that would fail if the recovery
status was missing the transaction information.
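For orientation before the diff: raid_details() previously returned a
positional four-element list that every client had to take apart; it now
returns a single mapping from backend name to a human-readable status
string. A minimal sketch of the two shapes, with illustrative names and
values (only raid_details itself is a real API here):

    # Old protocol: positional and easy to get wrong --
    # [optimal_names, recovering_name, degraded_names, recovery_status]
    old_details = [['0', '1', '2'], None, ['3'], '']

    # New protocol: one mapping from backend name to a status string, so
    # a client can render it without knowing the categories in advance.
    new_details = {
        '0': 'optimal',
        '1': 'optimal',
        '2': 'optimal',
        '3': 'failed: added by controller',
    }

    for name in sorted(new_details):
        print('%s: %s' % (name, new_details[name]))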
Changed:
U gocept.zeoraid/trunk/CHANGES.txt
U gocept.zeoraid/trunk/src/gocept/zeoraid/recovery.py
U gocept.zeoraid/trunk/src/gocept/zeoraid/scripts/controller.py
U gocept.zeoraid/trunk/src/gocept/zeoraid/storage.py
U gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_basics.py
U gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_recovery.py
U gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_zeo.py
-=-
Modified: gocept.zeoraid/trunk/CHANGES.txt
===================================================================
--- gocept.zeoraid/trunk/CHANGES.txt 2009-11-21 09:03:44 UTC (rev 105943)
+++ gocept.zeoraid/trunk/CHANGES.txt 2009-11-21 11:44:33 UTC (rev 105944)
@@ -5,9 +5,19 @@
1.0b7 (unreleased)
------------------
-- Nothing changed yet.
+- #484921: Log the reason why a backend is degraded. We also display that
+ reason in the detailed status of the controller client now. If an exception
+ occurred, we also log the traceback.
+- Refactored the protocol for displaying detailed status information of
+ backend storages to be simpler, less error-prone, and easier to change in
+ the future.
+- Fixed a small bug in the status detail that would fail if the recovery
+ status was missing the transaction information.
+
+- Improve documentation about issues with recovery in a multi-ZEORaid setup.
+
+
1.0b6 (2009-11-18)
------------------
Modified: gocept.zeoraid/trunk/src/gocept/zeoraid/recovery.py
===================================================================
--- gocept.zeoraid/trunk/src/gocept/zeoraid/recovery.py 2009-11-21 09:03:44 UTC (rev 105943)
+++ gocept.zeoraid/trunk/src/gocept/zeoraid/recovery.py 2009-11-21 11:44:33 UTC (rev 105944)
@@ -97,7 +97,7 @@
yield ('verify', ZODB.utils.tid_repr(source_txn.tid))
- yield ('verified',)
+ yield ('verified', '')
# Recover from that point on until the target storage has all
# transactions that exist in the source storage at the time of
@@ -152,4 +152,4 @@
yield ('recover', ZODB.utils.tid_repr(txn_info.tid))
- yield ('recovered',)
+ yield ('recovered', '')
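Note on the two hunks above: padding ('verified',) and ('recovered',) out
to two elements means every event the recovery generator yields is now a
uniform (event, detail) pair, which is what made the status-detail bug
mentioned in the log fixable. A minimal consumer sketch (the event stream
below is made up but matches the shapes in the diff):

    def report(events):
        # Each event is now exactly (event, detail); no need to
        # special-case one-element tuples any more.
        for event, detail in events:
            print('%s %s' % (event, detail) if detail else event)

    report([('verify', '0x01'), ('verified', ''),
            ('recover', '0x02'), ('recovered', '')])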
Modified: gocept.zeoraid/trunk/src/gocept/zeoraid/scripts/controller.py
===================================================================
--- gocept.zeoraid/trunk/src/gocept/zeoraid/scripts/controller.py 2009-11-21 09:03:44 UTC (rev 105943)
+++ gocept.zeoraid/trunk/src/gocept/zeoraid/scripts/controller.py 2009-11-21 11:44:33 UTC (rev 105944)
@@ -80,25 +80,16 @@
def cmd_details(self):
col1 = 25
- ok, recovering, failed, recovery_status = self.raid.raid_details()
+ storages = self.raid.raid_details()
print " %s| Status" % ('%s:%s' % (self.host, self.port)).ljust(col1)
print " %s+-------------------" % ('-' * col1)
print " %s| %s" % (('RAID %s' % self.storage).ljust(col1),
self.raid.raid_status().ljust(col1))
print " %s+-------------------" % ('-' * col1)
- storages = {}
- for storage in ok:
- storages[storage] = 'optimal'
- for storage in failed:
- storages[storage] = 'failed'
- if recovering:
- storages[recovering] = ('recovering: %s transaction %s' %
- recovery_status)
+ for storage in sorted(storages):
+ print ' %s| %s' % (str(storage).ljust(col1), storages[storage])
- for id in sorted(storages):
- print ' %s| %s' % (str(id).ljust(col1), storages[id])
-
return STATUS_TO_NAGIOS[self.raid.raid_status()]
def cmd_recover(self, storage):
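With the mapping-based protocol, cmd_details reduces to a single sorted
loop over the status dictionary. For illustration, the table it prints
now looks roughly like this (host, port, storage names, and statuses are
made up; the column width comes from col1 = 25 above):

      localhost:8100           | Status
      -------------------------+-------------------
      RAID main                | degraded
      -------------------------+-------------------
      1                        | optimal
      2                        | failed: missing transactions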
Modified: gocept.zeoraid/trunk/src/gocept/zeoraid/storage.py
===================================================================
--- gocept.zeoraid/trunk/src/gocept/zeoraid/storage.py 2009-11-21 09:03:44 UTC (rev 105943)
+++ gocept.zeoraid/trunk/src/gocept/zeoraid/storage.py 2009-11-21 11:44:33 UTC (rev 105944)
@@ -123,6 +123,7 @@
self._threads = set()
self.storages_optimal = []
self.storages_degraded = []
+ self.degrade_reasons = {}
# Temporary files and directories that should be removed at the end of
# the two-phase commit. The list must only be modified while holding
@@ -171,11 +172,11 @@
# Degrade all remaining (non-optimal) storages
for name in reduce(lambda x, y: x + y, tids.values(), []):
- self._degrade_storage(name)
+ self._degrade_storage(name, reason='missing transactions')
# No storage is recovering initially
self.storage_recovering = None
- self.recovery_status = ''
+ self.recovery_status = None
# IStorage
@@ -273,7 +274,7 @@
min_oid = sorted(oids)[0][0]
for oid, storage in oids:
if oid > min_oid:
- self._degrade_storage(storage)
+ self._degrade_storage(storage, reason='inconsistent OIDs')
return min_oid
@ensure_writable
@@ -589,12 +590,25 @@
@ensure_open_storage
def raid_details(self):
- return [self.storages_optimal, self.storage_recovering,
- self.storages_degraded, self.recovery_status]
+ storages = {}
+ for storage in self.storages_optimal:
+ storages[storage] = 'optimal'
+ for storage in self.storages_degraded:
+ storages[storage] = 'failed: %s' % (
+ self.degrade_reasons.get(storage, 'n/a'))
+ if self.storage_recovering:
+ msg = 'recovering: '
+ if self.recovery_status:
+ msg += '%s transaction %s' % self.recovery_status
+ else:
+ msg += '???'
+ storages[self.storage_recovering] = msg
+ return storages
@ensure_open_storage
def raid_disable(self, name):
- self._degrade_storage(name, fail=False)
+ self._degrade_storage(
+ name, reason='disabled by controller', fail=False)
return 'disabled %r' % (name,)
@ensure_open_storage
@@ -648,12 +662,15 @@
for name in added:
self.openers[name] = configured_storages[name]
self.storages_degraded.append(name)
+ self.degrade_reasons[name] = 'added by controller'
for name in removed:
self._close_storage(name)
del self.openers[name]
if name in self.storages_degraded:
self.storages_degraded.remove(name)
+ if name in self.degrade_reasons:
+ del self.degrade_reasons[name]
# internal
@@ -666,10 +683,12 @@
except ZODB.POSException.POSKeyError, e:
pass
except Exception, e:
- logger.critical('Could not open storage %s' % name, exc_info=True)
+ logger.exception('Could not open storage %s' % name)
# We were trying to open a storage. Even if we fail we can't be
# more broke than before, so don't ever fail due to this.
- self._degrade_storage(name, fail=False)
+ self._degrade_storage(
+ name, reason='an error occurred opening the storage',
+ fail=False)
return
self.storages[name] = storage
@@ -683,9 +702,11 @@
t.setDaemon(True)
t.start()
- def _degrade_storage(self, name, fail=True):
+ def _degrade_storage(self, name, reason, fail=True):
self._close_storage(name)
self.storages_degraded.append(name)
+ self.degrade_reasons[name] = reason
+ logger.critical('Storage %s degraded. Reason: %s' % (name, reason))
if not self.storages_optimal and fail:
raise gocept.zeoraid.interfaces.RAIDError("No storages remain.")
@@ -709,6 +730,8 @@
except ZODB.POSException.StorageError:
# Handle StorageErrors first, otherwise they would be swallowed
# when POSErrors are handled.
+ logger.exception('Error calling %r' % method_name)
+ reason = 'an error occurred calling %r' % method_name
reliable = False
except (ZODB.POSException.POSError,
transaction.interfaces.TransactionError), e:
@@ -716,14 +739,17 @@
# indicate storage failure.
raise
except Exception:
+ logger.exception('Error calling %r' % method_name)
+ reason = 'an error occurred calling %r' % method_name
reliable = False
if (isinstance(storage, ZEO.ClientStorage.ClientStorage) and
expect_connected and not storage.is_connected()):
reliable = False
+ reason = 'storage is not connected'
if not reliable:
- self._degrade_storage(storage_name)
+ self._degrade_storage(storage_name, reason=reason)
return (reliable, result)
@ensure_open_storage
@@ -793,7 +819,10 @@
thread.join(self.timeout)
if thread.isAlive():
# Storage timed out.
- self._degrade_storage(thread.storage_name)
+ self._degrade_storage(
+ thread.storage_name,
+ reason='no response within %s seconds' %
+ self.timeout)
self._threads.add(thread)
continue
if thread.exception:
@@ -836,12 +865,16 @@
def _recover_impl(self, name):
self.storages_degraded.remove(name)
+ del self.degrade_reasons[name]
self.storage_recovering = name
try:
target = self.openers[name].open()
except Exception:
+ logger.exception('Error opening storage %s' % name)
self.storage_recovering = None
self.storages_degraded.append(name)
+ self.degrade_reasons[name] = (
+ 'an error occurred opening the storage')
raise
self.storages[name] = target
recovery = gocept.zeoraid.recovery.Recovery(
@@ -874,10 +907,9 @@
oid = catch_up()
if oid != target:
- logging.warn(
- 'Degrading recovering storage %r due to '
- 'new OID mismatch' % self.storage_recovering)
- self._degrade_storage(self.storage_recovering)
+ self._degrade_storage(
+ self.storage_recovering,
+ reason='failed matching OIDs')
return
self.storages_optimal.append(self.storage_recovering)
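Taken together, the storage.py changes boil down to one piece of
bookkeeping: whenever a backend is degraded, a reason string is recorded
alongside it and logged, and recovery or removal clears it again. A
stripped-down sketch of that state (attribute and method names taken from
the diff; the class is a hypothetical stand-in for the real RAID storage):

    import logging

    logger = logging.getLogger('gocept.zeoraid')

    class DegradeBookkeeping(object):
        """Hypothetical stand-in showing only the degrade-reason state."""

        def __init__(self):
            self.storages_degraded = []
            self.degrade_reasons = {}

        def _degrade_storage(self, name, reason):
            # Record why the backend dropped out and shout about it in
            # the log; call sites that caught an exception additionally
            # use logger.exception() to record the traceback.
            self.storages_degraded.append(name)
            self.degrade_reasons[name] = reason
            logger.critical(
                'Storage %s degraded. Reason: %s' % (name, reason))

        def _recover_impl(self, name):
            # Starting recovery removes the backend from the degraded
            # set and drops its reason, mirroring the diff above.
            self.storages_degraded.remove(name)
            del self.degrade_reasons[name]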
Modified: gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_basics.py
===================================================================
--- gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_basics.py 2009-11-21 09:03:44 UTC (rev 105943)
+++ gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_basics.py 2009-11-21 11:44:33 UTC (rev 105944)
@@ -1518,14 +1518,17 @@
# configure the RAID to no longer use the removed backend
self.update_config()
self._storage.raid_reload()
- self.assertEquals([['1', '0', '2', '4'], None, [], ''],
+ self.assertEquals({'1': 'optimal', '0': 'optimal', '2': 'optimal',
+ '4': 'optimal'},
self._storage.raid_details())
# Re-insert the storage
self._storages.append(removed_storage)
self.update_config()
self._storage.raid_reload()
- self.assertEquals([['1', '0', '2', '4'], None, ['3'], ''],
+ self.assertEquals({'1': 'optimal', '0': 'optimal',
+ '3': 'failed: added by controller', '2': 'optimal',
+ '4': 'optimal'},
self._storage.raid_details())
self._storage.raid_recover('3')
@@ -1536,7 +1539,8 @@
# new_oid causes storage 4 to give an inconsistent result thus being
# dropped from the pool of good storages. Storage 3 however meets the
# OID target then thus being picked up.
- self.assertEquals([['1', '0', '2', '3'], None, ['4'], ('recovered',)],
+ self.assertEquals({'1': 'optimal', '0': 'optimal', '3': 'optimal',
+ '2': 'optimal', '4': 'failed: inconsistent OIDs'},
self._storage.raid_details())
def test_reload_remove_readd(self):
@@ -1545,7 +1549,8 @@
# configure the RAID to no longer use the removed backend
self.update_config()
self._storage.raid_reload()
- self.assertEquals([['1', '0', '3', '2'], None, [], ''],
+ self.assertEquals({'1': 'optimal', '0': 'optimal', '3': 'optimal',
+ '2': 'optimal'},
self._storage.raid_details())
# ensure that we can still write to the RAID
@@ -1561,7 +1566,8 @@
self._storages.append(removed_storage)
self.update_config()
self._storage.raid_reload()
- self.assertEquals([['1', '0', '3', '2'], None, ['4'], ''],
+ self.assertEquals({'1': 'optimal', '0': 'optimal', '3': 'optimal',
+ '2': 'optimal', '4': 'failed: added by controller'},
self._storage.raid_details())
self._storage.raid_recover('4')
@@ -1570,7 +1576,8 @@
time.sleep(0.1)
self.assertEquals(
- [['1', '0', '3', '2', '4'], None, [], ('recovered',)],
+ {'1': 'optimal', '0': 'optimal', '3': 'optimal', '2': 'optimal',
+ '4': 'optimal'},
self._storage.raid_details())
# Now, after recovery, the OID should be visible
@@ -1582,14 +1589,17 @@
removed_storage = self._storages.pop()
self._storage.raid_disable(removed_storage.name)
- self.assertEquals([['1', '0', '3', '2'], None, ['4'], ''],
- self._storage.raid_details())
+ self.assertEquals(
+ {'1': 'optimal', '0': 'optimal', '3': 'optimal', '2': 'optimal',
+ '4': 'failed: disabled by controller'},
+ self._storage.raid_details())
# configure the RAID to no longer use the removed backend
self.update_config()
self._storage.raid_reload()
- self.assertEquals([['1', '0', '3', '2'], None, [], ''],
+ self.assertEquals({'1': 'optimal', '0': 'optimal', '3': 'optimal',
+ '2': 'optimal'},
self._storage.raid_details())
def test_reload_broken_config(self):
@@ -1599,7 +1609,8 @@
f.close()
self.assertRaises(RuntimeError, self._storage.raid_reload)
- self.assertEquals([['1', '0', '3', '2', '4'], None, [], ''],
+ self.assertEquals({'1': 'optimal', '0': 'optimal', '3': 'optimal',
+ '2': 'optimal', '4': 'optimal'},
self._storage.raid_details())
def test_reload_no_remaining_storages(self):
@@ -1609,7 +1620,9 @@
self.update_config()
self.assertRaises(RuntimeError, self._storage.raid_reload)
- self.assertEquals([['1', '0', '3', '2'], None, ['4'], ''],
+ self.assertEquals({'1': 'optimal', '0': 'optimal', '3': 'optimal',
+ '2': 'optimal',
+ '4': 'failed: disabled by controller'},
self._storage.raid_details())
Modified: gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_recovery.py
===================================================================
--- gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_recovery.py 2009-11-21 09:03:44 UTC (rev 105943)
+++ gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_recovery.py 2009-11-21 11:44:33 UTC (rev 105944)
@@ -43,7 +43,7 @@
recovery = gocept.zeoraid.recovery.Recovery(
source, target, lambda target: None)
protocol = list(recovery())
- test.assertEquals([('verified',), ('recovered',)], protocol[-2:])
+ test.assertEquals([('verified', ''), ('recovered', '')], protocol[-2:])
for source_txn, target_txn in itertools.izip(source.iterator(),
target.iterator()):
# We need not compare the transaction metadata because that has
@@ -209,7 +209,7 @@
blob_dir=blob_dir, shared_blob_dir=self.shared)
def test_verify_both_empty(self):
- self.assertEquals([('verified',), ('recovered',)],
+ self.assertEquals([('verified', ''), ('recovered', '')],
list(self.recovery()))
def test_verify_empty_target(self):
Modified: gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_zeo.py
===================================================================
--- gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_zeo.py 2009-11-21 09:03:44 UTC (rev 105943)
+++ gocept.zeoraid/trunk/src/gocept/zeoraid/tests/test_zeo.py 2009-11-21 11:44:33 UTC (rev 105944)
@@ -88,19 +88,17 @@
wait_timeout=60).open()
self._storages.append(raid)
- self.assertEquals([['1'], None, ['2'], ''], raid.raid_details())
+ self.assertEquals({'1': 'optimal',
+ '2': 'failed: missing transactions'},
+ raid.raid_details())
self.assertEquals('degraded', raid.raid_status())
raid.raid_recover('2')
# Wait until the storage starts to recover
- status = ''
- while status != 'recover':
- status = raid.raid_details()[-1]
- if isinstance(status, tuple):
- status = status[0]
- else:
- status = ''
+ status = {'2': ''}
+ while 'recover ' not in status['2']:
+ status = raid.raid_details()
time.sleep(0.5)
# Now, hammer the storage with more transactions with a high chance of