[Checkins] SVN: zc.zodbdgc/trunk/src/zc/zodbdgc/ - Optimized garbage collection by using a temporary file to
Jim Fulton
jim at zope.com
Thu Sep 3 15:52:28 EDT 2009
Log message for revision 103511:
- Optimized garbage collection by using a temporary file to
store object references rather than loading them from the analysis
database when needed.
- Added an -f option to specify file-storage files directly. It is
wildly faster to iterate over a file storage than over a ZEO
connection. Using this option uses a file iterator rather than
opening a file storage in read-only mode, which avoids scanning the
database to build an index and avoids the memory cost of a
file-storage index.
Changed:
U zc.zodbdgc/trunk/src/zc/zodbdgc/README.test
U zc.zodbdgc/trunk/src/zc/zodbdgc/README.txt
U zc.zodbdgc/trunk/src/zc/zodbdgc/__init__.py
-=-
Modified: zc.zodbdgc/trunk/src/zc/zodbdgc/README.test
===================================================================
--- zc.zodbdgc/trunk/src/zc/zodbdgc/README.test 2009-09-03 18:55:05 UTC (rev 103510)
+++ zc.zodbdgc/trunk/src/zc/zodbdgc/README.test 2009-09-03 19:52:27 UTC (rev 103511)
@@ -291,6 +291,9 @@
-h, --help show this help message and exit
-d DAYS, --days=DAYS Number of trailing days (defaults to 1) to treat as
non-garbage
+ -f FS, --file-storage=FS
+ name=path, use the given file storage path for
+ analysis of the.named database
-i IGNORE, --ignore-database=IGNORE
Ignore references to the given database name.
-l LEVEL, --log-level=LEVEL
@@ -301,9 +304,18 @@
Ignoring index for 2.fs
Ignoring index for 3.fs
Using secondary configuration, config2, for analysis
+ db1: roots
+ db1: recent
+ db2: roots
+ db2: recent
+ db3: roots
+ db3: recent
+ db1: remove garbage
Removed 12 objects from db1
+ db2: remove garbage
+ Removed 1 objects from db2
+ db3: remove garbage
Removed 2 objects from db3
- Removed 1 objects from db2
>>> sorted(bad2.iterator()) == sorted(bad.iterator())
True
@@ -311,9 +323,18 @@
We can gc again, even with deleted records:
>>> sorted(zc.zodbdgc.gc_command(['-d2', 'config']).iterator())
+ db1: roots
+ db1: recent
+ db2: roots
+ db2: recent
+ db3: roots
+ db3: recent
+ db1: remove garbage
Removed 0 objects from db1
+ db2: remove garbage
+ Removed 0 objects from db2
+ db3: remove garbage
Removed 0 objects from db3
- Removed 0 objects from db2
[]
>>> db = ZODB.config.databaseFromFile(open('config'))
@@ -334,16 +355,25 @@
>>> old_argv = sys.argv[:]
>>> old_basicConfig = logging.basicConfig
- >>> def faux_basicConfig(*args, **kw):
- ... print 'basicConfig', args, kw
+ >>> def faux_basicConfig(level=None, format=None):
+ ... print 'basicConfig', (), dict(level=level)
>>> logging.basicConfig = faux_basicConfig
>>> sys.argv[:] = ['-d2', 'config']
>>> sorted(zc.zodbdgc.gc_command().iterator())
basicConfig () {'level': 30}
+ db1: roots
+ db1: recent
+ db2: roots
+ db2: recent
+ db3: roots
+ db3: recent
+ db1: remove garbage
Removed 0 objects from db1
+ db2: remove garbage
+ Removed 0 objects from db2
+ db3: remove garbage
Removed 0 objects from db3
- Removed 0 objects from db2
[]
and check -l option handling:
@@ -351,17 +381,35 @@
>>> sys.argv[:] = ['-d2', '-l10', 'config']
>>> sorted(zc.zodbdgc.gc_command().iterator())
basicConfig () {'level': 10}
+ db1: roots
+ db1: recent
+ db2: roots
+ db2: recent
+ db3: roots
+ db3: recent
+ db1: remove garbage
Removed 0 objects from db1
+ db2: remove garbage
+ Removed 0 objects from db2
+ db3: remove garbage
Removed 0 objects from db3
- Removed 0 objects from db2
[]
>>> sys.argv[:] = ['-d2', '-lINFO', 'config']
>>> sorted(zc.zodbdgc.gc_command().iterator())
basicConfig () {'level': 20}
+ db1: roots
+ db1: recent
+ db2: roots
+ db2: recent
+ db3: roots
+ db3: recent
+ db1: remove garbage
Removed 0 objects from db1
+ db2: remove garbage
+ Removed 0 objects from db2
+ db3: remove garbage
Removed 0 objects from db3
- Removed 0 objects from db2
[]
@@ -389,9 +437,18 @@
Ignoring index for 1.fs-2
Ignoring index for 2.fs-2
Ignoring index for 3.fs-2
+ db1: roots
+ db1: recent
+ db2: roots
+ db2: recent
+ db3: roots
+ db3: recent
+ db1: remove garbage
Removed 0 objects from db1
+ db2: remove garbage
+ Removed 0 objects from db2
+ db3: remove garbage
Removed 0 objects from db3
- Removed 0 objects from db2
[]
Try with 0 days:
@@ -399,9 +456,15 @@
>>> sorted((name, long(u64(oid))) for (name, oid) in
... zc.zodbdgc.gc_command(['-d0', 'config', 'config2']).iterator())
Using secondary configuration, config2, for analysis
+ db1: roots
+ db2: roots
+ db3: roots
+ db1: remove garbage
Removed 0 objects from db1
+ db2: remove garbage
+ Removed 2 objects from db2
+ db3: remove garbage
Removed 0 objects from db3
- Removed 2 objects from db2
[('db2', 3L), ('db2', 4L)]
Test the check command
@@ -572,11 +635,81 @@
>>> sorted(zc.zodbdgc.gc_command(['-idb2', 'config']).iterator())
... # doctest: +NORMALIZE_WHITESPACE
+ db1: roots
+ db1: recent
+ db1: remove garbage
Removed 2 objects from db1
[('db1', '\x00\x00\x00\x00\x00\x00\x00\x02'),
('db1', '\x00\x00\x00\x00\x00\x00\x00\x03')]
+ >>> os.remove('one.fs')
+ >>> os.remove('two.fs')
+Using file-storage iterators directly
+-------------------------------------
+
+If the database under analysis is a file-storage, we can access the
+files directly:
+
+
+ >>> open('config', 'w').write("""
+ ... <zodb db1>
+ ... <filestorage>
+ ... path one.fs
+ ... pack-gc false
+ ... </filestorage>
+ ... </zodb>
+ ... <zodb db2>
+ ... <filestorage>
+ ... path two.fs
+ ... pack-gc false
+ ... </filestorage>
+ ... </zodb>
+ ... """)
+ >>> db = ZODB.config.databaseFromFile(open('config'))
+ Ignoring index for one.fs
+ Ignoring index for two.fs
+
+ >>> conn = db.open()
+ >>> conn.get_connection('db2').root.x = C()
+ >>> transaction.commit()
+ >>> conn.root.x = C()
+ >>> conn.root.x.x = conn.get_connection('db2').root.x
+ >>> transaction.commit()
+ >>> conn.root.a = C()
+ >>> transaction.commit()
+ >>> conn.root.b = C()
+ >>> conn.root.a.b = conn.root.b
+ >>> conn.root.b.a = conn.root.a
+ >>> transaction.commit()
+ >>> del conn.root.a
+ >>> del conn.root.b
+ >>> transaction.commit()
+
+ >>> now += 2*86400
+
+ >>> db.pack()
+
+ >>> _ = [db.close() for db in db.databases.itervalues()]
+
+
+ >>> sorted(zc.zodbdgc.gc_command(['-fdb1=one.fs', '-fdb2=two.fs', 'config']
+ ... ).iterator())
+ ... # doctest: +NORMALIZE_WHITESPACE
+ db1: roots
+ db1: recent
+ db2: roots
+ db2: recent
+ db1: remove garbage
+ Removed 2 objects from db1
+ db2: remove garbage
+ Removed 0 objects from db2
+ [('db1', '\x00\x00\x00\x00\x00\x00\x00\x02'),
+ ('db1', '\x00\x00\x00\x00\x00\x00\x00\x03')]
+
+ >>> os.remove('one.fs')
+ >>> os.remove('two.fs')
+
.. cleanup
>>> logging.getLogger().setLevel(old_level)
Modified: zc.zodbdgc/trunk/src/zc/zodbdgc/README.txt
===================================================================
--- zc.zodbdgc/trunk/src/zc/zodbdgc/README.txt 2009-09-03 18:55:05 UTC (rev 103510)
+++ zc.zodbdgc/trunk/src/zc/zodbdgc/README.txt 2009-09-03 19:52:27 UTC (rev 103511)
@@ -27,6 +27,12 @@
lightly loaded. This is helpful because finding garbage places a
significant load on the databases used to find garbage.
+If your database uses file-storages, then rather than specifying a
+second configuration file, you can use the -f option to specify
+file-storage iterators for finding garbage. Using file storage
+iterators is much faster than using a ZEO connection and is faster and
+requires less memory than opening a read-only file storage on the files.
+
Some number of trailing days (1 by default) of database records are
considered good, meaning the objects referenced by them are not
garbage. This allows the garbage-collection algorithm to work more
@@ -38,6 +44,7 @@
be treated as non garbage and to specify the logging level. Use the
``--help`` option to get details.
+
multi-zodb-check-refs
---------------------
@@ -65,6 +72,20 @@
Change History
==============
+0.3.0 2009-09-03
+----------------
+
+- Optimized garbage collection by using a temporary file to
+ store object references rather than loading them from the analysis
+ database when needed.
+
+- Added an -f option to specify file-storage files directly. It is
+ wildly faster to iterate over a file storage than over a ZEO
+ connection. Using this option uses a file iterator rather than
+ opening a file storage in read-only mode, which avoids scanning the
+  database to build an index and avoids the memory cost of a
+ file-storage index.
+
0.2.0 2009-06-15
----------------
Modified: zc.zodbdgc/trunk/src/zc/zodbdgc/__init__.py
===================================================================
--- zc.zodbdgc/trunk/src/zc/zodbdgc/__init__.py 2009-09-03 18:55:05 UTC (rev 103510)
+++ zc.zodbdgc/trunk/src/zc/zodbdgc/__init__.py 2009-09-03 19:52:27 UTC (rev 103511)
@@ -19,6 +19,7 @@
import base64
import cPickle
import cStringIO
+import itertools
import logging
import marshal
import optparse
@@ -32,6 +33,8 @@
import ZODB.blob
import ZODB.config
import ZODB.FileStorage
+import ZODB.fsIndex
+import ZODB.POSException
import ZODB.TimeStamp
def p64(v):
@@ -43,17 +46,63 @@
return struct.unpack(">q", v)[0]
logger = logging.getLogger(__name__)
+log_format = "%(asctime)s %(name)s %(levelname)s: %(message)s"
-def gc(conf, days=1, ignore=(), conf2=None, batch_size=10000):
+def gc_command(args=None):
+ if args is None:
+ args = sys.argv[1:]
+ level = logging.WARNING
+ else:
+ level = None
+
+ parser = optparse.OptionParser("usage: %prog [options] config1 [config2]")
+ parser.add_option(
+ '-d', '--days', dest='days', type='int', default=1,
+ help='Number of trailing days (defaults to 1) to treat as non-garbage')
+ parser.add_option(
+ '-f', '--file-storage', dest='fs', action='append',
+ help='name=path, use the given file storage path for analysis of the.'
+ 'named database')
+ parser.add_option(
+ '-i', '--ignore-database', dest='ignore', action='append',
+ help='Ignore references to the given database name.')
+ parser.add_option(
+ '-l', '--log-level', dest='level',
+ help='The logging level. The default is WARNING.')
+
+ options, args = parser.parse_args(args)
+
+ if not args or len(args) > 2:
+ parser.parse_args(['-h'])
+ elif len(args) == 2:
+ conf2=args[1]
+ else:
+ conf2 = None
+
+ if options.level:
+ level = options.level
+
+ if level:
+ try:
+ level = int(level)
+ except ValueError:
+ level = getattr(logging, level)
+ logging.basicConfig(level=level, format=log_format)
+
+ return gc(args[0], options.days, options.ignore or (), conf2=conf2,
+ fs=dict(o.split('=') for o in options.fs or ()))
+
+
+def gc(conf, days=1, ignore=(), conf2=None, batch_size=10000, fs=()):
close = []
try:
- return gc_(close, conf, days, ignore, conf2, batch_size)
+ return gc_(close, conf, days, ignore, conf2, batch_size, fs)
finally:
for db in close:
for db in db.databases.itervalues():
db.close()
-def gc_(close, conf, days, ignore, conf2, batch_size):
+def gc_(close, conf, days, ignore, conf2, batch_size, fs):
db1 = ZODB.config.databaseFromFile(open(conf))
close.append(db1)
if conf2 is None:
@@ -66,29 +115,40 @@
raise ValueError("primary and secondary databases don't match.")
databases = db2.databases
- storages = dict((name, d.storage) for (name, d) in databases.items())
+ storages = sorted((name, d.storage) for (name, d) in databases.items())
ptid = repr(
ZODB.TimeStamp.TimeStamp(*time.gmtime(time.time() - 86400*days)[:6])
)
- # Pre-populate good with roots and recently-written objects
good = oidset(databases)
- bad = oidset(databases)
- both = good, bad
+ bad = Bad(databases)
deleted = oidset(databases)
- for name, storage in storages.iteritems():
+ for name, storage in storages:
+ logger.info("%s: roots", name)
# Make sure we can get the roots
data, s = storage.load(z64, '')
good.insert(name, z64)
for ref in getrefs(data, name, ignore):
good.insert(*ref)
+ n = 0
if days:
# All non-deleted new records are good
- for trans in storage.iterator(ptid):
+ logger.info("%s: recent", name)
+
+ if name in fs:
+ it = ZODB.FileStorage.FileIterator(fs[name], ptid)
+ else:
+ it = storage.iterator(ptid)
+
+ for trans in it:
for record in trans:
+ if n and n%10000 == 0:
+ logger.info("%s: %s recent", name, n)
+ n += 1
+
oid = record.oid
data = record.data
if data:
@@ -108,8 +168,17 @@
good.remove(name, oid)
# Now iterate over older records
- for trans in storage.iterator(None, ptid):
+ if name in fs:
+ it = ZODB.FileStorage.FileIterator(fs[name], None, ptid)
+ else:
+ it = storage.iterator(None, ptid)
+
+ for trans in it:
for record in trans:
+ if n and n%10000 == 0:
+ logger.info("%s: %s old", name, n)
+ n += 1
+
oid = record.oid
data = record.data
if data:
@@ -120,9 +189,14 @@
if deleted.has(*ref):
continue
if good.insert(*ref) and bad.has(*ref):
- bad_to_good(storages, ignore, bad, good, *ref)
+ to_do = [ref]
+ while to_do:
+ for ref in bad.pop(*to_do.pop()):
+ if good.insert(*ref) and bad.has(*ref):
+ to_do.append(ref)
else:
- bad.insert(name, oid)
+ bad.insert(name, oid, record.tid,
+ getrefs(data, name, ignore))
else:
# deleted record
if good.has(name, oid):
@@ -137,19 +211,24 @@
close.pop()
# Now, we have the garbage in bad. Remove it.
- for name, db in db1.databases.iteritems():
+ for name, db in sorted(db1.databases.iteritems()):
+ logger.info("%s: remove garbage", name)
storage = db.storage
t = transaction.begin()
storage.tpc_begin(t)
nd = 0
- for oid in bad.iterator(name):
- p, s = storage.load(oid, '')
- storage.deleteObject(oid, s, t)
+ for oid, tid in bad.iterator(name):
+ try:
+ storage.deleteObject(oid, tid, t)
+ except (ZODB.POSException.POSKeyError,
+ ZODB.POSException.ConflictError):
+ continue
nd += 1
if (nd % batch_size) == 0:
storage.tpc_vote(t)
storage.tpc_finish(t)
t.commit()
+ logger.info("%s: deleted %s", name, nd)
t = transaction.begin()
storage.tpc_begin(t)
@@ -164,23 +243,6 @@
return bad
-def bad_path(baddir, name, oid):
- return os.path.join(baddir, name, base64.urlsafe_b64encode(oid))
-
-def bad_to_good(storages, ignore, bad, good, name, oid):
-
- to_do = [(name, oid)]
- while to_do:
- name, oid = to_do.pop()
- bad.remove(name, oid)
- storage = storages[name]
-
- for h in storage.history(oid, size=1<<99):
- data = storage.loadSerial(oid, h['tid'])
- for ref in getrefs(data, name, ignore):
- if good.insert(*ref) and bad.has(*ref):
- to_do.append(ref)
-
def getrefs(p, rname, ignore):
refs = []
u = cPickle.Unpickler(cStringIO.StringIO(p))
@@ -199,7 +261,11 @@
yield ref[:2]
class oidset(dict):
+ """
+ {(name, oid)} implemented as:
+ {name-> {oid[:6] -> {oid[-2:]}}}
+ """
def __init__(self, names):
for name in names:
self[name] = {}
@@ -258,43 +324,74 @@
for suffix in data:
yield prefix+suffix
-def gc_command(args=None):
- if args is None:
- args = sys.argv[1:]
- level = logging.WARNING
- else:
- level = None
+class Bad:
- parser = optparse.OptionParser("usage: %prog [options] config1 [config2]")
- parser.add_option(
- '-d', '--days', dest='days', type='int', default=1,
- help='Number of trailing days (defaults to 1) to treat as non-garbage')
- parser.add_option(
- '-i', '--ignore-database', dest='ignore', action='append',
- help='Ignore references to the given database name.')
- parser.add_option(
- '-l', '--log-level', dest='level',
- help='The logging level. The default is WARNING.')
+ def __init__(self, names):
+ self._file = tempfile.TemporaryFile(dir='.', prefix='gcbad')
+ self.close = self._file.close
+ self._pos = 0
+ self._dbs = {}
+ for name in names:
+ self._dbs[name] = ZODB.fsIndex.fsIndex()
- options, args = parser.parse_args(args)
+ def remove(self, name, oid):
+ db = self._dbs[name]
+ if oid in db:
+ del db[oid]
- if not args or len(args) > 2:
- parser.parse_args(['-h'])
+ def __nonzero__(self):
+ raise SystemError('wtf')
+ return sum(map(bool, self._dbs.itervalues()))
- if options.level:
- level = options.level
+ def has(self, name, oid):
+ db = self._dbs[name]
+ return oid in db
- if level:
- try:
- level = int(level)
- except ValueError:
- level = getattr(logging, level)
- logging.basicConfig(level=level)
+ def iterator(self, name=None):
+ if name is None:
+ for name in self._dbs:
+ for oid in self._dbs[name]:
+ yield name, oid
+ else:
+ f = self._file
+ for oid, pos in self._dbs[name].iteritems():
+ f.seek(pos)
+ yield oid, f.read(8)
- return gc(args[0], options.days, options.ignore or (), *args[1:])
+ def insert(self, name, oid, tid, refs):
+ assert len(tid) == 8
+ f = self._file
+ db = self._dbs[name]
+ pos = db.get(oid)
+ if pos is not None:
+ f.seek(pos)
+ tid = f.read(8)
+ oldrefs = set(marshal.load(f))
+ refs = oldrefs.union(refs)
+ tid = max(tid, oldtid)
+ if refs == oldrefs:
+ if tid != oldtid:
+ f.seek(pos)
+ f.write(tid)
+ return
+ db[oid] = pos = self._pos
+ f.seek(pos)
+ f.write(tid)
+ marshal.dump(list(refs), f)
+ self._pos = f.tell()
+ def pop(self, name, oid):
+ db = self._dbs[name]
+ pos = db.get(oid, None)
+ if pos is None:
+ return ()
+ del db[oid]
+ f = self._file
+ f.seek(pos+8)
+ return marshal.load(f)
+
def check(config, refdb=None):
if refdb is None:
return check_(config)
@@ -415,7 +512,7 @@
def check_command(args=None):
if args is None:
args = sys.argv[1:]
- logging.basicConfig(level=logging.WARNING)
+ logging.basicConfig(level=logging.WARNING, format=log_format)
parser = optparse.OptionParser("usage: %prog [options] config")
parser.add_option(
More information about the checkins
mailing list