[Checkins] SVN: zc.fsutil/branches/dev/ Trimmed and optimized the
data to make the tool usable with large databases
Jim Fulton
jim at zope.com
Thu Sep 27 09:20:25 EDT 2007
Log message for revision 80221:
Trimmed and optimized the data to make the tool usable with large databases
Changed:
U zc.fsutil/branches/dev/buildout.cfg
U zc.fsutil/branches/dev/setup.py
U zc.fsutil/branches/dev/src/zc/fsutil/references.py
U zc.fsutil/branches/dev/src/zc/fsutil/references.txt
-=-
Modified: zc.fsutil/branches/dev/buildout.cfg
===================================================================
--- zc.fsutil/branches/dev/buildout.cfg 2007-09-27 12:32:17 UTC (rev 80220)
+++ zc.fsutil/branches/dev/buildout.cfg 2007-09-27 13:20:25 UTC (rev 80221)
@@ -5,8 +5,8 @@
[script]
recipe = zc.recipe.egg
eggs = ${test:eggs}
+interpreter = py
[test]
recipe = zc.recipe.testrunner
eggs = zc.fsutil
-
Modified: zc.fsutil/branches/dev/setup.py
===================================================================
--- zc.fsutil/branches/dev/setup.py 2007-09-27 12:32:17 UTC (rev 80220)
+++ zc.fsutil/branches/dev/setup.py 2007-09-27 13:20:25 UTC (rev 80221)
@@ -20,5 +20,6 @@
install_requires = [
'setuptools',
'ZODB3',
+ 'zope.cachedescriptors',
],
)
Modified: zc.fsutil/branches/dev/src/zc/fsutil/references.py
===================================================================
--- zc.fsutil/branches/dev/src/zc/fsutil/references.py 2007-09-27 12:32:17 UTC (rev 80220)
+++ zc.fsutil/branches/dev/src/zc/fsutil/references.py 2007-09-27 13:20:25 UTC (rev 80221)
@@ -19,6 +19,9 @@
import ZODB.FileStorage.FileStorage
import ZODB.utils
+def oid_repr(oid):
+ return hex(ZODB.utils.u64(oid))[2:-1]
+
def collect(iterator, output):
"""Create a database of database references.
@@ -64,6 +67,45 @@
for (oid, serial, refs) in data:
_update(result, trandata, oid, serial, refs)
+def load_trans(fname):
+ unpickler = cPickle.Unpickler(gzip.open(fname))
+ result = {}
+ while 1:
+ try:
+ trandata, data = unpickler.load()
+ except EOFError:
+ return result
+
+ result.__setitem__(*trandata)
+
+class Entry(object):
+ __slots__ = 'from_', 'present' # , '_to'
+
+ def __init__(self):
+ self.present = False
+ self.from_ = ()
+
+ def __getstate__(self):
+ return self.present, self.from_
+
+ def __setstate__(self, state):
+ self.present, self.from_ = state
+
+# def __eq__(self, other):
+# return self.__getstate__() == other.__getstate__()
+
+ def __repr__(self):
+ result = ['']
+ result.append('present: %s' % self.present)
+# result.append(
+# 'to: %s'
+# % ', '.join(map(repr, (sorted(getattr(self, '_to', ()))))))
+ result.append(
+ 'from_: %s'
+ % ', '.join(map(repr, sorted(getattr(self, 'from_', ())))))
+ result.append('')
+ return '\n '.join(result)
+
def _update(result, tinfo, oid, serial, refs):
"""Create a database of database references.
@@ -72,22 +114,20 @@
and 'to' with values that are dictionaries mapping oids to lists
of references.
"""
- from_oid = ZODB.utils.oid_repr(oid)
+ from_oid = oid_repr(oid)
from_data = result.get(from_oid)
if from_data is None:
- from_data = result[from_oid] = {
- 'from': {}, 'to': {}, 'serials': [],
- }
- from_data['serials'].append(serial)
+ from_data = result[from_oid] = Entry()
+ from_data.present = True
for ref in refs:
if isinstance(ref, tuple):
- to_oid = ZODB.utils.oid_repr(ref[0])
+ to_oid = oid_repr(ref[0])
elif isinstance(ref, str):
- to_oid = ZODB.utils.oid_repr(ref)
+ to_oid = oid_repr(ref)
elif isinstance(ref, list):
if len(ref) == 1:
- to_oid = ZODB.utils.oid_repr(ref[0])
+ to_oid = oid_repr(ref[0])
else:
try:
reference_type, args = ref
@@ -96,31 +136,22 @@
continue
if reference_type == 'w':
- to_oid = ZODB.utils.oid_repr(args[0])
+ to_oid = oid_repr(args[0])
elif reference_type in 'nm':
- to_oid = args[0], ZODB.utils.oid_repr(args[1])
+ to_oid = args[0], oid_repr(args[1])
else:
print wtf, reference_type, args
else:
print 'wtf', ref
continue
- ref = dict(ref=ref, tinfo=tinfo)
+ # from_data.to.add(to_oid)
- from_to = from_data['to'].get(to_oid)
- if from_to is None:
- from_to = from_data['to'][to_oid] = []
- from_to.append(ref)
-
to_data = result.get(to_oid)
if to_data is None:
- to_data = result[to_oid] = {
- 'to': {}, 'from': {}, 'serials': [],
- }
- to_from = to_data['from'].get(from_oid)
- if to_from is None:
- to_from = to_data['from'][from_oid] = []
- to_from.append(ref)
+ to_data = result[to_oid] = Entry()
+ if from_oid not in to_data.from_:
+ to_data.from_ += (from_oid, )
def references(iterator):
"""Create a database of database references.
Modified: zc.fsutil/branches/dev/src/zc/fsutil/references.txt
===================================================================
--- zc.fsutil/branches/dev/src/zc/fsutil/references.txt 2007-09-27 12:32:17 UTC (rev 80220)
+++ zc.fsutil/branches/dev/src/zc/fsutil/references.txt 2007-09-27 13:20:25 UTC (rev 80221)
@@ -92,76 +92,39 @@
>>> from pprint import pprint
>>> pprint(refs1, width=1)
- {'0x00': {'from': {},
- 'serials': ['\x03p\x98[\xe0C\xd73',
- '\x03p\x98[\xe0C\xd74'],
- 'to': {'0x01': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x01',
- None),
- 'tinfo': ('\x03p\x98[\xe0C\xd74',
- 168L)}]}},
- '0x01': {'from': {'0x00': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x01',
- None),
- 'tinfo': ('\x03p\x98[\xe0C\xd74',
- 168L)}]},
- 'serials': ['\x03p\x98[\xe0C\xd74',
- '\x03p\x98[\xe0C\xd75'],
- 'to': {'0x02': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
- None),
- 'tinfo': ('\x03p\x98[\xe0C\xd74',
- 168L)},
- {'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
- None),
- 'tinfo': ('\x03p\x98[\xe0C\xd75',
- 535L)}],
- ('db2', '0x02'): [{'ref': ['m',
- ('db2',
- '\x00\x00\x00\x00\x00\x00\x00\x02',
- None)],
- 'tinfo': ('\x03p\x98[\xe0C\xd75',
- 535L)}]}},
- '0x02': {'from': {'0x01': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
- None),
- 'tinfo': ('\x03p\x98[\xe0C\xd74',
- 168L)},
- {'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
- None),
- 'tinfo': ('\x03p\x98[\xe0C\xd75',
- 535L)}]},
- 'serials': ['\x03p\x98[\xe0C\xd74'],
- 'to': {}},
- ('db2', '0x02'): {'from': {'0x01': [{'ref': ['m',
- ('db2',
- '\x00\x00\x00\x00\x00\x00\x00\x02',
- None)],
- 'tinfo': ('\x03p\x98[\xe0C\xd75',
- 535L)}]},
- 'serials': [],
- 'to': {}}}
+ {'0':
+ present: True
+ from_:
+ ,
+ '1':
+ present: True
+ from_: '0'
+ ,
+ '2':
+ present: True
+ from_: '1'
+ ,
+ ('db2', '2'):
+ present: False
+ from_: '1'
+ }
There's an entry for each object referenced. Each entry is a
-dictionary with 3 keys:
+object with 2 attributes:
-from
- Is a dictionary containing information about references to the entry's
- object id. Each entry has as it's key, the refering object id and
- as it's values, the list of references from that object id.
+present
+ a flag indicating whether the oid is present in the database
-to
- Is a dictionary containing information about references from the
- entry's object id. Each entry has as it's key, the object id of the
- object being referenced, and as it's values, the list of references
- to that object id.
+from_
+ a tuple of the distinct oids that refer to this oid
-serials
- Is a list of serial ids (transaction ids) for the entry's object.
-
- An entry with empty serials is a missing object. It is refered to,
- but there are no records for it.
-
Note that object 1 is referred to by object 0 and refers to object 2
-and to object 2 in the second database. It has 2 database records,
-indicated by 2 values in it's serials.
+and to object 2 in the second database. It is present in the
+database. The object in db2 is, of course, not present.
+Also note that the object ids are actually hex numbers. This isn't
+apparent in the example because the numbers are so small.
+
The references_script function is intended to be used as a setuptools
entry point. We'll call it directly, passing command line arguments,
which are the name of an input file and the name of an output data file:
@@ -173,58 +136,39 @@
>>> import cPickle
>>> refs2 = cPickle.Unpickler(open('fs2.dat')).load()
>>> pprint(refs2, width=1)
- {'0x00': {'from': {},
- 'serials': ['\x03p\x98[\xe0C\xd75'],
- 'to': {'0x01': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x01',
- None),
- 'tinfo': ('\x03p\x98[\xe0C\xd75',
- 4L)}],
- '0x02': [{'ref': ['w',
- ('\x00\x00\x00\x00\x00\x00\x00\x02',)],
- 'tinfo': ('\x03p\x98[\xe0C\xd75',
- 4L)}]}},
- '0x01': {'from': {'0x00': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x01',
- None),
- 'tinfo': ('\x03p\x98[\xe0C\xd75',
- 4L)}]},
- 'serials': ['\x03p\x98[\xe0C\xd76',
- '\x03p\x98[\xe0C\xd77'],
- 'to': {'0x02': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
- None),
- 'tinfo': ('\x03p\x98[\xe0C\xd77',
- 324L)}]}},
- '0x02': {'from': {'0x00': [{'ref': ['w',
- ('\x00\x00\x00\x00\x00\x00\x00\x02',)],
- 'tinfo': ('\x03p\x98[\xe0C\xd75',
- 4L)}],
- '0x01': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
- None),
- 'tinfo': ('\x03p\x98[\xe0C\xd77',
- 324L)}]},
- 'serials': [],
- 'to': {}}}
+ {'0':
+ present: True
+ from_:
+ ,
+ '1':
+ present: True
+ from_: '0'
+ ,
+ '2':
+ present: False
+ from_: '0', '1'
+ }
-In database 2, we see that object id is missing because it doesn't
-have any serials.
+In database 2, we see that object 2 is not present. We see that it is
+referred to by objects 0 and 1.
-We can query these data structures using Python. For example, to dind
-missing objects (and the in-database objects that reference them:
+We can query these data structures using Python. For example, to find
+missing objects (and the in-database objects that reference them):
- >>> pprint([(oid, data) for (oid, data) in refs2.iteritems()
- ... if not data['serials']
- ... ], width=1)
- [('0x02',
- {'from': {'0x00': [{'ref': ['w',
- ('\x00\x00\x00\x00\x00\x00\x00\x02',)],
- 'tinfo': ('\x03p\x98[\xe0C\xd75',
- 4L)}],
- '0x01': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
- None),
- 'tinfo': ('\x03p\x98[\xe0C\xd77',
- 324L)}]},
- 'serials': [],
- 'to': {}})]
+ >>> [(oid, data) for (oid, data) in refs1.iteritems()
+ ... if not data.present and isinstance(oid, str)
+ ... ]
+ []
+ >>> [(oid, data) for (oid, data) in refs2.iteritems()
+ ... if not data.present and isinstance(oid, str)
+ ... ]
+ [('2',
+ present: False
+ from_: '0', '1'
+ )]
+
+
Here we see the broken weak reference from object 0 and the broken
ordinary reference from object 1.
@@ -237,14 +181,45 @@
data:
>>> zc.fsutil.references.collect_script(['fs2', 'fs2.dat'])
- >>> zc.fsutil.references.load('fs2.dat') == refs2
- True
+ >>> zc.fsutil.references.load('fs2.dat')
+ {'1':
+ present: True
+ from_: '0'
+ , '0':
+ present: True
+ from_:
+ , '2':
+ present: False
+ from_: '0', '1'
+ }
You can pass multiple pairs of files.
>>> zc.fsutil.references.collect_script(['fs1', 'fs1.dat',
... 'fs2', 'fs2.dat'])
- >>> zc.fsutil.references.load('fs1.dat') == refs1
- True
- >>> zc.fsutil.references.load('fs2.dat') == refs2
- True
+ >>> zc.fsutil.references.load('fs1.dat')
+ {'1':
+ present: True
+ from_: '0'
+ , '0':
+ present: True
+ from_:
+ , '2':
+ present: True
+ from_: '1'
+ , ('db2', '2'):
+ present: False
+ from_: '1'
+ }
+
+ >>> zc.fsutil.references.load('fs2.dat')
+ {'1':
+ present: True
+ from_: '0'
+ , '0':
+ present: True
+ from_:
+ , '2':
+ present: False
+ from_: '0', '1'
+ }
More information about the Checkins
mailing list