[Checkins] SVN: zc.fsutil/branches/dev/ Trimmed and optimized the data to make the tool usable with large databases

Thu Sep 27 09:20:25 EDT 2007

Log message for revision 80221:
  Trimmed and optimized the data to make the tool usable with large databases

Changed:
  U   zc.fsutil/branches/dev/buildout.cfg
  U   zc.fsutil/branches/dev/setup.py
  U   zc.fsutil/branches/dev/src/zc/fsutil/references.py
  U   zc.fsutil/branches/dev/src/zc/fsutil/references.txt

-=-
Modified: zc.fsutil/branches/dev/buildout.cfg
===================================================================

--- zc.fsutil/branches/dev/buildout.cfg	2007-09-27 12:32:17 UTC (rev 80220)
+++ zc.fsutil/branches/dev/buildout.cfg	2007-09-27 13:20:25 UTC (rev 80221)
@@ -5,8 +5,8 @@
 [script]
 recipe = zc.recipe.egg
 eggs = ${test:eggs}
+interpreter = py
 
 [test]
 recipe = zc.recipe.testrunner
 eggs = zc.fsutil
-

Modified: zc.fsutil/branches/dev/setup.py
===================================================================
--- zc.fsutil/branches/dev/setup.py	2007-09-27 12:32:17 UTC (rev 80220)
+++ zc.fsutil/branches/dev/setup.py	2007-09-27 13:20:25 UTC (rev 80221)
@@ -20,5 +20,6 @@
     install_requires = [
         'setuptools',
         'ZODB3',
+        'zope.cachedescriptors',
         ],
     )

Modified: zc.fsutil/branches/dev/src/zc/fsutil/references.py
===================================================================
--- zc.fsutil/branches/dev/src/zc/fsutil/references.py	2007-09-27 12:32:17 UTC (rev 80220)
+++ zc.fsutil/branches/dev/src/zc/fsutil/references.py	2007-09-27 13:20:25 UTC (rev 80221)
@@ -19,6 +19,9 @@
 import ZODB.FileStorage.FileStorage
 import ZODB.utils
 
+def oid_repr(oid):
+    return hex(ZODB.utils.u64(oid))[2:-1]
+
 def collect(iterator, output):
     """Create a database of database references.
 
@@ -64,6 +67,45 @@
         for (oid, serial, refs) in data:
             _update(result, trandata, oid, serial, refs)
 
+def load_trans(fname):
+    unpickler = cPickle.Unpickler(gzip.open(fname))
+    result = {}
+    while 1:
+        try:
+            trandata, data = unpickler.load()
+        except EOFError:
+            return result
+
+        result.__setitem__(*trandata)
+
+class Entry(object):
+    __slots__ = 'from_', 'present' # , '_to'
+
+    def __init__(self):
+        self.present = False
+        self.from_ = () 
+
+    def __getstate__(self):
+        return self.present, self.from_
+
+    def __setstate__(self, state):
+       self.present, self.from_ = state
+
+#     def __eq__(self, other):
+#         return self.__getstate__() == other.__getstate__()
+
+    def __repr__(self):
+        result = ['']
+        result.append('present: %s' % self.present)
+#         result.append(
+#             'to: %s'
+#             % ', '.join(map(repr, (sorted(getattr(self, '_to', ()))))))
+        result.append(
+                'from_: %s'
+                % ', '.join(map(repr, sorted(getattr(self, 'from_', ())))))
+        result.append('')
+        return '\n    '.join(result)
+            
         
 def _update(result, tinfo, oid, serial, refs):
     """Create a database of database references.
@@ -72,22 +114,20 @@
     and 'to' with values that are dictionaries mapping oids to lists
     of references.
     """
-    from_oid = ZODB.utils.oid_repr(oid)
+    from_oid = oid_repr(oid)
     from_data = result.get(from_oid)
     if from_data is None:
-        from_data = result[from_oid] = {
-            'from': {}, 'to': {}, 'serials': [],
-            }
-    from_data['serials'].append(serial)
+        from_data = result[from_oid] = Entry()
+    from_data.present = True
 
     for ref in refs:
         if isinstance(ref, tuple):
-            to_oid = ZODB.utils.oid_repr(ref[0])
+            to_oid = oid_repr(ref[0])
         elif isinstance(ref, str):
-            to_oid = ZODB.utils.oid_repr(ref)
+            to_oid = oid_repr(ref)
         elif isinstance(ref, list):
             if len(ref) == 1:
-                to_oid = ZODB.utils.oid_repr(ref[0])
+                to_oid = oid_repr(ref[0])
             else:
                 try:
                     reference_type, args = ref
@@ -96,31 +136,22 @@
                     continue
 
                 if reference_type == 'w':
-                    to_oid = ZODB.utils.oid_repr(args[0])
+                    to_oid = oid_repr(args[0])
                 elif reference_type in 'nm':
-                    to_oid = args[0], ZODB.utils.oid_repr(args[1])
+                    to_oid = args[0], oid_repr(args[1])
                 else:
                     print wtf, reference_type, args
         else:
             print 'wtf', ref
             continue
 
-        ref = dict(ref=ref, tinfo=tinfo)
+        # from_data.to.add(to_oid)
 
-        from_to = from_data['to'].get(to_oid)
-        if from_to is None:
-            from_to = from_data['to'][to_oid] = []
-        from_to.append(ref)
-
         to_data = result.get(to_oid)
         if to_data is None:
-            to_data = result[to_oid] = {
-                'to': {}, 'from': {}, 'serials': [],
-                }
-        to_from = to_data['from'].get(from_oid)
-        if to_from is None:
-            to_from = to_data['from'][from_oid] = []
-        to_from.append(ref)
+            to_data = result[to_oid] = Entry()
+        if from_oid not in to_data.from_:
+            to_data.from_ += (from_oid, )
 
 def references(iterator):
     """Create a database of database references.

Modified: zc.fsutil/branches/dev/src/zc/fsutil/references.txt
===================================================================
--- zc.fsutil/branches/dev/src/zc/fsutil/references.txt	2007-09-27 12:32:17 UTC (rev 80220)
+++ zc.fsutil/branches/dev/src/zc/fsutil/references.txt	2007-09-27 13:20:25 UTC (rev 80221)
@@ -92,76 +92,39 @@
 
     >>> from pprint import pprint
     >>> pprint(refs1, width=1)
-    {'0x00': {'from': {},
-              'serials': ['\x03p\x98[\xe0C\xd73',
-                          '\x03p\x98[\xe0C\xd74'],
-              'to': {'0x01': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x01',
-                                       None),
-                               'tinfo': ('\x03p\x98[\xe0C\xd74',
-                                         168L)}]}},
-     '0x01': {'from': {'0x00': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x01',
-                                         None),
-                                 'tinfo': ('\x03p\x98[\xe0C\xd74',
-                                           168L)}]},
-              'serials': ['\x03p\x98[\xe0C\xd74',
-                          '\x03p\x98[\xe0C\xd75'],
-              'to': {'0x02': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
-                                       None),
-                               'tinfo': ('\x03p\x98[\xe0C\xd74',
-                                         168L)},
-                              {'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
-                                       None),
-                               'tinfo': ('\x03p\x98[\xe0C\xd75',
-                                         535L)}],
-                     ('db2', '0x02'): [{'ref': ['m',
-                                                ('db2',
-                                                 '\x00\x00\x00\x00\x00\x00\x00\x02',
-                                                 None)],
-                                        'tinfo': ('\x03p\x98[\xe0C\xd75',
-                                                  535L)}]}},
-     '0x02': {'from': {'0x01': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
-                                         None),
-                                 'tinfo': ('\x03p\x98[\xe0C\xd74',
-                                           168L)},
-                                {'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
-                                         None),
-                                 'tinfo': ('\x03p\x98[\xe0C\xd75',
-                                           535L)}]},
-              'serials': ['\x03p\x98[\xe0C\xd74'],
-              'to': {}},
-     ('db2', '0x02'): {'from': {'0x01': [{'ref': ['m',
-                                                  ('db2',
-                                                   '\x00\x00\x00\x00\x00\x00\x00\x02',
-                                                   None)],
-                                          'tinfo': ('\x03p\x98[\xe0C\xd75',
-                                                    535L)}]},
-                       'serials': [],
-                       'to': {}}}
+    {'0': 
+        present: True
+        from_: 
+        ,
+     '1': 
+        present: True
+        from_: '0'
+        ,
+     '2': 
+        present: True
+        from_: '1'
+        ,
+     ('db2', '2'): 
+        present: False
+        from_: '1'
+        }
 
 There's an entry for each object referenced.  Each entry is a
-dictionary with 3 keys:
+object with 2 attributes:
 
-from
-   Is a dictionary containing information about references to the entry's
-   object id. Each entry has as it's key, the refering object id and
-   as it's values, the list of references from that object id.  
+present
+   a flag indicating whether the oid is present in the database
 
-to
-   Is a dictionary containing information about references from the
-   entry's object id. Each entry has as it's key, the object id of the
-   object being referenced, and as it's values, the list of references
-   to that object id.
+from_
+   a tuple of the oids that refer to this oid
 
-serials
-   Is a list of serial ids (transaction ids) for the entry's object.
-
-   An entry with empty serials is a missing object. It is refered to,
-   but there are no records for it.
-
 Note that object 1 is refered to by object 0 and refers to object 2
-and to object 2 in the second database.  It has 2 database records,
-indicated by 2 values in it's serials.
+and to object 2 in the second database.  It is present in the
+database.  The object in db2 is, of course, not present.
 
+Also note that the object ids are hex digit strings (without a leading
+'0x').  This isn't apparent in the example because the numbers are so small.
+
 The references_script function is intended to be used as a setuptools
 entry point.  We'll call it directly, passing command line arguments,
 which are the name of an input file and the name of an output data file:
@@ -173,58 +136,39 @@
     >>> import cPickle
     >>> refs2 = cPickle.Unpickler(open('fs2.dat')).load()
     >>> pprint(refs2, width=1)
-    {'0x00': {'from': {},
-              'serials': ['\x03p\x98[\xe0C\xd75'],
-              'to': {'0x01': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x01',
-                                       None),
-                               'tinfo': ('\x03p\x98[\xe0C\xd75',
-                                         4L)}],
-                     '0x02': [{'ref': ['w',
-                                       ('\x00\x00\x00\x00\x00\x00\x00\x02',)],
-                               'tinfo': ('\x03p\x98[\xe0C\xd75',
-                                         4L)}]}},
-     '0x01': {'from': {'0x00': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x01',
-                                         None),
-                                 'tinfo': ('\x03p\x98[\xe0C\xd75',
-                                           4L)}]},
-              'serials': ['\x03p\x98[\xe0C\xd76',
-                          '\x03p\x98[\xe0C\xd77'],
-              'to': {'0x02': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
-                                       None),
-                               'tinfo': ('\x03p\x98[\xe0C\xd77',
-                                         324L)}]}},
-     '0x02': {'from': {'0x00': [{'ref': ['w',
-                                         ('\x00\x00\x00\x00\x00\x00\x00\x02',)],
-                                 'tinfo': ('\x03p\x98[\xe0C\xd75',
-                                           4L)}],
-                       '0x01': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
-                                         None),
-                                 'tinfo': ('\x03p\x98[\xe0C\xd77',
-                                           324L)}]},
-              'serials': [],
-              'to': {}}}
+    {'0': 
+        present: True
+        from_: 
+        ,
+     '1': 
+        present: True
+        from_: '0'
+        ,
+     '2': 
+        present: False
+        from_: '0', '1'
+        }
 
-In database 2, we see that object id is missing because it doesn't
-have any serials.
+In database 2, we see that object 2 is not present. We see that it is
+referred to by objects 0 and 1.
 
-We can query these data structures using Python.  For example, to dind
-missing objects (and the in-database objects that reference them:
+We can query these data structures using Python.  For example, to find
+missing objects (and the in-database objects that reference them):
 
-    >>> pprint([(oid, data) for (oid, data) in refs2.iteritems()
-    ...        if not data['serials']
-    ...        ], width=1)
-    [('0x02',
-      {'from': {'0x00': [{'ref': ['w',
-                                  ('\x00\x00\x00\x00\x00\x00\x00\x02',)],
-                          'tinfo': ('\x03p\x98[\xe0C\xd75',
-                                    4L)}],
-                '0x01': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
-                                  None),
-                          'tinfo': ('\x03p\x98[\xe0C\xd77',
-                                    324L)}]},
-       'serials': [],
-       'to': {}})]
+    >>> [(oid, data) for (oid, data) in refs1.iteritems()
+    ...    if not data.present and isinstance(oid, str)
+    ...    ]
+    []
 
+    >>> [(oid, data) for (oid, data) in refs2.iteritems()
+    ...    if not data.present and isinstance(oid, str)
+    ...    ]
+    [('2', 
+        present: False
+        from_: '0', '1'
+        )]
+
+
 Here we see the broken weak reference from object 0 and the broken
 ordinary reference from object 1.
 
@@ -237,14 +181,45 @@
 data:
 
     >>> zc.fsutil.references.collect_script(['fs2', 'fs2.dat'])
-    >>> zc.fsutil.references.load('fs2.dat') == refs2
-    True
+    >>> zc.fsutil.references.load('fs2.dat')
+    {'1': 
+        present: True
+        from_: '0'
+        , '0': 
+        present: True
+        from_: 
+        , '2': 
+        present: False
+        from_: '0', '1'
+        }
 
 You can pass multiple pairs of files.
 
     >>> zc.fsutil.references.collect_script(['fs1', 'fs1.dat', 
     ...                                      'fs2', 'fs2.dat'])
-    >>> zc.fsutil.references.load('fs1.dat') == refs1
-    True
-    >>> zc.fsutil.references.load('fs2.dat') == refs2
-    True
+    >>> zc.fsutil.references.load('fs1.dat')
+    {'1': 
+        present: True
+        from_: '0'
+        , '0': 
+        present: True
+        from_: 
+        , '2': 
+        present: True
+        from_: '1'
+        , ('db2', '2'): 
+        present: False
+        from_: '1'
+        }
+
+    >>> zc.fsutil.references.load('fs2.dat')
+    {'1': 
+        present: True
+        from_: '0'
+        , '0': 
+        present: True
+        from_: 
+        , '2': 
+        present: False
+        from_: '0', '1'
+        }