[Checkins] SVN: zc.fsutils/branches/dev/src/zc/fsutil/references. added a low-memory collection format

Jim Fulton jim at zope.com
Tue Sep 25 19:27:16 EDT 2007


Log message for revision 80063:
  added a low-memory collection format

Changed:
  U   zc.fsutils/branches/dev/src/zc/fsutil/references.py
  U   zc.fsutils/branches/dev/src/zc/fsutil/references.txt

-=-
Modified: zc.fsutils/branches/dev/src/zc/fsutil/references.py
===================================================================
--- zc.fsutils/branches/dev/src/zc/fsutil/references.py	2007-09-25 22:40:23 UTC (rev 80062)
+++ zc.fsutils/branches/dev/src/zc/fsutil/references.py	2007-09-25 23:27:16 UTC (rev 80063)
@@ -20,71 +20,121 @@
 import ZODB.utils
 
 
-def references(iterator):
+def collect(iterator, output):
     """Create a database of database references.
 
     Return a dictionary mapping oids to dictionaries with keys 'from'
     and 'to' with values that are dictionaries mapping oids to lists
     of references.
     """
-    result = {}
+    pickler = cPickle.Pickler(open(output, 'w'))
+    
     for trans in iterator:
+        trandata = trans.tid, trans._tpos
         for record in trans:
-            from_oid = ZODB.utils.oid_repr(record.oid)
-            from_data = result.get(from_oid)
-            if from_data is None:
-                from_data = result[from_oid] = {
-                    'from': {}, 'to': {}, 'serials': [],
-                    }
-            from_data['serials'].append(record.tid)
-
             refs = []
             u = cPickle.Unpickler(cStringIO.StringIO(record.data))
             u.persistent_load = refs
             u.noload()
             u.noload()
-            for ref in refs:
-                if isinstance(ref, tuple):
-                    to_oid = ZODB.utils.oid_repr(ref[0])
-                elif isinstance(ref, str):
-                    to_oid = ZODB.utils.oid_repr(ref)
-                elif isinstance(ref, list):
-                    if len(ref) == 1:
-                        to_oid = ZODB.utils.oid_repr(ref[0])
-                    else:
-                        try:
-                            reference_type, args = ref
-                        except ValueError:
-                            print 'wtf', ref
-                            continue
+            pickler.dump((trandata, record.oid, record.tid, refs))
+    
+def collect_script(args=None):
+    if args is None:
+        args = sys.argv[1:]
 
-                        if reference_type == 'w':
-                            to_oid = ZODB.utils.oid_repr(args[0])
-                        elif reference_type in 'nm':
-                            to_oid = args[0], ZODB.utils.oid_repr(args[1])
-                        else:
-                            print wtf, reference_type, args
-                else:
+    [inp, outp] = args
+
+    iterator = sys.modules['ZODB.FileStorage.FileStorage' # :(
+                           ].FileIterator(inp)
+    data = collect(iterator, outp)
+
+def load(fname):
+    unpickler = cPickle.Unpickler(open(fname))
+    result = {}
+    while 1:
+        try:
+            data = unpickler.load()
+        except EOFError:
+            return result
+        _update(result, *data)
+
+        
+def _update(result, tinfo, oid, serial, refs):
+    """Create a database of database references.
+
+    Return a dictionary mapping oids to dictionaries with keys 'from'
+    and 'to' with values that are dictionaries mapping oids to lists
+    of references.
+    """
+    from_oid = ZODB.utils.oid_repr(oid)
+    from_data = result.get(from_oid)
+    if from_data is None:
+        from_data = result[from_oid] = {
+            'from': {}, 'to': {}, 'serials': [],
+            }
+    from_data['serials'].append(serial)
+
+    for ref in refs:
+        if isinstance(ref, tuple):
+            to_oid = ZODB.utils.oid_repr(ref[0])
+        elif isinstance(ref, str):
+            to_oid = ZODB.utils.oid_repr(ref)
+        elif isinstance(ref, list):
+            if len(ref) == 1:
+                to_oid = ZODB.utils.oid_repr(ref[0])
+            else:
+                try:
+                    reference_type, args = ref
+                except ValueError:
                     print 'wtf', ref
                     continue
 
-                ref = dict(ref=ref, tid=trans.tid, tpos=trans._tpos)
-                
-                from_to = from_data['to'].get(to_oid)
-                if from_to is None:
-                    from_to = from_data['to'][to_oid] = []
-                from_to.append(ref)
+                if reference_type == 'w':
+                    to_oid = ZODB.utils.oid_repr(args[0])
+                elif reference_type in 'nm':
+                    to_oid = args[0], ZODB.utils.oid_repr(args[1])
+                else:
+                    print wtf, reference_type, args
+        else:
+            print 'wtf', ref
+            continue
 
-                to_data = result.get(to_oid)
-                if to_data is None:
-                    to_data = result[to_oid] = {
-                        'to': {}, 'from': {}, 'serials': [],
-                        }
-                to_from = to_data['from'].get(from_oid)
-                if to_from is None:
-                    to_from = to_data['from'][from_oid] = []
-                to_from.append(ref)
-                
+        ref = dict(ref=ref, tinfo=tinfo)
+
+        from_to = from_data['to'].get(to_oid)
+        if from_to is None:
+            from_to = from_data['to'][to_oid] = []
+        from_to.append(ref)
+
+        to_data = result.get(to_oid)
+        if to_data is None:
+            to_data = result[to_oid] = {
+                'to': {}, 'from': {}, 'serials': [],
+                }
+        to_from = to_data['from'].get(from_oid)
+        if to_from is None:
+            to_from = to_data['from'][from_oid] = []
+        to_from.append(ref)
+
+def references(iterator):
+    """Create a database of database references.
+
+    Return a dictionary mapping oids to dictionaries with keys 'from'
+    and 'to' with values that are dictionaries mapping oids to lists
+    of references.
+    """
+    result = {}
+    for trans in iterator:
+        trandata = trans.tid, trans._tpos
+        for record in trans:
+            refs = []
+            u = cPickle.Unpickler(cStringIO.StringIO(record.data))
+            u.persistent_load = refs
+            u.noload()
+            u.noload()
+            _update(result, trandata, record.oid, record.tid, refs)
+            
     return result
 
 def references_script(args=None):
@@ -97,7 +147,3 @@
                            ].FileIterator(inp)
     data = references(iterator)
     cPickle.Pickler(open(outp, 'w'), 1).dump(data)
-
-
-
-    

Modified: zc.fsutils/branches/dev/src/zc/fsutil/references.txt
===================================================================
--- zc.fsutils/branches/dev/src/zc/fsutil/references.txt	2007-09-25 22:40:23 UTC (rev 80062)
+++ zc.fsutils/branches/dev/src/zc/fsutil/references.txt	2007-09-25 23:27:16 UTC (rev 80063)
@@ -97,44 +97,44 @@
                           '\x03p\x98[\xe0C\xd74'],
               'to': {'0x01': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x01',
                                        None),
-                               'tid': '\x03p\x98[\xe0C\xd74',
-                               'tpos': 168L}]}},
+                               'tinfo': ('\x03p\x98[\xe0C\xd74',
+                                         168L)}]}},
      '0x01': {'from': {'0x00': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x01',
                                          None),
-                                 'tid': '\x03p\x98[\xe0C\xd74',
-                                 'tpos': 168L}]},
+                                 'tinfo': ('\x03p\x98[\xe0C\xd74',
+                                           168L)}]},
               'serials': ['\x03p\x98[\xe0C\xd74',
                           '\x03p\x98[\xe0C\xd75'],
               'to': {'0x02': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
                                        None),
-                               'tid': '\x03p\x98[\xe0C\xd74',
-                               'tpos': 168L},
+                               'tinfo': ('\x03p\x98[\xe0C\xd74',
+                                         168L)},
                               {'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
                                        None),
-                               'tid': '\x03p\x98[\xe0C\xd75',
-                               'tpos': 535L}],
+                               'tinfo': ('\x03p\x98[\xe0C\xd75',
+                                         535L)}],
                      ('db2', '0x02'): [{'ref': ['m',
                                                 ('db2',
                                                  '\x00\x00\x00\x00\x00\x00\x00\x02',
                                                  None)],
-                                        'tid': '\x03p\x98[\xe0C\xd75',
-                                        'tpos': 535L}]}},
+                                        'tinfo': ('\x03p\x98[\xe0C\xd75',
+                                                  535L)}]}},
      '0x02': {'from': {'0x01': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
                                          None),
-                                 'tid': '\x03p\x98[\xe0C\xd74',
-                                 'tpos': 168L},
+                                 'tinfo': ('\x03p\x98[\xe0C\xd74',
+                                           168L)},
                                 {'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
                                          None),
-                                 'tid': '\x03p\x98[\xe0C\xd75',
-                                 'tpos': 535L}]},
+                                 'tinfo': ('\x03p\x98[\xe0C\xd75',
+                                           535L)}]},
               'serials': ['\x03p\x98[\xe0C\xd74'],
               'to': {}},
      ('db2', '0x02'): {'from': {'0x01': [{'ref': ['m',
                                                   ('db2',
                                                    '\x00\x00\x00\x00\x00\x00\x00\x02',
                                                    None)],
-                                          'tid': '\x03p\x98[\xe0C\xd75',
-                                          'tpos': 535L}]},
+                                          'tinfo': ('\x03p\x98[\xe0C\xd75',
+                                                    535L)}]},
                        'serials': [],
                        'to': {}}}
 
@@ -177,30 +177,30 @@
               'serials': ['\x03p\x98[\xe0C\xd75'],
               'to': {'0x01': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x01',
                                        None),
-                               'tid': '\x03p\x98[\xe0C\xd75',
-                               'tpos': 4L}],
+                               'tinfo': ('\x03p\x98[\xe0C\xd75',
+                                         4L)}],
                      '0x02': [{'ref': ['w',
                                        ('\x00\x00\x00\x00\x00\x00\x00\x02',)],
-                               'tid': '\x03p\x98[\xe0C\xd75',
-                               'tpos': 4L}]}},
+                               'tinfo': ('\x03p\x98[\xe0C\xd75',
+                                         4L)}]}},
      '0x01': {'from': {'0x00': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x01',
                                          None),
-                                 'tid': '\x03p\x98[\xe0C\xd75',
-                                 'tpos': 4L}]},
+                                 'tinfo': ('\x03p\x98[\xe0C\xd75',
+                                           4L)}]},
               'serials': ['\x03p\x98[\xe0C\xd76',
                           '\x03p\x98[\xe0C\xd77'],
               'to': {'0x02': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
                                        None),
-                               'tid': '\x03p\x98[\xe0C\xd77',
-                               'tpos': 324L}]}},
+                               'tinfo': ('\x03p\x98[\xe0C\xd77',
+                                         324L)}]}},
      '0x02': {'from': {'0x00': [{'ref': ['w',
                                          ('\x00\x00\x00\x00\x00\x00\x00\x02',)],
-                                 'tid': '\x03p\x98[\xe0C\xd75',
-                                 'tpos': 4L}],
+                                 'tinfo': ('\x03p\x98[\xe0C\xd75',
+                                           4L)}],
                        '0x01': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
                                          None),
-                                 'tid': '\x03p\x98[\xe0C\xd77',
-                                 'tpos': 324L}]},
+                                 'tinfo': ('\x03p\x98[\xe0C\xd77',
+                                           324L)}]},
               'serials': [],
               'to': {}}}
 
@@ -216,12 +216,12 @@
     [('0x02',
       {'from': {'0x00': [{'ref': ['w',
                                   ('\x00\x00\x00\x00\x00\x00\x00\x02',)],
-                          'tid': '\x03p\x98[\xe0C\xd75',
-                          'tpos': 4L}],
+                          'tinfo': ('\x03p\x98[\xe0C\xd75',
+                                    4L)}],
                 '0x01': [{'ref': ('\x00\x00\x00\x00\x00\x00\x00\x02',
                                   None),
-                          'tid': '\x03p\x98[\xe0C\xd77',
-                          'tpos': 324L}]},
+                          'tinfo': ('\x03p\x98[\xe0C\xd77',
+                                    324L)}]},
        'serials': [],
        'to': {}})]
 
@@ -231,3 +231,11 @@
 Our data structure can't tell us about broken cross-database references
 directly.
 
+For large databases, we may need to avoid consuming lots of memory on
+the database machine.  We can use a collection script that doesn't
+accumulate data in memory and then use a load function to load the
+data:
+
+    >>> zc.fsutil.references.collect_script(['fs2', 'fs2.dat'])
+    >>> zc.fsutil.references.load('fs2.dat') == refs2
+    True



More information about the Checkins mailing list