[Zope-Checkins] CVS: Zope/lib/python/BTrees - sorters.c:1.1 BTreeModuleTemplate.c:1.22 SetOpTemplate.c:1.13 _IIBTree.c:1.6

Tim Peters tim.one@comcast.net
Thu, 30 May 2002 17:00:30 -0400


Update of /cvs-repository/Zope/lib/python/BTrees
In directory cvs.zope.org:/tmp/cvs-serv27288

Modified Files:
	BTreeModuleTemplate.c SetOpTemplate.c _IIBTree.c 
Added Files:
	sorters.c 
Log Message:
A new multiunion() function for integer sets computes the union of many
input sets quickly, and will become much faster again soon (see TODO
in C function multiunion_m).


=== Added File Zope/lib/python/BTrees/sorters.c === (422/522 lines abridged)
/*****************************************************************************

  Copyright (c) 2002 Zope Corporation and Contributors.
  All Rights Reserved.

  This software is subject to the provisions of the Zope Public License,
  Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
  WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
  FOR A PARTICULAR PURPOSE

 ****************************************************************************/

/* Revision information: $Id: sorters.c,v 1.1 2002/05/30 21:00:30 tim_one Exp $ */

/* The only routine here intended to be used outside the file is
   size_t sort_int4_nodups(int *p, size_t n)

   Sort the array of n ints pointed at by p, in place, and also remove
   duplicates.  Return the number of unique elements remaining, which occupy
   a contiguous and monotonically increasing slice of the array starting at p.

   Example:  If the input array is [3, 1, 2, 3, 1, 5, 2], sort_int4_nodups
   returns 4, and the first 4 elements of the array are changed to
   [1, 2, 3, 5].  The content of the remaining array positions is not defined.

   Notes:

   + This is specific to 4-byte signed ints, with endianness natural to the
     platform.

   + 4*n bytes of available heap memory are required for best speed.
*/

#include <stdlib.h>
#include <stddef.h>
#include <malloc.h>
#include <memory.h>
#include <string.h>
#include <assert.h>

/* The type of array elements to be sorted.  Most of the routines don't
   care about the type, and will work fine for any scalar C type (provided
   they're recompiled with element_type appropriately redefined).  However,
   the radix sort has to know everything about the type's internal
   representation.
*/
typedef int element_type;


[-=- -=- -=- 422 lines omitted -=- -=- -=-]

		plo[1] = *pj;
		*pj = pivot;

		/* Subfiles are from plo to pj-1 inclusive, and pj+1 to phi
		   inclusive.  Push the larger one, and loop back to do the
		   smaller one directly.
		*/
		if (pj - plo >= phi - pj) {
			PUSH(plo, pj-1);
			plo = pj+1;
		}
		else {
			PUSH(pj+1, phi);
			phi = pj-1;
		}
	}

#undef PUSH
#undef SWAP
}

/* Sort p and remove duplicates, as fast as we can. */
static size_t
sort_int4_nodups(int *p, size_t n)
{
	size_t nunique;
	size_t *work;

	assert(sizeof(int) == sizeof(element_type));
	assert(p);

	/* Use quicksort if the array is small, OR if malloc can't find
	   enough temp memory for radixsort.
	*/
	work = NULL;
	if (n > QUICKSORT_BEATS_RADIXSORT)
		work = (element_type *)malloc(n * sizeof(element_type));

	if (work) {
		element_type *out = radixsort_int4(p, work, n);
		nunique = uniq(p, out, n);
		free(work);
	}
	else {
		quicksort(p, n);
		nunique = uniq(p, p, n);
	}

	return nunique;
}


=== Zope/lib/python/BTrees/BTreeModuleTemplate.c 1.21 => 1.22 ===
 
 #define BUCKET(O) ((Bucket*)(O))
+#define SIZED(O) ((Bucket*)(O))
 
 static void PyVar_AssignB(Bucket **v, Bucket *e) { Py_XDECREF(*v); *v=e;}
 #define ASSIGNB(V,E) PyVar_AssignB(&(V),(E))
@@ -261,6 +262,14 @@
    "weightedIntersection(o1, o2 [, w1, w2]) -- "
    "compute the intersection of o1 and o2\n"
    "\nw1 and w2 are weights."
+  },
+#endif
+#ifdef MULTI_INT_UNION
+  {"multiunion", (PyCFunction) multiunion_m, METH_VARARGS,
+   "multiunion(seq) -- compute union of a sequence of integer sets.\n"
+   "\n"
+   "Each element of seq must be an integer set, or convertible to one\n"
+   "via the set iteration protocol.  The union returned is an IISet."
   },
 #endif
   {NULL,		NULL}		/* sentinel */


=== Zope/lib/python/BTrees/SetOpTemplate.c 1.12 => 1.13 ===
 
 #endif
+
+#ifdef MULTI_INT_UNION
+#include "sorters.c"
+
+/* Input is a sequence of integer sets (or convertible to sets by the
+   set iteration protocol).  Output is the union of the sets.  The point
+   is to run much faster than doing pairs of unions.
+*/
+static PyObject *
+multiunion_m(PyObject *ignored, PyObject *args)
+{
+    PyObject *seq;          /* input sequence */
+    int n;                  /* length of input sequence */
+    PyObject *set = NULL;   /* an element of the input sequence */
+    Bucket *result;         /* result set */
+    int i;
+
+    UNLESS(PyArg_ParseTuple(args, "O", &seq))
+        return NULL;
+
+    n = PyObject_Length(seq);
+    if (n < 0)
+        return NULL;
+
+    /* Construct an empty result set. */
+    result = BUCKET(PyObject_CallObject(OBJECT(&SetType), NULL));
+    if (result == NULL)
+        return NULL;
+
+    /* For each set in the input sequence, append its elements to the result
+       set.  At this point, we ignore the possibility of duplicates. */
+    for (i = 0; i < n; ++i) {
+        SetIteration setiter = {0, 0, 0};
+        int merge;  /* dummy needed for initSetIteration */
+
+        set = PySequence_GetItem(seq, i);
+        if (set == NULL)
+            goto Error;
+
+        /* XXX TODO: If set is a bucket, do a straight resize+memcpy instead.
+        */
+        if (initSetIteration(&setiter, set, 1, &merge) < 0)
+            goto Error;
+        if (setiter.next(&setiter) < 0)
+            goto Error;
+        while (setiter.position >= 0) {
+            if (result->len >= result->size && Bucket_grow(result, 1) < 0)
+                goto Error;
+            COPY_KEY(result->keys[result->len], setiter.key);
+            ++result->len;
+            /* We know the key is an int, so no need to incref it. */
+            if (setiter.next(&setiter) < 0)
+                goto Error;
+        }
+        Py_DECREF(set);
+        set = NULL;
+    }
+
+    /* Combine, sort, remove duplicates, and reset the result's len.
+       If the set shrinks (which happens if and only if there are
+       duplicates), no point to realloc'ing the set smaller, as we
+       expect the result set to be short-lived.
+    */
+    if (result->len > 0) {
+        size_t newlen;          /* number of elements in final result set */
+        newlen = sort_int4_nodups(result->keys, (size_t)result->len);
+        result->len = (int)newlen;
+    }
+    return (PyObject *)result;
+
+Error:
+    Py_DECREF(result);
+    Py_XDECREF(set);
+    return NULL;
+}
+
+#endif


=== Zope/lib/python/BTrees/_IIBTree.c 1.5 => 1.6 ===
 #define DEFAULT_MAX_BUCKET_SIZE 120
 #define DEFAULT_MAX_BTREE_SIZE 500
-                
+#define MULTI_INT_UNION 1
+
 #include "intkeymacros.h"
 #include "intvaluemacros.h"
 #include "cPersistence.h"