[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/src - ISO_8859_1_Splitter.c:1.5.10.2

Andreas Jung andreas@digicool.com
Wed, 9 Jan 2002 09:16:25 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/src
In directory cvs.zope.org:/tmp/cvs-serv25029/src

Modified Files:
      Tag: ajung-textindexng-branch
	ISO_8859_1_Splitter.c 
Log Message:
Added 'maxlen', 'indexnumbers', and 'singlechar' parameters to the constructor.


=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/src/ISO_8859_1_Splitter.c 1.5.10.1 => 1.5.10.2 ===
     char *here, *end;
     int index;
+    int allow_single_chars;
+    int index_numbers;
+    int max_len;
 }
 
 Splitter;
@@ -181,7 +184,7 @@
 
     len = PyString_Size(word);
 
-    if(len < 2)	/* Single-letter words are stop words! */
+    if(len < 2 && ! self->allow_single_chars)	/* Single-letter words are stop words! */
     {
         Py_INCREF(Py_None);
         return Py_None;
@@ -193,7 +196,7 @@
     for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
 
         ;
-    if (len < 0) {
+    if (len < 0 && ! self->index_numbers) {
         Py_INCREF(Py_None);
         return Py_None;
     }
@@ -223,12 +226,11 @@
     return value;		/* Which must be None! */
 }
 
-#define MAX_WORD 64		/* Words longer than MAX_WORD are stemmed */
 
 static PyObject *
 next_word(Splitter *self, char **startpos, char **endpos)
 {
-    char wbuf[MAX_WORD];
+    char wbuf[256];
     char *end, *here, *b;
     int i = 0, c;
     PyObject *pyword, *res;
@@ -258,13 +260,13 @@
             if(startpos && i==0)
                 *startpos=here;
 
-            if(i++ < MAX_WORD)
+            if(i++ < self->max_len)
                 *b++ = c;
 
         } else if (i != 0) { /* We've found the end of a word */
 
-            if(i >= MAX_WORD)
-                i=MAX_WORD; /* "stem" the long word */
+            if(i >= self->max_len)
+                i=self->max_len; /* "stem" the long word */
 
             UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
                 self->here=here;
@@ -308,8 +310,8 @@
 
     /* We've reached the end of the string */
 
-    if(i >= MAX_WORD)
-        i=MAX_WORD; /* "stem" the long word */
+    if(i >= self->max_len)
+        i=self->max_len; /* "stem" the long word */
 
     if (i == 0) {
         /* No words */
@@ -488,7 +490,7 @@
                                        SplitterType__doc__ /* Documentation string */
                                    };
 
-static char *splitter_args[]={"doc","synstop","encoding",NULL};
+static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen",NULL};
 
 static PyObject *
 get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
@@ -496,8 +498,29 @@
     Splitter *self;
     PyObject *doc, *synstop = NULL;
     char * encoding="latin1";
+    int single_char = 0;
+    int index_numbers = 0;
+    int max_len=64;
+
+    UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&single_char,&index_numbers,&max_len)) return NULL;
+
+
+    if (index_numbers<0 || index_numbers>1) {
+        PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
+        return NULL;
+    }
+
+    if (single_char<0 || single_char>1) {
+        PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
+        return NULL;
+    }
+
+    if (max_len<1 || max_len>128) {
+        PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
+        return NULL;
+    }
+
 
-    UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args,&doc,&synstop,&encoding)) return NULL;
 
     UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
 
@@ -513,6 +536,9 @@
     UNLESS(self->here=PyString_AsString(self->text)) goto err;
 
     self->end = self->here + PyString_Size(self->text);
+    self->allow_single_chars    = single_char;
+    self->index_numbers         = index_numbers;
+    self->max_len               = max_len;
 
     self->index = -1;
 
@@ -527,7 +553,7 @@
 static struct PyMethodDef Splitter_module_methods[] =
     {
         { "ISO_8859_1_Splitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
-            "ISO_8859_1_Splitter(doc[,synstop]) -- Return a word splitter"
+          "ISO_8859_1_Splitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen]) -- Return a word splitter"
         },
 
         { NULL, NULL }