[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src - ZopeSplitter.c:1.2.10.5

Andreas Jung andreas@zope.com
Wed, 10 Oct 2001 15:38:27 -0400


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src
In directory cvs.zope.org:/tmp/cvs-serv12808/src

Modified Files:
      Tag: ajung-unicode
	ZopeSplitter.c 
Log Message:
This version is known to fail. Checkin for debugging purposes.


=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c 1.2.10.4 => 1.2.10.5 ===
 #define UNLESS_ASSIGN(V,E) ASSIGN(V,E) UNLESS(V)
 
+#define PO(x) do { PyObject_Print((x),stdout,0); fflush(stdout); } while (0)  /* debug aid: print x to stdout and flush; statement-safe */
+
 typedef struct
 {
         PyObject_HEAD
-        PyObject *text ;
         PyObject *list;
         PyObject *synstop;
 }
 Splitter;
 
+/* Look up word in self->synstop.  Returns a NEW reference to the mapped value
+ * on a hit; otherwise returns the caller's own `word` (no new reference). */
+static PyObject * checkSynword(Splitter *self,PyObject *word)
+{
+        PyObject *value;
+#ifdef DEBUG
+        PO(word);
+        PO(self->synstop);
+#endif
+
+        /* No mapping supplied (synstop is NULL): every word passes through. */
+        if (self->synstop == NULL) return word;
+
+        /* PyObject_GetItem returns a new reference; an extra INCREF would leak. */
+        value = PyObject_GetItem(self->synstop,word);
+        if (value) return value;
+
+        PyErr_Clear();  /* a miss is not an error here: drop the pending exception */
+        return word;
+}
+
 static void
 Splitter_dealloc(Splitter *self)
 {
         Py_XDECREF(self->list);
-        Py_XDECREF(self->text);
+        Py_XDECREF(self->synstop);  /* synstop is NULL when no mapping was passed; Py_XDECREF tolerates NULL */
         PyMem_DEL(self);
 }
 
@@ -50,8 +72,8 @@
 {
         PyObject *item=NULL;
         if (i >= PyList_Size(self->list)) {
-            PyErr_SetString(PyExc_IndexError,"Splitter index out of range");
-            return NULL;
+                PyErr_SetString(PyExc_IndexError,"Splitter index out of range");
+                return NULL;
         }
 
         ASSIGN(item,PyList_GetItem(self->list , i));
@@ -166,19 +188,32 @@
 
 void splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
 {
-        PyObject *word;
+
+        PyObject *word,*synword;
         Py_UNICODE *s = doc->str;
         int len = doc->length;
         int inside_word=0;
         int i=0;
         int start=0;
 
+#ifdef DEBUG
+        puts("before List_New");
+        fflush(stdout);
+#endif
+      
         self->list = PyList_New(0);
 
+#ifdef DEBUG
+        puts("after List_New");
+        fflush(stdout);
+#endif
+
+
         do {
                 register Py_UNICODE ch;
 
                 ch = *s;
+                *s = Py_UNICODE_TOLOWER(ch);
 
 #ifdef DEBUG
                 printf("%d %c %d\n",i,ch,ch);
@@ -191,17 +226,21 @@
                         }
                 } else {
 
-                        if (!(Py_UNICODE_ISALNUM(ch) || ch=='/' || ch=='_' || ch=='-')) 
-                        {
+                        if (!(Py_UNICODE_ISALNUM(ch) || ch=='/' || ch=='_' || ch=='-')) {
                                 inside_word = 0;
 
                                 word = PySequence_GetSlice((PyObject *)doc,start,i);
-                                Py_INCREF(word);
                                 // Stem word
                                 if (PyUnicode_GET_SIZE(word)>MAX_WORD)
-                                    word = PySequence_GetSlice(word,0,MAX_WORD);
+                                        word = PySequence_GetSlice(word,0,MAX_WORD);
+
+                                synword = checkSynword(self,word);
+                                if (synword != Py_None) {
+                                        PyList_Append(self->list,synword);
+                                } else Py_DECREF(synword);
+
+                                Py_DECREF(word);
 
-                                PyList_Append(self->list,word);
                                 start =  0;
 #ifdef DEBUG
                                 PyObject_Print(word,stdout,0);
@@ -211,30 +250,36 @@
                 }
 
                 s++;
+
         } while(++i < len);
 
         if (inside_word) {
-            word = PySequence_GetSlice((PyObject *)doc,start,i);
+                word = PySequence_GetSlice((PyObject *)doc,start,i);
 
-            // Stem word
-            if (PyUnicode_GET_SIZE(word)>MAX_WORD)
-                word = PySequence_GetSlice(word,0,MAX_WORD);
+                // Stem word
+                if (PyUnicode_GET_SIZE(word)>MAX_WORD)
+                        word = PySequence_GetSlice(word,0,MAX_WORD);
+
+                synword = checkSynword(self,word);
+                if (synword != Py_None) {
+                        PyList_Append(self->list,synword);
+                } else Py_DECREF(synword);
 
-            Py_INCREF(word);
-            PyList_Append(self->list,word);
+                Py_DECREF(word);
         }
 
 #ifdef DEBUG
         PyObject_Print(self->list,stdout,0);
         fflush(stdout);
 #endif
+
 }
 
 static PyObject *
 get_Splitter(PyObject *modinfo, PyObject *args)
 {
         Splitter *self;
-        PyObject *doc, *synstop=NULL;
+        PyObject *doc, *unicodedoc,*synstop=NULL;
 
         UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
         UNLESS(PyArg_ParseTuple(args,"O|O",&doc,&synstop)) return NULL;
@@ -246,22 +291,42 @@
 #endif
 
         if (PyString_Check(doc)) {
-                doc = PyUnicode_FromObject(doc);
 
-        } else if( PyUnicode_Check(doc)) {}
-        else {
+#ifdef DEBUG
+                puts("got a string object");
+                fflush(stdout);
+#endif
+                unicodedoc = PyUnicode_FromObject(doc);
+
+        } else if( PyUnicode_Check(doc)) {
+#ifdef DEBUG
+                puts("got a unicode object");
+                fflush(stdout);
+
+#endif
+                unicodedoc = doc;
+        } else {
                 PyErr_SetString(PyExc_TypeError, "first argument is neither string nor unicode.");
                 return NULL;
         }
 
-
-        UNLESS(self->text = doc) goto err;
-        splitUnicodeString(self,(PyUnicodeObject *)doc);
-
         if (synstop) {
                 self->synstop = synstop;
                 Py_INCREF(synstop);
+        } else {
+                self->synstop=NULL;
         }
+
+#ifdef DEBUG
+        puts("before splitUnicodeString");
+        PyObject_Print(unicodedoc,stdout,0);
+#endif
+
+        splitUnicodeString(self,(PyUnicodeObject *)unicodedoc);
+
+#ifdef DEBUG
+        puts("after splitUnicodeString");
+#endif
 
         return (PyObject*)self;