[Zope-CVS] CVS: Products/ZCTextIndex - HTMLSplitter.py:1.9

Guido van Rossum guido@python.org
Wed, 22 May 2002 16:06:55 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv9545

Modified Files:
	HTMLSplitter.py 
Log Message:
Get rid of the unused HTMLSplitter class (it's too simple).
Add glob support to the HTMLWordSplitter class.


=== Products/ZCTextIndex/HTMLSplitter.py 1.8 => 1.9 ===
 import re
 
-class HTMLSplitter:
-
-    __implements__ = ISplitter
-
-    def process(self, text):
-        return re.sub('<[^>]*>', ' ', text).split()
-
 class HTMLWordSplitter:
 
     __implements__ = ISplitter
 
-    def process(self, text):
+    def process(self, text, wordpat=r"\w+"):
         splat = []
         for t in text:
-            splat += self._split(t)
+            splat += self._split(t, wordpat)
         return splat
 
-    def _split(self, text):
+    def processGlob(self, text):
+        return self.process(text, r"\w+[\w*?]*") # see Lexicon.globToWordIds()
+
+    def _split(self, text, wordpat):
         text = text.lower()
-        remove = ["<[^>]*>",
-                  "&[A-Za-z]+;",
-                  "\W+"]
+        remove = [r"<[^<>]*>",
+                  r"&[A-Za-z]+;"]
         for pat in remove:
             text = re.sub(pat, " ", text)
-        rx = re.compile("[A-Za-z]")
-        return [word for word in text.split()
-                if len(word) > 1 and rx.search(word)]
+        return re.findall(wordpat, text)
                 
 element_factory.registerFactory('Word Splitter', 
                                 'HTML aware splitter',