[Checkins] SVN: zope.index/trunk/src/zope/index/text/htmlsplitter.py Clean up regex usage in HTMLSplitter.

Tres Seaver tseaver at palladion.com
Wed Jun 10 19:46:21 EDT 2009


Log message for revision 100820:
  Clean up regex usage in HTMLSplitter.

Changed:
  U   zope.index/trunk/src/zope/index/text/htmlsplitter.py

-=-
Modified: zope.index/trunk/src/zope/index/text/htmlsplitter.py
===================================================================
--- zope.index/trunk/src/zope/index/text/htmlsplitter.py	2009-06-10 23:37:59 UTC (rev 100819)
+++ zope.index/trunk/src/zope/index/text/htmlsplitter.py	2009-06-10 23:46:20 UTC (rev 100820)
@@ -21,24 +21,27 @@
 
 from zope.index.text.interfaces import ISplitter
 
+MARKUP = re.compile(r"(<[^<>]*>|&[A-Za-z]+;)")
+WORDS = re.compile(r"(?L)\w+")
+GLOBS = re.compile(r"(?L)\w+[\w*?]*")
+
 class HTMLWordSplitter(object):
 
     implements(ISplitter)
 
-    def process(self, text, wordpat=r"(?L)\w+"):
-        splat = []
-        for t in text:
-            splat += self._split(t, wordpat)
-        return splat
+    def process(self, text):
+        return self._apply(text, WORDS)
 
     def processGlob(self, text):
         # see Lexicon.globToWordIds()
-        return self.process(text, r"(?L)\w+[\w*?]*")
+        return self._apply(text, GLOBS)
 
-    def _split(self, text, wordpat):
-        text = text.lower()
-        remove = [r"<[^<>]*>",
-                  r"&[A-Za-z]+;"]
-        for pat in remove:
-            text = re.sub(pat, " ", text)
-        return re.findall(wordpat, text)
+    def _apply(self, text, pattern):
+        result = []
+        for chunk in text:
+            result.extend(self._split(chunk, pattern))
+        return result
+
+    def _split(self, text, pattern):
+        text = MARKUP.sub(' ', text.lower())  # a space, as before, keeps words on either side of markup apart
+        return pattern.findall(text)



More information about the Checkins mailing list