[Checkins] SVN: zope.index/trunk/src/zope/index/text/htmlsplitter.py Clean up regex usage in HTMLSplitter.
Tres Seaver
tseaver at palladion.com
Wed Jun 10 19:46:21 EDT 2009
Log message for revision 100820:
Clean up regex usage in HTMLSplitter.
Changed:
U zope.index/trunk/src/zope/index/text/htmlsplitter.py
-=-
Modified: zope.index/trunk/src/zope/index/text/htmlsplitter.py
===================================================================
--- zope.index/trunk/src/zope/index/text/htmlsplitter.py 2009-06-10 23:37:59 UTC (rev 100819)
+++ zope.index/trunk/src/zope/index/text/htmlsplitter.py 2009-06-10 23:46:20 UTC (rev 100820)
@@ -21,24 +21,27 @@
from zope.index.text.interfaces import ISplitter
+MARKUP = re.compile(r"(<[^<>]*>|&[A-Za-z]+;)")
+WORDS = re.compile(r"(?L)\w+")
+GLOBS = re.compile(r"(?L)\w+[\w*?]*")
+
class HTMLWordSplitter(object):
implements(ISplitter)
- def process(self, text, wordpat=r"(?L)\w+"):
- splat = []
- for t in text:
- splat += self._split(t, wordpat)
- return splat
+ def process(self, text):
+ return self._apply(text, WORDS)
def processGlob(self, text):
# see Lexicon.globToWordIds()
- return self.process(text, r"(?L)\w+[\w*?]*")
+ return self._apply(text, GLOBS)
- def _split(self, text, wordpat):
- text = text.lower()
- remove = [r"<[^<>]*>",
- r"&[A-Za-z]+;"]
- for pat in remove:
- text = re.sub(pat, " ", text)
- return re.findall(wordpat, text)
+ def _apply(self, text, pattern):
+ result = []
+ for chunk in text:
+ result.extend(self._split(chunk, pattern))
+ return result
+
+ def _split(self, text, pattern):
+ text = MARKUP.sub('', text.lower())
+ return pattern.findall(text)
More information about the Checkins mailing list