[CMF-checkins] CVS: CMF/CMFDefault - utils.py:

Tres Seaver tseaver@zope.com
Fri, 19 Jul 2002 19:04:27 -0400

Update of /cvs-repository/CMF/CMFDefault
In directory cvs.zope.org:/tmp/cvs-serv24640/CMFDefault

Modified Files:
      Tag: CMF-1_3-branch
Log Message:

  - CMFCore.utils:

    o Remove unused import.

  - CMFDefault.utils:

    o Remove string module usage.

    o Normalize whitespace, docstrings.

    o Add new "stripping" parser (not wired into Document et al. yet).

=== CMF/CMFDefault/utils.py 1.12 => ===
     Utility functions.
-from string import split, join, lstrip, lower, strip, capitalize
 from sgmllib import SGMLParser
 import re
 import os
 from Globals import package_home
 _dtmldir = os.path.join( package_home( globals() ), 'dtml' )
 def formatRFC822Headers( headers ):
-    """
-        Convert the key-value pairs in 'headers' to valid RFC822-style
+    """ Convert the key-value pairs in 'headers' to valid RFC822-style
         headers, including adding leading whitespace to elements which
         contain newlines in order to preserve continuation-line semantics.
@@ -20,14 +21,14 @@
     for key, value in headers:
         vallines = linesplit.split( value )
-        munged.append( '%s: %s' % ( key, join( vallines, '\r\n  ' ) ) )
+        munged.append( '%s: %s' % ( key, '\r\n  '.join( vallines ) ) )
-    return join( munged, '\r\n' )
+    return '\r\n'.join( munged )
-def parseHeadersBody( body, headers=None, rc=re.compile(r'\n|\r\n')):
-    """
-        Parse any leading 'RFC-822'-ish headers from an uploaded
+def parseHeadersBody( body, headers=None, rc=re.compile( r'\n|\r\n' ) ):
+    """ Parse any leading 'RFC-822'-ish headers from an uploaded
         document, returning a dictionary containing the headers
         and the stripped body.
@@ -70,122 +71,314 @@
         headers = headers.copy()
     hdrlist = []
     for line in lines:
-        if not strip(line):
+        if not line.strip():
-        tokens = split( line, ': ' )
+        tokens = line.split( ': ' )
         if len( tokens ) > 1:
-            hdrlist.append( ( tokens[0], join( tokens[1:], ': ' ) ) )
+            hdrlist.append( ( tokens[0], ': '.join( tokens[1:] ) ) )
         elif i == 0:
             return headers, body     # no headers, just return those passed in.
         else:    # continuation
             last, hdrlist = hdrlist[ -1 ], hdrlist[ :-1 ]
             hdrlist.append( ( last[ 0 ]
-                            , join( ( last[1], lstrip( line ) ), '\n' )
+                            , '\n'.join( ( last[1], line.lstrip() ) )
                             ) )
         i = i + 1
     for hdr in hdrlist:
         headers[ hdr[0] ] = hdr[ 1 ]
-    return headers, join( lines[ i+1: ], '\n' )
+    return headers, '\n'.join( lines[ i+1: ] )
 def semi_split(s):
-    return map(strip, split(s, ';'))
+    """ Split 's' on semicolons.
+    """
+    return map(lambda x: x.strip(), s.split( ';' ) )
 def comma_split(s):
-    return map(strip, split(s, ','))
-def seq_strip (seq, stripper=strip):
-    if type(seq) == type([]):
-        return map ( stripper, seq)
-    if type(seq) == type(()):
-        #seq1 = list(seq)
-        return tuple (map(stripper, seq))
-    raise ValueError, "%s of unsupported sequencetype %s" % (seq, type(seq))
-def tuplize( valueName, value, splitter=split ):
-    if type(value) == type(()): return seq_strip( value )
-    if type(value) == type([]): return seq_strip( tuple( value ))
-    if type(value) == type(''): return seq_strip( tuple( splitter( value ) ))
+    """ Split 's' on commas.
+    """
+    return map(lambda x: x.strip(), s.split( ',') )
+def seq_strip (seq, stripper=lambda x: x.strip() ):
+    """ Strip a sequence of strings.
+    """
+    if type( seq ) == type( [] ):
+        return map( stripper, seq )
+    if type( seq ) == type( () ):
+        return tuple( map( stripper, seq ) )
+    raise ValueError, "%s of unsupported sequencetype %s" % ( seq, type( seq ) )
+def tuplize( valueName, value, splitter=lambda x: x.strip() ):
+    if type( value ) == type( () ):
+        return seq_strip( value )
+    if type( value ) == type( [] ):
+        return seq_strip( tuple( value ) )
+    if type( value ) == type( '' ):
+        return seq_strip( tuple( splitter( value ) ) )
     raise ValueError, "%s of unsupported type" % valueName
-class SimpleHTMLParser(SGMLParser):
+class SimpleHTMLParser( SGMLParser ):
     #from htmlentitydefs import entitydefs
-    def __init__(self, verbose=0):
-        SGMLParser.__init__(self, verbose)
+    def __init__( self, verbose=0 ):
+        SGMLParser.__init__( self, verbose )
         self.savedata = None
         self.title = ''
         self.metatags = {}
         self.body = ''
-    def handle_data(self, data):
+    def handle_data( self, data ):
         if self.savedata is not None:
             self.savedata = self.savedata + data
-    def handle_charref(self, ref):
-        self.handle_data("&#%s;" % ref)
+    def handle_charref( self, ref ):
+        self.handle_data( "&#%s;" % ref )
+    def handle_entityref( self, ref ):
+        self.handle_data( "&%s;" % ref )
-    def handle_entityref(self, ref):
-        self.handle_data("&%s;" % ref)
+    def save_bgn( self ):
-    def save_bgn(self):
         self.savedata = ''
-    def save_end(self):
+    def save_end( self ):
         data = self.savedata
         self.savedata = None
         return data
-    def start_title(self, attrs):
+    def start_title( self, attrs ):
-    def end_title(self):
+    def end_title( self ):
         self.title = self.save_end()
-    def do_meta(self, attrs):
+    def do_meta( self, attrs ):
         name = ''
         content = ''
         for attrname, value in attrs:
-            value = strip(value)
-            if attrname == "name": name = capitalize(value)
-            if attrname == "content": content = value
+            value = value.strip()
+            if attrname == "name":
+                name = value.capitalize()
+            if attrname == "content":
+                content = value
         if name:
-            self.metatags[name] = content
+            self.metatags[ name ] = content
-    def unknown_startag(self, tag, attrs):
+    def unknown_startag( self, tag, attrs ):
-    def unknown_endtag(self, tag):
+    def unknown_endtag( self, tag ):
-_bodyre = re.compile(r'^\s*<html.*<body.*?>', re.DOTALL|re.I)
-_endbodyre = re.compile(r'</body', re.DOTALL|re.I)
+#   HTML cleaning code
+# These are the HTML tags that we will leave intact
+VALID_TAGS = { 'a'          : 1
+             , 'b'          : 1
+             , 'base'       : 1
+             , 'blockquote' : 1
+             , 'body'       : 1
+             , 'br'         : 1
+             , 'caption'    : 1
+             , 'cite'       : 1
+             , 'code'       : 1
+             , 'div'        : 1
+             , 'dl'         : 1
+             , 'dt'         : 1
+             , 'dd'         : 1
+             , 'em'         : 1
+             , 'h1'         : 1
+             , 'h2'         : 1
+             , 'h3'         : 1
+             , 'h4'         : 1
+             , 'h5'         : 1
+             , 'h6'         : 1
+             , 'head'       : 1
+             , 'hr'         : 1
+             , 'html'       : 1
+             , 'i'          : 1
+             , 'img'        : 1
+             , 'kbd'        : 1
+             , 'li'         : 1
+           # , 'link'       : 1   -- disabled: type="script" hoses us
+             , 'meta'       : 1
+             , 'ol'         : 1
+             , 'p'          : 1
+             , 'pre'        : 1
+             , 'span'       : 1
+             , 'strong'     : 1
+             , 'table'      : 1
+             , 'tbody'      : 1
+             , 'td'         : 1
+             , 'th'         : 1
+             , 'title'      : 1
+             , 'tr'         : 1
+             , 'tt'         : 1
+             , 'ul'         : 1
+             }
+NASTY_TAGS = { 'script'     : 1
+             , 'object'     : 1
+             , 'embed'      : 1
+             , 'applet'     : 1
+             }
+class IllegalHTML( ValueError ):
+    pass
+class StrippingParser( SGMLParser ):
+    """ Pass only allowed tags;  raise exception for known-bad.
+    """
+    from htmlentitydefs import entitydefs # replace entitydefs from sgmllib
+    def __init__( self ):
+        SGMLParser.__init__( self )
+        self.result = ""
+    def handle_data( self, data ):
+        if data:
+            self.result = self.result + data
+    def handle_charref( self, name ):
+        self.result = "%s&#%s;" % ( self.result, name )
+    def handle_entityref(self, name):
+        if self.entitydefs.has_key(name):
+            x = ';'
+        else:
+            # this breaks nonstandard entities that end with ';' (the ';' is dropped)
+            x = ''
+        self.result = "%s&%s%s" % (self.result, name, x)
+    def unknown_starttag(self, tag, attrs):
+        """ Delete all tags except for legal ones.
+        """
+        if VALID_TAGS.get( tag ):
+            self.result = self.result + '<' + tag
+            for k, v in attrs:
+                if k.lower().startswith( 'on' ):
+                    raise IllegalHTML, 'Javascipt event "%s" not allowed.' % k
+                if v.lower().startswith( 'javascript:' ):
+                    raise IllegalHTML, 'Javascipt URI "%s" not allowed.' % v
+                self.result = '%s %s="%s"' % (self.result, k, v)
+            endTag = '</%s>' % tag
+            self.result = self.result + '>'
+        elif NASTY_TAGS.get( tag ):
+            raise IllegalHTML, 'Dynamic tag "%s" not allowed.' % tag
+        else:
+            pass    # omit tag
+    def unknown_endtag(self, tag):
-def bodyfinder(text):
-    bod = _bodyre.search(text)
-    if not bod: return text
-    end = _endbodyre.search(text)
-    if not end: return text
-    else: return text[bod.end():end.start()]
-htfinder = re.compile(r'<html', re.DOTALL|re.I)
-def html_headcheck(html):
-    """ Returns 'true' if document looks HTML-ish enough """
+        if VALID_TAGS.get( tag ):
+            self.result = "%s</%s>" % (self.result, tag)
+            remTag = '</%s>' % tag
+def scrubHTML( html ):
+    """ Strip illegal HTML tags from string text.
+    """
+    parser = StrippingParser()
+    parser.feed( html )
+    parser.close()
+    return parser.result
+def isHTMLSafe( html ):
+    """ Would current HTML be permitted to be saved?
+    """
+    try:
+        scrubHTML( html )
+    except IllegalHTML:
+        return 0
+    else:
+        return 1
+_bodyre = re.compile( r'^\s*<html.*<body.*?>', re.DOTALL | re.I )
+_endbodyre = re.compile( r'</body', re.DOTALL | re.I )
+def bodyfinder( text ):
+    bod = _bodyre.search( text )
+    if not bod:
+        return text
+    end = _endbodyre.search( text )
+    if not end:
+        return text
+    else:
+        return text[bod.end():end.start()]
+htfinder = re.compile( r'<html', re.DOTALL | re.I )
+def html_headcheck( html ):
+    """ Return 'true' if document looks HTML-ish enough.
+    """
     if not htfinder.search(html):
         return 0
     lines = re.split(r'[\n\r]+?', html)
     for line in lines:
-        line = strip(line)
+        line = line.strip()
         if not line:
-        elif lower(line[:5]) == '<html':
+        elif line.lower().startswith( '<html' ):
             return 1
         elif line[0] != '<':
             return 0