[ZPT] CVS: Packages/TAL - HTMLParser.py:1.12

fred@digicool.com fred@digicool.com
Fri, 6 Apr 2001 16:09:42 -0400 (EDT)


Update of /cvs-repository/Packages/TAL
In directory korak:/tmp/cvs-serv19345

Modified Files:
	HTMLParser.py 
Log Message:
locatestarttagend:  Completely rewrite the expression so as to be much
    more strict about matching only what's legal.  The expression ends
    up being a bit more complex, and needs additional checks to be
    done on what follows.

HTMLParser.check_for_whole_start_tag():  Helper method that uses
    locatestarttagend, performs the required additional checks, and
    determines whether we've actually found the end of the start tag,
    are at a buffer boundary, or have encountered a syntactical
    error.


HTMLParser.parse_starttag():  Use check_for_whole_start_tag() to see
    if we really have the start tag.

HTMLParseError.__init__():  Simplify assertion.


This should close ZPT(18).



--- Updated File HTMLParser.py in package Packages/TAL --
--- HTMLParser.py	2001/03/26 16:48:32	1.11
+++ HTMLParser.py	2001/04/06 20:09:40	1.12
@@ -32,7 +32,20 @@
     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
     r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')
 
-locatestarttagend = re.compile("('[^']*'|\"[^\"]*\"|[^'\">]+)*/?>")
+locatestarttagend = re.compile(r"""
+  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
+  (?:\s+                             # whitespace before attribute name
+    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
+      (?:\s*=\s*                     # value indicator
+        (?:'[^']*'                   # LITA-enclosed value
+          |\"[^\"]*\"                # LIT-enclosed value
+          |[^'\">\s]+                # bare value
+         )
+       )?
+     )
+   )*
+  \s*                                # trailing whitespace
+""", re.VERBOSE)
 endstarttag = re.compile(r"\s*/?>")
 endendtag = re.compile('>')
 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
@@ -45,7 +58,7 @@
     """Exception raised for all parse errors."""
 
     def __init__(self, msg, position=(None, None)):
-        assert msg != ""
+        assert msg
         self.msg = msg
         self.lineno = position[0]
         self.offset = position[1]
@@ -255,11 +268,10 @@
     # Internal -- handle starttag, return end or -1 if not terminated
     def parse_starttag(self, i):
         self.__starttag_text = None
+        endpos = self.check_for_whole_start_tag(i)
+        if endpos < 0:
+            return endpos
         rawdata = self.rawdata
-        m = locatestarttagend.match(rawdata, i) # > outside quotes
-        if not m:
-            return -1
-        endpos = m.end()
         self.__starttag_text = rawdata[i:endpos]
 
         # Now parse the data between i+1 and j into a tag and attrs
@@ -301,6 +313,29 @@
         else:
             self.handle_starttag(tag, attrs)
         return endpos
+
+    # Internal -- check to see if we have a complete starttag; return end
+    # or -1 if incomplete.
+    def check_for_whole_start_tag(self, i):
+        rawdata = self.rawdata
+        m = locatestarttagend.match(rawdata, i)
+        if m:
+            j = m.end()
+            next = rawdata[j:j+1]
+            if next == ">":
+                return j + 1
+            if rawdata[j:j+2] == "/>":
+                return j + 2
+            if next == "":
+                # end of input
+                return -1
+            if next in ("abcdefghijklmnopqrstuvwxyz="
+                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
+                # end of input in or before attribute value
+                return -1
+            self.updatepos(i, j)
+            raise HTMLParseError("malformed start tag", self.getpos())
+        raise AssertionError("we should not gt here!")
 
     # Internal -- parse endtag, return end or -1 if incomplete
     def parse_endtag(self, i):