Move several comments to become docstrings.

Added reasonable parsing of the DOCTYPE declaration, fixed edge cases regarding bare ampersands in content.

Move several comments to become docstrings.
Added reasonable parsing of the DOCTYPE declaration, fixed edge cases regarding bare ampersands in content.
77622d13 · Fred Drake · aaec4852 · 77622d13
Commit 77622d13 authored Sep 04, 2001 by Fred Drake
Hide whitespace changes
Inline Side-by-side

Showing with 297 additions and 45 deletions

lib/python/TAL/HTMLParser.py lib/python/TAL/HTMLParser.py +297 -45

No files found.
--- a/lib/python/TAL/HTMLParser.py
+++ b/lib/python/TAL/HTMLParser.py
-"""A parser for HTML."""
+"""A parser for HTML and XHTML."""

 # This file is derived from sgmllib.py, which is part of Python.

@@ -15,10 +15,10 @@ import string

 interesting_normal = re.compile('[&<]')
 interesting_cdata = re.compile(r'<(/|\Z)')
-incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?')
+incomplete = re.compile('&[a-zA-Z#]')

 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
-charref = re.compile('&#([0-9]+)[^0-9]')
+charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

 starttagopen = re.compile('<[a-zA-Z]')
 piopen = re.compile(r'<\?')
@@ -73,32 +73,35 @@ class HTMLParseError(Exception):
        return result


-# HTML parser class -- find tags and call handler functions.
-# Usage:
-#
-#     p = HTMLParser(); p.feed(data); ...; p.close()
-
-# Start tags are handled by calling self.handle_starttag() or
-# self.handle_startendtag(); end tags by self.handle_endtag().  The
-# data between tags is passed from the parser to the derived class by
-# calling self.handle_data() with the data as argument (the data may
-# be split up in arbitrary chunks).  Entity references are passed by
-# calling self.handle_entityref() with the entity reference as the
-# argument.  Numeric character references are passed to
-# self.handle_charref() with the string containing the reference as
-# the argument.
-
 class HTMLParser:
+    """Find tags and other markup and call handler functions.
+
+    Usage:
+        p = HTMLParser()
+        p.feed(data)
+        ...
+        p.close()
+
+    Start tags are handled by calling self.handle_starttag() or
+    self.handle_startendtag(); end tags by self.handle_endtag().  The
+    data between tags is passed from the parser to the derived class
+    by calling self.handle_data() with the data as argument (the data
+    may be split up in arbitrary chunks).  Entity references are
+    passed by calling self.handle_entityref() with the entity
+    reference as the argument.  Numeric character references are
+    passed to self.handle_charref() with the string containing the
+    reference as the argument.
+    """

    CDATA_CONTENT_ELEMENTS = ("script", "style")


-    # Interface -- initialize and reset this instance
    def __init__(self):
+        """Initialize and reset this instance."""
        self.reset()

-    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
+        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
@@ -106,16 +109,17 @@ class HTMLParser:
        self.offset = 0
        self.interesting = interesting_normal

-    # Interface -- feed some data to the parser.  Call this as
-    # often as you want, with as little or as much text as you
-    # want (may include '\n').  (This just saves the text, all the
-    # processing is done by goahead().)
    def feed(self, data):
+        """Feed data to the parser.
+
+        Call this as often as you want, with as little or as much text
+        as you want (may include '\n').
+        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

-    # Interface -- handle the remaining data
    def close(self):
+        """Handle any buffered data."""
        self.goahead(1)

    # Internal -- update line number and offset.  This should be
@@ -135,14 +139,14 @@ class HTMLParser:
            self.offset = self.offset + j-i
        return j

-    # Interface -- return current line number and offset.
    def getpos(self):
+        """Return current line number and offset."""
        return self.lineno, self.offset

    __starttag_text = None

-    # Interface -- return full source of start tag: "<...>"
    def get_starttag_text(self):
+        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
@@ -180,45 +184,56 @@ class HTMLParser:
                    k = self.parse_pi(i)
                elif declopen.match(rawdata, i): # <!
                    k = self.parse_declaration(i)
+                elif (i + 1) < n:
+                    self.handle_data("<")
+                    k = i + 1
                else:
-                    if i < n-1:
-                        raise HTMLParseError(
-                            "invalid '<' construct: %s" % `rawdata[i:i+2]`,
-                            self.getpos())
-                    k = -1
+                    break
                if k < 0:
                    if end:
                        raise HTMLParseError("EOF in middle of construct",
                                             self.getpos())
                    break
                i = self.updatepos(i, k)
-            elif rawdata[i] == '&':
+            elif rawdata[i:i+2] == "&#":
                match = charref.match(rawdata, i)
                if match:
-                    name = match.group(1)
+                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if rawdata[k-1] != ';':
-                        k = k-1
+                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
+                else:
+                    break
+            elif rawdata[i] == '&':
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if rawdata[k-1] != ';':
-                        k = k-1
+                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
-                if incomplete.match(rawdata, i):
-                    if end:
+                match = incomplete.match(rawdata, i)
+                if match:
+                    # match.group() will contain at least 2 chars
+                    rest = rawdata[i:]
+                    if end and match.group() == rest:
                        raise HTMLParseError(
                            "EOF in middle of entity or char ref",
                            self.getpos())
-                    return -1 # incomplete
-                raise HTMLParseError("'&' not part of entity or char ref",
-                                     self.getpos())
+                    # incomplete
+                    break
+                elif (i + 1) < n:
+                    # not the end of the buffer, and can't be confused
+                    # with some other construct
+                    self.handle_data("&")
+                    i = self.updatepos(i, i + 1)
+                else:
+                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
@@ -228,14 +243,15 @@ class HTMLParser:
        self.rawdata = rawdata[i:]

    # Internal -- parse comment, return end or -1 if not terminated
-    def parse_comment(self, i):
+    def parse_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
-        j = match.start()
-        self.handle_comment(rawdata[i+4: j])
+        if report:
+            j = match.start()
+            self.handle_comment(rawdata[i+4: j])
        j = match.end()
        return j

@@ -253,11 +269,17 @@ class HTMLParser:
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
+        decltype = None
+        extrachars = ""
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
-                self.handle_decl(rawdata[i+2:j])
+                data = rawdata[i+2:j]
+                if decltype == "doctype":
+                    self.handle_decl(data)
+                else:
+                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = declstringlit.match(rawdata, j)
@@ -269,12 +291,242 @@ class HTMLParser:
                if not m:
                    return -1 # incomplete
                j = m.end()
+                if decltype is None:
+                    decltype = m.group(0).rstrip().lower()
+                    if decltype != "doctype":
+                        extrachars = "="
+            elif c == "[" and decltype == "doctype":
+                j = self.parse_doctype_subset(j + 1, i)
+                if j < 0:
+                    return j
+            elif c in extrachars:
+                j = j + 1
+                while j < n and rawdata[j] in string.whitespace:
+                    j = j + 1
+                if j == n:
+                    # end of buffer while in declaration
+                    return -1
            else:
                raise HTMLParseError(
                    "unexpected char in declaration: %s" % `rawdata[j]`,
                    self.getpos())
+            decltype = decltype or ''
        return -1 # incomplete

+    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
+    # returning the index just past any whitespace following the trailing ']'.
+    def parse_doctype_subset(self, i, declstartpos):
+        """Scan the internal subset of a <!DOCTYPE ... [ ... ]> declaration.
+
+        i is the index just past the opening '['; declstartpos is the
+        index where the enclosing declaration starts and is only used
+        to update the reported position before an error is raised.
+
+        Returns the index of the '>' found after the closing ']' (and
+        any whitespace following it), or -1 if the buffer ends before
+        the subset is complete.  Raises HTMLParseError for markup that
+        is not accepted inside the subset.
+        """
+        rawdata = self.rawdata
+        n = len(rawdata)
+        j = i
+        while j < n:
+            c = rawdata[j]
+            if c == "<":
+                s = rawdata[j:j+2]
+                if s == "<":
+                    # end of buffer; incomplete
+                    return -1
+                if s != "<!":
+                    self.updatepos(declstartpos, j + 1)
+                    raise HTMLParseError("unexpected char in internal subset",
+                                         self.getpos())
+                if (j + 4) > n:
+                    # too little data to tell "<!--" from a declaration
+                    # keyword; incomplete
+                    return -1
+                if rawdata[j:j+4] == "<!--":
+                    # comments inside the subset are skipped, not reported
+                    j = self.parse_comment(j, report=0)
+                    if j < 0:
+                        return j
+                    continue
+                name, j = self.scan_name(j + 2, declstartpos)
+                if j == -1:
+                    return -1
+                if name not in ("attlist", "element", "entity", "notation"):
+                    self.updatepos(declstartpos, j + 2)
+                    raise HTMLParseError(
+                        "unknown declaration %s in internal subset" % `name`,
+                        self.getpos())
+                # handle the individual names
+                meth = getattr(self, "parse_doctype_" + name)
+                j = meth(j, declstartpos)
+                if j < 0:
+                    return j
+            elif c == "%":
+                # parameter entity reference
+                if (j + 1) == n:
+                    # end of buffer; incomplete
+                    return -1
+                m = declname.match(rawdata, j + 1)
+                if not m:
+                    # '%' is not followed by a name; report it as a parse
+                    # error instead of failing on m.group() below
+                    self.updatepos(declstartpos, j + 1)
+                    raise HTMLParseError("expected name token",
+                                         self.getpos())
+                s = m.group()
+                if s == rawdata[j+1:]:
+                    # the name runs to the end of the buffer; incomplete
+                    return -1
+                j = j + 1 + len(s.rstrip())
+                if rawdata[j] == ";":
+                    j = j + 1
+            elif c == "]":
+                j = j + 1
+                while j < n and rawdata[j] in string.whitespace:
+                    j = j + 1
+                if j < n:
+                    if rawdata[j] == ">":
+                        return j
+                    self.updatepos(declstartpos, j)
+                    raise HTMLParseError(
+                        "unexpected char after internal subset",
+                        self.getpos())
+                else:
+                    return -1
+            elif c in string.whitespace:
+                j = j + 1
+            else:
+                self.updatepos(declstartpos, j)
+                raise HTMLParseError("unexpected char in internal subset",
+                                     self.getpos())
+        # end of buffer reached
+        return -1
+
+    def parse_doctype_element(self, i, declstartpos):
+        """Skip an <!ELEMENT ...> declaration inside the internal subset.
+
+        i is the index at which the element name is scanned;
+        declstartpos is passed through to scan_name() for error
+        reporting.  Returns the index just past the closing '>', or -1
+        if the name or the '>' is not in the buffer yet.
+        """
+        rawdata = self.rawdata
+        name, j = self.scan_name(i, declstartpos)
+        if j == -1:
+            return -1
+        # the content model is not parsed; just skip until '>'
+        k = rawdata.find(">", j)
+        if k < 0:
+            return -1
+        return k + 1
+
+    def parse_doctype_attlist(self, i, declstartpos):
+        """Skip an <!ATTLIST ...> declaration inside the internal subset.
+
+        i is the index at which the element name is scanned;
+        declstartpos is passed through to scan_name() for error
+        reporting.  Returns the index just past the closing '>', or -1
+        if the declaration is not complete in the buffer.
+        """
+        rawdata = self.rawdata
+        name, j = self.scan_name(i, declstartpos)
+        if j < 0:
+            return j
+        c = rawdata[j:j+1]
+        if c == "":
+            return -1
+        if c == ">":
+            return j + 1
+        while 1:
+            # scan a series of attribute descriptions; simplified:
+            #   name type [value] [#constraint]
+            name, j = self.scan_name(j, declstartpos)
+            if j < 0:
+                return j
+            c = rawdata[j:j+1]
+            if c == "":
+                return -1
+            if c == "(":
+                # an enumerated type; look for ')'
+                k = rawdata.find(")", j)
+                if k < 0:
+                    return -1
+                j = k + 1
+                # use isspace() rather than "in string.whitespace": the
+                # empty slice at the end of the buffer must stop the loop
+                while rawdata[j:j+1].isspace():
+                    j = j + 1
+                if not rawdata[j:]:
+                    # end of buffer, incomplete
+                    return -1
+            else:
+                name, j = self.scan_name(j, declstartpos)
+                if j < 0:
+                    return j
+            c = rawdata[j:j+1]
+            if not c:
+                return -1
+            if c in "'\"":
+                # default value given as a quoted string literal
+                m = declstringlit.match(rawdata, j)
+                if m:
+                    j = m.end()
+                else:
+                    return -1
+                c = rawdata[j:j+1]
+                if not c:
+                    return -1
+            if c == "#":
+                if rawdata[j:] == "#":
+                    # end of buffer
+                    return -1
+                name, j = self.scan_name(j + 1, declstartpos)
+                if j < 0:
+                    return j
+                c = rawdata[j:j+1]
+                if not c:
+                    return -1
+            if c == '>':
+                # all done
+                return j + 1
+                
+    def parse_doctype_notation(self, i, declstartpos):
+        """Skip a <!NOTATION ...> declaration inside the internal subset.
+
+        Scans a name at index i, then any mix of names and quoted
+        string literals up to the closing '>'.  Returns the index just
+        past that '>', or -1 if the buffer ends first.
+        """
+        name, pos = self.scan_name(i, declstartpos)
+        if pos < 0:
+            return pos
+        rawdata = self.rawdata
+        while 1:
+            ch = rawdata[pos:pos+1]
+            if not ch:
+                # ran out of data before the closing '>'
+                return -1
+            if ch == '>':
+                return pos + 1
+            if ch not in "'\"":
+                # anything that is not a quote must be a name token
+                name, pos = self.scan_name(pos, declstartpos)
+                if pos < 0:
+                    return pos
+                continue
+            literal = declstringlit.match(rawdata, pos)
+            if not literal:
+                return -1
+            pos = literal.end()
+
+    def parse_doctype_entity(self, i, declstartpos):
+        """Skip an <!ENTITY ...> declaration inside the internal subset.
+
+        An optional leading '%' and the whitespace after it are
+        skipped, then a name is scanned, followed by any mix of names
+        and quoted string literals up to the closing '>'.  Returns the
+        index just past that '>', or -1 if the buffer ends first.
+        """
+        rawdata = self.rawdata
+        pos = i
+        if rawdata[i:i+1] == "%":
+            # step over the '%' and any whitespace that follows it
+            pos = i + 1
+            while 1:
+                ch = rawdata[pos:pos+1]
+                if not ch:
+                    return -1
+                if ch not in string.whitespace:
+                    break
+                pos = pos + 1
+        name, pos = self.scan_name(pos, declstartpos)
+        if pos < 0:
+            return pos
+        while 1:
+            ch = rawdata[pos:pos+1]
+            if not ch:
+                return -1
+            if ch == ">":
+                return pos + 1
+            if ch in "'\"":
+                literal = declstringlit.match(rawdata, pos)
+                if not literal:
+                    return -1    # incomplete
+                pos = literal.end()
+            else:
+                name, pos = self.scan_name(pos, declstartpos)
+                if pos < 0:
+                    return pos
+
+    def scan_name(self, i, declstartpos):
+        """Scan a name token at index i of the buffer.
+
+        Returns (name, end): the matched text stripped of surrounding
+        whitespace and lowercased, and the index where the match ends.
+        Returns (None, -1) when i is at the end of the buffer or the
+        match extends to the end of the buffer.  Raises HTMLParseError
+        if no name matches at i; declstartpos is used to update the
+        reported position first.
+        """
+        rawdata = self.rawdata
+        buflen = len(rawdata)
+        if i == buflen:
+            return None, -1
+        match = declname.match(rawdata, i)
+        if not match:
+            self.updatepos(declstartpos, i)
+            raise HTMLParseError("expected name token", self.getpos())
+        text = match.group()
+        if (i + len(text)) == buflen:
+            return None, -1  # end of buffer
+        return text.strip().lower(), match.end()
+
    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata