Commit aaec4852 authored by Fred Drake

Add more tests to cover edge cases that are legal in either HTML (bare
pointy brackets & ampersands) or XHTML (hexadecimal character references),
but not both.  Also add a test for DOCTYPE declaration parsing.
parent cddf8940
...@@ -62,6 +62,9 @@ class EventCollector(HTMLParser.HTMLParser): ...@@ -62,6 +62,9 @@ class EventCollector(HTMLParser.HTMLParser):
def handle_pi(self, data): def handle_pi(self, data):
self.append(("pi", data)) self.append(("pi", data))
def unknown_decl(self, decl):
    """Record a declaration the parser did not recognise.

    Appends an ``("unknown decl", decl)`` tuple via ``self.append`` so the
    declaration text shows up in the collected event list.
    """
    event = ("unknown decl", decl)
    self.append(event)
class EventCollectorExtra(EventCollector): class EventCollectorExtra(EventCollector):
...@@ -117,6 +120,7 @@ class HTMLParserTestCase(TestCaseBase): ...@@ -117,6 +120,7 @@ class HTMLParserTestCase(TestCaseBase):
comment1b--> comment1b-->
<Img sRc='Bar' isMAP>sample <Img sRc='Bar' isMAP>sample
text text
&#x201C;
<!--comment2a-- --comment2b--> <!--comment2a-- --comment2b-->
</Html> </Html>
""", [ """, [
...@@ -131,13 +135,36 @@ text ...@@ -131,13 +135,36 @@ text
("data", "\n"), ("data", "\n"),
("starttag", "img", [("src", "Bar"), ("ismap", None)]), ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
("data", "sample\ntext\n"), ("data", "sample\ntext\n"),
("charref", "x201C"),
("data", "\n"),
("comment", "comment2a-- --comment2b"), ("comment", "comment2a-- --comment2b"),
("data", "\n"), ("data", "\n"),
("endtag", "html"), ("endtag", "html"),
("data", "\n"), ("data", "\n"),
]) ])
def check_doctype_decl(self):
    """Check that a DOCTYPE declaration yields a single "decl" event.

    The declaration carries a bracketed internal subset containing
    ELEMENT, ATTLIST, NOTATION and ENTITY declarations, a parameter
    entity reference and a comment.  The expected event payload is the
    exact text placed between "<!" and ">".
    """
    # Declaration body only; the "<!" and ">" delimiters are added below.
    inside = """\
DOCTYPE html [
<!ELEMENT html - O EMPTY>
<!ATTLIST html
version CDATA #IMPLIED
profile CDATA 'DublinCore'>
<!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
<!ENTITY myEntity 'internal parsed entity'>
<!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
<!ENTITY % paramEntity 'name|name|name'>
%paramEntity;
<!-- comment -->
]"""
    # Wrap the body in "<!" ... ">" and expect it back as one "decl" event.
    self._run_check("<!%s>" % inside, [
        ("decl", inside),
    ])
def check_bad_nesting(self): def check_bad_nesting(self):
# Strangely, this *is* supposed to test that overlapping
# elements are allowed. HTMLParser is more geared toward
# lexing the input than parsing the structure.
self._run_check("<a><b></a></b>", [ self._run_check("<a><b></a></b>", [
("starttag", "a", []), ("starttag", "a", []),
("starttag", "b", []), ("starttag", "b", []),
...@@ -145,6 +172,16 @@ text ...@@ -145,6 +172,16 @@ text
("endtag", "b"), ("endtag", "b"),
]) ])
def check_bare_ampersands(self):
    """Check that bare ampersands are reported as ordinary data.

    The input contains "&" characters that do not start an entity or
    character reference; the expected result is one "data" event holding
    the input text unchanged.
    """
    text = "this text & contains & ampersands &"
    expected_events = [("data", text)]
    self._run_check(text, expected_events)
def check_bare_pointy_brackets(self):
    """Check that bare "<" and ">" characters are reported as data.

    Every "<" in the input is followed by a space, so none of them opens
    a tag; the expected result is one "data" event holding the input
    text unchanged.
    """
    text = "this < text > contains < bare>pointy< brackets"
    expected_events = [("data", text)]
    self._run_check(text, expected_events)
def check_attr_syntax(self): def check_attr_syntax(self):
output = [ output = [
("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)]) ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
...@@ -174,6 +211,14 @@ text ...@@ -174,6 +211,14 @@ text
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
]) ])
def check_illegal_declarations(self):
    """Check that an unrecognised "<!...>" construct is reported, not lost.

    The "<!spacer ...>" markup sits between two runs of text.  The
    expected events are the leading data, an "unknown decl" event carrying
    the text between "<!" and ">", and then the trailing data.
    """
    markup = 'abc<!spacer type="block" height="25">def'
    expected_events = [
        ("data", "abc"),
        ("unknown decl", 'spacer type="block" height="25"'),
        ("data", "def"),
    ]
    self._run_check(markup, expected_events)
def check_starttag_end_boundary(self): def check_starttag_end_boundary(self):
self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])]) self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])]) self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])
...@@ -196,17 +241,12 @@ text ...@@ -196,17 +241,12 @@ text
self._run_check(["<a b='>'", ">"], output) self._run_check(["<a b='>'", ">"], output)
def check_starttag_junk_chars(self): def check_starttag_junk_chars(self):
self._parse_error("<")
self._parse_error("<>")
self._parse_error("</>") self._parse_error("</>")
self._parse_error("</$>") self._parse_error("</$>")
self._parse_error("</") self._parse_error("</")
self._parse_error("</a") self._parse_error("</a")
self._parse_error("</a")
self._parse_error("<a<a>") self._parse_error("<a<a>")
self._parse_error("</a<a>") self._parse_error("</a<a>")
self._parse_error("<$")
self._parse_error("<$>")
self._parse_error("<!") self._parse_error("<!")
self._parse_error("<a $>") self._parse_error("<a $>")
self._parse_error("<a") self._parse_error("<a")
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment