Backport the type-sniffing code from Zope 3 to support XML page templates
from the filesystem.

Backport the type-sniffing code from Zope 3 to support XML page templates
from the filesystem.
fe148408 · Fred Drake · 9a595cb2 · fe148408 · fe148408
Commit fe148408 authored Mar 21, 2003 by Fred Drake
2 changed files
--- a/lib/python/Products/PageTemplates/PageTemplateFile.py
+++ b/lib/python/Products/PageTemplates/PageTemplateFile.py
@@ -15,7 +15,7 @@
 Zope object encapsulating a Page Template from the filesystem.
 """

-__version__='$Revision: 1.23 $'[11:-2]
+__version__='$Revision: 1.24 $'[11:-2]

 import os, AccessControl, Acquisition, sys
 from Globals import package_home, DevelopmentMode
@@ -117,7 +117,12 @@ class PageTemplateFile(Script, PageTemplate, Traversable):
            mtime = 0
        if self._v_program is not None and mtime == self._v_last_read:
            return
-        self.pt_edit(open(self.filename), None)
+        f = open(self.filename, "rb")
+        try:
+            text = f.read()
+        finally:
+            f.close()
+        self.pt_edit(text, sniff_type(text))
        self._cook()
        if self._v_errors:
            LOG('PageTemplateFile', ERROR, 'Error in template',
@@ -154,3 +159,19 @@ class PageTemplateFile(Script, PageTemplate, Traversable):
        from ZODB.POSException import StorageError
        raise StorageError, ("Instance of AntiPersistent class %s "
                             "cannot be stored." % self.__class__.__name__)
+
+
+# Leading byte sequences of an "<?xml" declaration in the encodings that
+# sniff_type() recognizes; the trailing comment on each entry names its encoding.
+XML_PREFIXES = [
+    "<?xml",                      # ascii, utf-8
+    "\xef\xbb\xbf<?xml",          # utf-8 w/ byte order mark
+    "\0<\0?\0x\0m\0l",            # utf-16 big endian
+    "<\0?\0x\0m\0l\0",            # utf-16 little endian
+    "\xfe\xff\0<\0?\0x\0m\0l",    # utf-16 big endian w/ byte order mark
+    "\xff\xfe<\0?\0x\0m\0l\0",    # utf-16 little endian w/ byte order mark
+    ]
+
+def sniff_type(text):
+    """Guess the content type of raw template text.
+
+    Returns "text/xml" when text starts with one of the entries in
+    XML_PREFIXES, and None otherwise.
+    """
+    matches = [prefix for prefix in XML_PREFIXES if text.startswith(prefix)]
+    if matches:
+        return "text/xml"
+    return None
--- a/lib/python/Products/PageTemplates/tests/test_ptfile.py
+++ b/lib/python/Products/PageTemplates/tests/test_ptfile.py
+"""Tests of PageTemplateFile."""
+
+import os
+import tempfile
+import unittest
+
+from Products.PageTemplates.PageTemplateFile import PageTemplateFile
+
+
+class TypeSniffingTestCase(unittest.TestCase):
+    """Check the content type PageTemplateFile reports for files on disk.
+
+    Each test writes raw bytes to a scratch file, loads that file as a
+    PageTemplateFile and compares the resulting content_type.
+    """
+
+    # Path of the scratch file shared by every test; removed in tearDown().
+    TEMPFILENAME = tempfile.mktemp()
+
+    def tearDown(self):
+        """Remove the scratch file if a test created it."""
+        if os.path.exists(self.TEMPFILENAME):
+            os.unlink(self.TEMPFILENAME)
+
+    def check_content_type(self, text, expected_type):
+        """Write text to the scratch file and assert the sniffed type.
+
+        text is the raw file content; expected_type is the value that
+        pt.content_type must equal after the template has been read.
+        """
+        f = open(self.TEMPFILENAME, "wb")
+        try:
+            f.write(text)
+        finally:
+            # Always release the handle, even if the write fails, so that
+            # tearDown() can unlink the file.
+            f.close()
+        pt = PageTemplateFile(self.TEMPFILENAME)
+        pt.read()
+        self.assertEqual(pt.content_type, expected_type)
+
+    def test_sniffer_xml_ascii(self):
+        self.check_content_type(
+            "<?xml version='1.0' encoding='ascii'?><doc/>",
+            "text/xml")
+        self.check_content_type(
+            "<?xml\tversion='1.0' encoding='ascii'?><doc/>",
+            "text/xml")
+
+    def test_sniffer_xml_utf8(self):
+        # w/out byte order mark
+        self.check_content_type(
+            "<?xml version='1.0' encoding='utf-8'?><doc/>",
+            "text/xml")
+        self.check_content_type(
+            "<?xml\tversion='1.0' encoding='utf-8'?><doc/>",
+            "text/xml")
+        # with byte order mark
+        self.check_content_type(
+            "\xef\xbb\xbf<?xml version='1.0' encoding='utf-8'?><doc/>",
+            "text/xml")
+        self.check_content_type(
+            "\xef\xbb\xbf<?xml\tversion='1.0' encoding='utf-8'?><doc/>",
+            "text/xml")
+
+    # NOTE on the UTF-16 fixtures below: a NUL byte followed by an octal
+    # digit has to be spelled with a full three-digit escape (NUL + "1" is
+    # written \0001, NUL + "0" is written \0000).  The shorter \01 would be
+    # the single byte 0x01 and would leave the sample one byte short.
+
+    def test_sniffer_xml_utf16_be(self):
+        # w/out byte order mark
+        self.check_content_type(
+            "\0<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'"
+            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
+            "\0<\0d\0o\0c\0/\0>",
+            "text/xml")
+        self.check_content_type(
+            "\0<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'"
+            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
+            "\0<\0d\0o\0c\0/\0>",
+            "text/xml")
+        # with byte order mark
+        self.check_content_type(
+            "\xfe\xff"
+            "\0<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'"
+            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
+            "\0<\0d\0o\0c\0/\0>",
+            "text/xml")
+        self.check_content_type(
+            "\xfe\xff"
+            "\0<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'"
+            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
+            "\0<\0d\0o\0c\0/\0>",
+            "text/xml")
+
+    def test_sniffer_xml_utf16_le(self):
+        # w/out byte order mark
+        self.check_content_type(
+            "<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'\0"
+            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
+            "<\0d\0o\0c\0/\0>\0",
+            "text/xml")
+        self.check_content_type(
+            "<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'\0"
+            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
+            "<\0d\0o\0c\0/\0>\0",
+            "text/xml")
+        # with byte order mark
+        self.check_content_type(
+            "\xff\xfe"
+            "<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'\0"
+            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
+            "<\0d\0o\0c\0/\0>\0",
+            "text/xml")
+        self.check_content_type(
+            "\xff\xfe"
+            "<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'\0"
+            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
+            "<\0d\0o\0c\0/\0>\0",
+            "text/xml")
+
+    # Identifiers for the HTML 4.01 Transitional DTD used by the HTML tests.
+    # HTML_PUBLIC_ID is not referenced by any test in this class.
+    HTML_PUBLIC_ID = "-//W3C//DTD HTML 4.01 Transitional//EN"
+    HTML_SYSTEM_ID = "http://www.w3.org/TR/html4/loose.dtd"
+
+    def test_sniffer_html_ascii(self):
+        self.check_content_type(
+            "<!DOCTYPE html [ SYSTEM '%s' ]><html></html>"
+            % self.HTML_SYSTEM_ID,
+            "text/html")
+        self.check_content_type(
+            "<html><head><title>sample document</title></head></html>",
+            "text/html")
+
+    # XXX This reflects a case that simply isn't handled by the
+    # sniffer; there are many, but it gets it right more often than
+    # before.  The "donttest" prefix keeps the test loader from
+    # collecting it.
+    def donttest_sniffer_xml_simple(self):
+        self.check_content_type("<doc><element/></doc>",
+                                "text/xml")
+
+
+def test_suite():
+    """Build the suite holding every test method of TypeSniffingTestCase."""
+    loader = unittest.TestLoader()
+    return loader.loadTestsFromTestCase(TypeSniffingTestCase)
+
+# When run as a script, execute the suite returned by test_suite().
+if __name__ == "__main__":
+    unittest.main(defaultTest="test_suite")