Commit e7fee233 authored by Tristan Cavelier, committed by Sven Franck

erp5_web: add tools to export web page as single file (mhtml or embedded html)

parent 7fd7a9eb
##############################################################################
#
# Copyright (c) 2016 Nexedi SA and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly advised to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from HTMLParser import HTMLParser
class HtmlParseHelper(HTMLParser):
  """
  HTMLParser subclass that records every parsing callback it receives.

  Each callback appends one tuple to `self.result`: the event name followed
  by the arguments of the callback. For instance `handle_starttag` appends
  `('starttag', tag, attrs)`.
  See https://docs.python.org/2/library/htmlparser.html
  """

  def __init__(self, *args, **kw):
    HTMLParser.__init__(self, *args, **kw)
    # Recorded events, in the order the parser reported them.
    self.result = []

  def _pushEvent(self, *event):
    # `event` is already the tuple to store: (name, arg1, arg2, ...).
    self.result.append(event)

  def handle_starttag(self, tag, attrs):
    self._pushEvent("starttag", tag, attrs)

  def handle_startendtag(self, tag, attrs):
    self._pushEvent("startendtag", tag, attrs)

  def handle_endtag(self, tag):
    self._pushEvent("endtag", tag)

  def handle_data(self, data):
    self._pushEvent("data", data)

  def handle_entityref(self, name):
    self._pushEvent("entityref", name)

  def handle_charref(self, name):
    self._pushEvent("charref", name)

  def handle_comment(self, data):
    self._pushEvent("comment", data)

  def handle_decl(self, decl):
    self._pushEvent("decl", decl)

  def handle_pi(self, data):
    self._pushEvent("pi", data)

  def unknown_decl(self, data):
    self._pushEvent("unknown_decl", data)
def parseHtml(text):
  """
  Parse an html string and return the list of events reported by the parser.

  Every event is a tuple whose first item is the event name and whose other
  items are the arguments of the matching HTMLParser callback.

  Example:
    input: 'Click <a href="destination">here</a> to see the documentation.'
    return: [
      ('data', 'Click '),
      ('starttag', 'a', [('href', 'destination')]),
      ('data', 'here'),
      ('endtag', 'a'),
      ('data', ' to see the documentation.'),
    ]
  """
  parser = HtmlParseHelper()
  parser.feed(text)
  # Tell the parser the input is complete so that remaining data is handled.
  parser.close()
  return parser.result
import re
def partition(text, separatorRegexp):
  """
  Cut `text` around every match of the compiled regexp `separatorRegexp`.

  The returned list alternates between:
  - a 1-tuple holding the text found between two separators, and
  - a tuple holding the whole separator followed by its regexp groups.
  It always starts and ends with a 1-tuple (possibly holding "").

    partition("abcba", re.compile("(b)")) -> [
      ("a",),
      ("b", "b"),
      ("c",),
      ("b", "b"),
      ("a",),
    ]
  """
  chunks = []
  cursor = 0  # index in `text` just after the previous separator
  for found in separatorRegexp.finditer(text):
    begin, finish = found.span()
    chunks.extend([
      (text[cursor:begin],),
      (found.group(),) + found.groups(),
    ])
    cursor = finish
  chunks.append((text[cursor:],))
  return chunks
# Matches one CSS comment; group 1 is the text between "/*" and "*/".
# The lazy match (with "." also matching newlines) stops at the first "*/",
# including when the comment ends with a run of stars such as "/***/" or
# "/* x **/". The previous pattern consumed stars two characters at a time,
# missed such terminators and extended the comment up to a later "*/".
css_comment_filter_re = re.compile(r"/\*(.*?)\*/", re.DOTALL)
# Matches a `: url(...)` property value. Groups:
#   1: the prefix, from ":" up to and including "url("
#   2: everything found between the parentheses
#   3, 4: the double quote and the url when the url is double quoted
#   5, 6: the single quote and the url when the url is single quoted
#   7: the raw content when the url is not quoted
css_url_re = re.compile(r"""(:[ \t]*url\()(\s*(")([^"]*)"\s*|\s*(')([^']*)'\s*|([^\)]*))\)""")
def parseCssForUrl(text):
  r"""
  Split a css text into comments, urls and the remaining data.

  return tuple list like: [
    ("data", ""),
    ("comment", "/* set body background image */", " set body background image "),
    ("data", "\nbody {\n  background-image: url("),
    ("url", " 'http://ima.ge/bg.png' ", "http://ima.ge/bg.png", "'"),
    ("data", ");\n}\n"),
  ]

  - ("comment", whole comment, comment content)
  - ("url", raw text between the parentheses, stripped url, quote character
    or "" when the url is not quoted)
  - ("data", text): everything else; concatenating the data texts and the
    raw url texts of a non-comment chunk gives back that chunk.
  """
  result = []
  # partition() alternates: even index -> plain text, odd index -> separator.
  for index, chunk in enumerate(partition(text, css_comment_filter_re)):
    if index % 2:  # comment
      result.append(("comment", chunk[0], chunk[1]))
      continue
    # Non comment: look for urls inside this chunk.
    pending = ""
    for sub_index, piece in enumerate(partition(chunk[0], css_url_re)):
      if sub_index % 2 == 0:  # css data
        pending += piece[0]
        continue
      # url: flush the data collected so far together with the "url(" prefix
      result.append(("data", pending + piece[1]))
      url = (piece[4] or piece[6] or piece[7] or "").strip()
      quote = piece[3] or piece[5] or ""
      result.append(("url", piece[2], url, quote))
      pending = ")"  # the closing parenthesis starts the next data part
    result.append(("data", pending))
  return result
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="Extension Component" module="erp5.portal_type"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>default_reference</string> </key>
<value> <string>WebUtility</string> </value>
</item>
<item>
<key> <string>description</string> </key>
<value>
<none/>
</value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>extension.erp5.WebUtility</string> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
<value> <string>Extension Component</string> </value>
</item>
<item>
<key> <string>sid</string> </key>
<value>
<none/>
</value>
</item>
<item>
<key> <string>text_content_error_message</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>text_content_warning_message</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>version</string> </key>
<value> <string>erp5</string> </value>
</item>
<item>
<key> <string>workflow_history</string> </key>
<value>
<persistent> <string encoding="base64">AAAAAAAAAAI=</string> </persistent>
</value>
</item>
</dictionary>
</pickle>
</record>
<record id="2" aka="AAAAAAAAAAI=">
<pickle>
<global name="PersistentMapping" module="Persistence.mapping"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>data</string> </key>
<value>
<dictionary>
<item>
<key> <string>component_validation_workflow</string> </key>
<value>
<persistent> <string encoding="base64">AAAAAAAAAAM=</string> </persistent>
</value>
</item>
</dictionary>
</value>
</item>
</dictionary>
</pickle>
</record>
<record id="3" aka="AAAAAAAAAAM=">
<pickle>
<global name="WorkflowHistoryList" module="Products.ERP5Type.patches.WorkflowTool"/>
</pickle>
<pickle>
<tuple>
<none/>
<list>
<dictionary>
<item>
<key> <string>action</string> </key>
<value> <string>validate</string> </value>
</item>
<item>
<key> <string>validation_state</string> </key>
<value> <string>validated</string> </value>
</item>
</dictionary>
</list>
</tuple>
</pickle>
</record>
</ZopeData>
"""
Usage:
formatAttachmentListToMIMEMultipartString(
subtype="related",
header_dict={
"From": "<Saved by ERP5>",
"Subject": "Document Title",
},
param_list=[("type", "text/html")],
attachment_list=[
{
"mime_type": "text/html",
"charset": "utf-8",
"encode": "quoted-printable",
"header_dict": {"Content-Location": "https://www.erp5.com/My.Web.Page"}, # only add headers
"data": "<!DOCTYPE ...>.....................</...>",
},
{
"mime_type": "image/png",
"add_header_list": [("Content-Location", "https://www.erp5.com/My.Image")],
"data": "\x00............\x01",
}
]
);
Only the attachment_list parameter is mandatory.
Note: text/* content will not be automatically encoded to quoted-printable
because this encoding can lose some characters like "\r" and possibly others.
Default text/* is encoded in 7or8bit.
To send specific encoded data, please make your attachment dict look like:
{
"mime_type": "text/html",
"encode": "noop",
"add_header_list": [("Content-Transfer-Encoding", "my-encoding")],
"data": encodestring(html_data),
}
"""
from email.encoders import encode_noop, encode_7or8bit, \
encode_base64 as original_encode_base64
from email.mime.base import MIMEBase
from email.mime.text import MIMEText
from email.mime.image import MIMEImage
from email.mime.audio import MIMEAudio
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
import quopri
def formatMultipartMessageToRFC2822String(msg):
  r"""
  Serialize the multipart message `msg` with CRLF ("\r\n") header newlines.

  `msg.as_string()` does not exactly follow the RFC2822: its EOL are LF ("\n")
  by default. When the first line already ends with CR the string is returned
  untouched; otherwise the newlines of the outer headers and of the headers
  of each part are replaced by CRLF. The payload of the parts is left as is.

  Note: The first space of each continuation line of a multiline header is
  replaced by a tabulation to make some mhtml viewers able to parse it, even
  if a simple space follows the RFC2822.
  """
  def toCrlfHeader(header_text):
    return header_text.replace("\n", "\r\n").replace("\r\n ", "\r\n\t")

  flat = msg.as_string()  # it also forces the boundary generation
  first_line = flat.split("\n", 1)[0]
  if first_line.endswith("\r"):
    return flat
  delimiter = "--" + msg.get_boundary()
  chunks = flat.split("\n" + delimiter)
  # The first chunk holds the headers of the multipart container itself.
  converted = [toCrlfHeader(chunks[0])]
  for chunk in chunks[1:]:
    # Headers and payload of a part are separated by the first empty line.
    head_and_body = chunk.split("\n\n", 1)
    head_and_body[0] = toCrlfHeader(head_and_body[0])
    converted.append("\r\n\r\n".join(head_and_body))
  return ("\r\n" + delimiter).join(converted)
def encode_quopri(msg):
  r"""Encode the payload of `msg` in quoted-printable, in place.

  Unlike encoders.encode_quopri, the payload goes through
  quopri.encodestring with its default options, then each soft line break
  ("=" followed by LF) is rewritten to end with CRLF ("\r\n").
  The Content-Transfer-Encoding header is added to `msg`.
  """
  encoded = quopri.encodestring(msg.get_payload())
  encoded = encoded.replace("=\n", "=\r\n")
  msg.set_payload(encoded)
  msg.add_header("Content-Transfer-Encoding", "quoted-printable")
def encode_base64(msg):
  r"""Encode the payload of `msg` with encoders.encode_base64, then end
  every line of the encoded payload with CRLF ("\r\n") instead of LF."""
  original_encode_base64(msg)
  lf_payload = msg.get_payload()
  crlf_payload = lf_payload.replace("\n", "\r\n")
  msg.set_payload(crlf_payload)
outer = MIMEMultipart(subtype)
for key, value in param_list:
outer.set_param(key, value)
if boundary is not None:
outer.set_boundary(boundary)
if replace_header_list is not None:
for key, value in replace_header_list:
outer.replace_header(key, value)
if header_dict is not None: # adds headers, does not replace or set
for key, value in header_dict.items():
outer.add_header(key, value)
if add_header_list is not None:
for key, value in add_header_list:
outer.add_header(key, value)
for attachment in attachment_list:
mime_type = attachment.get("mime_type", "application/octet-stream")
data = attachment.get("data", "")
encoding = attachment.get("encode")
if encoding not in ("base64", "quoted-printable", "7or8bit", "noop", None):
raise ValueError("unknown attachment encoding %r" % encoding)
main_type, sub_type = mime_type.split("/")
if encoding is None:
if main_type == "image":
if sub_type == "svg+xml":
part = MIMEImage(data, sub_type, encode_quopri) # should we trust the mime_type ?
else:
part = MIMEImage(data, sub_type, encode_base64)
elif main_type == "text":
part = MIMEText(data, sub_type, attachment.get("charset", "us-ascii"))
elif main_type == "audio":
part = MIMEAudio(data, sub_type, encode_base64)
elif main_type == "application":
part = MIMEApplication(data, sub_type, encode_noop)
if sub_type == "javascript":
encode_quopri(part)
else:
encode_base64(part)
else:
part = MIMEBase(main_type, sub_type)
part.set_payload(data)
encode_base64(part)
else:
part = MIMEBase(main_type, sub_type)
part.set_payload(data)
if encoding == "base64":
encode_base64(part)
elif encoding == "quoted-printable":
encode_quopri(part)
elif encoding == "7or8bit":
encode_7or8bit(part)
else: # elif encoding == "noop":
encode_noop(part)
for key, value in attachment.get("replace_header_list", []):
part.replace_header(key, value)
for key, value in attachment.get("header_dict", {}).items(): # adds headers, does not replace or set
part.add_header(key, value)
for key, value in attachment.get("add_header_list", []):
part.add_header(key, value)
if attachment.get("filename", None) is not None:
part.add_header("Content-Disposition", "attachment", attachment["filename"])
outer.attach(part)
#return outer.as_string()
return formatMultipartMessageToRFC2822String(outer)
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="PythonScript" module="Products.PythonScripts.PythonScript"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>Script_magic</string> </key>
<value> <int>3</int> </value>
</item>
<item>
<key> <string>_bind_names</string> </key>
<value>
<object>
<klass>
<global name="NameAssignments" module="Shared.DC.Scripts.Bindings"/>
</klass>
<tuple/>
<state>
<dictionary>
<item>
<key> <string>_asgns</string> </key>
<value>
<dictionary>
<item>
<key> <string>name_container</string> </key>
<value> <string>container</string> </value>
</item>
<item>
<key> <string>name_context</string> </key>
<value> <string>context</string> </value>
</item>
<item>
<key> <string>name_m_self</string> </key>
<value> <string>script</string> </value>
</item>
<item>
<key> <string>name_subpath</string> </key>
<value> <string>traverse_subpath</string> </value>
</item>
</dictionary>
</value>
</item>
</dictionary>
</state>
</object>
</value>
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>attachment_list, subtype="mixed", header_dict=None, param_list=(), replace_header_list=None, add_header_list=None, boundary=None</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>Base_formatAttachmentListToMIMEMultipartString</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ExternalMethod" module="Products.ExternalMethod.ExternalMethod"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_function</string> </key>
<value> <string>parseCssForUrl</string> </value>
</item>
<item>
<key> <string>_module</string> </key>
<value> <string>WebUtility</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>Base_parseCssForUrl</string> </value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string></string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ExternalMethod" module="Products.ExternalMethod.ExternalMethod"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_function</string> </key>
<value> <string>parseHtml</string> </value>
</item>
<item>
<key> <string>_module</string> </key>
<value> <string>WebUtility</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>Base_parseHtml</string> </value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string></string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
# Return the dict of known web site domains. It is currently always empty:
# the intended implementation below (domain name -> published Web Site) is
# disabled until the TODO is addressed.
# TODO: domain names should be exported to a web site property.
# domain_dict = {}
# for web_site in portal_catalog(portal_type="Web Site", validation_state="published"):
# domain = web_site.getDomainName("")
# if domain != "":
# domain_dict[domain] = web_site
# return domain_dict
return {}
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="PythonScript" module="Products.PythonScripts.PythonScript"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>Script_magic</string> </key>
<value> <int>3</int> </value>
</item>
<item>
<key> <string>_bind_names</string> </key>
<value>
<object>
<klass>
<global name="NameAssignments" module="Shared.DC.Scripts.Bindings"/>
</klass>
<tuple/>
<state>
<dictionary>
<item>
<key> <string>_asgns</string> </key>
<value>
<dictionary>
<item>
<key> <string>name_container</string> </key>
<value> <string>container</string> </value>
</item>
<item>
<key> <string>name_context</string> </key>
<value> <string>context</string> </value>
</item>
<item>
<key> <string>name_m_self</string> </key>
<value> <string>script</string> </value>
</item>
<item>
<key> <string>name_subpath</string> </key>
<value> <string>traverse_subpath</string> </value>
</item>
</dictionary>
</value>
</item>
</dictionary>
</state>
</object>
</value>
</item>
<item>
<key> <string>_params</string> </key>
<value> <string></string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>ERP5Site_getWebSiteDomainDict</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="PythonScript" module="Products.PythonScripts.PythonScript"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>Script_magic</string> </key>
<value> <int>3</int> </value>
</item>
<item>
<key> <string>_bind_names</string> </key>
<value>
<object>
<klass>
<global name="NameAssignments" module="Shared.DC.Scripts.Bindings"/>
</klass>
<tuple/>
<state>
<dictionary>
<item>
<key> <string>_asgns</string> </key>
<value>
<dictionary>
<item>
<key> <string>name_container</string> </key>
<value> <string>container</string> </value>
</item>
<item>
<key> <string>name_context</string> </key>
<value> <string>context</string> </value>
</item>
<item>
<key> <string>name_m_self</string> </key>
<value> <string>script</string> </value>
</item>
<item>
<key> <string>name_subpath</string> </key>
<value> <string>traverse_subpath</string> </value>
</item>
</dictionary>
</value>
</item>
</dictionary>
</state>
</object>
</value>
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>REQUEST=None, allow_script=False, format="embedded_html"</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>WebPage_exportAsSingleFile</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment