Commit 961a3bf5 authored by Jérome Perrin

ERP5Form: escape for XML control characters in render_ODF

XML 1.0 does not accept some control characters; they should be
escaped in fields.

https://www.w3.org/TR/2008/REC-xml-20081126/#charsets
parent 4943e77a
......@@ -342,6 +342,12 @@ class TestStringField(ERP5TypeTestCase):
self.assertEqual('Hello World! <&> &lt;&mp;&gt;', self.field.render_odt(as_string=False).text)
self.assertEqual('Hello World!', self.field.render_odt(value='Hello World!', as_string=False).text)
def test_render_odt_escape_control_characters(self):
  # U+0010 and U+0013 are outside the XML 1.0 Char range; render_odt is
  # expected to emit U+FFFD (REPLACEMENT CHARACTER) in their place.
  self.field.values['default'] = 'Hello \x10\x13 World!'
  # The expected value is an explicit unicode literal: in a Python 2 byte
  # string `\ufffd` is not an escape sequence and would stay as six literal
  # characters (unless unicode_literals is enabled for the module).
  self.assertEqual(
    u'Hello \ufffd\ufffd World!',
    self.field.render_odt(as_string=False).text)
def test_render_odg(self):
self.field.values['default'] = 'Hello World! <&> &lt;&mp;&gt;'
test_value = self.field.render_odg(as_string=False)\
......@@ -351,12 +357,27 @@ class TestStringField(ERP5TypeTestCase):
.xpath('%s/text()' % ODG_XML_WRAPPING_XPATH, namespaces=NSMAP)[0]
self.assertEqual('Hello World!', test_value)
def test_render_odg_escape_control_characters(self):
  # U+0010 and U+0013 are outside the XML 1.0 Char range; render_odg is
  # expected to emit U+FFFD (REPLACEMENT CHARACTER) in their place.
  self.field.values['default'] = 'Hello \x10\x13 World!'
  # The expected value is an explicit unicode literal: in a Python 2 byte
  # string `\ufffd` is not an escape sequence and would stay as six literal
  # characters (unless unicode_literals is enabled for the module).
  self.assertEqual(
    u'Hello \ufffd\ufffd World!',
    self.field.render_odg(as_string=False).xpath(
      '%s/text()' % ODG_XML_WRAPPING_XPATH,
      namespaces=NSMAP)[0])
def test_render_odt_variable(self):
  # The default value must come back untouched as the node text, and the
  # node must be typed as an office "string" value.
  default_text = 'Hello World! <&> &lt;&mp;&gt;'
  self.field.values['default'] = default_text
  variable_node = self.field.render_odt_variable(as_string=False)
  value_type_attribute = '{%s}value-type' % NSMAP['office']
  self.assertEqual(variable_node.get(value_type_attribute), 'string')
  self.assertEqual(variable_node.text, default_text)
def test_render_odt_variable_escape_control_characters(self):
  # U+0010 and U+0013 are outside the XML 1.0 Char range;
  # render_odt_variable is expected to emit U+FFFD in their place.
  self.field.values['default'] = 'Hello \x10\x13 World!'
  # The expected value is an explicit unicode literal: in a Python 2 byte
  # string `\ufffd` is not an escape sequence and would stay as six literal
  # characters (unless unicode_literals is enabled for the module).
  self.assertEqual(
    u'Hello \ufffd\ufffd World!',
    self.field.render_odt_variable(as_string=False).text)
class TestDateTimeField(ERP5TypeTestCase):
"""Tests DateTime field
"""
......
......@@ -11,6 +11,7 @@ from lxml import etree
from lxml.etree import Element, SubElement
from lxml.builder import ElementMaker
import re
import sys
DRAW_URI = 'urn:oasis:names:tc:opendocument:xmlns:drawing:1.0'
TEXT_URI = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
......@@ -48,6 +49,39 @@ class OOoEscaper:
line_break = SubElement(self.parent_node, '{%s}%s' % (TEXT_URI, 'tab'))
line_break.tail = match_object.group(2)
# OD* Styles vs XML control characters TODO:
# - check `pdf` here. Don't we need something more generic like render_text ?
# - ods_style also uses _pdf where odt_style uses render_odt
def _convert_to_xml_compatible_string(value):
"""Convert value to an XML 1.0 compatible string.
This helper makes sure the value is compatible with this requirement of lxml:
All strings must be XML compatible: Unicode or ASCII, no NULL bytes
"""
if not value:
return ''
if isinstance(value, str):
value = value.decode('utf-8')
# remove control characters as described in the example from
# https://bugs.python.org/issue5166#msg95689
# http://www.w3.org/TR/REC-xml/#NT-Char
# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
# [#x10000- #x10FFFF]
# (any Unicode character, excluding the surrogate blocks, FFFE, and FFFF)
_char_tail = ''
if sys.maxunicode > 0x10000:
_char_tail = u'%s-%s' % (unichr(0x10000),
unichr(min(sys.maxunicode, 0x10FFFF)))
# TODO: compile this at import time
_nontext_sub = re.compile(
ur'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD%s]' % _char_tail,
re.U).sub
return _nontext_sub(u'\uFFFD', value)
class Widget:
"""A field widget that knows how to display itself as HTML.
"""
......@@ -216,11 +250,7 @@ class Widget:
"""
if attr_dict is None:
attr_dict = {}
if isinstance(value, str):
#required by lxml
value = value.decode('utf-8')
if value is None:
value = ''
value = _convert_to_xml_compatible_string(value)
text_node = Element('{%s}%s' % (TEXT_URI, local_name), nsmap=NSMAP)
if escape:
RE_OOO_ESCAPE.sub(OOoEscaper(text_node), value)
......@@ -243,11 +273,7 @@ class Widget:
if attr_dict is None:
attr_dict = {}
attr_dict['{%s}value-type' % OFFICE_URI] = 'string'
if isinstance(value, str):
#required by lxml
value = value.decode('utf-8')
if value is None:
value = ''
value = _convert_to_xml_compatible_string(value)
text_node = Element('{%s}%s' % (TEXT_URI, local_name), nsmap=NSMAP)
text_node.text = value
text_node.attrib.update(attr_dict)
......@@ -286,11 +312,7 @@ class Widget:
"""
if attr_dict is None:
attr_dict = {}
if isinstance(value, str):
#required by lxml
value = value.decode('utf-8')
if value is None:
value = ''
value = _convert_to_xml_compatible_string(value)
draw_frame_tag_name = '{%s}%s' % (DRAW_URI, 'frame')
draw_frame_node = Element(draw_frame_tag_name, nsmap=NSMAP)
draw_frame_attribute_list = attr_dict.get(draw_frame_tag_name)
......@@ -554,11 +576,7 @@ class CheckBoxWidget(Widget):
"""
if attr_dict is None:
attr_dict = {}
if isinstance(value, int):
value = str(value)
if isinstance(value, str):
#required by lxml
value = value.decode('utf-8')
value = _convert_to_xml_compatible_string(value)
text_node = Element('{%s}%s' % (TEXT_URI, local_name), nsmap=NSMAP)
text_node.text = value
text_node.attrib.update(attr_dict)
......@@ -658,9 +676,7 @@ class TextAreaWidget(Widget):
render_prefix, attr_dict, local_name):
if attr_dict is None:
attr_dict = {}
if isinstance(value, str):
#required by lxml
value = value.decode('utf-8')
value = _convert_to_xml_compatible_string(value)
text_node = Element('{%s}%s' % (TEXT_URI, local_name), nsmap=NSMAP)
RE_OOO_ESCAPE.sub(OOoEscaper(text_node), value)
......@@ -927,9 +943,7 @@ class SingleItemsWidget(ItemsWidget):
if attr_dict is None:
attr_dict = {}
if isinstance(value, str):
#required by lxml
value = value.decode('utf-8')
value = _convert_to_xml_compatible_string(value)
text_node = Element('{%s}%s' % (TEXT_URI, local_name), nsmap=NSMAP)
RE_OOO_ESCAPE.sub(OOoEscaper(text_node), value)
......@@ -1080,6 +1094,7 @@ class MultiItemsWidget(ItemsWidget):
if value is None:
return None
value_list = self.render_items_odf(field, value, REQUEST)
# XXX is this handling unicode properly ???
value = ', '.join(value_list).decode('utf-8')
return Widget.render_odg(self, field, value, as_string, ooo_builder,
REQUEST, render_prefix, attr_dict, local_name)
......@@ -1112,9 +1127,7 @@ class MultiItemsWidget(ItemsWidget):
if attr_dict is None:
attr_dict = {}
if isinstance(value, str):
#required by lxml
value = value.decode('utf-8')
value = _convert_to_xml_compatible_string(value)
text_node = Element('{%s}%s' % (TEXT_URI, local_name), nsmap=NSMAP)
RE_OOO_ESCAPE.sub(OOoEscaper(text_node), value)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment