Commit 0c0500de authored by Jérome Perrin's avatar Jérome Perrin

core: text document / mail message str vs bytes WIP

parent ae6a06ad
......@@ -115,7 +115,7 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
text = Template(text).substitute(unicode_mapping)
# If the original was a str, convert it back to str.
if is_str:
if six.PY2 and is_str:
text = text.encode('utf-8')
return text
......@@ -188,12 +188,15 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
self.setConversion(result, original_mime_type, **kw)
else:
mime_type, result = self.getConversion(**kw)
if substitute and format in VALID_TEXT_FORMAT_LIST:
# only textual content can be sustituted
if substitution_method_parameter_dict is None:
substitution_method_parameter_dict = {}
result = self._substituteTextContent(result, safe_substitute=safe_substitute,
**substitution_method_parameter_dict)
if format in VALID_TEXT_FORMAT_LIST:
if six.PY3 and isinstance(result, bytes):
result = result.decode()
if substitute:
# only textual content can be sustituted
if substitution_method_parameter_dict is None:
substitution_method_parameter_dict = {}
result = self._substituteTextContent(result, safe_substitute=safe_substitute,
**substitution_method_parameter_dict)
return original_mime_type, result
else:
# text_content is not set, return empty string instead of None
......@@ -375,21 +378,27 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
return message
security.declareProtected(Permissions.AccessContentsInformation, 'getTextContent')
def getTextContent(self, default=_MARKER):
def getTextContent(self, default=_MARKER, encoding=None):
"""Overriden method to check
permission to access content in raw format
permission to access content in raw format and manage encoding.
"""
# XXX Zope4py3: should this return str ??
# We probably have "legacy" documents where `text_content` is a python2
# str encoded as something else than utf-8.
# Maybe we should introduce a new text_content_encoding property and
# expose API to getRawTextContent (as bytes) and getTextContent would return
# the decoded string.
self._checkConversionFormatPermission(None)
if default is _MARKER:
return self._baseGetTextContent()
else:
return self._baseGetTextContent(default)
text_content = self._baseGetTextContent()
text_content = self._baseGetTextContent(default)
if isinstance(text_content, bytes):
# XXX Zope4py3: should this return str ??
# We probably have "legacy" documents where `text_content` is a python2
# str encoded as something else than utf-8.
# Maybe we should introduce a new text_content_encoding property and
# expose API to getRawTextContent (as bytes) and getTextContent would return
# the decoded string.
# XXX what about _convertToBaseFormat/guessCharsetAndConvert ???
try:
text_content = text_content.decode('utf-8')
except UnicodeDecodeError:
text_content = text_content.decode('latin1')
return text_content
# Backward compatibility for replacement of text_format by content_type
security.declareProtected(Permissions.AccessContentsInformation, 'getTextFormat')
......@@ -424,9 +433,11 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
"""
if not self.hasData():
if default is _MARKER:
return self.getTextContent()
data = self._baseGetTextContent()
else:
return self.getTextContent(default)
data = self._baseGetTextContent(default)
if not isinstance(data, bytes):
return data.encode('utf-8')
else:
if default is _MARKER:
return File.getData(self)
......
......@@ -149,6 +149,7 @@ class DownloadableMixin:
RESPONSE.setHeader('Content-Length', len(data))
if output_format in VALID_TEXT_FORMAT_LIST:
RESPONSE.setHeader('Content-Type', '%s; charset=utf-8' % mime)
data = data.encode('utf-8')
else:
RESPONSE.setHeader('Content-Type', mime)
if inline is _MARKER:
......
......@@ -29,7 +29,7 @@
from AccessControl import ClassSecurityInfo
from Products.ERP5Type.Globals import InitializeClass
from Products.ERP5Type import Permissions
from Products.ERP5Type.Utils import guessEncodingFromText
from Products.ERP5Type.Utils import guessEncodingFromText # TODO: guessEncodingFromBytes
from zLOG import LOG, INFO
from email.header import decode_header, HeaderParseError
......@@ -42,7 +42,7 @@ filename_regexp = 'name="([^"]*)"'
def testCharsetAndConvert(text_content, content_type, encoding):
try:
if encoding is not None:
text_content = text_content.decode(encoding).encode('utf-8')
text_content = text_content.decode(encoding)
else:
if six.PY2:
text_content = text_content.decode().encode('utf-8')
......@@ -50,8 +50,9 @@ def testCharsetAndConvert(text_content, content_type, encoding):
encoding = guessEncodingFromText(text_content, content_type)
if encoding is not None:
try:
text_content = text_content.decode(encoding).encode('utf-8')
text_content = text_content.decode(encoding)
except (UnicodeDecodeError, LookupError):
# TODO: errors= repr ?
text_content = repr(text_content)[1:-1]
else:
text_content = repr(text_content)[1:-1]
......@@ -113,9 +114,6 @@ class MailMessageMixin:
"""
Returns the content information from the header information.
This is used by the metadata discovery system.
Header information is converted in UTF-8 since this is the standard
way of representing strings in ERP5.
"""
result = {}
for (name, value) in self._getMessage().items():
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment