Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
erp5
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Labels
Merge Requests
7
Merge Requests
7
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Jobs
Commits
Open sidebar
Jérome Perrin
erp5
Commits
cdcb50f4
Commit
cdcb50f4
authored
Feb 07, 2024
by
Jérome Perrin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
TextContent base_data bytes
parent
de89b2e4
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
37 additions
and
52 deletions
+37
-52
product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.Document.py
...tTemplateItem/portal_components/document.erp5.Document.py
+1
-1
product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.TextDocument.py
...plateItem/portal_components/document.erp5.TextDocument.py
+36
-51
No files found.
product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.Document.py
View file @
cdcb50f4
...
...
@@ -410,7 +410,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixin,
body_parser
=
re
.
compile
(
r'<body[^>]*>(.*?)</body>'
,
re
.
IGNORECASE
+
re
.
DOTALL
)
title_parser
=
re
.
compile
(
r'<title[^>]*>(.*?)</title>'
,
re
.
IGNORECASE
+
re
.
DOTALL
)
base_parser
=
re
.
compile
(
r'<base[^>]*href=[\'"](.*?)[\'"][^>]*>'
,
re
.
IGNORECASE
+
re
.
DOTALL
)
charset_parser
=
re
.
compile
(
r'(?P<keyword>charset="?)(?P<charset>[a-z0-9\
-]+)
', re.IGNORECASE)
charset_parser
=
re
.
compile
(
b
r'(?P<keyword>charset="?)(?P<charset>[a-z0-9\
-]+)
', re.IGNORECASE)
# Declarative security
security = ClassSecurityInfo()
...
...
product/ERP5/bootstrap/erp5_core/DocumentTemplateItem/portal_components/document.erp5.TextDocument.py
View file @
cdcb50f4
...
...
@@ -42,7 +42,7 @@ from string import Template
from
erp5.component.mixin.CachedConvertableMixin
import
CachedConvertableMixin
from
erp5.component.mixin.BaseConvertableFileMixin
import
BaseConvertableFileMixin
from
Products.ERP5Type.mixin.text_content_history
import
TextContentHistoryMixin
from
Products.ERP5Type.Utils
import
guessEncodingFromText
from
Products.ERP5Type.Utils
import
guessEncodingFromText
,
bytes2str
from
lxml
import
html
as
etree_html
from
lxml
import
etree
...
...
@@ -163,10 +163,9 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
if
mime_type
==
'text/html'
:
mime_type
=
'text/x-html-safe'
if
src_mimetype
!=
"image/svg+xml"
:
if
six
.
PY2
:
data
=
text_content
else
:
data
=
text_content
.
encode
()
data
=
text_content
if
not
isinstance
(
data
,
bytes
):
data
=
data
.
encode
(
'utf-8'
)
result
=
portal_transforms
.
convertToData
(
mime_type
,
data
,
object
=
self
,
context
=
self
,
filename
=
filename
,
...
...
@@ -186,6 +185,8 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
file
=
BytesIO
(),
filename
=
self
.
getId
(),
temp_object
=
1
)
if
not
isinstance
(
result
,
bytes
):
result
=
result
.
encode
(
'utf-8'
)
temp_image
.
_setData
(
result
)
_
,
result
=
temp_image
.
convert
(
**
kw
)
...
...
@@ -227,7 +228,7 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
def
setBaseData
(
self
,
value
):
"""Store base_data into text_content
"""
self
.
_setTextContent
(
value
)
self
.
_setTextContent
(
bytes2str
(
value
)
)
security
.
declareProtected
(
Permissions
.
ModifyPortalContent
,
'_setBaseData'
)
_setBaseData
=
setBaseData
...
...
@@ -253,9 +254,12 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
"""
self
.
_checkConversionFormatPermission
(
None
)
if
default
is
_MARKER
:
return
self
.
getTextContent
()
text_content
=
self
.
getTextContent
()
else
:
return
self
.
getTextContent
(
default
=
default
)
text_content
=
self
.
getTextContent
(
default
=
default
)
if
six
.
PY3
and
text_content
and
text_content
is
not
default
:
text_content
=
text_content
.
encode
(
'utf-8'
)
return
text_content
security
.
declareProtected
(
Permissions
.
AccessContentsInformation
,
'hasBaseData'
)
def
hasBaseData
(
self
):
...
...
@@ -290,9 +294,12 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
def
_convertToBaseFormat
(
self
):
"""Conversion to base format for TextDocument consist
to convert file content into utf-8
to convert file content into utf-8.
If the data embeds charset information, this information is updated
to the new (utf-8) charset. This supports XML and HTML.
"""
def
guessCharsetAndConvert
(
document
,
text_content
,
content_type
):
# type: (TextDocument, bytes, str) -> Tuple[bytes, str]
"""
return encoded content_type and message if encoding
is not utf-8
...
...
@@ -322,36 +329,32 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
return
text_content
,
message
content_type
=
self
.
getContentType
()
or
DEFAULT_CONTENT_TYPE
text_content
=
self
.
getData
()
# TODO: don't we need to convert to bytes here ? what if it is PData ?
data
=
bytes
(
self
.
getData
())
if
content_type
.
endswith
(
'xml'
):
try
:
tree
=
etree
.
fromstring
(
text_content
)
text_content
=
etree
.
tostring
(
tree
,
encoding
=
'utf-8'
,
xml_declaration
=
True
)
tree
=
etree
.
fromstring
(
data
)
base_data
=
etree
.
tostring
(
tree
,
encoding
=
'utf-8'
,
xml_declaration
=
True
)
message
=
'Conversion to base format succeeds'
except
etree
.
XMLSyntaxError
:
# pylint: disable=catching-non-exception
message
=
'Conversion to base format without codec fails'
elif
content_type
==
'text/html'
:
re_match
=
self
.
charset_parser
.
search
(
# we don't really care about decoding errors for searching this
# regexp
text_content
.
decode
(
'ascii'
,
'replace'
)
if
six
.
PY3
else
text_content
)
re_match
=
self
.
charset_parser
.
search
(
data
)
message
=
'Conversion to base format succeeds'
if
re_match
is
not
None
:
charset
=
re_match
.
group
(
'charset'
)
base_data
=
data
charset
=
re_match
.
group
(
'charset'
).
decode
(
'ascii'
)
try
:
# Use encoding in html document
text_content
=
text_content
.
decode
(
charset
)
if
six
.
PY2
:
text_content
=
text_content
.
encode
(
'utf-8'
)
data
=
data
.
decode
(
charset
).
encode
(
'utf-8'
)
except
(
UnicodeDecodeError
,
LookupError
):
# Encoding read from document is wrong
text_content
,
message
=
guessCharsetAndConvert
(
self
,
text_content
,
content_type
)
base_data
,
message
=
guessCharsetAndConvert
(
self
,
data
,
content_type
)
else
:
message
=
'Conversion to base format with charset %r succeeds'
\
%
charset
if
charset
.
lower
()
!=
'utf-8'
:
charset
=
'utf-8'
# Override charset if conversion succeeds
charset
=
'utf-8'
# Override charset if conversion succeeds
# change charset value in html_document as well
def
subCharset
(
matchobj
):
keyword
=
matchobj
.
group
(
'keyword'
)
...
...
@@ -361,51 +364,33 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, TextContent
return
matchobj
.
group
(
0
)
elif
keyword
:
# if keyword is present, replace charset just after
return
keyword
+
'utf-8'
text_content
=
self
.
charset_parser
.
sub
(
subCharset
,
text_content
)
return
keyword
+
b
'utf-8'
base_data
=
self
.
charset_parser
.
sub
(
subCharset
,
data
)
else
:
text_content
,
message
=
guessCharsetAndConvert
(
self
,
text_content
,
content_type
)
base_data
,
message
=
guessCharsetAndConvert
(
self
,
data
,
content_type
)
else
:
# generally text/plain
try
:
# if it succeeds, no need to change encoding
# it's already utf-8
text_content
.
decode
(
'utf-8'
)
data
.
decode
(
'utf-8'
)
except
(
UnicodeDecodeError
,
LookupError
):
text_content
,
message
=
guessCharsetAndConvert
(
self
,
text_content
,
content_type
)
base_data
,
message
=
guessCharsetAndConvert
(
self
,
data
,
content_type
)
else
:
message
=
'Conversion to base format succeeds'
# TODO(zope4py3): rethink this, shouldn't we store bytes in base data ?
self
.
_setBaseData
(
text_content
)
self
.
_setBaseData
(
base_data
)
self
.
_setBaseContentType
(
content_type
)
return
message
security
.
declareProtected
(
Permissions
.
AccessContentsInformation
,
'getTextContent'
)
def
getTextContent
(
self
,
default
=
_MARKER
,
encoding
=
None
):
"""Overriden method to check
permission to access content in raw format and manage encoding.
def
getTextContent
(
self
,
default
=
_MARKER
):
"""Overridden method to check permission to access content in raw format
"""
self
.
_checkConversionFormatPermission
(
None
)
if
default
is
_MARKER
:
text_content
=
self
.
_baseGetTextContent
()
return
self
.
_baseGetTextContent
()
else
:
text_content
=
self
.
_baseGetTextContent
(
default
)
if
isinstance
(
text_content
,
bytes
):
# TODO(Zope4py3): should this return str ??
# We probably have "legacy" documents where `text_content` is a python2
# str encoded as something else than utf-8.
# Maybe we should introduce a new text_content_encoding property and
# expose API to getRawTextContent (as bytes) and getTextContent would return
# the decoded string.
# XXX what about _convertToBaseFormat/guessCharsetAndConvert ???
LOG
(
'TextDocument'
,
WARNING
,
"getTextContent with bytes %s"
%
text_content
)
try
:
text_content
=
text_content
.
decode
(
'utf-8'
)
except
UnicodeDecodeError
:
text_content
=
text_content
.
decode
(
'latin1'
)
return
text_content
return
self
.
_baseGetTextContent
(
default
)
# Backward compatibility for replacement of text_format by content_type
security
.
declareProtected
(
Permissions
.
AccessContentsInformation
,
'getTextFormat'
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment