Commit e4b0603f authored by Nicolas Delaby

Do not trust specified encoding

This patch always attempts the conversion using the encoding declared in the document, even when that encoding is declared as UTF-8, in order to check whether the declared codec is actually valid for the content.
parent f6caaf1b
...@@ -333,17 +333,17 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, ...@@ -333,17 +333,17 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
message = 'Conversion to base format succeeds' message = 'Conversion to base format succeeds'
if re_match is not None: if re_match is not None:
charset = re_match.group('charset') charset = re_match.group('charset')
if charset.lower() != 'utf-8': try:
try: # Use encoding in html document
# Use encoding in html document text_content = text_content.decode(charset).encode('utf-8')
text_content = text_content.decode(charset).encode('utf-8') except (UnicodeDecodeError, LookupError):
except (UnicodeDecodeError, LookupError): # Encoding read from document is wrong
# Encoding read from document is wrong text_content, message = guessCharsetAndConvert(self,
text_content, message = guessCharsetAndConvert(self, text_content, content_type)
text_content, content_type) else:
else: message = 'Conversion to base format with charset %r succeeds'\
message = 'Conversion to base format with charset %r succeeds'\ % charset
% charset if charset.lower() != 'utf-8':
charset = 'utf-8' # Override charset if convertion succeeds charset = 'utf-8' # Override charset if convertion succeeds
# change charset value in html_document as well # change charset value in html_document as well
def subCharset(matchobj): def subCharset(matchobj):
......
...@@ -1704,6 +1704,11 @@ document.write('<sc'+'ript type="text/javascript" src="http://somosite.bg/utb.ph ...@@ -1704,6 +1704,11 @@ document.write('<sc'+'ript type="text/javascript" src="http://somosite.bg/utb.ph
self.assertTrue('AZERTYY' not in safe_html) self.assertTrue('AZERTYY' not in safe_html)
self.assertTrue('#FFAA44' in safe_html) self.assertTrue('#FFAA44' in safe_html)
filename = 'broken_html.html'
file_object = makeFileUpload(filename)
web_page.edit(file=file_object)
converted = web_page.convert('html')[1]
def test_safeHTML_impossible_conversion(self): def test_safeHTML_impossible_conversion(self):
"""Some html are not parsable. """Some html are not parsable.
""" """
......
<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:p="urn:schemas-microsoft-com:office:powerpoint" xmlns:a="urn:schemas-microsoft-com:office:access" xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882" xmlns:s="uuid:BDC6E3F0-6DA3-11d1-A2A3-00AA00C14882" xmlns:rs="urn:schemas-microsoft-com:rowset" xmlns:z="#RowsetSchema" xmlns:b="urn:schemas-microsoft-com:office:publisher" xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet" xmlns:c="urn:schemas-microsoft-com:office:component:spreadsheet" xmlns:odc="urn:schemas-microsoft-com:office:odc" xmlns:oa="urn:schemas-microsoft-com:office:activation" xmlns:html="http://www.w3.org/TR/REC-html40" xmlns:q="http://schemas.xmlsoap.org/soap/envelope/" xmlns:rtc="http://microsoft.com/officenet/conferencing" xmlns:D="DAV:" xmlns:Repl="http://schemas.microsoft.com/repl/" xmlns:mt="http://schemas.microsoft.com/sharepoint/soap/meetings/" xmlns:x2="http://schemas.microsoft.com/office/excel/2003/xml" xmlns:ppda="http://www.passport.com/NameSpace.xsd" xmlns:ois="http://schemas.microsoft.com/sharepoint/soap/ois/" xmlns:dir="http://schemas.microsoft.com/sharepoint/soap/directory/" xmlns:ds="http://www.w3.org/2000/09/xmldsig#" xmlns:dsp="http://schemas.microsoft.com/sharepoint/dsp" xmlns:udc="http://schemas.microsoft.com/data/udc" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:sub="http://schemas.microsoft.com/sharepoint/soap/2002/1/alerts/" xmlns:ec="http://www.w3.org/2001/04/xmlenc#" xmlns:sp="http://schemas.microsoft.com/sharepoint/" xmlns:sps="http://schemas.microsoft.com/sharepoint/soap/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:udcs="http://schemas.microsoft.com/data/udc/soap" xmlns:udcxf="http://schemas.microsoft.com/data/udc/xmlfile" xmlns:udcp2p="http://schemas.microsoft.com/data/udc/parttopart" xmlns:wf="http://schemas.microsoft.com/sharepoint/soap/workflow/" 
xmlns:dsss="http://schemas.microsoft.com/office/2006/digsig-setup" xmlns:dssi="http://schemas.microsoft.com/office/2006/digsig" xmlns:mdssi="http://schemas.openxmlformats.org/package/2006/digital-signature" xmlns:mver="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns:mrels="http://schemas.openxmlformats.org/package/2006/relationships" xmlns:spwp="http://microsoft.com/sharepoint/webpartpages" xmlns:ex12t="http://schemas.microsoft.com/exchange/services/2006/types" xmlns:ex12m="http://schemas.microsoft.com/exchange/services/2006/messages" xmlns:pptsl="http://schemas.microsoft.com/sharepoint/soap/SlideLibrary/" xmlns:spsl="http://microsoft.com/webservices/SharePointPortalServer/PublishedLinksService" xmlns:Z="urn:schemas-microsoft-com:" xmlns:st="&#1;" xmlns="http://www.w3.org/TR/REC-html40">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="Generator" content="Microsoft Word 12 (filtered medium)">
<!--[if !mso]><style>v\:* {behavior:url(#default#VML);}
o\:* {behavior:url(#default#VML);}
w\:* {behavior:url(#default#VML);}
.shape {behavior:url(#default#VML);}
</style><![endif]-->
<title>One-Time</title>
<style><!--
/* Font Definitions */
@font-face
{font-family:"Cambria Math";
panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
{font-family:Tahoma;
panose-1:2 11 6 4 3 5 4 4 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin:0cm;
margin-bottom:.0001pt;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
a:link, span.MsoHyperlink
{mso-style-priority:99;
color:blue;
text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
{mso-style-priority:99;
color:purple;
text-decoration:underline;}
p
{mso-style-priority:99;
mso-margin-top-alt:auto;
margin-right:0cm;
mso-margin-bottom-alt:auto;
margin-left:0cm;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
p.MsoAcetate, li.MsoAcetate, div.MsoAcetate
{mso-style-priority:99;
mso-style-link:"Balloon Text Char";
margin:0cm;
margin-bottom:.0001pt;
font-size:8.0pt;
font-family:"Tahoma","sans-serif";}
p.style1, li.style1, div.style1
{mso-style-name:style1;
mso-margin-top-alt:auto;
margin-right:0cm;
mso-margin-bottom-alt:auto;
margin-left:0cm;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
p.style2, li.style2, div.style2
{mso-style-name:style2;
mso-margin-top-alt:auto;
margin-right:0cm;
mso-margin-bottom-alt:auto;
margin-left:0cm;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
p.style3, li.style3, div.style3
{mso-style-name:style3;
mso-margin-top-alt:auto;
margin-right:0cm;
mso-margin-bottom-alt:auto;
margin-left:0cm;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
p.style4, li.style4, div.style4
{mso-style-name:style4;
mso-margin-top-alt:auto;
margin-right:0cm;
mso-margin-bottom-alt:auto;
margin-left:0cm;
font-size:18.0pt;
font-family:"Times New Roman","serif";}
span.EmailStyle22
{mso-style-type:personal-reply;
font-family:"Calibri","sans-serif";
color:#1F497D;}
span.BalloonTextChar
{mso-style-name:"Balloon Text Char";
mso-style-priority:99;
mso-style-link:"Balloon Text";
font-family:"Tahoma","sans-serif";}
.MsoChpDefault
{mso-style-type:export-only;
font-size:10.0pt;}
@page WordSection1
{size:612.0pt 792.0pt;
margin:72.0pt 72.0pt 72.0pt 72.0pt;}
div.WordSection1
{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="2050" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
</head>
<body lang="EN-IE" link="blue" vlink="purple">
<div class="WordSection1">
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Hi,<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D"><o:p>&nbsp;</o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">n
<sup>th</sup> .<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D"><o:p>&nbsp;</o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">p;
<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D"><o:p>&nbsp;</o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">nt.<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D"><o:p>&nbsp;</o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">on.<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D"><o:p>&nbsp;</o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Regards,<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D"><o:p>&nbsp;</o:p></span></p>
<div>
<p class="MsoNormal" style="margin-bottom:10.0pt;line-height:115%"><span style="font-size:10.0pt;line-height:115%;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:#1F497D"><br>
</span><span style="font-size:8.0pt;line-height:115%;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:#1F497D">Ator<br>
<br>
</span><span style="font-size:10.0pt;line-height:115%;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:#1F497D"><br>
</span><span style="font-size:8.0pt;line-height:115%;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:red">_______________________________________________</span><span style="font-size:8.0pt;line-height:115%;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:#1F497D"><br>
Cse<br>
oad<br>
<br>
e<br>
dqwodj;j;jk;lj
<img width="288" height="41" id="Picture_x0020_1" src="cid:image001.jpg@01CC7129.3570BB40"><br>
<br>
<o:p></o:p></span></p>
</div>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D"><o:p>&nbsp;</o:p></span></p>
<div>
<div style="border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm">
<p class="MsoNormal"><b><span lang="EN-US" style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;">From:</span></b><span lang="EN-US" style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;"> ni[]
<br>
<b>Sent:</b> Th:00<br>
<b>To:</b> ne<br>
<b>Subject:</b> O<o:p></o:p></span></p>
</div>
</div>
<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
<div>
<p class="MsoNormal">One-Time<o:p></o:p></p>
</div>
<p>Thank you<o:p></o:p></p>
<div id="NewUser">
<table class="MsoNormalTable" border="0" cellspacing="0" cellpadding="0" width="600" style="width:450.0pt">
<tbody>
<tr style="height:13.5pt">
<td width="212" style="width:159.0pt;padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal"><b>r:<o:p></o:p></b></p>
</td>
<td style="padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal">04<o:p></o:p></p>
</td>
</tr>
<tr style="height:13.5pt">
<td width="212" style="width:159.0pt;padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal"><b>r:<o:p></o:p></b></p>
</td>
<td style="padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal">1<o:p></o:p></p>
</td>
</tr>
<tr style="height:13.5pt">
<td width="212" style="width:159.0pt;padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal"><b>:<o:p></o:p></b></p>
</td>
<td style="padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal">7<o:p></o:p></p>
</td>
</tr>
<tr>
<td width="212" style="width:159.0pt;padding:0cm 0cm 0cm 0cm">
<p class="MsoNormal"><b>PaTts:<o:p></o:p></b></p>
</td>
<td style="padding:0cm 0cm 0cm 0cm">
<p class="MsoNormal">C0<o:p></o:p></p>
</td>
</tr>
<tr style="height:13.5pt">
<td width="212" style="width:159.0pt;padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal"><b>td:<o:p></o:p></b></p>
</td>
<td style="padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal">€3.<o:p></o:p></p>
</td>
</tr>
<tr style="height:13.5pt">
<td width="212" style="width:159.0pt;padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal"><b>Pt:<o:p></o:p></b></p>
</td>
<td style="padding:0cm 0cm 0cm 0cm;height:13.5pt">
<p class="MsoNormal">081<o:p></o:p></p>
</td>
</tr>
<tr style="height:10.5pt">
<td width="212" style="width:159.0pt;padding:0cm 0cm 0cm 0cm;height:10.5pt"></td>
<td style="padding:0cm 0cm 0cm 0cm;height:10.5pt"></td>
</tr>
</tbody>
</table>
<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
<div>
<p class="MsoNormal"><b>Th0.<o:p></o:p></b></p>
</div>
</div>
<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
<div>
<p class="MsoNormal">Sho
<o:p></o:p></p>
</div>
<div>
<p class="MsoNormal"><a href="here">here</a><o:p></o:p></p>
</div>
<p class="MsoNormal" style="margin-bottom:12.0pt"><o:p>&nbsp;</o:p></p>
</div>
<FONT size=2 face=Arial>
6,000
w 10,9. Wrs cl
yofawne rs l stda ru
</FONT><A href="htsoe"><FONT size=2
face=Arial>wsr.</FONT></A>
<P><FONT size=2 face=Arial></FONT>&nbsp;</P>
<P><FONT size=2 face=Arial>Bu a <STRONG><A
href="htt/w.fces"><FONT
color=#000080>eoo</FONT></A></STRONG></FONT> </P>
<HR>
<P>Thsssent</P>
<HR>
<FONT face=Arial color=#000080 size=2>WsrMtr
24<BR></FONT>
</body>
</html>
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment