Commit 3bdab08f authored by Jérome Perrin

converter: sniff csv field delimiter

using python csv module
parent de321992
Pipeline #5526 passed with stage
......@@ -28,6 +28,8 @@
##############################################################################
import sys
import csv
import codecs
import helper_util
from os.path import dirname, splitext
from tempfile import mktemp
......@@ -149,9 +151,18 @@ class UnoConverter(object):
_, extension = splitext(source_url)
if extension == '.csv':
# https://wiki.openoffice.org/wiki/Documentation/DevGuide/Spreadsheets/Filter_Options
# Try to sniff the csv delimiter
with codecs.open(source_url, 'rb', 'utf-8', errors="ignore") as csvfile:
try:
dialect = csv.Sniffer().sniff(csvfile.read(1024))
delimiter = ord(dialect.delimiter)
except csv.Error:
delimiter = ord(',')
return (
self._createProperty("FilterName", "Text - txt - csv (StarCalc)"),
self._createProperty("FilterOptions", "44,34,UTF-8"), )
self._createProperty("FilterOptions", "{delimiter},34,UTF-8".format(**locals())), )
return ()
......
a b
1,3 c
\ No newline at end of file
......@@ -578,6 +578,7 @@ class TestChapterItemList(TestCase):
class TestCSVEncoding(TestCase):
"""Cloudoo tries to be "a bit" clever with CSV:
* the supported encoding is UTF-8, but also accepts latin9, for compatibility.
* the fields delimiter is guessed by python csv module.
"""
def test_decode_ascii(self):
data = encodestring(open("./data/csv_ascii.csv").read())
......@@ -608,3 +609,36 @@ class TestCSVEncoding(TestCase):
self.assertEqual(
[u"Jérome", u"1€"],
[x.text for x in tree.getroot().find('.//tr[1]').iterdescendants() if x.text])
def test_separator_semicolon(self):
  """Convert a semicolon-separated CSV to HTML and check the first two rows.

  The expected cells show that ';' is used as the field delimiter, that
  a space inside a cell ('a a') does not split it, and that a ';'
  inside a cell ('2;x') is preserved (presumably because that field is
  quoted in the fixture -- the fixture content is not visible here).
  """
  # Read the fixture inside a context manager so the file handle is
  # closed deterministically instead of being left to the garbage collector.
  with open("./data/csv_semicolon.csv") as csv_file:
    data = encodestring(csv_file.read())
  converted = decodestring(self.proxy.convertFile(data, "csv", "html"))
  parser = etree.HTMLParser()
  tree = etree.parse(StringIO(converted), parser)
  root = tree.getroot()
  # Only elements carrying text are compared; empty wrappers are ignored.
  self.assertEqual(
    ['a a', '1'],
    [x.text for x in root.find('.//tr[1]').iterdescendants() if x.text])
  self.assertEqual(
    ['b b', '2;x'],
    [x.text for x in root.find('.//tr[2]').iterdescendants() if x.text])
def test_separator_tab(self):
  """Convert a tab-separated file to HTML and check the first two rows.

  The input is submitted with the "csv" source format. The expected
  cells show that a comma inside a cell ('1,3') is kept as data and is
  not treated as the field delimiter.
  """
  # Read the fixture inside a context manager so the file handle is
  # closed deterministically instead of being left to the garbage collector.
  with open("./data/tsv.tsv") as tsv_file:
    data = encodestring(tsv_file.read())
  converted = decodestring(self.proxy.convertFile(data, "csv", "html"))
  parser = etree.HTMLParser()
  tree = etree.parse(StringIO(converted), parser)
  root = tree.getroot()
  # Only elements carrying text are compared; empty wrappers are ignored.
  self.assertEqual(
    ['a', 'b'],
    [x.text for x in root.find('.//tr[1]').iterdescendants() if x.text])
  self.assertEqual(
    ['1,3', 'c'],
    [x.text for x in root.find('.//tr[2]').iterdescendants() if x.text])
def test_empty_csv(self):
  """Convert an empty CSV payload to HTML and expect no table cells."""
  encoded_input = encodestring("")
  html = decodestring(self.proxy.convertFile(encoded_input, "csv", "html"))
  html_parser = etree.HTMLParser()
  document = etree.parse(StringIO(html), html_parser)
  # Collect the text of every <td>; an empty input must yield none.
  cell_texts = [cell.text for cell in document.getroot().findall('.//td')]
  self.assertEqual([], cell_texts)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment