Commit 3bdab08f authored by Jérome Perrin

converter: sniff csv field delimiter

using python csv module
parent de321992
Pipeline #5526 passed with stage
...@@ -28,6 +28,8 @@ ...@@ -28,6 +28,8 @@
############################################################################## ##############################################################################
import sys import sys
import csv
import codecs
import helper_util import helper_util
from os.path import dirname, splitext from os.path import dirname, splitext
from tempfile import mktemp from tempfile import mktemp
...@@ -149,9 +151,18 @@ class UnoConverter(object): ...@@ -149,9 +151,18 @@ class UnoConverter(object):
_, extension = splitext(source_url) _, extension = splitext(source_url)
if extension == '.csv': if extension == '.csv':
# https://wiki.openoffice.org/wiki/Documentation/DevGuide/Spreadsheets/Filter_Options # https://wiki.openoffice.org/wiki/Documentation/DevGuide/Spreadsheets/Filter_Options
# Try to sniff the csv delimiter
with codecs.open(source_url, 'rb', 'utf-8', errors="ignore") as csvfile:
try:
dialect = csv.Sniffer().sniff(csvfile.read(1024))
delimiter = ord(dialect.delimiter)
except csv.Error:
delimiter = ord(',')
return ( return (
self._createProperty("FilterName", "Text - txt - csv (StarCalc)"), self._createProperty("FilterName", "Text - txt - csv (StarCalc)"),
self._createProperty("FilterOptions", "44,34,UTF-8"), ) self._createProperty("FilterOptions", "{delimiter},34,UTF-8".format(**locals())), )
return () return ()
......
a b
1,3 c
\ No newline at end of file
...@@ -578,6 +578,7 @@ class TestChapterItemList(TestCase): ...@@ -578,6 +578,7 @@ class TestChapterItemList(TestCase):
class TestCSVEncoding(TestCase): class TestCSVEncoding(TestCase):
"""Cloudoo tries to be "a bit" clever with CSV: """Cloudoo tries to be "a bit" clever with CSV:
* the supported encoding is UTF-8, but also accepts latin9, for compatibility. * the supported encoding is UTF-8, but also accepts latin9, for compatibility.
* the fields delimiter is guessed by python csv module.
""" """
def test_decode_ascii(self): def test_decode_ascii(self):
data = encodestring(open("./data/csv_ascii.csv").read()) data = encodestring(open("./data/csv_ascii.csv").read())
...@@ -608,3 +609,36 @@ class TestCSVEncoding(TestCase): ...@@ -608,3 +609,36 @@ class TestCSVEncoding(TestCase):
self.assertEqual( self.assertEqual(
[u"Jérome", u"1€"], [u"Jérome", u"1€"],
[x.text for x in tree.getroot().find('.//tr[1]').iterdescendants() if x.text]) [x.text for x in tree.getroot().find('.//tr[1]').iterdescendants() if x.text])
def test_separator_semicolon(self):
  """Check that a semicolon-separated file is split on ';'.

  The file is submitted as "csv" and converted to HTML; the text of the
  first two table rows is then compared with the expected cells.
  The second row is expected to contain a '2;x' cell, so a ';' inside a
  cell (presumably quoted in the fixture -- confirm against the data file)
  must not be treated as a field separator.
  """
  # Close the fixture deterministically instead of relying on garbage
  # collection of an anonymous file object.
  with open("./data/csv_semicolon.csv") as csv_file:
    data = encodestring(csv_file.read())
  converted = decodestring(self.proxy.convertFile(data, "csv", "html"))
  parser = etree.HTMLParser()
  tree = etree.parse(StringIO(converted), parser)
  root = tree.getroot()
  self.assertEqual(
    ['a a', '1'],
    [x.text for x in root.find('.//tr[1]').iterdescendants() if x.text])
  self.assertEqual(
    ['b b', '2;x'],
    [x.text for x in root.find('.//tr[2]').iterdescendants() if x.text])
def test_separator_tab(self):
  """Check that a tab-separated file submitted as "csv" is split on tabs.

  The second row is expected to contain a '1,3' cell, so the comma in that
  value must not be treated as a field separator.
  """
  # Close the fixture deterministically instead of relying on garbage
  # collection of an anonymous file object.
  with open("./data/tsv.tsv") as tsv_file:
    data = encodestring(tsv_file.read())
  converted = decodestring(self.proxy.convertFile(data, "csv", "html"))
  parser = etree.HTMLParser()
  tree = etree.parse(StringIO(converted), parser)
  root = tree.getroot()
  self.assertEqual(
    ['a', 'b'],
    [x.text for x in root.find('.//tr[1]').iterdescendants() if x.text])
  self.assertEqual(
    ['1,3', 'c'],
    [x.text for x in root.find('.//tr[2]').iterdescendants() if x.text])
def test_empty_csv(self):
  """Converting an empty "csv" payload to HTML must yield no <td> cell."""
  empty_payload = encodestring("")
  html = decodestring(self.proxy.convertFile(empty_payload, "csv", "html"))
  html_parser = etree.HTMLParser()
  document = etree.parse(StringIO(html), html_parser)
  # Collect the text of every table cell; an empty input should produce none.
  cell_texts = [cell.text for cell in document.getroot().findall('.//td')]
  self.assertEqual([], cell_texts)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment