Commit a0e8378a authored by Jérome Perrin

Try pdftk to extract extra metadata, because pdfinfo only returns standard PDF metadata.


git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@31563 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 26bb8448
...@@ -249,17 +249,40 @@ class PDFDocument(Image, CachedConvertableMixin): ...@@ -249,17 +249,40 @@ class PDFDocument(Image, CachedConvertableMixin):
tmp = tempfile.NamedTemporaryFile() tmp = tempfile.NamedTemporaryFile()
tmp.write(str(self.data)) tmp.write(str(self.data))
tmp.seek(0) tmp.seek(0)
cmd = 'pdfinfo -meta -box %s' % tmp.name try:
r = os.popen(cmd) # First, we use pdfinfo to get standard metadata
h = r.read() cmd = 'pdfinfo -meta -box %s' % tmp.name
tmp.close() r = os.popen(cmd)
r.close() h = r.read()
result = {} r.close()
for line in h.splitlines(): result = {}
item_list = line.split(':') for line in h.splitlines():
key = item_list[0].strip() item_list = line.split(':')
value = ':'.join(item_list[1:]).strip() key = item_list[0].strip()
result[key] = value value = ':'.join(item_list[1:]).strip()
result[key] = value
# Then we use pdftk to get extra metadata
cmd = 'pdftk %s dump_data output' % tmp.name
r = os.popen(cmd)
h = r.read()
r.close()
line_list = (line for line in h.splitlines())
while True:
try:
line = line_list.next()
except StopIteration:
break
if line.startswith('InfoKey'):
key = line[len('InfoKey: '):]
line = line_list.next()
assert line.startswith('InfoValue: '),\
"Wrong format returned by pdftk dump_data"
value = line[len('InfoValue: '):]
result.setdefault(key, value)
finally:
tmp.close()
self._content_information = result self._content_information = result
return result.copy() return result.copy()
......
...@@ -792,6 +792,15 @@ class TestDocument(ERP5TypeTestCase, ZopeTestCase.Functional): ...@@ -792,6 +792,15 @@ class TestDocument(ERP5TypeTestCase, ZopeTestCase.Functional):
self.assertEquals('title', content_information['Title']) self.assertEquals('title', content_information['Title'])
self.assertEquals('application/pdf', document.getContentType()) self.assertEquals('application/pdf', document.getContentType())
def test_PDF_content_information_extra_metadata(self):
    # getContentInformation() must also expose non-standard metadata
    # entries, such as the ones written with `pdftk update_info`.
    # The fixture metadata.pdf is expected to carry a
    # 'NonStandardMetadata' entry whose value is 'the value'.
    pdf_upload = makeFileUpload('metadata.pdf')
    contribution_tool = self.portal.portal_contributions
    pdf_document = contribution_tool.newContent(file=pdf_upload)
    # The contribution tool should have created a PDF document.
    self.assertEquals('PDF', pdf_document.getPortalType())
    info = pdf_document.getContentInformation()
    self.assertEquals('the value', info['NonStandardMetadata'])
def test_PDF_content_content_type(self): def test_PDF_content_content_type(self):
upload_file = makeFileUpload('REF-en-001.pdf') upload_file = makeFileUpload('REF-en-001.pdf')
document = self.portal.document_module.newContent(portal_type='PDF') document = self.portal.document_module.newContent(portal_type='PDF')
......
File added
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment