diff --git a/product/ERP5/Document/PDFDocument.py b/product/ERP5/Document/PDFDocument.py index eb59d343d59138b4530e85f1f47eed9937f8ba2a..b1f3aaee9131fcb974f7803b01e32178fffa267f 100644 --- a/product/ERP5/Document/PDFDocument.py +++ b/product/ERP5/Document/PDFDocument.py @@ -249,17 +249,40 @@ class PDFDocument(Image, CachedConvertableMixin): tmp = tempfile.NamedTemporaryFile() tmp.write(str(self.data)) tmp.seek(0) - cmd = 'pdfinfo -meta -box %s' % tmp.name - r = os.popen(cmd) - h = r.read() - tmp.close() - r.close() - result = {} - for line in h.splitlines(): - item_list = line.split(':') - key = item_list[0].strip() - value = ':'.join(item_list[1:]).strip() - result[key] = value + try: + # First, we use pdfinfo to get standard metadata + cmd = 'pdfinfo -meta -box %s' % tmp.name + r = os.popen(cmd) + h = r.read() + r.close() + result = {} + for line in h.splitlines(): + item_list = line.split(':') + key = item_list[0].strip() + value = ':'.join(item_list[1:]).strip() + result[key] = value + + # Then we use pdftk to get extra metadata + cmd = 'pdftk %s dump_data output' % tmp.name + r = os.popen(cmd) + h = r.read() + r.close() + line_list = (line for line in h.splitlines()) + while True: + try: + line = line_list.next() + except StopIteration: + break + if line.startswith('InfoKey'): + key = line[len('InfoKey: '):] + line = line_list.next() + assert line.startswith('InfoValue: '),\ + "Wrong format returned by pdftk dump_data" + value = line[len('InfoValue: '):] + result.setdefault(key, value) + finally: + tmp.close() + self._content_information = result return result.copy() diff --git a/product/ERP5OOo/tests/testDms.py b/product/ERP5OOo/tests/testDms.py index 5227d39b657507f480434d9ade9efab83920e9af..847e79b8d50c8914968870dd77701c5dd05e964b 100644 --- a/product/ERP5OOo/tests/testDms.py +++ b/product/ERP5OOo/tests/testDms.py @@ -792,6 +792,15 @@ class TestDocument(ERP5TypeTestCase, ZopeTestCase.Functional): self.assertEquals('title', content_information['Title']) self.assertEquals('application/pdf', document.getContentType()) + def test_PDF_content_information_extra_metadata(self): + # Extra metadata, such as those stored by pdftk update_info are also + # available in document.getContentInformation() + upload_file = makeFileUpload('metadata.pdf') + document = self.portal.portal_contributions.newContent(file=upload_file) + self.assertEquals('PDF', document.getPortalType()) + content_information = document.getContentInformation() + self.assertEquals('the value', content_information['NonStandardMetadata']) + def test_PDF_content_content_type(self): upload_file = makeFileUpload('REF-en-001.pdf') document = self.portal.document_module.newContent(portal_type='PDF') diff --git a/product/ERP5OOo/tests/test_document/metadata.pdf b/product/ERP5OOo/tests/test_document/metadata.pdf new file mode 100644 index 0000000000000000000000000000000000000000..1f0d182071b45b29fe1c0c30b2a4090bc1f71525 Binary files /dev/null and b/product/ERP5OOo/tests/test_document/metadata.pdf differ