handler.pdf: use pyPdf in setMetadata

0ff799eb · Boris Kocherov · Romain Courteaud · 080f25b8 · 0ff799eb · 0ff799eb
Commit 0ff799eb authored Feb 07, 2018 by Boris Kocherov Committed by Romain Courteaud Feb 26, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 32 additions and 33 deletions

cloudooo/handler/pdf/handler.py cloudooo/handler/pdf/handler.py +30 -32

setup.py setup.py +2 -1

No files found.
--- a/cloudooo/handler/pdf/handler.py
+++ b/cloudooo/handler/pdf/handler.py
@@ -25,6 +25,7 @@
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 #
 ##############################################################################
+import io
 from zope.interface import implements
 from cloudooo.interfaces.handler import IHandler
@@ -33,6 +34,8 @@ from cloudooo.util import logger, parseContentType
 from subprocess import Popen, PIPE
 from tempfile import mktemp
+from pyPdf import PdfFileWriter, PdfFileReader
+from pyPdf.generic import NameObject, createStringObject
 class Handler(object):
  """PDF Handler is used to handler inputed pdf document."""
@@ -47,6 +50,7 @@ class Handler(object):
  def convert(self, destination_format=None, **kw):
    """ Convert a pdf document """
+    # TODO: use pyPdf
    logger.debug("PDFConvert: %s > %s" % (self.document.source_format, destination_format))
    output_url = mktemp(suffix=".%s" % destination_format,
                        dir=self.document.directory_name)
@@ -66,6 +70,7 @@ class Handler(object):
    """Returns a dictionary with all metadata of document.
    along with the metadata.
    """
+    # TODO: use pyPdf and not use lower()
    command = ["pdfinfo", self.document.getUrl()]
    stdout, stderr = Popen(command,
                           stdout=PIPE,
@@ -75,13 +80,10 @@ class Handler(object):
    info_list = filter(None, stdout.split("\n"))
    metadata = {}
    for info in iter(info_list):
-      if info.count(":") == 1:
+      info = info.split(":")
-        info_name, info_value = info.split(":")
+      info_name = info[0].lower()
-      else:
+      info_value = ":".join(info[1:]).strip()
-        info_name, info_value = info.split("  ")
+      metadata[info_name] = info_value
-        info_name = info_name.replace(":", "")
-      info_value = info_value.strip()
-      metadata[info_name.lower()] = info_value
    self.document.trash()
    return metadata
@@ -90,31 +92,27 @@ class Handler(object):
    Keyword arguments:
    metadata -- expected an dictionary with metadata.
    """
-    text_template = "InfoKey: %s\nInfoValue: %s\n"
+    # TODO: date as "D:20090401124817-04'00'" ASN.1 for ModDate and CreationDate
-    text_list = [text_template % (key.capitalize(), value) \
+    input_pdf = PdfFileReader(open(self.document.getUrl(), "rb"))
-                                 for key, value in metadata.iteritems()]
+    output_pdf = PdfFileWriter()
-    metadata_file = File(self.document.directory_name,
-                         "".join(text_list),
+    modification_date = metadata.pop("ModificationDate", None)
-                         "txt")
+    if modification_date:
-    output_url = mktemp(suffix=".pdf",
+      metadata['ModDate'] = modification_date
-                        dir=self.document.directory_name)
+    if type(metadata.get('Keywords', None)) is list:
-    command = ["pdftk",
+      metadata['Keywords'] = metadata['Keywords'].join(' ')
-               self.document.getUrl(),
+    args = {}
-               "update_info",
+    for key, value in list(metadata.items()):
-               metadata_file.getUrl(),
+      args[NameObject('/' + key.capitalize())] = createStringObject(value)
-               "output",
-               output_url
+    output_pdf._info.getObject().update(args)
-               ]
-    stdout, stderr = Popen(command,
+    for page_num in range(input_pdf.getNumPages()):
-                           stdout=PIPE,
+      output_pdf.addPage(input_pdf.getPage(page_num))
-                           stderr=PIPE,
-                           close_fds=True,
+    output_stream = io.BytesIO()
-                           env=self.environment).communicate()
+    output_pdf.write(output_stream)
-    self.document.reload(output_url)
+    return output_stream.getvalue()
-    try:
-      return self.document.getContent()
-    finally:
-      self.document.trash()
  @staticmethod
  def getAllowedConversionFormatList(source_mimetype):

--- a/setup.py
+++ b/setup.py
 from setuptools import setup, find_packages
 import sys
-version = '1.2.5-dev'
+version = '1.2.6-dev'
 def read(name):
    return open(name).read()
@@ -13,6 +13,7 @@ install_requires = [
          'zope.interface',
          'PasteDeploy',
          'PasteScript',
+          'pyPdf',
          'WSGIUtils',
          'psutil>=3.0.0',
          'lxml',