# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

29
import tempfile, os
30

31
from AccessControl import ClassSecurityInfo
32 33
from Products.CMFCore.utils import getToolByName, _setCacheHeaders,\
    _ViewEmulator
34

35
from Products.ERP5Type import Permissions, PropertySheet
36
from Products.ERP5.Document.Image import Image
37 38 39
from Products.ERP5.Document.Document import ConversionError,\
                                            VALID_TEXT_FORMAT_LIST
from subprocess import Popen, PIPE
40

41
class PDFDocument(Image):
  """
  PDFDocument is a subclass of Image which is able to
  extract text content from a PDF file either as text
  or as HTML.

  Text and HTML are produced by the external commands pdftotext and
  pdftohtml; metadata is read with pdfinfo and, when it can be
  executed, pdftk.
  """
  # CMF Type Definition
  meta_type = 'ERP5 PDF Document'
  portal_type = 'PDF'

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  # Default Properties
  property_sheets = ( PropertySheet.Base
                    , PropertySheet.XMLObject
                    , PropertySheet.CategoryCore
                    , PropertySheet.DublinCore
                    , PropertySheet.Version
                    , PropertySheet.Reference
                    , PropertySheet.Document
                    , PropertySheet.Data
                    , PropertySheet.ExternalDocument
                    , PropertySheet.Url
                    , PropertySheet.Periodicity
                    )

  # Conversion API
  security.declareProtected(Permissions.AccessContentsInformation, 'convert')
  def convert(self, format, **kw):
    """
    Implementation of conversion for PDF files

    'html' and 'txt' / 'text' are handled here: the result is first
    looked up with getConversion and, if that raises KeyError, it is
    generated, stored with setConversion and returned as a
    (mime, data) tuple. Text conversions are always stored under the
    'txt' format, whichever of the two names was requested.

    Any other format is delegated to Image.convert together with the
    extra keyword arguments.
    """
    if format == 'html':
      try:
        return self.getConversion(format=format)
      except KeyError:
        mime = 'text/html'
        data = self._convertToHTML()
        self.setConversion(data, mime=mime, format=format)
        return (mime, data)
    elif format in ('txt', 'text'):
      try:
        return self.getConversion(format='txt')
      except KeyError:
        mime = 'text/plain'
        data = self._convertToText()
        self.setConversion(data, mime=mime, format='txt')
        return (mime, data)
    else:
      return Image.convert(self, format, **kw)

  security.declareProtected(Permissions.ModifyPortalContent, 'populateContent')
  def populateContent(self):
    """
      Convert each page to an Image and populate the
      PDF directory with converted images. May be useful
      to provide online PDF reader

      Not implemented yet: always raises NotImplementedError.
    """
    raise NotImplementedError

  security.declarePrivate('_convertToText')
  def _convertToText(self):
    """
      Convert the PDF text content to text with pdftotext

      Returns '' when the document has no data. When pdftotext outputs
      nothing, every page is converted to PNG through self.convert and
      handed to portal_transforms to be turned into text (OCR); the
      text of all pages is concatenated.

      Raises ConversionError if portal_transforms returns None for a
      page.
    """
    if not self.data:
      return ''
    tmp = tempfile.NamedTemporaryFile()
    try:
      tmp.write(str(self.getData()))
      # pdftotext opens the file by name, so the buffered data must be
      # on disk before the command is started.
      tmp.flush()
      extracted_text = Popen(['pdftotext', '-layout', '-enc', 'UTF-8',
                              '-nopgbrk', tmp.name, '-'],
                             stdout=PIPE).communicate()[0]
    finally:
      # Close (and thereby delete) the temporary file even if the
      # command could not be executed.
      tmp.close()
    if extracted_text:
      return extracted_text

    # Try to use OCR
    # As high dpi images are required, it may take some times to convert the
    # pdf.
    # It may be required to use activities to fill the cache and at the end,
    # to calculate the final result
    content_information = self.getContentInformation()
    page_count = int(content_information.get('Pages', 0))
    if page_count <= 0:
      return ''
    try:
      # if the dimension is too big, rasterized image can be too
      # big. so we limit the maximum of rasterized image to 4096
      # pixels.
      # XXX since the dimension can be different on each page, it is
      # better to call 'pdfinfo -f page_num -l page_num' to get the
      # size of each page.
      max_size = 4096
      size = content_information.get('Page size',
                                     '%s x %s pts' % (max_size, max_size))
      size_part_list = size.split(' ')
      # The size is parsed as float because pdfinfo may report fractional
      # point values (for example "595.276 x 841.89 pts"), which int()
      # rejects.
      width = float(size_part_list[0])
      height = float(size_part_list[2])
      resolution = 72.0 * max_size / max(width, height)
    except (ValueError, IndexError, ZeroDivisionError):
      # Unparsable or empty page size: no explicit resolution is passed.
      resolution = None

    # These lookups do not depend on the page, so do them only once.
    mime_type = getToolByName(self, 'mimetypes_registry').\
                                lookupExtension('name.%s' % 'txt')
    portal_transforms = getToolByName(self, 'portal_transforms')
    page_text_list = []
    for page_number in range(page_count):
      src_mimetype, png_data = self.convert(
          'png', quality=100, resolution=resolution,
          frame=page_number, display='identical')
      if not src_mimetype.endswith('png'):
        continue
      content = '%s' % png_data
      result = portal_transforms.convertToData(mime_type, content,
                                               context=self,
                                               filename=self.getTitleOrId(),
                                               mimetype=src_mimetype)
      if result is None:
        raise ConversionError('PDFDocument conversion error. '
                              'portal_transforms failed to convert to %s: %r' % (mime_type, self))
      page_text_list.append(result)
    return ''.join(page_text_list)

  security.declareProtected('View', 'getSizeFromImageDisplay')
  def getSizeFromImageDisplay(self, image_display):
    """
    Return the size for this image display.

    For the special display name 'identical', the (width, height) of
    the document itself is returned. Every other name is delegated to
    Image.getSizeFromImageDisplay, which is expected to return None for
    an unknown display name and (0, 0) when the preference is not set.
    """
    # identical parameter can be considered as a hack, in order not to
    # resize the image to prevent text distortion when using OCR.
    # A cleaner API is required.
    if image_display == 'identical':
      return (self.getWidth(), self.getHeight())
    return Image.getSizeFromImageDisplay(self, image_display)

  security.declarePrivate('_convertToHTML')
  def _convertToHTML(self):
    """
    Convert the PDF text content to HTML with pdftohtml

    Returns '' when the document has no data. The background colour
    set by pdftohtml on the BODY tag is removed and links pointing to
    the generated file are rewritten to 'asEntireHTML'.

    NOTE: XXX check that command exists and was executed
    successfully
    """
    if not self.data:
      return ''
    tmp = tempfile.NamedTemporaryFile()
    try:
      tmp.write(str(self.data))
      # pdftohtml opens the file by name, so the buffered data must be
      # on disk before the command is started.
      tmp.flush()
      temporary_file_name = os.path.basename(tmp.name)
      html = Popen(['pdftohtml', '-enc', 'UTF-8', '-stdout',
                    '-noframes', '-i', tmp.name],
                   stdout=PIPE).communicate()[0]
    finally:
      # Close (and thereby delete) the temporary file even if the
      # command could not be executed.
      tmp.close()
    # Quick hack to remove bg color - XXX
    html = html.replace('<BODY bgcolor="#A0A0A0"', '<BODY ')
    # Make links relative
    html = html.replace('href="%s.html' % temporary_file_name,
                        'href="asEntireHTML')
    return html

  security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
  def getContentInformation(self):
    """
    Returns the information about the PDF document with
    pdfinfo.

    The result is a dictionary built from the "key: value" lines
    printed by pdfinfo, completed (without overriding existing keys)
    with the InfoKey / InfoValue pairs printed by pdftk when pdftk can
    be executed. It is stored on the document as _content_information
    and a copy is returned, so callers may modify what they get.

    NOTE: XXX check that command exists and was executed
    successfully
    """
    try:
      return self._content_information.copy()
    except AttributeError:
      pass
    tmp = tempfile.NamedTemporaryFile()
    try:
      tmp.write(str(self.data))
      # The commands open the file by name, so the buffered data must be
      # on disk before they are started.
      tmp.flush()

      # First, we use pdfinfo to get standard metadata
      pdfinfo_output = Popen(['pdfinfo', '-meta', '-box', tmp.name],
                             stdout=PIPE).communicate()[0]
      result = {}
      for line in pdfinfo_output.splitlines():
        item_list = line.split(':')
        key = item_list[0].strip()
        # The value itself may contain colons (dates for example).
        value = ':'.join(item_list[1:]).strip()
        result[key] = value

      # Then we use pdftk to get extra metadata
      try:
        pdftk_output = Popen(['pdftk', tmp.name, 'dump_data', 'output'],
                             stdout=PIPE).communicate()[0]
      except OSError:
        # pdftk not found
        pass
      else:
        line_iterator = iter(pdftk_output.splitlines())
        for line in line_iterator:
          if not line.startswith('InfoKey'):
            continue
          key = line[len('InfoKey: '):]
          # The value is expected on the line following the key.
          try:
            line = line_iterator.next()
          except StopIteration:
            # Output ended right after a key: there is no value to store.
            break
          assert line.startswith('InfoValue: '),\
              "Wrong format returned by pdftk dump_data"
          value = line[len('InfoValue: '):]
          # pdfinfo values take precedence over pdftk ones.
          result.setdefault(key, value)
    finally:
      tmp.close()

    self._content_information = result
    return result.copy()

  def _setFile(self, data, precondition=None):
    """
    Drop the stored content information, if any, before delegating to
    Image._setFile, so that it is computed again for the new data.
    """
    try:
      del self._content_information
    except (AttributeError, KeyError):
      pass
    Image._setFile(self, data, precondition=precondition)