# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

29
import tempfile, os
30

31
from AccessControl import ClassSecurityInfo
Nicolas Delaby's avatar
Nicolas Delaby committed
32
from Products.CMFCore.utils import _setCacheHeaders, _ViewEmulator
33

34
from Products.ERP5Type import Permissions, PropertySheet
35
from Products.ERP5.Document.Image import Image
36 37 38
from Products.ERP5.Document.Document import ConversionError,\
                                            VALID_TEXT_FORMAT_LIST
from subprocess import Popen, PIPE
Nicolas Dumazet's avatar
Nicolas Dumazet committed
39
import errno
40

41
class PDFDocument(Image):
  """
  Image subclass for PDF files.

  On top of the conversions inherited from Image, the content of
  the PDF can be converted to plain text, HTML or DjVu.
  """
  # CMF Type Definition
  meta_type = 'ERP5 PDF Document'
  portal_type = 'PDF'

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  # Default Properties
  property_sheets = (
    PropertySheet.Base,
    PropertySheet.XMLObject,
    PropertySheet.CategoryCore,
    PropertySheet.DublinCore,
    PropertySheet.Version,
    PropertySheet.Reference,
    PropertySheet.Document,
    PropertySheet.Data,
    PropertySheet.ExternalDocument,
    PropertySheet.Url,
    PropertySheet.Periodicity,
  )

69
  # Conversion API
70
  def _convert(self, format, **kw):
    """
    Implementation of conversion for PDF files

    Returns a (mime type, data) pair:
      - None returns the content type and data of the document itself
      - 'html', 'txt' / 'text' and 'djvu' / 'DJVU' are looked up with
        getConversion first; when that raises KeyError the conversion
        is computed, stored with setConversion and returned
      - any other format is delegated to Image._convert, restricted
        to the first page unless a frame is given in kw
    """
    if format is None:
      return self.getContentType(), self.getData()

    # Pick the key under which the conversion is stored, its mime type
    # and the method able to compute it.
    if format == 'html':
      conversion_key = format
      mime_type = 'text/html'
      converter = self._convertToHTML
    elif format in ('txt', 'text'):
      conversion_key = 'txt'
      mime_type = 'text/plain'
      converter = self._convertToText
    elif format in ('djvu', 'DJVU'):
      conversion_key = 'djvu'
      mime_type = 'image/vnd.djvu'
      converter = self._convertToDJVU
    else:
      if kw.get('frame') is None:
        # when converting to image from PDF we care for first page only
        # this will make sure that only first page is used and not whole
        # content of PDF file read & converted which is a performance issue
        kw['frame'] = 0
      return Image._convert(self, format, **kw)

    try:
      return self.getConversion(format=conversion_key)
    except KeyError:
      # Nothing stored yet for this format: compute it and remember it.
      converted_data = converter()
      self.setConversion(converted_data, mime=mime_type,
                         format=conversion_key)
      return (mime_type, converted_data)
107 108 109

  security.declareProtected(Permissions.ModifyPortalContent, 'populateContent')
  def populateContent(self):
    """
      Meant to convert each page to an Image and to populate the
      PDF directory with the converted images, which may be useful
      to provide an online PDF reader.

      Not implemented: always raises NotImplementedError.
    """
    raise NotImplementedError
116 117

  security.declarePrivate('_convertToText')
118
  def _convertToText(self):
    """
      Convert the PDF content to plain text through portal_transforms

      The PDF data itself is converted first. When that conversion
      yields nothing, every page is rendered as a PNG image and each
      image is converted to text in turn (OCR); the texts of all the
      pages are concatenated in page order.

      Returns '' if the document has no data.
      Raises ConversionError if portal_transforms returns None for
      the image of a page.
    """
    if not self.hasData():
      return ''
    mime_type = 'text/plain'
    portal_transforms = self.getPortalObject().portal_transforms
    filename = self.getStandardFilename(format='txt')
    result = portal_transforms.convertToData(mime_type, str(self.getData()),
                                             context=self, filename=filename,
                                             mimetype=self.getContentType())
    if result:
      return result

    # Try to use OCR
    # As high dpi images are required, it may take some times to convert the
    # pdf.
    # It may be required to use activities to fill the cache and at the end,
    # to calculate the final result
    content_information = self.getContentInformation()
    page_count = int(content_information.get('Pages', 0))
    # The file name does not depend on the page: compute it only once.
    png_filename = self.getStandardFilename(format='png')
    page_text_list = []
    for page_number in range(page_count):
      # display='identical' keeps the original size, see
      # getSizeFromImageDisplay.
      src_mimetype, png_data = self.convert(
          'png', quality=100, resolution=300,
          frame=page_number, display='identical')
      if not src_mimetype.endswith('png'):
        # This page could not be rendered as PNG: skip it.
        continue
      page_text = portal_transforms.convertToData(mime_type, str(png_data),
                                                  context=self,
                                                  filename=png_filename,
                                                  mimetype=src_mimetype)
      if page_text is None:
        raise ConversionError('PDFDocument conversion error. '
                              'portal_transforms failed to convert to %s: %r' % (mime_type, self))
      page_text_list.append(page_text)
    return ''.join(page_text_list)

160
  security.declareProtected(Permissions.View, 'getSizeFromImageDisplay')
161 162 163 164 165 166 167 168 169 170 171 172
  def getSizeFromImageDisplay(self, image_display):
    """
    Return the size to use for the given image display.

    The special display name 'identical' gives the width and height
    of the document itself; any other name is resolved by
    Image.getSizeFromImageDisplay.
    """
    # 'identical' can be considered as a hack: it avoids resizing the
    # image, which would distort the text when using OCR.
    # A cleaner API is required.
    if image_display == 'identical':
      width = self.getWidth()
      height = self.getHeight()
      return (width, height)
    return Image.getSizeFromImageDisplay(self, image_display)
173 174 175 176 177

  security.declarePrivate('_convertToHTML')
  def _convertToHTML(self):
    """
    Convert the PDF text content to HTML with pdftohtml
178 179 180

    NOTE: XXX check that command exists and was executed
    successfully
181
    """
182
    if not self.hasData():
183
      return ''
184
    tmp = tempfile.NamedTemporaryFile()
185
    tmp.write(self.getData())
186
    tmp.seek(0)
187

Nicolas Dumazet's avatar
Nicolas Dumazet committed
188 189 190 191 192 193 194 195 196 197 198 199 200
    command_result = None
    try:
      command = ['pdftohtml', '-enc', 'UTF-8', '-stdout',
                 '-noframes', '-i', tmp.name]
      try:
        command_result = Popen(command, stdout=PIPE).communicate()[0]
      except OSError, e:
        if e.errno == errno.ENOENT:
          raise ConversionError('pdftohtml was not found')
        raise

    finally:
      tmp.close()
201
    # Quick hack to remove bg color - XXX
Nicolas Dumazet's avatar
Nicolas Dumazet committed
202
    h = command_result.replace('<BODY bgcolor="#A0A0A0"', '<BODY ')
203 204 205
    # Make links relative
    h = h.replace('href="%s.html' % tmp.name.split(os.sep)[-1],
                                                          'href="asEntireHTML')
206 207
    return h

Jean-Paul Smets's avatar
Jean-Paul Smets committed
208 209 210 211 212 213 214 215 216 217 218 219 220
  security.declarePrivate('_convertToDJVU')
  def _convertToDJVU(self):
    """
    Convert the PDF text content to DJVU with pdf2djvu
    """
    if not self.hasData():
      return ''
    tmp = tempfile.NamedTemporaryFile()
    tmp.write(self.getData())
    tmp.seek(0)

    command_result = None
    try:
221
      command = ['/usr/bin/pdf2djvu', tmp.name]
Jean-Paul Smets's avatar
Jean-Paul Smets committed
222 223 224 225 226 227 228 229 230 231 232
      try:
        command_result = Popen(command, stdout=PIPE).communicate()[0]
      except OSError, e:
        if e.errno == errno.ENOENT:
          raise ConversionError('pdf2djvu was not found')
        raise

    finally:
      tmp.close()
    return command_result

233 234 235 236 237
  security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
  def getContentInformation(self):
    """
    Returns the information about the PDF document with
    pdfinfo.
238 239 240

    NOTE: XXX check that command exists and was executed
    successfully
241
    """
242 243 244 245
    try:
      return self._content_information.copy()
    except AttributeError:
      pass
246
    tmp = tempfile.NamedTemporaryFile()
247
    tmp.write(self.getData())
248
    tmp.seek(0)
Nicolas Dumazet's avatar
Nicolas Dumazet committed
249
    command_result = None
250
    try:
Nicolas Dumazet's avatar
Nicolas Dumazet committed
251

252
      # First, we use pdfinfo to get standard metadata
Nicolas Dumazet's avatar
Nicolas Dumazet committed
253 254 255 256 257 258 259 260
      command = ['pdfinfo', '-meta', '-box', tmp.name]
      try:
        command_result = Popen(command, stdout=PIPE).communicate()[0]
      except OSError, e:
        if e.errno == errno.ENOENT:
          raise ConversionError('pdfinfo was not found')
        raise

261
      result = {}
Nicolas Dumazet's avatar
Nicolas Dumazet committed
262
      for line in command_result.splitlines():
263 264 265 266 267 268
        item_list = line.split(':')
        key = item_list[0].strip()
        value = ':'.join(item_list[1:]).strip()
        result[key] = value

      # Then we use pdftk to get extra metadata
269
      try:
Nicolas Dumazet's avatar
Nicolas Dumazet committed
270 271 272 273 274 275
        command = ['pdftk', tmp.name, 'dump_data', 'output']
        command_result = Popen(command, stdout=PIPE).communicate()[0]
      except OSError, e:
        # if pdftk not found, pass
        if e.errno != errno.ENOENT:
          raise
276
      else:
Nicolas Dumazet's avatar
Nicolas Dumazet committed
277
        line_list = (line for line in command_result.splitlines())
278 279 280 281 282 283 284 285 286 287 288 289
        while True:
          try:
            line = line_list.next()
          except StopIteration:
            break
          if line.startswith('InfoKey'):
            key = line[len('InfoKey: '):]
            line = line_list.next()
            assert line.startswith('InfoValue: '),\
                "Wrong format returned by pdftk dump_data"
            value = line[len('InfoValue: '):]
            result.setdefault(key, value)
290 291 292
    finally:
      tmp.close()

293 294 295 296 297 298
    self._content_information = result
    return result.copy()

  def _setFile(self, data, precondition=None):
    """
    Store new file data through Image._setFile, after dropping the
    content information cached by getContentInformation, which
    described the previous data.
    """
    try:
      delattr(self, '_content_information')
    except (AttributeError, KeyError):
      # Nothing was cached yet.
      pass
    Image._setFile(self, data, precondition=precondition)