PDFDocument.py 6.85 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
##############################################################################
#
# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs.
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company.
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

28 29
import tempfile, os, cStringIO

30
from AccessControl import ClassSecurityInfo
31 32
from Products.CMFCore.utils import getToolByName

33 34 35 36
from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5.Document.Image import Image
from Products.ERP5.Document.Document import ConversionCacheMixin
37
from Products.ERP5.Document.File import _unpackData
38

39
from zLOG import LOG
40

41
class PDFDocument(Image, ConversionCacheMixin):
  """
  PDFDocument is a subclass of Image which is able to
  extract text content from a PDF file either as text
  or as HTML.

  Text, HTML and metadata extraction are delegated to the external
  pdftotext, pdftohtml and pdfinfo commands, which are run with
  os.popen on a temporary copy of the document data.
  """
  # CMF Type Definition
  meta_type = 'ERP5 PDF Document'
  portal_type = 'PDF'
  isPortalContent = 1
  isRADContent = 1

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  # Default Properties
  property_sheets = ( PropertySheet.Base
                    , PropertySheet.XMLObject
                    , PropertySheet.CategoryCore
                    , PropertySheet.DublinCore
                    , PropertySheet.Version
                    , PropertySheet.Reference
                    , PropertySheet.Document
                    , PropertySheet.Data
                    , PropertySheet.ExternalDocument
                    , PropertySheet.Url
                    , PropertySheet.Periodicity
                    )

  searchable_property_list = ('asText', 'title', 'description', 'id', 'reference',
                              'version', 'short_title',
                              'subject', 'source_reference', 'source_project_title',)

  security.declareProtected(Permissions.View, 'index_html')
  def index_html(self, REQUEST, RESPONSE, display=None, format='', quality=75,
                                          resolution=None, frame=0):
    """
      Returns the document data in the requested format.

      format is None
        the stored data, unpacked with _unpackData, is returned with
        an application/pdf content type

      format in ('html', 'txt', 'text')
        the result of convert(format) is returned with its mime type
        and a UTF-8 charset

      any other format (including the default '')
        the request is delegated to Image.index_html together with
        the display, quality, resolution and frame parameters
    """
    if format is None:
      RESPONSE.setHeader('Content-Type', 'application/pdf')
      return _unpackData(self.data)
    if format in ('html', 'txt', 'text'):
      mime, data = self.convert(format)
      RESPONSE.setHeader('Content-Length', len(data))
      RESPONSE.setHeader('Content-Type', '%s;charset=UTF-8' % mime)
      RESPONSE.setHeader('Accept-Ranges', 'bytes')
      return data
    return Image.index_html(self, REQUEST, RESPONSE, display=display,
                            format=format, quality=quality,
                            resolution=resolution, frame=frame)

  # Conversion API
  security.declareProtected(Permissions.ModifyPortalContent, 'convert')
  def convert(self, format, **kw):
    """
    Implementation of conversion for PDF files

    'html' and 'txt' / 'text' conversions are computed only when no
    conversion is stored yet (hasConversion), stored with
    setConversion and then returned through getConversion; 'text' is
    an alias which shares the entry stored under 'txt'. index_html
    unpacks the returned value as (mime, data).

    Any other format is delegated to Image.convert with the extra
    keyword arguments.
    """
    if format == 'html':
      if not self.hasConversion(format=format):
        data = self._convertToHTML()
        self.setConversion(data, mime='text/html', format=format)
      return self.getConversion(format=format)
    elif format in ('txt', 'text'):
      if not self.hasConversion(format='txt'):
        data = self._convertToText()
        self.setConversion(data, mime='text/plain', format='txt')
      return self.getConversion(format='txt')
    else:
      return Image.convert(self, format, **kw)

  security.declareProtected(Permissions.ModifyPortalContent, 'populateContent')
  def populateContent(self):
    """
      Convert each page to an Image and populate the
      PDF directory with converted images. May be useful
      to provide online PDF reader

      Not implemented yet: always raises NotImplementedError.
    """
    raise NotImplementedError

  security.declarePrivate('_popenOnTemporaryFile')
  def _popenOnTemporaryFile(self, command_template):
    """
      Write the unpacked document data to a named temporary file, run
      the given command on it with os.popen and return a tuple
      (output, file_name).

      command_template -- command line containing a single %s which
                          is replaced by the temporary file name

      output    -- everything the command wrote to its standard output
      file_name -- name the temporary file had while the command was
                   running (the file is already closed on return)

      The pipe and the temporary file are closed even if writing the
      data or reading the output raises.

      NOTE: XXX the exit status of the command is still ignored, so a
      missing or failing command yields an empty output.
    """
    tmp = tempfile.NamedTemporaryFile()
    try:
      tmp.write(_unpackData(self.data))
      # The command opens the file by name: buffered data must be on
      # disk before it starts
      tmp.flush()
      pipe = os.popen(command_template % tmp.name)
      try:
        output = pipe.read()
      finally:
        pipe.close()
      return output, tmp.name
    finally:
      tmp.close()

  security.declarePrivate('_convertToText')
  def _convertToText(self):
    """
      Convert the PDF text content to text with pdftotext

      Returns the standard output of pdftotext, which is asked for
      UTF-8 encoding, preserved layout and no page break characters.
    """
    return self._popenOnTemporaryFile(
                    'pdftotext -layout -enc UTF-8 -nopgbrk %s -')[0]

  security.declarePrivate('_convertToHTML')
  def _convertToHTML(self):
    """
    Convert the PDF text content to HTML with pdftohtml

    Returns the standard output of pdftohtml after two textual
    adjustments: the hard coded body background color is dropped and
    links which point to '<temporary file name>.html' are rewritten
    to point to 'asEntireHTML'.

    NOTE: XXX check that command exists and was executed
    successfully
    """
    html, tmp_name = self._popenOnTemporaryFile(
                    'pdftohtml -enc UTF-8 -stdout -noframes -i %s')
    html = html.replace('<BODY bgcolor="#A0A0A0"', '<BODY ') # Quick hack to remove bg color - XXX
    html = html.replace('href="%s.html' % os.path.basename(tmp_name),
                        'href="asEntireHTML') # Make links relative
    return html

  security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
  def getContentInformation(self):
    """
    Returns the information about the PDF document with
    pdfinfo.

    The result is a dictionary built from the 'key: value' lines
    printed by pdfinfo: the text before the first colon is the key,
    the rest of the line is the value, both stripped. Lines without
    a key (blank lines for example) are ignored.

    The dictionary is kept in self._content_information and a copy
    is returned, so that callers can not alter the stored value.
    Later calls return a copy of the stored value without running
    pdfinfo again.

    NOTE: XXX check that command exists and was executed
    successfully
    """
    try:
      return self._content_information.copy()
    except AttributeError:
      pass
    output = self._popenOnTemporaryFile('pdfinfo -meta -box %s')[0]
    result = {}
    for line in output.splitlines():
      # Split on the first colon only: values may contain colons
      item_list = line.split(':', 1)
      key = item_list[0].strip()
      if not key:
        continue
      result[key] = ''.join(item_list[1:]).strip()
    self._content_information = result
    return result.copy()

  def _setFile(self, data, precondition=None):
    """
    Drop the stored content information, which describes the
    previous data, then let Image._setFile store the new data.
    """
    try:
      del self._content_information
    except AttributeError:
      pass
    Image._setFile(self, data, precondition)