OOoDocument.py 14.2 KB
Newer Older
Bartek Górny's avatar
Bartek Górny committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
##############################################################################
#
# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

28
import xmlrpclib
Jean-Paul Smets's avatar
Jean-Paul Smets committed
29
from xmlrpclib import Fault
30 31 32 33
import base64
import re
import zipfile
import cStringIO
Jean-Paul Smets's avatar
Jean-Paul Smets committed
34
import socket
35 36
from DateTime import DateTime

Bartek Górny's avatar
Bartek Górny committed
37 38 39
from AccessControl import ClassSecurityInfo
from OFS.Image import Pdata
from Products.CMFCore.utils import getToolByName
40
from Products.CMFCore.utils import _setCacheHeaders
Bartek Górny's avatar
Bartek Górny committed
41 42 43 44
from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
from Products.ERP5Type.Message import Message
from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5Type.XMLObject import XMLObject
45
from Products.ERP5.Document.File import File
46
from Products.ERP5.Document.Document import ConversionCacheMixin, ConversionError
47
from Products.CMFCore.utils import getToolByName
48
from Products.DCWorkflow.DCWorkflow import ValidationFailed
Bartek Górny's avatar
Bartek Górny committed
49

50 51
from zLOG import LOG

Bartek Górny's avatar
Bartek Górny committed
52 53 54
# Short aliases of the base64 codec, applied in this file to the data
# sent to and received from the conversion server
enc=base64.encodestring
dec=base64.decodestring

55 56 57
# NOTE(review): not referenced in the visible part of this file;
# presumably a sentinel meaning "no value given" - confirm before removing.
_MARKER = []


58
class OOoDocument(File, ConversionCacheMixin):
Bartek Górny's avatar
Bartek Górny committed
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
  """
    A file document able to convert OOo compatible files to
    any OOo supported format, to capture metadata and to
    update metadata in OOo documents.

    This class can be used:

    - to create an OOo document database with powerful indexing (r/o)
      and metadata handling (r/w) features (ex. change title in ERP5 ->
      title is changed in OOo document)

    - to massively convert MS Office documents to OOo format

    - to easily keep snapshots (in PDF and/or OOo format) of OOo documents
      generated from OOo templates

    This class may be used in the future:

    - to create editable OOo templates (ex. by adding tags in WYSIWYG mode
      and using tags to make document dynamic - ask kevin for more info)

    - to automatically sign / encrypt OOo documents based on user

    - to automatically sign / encrypt PDF generated from OOo documents based on user

    This class should not be used:

    - to store files in formats not supported by OOo

    - to store pure images (use Image for that)

    - as a general file conversion system (use portal_transforms for that)

    TODO:
    - better permissions
  """
  # CMF Type Definition
  meta_type = 'ERP5 OOo Document'
  portal_type = 'OOo Document'
  isPortalContent = 1
  isRADContent = 1

  searchable_property_list = ('asTextContent', 'title', 'description', 'id', 'reference',
                              'version', 'short_title',
                              'subject', 'source_reference', 'source_project_title',)

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  # Default Properties
  property_sheets = ( PropertySheet.Base
                    , PropertySheet.XMLObject
                    , PropertySheet.Reference
                    , PropertySheet.CategoryCore
                    , PropertySheet.DublinCore
                    , PropertySheet.Version
                    , PropertySheet.Document
                    , PropertySheet.Snapshot
                    , PropertySheet.ExternalDocument
                    , PropertySheet.Url
                    , PropertySheet.Periodicity
                    )

  # regular expressions for stripping xml from ODF documents
  # (raw strings, so that backslashes reach the regex engine untouched)
  rx_strip = re.compile(r'<[^>]*?>', re.DOTALL|re.MULTILINE) # any xml tag
  rx_compr = re.compile(r'\s+') # any run of whitespace
126

127 128 129 130 131
  def _setFile(self, data, precondition=None):
    """
      Store the file through File._setFile, then drop the base_data
      attribute if the document reports having base data.
    """
    File._setFile(self, data, precondition=precondition)
    if not self.hasBaseData():
      return
    # This is a hack - XXX - new accessor needed to delete properties
    del self.base_data
132

133 134
  security.declareProtected(Permissions.View, 'index_html')
  def index_html(self, REQUEST, RESPONSE, format=None, **kw):
    """
      Default renderer with conversion support.

      format -- a string; the list of available formats can be
                obtained by calling getTargetFormatItemList. If it is
                None, the original file is returned through
                File.index_html.

      Returns the converted data, after setting the Content-Length,
      Content-Type and Accept-Ranges headers on RESPONSE.
    """
    # Accelerate rendering in Web mode
    _setCacheHeaders(self, {'format' : format})
    # Return the original file by default
    if format is None:
      return File.index_html(self, REQUEST, RESPONSE)
    # Make sure file is converted to base format
    if not self.hasBaseData():
      self.convertToBaseFormat()
    # Else try to convert the document and return it
    mime, result = self.convert(format=format)
    if not mime:
      # The conversion gave no mime type: guess it from the format name
      mime = getToolByName(self, 'mimetypes_registry').lookupExtension('name.%s' % format)
    RESPONSE.setHeader('Content-Length', len(result))
    RESPONSE.setHeader('Content-Type', mime)
    RESPONSE.setHeader('Accept-Ranges', 'bytes')
    return result

157
  # Format conversion implementation
158
  def _getServerCoordinate(self):
    """
      Returns the oood conversion server coordinates
      as defined in preferences, as a tuple (address, port).

      Raises ConversionError if the address or the port
      is not defined.
    """
    preference_tool = getToolByName(self, 'portal_preferences')
    address = preference_tool.getPreferredOoodocServerAddress()
    port = preference_tool.getPreferredOoodocServerPortNumber()
    if not address or not port:
      raise ConversionError('Can not proceed with conversion: '
                            'conversion server host and port is not defined in preferences')
    return address, port
Bartek Górny's avatar
Bartek Górny committed
170 171 172

  def _mkProxy(self):
    """
173
      Create an XML-RPC proxy to access the conversion server.
Bartek Górny's avatar
Bartek Górny committed
174
    """
175 176 177
    server_proxy = xmlrpclib.ServerProxy('http://%s:%d' % self._getServerCoordinate(),
                                         allow_none=True)
    return server_proxy
Bartek Górny's avatar
Bartek Górny committed
178 179 180 181 182 183 184

  security.declareProtected(Permissions.AccessContentsInformation, 'getTargetFormatItemList')
  def getTargetFormatItemList(self):
    """
      Returns a list of acceptable formats for conversion
      in the form of tuples (for listfield in ERP5Form).

      The list is requested from the conversion server for the base
      content type of the document, through a CachingMethod.

      NOTE: it is the responsibility of the conversion server
      to provide an extensive list of conversion formats.
    """
    def cached_getTargetFormatItemList(content_type):
      server_proxy = self._mkProxy()
      # oood API needs naming convention update
      allowed = server_proxy.getAllowedTargetItemList(content_type)
      # tuple order is reversed to be compatible with ERP5 Form
      return [(y, x) for x, y in allowed]

    # Cache valid format list
    cached_getTargetFormatItemList = CachingMethod(cached_getTargetFormatItemList,
                                        id="OOoDocument_getTargetFormatItemList",
                                        cache_factory='erp5_ui_medium')

    return cached_getTargetFormatItemList(self.getBaseContentType())

  security.declareProtected(Permissions.AccessContentsInformation, 'getTargetFormatTitleList')
  def getTargetFormatTitleList(self):
    """
      Returns the list of the first elements of the tuples
      provided by getTargetFormatItemList.
    """
    return [item[0] for item in self.getTargetFormatItemList()]

207 208
  security.declareProtected(Permissions.AccessContentsInformation, 'getTargetFormatList')
  def getTargetFormatList(self):
    """
      Returns a list of acceptable formats for conversion: the second
      elements of the tuples provided by getTargetFormatItemList.
    """
    return [item[1] for item in self.getTargetFormatItemList()]
Bartek Górny's avatar
Bartek Górny committed
213

214 215
  security.declareProtected(Permissions.ModifyPortalContent,'isTargetFormatAllowed')
  def isTargetFormatAllowed(self, format):
    """
      Checks if the current document can be converted
      into the specified target format, i.e. if format is
      one of the values returned by getTargetFormatList.
    """
    return format in self.getTargetFormatList()

  security.declarePrivate('_convert')
  def _convert(self, format):
    """
      Communicates with server to convert a file.

      format -- a target format name passed to the conversion server,
                or the special value 'text-content' which is handled
                here, without the server, by stripping the xml markup
                of the content.xml member of the base data.

      Returns a tuple (mime type, data).
    """
    if format == 'text-content':
      # Extract text from the ODF file
      cs = cStringIO.StringIO()
      try:
        cs.write(self._unpackData(self.getBaseData()))
        z = zipfile.ZipFile(cs)
        try:
          s = z.read('content.xml')
        finally:
          # Close the archive before the buffer it reads from
          z.close()
      finally:
        cs.close()
      s = self.rx_strip.sub(" ", s) # strip xml
      s = self.rx_compr.sub(" ", s) # compress multiple spaces
      return 'text/plain', s
    server_proxy = self._mkProxy()
    kw = server_proxy.run_generate(self.getId(),
                                   enc(self._unpackData(self.getBaseData())),
                                   None, format)
    return kw['mime'], Pdata(dec(kw['data']))
243

244
  # Conversion API
245
  security.declareProtected(Permissions.View, 'convert')
246
  def convert(self, format, **kw):
    """
      Returns the document in the given format, as a tuple
      (mime type, data).

      The real conversion is only made if the requested format
      is not in the conversion cache yet; the result is stored
      in the cache and returned from it.

      format -- 'base-data' returns the base content type and the
                base data themselves;
                'pdf' is mapped to the first target format which ends
                with 'pdf', 'html' to the first one which starts
                with 'html';
                'txt', 'text' and 'text-content' are mapped to a text
                format of the conversion server if there is one, else
                the text is extracted from the base data;
                any other value must be one of the values returned
                by getTargetFormatList.

      Raises ConversionError if the format is not supported.
    """
    if format == 'base-data':
      if not self.hasBaseData():
        self.convertToBaseFormat()
      return self.getBaseContentType(), self.getBaseData()
    # Make sure we can support html and pdf by default
    is_html = 0
    if format == 'pdf':
      format_list = [x for x in self.getTargetFormatList() if x.endswith('pdf')]
      if not format_list:
        raise ConversionError('Target format %s is not supported' % format)
      format = format_list[0]
    elif format == 'html':
      format_list = [x for x in self.getTargetFormatList() if x.startswith('html')]
      if not format_list:
        raise ConversionError('Target format %s is not supported' % format)
      format = format_list[0]
      is_html = 1
    elif format in ('txt', 'text', 'text-content'):
      format_list = self.getTargetFormatList()
      if format in format_list:
        # The requested name is directly supported: keep it
        pass
      elif 'txt' in format_list:
        format = 'txt'
      elif 'text' in format_list:
        format = 'text'
      else:
        # No text format on the conversion server: strip the xml
        # of the base data (result is not cached)
        if not self.hasBaseData():
          self.convertToBaseFormat()
        return self._convert(format='text-content')
    # Raise an error if the format is not supported
    if not self.isTargetFormatAllowed(format):
      raise ConversionError('Target format %s is not supported' % format)
    # Check if we have already a base conversion
    if not self.hasBaseData():
      self.convertToBaseFormat()
    # Return converted file
    if not self.hasConversion(format=format):
      # Do real conversion
      mime, data = self._convert(format)
      if is_html:
        # Extra processing required since
        # we receive a zip file
        cs = cStringIO.StringIO()
        try:
          cs.write(self._unpackData(data))
          z = zipfile.ZipFile(cs)
          try:
            # The first member which ends with 'html' is the page itself
            for f in z.infolist():
              fn = f.filename
              if fn.endswith('html'):
                data = z.read(fn)
                break
            mime = 'text/html'
            # Other members are stored by populateContent
            self.populateContent(zip_file=z)
          finally:
            z.close()
        finally:
          cs.close()
      self.setConversion(data, mime, format=format)
    return self.getConversion(format=format)
  
  security.declareProtected(Permissions.View, 'asTextContent')
  def asTextContent(self):
    """
      Extract plain text from ooo docs by stripping the XML file.
      This is the simplest way, the most universal and it is compatible
      with all formats.

      Returns the text only, without the mime type which
      _convert provides along with it.
    """
    mime, text = self._convert(format='text-content')
    return text
310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342

  security.declareProtected(Permissions.ModifyPortalContent, 'populateContent')
  def populateContent(self, zip_file=None):
    """
      Extract content from the ODF zip file and populate the document:
      every member of the archive whose name does not end with 'html'
      is stored in the document through portal_contributions, after
      deleting the existing sub-document with the same id, if any.

      zip_file -- optional, an already opened zip file; it prevents
                  from converting content twice and is left open.
                  If not provided, the document is converted to the
                  first target format which starts with 'html' and
                  the resulting archive is opened and closed here.

      Raises ConversionError if zip_file is not provided and no
      html target format is available.
    """
    must_close = zip_file is None
    archive_file = None
    try:
      if must_close:
        format_list = [x for x in self.getTargetFormatList() if x.startswith('html')]
        if not format_list:
          raise ConversionError('Target format html is not supported')
        mime, data = self._convert(format_list[0])
        archive_file = cStringIO.StringIO()
        archive_file.write(self._unpackData(data))
        zip_file = zipfile.ZipFile(archive_file)
      for f in zip_file.infolist():
        file_name = f.filename
        if file_name.endswith('html'):
          continue
        if self.get(file_name, None) is not None:
          self.manage_delObjects([file_name])
        self.portal_contributions.newContent(id=file_name, container=self,
                                             file_name=file_name,
                                             data=zip_file.read(file_name))
    finally:
      # Only release what was opened here, even if an error occurred
      if must_close:
        if zip_file is not None:
          zip_file.close()
        if archive_file is not None:
          archive_file.close()

  # Base format implementation
  security.declarePrivate('_convertToBaseFormat')
  def _convertToBaseFormat(self):
    """
      Converts the original document into ODF
      by invoking the conversion server. Store the result
      on the object. Update metadata information.

      The metadata provided by the server is kept in _base_metadata;
      its 'MIMEType' value, if any, becomes the base content type.
    """
    # LOG('in _convertToBaseFormat', 0, self.getRelativeUrl())
    server_proxy = self._mkProxy()
    kw = server_proxy.run_convert(self.getSourceReference() or self.getId(),
                                  enc(self._unpackData(self.getData())))
    self._setBaseData(dec(kw['data']))
    metadata = kw['meta']
    self._base_metadata = metadata
    if metadata.get('MIMEType', None):
      self._setBaseContentType(metadata['MIMEType'])
Bartek Górny's avatar
Bartek Górny committed
357

358 359
  security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
  def getContentInformation(self):
    """
      Returns the metadata extracted by the conversion
      server, as stored in _base_metadata by _convertToBaseFormat.
    """
    # LOG('in getContentInformation', 0, self.getRelativeUrl())
    return self._base_metadata
Bartek Górny's avatar
Bartek Górny committed
366

367 368
  security.declareProtected(Permissions.ModifyPortalContent, 'updateBaseMetadata')
  def updateBaseMetadata(self, **kw):
    """
      Updates metadata information in the converted OOo document
      based on the values provided by the user. This is implemented
      through the invocation of the conversion server.

      kw -- the metadata values, passed as is to the conversion
            server together with the current base data.

      The data returned by the server replaces the base data.
    """
    server_proxy = self._mkProxy()
    # Keep the answer in its own name rather than rebinding kw
    response = server_proxy.run_setmetadata(self.getId(),
                                            enc(self._unpackData(self.getBaseData())),
                                            kw)
    self._setBaseData(dec(response['data']))