ContributionTool.py 27.3 KB
Newer Older
1
# -*- coding: utf-8 -*-
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
##############################################################################
#
# Copyright (c) 2007 Nexedi SARL and Contributors. All Rights Reserved.
#                    Jean-Paul Smets <jp@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

30
import cStringIO
31
import re
32
import socket
Jean-Paul Smets's avatar
Jean-Paul Smets committed
33
import urllib2, urllib
34 35 36
import urlparse
from cgi import parse_header
import os
37

Bartek Górny's avatar
Bartek Górny committed
38
from AccessControl import ClassSecurityInfo, getSecurityManager
39
from Products.ERP5Type.Globals import InitializeClass, DTMLFile
40
from Products.CMFCore.utils import _checkPermission
41 42 43
from Products.ERP5Type.Tool.BaseTool import BaseTool
from Products.ERP5Type import Permissions
from Products.ERP5 import _dtmldir
Nicolas Delaby's avatar
Nicolas Delaby committed
44
from Products.ERP5.Document.Url import no_crawl_protocol_list
Ivan Tyagov's avatar
Ivan Tyagov committed
45
from AccessControl import Unauthorized
Jean-Paul Smets's avatar
Jean-Paul Smets committed
46

47
from DateTime import DateTime
Nicolas Delaby's avatar
Nicolas Delaby committed
48
import warnings
49

50 51 52 53 54 55
# Global parameters
TEMP_NEW_OBJECT_KEY = '_v_new_object'
# Default number of download attempts left (default value of the "repeat"
# parameter of updateContentFromURL and newContentFromURL).
MAX_REPEAT = 10

# Sentinel used as default value of ContributionTool._getOb, in order to
# tell "no default was given" apart from any real default value.
_marker = []

# Install openers: the opener built here becomes the default one used by
# every urllib2.urlopen() call.
import ContributionOpener
opener = urllib2.build_opener(ContributionOpener.DirectoryFileHandler)
urllib2.install_opener(opener)
60 61 62 63

class ContributionTool(BaseTool):
  """
    ContributionTool provides an abstraction layer to unify the contribution
    of documents into an ERP5 Site.

    ContributionTool needs to be configured in portal_types (allowed contents) so
    that it can store Text, Spreadsheet, PDF, etc.

    The main method of ContributionTool is newContent. This method can
    be provided various parameters from which the portal type and document
    metadata can be derived.

    Configuration Scripts:

      - ContributionTool_getPropertyDictFromFilename: receives file name and a
        dict derived from filename by regular expression, and does any necessary
        operations (e.g. mapping document type id onto a real portal_type).

    Problems which are not solved

      - handling of relative links in HTML contents (or others...)
        some text rewriting is necessary.

  """
  title = 'Contribution Tool'
  id = 'portal_contributions'
  meta_type = 'ERP5 Contribution Tool'
  portal_type = 'Contribution Tool'

Nicolas Delaby's avatar
Nicolas Delaby committed
90
  
Jean-Paul Smets's avatar
Jean-Paul Smets committed
91

92 93 94 95 96 97 98
  # Declarative Security
  security = ClassSecurityInfo()

  security.declareProtected(Permissions.ManagePortal, 'manage_overview' )
  manage_overview = DTMLFile( 'explainContributionTool', _dtmldir )

  security.declareProtected(Permissions.AddPortalContent, 'newContent')
Nicolas Delaby's avatar
Nicolas Delaby committed
99
  def newContent(self, **kw):
    """
      Create a document (or update an existing one) out of the provided
      keyword arguments. The portal type and the most appropriate
      container are detected from whatever information was provided.

      Recognised keyword parameters:
        id - id of document
        portal_type - explicit portal_type parameter, must be honoured
        url - Identifier of external resource. Content will be downloaded
              from it
        container - if specified, it is possible to define
                    where to contribute the content. Else, ContributionTool
                    tries to guess.
        container_path - if specified, defines the container path
                         and has precedence over container
        discover_metadata - Enable metadata extraction and discovery
                            (default True)
        temp_object - build tempObject or not (default False)
        user_login - is the name under which the content will be created
                     XXX - this is a security hole which needs to be fixed by
                     making sure only Manager can use this parameter
        data - Binary representation of content
        file - file object holding the content
        filename - explicit filename of content
        file_name - former spelling of filename (backward compatibility)

      Keywords which are not consumed here are passed to _edit() of the
      created document.

      Returns the created or updated document.

      Raises TypeError if no url and no file is given and data or filename
      is missing, ValueError if no file object could be obtained, and
      Unauthorized if a matching document exists but the user is not
      allowed to modify it.
    """
    # Snapshot of the arguments as provided, useful for metadata discovery.
    input_parameter_dict = kw.copy()
    # file and data are exceptions: they are potentially too big to be kept
    # twice in memory. Only the future document must reference them.
    input_parameter_dict.pop('file', None)
    input_parameter_dict.pop('data', None)
    if 'container' in input_parameter_dict:
      # The container is a persistent object: remember only its path.
      given_container = input_parameter_dict.pop('container')
      input_parameter_dict['container_path'] = given_container.getPath()

    # Take out the keys which are not document properties.
    url = kw.pop('url', None)
    container = kw.pop('container', None)
    container_path = kw.pop('container_path', None)
    discover_metadata = kw.pop('discover_metadata', True)
    user_login = kw.pop('user_login', None)
    document_id = kw.pop('id', None)
    # Backward compatibility: an explicit filename wins over file_name.
    if 'file_name' in kw:
      legacy_filename = kw.pop('file_name')
      kw.setdefault('filename', legacy_filename)
    filename = kw.get('filename', None)
    portal_type = kw.get('portal_type')
    temp_object = kw.get('temp_object', False)

    portal = self.getPortalObject()
    if container is None and container_path:
      # The container may have disappeared: passing a default value
      # gives None instead of an error.
      container = portal.restrictedTraverse(container_path, None)

    # Obtain the file object, its filename and, if possible, its content type.
    content_type = None
    if url:
      file_object, filename, content_type = self._openURL(url)
      if content_type:
        kw['content_type'] = content_type
      kw['file'] = file_object
    else:
      file_object = kw.get('file')
      if file_object is None:
        # Some channels supply data and filename separately (this is the
        # case for example for email ingestion): wrap data in a file object.
        data = kw.get('data')
        if data is None or not filename:
          raise TypeError('data and filename must be provided')
        file_object = cStringIO.StringIO()
        file_object.write(data)
        file_object.seek(0)
        kw['file'] = file_object
        del kw['data']
      elif not filename:
        filename = file_object.filename

    if not content_type:
      # Fall back to the content type suggested by the filename.
      content_type = self.guessMimeTypeFromFilename(filename)

    if portal_type and container is None:
      # The portal type is known: use its default module as container.
      try:
        container = portal.getDefaultModule(portal_type)
      except ValueError:
        container = None

    # From here, there is no hope unless a file was provided.
    if file_object is None:
      raise ValueError("No data provided")

    if portal_type is None:
      # Guess it with the help of portal_contribution_registry.
      portal_type = portal.portal_contribution_registry.findPortalTypeName(
                                                filename=filename,
                                                content_type=content_type)

    # Check if a document with the same coordinates already exists.
    # If it exists, then update it instead of creating a new one.
    coordinate_dict = self.getMatchedFilenamePatternDict(filename)
    reference = coordinate_dict.get('reference', None)
    version = coordinate_dict.get('version', None)
    language = coordinate_dict.get('language', None)
    if portal_type and reference and version and language:
      existing_document = portal.portal_catalog.getResultValue(
                                                portal_type=portal_type,
                                                reference=reference,
                                                version=version,
                                                language=language)
      if existing_document is not None:
        # The document is already uploaded, so its file is overridden.
        if not _checkPermission(Permissions.ModifyPortalContent,
                                existing_document):
          raise Unauthorized(
            "[DMS] You are not allowed to update the existing document which has the same coordinates (id %s)"
            % existing_document.getId())
        existing_document.edit(file=kw['file'])
        return existing_document

    if temp_object:
      # Temp objects use the standard newContent from Folder.
      kw['portal_type'] = portal_type
      return BaseTool.newContent(self, **kw)

    if container_path is not None:
      # container_path has precedence over any container found so far.
      container = portal.restrictedTraverse(container_path)
    document = self._setObject(document_id, None,
                               portal_type=portal_type,
                               user_login=user_login,
                               container=container,
                               discover_metadata=discover_metadata,
                               filename=filename,
                               input_parameter_dict=input_parameter_dict)
    # Call _getOb to purge the volatile cache entry added by _setObject.
    document = self._getOb(document.getId())

    # Override the filename property, then edit the document contents
    # (so that upload can happen).
    kw['filename'] = filename
    document._edit(**kw)
    if url:
      document.fromURL(url)

    # Allow reindexing, reindex the document and return it.
    try:
      del document.isIndexable
    except AttributeError:
      # The document does not have such an attribute.
      pass
    document.reindexObject()
    return document

266
  security.declareProtected( Permissions.AddPortalContent, 'newXML' )
267 268 269 270 271 272 273
  def newXML(self, xml):
    """
      Placeholder for the creation of a new content out of XML data,
      intended for contributing to ERP5 from another application.

      Not implemented: the xml argument is ignored and None is returned.
    """
    return None

Nicolas Delaby's avatar
Nicolas Delaby committed
274 275 276
  security.declareProtected(Permissions.ModifyPortalContent,
                            'getMatchedFilenamePatternDict')
  def getMatchedFilenamePatternDict(self, filename):
    """
      Get matched group dict of file name parsing regular expression.

      The regular expression is read from the preferences
      (getPreferredDocumentFilenameRegularExpression) and is matched
      against the beginning of filename.

      Returns the dict of named groups of the match, or an empty dict
      if filename is None, if no regular expression is configured or
      if filename does not match.
    """
    if filename is None:
      return {}

    regex_text = self.portal_preferences.\
                                getPreferredDocumentFilenameRegularExpression()
    if not regex_text:
      # No regular expression configured ('' or None).
      return {}

    match = re.match(regex_text, filename)
    if match is None:
      return {}
    return match.groupdict()

Nicolas Delaby's avatar
Nicolas Delaby committed
299 300 301 302 303 304 305 306 307 308 309 310 311 312
  # backward compatibility
  security.declareProtected(Permissions.ModifyPortalContent,
                            'getMatchedFileNamePatternDict')
  def getMatchedFileNamePatternDict(self, filename):
    """
    (deprecated) use getMatchedFilenamePatternDict() instead.

    Emits a DeprecationWarning, then returns the result of
    getMatchedFilenamePatternDict(filename).
    """
    # stacklevel=2 makes the warning point at the caller of the deprecated
    # method instead of at this line.
    warnings.warn('getMatchedFileNamePatternDict() is deprecated. '
                  'use getMatchedFilenamePatternDict() instead.',
                  DeprecationWarning, stacklevel=2)
    return self.getMatchedFilenamePatternDict(filename)

  security.declareProtected(Permissions.ModifyPortalContent,
                            'getPropertyDictFromFilename')
  def getPropertyDictFromFilename(self, filename):
    """
      Gets properties from filename.

      The file name is first parsed with the regular expression set in
      preferences (which should contain named groups). The file name and
      the resulting dict are then passed to the type based method
      getPropertyDictFromFilename (fallback script:
      ContributionTool_getPropertyDictFromFilename), whose result is
      returned.

      Returns an empty dict if filename is None.
    """
    if filename is None:
      return {}
    matched_dict = self.getMatchedFilenamePatternDict(filename)
    script = self._getTypeBasedMethod(
             'getPropertyDictFromFilename',
             fallback_script_id='ContributionTool_getPropertyDictFromFilename')
    return script(filename, matched_dict)

Nicolas Delaby's avatar
Nicolas Delaby committed
325 326 327 328 329 330 331 332 333 334 335
  # backward compatibility
  security.declareProtected(Permissions.ModifyPortalContent,
                            'getPropertyDictFromFileName')
  def getPropertyDictFromFileName(self, filename):
    """
    (deprecated) use getPropertyDictFromFilename() instead.

    Emits a DeprecationWarning, then returns the result of
    getPropertyDictFromFilename(filename).
    """
    # stacklevel=2 makes the warning point at the caller of the deprecated
    # method instead of at this line.
    warnings.warn('getPropertyDictFromFileName() is deprecated. '
                  'use getPropertyDictFromFilename() instead.',
                  DeprecationWarning, stacklevel=2)
    return self.getPropertyDictFromFilename(filename)

336
  # WebDAV virtual folder support
Nicolas Delaby's avatar
Nicolas Delaby committed
337 338 339
  def _setObject(self, id, ob, portal_type=None, user_login=None,
                 container=None, discover_metadata=True, filename=None,
                 input_parameter_dict=None):
340
    """
341
      portal_contribution_registry will find appropriate portal type
Nicolas Delaby's avatar
Nicolas Delaby committed
342
      name by filename and content itself.
343 344 345 346 347

      The ContributionTool instance must be configured in such
      way that _verifyObjectPaste will return TRUE.

    """
348 349 350 351 352
    # _setObject is called by constructInstance at a time
    # when the object has no portal_type defined yet. It
    # will be removed later on. We can safely store the
    # document inside us at this stage. Else we
    # must find out where to store it.
353
    if ob is not None:
354 355 356 357 358
      # Called from webdav API
      # Object is already created by PUT_factory
      # fill the volatile cache _v_document_cache
      # then return the document
      document = ob
359
    else:
360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400
      if not portal_type:
        document = BaseTool.newContent(self, id=id,
                                      portal_type=portal_type,
                                      is_indexable=0)
      elif ob is None:
        # We give the system a last chance to analyse the
        # portal_type based on the document content
        # (ex. a Memo is a kind of Text which can be identified
        # by the fact it includes some specific content)

        # Now we know the portal_type, let us find the module
        # to which we should move the document to
        if container is None:
          module = self.getDefaultModule(portal_type)
        else:
          module = container
        # There is no preexisting document - we can therefore
        # set the new object
        new_content_kw = {'portal_type': portal_type,
                          'is_indexable': False}
        if id is not None:
          new_content_kw['id'] = id
        document = module.newContent(**new_content_kw)
        # We can now discover metadata
        if discover_metadata:
          # Metadata disovery is done as an activity by default
          # If we need to discoverMetadata synchronously, it must
          # be for user interface and should thus be handled by
          # ZODB scripts
          document.activate(after_path_and_method_id=(document.getPath(),
            ('convertToBaseFormat', 'Document_tryToConvertToBaseFormat'))) \
          .discoverMetadata(filename=filename,
                            user_login=user_login,
                            input_parameter_dict=input_parameter_dict)
    # Keep the document close to us - this is only useful for
    # file upload from webdav
    volatile_cache = getattr(self, '_v_document_cache', None)
    if volatile_cache is None:
      self._v_document_cache = {}
      volatile_cache = self._v_document_cache
    volatile_cache[document.getId()] = document.getRelativeUrl()
401 402
    # Return document to newContent method
    return document
403

404 405 406 407 408
  def _getOb(self, id, default=_marker):
    """
    Return the object designated by id.

    The lookup is done in this order:
      1. the volatile cache filled by _setObject (the entry is removed
         from the cache once used)
      2. the real content of the tool (BaseTool._getOb)
      3. the catalog, using the part of id located before the first '-'
         as uid (ids built by listDAVObjects)

    If nothing is found, default is returned if it was provided, else
    AttributeError is raised.
    """
    # Use the document cache if possible and return result immediately.
    # This is only useful for webdav.
    document_cache = getattr(self, '_v_document_cache', None)
    if document_cache is not None:
      cached_url = document_cache.get(id)
      if cached_url is not None:
        del document_cache[id]
        return self.getPortalObject().unrestrictedTraverse(cached_url)

    # Try first to return the real object inside.
    # This is much safer than trying to access objects displayed by
    # listDAVObjects because the behaviour of the catalog is unpredictable
    # if a string is passed for a UID. For example
    #   select path from catalog where uid = "001193.html";
    # will return the same as
    #   select path from catalog where uid = 1193;
    # This was the source of an error in which the contribution tool
    # was creating a web page and was returning a Base Category
    # when
    #   o = folder._getOb(id)
    # was called in DocumentConstructor
    if default is _marker:
      stored = BaseTool._getOb(self, id)
    else:
      stored = BaseTool._getOb(self, id, default=default)
    if stored is not None:
      return stored
    # stored is None: we can be more lucky with portal_catalog.

    # Return an object listed by listDAVObjects:
    # ids are the concatenation of uid + '-' + standard file name of documents
    uid = str(id).split('-', 1)[0]
    catalog = self.getPortalObject().portal_catalog
    found = catalog.unrestrictedGetResultValue(uid=uid)
    if found is not None:
      return found.getObject() # Make sure this does not break security. XXX
    if default is not _marker:
      return default
    # Raise an AttributeError the same way as in OFS.ObjectManager._getOb
    raise AttributeError(id)

451

Bartek Górny's avatar
Bartek Górny committed
452
  def listDAVObjects(self):
    """
      Get all contents contributed by the current user.

      This is delegated to the script ContributionTool_getMyContentList
      if it can be found on the tool, in order to help customisation.
      Else the catalog is searched for the documents of the "my document"
      portal types owned by the current user.

      Returns a generator of documents, each of them wrapped in a context
      whose id is '<uid>-<standard filename>'.

      XXX Killer feature, it is not scalable
    """
    list_script = getattr(self, 'ContributionTool_getMyContentList', None)
    if list_script is None:
      user = getSecurityManager().getUser()
      result_list = self.portal_catalog(
                                portal_type=self.getPortalMyDocumentTypeList(),
                                owner=str(user))
    else:
      result_list = list_script()

    def iterDAVDocument(brain_list):
      for brain in brain_list:
        document = brain.getObject()
        dav_id = '%s-%s' % (document.getUid(),
                            document.getStandardFilename())
        yield document.asContext(id=dav_id)

    return iterDAVDocument(result_list)
Bartek Górny's avatar
Bartek Górny committed
474

Jean-Paul Smets's avatar
Jean-Paul Smets committed
475
  security.declareProtected(Permissions.AddPortalContent, 'crawlContent')
476
  def crawlContent(self, content, container=None):
    """
      Analyses content and downloads linked pages.

      For every normalised URL found in content, a newContentFromURL
      activity is posted on the tool, unless the protocol of the URL is in
      no_crawl_protocol_list or the URL is already known by
      portal_url_registry. New documents are created in container, which
      defaults to the parent of content.

      Nothing is done once the crawling depth of content is exhausted.

      XXX: missing is the conversion of content local href to something
      valid.
    """
    portal = self.getPortalObject()
    url_registry = portal.portal_url_registry
    remaining_depth = content.getCrawlingDepth()
    if remaining_depth < 0:
      # Do nothing if crawling depth is reached
      # (this is not duplicate code but a way to prevent
      # calling isIndexContent unnecessarily)
      return
    if not content.isIndexContent():
      # Decrement depth only if it is a content document
      remaining_depth -= 1
    if remaining_depth < 0:
      # Do nothing if crawling depth is reached
      return
    for url in set(content.getContentNormalisedURLList()):
      # Some url protocols should not be crawled
      protocol = urlparse.urlsplit(url)[0]
      if protocol in no_crawl_protocol_list:
        continue
      if container is None:
        #if content.getParentValue()
        # in place of not ?
        container = content.getParentValue()
      try:
        url_registry.getReferenceFromURL(url, context=container)
      except KeyError:
        already_crawled = False
      else:
        already_crawled = True
      if already_crawled:
        continue
      # XXX - This call is not working due to missing group_method_id
      # therefore, multiple calls happen in parallel and eventually fail
      # (the same URL is created multiple times)
      self.activate(activity="SQLQueue").newContentFromURL(
                                  container_path=container.getRelativeUrl(),
                                  url=url, crawling_depth=remaining_depth)
      # The url is not known yet but it is registered right now to avoid
      # the creation of duplicated crawled content.
      # An activity will later set up the good reference for it.
      url_registry.registerURL(url, None, context=container)
Jean-Paul Smets's avatar
Jean-Paul Smets committed
524 525

  security.declareProtected(Permissions.AddPortalContent, 'updateContentFromURL')
526
  def updateContentFromURL(self, content, repeat=MAX_REPEAT, crawling_depth=0):
    """
      Updates an existing content by downloading it again from its URL.

      content - the document to update
      repeat - number of attempts left if the download fails. While it
               is not 0, a failed download posts a new
               updateContentFromURL activity on content (at
               DateTime() + 1) with repeat decremented. Once it is 0,
               the download error is raised.
      crawling_depth - set on content if greater than its current
                       crawling depth

      Nothing is done if content is not updatable.
    """
    # First, test if the document is updatable according to
    # its workflow states (if it has a workflow associated with)
    if not content.isUpdatable():
      return
    # Step 0: update crawling_depth if required
    if crawling_depth > content.getCrawlingDepth():
      content._setCrawlingDepth(crawling_depth)
    # Step 1: download new content
    try:
      url = content.asURL()
      file_object, filename, content_type = self._openURL(url)
    except urllib2.URLError:
      # urllib2.HTTPError is a subclass of urllib2.URLError: both kinds
      # of failure are handled here, the same way.
      if repeat == 0:
        # XXX - Call the extendBadURLList method,--NOT Implemented--
        # IDEA: add the URL in question to a "bad_url_list"; then, during
        #       crawling, instead of only looping over the list of URLs
        #       extracted from the web page, do an additional test which
        #       checks that the URL is not in bad_url_list
        raise
      content.activate(at_date=DateTime() + 1).updateContentFromURL(
                                                          repeat=repeat - 1)
      return

    # Please make sure that if content is the same
    # we do not update it
    # This feature must be implemented by Base or File
    # not here (look at _edit in Base)
    content._edit(file=file_object, content_type=content_type)
    # Step 2: convert to base format
    if content.isSupportBaseDataConversion():
      content.activate().Document_tryToConvertToBaseFormat()
    # Step 3: run discoverMetadata
    conversion_method_id_list = ('convertToBaseFormat',
                                 'Document_tryToConvertToBaseFormat')
    content.activate(after_path_and_method_id=(
        content.getPath(), conversion_method_id_list)) \
        .discoverMetadata(filename=filename)
    # Step 4: activate populate (unless interaction workflow does it)
    content.activate().populateContent()
    # Step 5: activate crawlContent
    if content.getCrawlingDepth() > 0:
      content.activate().crawlContent()
Jean-Paul Smets's avatar
Jean-Paul Smets committed
574 575

  security.declareProtected(Permissions.AddPortalContent, 'newContentFromURL')
Nicolas Delaby's avatar
Nicolas Delaby committed
576 577
  def newContentFromURL(self, container_path=None, id=None, repeat=MAX_REPEAT,
                        repeat_interval=1, batch_mode=True, url=None, **kw):
    """
      A wrapper method for newContent which provides extra safety
      in case of errors (ie. download, access, conflict, etc.).
      The method is able to handle a certain number of exceptions
      and can postpone itself through an activity based on
      the type of exception (ex. for a 404, postpone 1 day), using
      the at_date parameter and some standard values.

      container_path - passed to newContent
      id - ignored
      repeat - number of attempts left if the download fails
      repeat_interval - added to DateTime() to get the date of the
                        next attempt
      batch_mode - if true, the download error is raised once repeat
                   is 0. Else it is silently ignored.
      url - mandatory, passed to newContent

      Returns the created document, or None if the download failed.
      Raises TypeError if url is not provided.

      NOTE: implementation needs to be done.
    """
    if not url:
      raise TypeError('url parameter is mandatory')

    def retryLater(**activate_kw):
      # Post a new attempt, with one attempt less left.
      self.activate(at_date=DateTime() + repeat_interval,
                    **activate_kw).newContentFromURL(
                        container_path=container_path, url=url,
                        repeat=repeat - 1,
                        repeat_interval=repeat_interval, **kw)

    document = None
    try:
      document = self.newContent(container_path=container_path, url=url, **kw)
      crawling_depth = document.getCrawlingDepth()
      # An index document keeps on crawling even if its crawling depth
      # is 0. Any other document stops crawling if its crawling depth is 0.
      if crawling_depth > 0 or \
         (crawling_depth == 0 and document.isIndexContent()):
        document.activate().crawlContent()
    except urllib2.HTTPError:
      if repeat == 0 and batch_mode:
        # here we must call the extendBadURLList method,--NOT Implemented--
        # which had to add this url to bad URL list, so next time we avoid
        # crawling bad URL
        raise
      if repeat > 0:
        # Catch any HTTP error
        # NOTE(review): this retry uses the default activity whereas the
        # URLError retry below uses SQLQueue - confirm this is intended.
        retryLater()
    except urllib2.URLError:
      if repeat == 0 and batch_mode:
        # XXX - Call the extendBadURLList method, --NOT Implemented--
        raise
      #if getattr(error.reason,'args',None):
        #if error.reason.args[0] == socket.EAI_AGAIN:
          ## Temporary failure in name resolution - try again in 1 day
      if repeat > 0:
        retryLater(activity="SQLQueue")
    return document
Jean-Paul Smets's avatar
Jean-Paul Smets committed
626

Nicolas Delaby's avatar
Nicolas Delaby committed
627 628 629
  security.declareProtected(Permissions.AccessContentsInformation,
                            'guessMimeTypeFromFilename')
  def guessMimeTypeFromFilename(self, filename):
    """
      Get the mime type of a file from its name.

      The extension is looked up in mimetypes_registry. The result is
      returned as a string. If filename is empty, None is returned. If
      the registry returns a false value, that value is returned as is.
    """
    if filename:
      registry = self.getPortalObject().mimetypes_registry
      mime_type = registry.lookupExtension(filename)
      if not mime_type:
        return mime_type
      return str(mime_type)
    return None

  def _openURL(self, url):
    """Download content from url,
    read filename and content_type
    return file_object, filename, content_type tuple

    The path part of url is quoted before the download. Escape sequences
    which are already present in the path (ex. %20) are kept as is.

    The filename is read from the content-disposition header if it
    provides one. Else it is built from the last part of the path of the
    url which was really downloaded (after redirections).

    Errors raised by urllib2.urlopen (urllib2.URLError and its subclass
    urllib2.HTTPError) are not caught here.
    """
    # Quote path part of url
    url_tuple = urlparse.urlsplit(url)
    # A '%' which does not start an escape sequence is escaped first, so
    # that '%' can then be declared safe: paths which are already quoted
    # are not quoted a second time (%20 must not become %2520).
    path = re.sub('%(?![0-9A-Fa-f]{2})', '%25', url_tuple[2])
    quoted_path = urllib.quote(path, safe='/%')
    url = urlparse.urlunsplit((url_tuple[0], url_tuple[1], quoted_path,
                               url_tuple[3], url_tuple[4]))
    # build a new file from the url
    url_file = urllib2.urlopen(urllib2.Request(url,
                                               headers={'Accept':'*/*'}))
    try:
      data = url_file.read() # time out must be set or ... too long XXX
      header_info = url_file.info()
      # In case of http redirection, the real url must be read
      # from file object returned by urllib2.urlopen.
      # It can happen when the header 'Location' is present in the response.
      # See http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.30
      real_url = url_file.geturl()
    finally:
      # Release the connection even if the download fails midway.
      url_file.close()
    file_object = cStringIO.StringIO()
    file_object.write(data)
    file_object.seek(0)
    # if a content-disposition header is present,
    # try first to read the suggested filename from it.
    content_disposition = header_info.getheader('content-disposition', '')
    filename = parse_header(content_disposition)[1].get('filename')
    if not filename:
      # Create a file name based on the URL and quote it
      filename = urlparse.urlsplit(real_url)[-3]
      filename = os.path.basename(filename)
      filename = urllib.quote(filename, safe='')
      filename = filename.replace('%', '')
    content_type = header_info.gettype()
    return file_object, filename, content_type
677

Ivan Tyagov's avatar
Ivan Tyagov committed
678
# Class initialisation, done once the class body is complete.
# NOTE(review): presumably this is what applies the ClassSecurityInfo
# declarations made through "security" in the class body - confirm
# against Products.ERP5Type.Globals.InitializeClass.
InitializeClass(ContributionTool)