Commit de2545fc authored by Nicolas Delaby's avatar Nicolas Delaby

Refactoring of DMS.

- file_name becomes filename
- filename values are not stored in source_reference
Contribution Tool will not honour id arguments.
Contribution Tool can create any kind of document.
Portal Contribution Registry can read the extension and content_type, and can read the content_type from data,
to guess the best Portal Type to use.

All discoverable methods (IDiscoverable) can change the portal_type of document.
  (migratePortalType)
User can change portal_type of document through UI with simple Action.
Crawling will not hardcode ids of documents depending on their URLs, thanks to the
Portal Url Registry





git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@40971 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 4627391c
......@@ -40,18 +40,15 @@ from Products.ERP5Type import Permissions, PropertySheet, interfaces
from Products.ERP5Type.XMLObject import XMLObject
from Products.ERP5Type.DateUtils import convertDateToHour,\
number_of_hours_in_day, number_of_hours_in_year
from Products.ERP5Type.Utils import convertToUpperCase, fill_args_from_request
from Products.ERP5Type.Utils import convertToUpperCase, fill_args_from_request,\
deprecated
from Products.ERP5Type.TransactionalVariable import getTransactionalVariable
from Products.ERP5Type.Cache import getReadOnlyTransactionCache
from Products.ERP5.Document.Url import UrlMixIn
from Products.ERP5.Tool.ContributionTool import MAX_REPEAT
from Products.ERP5Type.UnrestrictedMethod import unrestricted_apply
from Products.ZSQLCatalog.SQLCatalog import SQLQuery
from AccessControl import Unauthorized
import zope.interface
from Products.PythonScripts.Utility import allow_class
import tempfile
from subprocess import Popen, PIPE
# Mixin Import
from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
......@@ -60,9 +57,10 @@ from Products.ERP5.mixin.downloadable import DownloadableMixin
from Products.ERP5.mixin.document import DocumentMixin
from Products.ERP5.mixin.extensible_traversable import DocumentExtensibleTraversableMixin
from Products.ERP5.mixin.crawlable import CrawlableMixin
from Products.ERP5.mixin.discoverable import DiscoverableMixin
from Products.ERP5.mixin.url import UrlMixin
_MARKER = []
VALID_ORDER_KEY_LIST = ('user_login', 'content', 'file_name', 'input')
# these property ids are unchangeable
FIXED_PROPERTY_IDS = ('id', 'uid', 'rid', 'sid')
......@@ -88,8 +86,9 @@ class DocumentProxyError(Exception):pass
class NotConvertedError(Exception):pass
allow_class(NotConvertedError)
class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedConvertableMixin,
CrawlableMixin, TextConvertableMixin, DownloadableMixin, DocumentMixin):
class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixin,
CachedConvertableMixin, CrawlableMixin, TextConvertableMixin,
DownloadableMixin, DocumentMixin, DiscoverableMixin):
"""Document is an abstract class with all methods related to document
management in ERP5. This includes searchable text, explicit relations,
implicit relations, metadata, versions, languages, etc.
......@@ -144,7 +143,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
input - data supplied with http request or set on the object during (2) (e.g.
discovered from email text)
file_name - data which might be encoded in file name
filename - data which might be encoded in filename
user_login - information about user who is contributing the file
content - data which might be derived from document content
......@@ -156,7 +155,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
Methods for discovering metadata are:
getPropertyDictFromInput
getPropertyDictFromFileName
getPropertyDictFromFilename
getPropertyDictFromUserLogin
getPropertyDictFromContent
......@@ -266,10 +265,15 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
interfaces.IVersionable,
interfaces.IDownloadable,
interfaces.ICrawlable,
interfaces.IDocument
interfaces.IDocument,
interfaces.IDiscoverable,
interfaces.IUrl,
)
# Regular expressions
# XXX those regex are weak, fast but not reliable.
# this is a valid url that the regexes are not able to parse:
# http://www.example.com//I don't care i put what/ i want/
href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
......@@ -639,141 +643,14 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
if not reference:
return
catalog = self.getPortalObject().portal_catalog
res = catalog(reference=self.getReference(), sort_on=(('creation_date','ascending'),))
result_list = catalog.unrestrictedSearchResults(
reference=self.getReference(),
sort_on=(('creation_date',
'ascending'),))
# XXX this should be security-unaware - delegate to script with proxy roles
return res[0].getLanguage() # XXX what happens if it is empty?
### Property getters
# Property Getters are document dependent so that we can
# handle the weird cases in which needed properties change with the type of document
# and the usual cases in which accessing content changes with the meta type
security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromUserLogin')
def getPropertyDictFromUserLogin(self, user_login=None):
"""
Based on the user_login, find out as many properties as needed.
returns properties which should be set on the document
"""
if user_login is None:
user_login = str(getSecurityManager().getUser())
method = self._getTypeBasedMethod('getPropertyDictFromUserLogin',
fallback_script_id='Document_getPropertyDictFromUserLogin')
return method(user_login)
security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromContent')
def getPropertyDictFromContent(self):
"""
Based on the document content, find out as many properties as needed.
returns properties which should be set on the document
"""
# access data through convert
mime, content = self.convert(None)
if not content:
# if document is empty, we will not find anything in its content
return {}
method = self._getTypeBasedMethod('getPropertyDictFromContent',
fallback_script_id='Document_getPropertyDictFromContent')
return method()
security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromFileName')
def getPropertyDictFromFileName(self, file_name):
"""
Based on the file name, find out as many properties as needed.
returns properties which should be set on the document
"""
return self.portal_contributions.getPropertyDictFromFileName(file_name)
security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromInput')
def getPropertyDictFromInput(self):
"""
Get properties which were supplied explicitly to the ingestion method
(discovered or supplied before the document was created).
The implementation consists in saving document properties
into _backup_input by supposing that original input parameters were
set on the document by ContributionTool.newContent as soon
as the document was created.
"""
kw = getattr(self, '_backup_input', {})
if kw:
return kw
for id in self.propertyIds():
# We should not consider file data
if id not in ('data', 'categories_list', 'uid', 'id',
'text_content', 'base_data',) \
and self.hasProperty(id):
kw[id] = self.getProperty(id)
self._backup_input = kw # We could use volatile and pass kw in activate
# if we are guaranteed that _backup_input does not
# disappear within a given transaction
return kw
### Metadata discovery and ingestion methods
security.declareProtected(Permissions.ModifyPortalContent, 'discoverMetadata')
def discoverMetadata(self, file_name=None, user_login=None):
"""
This is the main metadata discovery function - controls the process
of discovering data from various sources. The discovery itself is
delegated to scripts or uses preference-configurable regexps. The
method returns either self or the document which has been
merged in the discovery process.
file_name - this parameter is a file name of the form "AA-BBB-CCC-223-en"
user_login - this is a login string of a person; can be None if the user is
currently logged in, then we'll get him from session
"""
# Preference is made of a sequence of 'user_login', 'content', 'file_name', 'input'
method = self._getTypeBasedMethod('getPreferredDocumentMetadataDiscoveryOrderList',
fallback_script_id = 'Document_getPreferredDocumentMetadataDiscoveryOrderList')
order_list = list(method())
order_list.reverse()
# build a dictionary according to the order
kw = {}
for order_id in order_list:
result = None
if order_id not in VALID_ORDER_KEY_LIST:
# Prevent security attack or bad preferences
raise AttributeError, "%s is not in valid order key list" % order_id
method_id = 'getPropertyDictFrom%s' % convertToUpperCase(order_id)
method = getattr(self, method_id)
if order_id == 'file_name':
if file_name is not None:
result = method(file_name)
elif order_id == 'user_login':
if user_login is not None:
result = method(user_login)
else:
result = method()
if result is not None:
for key, value in result.iteritems():
if value not in (None, ''):
kw[key]=value
if file_name is not None:
# filename is often undefined....
kw['source_reference'] = file_name
# Prepare the content edit parameters - portal_type should not be changed
kw.pop('portal_type', None)
# Try not to invoke an automatic transition here
self._edit(**kw)
# Finish ingestion by calling method
self.finishIngestion() # XXX - is this really the right place ?
self.reindexObject() # XXX - is this really the right place ?
# Revision merge is tightly coupled
# to metadata discovery - refer to the documentation of mergeRevision method
merged_doc = self.mergeRevision() # XXX - is this really the right place ?
merged_doc.reindexObject() # XXX - is this really the right place ?
return merged_doc # XXX - is this really the right place ?
security.declareProtected(Permissions.ModifyPortalContent, 'finishIngestion')
def finishIngestion(self):
"""
Finish the ingestion process by calling the appropriate script. This
script can for example allocate a reference number automatically if
no reference was defined.
"""
method = self._getTypeBasedMethod('finishIngestion', fallback_script_id='Document_finishIngestion')
return method()
if result_list:
return result_list[0].getLanguage()
return
security.declareProtected(Permissions.View, 'asSubjectText')
def asSubjectText(self, **kw):
......@@ -827,32 +704,13 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
return self._stripHTML(self._asHTML(**kw))
security.declarePrivate('_guessEncoding')
@deprecated
def _guessEncoding(self, string, mime='text/html'):
"""
Try to guess the encoding for this string.
Returns None if no encoding can be guessed.
Deprecated method
"""
try:
import chardet
except ImportError:
chardet = None
if chardet is not None and (mime == 'text/html'\
or os.sys.platform != 'linux2'):
# chardet works fine on html document and its platform independent
return chardet.detect(string).get('encoding', None)
else:
# file command provide better result
# for text/plain documents
# store the content into tempfile
file_descriptor, path = tempfile.mkstemp()
file_object = os.fdopen(file_descriptor, 'w')
file_object.write(string)
file_object.close()
# run file command against tempfile to and read encoded
command_result = Popen(['file', '-b', '--mime-encoding', path],
stdout=PIPE).communicate()[0]
# return detected encoding
return command_result.strip()
contribution_tool = self.getPortalObject().portal_contributions
return contribution_tool.guessEncodingFromText(string, content_type=mime)
def _stripHTML(self, html, charset=None):
"""
......@@ -866,22 +724,6 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
stripped_html = html
return stripped_html
security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
def getContentInformation(self):
"""
Returns the content information from the HTML conversion.
The default implementation tries to build a dictionary
from the HTML conversion of the document and extract
the document title.
"""
result = {}
html = self.asEntireHTML()
if not html: return result
title_list = re.findall(self.title_parser, str(html))
if title_list:
result['title'] = title_list[0]
return result
security.declareProtected(Permissions.AccessContentsInformation,
'getMetadataMappingDict')
def getMetadataMappingDict(self):
......@@ -918,21 +760,6 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
method = None
if method is not None: method()
# Crawling API
security.declareProtected(Permissions.AccessContentsInformation, 'getContentURLList')
def getContentURLList(self):
"""
Returns a list of URLs referenced by the content of this document.
Default implementation consists in analysing the document
converted to HTML. Subclasses may overload this method
if necessary. However, it is better to extend the conversion
methods in order to produce valid HTML, which is useful to
many people, rather than overload this method which is only
useful for crawling.
"""
html_content = self.asStrippedHTML()
return re.findall(self.href_parser, str(html_content))
security.declareProtected(Permissions.ModifyPortalContent, 'updateContentFromURL')
def updateContentFromURL(self, repeat=MAX_REPEAT, crawling_depth=0):
"""
......@@ -963,18 +790,3 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
if hasattr(aq_base(container), 'isIndexContent'):
return container.isIndexContent(self)
return False
security.declareProtected(Permissions.AccessContentsInformation, 'getContentBaseURL')
def getContentBaseURL(self):
"""
Returns the content base URL based on the actual content or
on its URL.
"""
base_url = self.asURL()
base_url_list = base_url.split('/')
if len(base_url_list):
if base_url_list[-1] and base_url_list[-1].find('.') > 0:
# Cut the trailing part in http://www.some.site/at/trailing.html
# but not in http://www.some.site/at
base_url = '/'.join(base_url_list[:-1])
return base_url
......@@ -114,22 +114,14 @@ class PDFDocument(Image):
"""
if not self.hasData():
return ''
tmp = tempfile.NamedTemporaryFile()
tmp.write(self.getData())
tmp.seek(0)
try:
command = ['pdftotext', '-layout', '-enc', 'UTF-8',
'-nopgbrk', tmp.name, '-']
try:
command_result = Popen(command, stdout=PIPE).communicate()[0]
except OSError, e:
if e.errno == errno.ENOENT:
raise ConversionError('pdftotext was not found')
raise
finally:
tmp.close()
if command_result:
return command_result
mime_type = 'text/plain'
portal_transforms = self.getPortalObject().portal_transforms
filename = self.getStandardFilename(format='txt')
result = portal_transforms.convertToData(mime_type, str(self.getData()),
context=self, filename=filename,
mimetype=self.getContentType())
if result:
return result
else:
# Try to use OCR
# As high dpi images are required, it may take some times to convert the
......@@ -145,13 +137,12 @@ class PDFDocument(Image):
frame=page_number, display='identical')
if not src_mimetype.endswith('png'):
continue
content = '%s' % png_data
mime_type = 'text/plain'
content = str(png_data)
if content is not None:
portal_transforms = getToolByName(self, 'portal_transforms')
filename = self.getStandardFilename(format='png')
result = portal_transforms.convertToData(mime_type, content,
context=self,
filename=self.getTitleOrId(),
filename=filename,
mimetype=src_mimetype)
if result is None:
raise ConversionError('PDFDocument conversion error. '
......
......@@ -45,6 +45,9 @@ try:
from string import Template
except ImportError:
from Products.ERP5Type.patches.string import Template
from Products.ERP5Type.Utils import guessEncodingFromText
from lxml import html as etree_html
class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
TextContent, File):
......@@ -147,7 +150,7 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
kw['format'] = format
if not self.hasConversion(**kw):
portal_transforms = getToolByName(portal, 'portal_transforms')
filename = self.getSourceReference(self.getTitleOrId())
filename = self.getStandardFilename(format=format)
if mime_type == 'text/html':
mime_type = 'text/x-html-safe'
result = portal_transforms.convertToData(mime_type, text_content,
......@@ -183,9 +186,13 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
"""
if self.hasTextContent():
html = self._asHTML()
base_list = re.findall(self.base_parser, str(html))
if base_list:
return base_list[0]
# a document can be entirely stripped by safe_html
# so its html conversion can be empty
if html.strip():
html_tree = etree_html.fromstring(html)
base_list = [href for href in html_tree.xpath('//base/@href') if href]
if base_list:
return str(base_list[0])
return Document.getContentBaseURL(self)
security.declareProtected(Permissions.ModifyPortalContent, 'setBaseData')
......@@ -270,14 +277,14 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
return encoded content_type and message if encoding
is not utf-8
"""
codec = document._guessEncoding(text_content, content_type)
codec = guessEncodingFromText(text_content, content_type)
if codec is not None:
try:
text_content = text_content.decode(codec).encode('utf-8')
except (UnicodeDecodeError, LookupError):
message = 'Conversion to base format with codec %r fails' % codec
# try again with another guesser based on file command
codec = document._guessEncoding(text_content, 'text/plain')
codec = guessEncodingFromText(text_content, 'text/plain')
if codec is not None:
try:
text_content = text_content.decode(codec).encode('utf-8')
......
......@@ -29,7 +29,7 @@
from AccessControl import ClassSecurityInfo
from Products.ERP5Type.Globals import InitializeClass
from Products.ERP5Type.Tool.BaseTool import BaseTool
from Products.ERP5Type import Permissions
class ContributionRegistryTool(BaseTool):
......@@ -41,14 +41,18 @@ class ContributionRegistryTool(BaseTool):
security = ClassSecurityInfo()
security.declarePrivate('findPortalTypeName')
def findPortalTypeName(self, file_name='', mime_type=None, data=None):
from Products.ERP5Type.Document import newTempIngestionFile
ingestion_file = newTempIngestionFile(self, 'id')
ingestion_file._edit(file_name=file_name, mime_type=mime_type, data=data)
security.declareProtected(Permissions.AccessContentsInformation,
'findPortalTypeName')
def findPortalTypeName(self, context=None, **kw):
# if a context is passed, ignore other arguments
if context is None:
# Build a temp object edited with provided parameters
from Products.ERP5Type.Document import newTempFile
context = newTempFile(self, 'id')
context.edit(**kw)
for predicate in self.objectValues(sort_on='int_index'):
result = predicate.test(ingestion_file)
result = predicate.test(context)
if result:
return result
......
......@@ -29,12 +29,7 @@
import cStringIO
import re
import string
import socket
try:
from hashlib import md5 as md5_new
except ImportError:
from md5 import new as md5_new
import urllib2, urllib
import urlparse
from cgi import parse_header
......@@ -46,13 +41,11 @@ from Products.CMFCore.utils import getToolByName, _checkPermission
from Products.ERP5Type.Tool.BaseTool import BaseTool
from Products.ERP5Type import Permissions
from Products.ERP5 import _dtmldir
from Products.ERP5.Document.Url import no_crawl_protocol_list, no_host_protocol_list
from Products.ERP5.Document.Url import no_crawl_protocol_list
from AccessControl import Unauthorized
from zLOG import LOG
from DateTime import DateTime
from Acquisition import aq_base
from zExceptions import BadRequest
import warnings
# Install openers
import ContributionOpener
......@@ -83,7 +76,7 @@ class ContributionTool(BaseTool):
Configuration Scripts:
- ContributionTool_getPropertyDictFromFileName: receives file name and a
- ContributionTool_getPropertyDictFromFilename: receives file name and a
dict derived from the filename by regular expression, and does any necessary
operations (e.g. mapping document type id onto a real portal_type).
......@@ -98,8 +91,7 @@ class ContributionTool(BaseTool):
meta_type = 'ERP5 Contribution Tool'
portal_type = 'Contribution Tool'
# Regular expressions
simple_normaliser = re.compile('#.*')
# Declarative Security
security = ClassSecurityInfo()
......@@ -108,153 +100,141 @@ class ContributionTool(BaseTool):
manage_overview = DTMLFile( 'explainContributionTool', _dtmldir )
security.declareProtected(Permissions.AddPortalContent, 'newContent')
def newContent(self, id=None, portal_type=None, url=None, container=None,
container_path=None,
discover_metadata=1, temp_object=0,
user_login=None, data=None, file_name=None, **kw):
def newContent(self, **kw):
"""
The newContent method is overriden to implement smart content
creation by detecting the portal type based on whatever information
was provided and finding out the most appropriate module to store
the content.
user_login is the name under which the content will be created
XXX - this is a security hole which needs to be fixed by
making sure only Manager can use this parameter
container -- if specified, it is possible to define
where to contribute the content. Else, ContributionTool
tries to guess.
container_path -- if specified, defines the container path
and has precedence over container
url -- if specified, content is download from the URL.
NOTE:
We always generate ID. So, we must prevent using the one
which we were provided.
explicit named parameters were:
id - ignored argument
portal_type - explicit portal_type parameter, must be honoured
url - Identifier of external resource. Content will be downloaded
from it
container - if specified, it is possible to define
where to contribute the content. Else, ContributionTool
tries to guess.
container_path - if specified, defines the container path
and has precedence over container
discover_metadata - Enable metadata extraction and discovery
(default True)
temp_object - build tempObject or not (default False)
user_login - is the name under which the content will be created
XXX - this is a security hole which needs to be fixed by
making sure only Manager can use this parameter
data - Binary representation of content
filename - explicit filename of content
"""
if file_name is not None:
kw['file_name'] = file_name
if data is not None:
# This is only used to make sure
# we can pass file as parameter to ZPublisher
# whenever we ingest email
kw['data'] = data
kw.pop('id', None) # Never use hardcoded ids anymore longer
# Useful for metadata discovery, keep it as it as been provided
input_parameter_dict = kw.copy()
# But file and data are exceptions.
# They are potentially too big to be kept in memory.
# We want to keep only one reference to those values,
# on the future created document only!
if 'file' in input_parameter_dict:
del input_parameter_dict['file']
if 'data' in input_parameter_dict:
del input_parameter_dict['data']
# pop: remove keys which are not document properties
url = kw.pop('url', None)
container = kw.pop('container', None)
container_path = kw.pop('container_path', None)
discover_metadata = kw.pop('discover_metadata', True)
user_login = kw.pop('user_login', None)
# check file_name argument for backward compatibility.
if 'file_name' in kw:
if 'filename' not in kw:
kw['filename'] = kw['file_name']
del(kw['file_name'])
filename = kw.get('filename', None)
portal_type = kw.get('portal_type')
temp_object = kw.get('temp_object', False)
document = None
# Try to find the file_name
portal = self.getPortalObject()
# Try to find the filename
content_type = None
if not url:
# check if file was provided
file = kw.get('file', None)
if file is not None and file_name is None:
file_name = file.filename
file_object = kw.get('file')
if file_object is not None:
if not filename:
filename = file_object.filename
else:
# some channels supply data and file-name separately
# this is the case for example for email ingestion
# in this case, we build a file wrapper for it
data = kw.get('data', None)
if data is not None:
file_name = kw.get('file_name', None)
if file_name is not None:
file = cStringIO.StringIO()
file.write(data)
file.seek(0)
kw['file'] = file
del kw['data']
del kw['file_name']
data = kw.get('data')
if data is not None and filename:
file_object = cStringIO.StringIO()
file_object.write(data)
file_object.seek(0)
kw['file'] = file_object
del kw['data']
else:
raise TypeError, 'data and filename must be provided'
else:
# build a new file from the url
url_file = urllib2.urlopen(url)
data = url_file.read() # time out must be set or ... too long XXX
file = cStringIO.StringIO()
file.write(data)
file.seek(0)
# if a content-disposition header is present,
# try first to read the suggested filename from it.
header_info = url_file.info()
content_disposition = header_info.getheader('content-disposition', '')
file_name = parse_header(content_disposition)[1].get('filename')
if not file_name:
# Now read the filename from url.
# In case of http redirection, the real url must be read
# from file object returned by urllib2.urlopen.
# It can happens when the header 'Location' is present in request.
# See http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.30
url = url_file.geturl()
# Create a file name based on the URL and quote it
file_name = urlparse.urlsplit(url)[-3]
file_name = os.path.basename(file_name)
file_name = urllib.quote(file_name, safe='')
file_name = file_name.replace('%', '')
# For URLs, we want an id by default equal to the encoded URL
if id is None:
id = self.encodeURL(url)
content_type = header_info.gettype()
file_object, filename, content_type = self._openURL(url)
if content_type:
kw['content_type'] = content_type
kw['file'] = file
kw['file'] = file_object
# If the portal_type was provided, we can go faster
if portal_type and container is None:
# We know the portal_type, let us find the default module
# and use it as container
try:
container = self.getDefaultModule(portal_type)
container = portal.getDefaultModule(portal_type)
except ValueError:
container = None
if portal_type and container is not None:
# We could simplify things here and return a document immediately
# NOTE: we use the module ID generator rather than the provided ID
#document = module.newContent(portal_type=portal_type, **kw)
#if discover_metadata:
# document.activate().discoverMetadata(file_name=file_name, user_login=user_login)
#return document
pass # XXX - This needs to be implemented once the rest is stable
# From here, there is no hope unless a file was provided
if file is None:
raise ValueError, "could not determine portal type"
if file_object is None:
raise ValueError, "No data provided"
if portal_type is None:
# Guess it with help of portal_contribution_registry
registry = getToolByName(portal, 'portal_contribution_registry')
portal_type = registry.findPortalTypeName(filename=filename,
content_type=content_type)
#
# Check if same file is already exists. if it exists, then update it.
#
if portal_type is None:
portal_type = self._guessPortalType(file_name, content_type, data)
property_dict = self.getMatchedFileNamePatternDict(file_name)
reference = property_dict.get('reference', None)
version = property_dict.get('version', None)
language = property_dict.get('language', None)
if portal_type and reference and version and language:
portal_catalog = getToolByName(self, 'portal_catalog')
document = portal_catalog.getResultValue(portal_type=portal_type,
reference=reference,
version=version,
language=language)
if document is not None:
# document is already uploaded. So overrides file.
if not _checkPermission(Permissions.ModifyPortalContent, document):
raise Unauthorized, "[DMS] You are not allowed to update the existing document which has the same coordinates (id %s)" % document.getId()
document.edit(file=kw['file'])
return document
property_dict = self.getMatchedFilenamePatternDict(filename)
reference = property_dict.get('reference', None)
version = property_dict.get('version', None)
language = property_dict.get('language', None)
if portal_type and reference and version and language:
portal_catalog = getToolByName(portal, 'portal_catalog')
document = portal_catalog.getResultValue(portal_type=portal_type,
reference=reference,
version=version,
language=language)
if document is not None:
# document is already uploaded. So overrides file.
if not _checkPermission(Permissions.ModifyPortalContent, document):
raise Unauthorized, "[DMS] You are not allowed to update the existing document which has the same coordinates (id %s)" % document.getId()
document.edit(file=kw['file'])
return document
# Temp objects use the standard newContent from Folder
if temp_object:
# For temp_object creation, use the standard method
return BaseTool.newContent(self, id=id, portal_type=portal_type,
temp_object=temp_object, **kw)
kw['portal_type'] = portal_type
return BaseTool.newContent(self, **kw)
# Then put the file inside ourselves for a short while
if container_path is not None:
container = self.getPortalObject().restrictedTraverse(container_path)
document = self._setObject(file_name, None, portal_type=portal_type,
user_login=user_login, id=id,
container=container,
document = self._setObject(filename, None, portal_type=portal_type,
user_login=user_login, container=container,
discover_metadata=discover_metadata,
filename=filename,
input_parameter_dict=input_parameter_dict
)
object_id = document.getId()
document = self._getOb(object_id) # Call _getOb to purge cache
......@@ -264,18 +244,12 @@ class ContributionTool(BaseTool):
if modified_kw is not None:
kw.update(modified_kw)
kw['filename'] = filename # Override filename property
# Then edit the document contents (so that upload can happen)
document._edit(**kw)
# if no content_type has been set, guess it
if 'content_type' not in kw and getattr(document, 'guessMimeType', None) is not None:
# For File force to setup the mime_type
document.guessMimeType(fname=file_name)
if url:
document.fromURL(url)
# Notify workflows
#document.notifyWorkflowCreated()
# Allow reindexing, reindex it and return the document
try:
delattr(document, 'isIndexable')
......@@ -293,17 +267,19 @@ class ContributionTool(BaseTool):
"""
pass
security.declareProtected(Permissions.ModifyPortalContent,'getMatchedFileNamePatternDict')
def getMatchedFileNamePatternDict(self, file_name):
security.declareProtected(Permissions.ModifyPortalContent,
'getMatchedFilenamePatternDict')
def getMatchedFilenamePatternDict(self, filename):
"""
Get matched group dict of file name parsing regular expression.
"""
property_dict = {}
if file_name is None:
if filename is None:
return property_dict
regex_text = self.portal_preferences.getPreferredDocumentFileNameRegularExpression()
regex_text = self.portal_preferences.\
getPreferredDocumentFilenameRegularExpression()
if regex_text in ('', None):
return property_dict
......@@ -311,42 +287,55 @@ class ContributionTool(BaseTool):
pattern = re.compile(regex_text)
if pattern is not None:
try:
property_dict = pattern.match(file_name).groupdict()
property_dict = pattern.match(filename).groupdict()
except AttributeError: # no match
pass
return property_dict
security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromFileName')
def getPropertyDictFromFileName(self, file_name):
# backward compatibility
security.declareProtected(Permissions.ModifyPortalContent,
'getMatchedFileNamePatternDict')
def getMatchedFileNamePatternDict(self, filename):
"""
(deprecated) use getMatchedFilenamePatternDict() instead.
"""
warnings.warn('getMatchedFileNamePatternDict() is deprecated. '
'use getMatchedFilenamePatternDict() instead.')
return self.getMatchedFilenamePatternDict(filename)
security.declareProtected(Permissions.ModifyPortalContent,
'getPropertyDictFromFilename')
def getPropertyDictFromFilename(self, filename):
  """
  Extract document properties from a file name.

  The file name is parsed with the regular expression configured in
  preferences (its named groups become properties); the resulting
  dictionary is then post-processed by a type based method which may
  add or override values.
  """
  if filename is None:
    return {}
  matched_property_dict = self.getMatchedFilenamePatternDict(filename)
  type_based_method = self._getTypeBasedMethod(
    'getPropertyDictFromFilename',
    fallback_script_id='ContributionTool_getPropertyDictFromFilename')
  return type_based_method(filename, matched_property_dict)
# backward compatibility
security.declareProtected(Permissions.ModifyPortalContent,
'getPropertyDictFromFileName')
def getPropertyDictFromFileName(self, filename):
  """
  (deprecated) use getPropertyDictFromFilename() instead.

  Backward-compatibility alias kept for scripts still using the old
  file_name-based naming; it simply delegates to the new method.
  """
  # Emit the warning with the proper DeprecationWarning category so
  # that deprecation filters and test runners can detect it.
  warnings.warn('getPropertyDictFromFileName() is deprecated. '
                'use getPropertyDictFromFilename() instead.',
                DeprecationWarning)
  return self.getPropertyDictFromFilename(filename)
# WebDAV virtual folder support
def _setObject(self, name, ob, portal_type=None, user_login=None,
container=None, id=None, discover_metadata=1):
def _setObject(self, id, ob, portal_type=None, user_login=None,
container=None, discover_metadata=True, filename=None,
input_parameter_dict=None):
"""
portal_contribution_registry will find appropriate portal type
name by file_name and content itself.
name by filename and content itself.
The ContributionTool instance must be configured in such
way that _verifyObjectPaste will return TRUE.
......@@ -362,9 +351,8 @@ class ContributionTool(BaseTool):
# redefine parameters
portal_type = ob.getPortalType()
container = ob.getParentValue()
id = ob.getId()
if not portal_type:
document = BaseTool.newContent(self, id=name,
document = BaseTool.newContent(self, id=id,
portal_type=portal_type,
is_indexable=0)
else:
......@@ -379,33 +367,27 @@ class ContributionTool(BaseTool):
module = self.getDefaultModule(portal_type)
else:
module = container
if id is None:
new_id = module.generateNewId()
else:
new_id = id
existing_document = module._getOb(new_id, None)
if existing_document is None:
# There is no preexisting document - we can therefore
# set the new object
document = module.newContent(id=new_id,
portal_type=portal_type,
is_indexable=0)
# We can now discover metadata
if discover_metadata:
# Metadata discovery is done as an activity by default
# If we need to discoverMetadata synchronously, it must
# be for user interface and should thus be handled by
# ZODB scripts
document.activate(after_path_and_method_id=(document.getPath(),
('convertToBaseFormat', 'Document_tryToConvertToBaseFormat'))) \
.discoverMetadata(file_name=name, user_login=user_login)
else:
document = existing_document
# There is no preexisting document - we can therefore
# set the new object
document = module.newContent(portal_type=portal_type, is_indexable=0)
# We can now discover metadata
if discover_metadata:
# Metadata discovery is done as an activity by default
# If we need to discoverMetadata synchronously, it must
# be for user interface and should thus be handled by
# ZODB scripts
document.activate(after_path_and_method_id=(document.getPath(),
('convertToBaseFormat', 'Document_tryToConvertToBaseFormat'))) \
.discoverMetadata(filename=filename,
user_login=user_login,
input_parameter_dict=input_parameter_dict)
# Keep the document close to us - this is only useful for
# file upload from webdav
if not hasattr(self, '_v_document_cache'):
volatile_cache = getattr(self, '_v_document_cache', None)
if volatile_cache is None:
self._v_document_cache = {}
self._v_document_cache[document.getId()] = document.getRelativeUrl()
volatile_cache = self._v_document_cache
volatile_cache[document.getId()] = document.getRelativeUrl()
# Return document to newContent method
return document
......@@ -417,10 +399,11 @@ class ContributionTool(BaseTool):
"""
# Use the document cache if possible and return result immediately
# this is only useful for webdav
if hasattr(self, '_v_document_cache'):
document_url = self._v_document_cache.get(id, None)
volatile_cache = getattr(self, '_v_document_cache', None)
if volatile_cache is not None:
document_url = volatile_cache.get(id)
if document_url is not None:
del self._v_document_cache[id]
del volatile_cache[id]
return self.getPortalObject().unrestrictedTraverse(document_url)
# Try first to return the real object inside
......@@ -475,66 +458,11 @@ class ContributionTool(BaseTool):
def wrapper(o_list):
for o in o_list:
o = o.getObject()
id = '%s-%s' % (o.getUid(), o.getStandardFileName(),)
id = '%s-%s' % (o.getUid(), o.getStandardFilename(),)
yield o.asContext(id=id)
return wrapper(object_list)
# Crawling methods
security.declareProtected(Permissions.View, 'normaliseURL')
def normaliseURL(self, url, base_url=None):
  """
  Return a normalised version of ``url`` so that the same content is
  not downloaded twice. URL normalisation is an important part of a
  crawler; this implementation is deliberately simplistic. Refer to
  http://en.wikipedia.org/wiki/Web_crawler and study Harvestman for
  more ideas.
  """
  normalised = self.simple_normaliser.sub('', url)
  parts = normalised.split(':')
  if parts[0] in no_host_protocol_list:
    # Protocols without a host part are returned untouched.
    return normalised
  if base_url and len(parts) == 1:
    # No protocol prefix: treat as a relative URL and make it absolute.
    normalised = '%s/%s' % (base_url, normalised)
  return normalised
security.declareProtected(Permissions.View, 'encodeURL')
def encodeURL(self, url):
  """
  Return the URL encoded as an id.

  The id is chosen in such a way that it is optimal with HBTreeFolder
  (ie. so that distribution of access time on a cluster is possible):
  it is built as '<domain>-<md5 of the url>', falling back to the bare
  md5 digest when no domain can be extracted.
  NOTE: an alternate approach is based on a url table and catalog
  lookup. Is it faster? Not sure, since we must anyway insert objects
  in btrees and this is similar in cost to accessing them.
  """
  # Produce an MD5 from the URL
  hex_md5 = md5_new(url).hexdigest()
  # Take the first part in the URL which is not empty.
  # NOTE(review): assumes the URL contains ':' (raises IndexError
  # otherwise) -- confirm callers always pass absolute URLs.
  url_segment = url.split(':')[1]
  url_segment_list = url_segment.split('/')
  url_domain = None
  for url_part in url_segment_list:
    if url_part:
      url_domain = url_part
      break
  # Return encoded url.
  # (Dead code that followed the returns below -- a leftover of an
  # older quoting-based implementation -- has been removed.)
  if url_domain:
    url_domain = urllib.quote(url_domain, safe='')
    url_domain = url_domain.replace('%', '')
    return "%s-%s" % (url_domain, hex_md5)
  return hex_md5
security.declareProtected(Permissions.AddPortalContent, 'crawlContent')
def crawlContent(self, content, container=None):
"""
......@@ -543,6 +471,8 @@ class ContributionTool(BaseTool):
XXX: missing is the conversion of content local href to something
valid.
"""
portal = self.getPortalObject()
url_registry_tool = portal.portal_url_registry
depth = content.getCrawlingDepth()
if depth < 0:
# Do nothing if crawling depth is reached
......@@ -554,32 +484,34 @@ class ContributionTool(BaseTool):
if depth < 0:
# Do nothing if crawling depth is reached
return
base_url = content.getContentBaseURL()
url_list = map(lambda url: self.normaliseURL(url, base_url), set(content.getContentURLList()))
url_list = content.getContentNormalisedURLList()
for url in set(url_list):
# LOG('trying to crawl', 0, url)
# Some url protocols should not be crawled
if url.split(':')[0] in no_crawl_protocol_list:
if urlparse.urlsplit(url)[0] in no_crawl_protocol_list:
continue
if container is None:
#if content.getParentValue()
# in place of not ?
container = content.getParentValue()
# Calculate the id under which content will be stored
id = self.encodeURL(url)
# Try to access the document if it already exists
document = container.get(id, None)
if document is None:
# XXX - This call is not working due to missing group_method_id
# therefore, multiple call happen in parallel and eventually fail
# (the same URL is created multiple times)
# LOG('activate newContentFromURL', 0, url)
self.activate(activity="SQLQueue").newContentFromURL(container_path=container.getRelativeUrl(),
id=id, url=url, crawling_depth=depth)
elif depth and document.getCrawlingDepth() < depth:
# Update the crawling depth if necessary
document._setCrawlingDepth(depth)
document.activate().crawlContent()
try:
url_registry_tool.getReferenceFromURL(url, context=container)
except KeyError:
pass
else:
# url already crawled
continue
# XXX - This call is not working due to missing group_method_id
# therefore, multiple call happen in parallel and eventually fail
# (the same URL is created multiple times)
# LOG('activate newContentFromURL', 0, url)
self.activate(activity="SQLQueue").newContentFromURL(
container_path=container.getRelativeUrl(),
url=url, crawling_depth=depth)
# Url is not known yet but register right now to avoid
# creation of duplicated crawled content
# An activity will later setup the good reference for it.
url_registry_tool.registerURL(url, None, context=container)
security.declareProtected(Permissions.AddPortalContent, 'updateContentFromURL')
def updateContentFromURL(self, content, repeat=MAX_REPEAT, crawling_depth=0):
......@@ -595,10 +527,7 @@ class ContributionTool(BaseTool):
# Step 1: download new content
try:
url = content.asURL()
data = urllib2.urlopen(url).read()
file = cStringIO.StringIO()
file.write(data)
file.seek(0)
file_object, filename, content_type = self._openURL(url)
except urllib2.HTTPError, error:
if repeat == 0:
# XXX - Call the extendBadURLList method,--NOT Implemented--
......@@ -615,28 +544,28 @@ class ContributionTool(BaseTool):
content.activate(at_date=DateTime() + 1).updateContentFromURL(repeat=repeat - 1)
return
# Step 2: compare and update if necessary (md5)
# md5 stuff to compare contents
new_content_md5 = md5_new(data).hexdigest()
content_md5 = content.getContentMd5()
if content_md5 == new_content_md5:
return
content._edit(file=file)# Please make sure that if content is the same
content._edit(file=file_object, content_type=content_type)
# Please make sure that if content is the same
# we do not update it
# This feature must be implemented by Base or File
# not here (look at _edit in Base)
# Step 3: convert to base format
content.convertToBaseFormat()
# Step 2: convert to base format
if content.isSupportBaseDataConversion():
content.activate().Document_tryToConvertToBaseFormat()
# Step 3: run discoverMetadata
content.activate(after_path_and_method_id=(content.getPath(),
('convertToBaseFormat', 'Document_tryToConvertToBaseFormat'))) \
.discoverMetadata(filename=filename)
# Step 4: activate populate (unless interaction workflow does it)
content.activate().populateContent()
# Step 5: activate crawlContent
depth = content.getCrawlingDepth()
if depth > 0:
content.activate().crawlContent()
content.setContentMd5(new_content_md5)
security.declareProtected(Permissions.AddPortalContent, 'newContentFromURL')
def newContentFromURL(self, container_path=None, id=None, repeat=MAX_REPEAT, repeat_interval=1, batch_mode=True, **kw):
def newContentFromURL(self, container_path=None, id=None, repeat=MAX_REPEAT,
repeat_interval=1, batch_mode=True, url=None, **kw):
"""
A wrapper method for newContent which provides extra safety
in case or errors (ie. download, access, conflict, etc.).
......@@ -646,17 +575,13 @@ class ContributionTool(BaseTool):
the at_date parameter and some standard values.
NOTE: implementation needs to be done.
id parameter is ignored
"""
document = None
# First of all, make sure do not try to create an existing document
if container_path is not None and id is not None:
container = self.restrictedTraverse(container_path)
document = container.get(id, None)
if document is not None:
# Document already exists: no need to keep on crawling
return document
if not url:
raise TypeError, 'url parameter is mandatory'
try:
document = self.newContent(container_path=container_path, id=id, **kw)
document = self.newContent(container_path=container_path, url=url, **kw)
if document.isIndexContent() and document.getCrawlingDepth() >= 0:
# If this is an index document, keep on crawling even if crawling_depth is 0
document.activate().crawlContent()
......@@ -672,7 +597,7 @@ class ContributionTool(BaseTool):
if repeat > 0:
# Catch any HTTP error
self.activate(at_date=DateTime() + repeat_interval).newContentFromURL(
container_path=container_path, id=id,
container_path=container_path, url=url,
repeat=repeat - 1,
repeat_interval=repeat_interval, **kw)
except urllib2.URLError, error:
......@@ -685,28 +610,57 @@ class ContributionTool(BaseTool):
if repeat > 0:
self.activate(at_date=DateTime() + repeat_interval,
activity="SQLQueue").newContentFromURL(
container_path=container_path, id=id,
container_path=container_path, url=url,
repeat=repeat - 1,
repeat_interval=repeat_interval, **kw)
return document
def _guessPortalType(self, name, typ, body):
security.declareProtected(Permissions.AccessContentsInformation,
'guessMimeTypeFromFilename')
def guessMimeTypeFromFilename(self, filename):
"""
Call Portal Contribution Registry
to know which portal_type should be used
get mime type from file name
"""
findPortalTypeName = None
registry = getToolByName(self, 'portal_contribution_registry', None)
if registry is not None:
findPortalTypeName = registry.findPortalTypeName
else:
# Keep backward compatibility
registry = getToolByName(self, 'content_type_registry', None)
if registry is None:
return None
findPortalTypeName = registry.findTypeName
portal_type = findPortalTypeName(name, typ, body)
return portal_type
if not filename:
return
portal = self.getPortalObject()
content_type = portal.mimetypes_registry.lookupExtension(filename)
return content_type
def _openURL(self, url):
  """Download content from ``url``.

  Return a (file_object, filename, content_type) tuple: ``file_object``
  is a seekable in-memory copy of the downloaded data, ``filename`` is
  read from the Content-Disposition header when present (otherwise it
  is derived from the possibly redirected URL), and ``content_type``
  comes from the response headers.
  """
  # Quote the path part of url so urllib2 accepts unescaped characters.
  url_tuple = urlparse.urlsplit(url)
  quoted_path = urllib.quote(url_tuple[2])
  url = urlparse.urlunsplit((url_tuple[0], url_tuple[1], quoted_path,
                             url_tuple[3], url_tuple[4]))
  # build a new file from the url
  url_file = urllib2.urlopen(url)
  try:
    data = url_file.read() # time out must be set or ... too long XXX
    header_info = url_file.info()
    content_disposition = header_info.getheader('content-disposition', '')
    # In case of http redirection, the real url must be read from the
    # file object returned by urllib2.urlopen. It can happen when the
    # header 'Location' is present in the request.
    # See http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.30
    real_url = url_file.geturl()
  finally:
    # Close the network connection explicitly instead of leaking the
    # socket until garbage collection.
    url_file.close()
  file_object = cStringIO.StringIO()
  file_object.write(data)
  file_object.seek(0)
  # if a content-disposition header is present,
  # try first to read the suggested filename from it.
  filename = parse_header(content_disposition)[1].get('filename')
  if not filename:
    # Create a file name based on the last URL path segment and quote it.
    filename = urlparse.urlsplit(real_url)[-3]
    filename = os.path.basename(filename)
    filename = urllib.quote(filename, safe='')
    filename = filename.replace('%', '')
  content_type = header_info.gettype()
  return file_object, filename, content_type
InitializeClass(ContributionTool)
......@@ -50,7 +50,7 @@ from Tool import CategoryTool, SimulationTool, RuleTool, IdTool, TemplateTool,\
TrashTool, ContributionTool, NotificationTool, PasswordTool,\
GadgetTool, ContributionRegistryTool, IntrospectionTool,\
AcknowledgementTool, SolverTool, SolverProcessTool,\
ConversionTool, RoundingTool
ConversionTool, RoundingTool, UrlRegistryTool
import ERP5Site
from Document import PythonScript
object_classes = ( ERP5Site.ERP5Site,
......@@ -78,6 +78,7 @@ portal_tools = ( CategoryTool.CategoryTool,
SolverProcessTool.SolverProcessTool,
ConversionTool.ConversionTool,
RoundingTool.RoundingTool,
UrlRegistryTool.UrlRegistryTool,
)
content_classes = ()
content_constructors = ()
......
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>file_extension</string> </key>
<key> <string>extension_from_filename</string> </key>
<value>
<list>
<string>sxd</string>
......@@ -32,7 +32,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>file_extension</string>
<string>extension_from_filename</string>
</tuple>
</value>
</item>
......@@ -46,7 +46,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>60</int> </value>
<value> <int>10</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -60,7 +60,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Drawing</string> </value>
<value> <string>Drawing by extension</string> </value>
</item>
</dictionary>
</pickle>
......
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>file_extension</string> </key>
<key> <string>extension_from_filename</string> </key>
<value>
<list>
<string>gif</string>
......@@ -35,7 +35,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>file_extension</string>
<string>extension_from_filename</string>
</tuple>
</value>
</item>
......@@ -49,7 +49,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>20</int> </value>
<value> <int>10</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -63,7 +63,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Image</string> </value>
<value> <string>Image by extension</string> </value>
</item>
</dictionary>
</pickle>
......
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>file_extension</string> </key>
<key> <string>extension_from_filename</string> </key>
<value>
<list>
<string>pdf</string>
......@@ -31,7 +31,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>file_extension</string>
<string>extension_from_filename</string>
</tuple>
</value>
</item>
......@@ -45,7 +45,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>30</int> </value>
<value> <int>10</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -59,7 +59,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>PDF</string> </value>
<value> <string>PDF by extension</string> </value>
</item>
</dictionary>
</pickle>
......
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>mime_type</string> </key>
<key> <string>content_type</string> </key>
<value>
<list>
<string>application/pdf</string>
......@@ -31,7 +31,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>mime_type</string>
<string>content_type</string>
</tuple>
</value>
</item>
......@@ -45,7 +45,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>30</int> </value>
<value> <int>20</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -59,7 +59,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>PDF</string> </value>
<value> <string>PDF by mimetype</string> </value>
</item>
</dictionary>
</pickle>
......
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>file_extension</string> </key>
<key> <string>extension_from_filename</string> </key>
<value>
<list>
<string>ppt</string>
......@@ -34,7 +34,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>file_extension</string>
<string>extension_from_filename</string>
</tuple>
</value>
</item>
......@@ -48,7 +48,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>50</int> </value>
<value> <int>10</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -62,7 +62,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Presentation</string> </value>
<value> <string>Presentation by extension</string> </value>
</item>
</dictionary>
</pickle>
......
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_identity_criterion</string> </key>
<value>
<dictionary>
<item>
<key> <string>content_type_from_content</string> </key>
<value>
<list>
<string>application/vnd.ms-excel</string>
<string>application/vnd.ms-office</string>
<string>application/msexcel</string>
<string>application/vnd.oasis.opendocument.spreadsheet</string>
<string>application/vnd.oasis.opendocument.spreadsheet-template</string>
</list>
</value>
</item>
</dictionary>
</value>
</item>
<item>
<key> <string>_range_criterion</string> </key>
<value>
<dictionary/>
</value>
</item>
<item>
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>content_type_from_content</string>
</tuple>
</value>
</item>
<item>
<key> <string>destination_portal_type</string> </key>
<value> <string>Spreadsheet</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>spreadsheet_by_content</string> </value>
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>70</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
<value> <string>Contribution Predicate</string> </value>
</item>
<item>
<key> <string>test_method_id</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Spreadsheet by content</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>file_extension</string> </key>
<key> <string>extension_from_filename</string> </key>
<value>
<list>
<string>xls</string>
......@@ -35,7 +35,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>file_extension</string>
<string>extension_from_filename</string>
</tuple>
</value>
</item>
......@@ -49,7 +49,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>40</int> </value>
<value> <int>10</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -63,7 +63,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Spreadsheet</string> </value>
<value> <string>Spreadsheet by extension</string> </value>
</item>
</dictionary>
</pickle>
......
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_identity_criterion</string> </key>
<value>
<dictionary>
<item>
<key> <string>content_type</string> </key>
<value>
<list>
<string>text/plain</string>
</list>
</value>
</item>
</dictionary>
</value>
</item>
<item>
<key> <string>_range_criterion</string> </key>
<value>
<dictionary/>
</value>
</item>
<item>
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>content_type</string>
</tuple>
</value>
</item>
<item>
<key> <string>destination_portal_type</string> </key>
<value> <string>Text</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>text_by_conent_type</string> </value>
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>20</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
<value> <string>Contribution Predicate</string> </value>
</item>
<item>
<key> <string>test_method_id</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Text by content type</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_identity_criterion</string> </key>
<value>
<dictionary>
<item>
<key> <string>content_type_from_content</string> </key>
<value>
<list>
<string>text/plain</string>
</list>
</value>
</item>
</dictionary>
</value>
</item>
<item>
<key> <string>_range_criterion</string> </key>
<value>
<dictionary/>
</value>
</item>
<item>
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>content_type_from_content</string>
</tuple>
</value>
</item>
<item>
<key> <string>destination_portal_type</string> </key>
<value> <string>Text</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>text_by_content</string> </value>
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>70</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
<value> <string>Contribution Predicate</string> </value>
</item>
<item>
<key> <string>test_method_id</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Text by mimetype from data</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>file_extension</string> </key>
<key> <string>extension_from_filename</string> </key>
<value>
<list>
<string>txt</string>
......@@ -36,7 +36,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>file_extension</string>
<string>extension_from_filename</string>
</tuple>
</value>
</item>
......@@ -64,7 +64,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Text</string> </value>
<value> <string>Text by extension</string> </value>
</item>
</dictionary>
</pickle>
......
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_identity_criterion</string> </key>
<value>
<dictionary>
<item>
<key> <string>content_type_from_content</string> </key>
<value>
<list>
<string>text/html</string>
</list>
</value>
</item>
</dictionary>
</value>
</item>
<item>
<key> <string>_range_criterion</string> </key>
<value>
<dictionary/>
</value>
</item>
<item>
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>content_type_from_content</string>
</tuple>
</value>
</item>
<item>
<key> <string>destination_portal_type</string> </key>
<value> <string>Web Page</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>web_page_by_content</string> </value>
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>70</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
<value> <string>Contribution Predicate</string> </value>
</item>
<item>
<key> <string>test_method_id</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Web Page by mimetype from data</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>file_extension</string> </key>
<key> <string>extension_from_filename</string> </key>
<value>
<list>
<string>html</string>
......@@ -33,7 +33,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>file_extension</string>
<string>extension_from_filename</string>
</tuple>
</value>
</item>
......@@ -47,7 +47,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>90</int> </value>
<value> <int>10</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -61,7 +61,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Web Page</string> </value>
<value> <string>Web Page by extension</string> </value>
</item>
</dictionary>
</pickle>
......
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>mime_type</string> </key>
<key> <string>content_type</string> </key>
<value>
<list>
<string>text/html</string>
......@@ -31,7 +31,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>mime_type</string>
<string>content_type</string>
</tuple>
</value>
</item>
......@@ -45,13 +45,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>90</int> </value>
</item>
<item>
<key> <string>membership_criterion_base_category</string> </key>
<value>
<tuple/>
</value>
<value> <int>20</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -65,7 +59,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Web Page</string> </value>
<value> <string>Web Page by mimetype</string> </value>
</item>
</dictionary>
</pickle>
......
......@@ -24,6 +24,22 @@
</tuple>
</value>
</item>
<item>
<key> <string>_Add_portal_content_Permission</string> </key>
<value>
<tuple>
<string>Manager</string>
</tuple>
</value>
</item>
<item>
<key> <string>_Delete_objects_Permission</string> </key>
<value>
<tuple>
<string>Manager</string>
</tuple>
</value>
</item>
<item>
<key> <string>_Modify_portal_content_Permission</string> </key>
<value>
......@@ -252,6 +268,22 @@ It\'s the lowest priority one; ie. managers can create higher priority preferenc
<key> <string>preferred_date_order</string> </key>
<value> <string>ymd</string> </value>
</item>
<item>
<key> <string>preferred_document_file_name_regular_expression</string> </key>
<value> <string encoding="cdata"><![CDATA[
(?P<reference>[A-Z&é@{]{3,7})-(?P<language>[a-z]{2})-(?P<version>[0-9]{3})
]]></string> </value>
</item>
<item>
<key> <string>preferred_document_reference_regular_expression</string> </key>
<value> <string encoding="cdata"><![CDATA[
(?P<reference>[A-Z&é@{]{3,7})(-(?P<language>[a-z]{2}))?(-(?P<version>[0-9]{3}))?
]]></string> </value>
</item>
<item>
<key> <string>preferred_event_assessment_form_id</string> </key>
<value>
......
......@@ -58,8 +58,8 @@ from zExceptions import Unauthorized\n
format = None\n
# Always force download of document even if format is supported\n
# by browser\n
file_name = context.getStandardFileName(format)\n
response.setHeader(\'Content-disposition\', \'attachment; filename="%s"\' % file_name)\n
filename = context.getStandardFilename(format)\n
response.setHeader(\'Content-disposition\', \'attachment; filename="%s"\' % filename)\n
\n
try:\n
return context.index_html(request, response, format)\n
......@@ -111,7 +111,7 @@ except Unauthorized:\n
<string>None</string>
<string>format</string>
<string>context</string>
<string>file_name</string>
<string>filename</string>
<string>msg</string>
<string>dict</string>
</tuple>
......
......@@ -222,12 +222,16 @@
<value>
<list>
<tuple>
<string>file_extension</string>
<string>file_extension</string>
<string>extension_from_filename</string>
<string>extension_from_filename</string>
</tuple>
<tuple>
<string>mime_type</string>
<string>mime_type</string>
<string>content_type</string>
<string>content_type</string>
</tuple>
<tuple>
<string>content_type_from_content</string>
<string>content_type_from_content</string>
</tuple>
</list>
</value>
......
......@@ -352,6 +352,10 @@
<key> <string>css_class</string> </key>
<value> <string></string> </value>
</item>
<item>
<key> <string>default_display_style</string> </key>
<value> <string>table</string> </value>
</item>
<item>
<key> <string>default_params</string> </key>
<value>
......@@ -362,6 +366,12 @@
<key> <string>description</string> </key>
<value> <string></string> </value>
</item>
<item>
<key> <string>display_style_list</string> </key>
<value>
<list/>
</value>
</item>
<item>
<key> <string>domain_root_list</string> </key>
<value>
......@@ -396,10 +406,18 @@
<list/>
</value>
</item>
<item>
<key> <string>global_search_column</string> </key>
<value> <string></string> </value>
</item>
<item>
<key> <string>hidden</string> </key>
<value> <int>0</int> </value>
</item>
<item>
<key> <string>hide_rows_on_no_search_criterion</string> </key>
<value> <int>0</int> </value>
</item>
<item>
<key> <string>lines</string> </key>
<value> <int>20</int> </value>
......@@ -425,6 +443,10 @@
</list>
</value>
</item>
<item>
<key> <string>page_navigation_mode</string> </key>
<value> <string>slider</string> </value>
</item>
<item>
<key> <string>page_template</string> </key>
<value> <string></string> </value>
......@@ -445,6 +467,10 @@
<key> <string>report_tree</string> </key>
<value> <int>0</int> </value>
</item>
<item>
<key> <string>row_css_method</string> </key>
<value> <string></string> </value>
</item>
<item>
<key> <string>search</string> </key>
<value> <int>0</int> </value>
......@@ -490,10 +516,22 @@
<key> <string>stat_method</string> </key>
<value> <string></string> </value>
</item>
<item>
<key> <string>style_columns</string> </key>
<value>
<list/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Contribution Predicates</string> </value>
</item>
<item>
<key> <string>untranslatable_columns</string> </key>
<value>
<list/>
</value>
</item>
<item>
<key> <string>url_columns</string> </key>
<value>
......
40819
\ No newline at end of file
40820
\ No newline at end of file
......@@ -22,8 +22,12 @@ portal_contribution_registry/image_extension
portal_contribution_registry/pdf_extension
portal_contribution_registry/pdf_mimetype
portal_contribution_registry/presentation_extension
portal_contribution_registry/spreadsheet_by_content
portal_contribution_registry/spreadsheet_extension
portal_contribution_registry/text_by_conent_type
portal_contribution_registry/text_by_content
portal_contribution_registry/text_extension
portal_contribution_registry/web_page_by_content
portal_contribution_registry/webpage_extension
portal_contribution_registry/webpage_mimetype
portal_domains/base_day_domain
......
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Jean-Paul Smets-Solanes <jp@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from zope.interface import Interface
class IDiscoverable(Interface):
  """
  Discoverable interface specification

  Documents which implement IDiscoverable provide methods to
  discover and update metadata properties from content, user
  input, filename, etc.
  """

  def getContentInformation():
    """
    Returns a dictionary of possible metadata which can be extracted from the
    document content (ex. title from an HTML file, creation date from a PDF
    document, etc.)
    """

  def getPropertyDictFromUserLogin(user_login=None):
    """
    Based on the user_login, find out all properties which
    can be discovered to later update document metadata.

    user_login -- optional user login ID; implementations may fall back
                  to the currently logged-in user when it is None
    """

  def getPropertyDictFromContent():
    """
    Based on the result of getContentInformation, find out all
    properties which can be discovered to later update document metadata.
    """

  def getPropertyDictFromFilename(filename):
    """
    Based on the file name, find out all properties which
    can be discovered to later update document metadata.

    filename -- file name to use in the discovery process
    """

  def getPropertyDictFromInput():
    """
    Based on the user input, find out all properties which
    can be discovered to later update document metadata.
    """

  def discoverMetadata(filename=None, user_login=None):
    """
    Updates the document metadata by discovering metadata from
    the user login, the document content, the file name and the
    user input. The order of discovery should be set in system
    preferences.

    filename -- optional file name (ex. AA-BBB-CCC-223-en.doc)
    user_login -- optional user login ID

    XXX - it is unclear if this method should also trigger finishIngestion
    and whether this should be documented here or not
    """

  def finishIngestion():
    """
    Finish the ingestion process (ex. allocate a reference number
    automatically if no reference was defined.)

    XXX - it is unclear if this method should be part of the interface
    """

  def getExtensionFromFilename():
    """Return the calculated value of the extension read from the filename.
    """

  def getContentTypeFromContent():
    """Return the calculated value of the content type read from the content.
    """
......@@ -87,7 +87,7 @@ class IDocument(Interface):
input - data supplied with http request or set on the object during (2) (e.g.
discovered from email text)
file_name - data which might be encoded in file name
filename - data which might be encoded in filename
user_login - information about user who is contributing the file
content - data which might be derived from document content
......
......@@ -52,11 +52,11 @@ class IDownloadable(Interface):
kw -- optional conversion parameters
"""
def getStandardFileName(format=None):
def getStandardFilename(format=None):
"""
Returns a standard file name for the document to download.
This method is the reverse of
IMetadataDiscoverable.getPropertyDictFromFileName.
IDiscoverable.getPropertyDictFromFilename.
format -- extension of returned file name
"""
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Nicolas Delaby <nicolas@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from zope.interface import Interface
class IUrl(Interface):
  """
  Url interface specification

  Documents which implement IUrl expose their location as a URL
  (RFC 1738) and provide helpers to parse and normalise it.
  """

  def asURL():
    """
    Returns a text representation of the Url if defined,
    or None otherwise.
    """

  def fromURL(url):
    """
    Analyses a URL and splits it into two parts. URLs
    normally follow RFC 1738. However, we accept URLs
    without the protocol a.k.a. scheme part (http, mailto, etc.). In this
    case only the url_string a.k.a. scheme-specific-part is taken
    into account. asURL will then generate the full URL.
    """

  def getURLServer():
    """
    Returns the server (host) part of a URL.
    """

  def getURLPort():
    """
    Returns the port part of a URL.
    """

  def getURLPath():
    """
    Returns the path part of a URL.
    """

  def asNormalisedURL(base_url=None):
    """
    Returns a normalised version of the url so
    that we do not download twice the same content.
    This normalisation must refer to the same resource !
    Refer to http://en.wikipedia.org/wiki/URL_normalization .

    base_url -- Specify a default URL and a default target
                for all links on a page.
                If url is a relative link, we try to compute an
                absolute url with the help of base_url.
    """
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Nicolas Delaby <nicolas@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from zope.interface import Interface
class IUrlRegistryTool(Interface):
  """Tool to register URLs.

  This tool aims to maintain consistency in URL management
  of crawlable sources, i.e. consistency between an external
  resource identifier and the generated document inside ERP5.

  Multiple URLs can be associated to the same reference.
  A System Preference can be used to configure the global namespace:
  this enables isolation of url mappings for different groups.

  This is a configurable tool supporting different scopes for mappings.
  It is thus possible to restrict the crawling of a URL
  to only once per portal, or to restrict the crawling of a url
  to the scope of an external_source or a module only
  (crawling the same URL multiple times within one portal).
  """

  def clearUrlRegistryTool(context=None):
    """Unregister all urls in all namespaces.
    Only available for Manager.

    context -- a context to access the container of mappings.
    """

  def registerURL(url, reference, context=None):
    """Register the mapping url:reference.

    This method is aimed to be called from an interaction_workflow
    which triggers on _setReference, in order to keep the association
    between url and reference up to date.

    url -- external resource identifier
    reference -- reference of the downloaded resource (ERP5 Object instance)
    context -- a context to access the container of mappings.
               If not passed, mappings are stored on the tool itself.
    """

  def getReferenceList(context=None):
    """Return all references registered by portal_url_registry
    according to the given context.

    context -- a context to access the container of mappings.
    """

  def getReferenceFromURL(url, context=None):
    """Return the reference of the document according to the provided url.

    url -- external resource identifier
    context -- a context to access the container of mappings.
               If not passed, mappings are stored on the tool itself.
    """

  def getURLListFromReference(reference, context=None):
    """Return the list of urls associated to the given reference
    and context.

    reference -- reference of the downloaded resource (ERP5 Object instance)
    context -- a context to access the container of mappings.
    """

  def updateUrlRegistryTool():
    """Rebuild all url mappings for the active preference.
    """
......@@ -139,10 +139,21 @@ class CachedConvertableMixin:
cached_value = data
conversion_md5 = md5_new(str(data.data)).hexdigest()
size = len(data.data)
else:
elif isinstance(data, (str, unicode,)):
cached_value = data
conversion_md5 = md5_new(cached_value).hexdigest()
size = len(cached_value)
elif isinstance(data, dict):
# Dict instance are used to store computed metadata
# from actual content.
# So this value is intimely related to cache of conversion.
# As it should be cleared each time the document is edited.
# Also may be a proper API should be used
cached_value = data
conversion_md5 = None
size = len(cached_value)
else:
raise NotImplementedError, 'Not able to store type:%r' % type(data)
if date is None:
date = DateTime()
stored_data_dict = {'content_md5': self.getContentMd5(),
......
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2009 Nexedi SA and Contributors. All Rights Reserved.
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Ivan Tyagov <ivan@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
......@@ -27,8 +27,13 @@
#
##############################################################################
from AccessControl import ClassSecurityInfo, getSecurityManager
from AccessControl import ClassSecurityInfo
from Products.ERP5Type import Permissions
from Products.ERP5Type.Utils import normaliseUrl
from Products.ERP5Type.DateUtils import convertDateToHour,\
number_of_hours_in_day, number_of_hours_in_year
from urlparse import urlsplit, urlunsplit
from lxml import html as etree_html
class CrawlableMixin:
"""
......@@ -80,3 +85,81 @@ class CrawlableMixin:
method = self._getTypeBasedMethod('isUpdatable',
fallback_script_id = 'Document_isUpdatable')
return method()
security.declareProtected(Permissions.AccessContentsInformation,
'getContentURLList')
def getContentURLList(self):
"""
Returns a list of URLs referenced by the content of this document.
Default implementation consists in analysing the document
converted to HTML. Subclasses may overload this method
if necessary. However, it is better to extend the conversion
methods in order to produce valid HTML, which is useful to
many people, rather than overload this method which is only
useful for crawling.
"""
html_content = self.asEntireHTML()
html_tree = etree_html.fromstring(html_content)
base_href = self.getContentBaseURL()
if base_href:
html_tree.make_links_absolute(base_href)
href_list = []
for elemnt, attribute_name, link, position in html_tree.iterlinks():
# For now take into acount only a and img tags
if attribute_name not in ('href',):
continue
if isinstance(link, unicode):
link = link.encode('utf-8')
href_list.append(link)
return href_list
security.declareProtected(Permissions.AccessContentsInformation,
'getContentBaseURL')
def getContentBaseURL(self):
"""
Returns the content base URL based on the actual content or
on its URL.
"""
raw_url = self.asURL() or ''
splitted_url = urlsplit(raw_url)
path_part = splitted_url[2]
path_part = '/'.join(path_part.split('/')[:-1])
base_url = urlunsplit((splitted_url[0], splitted_url[1], path_part, None,
None))
if isinstance(base_url, unicode):
base_url = base_url.encode('utf-8')
return base_url
security.declareProtected(Permissions.AccessContentsInformation,
'getContentNormalisedURLList')
def getContentNormalisedURLList(self):
"""
Call url normalizer for each url returned by getContentURLList
Return only url associated to the same Domain
"""
reference_domain = urlsplit(normaliseUrl(self.asURL() or ''))[1]
# in www.example.com or www.3.example.com
# keep only the example.com part
reference_domain = ''.join(reference_domain.split('.')[-2:])
if isinstance(reference_domain, unicode):
reference_domain = reference_domain.encode('utf-8')
url_list = []
base_url = self.getContentBaseURL()
for url in self.getContentURLList():
try:
url = normaliseUrl(url, base_url=base_url)
except UnicodeDecodeError:
# Ignore wrong encoding errors
# Web is not a kind world
continue
if not url:
continue
url_domain = urlsplit(url)[1]
if isinstance(url_domain, unicode):
url_domain = url_domain.encode('utf-8')
if url_domain and ''.join(url_domain.split('.')[-2:]) != reference_domain:
continue
# if domain is empty (relative link) or domain is same, then OK
url_list.append(url)
return url_list
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Ivan Tyagov <ivan@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from AccessControl import ClassSecurityInfo, getSecurityManager
from Products.ERP5Type import Permissions
from Products.ERP5Type.Utils import convertToUpperCase
from Products.CMFCore.utils import getToolByName
from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
import os
import re
try:
import magic
except ImportError:
magic = None
VALID_ORDER_KEY_LIST = ('user_login', 'content', 'filename', 'input')
CONTENT_INFORMATION_FORMAT = '_idiscoverable_content_information'
class DiscoverableMixin(CachedConvertableMixin):
  """
  Implements IDiscoverable.

  This class provides methods useful for metadata extraction.
  It inherits from CachedConvertableMixin to access the
  cache storage API, as computed data needs to be stored
  in the same backend.
  """
  security = ClassSecurityInfo()

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromUserLogin')
  def getPropertyDictFromUserLogin(self, user_login=None):
    """
    Based on the user_login, find out as many properties as needed.
    Returns properties which should be set on the document.

    user_login -- optional login string; defaults to the currently
                  logged-in user
    """
    if user_login is None:
      # Fall back to the user of the current security context.
      user_login = str(getSecurityManager().getUser())
    method = self._getTypeBasedMethod('getPropertyDictFromUserLogin',
        fallback_script_id='Document_getPropertyDictFromUserLogin')
    return method(user_login)

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromContent')
  def getPropertyDictFromContent(self):
    """
    Based on the document content, find out as many properties as needed.
    Returns properties which should be set on the document.
    """
    # access data through convert
    mime, content = self.convert(None)
    if not content:
      # if document is empty, we will not find anything in its content
      return {}
    method = self._getTypeBasedMethod('getPropertyDictFromContent',
        fallback_script_id='Document_getPropertyDictFromContent')
    return method()

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromFilename')
  def getPropertyDictFromFilename(self, filename):
    """
    Based on the file name, find out as many properties as needed.
    Returns properties which should be set on the document.

    filename -- file name to parse (delegated to portal_contributions)
    """
    return self.portal_contributions.getPropertyDictFromFilename(filename)

  # Backward-compatibility alias for the pre-rename API spelling.
  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromFileName')
  getPropertyDictFromFileName = getPropertyDictFromFilename

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromInput')
  def getPropertyDictFromInput(self, input_parameter_dict):
    """
    Fetch argument_dict, then pass this dictionary
    to the type-based getPropertyDictFromInput script.
    """
    method = self._getTypeBasedMethod('getPropertyDictFromInput')
    return method(input_parameter_dict)

  ### Metadata discovery and ingestion methods
  security.declareProtected(Permissions.ModifyPortalContent,
                            'discoverMetadata')
  def discoverMetadata(self, filename=None, user_login=None,
                       input_parameter_dict=None):
    """
    This is the main metadata discovery function - controls the process
    of discovering data from various sources. The discovery itself is
    delegated to scripts or uses preference-configurable regexps. The
    method returns either self or the document which has been
    merged in the discovery process.

    filename -- this parameter is a file name of the form "AA-BBB-CCC-223-en"
    user_login -- this is a login string of a person; can be None if the user
                  is currently logged in, then we'll get him from session
    input_parameter_dict -- arguments provided to create this content by user.
    """
    # Preference is made of a sequence of 'user_login', 'content',
    # 'filename', 'input'.  Later entries in the preferred order take
    # precedence, hence the reversal before iterating.
    method = self._getTypeBasedMethod('getPreferredDocumentMetadataDiscoveryOrderList')
    order_list = list(method())
    order_list.reverse()
    # build a dictionary according to the order
    kw = {}
    for order_id in order_list:
      result = None
      if order_id not in VALID_ORDER_KEY_LIST:
        # Prevent security attack or bad preferences
        raise AttributeError, "%s is not in valid order key list" % order_id
      # Dispatch to getPropertyDictFromUserLogin / ...FromContent /
      # ...FromFilename / ...FromInput according to the order key.
      method_id = 'getPropertyDictFrom%s' % convertToUpperCase(order_id)
      method = getattr(self, method_id)
      if order_id == 'filename':
        if filename is not None:
          result = method(filename)
      elif order_id == 'user_login':
        if user_login is not None:
          result = method(user_login)
      elif order_id == 'input':
        if input_parameter_dict is not None:
          result = method(input_parameter_dict)
      else:
        result = method()
      if result is not None:
        # Empty values never override previously discovered ones.
        for key, value in result.iteritems():
          if value not in (None, ''):
            kw[key]=value
    # Prepare the content edit parameters
    portal_type = kw.pop('portal_type', None)
    if portal_type and portal_type != self.getPortalType():
      # Reingestion is required to update portal_type
      return self.migratePortalType(portal_type)
    # Try not to invoke an automatic transition here
    self._edit(**kw)
    if not portal_type:
      # If no portal_type was discovered, pass self
      # through to portal_contribution_registry
      # to guess destination portal_type against all properties.
      # If returned portal_type is different, then reingest.
      registry = getToolByName(self.getPortalObject(),
                               'portal_contribution_registry')
      portal_type = registry.findPortalTypeName(context=self)
      if portal_type != self.getPortalType():
        return self.migratePortalType(portal_type)
    # Finish ingestion by calling method
    self.finishIngestion() # XXX - is this really the right place ?
    self.reindexObject() # XXX - is this really the right place ?
    # Revision merge is tightly coupled
    # to metadata discovery - refer to the documentation of mergeRevision method
    merged_doc = self.mergeRevision() # XXX - is this really the right place ?
    merged_doc.reindexObject() # XXX - is this really the right place ?
    return merged_doc # XXX - is this really the right place ?

  security.declareProtected(Permissions.ModifyPortalContent, 'finishIngestion')
  def finishIngestion(self):
    """
    Finish the ingestion process by calling the appropriate script. This
    script can for example allocate a reference number automatically if
    no reference was defined.
    """
    method = self._getTypeBasedMethod('finishIngestion',
        fallback_script_id='Document_finishIngestion')
    return method()

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getContentTypeFromContent')
  def getContentTypeFromContent(self):
    """
    Return content_type read from metadata extraction of content.
    This method is called by portal_contribution_registry.
    Returns None when the document has no content or when the
    python-magic library is not available.
    """
    mime, content = self.convert(None)
    if not content:
      return
    if magic is not None:
      # This will be delegated soon to external web service
      # like cloudooo
      # ERP5 will no longer handle data itself.
      mimedetector = magic.Magic(mime=True)
      return mimedetector.from_buffer(content)

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getExtensionFromFilename')
  def getExtensionFromFilename(self, filename=None):
    """
    Return extension read from filename in lower case.

    filename -- optional file name; defaults to getStandardFilename()
    """
    if not filename:
      filename = self.getStandardFilename()
    basename, extension = os.path.splitext(filename)
    if extension:
      extension = extension[1:].lower() # remove first dot
    return extension

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getContentInformation')
  def getContentInformation(self):
    """
    Call the private implementation, then store the result in the
    conversion cache storage under the CONTENT_INFORMATION_FORMAT key.
    """
    format = CONTENT_INFORMATION_FORMAT
    # EAFP: a KeyError from getConversion means the value is not cached yet.
    try:
      mime, cached_value = self.getConversion(format=format)
      return cached_value
    except KeyError:
      value = self._getContentInformation()
      self.setConversion(value, format=format)
      return value

  def _getContentInformation(self):
    """
    Returns the content information from the HTML conversion.
    The default implementation tries to build a dictionary
    from the HTML conversion of the document and extract
    the document title.
    """
    result = {}
    html = self.asEntireHTML()
    if not html:
      return result
    # NOTE(review): self.title_parser is expected to be a regexp defined
    # on the concrete document class - confirm against subclasses.
    title_list = re.findall(self.title_parser, str(html))
    if title_list:
      result['title'] = title_list[0]
    return result
......@@ -31,6 +31,7 @@ from Products.ERP5Type import Permissions
from Products.ERP5Type.Utils import fill_args_from_request
from Products.CMFCore.utils import getToolByName, _setCacheHeaders,\
_ViewEmulator
import warnings
_MARKER = []
......@@ -108,15 +109,31 @@ class DownloadableMixin:
return str(data)
security.declareProtected(Permissions.AccessContentsInformation,
'getStandardFileName')
def getStandardFileName(self, format=None):
'getStandardFilename')
def getStandardFilename(self, format=None):
"""Returns the document coordinates as a standard file name. This
method is the reverse of getPropertyDictFromFileName.
"""
method = self._getTypeBasedMethod('getStandardFileName',
method = self._getTypeBasedMethod('getStandardFilename',
fallback_script_id='Document_getStandardFilename')
if method is None:
# backward compatibility
method = self._getTypeBasedMethod('getStandardFileName',
fallback_script_id='Document_getStandardFileName')
return method(format=format)
# backward compatibility
security.declareProtected(Permissions.AccessContentsInformation,
'getStandardFileName')
def getStandardFileName(self, format=None):
"""(deprecated) use getStandardFilename() instead."""
warnings.warn('getStandardFileName() is deprecated. '
'use getStandardFilename() instead.')
return self.getStandardFilename(format=format)
method = self._getTypeBasedMethod('getStandardFilename',
fallback_script_id='Document_getStandardFilename')
return method(format=format)
def manage_FTPget(self):
"""Return body for ftp. and WebDAV
"""
......
......@@ -43,6 +43,7 @@ from zExceptions import BadRequest
from Products.ERP5Type.tests.backportUnittest import skip
from Products.ERP5Type.Tool.ClassTool import _aq_reset
from Products.ERP5Type.Workflow import addWorkflowByType
from Products.CMFCore.WorkflowCore import WorkflowException
def getDummyTypeBaseMethod(self):
""" Use a type Base method
......@@ -1248,6 +1249,43 @@ class TestBase(ERP5TypeTestCase, ZopeTestCase.Functional):
self.assertFalse(person.isIndexable)
self.assertEquals(0, len(self.portal.portal_catalog(uid=person.getUid())))
  def test_metaWorkflowTransition(self):
    """Test meta transition: jump from one state to another without
    an explicitly defined transition.
    """
    module = self.portal.person_module
    person = module.newContent(portal_type='Person')
    self.assertEquals(person.getValidationState(), 'draft')
    # 'invalidate' is not reachable from 'draft' through a regular transition
    self.assertFalse(self.portal.portal_workflow.isTransitionPossible(person,
                                                                  'invalidate'))
    # test low-level implementation
    self.portal.portal_workflow.validation_workflow._executeMetaTransition(
      person, 'invalidated')
    self.assertEquals(person.getValidationState(), 'invalidated')
    # the jump must be recorded in the workflow history with a comment
    validation_history = person.workflow_history['validation_workflow']
    self.assertEquals(len(validation_history), 2)
    self.assertEquals(validation_history[-1]['comment'],
                      'Jump from \'draft\' to \'invalidated\'')
    person = module.newContent(portal_type='Person')
    self.assertEquals(person.getValidationState(), 'draft')
    # test high-level implementation
    self.portal.portal_workflow._jumpToStateFor(person, 'invalidated')
    self.assertEquals(person.getValidationState(), 'invalidated')

    # the workflow can also be selected explicitly with wf_id
    person = module.newContent(portal_type='Person')
    self.assertEquals(person.getValidationState(), 'draft')
    self.portal.portal_workflow._jumpToStateFor(person, 'invalidated',
                                                wf_id='validation_workflow')
    self.assertEquals(person.getValidationState(), 'invalidated')

    # jumping to a state which does not exist in the given workflow
    # must fail and leave the document state unchanged
    person = module.newContent(portal_type='Person')
    self.assertEquals(person.getValidationState(), 'draft')
    self.assertRaises(WorkflowException,
                      self.portal.portal_workflow._jumpToStateFor,
                      person, 'invalidated', wf_id='edit_workflow')
    self.assertEquals(person.getValidationState(), 'draft')
class TestERP5PropertyManager(unittest.TestCase):
"""Tests for ERP5PropertyManager.
"""
......
......@@ -36,7 +36,7 @@ from Products.CMFCore.WorkflowCore import WorkflowException
from Products.ERP5Type.tests.utils import DummyMailHost, FileUpload
from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase,\
_getConversionServerDict
from Products.ERP5OOo.tests.testIngestion import FILE_NAME_REGULAR_EXPRESSION
from Products.ERP5OOo.tests.testIngestion import FILENAME_REGULAR_EXPRESSION
from Products.ERP5OOo.tests.testIngestion import REFERENCE_REGULAR_EXPRESSION
from Products.ERP5Type.tests.backportUnittest import expectedFailure
......@@ -443,7 +443,7 @@ class TestCRMMailIngestion(BaseTestCRM):
data=self._readTestData(filename)
return self.portal.portal_contributions.newContent(
container_path='event_module',
file_name='postfix_mail.eml',
filename='postfix_mail.eml',
data=data)
def test_findTypeByName_MailMessage(self):
......@@ -451,7 +451,7 @@ class TestCRMMailIngestion(BaseTestCRM):
self.assertEquals(
'Mail Message',
self.portal.portal_contribution_registry.findPortalTypeName(
file_name='postfix_mail.eml', mime_type='message/rfc822', data='Test'
filename='postfix_mail.eml', content_type='message/rfc822', data='Test'
))
def test_Base_getEntityListFromFromHeader(self):
......@@ -767,7 +767,7 @@ class TestCRMMailSend(BaseTestCRM):
conversion_dict = _getConversionServerDict()
default_pref.setPreferredOoodocServerAddress(conversion_dict['hostname'])
default_pref.setPreferredOoodocServerPortNumber(conversion_dict['port'])
default_pref.setPreferredDocumentFileNameRegularExpression(FILE_NAME_REGULAR_EXPRESSION)
default_pref.setPreferredDocumentFileNameRegularExpression(FILENAME_REGULAR_EXPRESSION)
default_pref.setPreferredDocumentReferenceRegularExpression(REFERENCE_REGULAR_EXPRESSION)
if default_pref.getPreferenceState() == 'disabled':
default_pref.enable()
......
......@@ -120,36 +120,36 @@ return predicate.getDestinationPortalType()
tool = self.portal.portal_contribution_registry
# Test extension matching
self.assertEqual(tool.findPortalTypeName(file_name='test.txt'), 'Text')
self.assertEqual(tool.findPortalTypeName(file_name='test.odt'), 'Text')
self.assertEqual(tool.findPortalTypeName(file_name='001.jpg'), 'Image')
self.assertEqual(tool.findPortalTypeName(file_name='002.PNG'), 'Image')
self.assertEqual(tool.findPortalTypeName(file_name='002.PNG'), 'Image')
self.assertEqual(tool.findPortalTypeName(file_name='index.html'), 'Web Page')
self.assertEqual(tool.findPortalTypeName(filename='test.txt'), 'Text')
self.assertEqual(tool.findPortalTypeName(filename='test.odt'), 'Text')
self.assertEqual(tool.findPortalTypeName(filename='001.jpg'), 'Image')
self.assertEqual(tool.findPortalTypeName(filename='002.png'), 'Image')
self.assertEqual(tool.findPortalTypeName(filename='002.PNG'), 'Image')
self.assertEqual(tool.findPortalTypeName(filename='index.html'), 'Web Page')
# Unknown extension
self.assertEqual(tool.findPortalTypeName(file_name='index.xxx'), 'File')
self.assertEqual(tool.findPortalTypeName(filename='index.xxx'), 'File')
# Test mimetype matching
self.assertEqual(tool.findPortalTypeName(mime_type='text/html'), 'Web Page')
self.assertEqual(tool.findPortalTypeName(content_type='text/html'), 'Web Page')
# Unknown mimetype
self.assertEqual(tool.findPortalTypeName(mime_type='application/octet-stream'), 'File')
self.assertEqual(tool.findPortalTypeName(content_type='application/octet-stream'), 'File')
# Test both of extension and mimetype
self.assertNotEqual(tool.findPortalTypeName(file_name='message.eml'),
self.assertNotEqual(tool.findPortalTypeName(filename='message.eml'),
'Mail Message')
self.assertNotEqual(tool.findPortalTypeName(mime_type='message/rfc822'),
self.assertNotEqual(tool.findPortalTypeName(content_type='message/rfc822'),
'Mail Message')
self.assertEqual(tool.findPortalTypeName(file_name='message.eml',
mime_type='message/rfc822'),
self.assertEqual(tool.findPortalTypeName(filename='message.eml',
content_type='message/rfc822'),
'Mail Message')
# Test test script
data = """\
Subject: Fax
"""
self.assertEqual(tool.findPortalTypeName(file_name='message.eml',
mime_type='message/rfc822',
self.assertEqual(tool.findPortalTypeName(filename='message.eml',
content_type='message/rfc822',
data=data),
'Fax Message')
......
......@@ -37,7 +37,8 @@ from AccessControl.SecurityManagement import newSecurityManager
from Testing import ZopeTestCase
from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase,\
_getConversionServerDict
from Products.ERP5Type.tests.utils import FileUpload
from Products.ERP5Type.tests.utils import FileUpload, createZODBPythonScript
LANGUAGE_LIST = ('en', 'fr', 'de', 'bg',)
......@@ -568,8 +569,21 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
def test_PreviewOOoDocumentWithEmbeddedImage(self):
"""Tests html preview of an OOo document with images as extensible content.
For this test, Presentation_checkConversionFormatPermission does not allow
access to original format for Unauthenticated users.
Chack that user can still access to other format.
"""
portal = self.portal
script_id = 'Presentation_checkConversionFormatPermission'
python_code = """from AccessControl import getSecurityManager
user = getSecurityManager().getUser()
if (not user or not user.getId()) and not format:
return False
return True
"""
createZODBPythonScript(portal.portal_skins.custom, script_id,
'format, **kw', python_code)
request = portal.REQUEST
request['PARENTS'] = [self.app]
self.getPortalObject().aq_parent.acl_users._doAddUser(
......@@ -611,7 +625,7 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
# then publish the document and access it anonymously by reference through
# the web site
document.publish()
transaction.commit()
self.tic()
......@@ -620,7 +634,7 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
self.assertTrue(response.getHeader('content-type').startswith('text/html'))
html = response.getBody()
self.assertTrue('<img' in html, html)
# find the img src
img_list = etree.HTML(html).findall('.//img')
self.assertEquals(1, len(img_list))
......@@ -633,6 +647,22 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
png = response.getBody()
self.assertTrue(png.startswith('\x89PNG'))
# Now purge cache and let Anonymous user converting the document.
self.login()
document.edit() # Reset cache key
transaction.commit()
self.tic()
response = self.publish('%s/%s/asEntireHTML' % (
website.absolute_url_path(), document_reference))
self.assertTrue(response.getHeader('content-type').startswith('text/html'))
html = response.getBody()
self.assertTrue('<img' in html, html)
# find the img src
img_list = etree.HTML(html).findall('.//img')
self.assertEquals(1, len(img_list))
src = img_list[0].get('src')
def test_ImageConversionThroughWebSite(self):
"""Check that conversion parameters pass in url
are hounoured to display an image in context of a website
......
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Nicolas Delaby <nicolas@erp5.org>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
import unittest
from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase,\
_getConversionServerDict
import transaction
# Regular expressions registered on the System Preference (see
# TestWebCrawler.setSystemPreference) so that contributed filenames and
# references of the form "REF-en-001" are split into the named groups
# reference / language / version.
FILENAME_REGULAR_EXPRESSION = "(?P<reference>[A-Z&é@{]{3,7})-(?P<language>[a-z]{2})-(?P<version>[0-9]{3})"
REFERENCE_REGULAR_EXPRESSION = "(?P<reference>[A-Z&é@{]{3,7})(-(?P<language>[a-z]{2}))?(-(?P<version>[0-9]{3}))?"
class TestWebCrawler(ERP5TypeTestCase):
  """
  Test Crawling mechanism

  Builds a small self-hosted Web Site fixture (bootstrapWebSite), then
  checks the URL normalisation helpers of Web Pages (test_01) and the
  crawling of that site through a 'URL Crawler' document, including the
  duplicate-URL blocking done by portal_url_registry (test_02).
  """
  # NOTE(review): mutable class-level list, shared by all instances and
  # never used in this file -- confirm it is still needed.
  _path_to_delete_list = []
  # Id of the System Preference created/reused by setSystemPreference().
  system_pref_id = 'my_preference'

  def getTitle(self):
    """
    Return the title of the current test set.
    """
    return "ERP5 Live DMS - Web Crawling"

  def getBusinessTemplateList(self):
    """
    Return the list of required business templates.
    """
    return ('erp5_base',
            'erp5_ingestion',
            'erp5_ingestion_mysql_innodb_catalog',
            'erp5_web',
            'erp5_dms')

  def afterSetUp(self):
    """
    Initialize the ERP5 site.
    """
    self.login()
    self.portal = self.getPortal()
    self.setSystemPreference()
    self.bootstrapWebSite()
    # Make the fixture visible to the catalog before any test runs.
    transaction.commit()
    self.tic()

  def beforeTearDown(self):
    """Delete every document created by the test, then reindex."""
    portal = self.portal
    module_id_list = [
      'web_page_module',
      'web_site_module',
      'external_source_module',
      'document_module',
      ]
    # delete created documents by test
    for module_id in module_id_list:
      module = portal[module_id]
      module.manage_delObjects(list(module.objectIds()))
    # Unindex deleted documents
    transaction.commit()
    self.tic()

  def setSystemPreference(self):
    """Create (or reuse) the System Preference holding the conversion
    server address/port and the filename/reference regular expressions,
    and make sure it is globally enabled.
    """
    portal_preferences = self.portal.portal_preferences
    system_preference = portal_preferences._getOb(self.system_pref_id, None)
    if system_preference is None:
      system_preference = portal_preferences.newContent(id=self.system_pref_id,
                                              portal_type='System Preference')
    conversion_dict = _getConversionServerDict()
    system_preference.\
        setPreferredOoodocServerAddress(conversion_dict['hostname'])
    system_preference.\
        setPreferredOoodocServerPortNumber(conversion_dict['port'])
    system_preference.setPreferredDocumentFilenameRegularExpression(
        FILENAME_REGULAR_EXPRESSION)
    system_preference.setPreferredDocumentReferenceRegularExpression(
        REFERENCE_REGULAR_EXPRESSION)
    # Enable only when the preference is not already global.
    if system_preference.getPreferenceState() != 'global':
      system_preference.enable()

  def bootstrapWebSite(self):
    """Create 1 Website
         live_test_web_site/section1/section1a
                           /section2
       create 2 web pages
         W-REFERENCE.PAGE
         W-REFERENCE.HOMEPAGE
       the website use light version of erp5_web_layout
       It keep just displaying sections and subsection
       And default Web page
    """
    web_site_portal_type = 'Web Site'
    web_section_portal_type = 'Web Section'
    web_page_portal_type = 'Web Page'
    web_site_module = self.portal.getDefaultModule(web_site_portal_type)
    web_page_module = self.portal.getDefaultModule(web_page_portal_type)
    # Home page holds a relative link (by reference) to the other page.
    text_content = """<p><a href="W-REFERENCE.PAGE">Page</a></p>"""
    web_page_id = 'live_test_home'
    home_page = web_page_module.newContent(portal_type=web_page_portal_type,
                                           title='Home Page',
                                           text_content=text_content,
                                           reference='W-REFERENCE.HOMEPAGE',
                                           version='001',
                                           language='en',
                                           id=web_page_id)
    home_page.submit()
    home_page.publish()
    web_site_id = 'live_test_web_site'
    web_site = web_site_module.newContent(portal_type=web_site_portal_type,
                                          id=web_site_id,
                                          title='Live Test Web Site',
                                          visible=True,
                                          default_page_displayed=True,
                                          site_map_section_parent=True,
                                          authorization_forced=True,
                                          aggregate_value=home_page,
                                          available_language_set=['en'],
                                container_layout='erp5_web_layout_test',
                                content_layout='erp5_web_content_layout_test')
    web_site.publish()
    # Second page links back to the home page with an absolute URL.
    text_content = """<p>
    <a href="%s/W-REFERENCE.HOMEPAGE">absolute link to HOME PAGE</a>
    </p>""" % web_site.absolute_url()
    section1a_page = web_page_module.newContent(
                                          portal_type=web_page_portal_type,
                                          title='Home Page',
                                          text_content=text_content,
                                          reference='W-REFERENCE.PAGE',
                                          version='001',
                                          language='en')
    section1a_page.submit()
    section1a_page.publish()
    web_section1 = web_site.newContent(portal_type=web_section_portal_type,
                                       title='Section 1',
                                       id='section1',
                                       aggregate_value=section1a_page)
    web_section2 = web_site.newContent(portal_type=web_section_portal_type,
                                       title='Section 2',
                                       id='section2',
                                       aggregate_value=section1a_page)
    web_section1a = web_section1.newContent(
                                       portal_type=web_section_portal_type,
                                       title='Section 1a',
                                       id='section 1a', #add a space in id
                                       aggregate_value=section1a_page)

  def test_01_check_URLTransformations(self):
    """Check crawlable functionalities regarding URL handling
       getContentBaseURL
       asNormalisedURL
       getContentNormalisedURLList
    """
    web_page_portal_type = 'Web Page'
    web_page_module = self.portal.getDefaultModule(web_page_portal_type)
    web_page = web_page_module.newContent(portal_type=web_page_portal_type)
    # No URL yet: base URL is empty.
    self.assertEquals(web_page.getContentBaseURL(), '')
    web_page.fromURL('http://www.example.com')
    self.assertEquals(web_page.getContentBaseURL(), 'http://www.example.com')
    # Base URL is the URL with its last path segment dropped.
    web_page.fromURL('http://www.example.com/section/sub_section')
    self.assertEquals(web_page.getContentBaseURL(),
                      'http://www.example.com/section')
    # A <base> tag overrides the URL-derived base; external links, images,
    # scripts, badly-encoded and malformed links must all be filtered out.
    text_content = """<html>
    <head>
    <base href="http://www.example.com"/>
    </head>
    <body>
    <p><a href="http://www.notexample.com/">External link</a></p>
    <p><a href="http://www.example.com//I don't care I put what/ I want/">
    Funny link</a></p>
    <p><a href="http://www.example.com/section">Internal link</a></p>
    <p><a href="section2">Relative Internal link</a></p>
    <p><a href="http://www.example.com/?title=%E9+crit">With Encoding issue
    This link will be discarded</a></p>
    <img src="my_image_link"/>
    <script src="should_not_be_followed.js"/>
    <p><a href="http://http://www.example.com/section">Not a link</a></p>
    </body>
    </html>"""
    web_page.edit(text_content=text_content)
    self.assertEquals(web_page.getContentBaseURL(), "http://www.example.com")
    self.assertEquals(web_page.getContentNormalisedURLList(),
              ["http://www.example.com/I don't care I put what/ I want/",
               'http://www.example.com/section',
               'http://www.example.com/section2',])
    # relative links without base tag
    text_content = """<html>
    <head>
    </head>
    <body>
    <p><a href="section2">Relative Internal link</a></p>
    </body>
    </html>"""
    web_page.edit(text_content=text_content)
    web_page.fromURL('http://www.example.com/#fffff')
    self.assertEquals(web_page.getContentBaseURL(), "http://www.example.com")
    self.assertEquals(web_page.getContentNormalisedURLList(),
                      ['http://www.example.com/section2',])
    self.assertEquals(web_page.asNormalisedURL(),
                      'http://www.example.com/#fffff')

  def test_02_crawlWebSite(self):
    """Call portal_contribution to crawl website hosted by itself.
    """
    web_site = self.portal.web_site_module.live_test_web_site
    external_source_portal_type = 'URL Crawler'
    web_crawler_module = self.portal.getDefaultModule(
                                              external_source_portal_type)
    web_crawler = web_crawler_module.newContent(
                                  portal_type=external_source_portal_type,
                                  crawling_depth=5)
    web_crawler.fromURL(web_site.absolute_url())
    transaction.commit()
    self.tic()
    web_crawler.crawlContent()
    transaction.commit()
    self.tic()
    # 6 = 1 website
    #   + 3 Web Sections
    #   + 1 absolute link to home_page
    #   + 1 relative link from home_page to another web page
    self.assertEquals(len(web_crawler), 6)
    self.assertEquals(len(self.portal.portal_url_registry._getMappingDict()),
                      6)
    date_before = web_crawler.getModificationDate()
    web_crawler.crawlContent()
    transaction.commit()
    self.tic()
    # Nothing happens: portal_url_registry prevents crawling
    # the same url twice
    self.assertEquals(len(web_crawler), 6)
    self.assertEquals(len(self.portal.portal_url_registry._getMappingDict()),
                      6)
    # not modified
    self.assertEquals(date_before, web_crawler.getModificationDate())
    new_web_crawler = web_crawler_module.newContent(
                                  portal_type=external_source_portal_type,
                                  crawling_depth=5)
    new_web_crawler.fromURL(web_site.absolute_url())
    transaction.commit()
    self.tic()
    new_web_crawler.crawlContent()
    transaction.commit()
    self.tic()
    # check that portal_url_registry
    # block contribution of existing content
    self.assertFalse(len(new_web_crawler))
    # set another namespace on preference
    preference = self.portal.portal_preferences[self.system_pref_id]
    preference.setPreferredIngestionNamespace('NEW')
    transaction.commit()
    self.tic()
    new_web_crawler.crawlContent()
    transaction.commit()
    self.tic()
    # NOTE(review): after switching the ingestion namespace this probably
    # meant to assert on new_web_crawler (which should now be allowed to
    # crawl again) rather than re-checking web_crawler -- confirm intent.
    self.assertEquals(len(web_crawler), 6)
def test_suite():
  """Return the suite holding every test of TestWebCrawler."""
  return unittest.TestSuite((unittest.makeSuite(TestWebCrawler),))
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment