Commit de2545fc authored by Nicolas Delaby's avatar Nicolas Delaby

Refactoring of DMS.

- file_name becomes filename
- filename values are not stored in source_reference
Contribution Tool will not honour id arguments.
Contribution Tool can create any kind of document.
Portal Contribution Registry can read the extension and content_type, and can read the content_type from data,
to guess which Portal Type will be best to use.

All discoverable methods (IDiscoverable) can change the portal_type of document.
  (migratePortalType)
User can change portal_type of document through UI with simple Action.
Crawling will not hardcode ids of documents depending on their URLs, thanks to
Portal Url Registry





git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@40971 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 4627391c
...@@ -40,18 +40,15 @@ from Products.ERP5Type import Permissions, PropertySheet, interfaces ...@@ -40,18 +40,15 @@ from Products.ERP5Type import Permissions, PropertySheet, interfaces
from Products.ERP5Type.XMLObject import XMLObject from Products.ERP5Type.XMLObject import XMLObject
from Products.ERP5Type.DateUtils import convertDateToHour,\ from Products.ERP5Type.DateUtils import convertDateToHour,\
number_of_hours_in_day, number_of_hours_in_year number_of_hours_in_day, number_of_hours_in_year
from Products.ERP5Type.Utils import convertToUpperCase, fill_args_from_request from Products.ERP5Type.Utils import convertToUpperCase, fill_args_from_request,\
deprecated
from Products.ERP5Type.TransactionalVariable import getTransactionalVariable from Products.ERP5Type.TransactionalVariable import getTransactionalVariable
from Products.ERP5Type.Cache import getReadOnlyTransactionCache from Products.ERP5Type.Cache import getReadOnlyTransactionCache
from Products.ERP5.Document.Url import UrlMixIn
from Products.ERP5.Tool.ContributionTool import MAX_REPEAT from Products.ERP5.Tool.ContributionTool import MAX_REPEAT
from Products.ERP5Type.UnrestrictedMethod import unrestricted_apply
from Products.ZSQLCatalog.SQLCatalog import SQLQuery from Products.ZSQLCatalog.SQLCatalog import SQLQuery
from AccessControl import Unauthorized from AccessControl import Unauthorized
import zope.interface import zope.interface
from Products.PythonScripts.Utility import allow_class from Products.PythonScripts.Utility import allow_class
import tempfile
from subprocess import Popen, PIPE
# Mixin Import # Mixin Import
from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
...@@ -60,9 +57,10 @@ from Products.ERP5.mixin.downloadable import DownloadableMixin ...@@ -60,9 +57,10 @@ from Products.ERP5.mixin.downloadable import DownloadableMixin
from Products.ERP5.mixin.document import DocumentMixin from Products.ERP5.mixin.document import DocumentMixin
from Products.ERP5.mixin.extensible_traversable import DocumentExtensibleTraversableMixin from Products.ERP5.mixin.extensible_traversable import DocumentExtensibleTraversableMixin
from Products.ERP5.mixin.crawlable import CrawlableMixin from Products.ERP5.mixin.crawlable import CrawlableMixin
from Products.ERP5.mixin.discoverable import DiscoverableMixin
from Products.ERP5.mixin.url import UrlMixin
_MARKER = [] _MARKER = []
VALID_ORDER_KEY_LIST = ('user_login', 'content', 'file_name', 'input')
# these property ids are unchangable # these property ids are unchangable
FIXED_PROPERTY_IDS = ('id', 'uid', 'rid', 'sid') FIXED_PROPERTY_IDS = ('id', 'uid', 'rid', 'sid')
...@@ -88,8 +86,9 @@ class DocumentProxyError(Exception):pass ...@@ -88,8 +86,9 @@ class DocumentProxyError(Exception):pass
class NotConvertedError(Exception):pass class NotConvertedError(Exception):pass
allow_class(NotConvertedError) allow_class(NotConvertedError)
class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedConvertableMixin, class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixin,
CrawlableMixin, TextConvertableMixin, DownloadableMixin, DocumentMixin): CachedConvertableMixin, CrawlableMixin, TextConvertableMixin,
DownloadableMixin, DocumentMixin, DiscoverableMixin):
"""Document is an abstract class with all methods related to document """Document is an abstract class with all methods related to document
management in ERP5. This includes searchable text, explicit relations, management in ERP5. This includes searchable text, explicit relations,
implicit relations, metadata, versions, languages, etc. implicit relations, metadata, versions, languages, etc.
...@@ -144,7 +143,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo ...@@ -144,7 +143,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
input - data supplied with http request or set on the object during (2) (e.g. input - data supplied with http request or set on the object during (2) (e.g.
discovered from email text) discovered from email text)
file_name - data which might be encoded in file name filename - data which might be encoded in filename
user_login - information about user who is contributing the file user_login - information about user who is contributing the file
content - data which might be derived from document content content - data which might be derived from document content
...@@ -156,7 +155,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo ...@@ -156,7 +155,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
Methods for discovering metadata are: Methods for discovering metadata are:
getPropertyDictFromInput getPropertyDictFromInput
getPropertyDictFromFileName getPropertyDictFromFilename
getPropertyDictFromUserLogin getPropertyDictFromUserLogin
getPropertyDictFromContent getPropertyDictFromContent
...@@ -266,10 +265,15 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo ...@@ -266,10 +265,15 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
interfaces.IVersionable, interfaces.IVersionable,
interfaces.IDownloadable, interfaces.IDownloadable,
interfaces.ICrawlable, interfaces.ICrawlable,
interfaces.IDocument interfaces.IDocument,
interfaces.IDiscoverable,
interfaces.IUrl,
) )
# Regular expressions # Regular expressions
# XXX those regex are weak, fast but not reliable.
# this is a valid url than regex are not able to parse
# http://www.example.com//I don't care i put what/ i want/
href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE) href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL) body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL) title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
...@@ -639,141 +643,14 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo ...@@ -639,141 +643,14 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
if not reference: if not reference:
return return
catalog = self.getPortalObject().portal_catalog catalog = self.getPortalObject().portal_catalog
res = catalog(reference=self.getReference(), sort_on=(('creation_date','ascending'),)) result_list = catalog.unrestrictedSearchResults(
reference=self.getReference(),
sort_on=(('creation_date',
'ascending'),))
# XXX this should be security-unaware - delegate to script with proxy roles # XXX this should be security-unaware - delegate to script with proxy roles
return res[0].getLanguage() # XXX what happens if it is empty? if result_list:
return result_list[0].getLanguage()
### Property getters return
# Property Getters are document dependent so that we can
# handle the weird cases in which needed properties change with the type of document
# and the usual cases in which accessing content changes with the meta type
security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromUserLogin')
def getPropertyDictFromUserLogin(self, user_login=None):
"""
Based on the user_login, find out as many properties as needed.
returns properties which should be set on the document
"""
if user_login is None:
user_login = str(getSecurityManager().getUser())
method = self._getTypeBasedMethod('getPropertyDictFromUserLogin',
fallback_script_id='Document_getPropertyDictFromUserLogin')
return method(user_login)
security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromContent')
def getPropertyDictFromContent(self):
"""
Based on the document content, find out as many properties as needed.
returns properties which should be set on the document
"""
# accesss data through convert
mime, content = self.convert(None)
if not content:
# if document is empty, we will not find anything in its content
return {}
method = self._getTypeBasedMethod('getPropertyDictFromContent',
fallback_script_id='Document_getPropertyDictFromContent')
return method()
security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromFileName')
def getPropertyDictFromFileName(self, file_name):
"""
Based on the file name, find out as many properties as needed.
returns properties which should be set on the document
"""
return self.portal_contributions.getPropertyDictFromFileName(file_name)
security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromInput')
def getPropertyDictFromInput(self):
"""
Get properties which were supplied explicitly to the ingestion method
(discovered or supplied before the document was created).
The implementation consists in saving document properties
into _backup_input by supposing that original input parameters were
set on the document by ContributionTool.newContent as soon
as the document was created.
"""
kw = getattr(self, '_backup_input', {})
if kw:
return kw
for id in self.propertyIds():
# We should not consider file data
if id not in ('data', 'categories_list', 'uid', 'id',
'text_content', 'base_data',) \
and self.hasProperty(id):
kw[id] = self.getProperty(id)
self._backup_input = kw # We could use volatile and pass kw in activate
# if we are garanteed that _backup_input does not
# disappear within a given transaction
return kw
### Metadata disovery and ingestion methods
security.declareProtected(Permissions.ModifyPortalContent, 'discoverMetadata')
def discoverMetadata(self, file_name=None, user_login=None):
"""
This is the main metadata discovery function - controls the process
of discovering data from various sources. The discovery itself is
delegated to scripts or uses preference-configurable regexps. The
method returns either self or the document which has been
merged in the discovery process.
file_name - this parameter is a file name of the form "AA-BBB-CCC-223-en"
user_login - this is a login string of a person; can be None if the user is
currently logged in, then we'll get him from session
"""
# Preference is made of a sequence of 'user_login', 'content', 'file_name', 'input'
method = self._getTypeBasedMethod('getPreferredDocumentMetadataDiscoveryOrderList',
fallback_script_id = 'Document_getPreferredDocumentMetadataDiscoveryOrderList')
order_list = list(method())
order_list.reverse()
# build a dictionary according to the order
kw = {}
for order_id in order_list:
result = None
if order_id not in VALID_ORDER_KEY_LIST:
# Prevent security attack or bad preferences
raise AttributeError, "%s is not in valid order key list" % order_id
method_id = 'getPropertyDictFrom%s' % convertToUpperCase(order_id)
method = getattr(self, method_id)
if order_id == 'file_name':
if file_name is not None:
result = method(file_name)
elif order_id == 'user_login':
if user_login is not None:
result = method(user_login)
else:
result = method()
if result is not None:
for key, value in result.iteritems():
if value not in (None, ''):
kw[key]=value
if file_name is not None:
# filename is often undefined....
kw['source_reference'] = file_name
# Prepare the content edit parameters - portal_type should not be changed
kw.pop('portal_type', None)
# Try not to invoke an automatic transition here
self._edit(**kw)
# Finish ingestion by calling method
self.finishIngestion() # XXX - is this really the right place ?
self.reindexObject() # XXX - is this really the right place ?
# Revision merge is tightly coupled
# to metadata discovery - refer to the documentation of mergeRevision method
merged_doc = self.mergeRevision() # XXX - is this really the right place ?
merged_doc.reindexObject() # XXX - is this really the right place ?
return merged_doc # XXX - is this really the right place ?
security.declareProtected(Permissions.ModifyPortalContent, 'finishIngestion')
def finishIngestion(self):
"""
Finish the ingestion process by calling the appropriate script. This
script can for example allocate a reference number automatically if
no reference was defined.
"""
method = self._getTypeBasedMethod('finishIngestion', fallback_script_id='Document_finishIngestion')
return method()
security.declareProtected(Permissions.View, 'asSubjectText') security.declareProtected(Permissions.View, 'asSubjectText')
def asSubjectText(self, **kw): def asSubjectText(self, **kw):
...@@ -827,32 +704,13 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo ...@@ -827,32 +704,13 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
return self._stripHTML(self._asHTML(**kw)) return self._stripHTML(self._asHTML(**kw))
security.declarePrivate('_guessEncoding') security.declarePrivate('_guessEncoding')
@deprecated
def _guessEncoding(self, string, mime='text/html'): def _guessEncoding(self, string, mime='text/html'):
""" """
Try to guess the encoding for this string. Deprecated method
Returns None if no encoding can be guessed.
""" """
try: contribution_tool = self.getPortalObject().portal_contributions
import chardet return contribution_tool.guessEncodingFromText(string, content_type=mime)
except ImportError:
chardet = None
if chardet is not None and (mime == 'text/html'\
or os.sys.platform != 'linux2'):
# chardet works fine on html document and its platform independent
return chardet.detect(string).get('encoding', None)
else:
# file command provide better result
# for text/plain documents
# store the content into tempfile
file_descriptor, path = tempfile.mkstemp()
file_object = os.fdopen(file_descriptor, 'w')
file_object.write(string)
file_object.close()
# run file command against tempfile to and read encoded
command_result = Popen(['file', '-b', '--mime-encoding', path],
stdout=PIPE).communicate()[0]
# return detected encoding
return command_result.strip()
def _stripHTML(self, html, charset=None): def _stripHTML(self, html, charset=None):
""" """
...@@ -866,22 +724,6 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo ...@@ -866,22 +724,6 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
stripped_html = html stripped_html = html
return stripped_html return stripped_html
security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
def getContentInformation(self):
"""
Returns the content information from the HTML conversion.
The default implementation tries to build a dictionnary
from the HTML conversion of the document and extract
the document title.
"""
result = {}
html = self.asEntireHTML()
if not html: return result
title_list = re.findall(self.title_parser, str(html))
if title_list:
result['title'] = title_list[0]
return result
security.declareProtected(Permissions.AccessContentsInformation, security.declareProtected(Permissions.AccessContentsInformation,
'getMetadataMappingDict') 'getMetadataMappingDict')
def getMetadataMappingDict(self): def getMetadataMappingDict(self):
...@@ -918,21 +760,6 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo ...@@ -918,21 +760,6 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
method = None method = None
if method is not None: method() if method is not None: method()
# Crawling API
security.declareProtected(Permissions.AccessContentsInformation, 'getContentURLList')
def getContentURLList(self):
"""
Returns a list of URLs referenced by the content of this document.
Default implementation consists in analysing the document
converted to HTML. Subclasses may overload this method
if necessary. However, it is better to extend the conversion
methods in order to produce valid HTML, which is useful to
many people, rather than overload this method which is only
useful for crawling.
"""
html_content = self.asStrippedHTML()
return re.findall(self.href_parser, str(html_content))
security.declareProtected(Permissions.ModifyPortalContent, 'updateContentFromURL') security.declareProtected(Permissions.ModifyPortalContent, 'updateContentFromURL')
def updateContentFromURL(self, repeat=MAX_REPEAT, crawling_depth=0): def updateContentFromURL(self, repeat=MAX_REPEAT, crawling_depth=0):
""" """
...@@ -963,18 +790,3 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo ...@@ -963,18 +790,3 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
if hasattr(aq_base(container), 'isIndexContent'): if hasattr(aq_base(container), 'isIndexContent'):
return container.isIndexContent(self) return container.isIndexContent(self)
return False return False
security.declareProtected(Permissions.AccessContentsInformation, 'getContentBaseURL')
def getContentBaseURL(self):
"""
Returns the content base URL based on the actual content or
on its URL.
"""
base_url = self.asURL()
base_url_list = base_url.split('/')
if len(base_url_list):
if base_url_list[-1] and base_url_list[-1].find('.') > 0:
# Cut the trailing part in http://www.some.site/at/trailing.html
# but not in http://www.some.site/at
base_url = '/'.join(base_url_list[:-1])
return base_url
...@@ -114,22 +114,14 @@ class PDFDocument(Image): ...@@ -114,22 +114,14 @@ class PDFDocument(Image):
""" """
if not self.hasData(): if not self.hasData():
return '' return ''
tmp = tempfile.NamedTemporaryFile() mime_type = 'text/plain'
tmp.write(self.getData()) portal_transforms = self.getPortalObject().portal_transforms
tmp.seek(0) filename = self.getStandardFilename(format='txt')
try: result = portal_transforms.convertToData(mime_type, str(self.getData()),
command = ['pdftotext', '-layout', '-enc', 'UTF-8', context=self, filename=filename,
'-nopgbrk', tmp.name, '-'] mimetype=self.getContentType())
try: if result:
command_result = Popen(command, stdout=PIPE).communicate()[0] return result
except OSError, e:
if e.errno == errno.ENOENT:
raise ConversionError('pdftotext was not found')
raise
finally:
tmp.close()
if command_result:
return command_result
else: else:
# Try to use OCR # Try to use OCR
# As high dpi images are required, it may take some times to convert the # As high dpi images are required, it may take some times to convert the
...@@ -145,13 +137,12 @@ class PDFDocument(Image): ...@@ -145,13 +137,12 @@ class PDFDocument(Image):
frame=page_number, display='identical') frame=page_number, display='identical')
if not src_mimetype.endswith('png'): if not src_mimetype.endswith('png'):
continue continue
content = '%s' % png_data content = str(png_data)
mime_type = 'text/plain'
if content is not None: if content is not None:
portal_transforms = getToolByName(self, 'portal_transforms') filename = self.getStandardFilename(format='png')
result = portal_transforms.convertToData(mime_type, content, result = portal_transforms.convertToData(mime_type, content,
context=self, context=self,
filename=self.getTitleOrId(), filename=filename,
mimetype=src_mimetype) mimetype=src_mimetype)
if result is None: if result is None:
raise ConversionError('PDFDocument conversion error. ' raise ConversionError('PDFDocument conversion error. '
......
...@@ -45,6 +45,9 @@ try: ...@@ -45,6 +45,9 @@ try:
from string import Template from string import Template
except ImportError: except ImportError:
from Products.ERP5Type.patches.string import Template from Products.ERP5Type.patches.string import Template
from Products.ERP5Type.Utils import guessEncodingFromText
from lxml import html as etree_html
class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
TextContent, File): TextContent, File):
...@@ -147,7 +150,7 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, ...@@ -147,7 +150,7 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
kw['format'] = format kw['format'] = format
if not self.hasConversion(**kw): if not self.hasConversion(**kw):
portal_transforms = getToolByName(portal, 'portal_transforms') portal_transforms = getToolByName(portal, 'portal_transforms')
filename = self.getSourceReference(self.getTitleOrId()) filename = self.getStandardFilename(format=format)
if mime_type == 'text/html': if mime_type == 'text/html':
mime_type = 'text/x-html-safe' mime_type = 'text/x-html-safe'
result = portal_transforms.convertToData(mime_type, text_content, result = portal_transforms.convertToData(mime_type, text_content,
...@@ -183,9 +186,13 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, ...@@ -183,9 +186,13 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
""" """
if self.hasTextContent(): if self.hasTextContent():
html = self._asHTML() html = self._asHTML()
base_list = re.findall(self.base_parser, str(html)) # a document can be entirely stripped by safe_html
if base_list: # so its html conversion can be empty
return base_list[0] if html.strip():
html_tree = etree_html.fromstring(html)
base_list = [href for href in html_tree.xpath('//base/@href') if href]
if base_list:
return str(base_list[0])
return Document.getContentBaseURL(self) return Document.getContentBaseURL(self)
security.declareProtected(Permissions.ModifyPortalContent, 'setBaseData') security.declareProtected(Permissions.ModifyPortalContent, 'setBaseData')
...@@ -270,14 +277,14 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin, ...@@ -270,14 +277,14 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
return encoded content_type and message if encoding return encoded content_type and message if encoding
is not utf-8 is not utf-8
""" """
codec = document._guessEncoding(text_content, content_type) codec = guessEncodingFromText(text_content, content_type)
if codec is not None: if codec is not None:
try: try:
text_content = text_content.decode(codec).encode('utf-8') text_content = text_content.decode(codec).encode('utf-8')
except (UnicodeDecodeError, LookupError): except (UnicodeDecodeError, LookupError):
message = 'Conversion to base format with codec %r fails' % codec message = 'Conversion to base format with codec %r fails' % codec
# try again with another guesser based on file command # try again with another guesser based on file command
codec = document._guessEncoding(text_content, 'text/plain') codec = guessEncodingFromText(text_content, 'text/plain')
if codec is not None: if codec is not None:
try: try:
text_content = text_content.decode(codec).encode('utf-8') text_content = text_content.decode(codec).encode('utf-8')
......
...@@ -29,7 +29,7 @@ ...@@ -29,7 +29,7 @@
from AccessControl import ClassSecurityInfo from AccessControl import ClassSecurityInfo
from Products.ERP5Type.Globals import InitializeClass from Products.ERP5Type.Globals import InitializeClass
from Products.ERP5Type.Tool.BaseTool import BaseTool from Products.ERP5Type.Tool.BaseTool import BaseTool
from Products.ERP5Type import Permissions
class ContributionRegistryTool(BaseTool): class ContributionRegistryTool(BaseTool):
...@@ -41,14 +41,18 @@ class ContributionRegistryTool(BaseTool): ...@@ -41,14 +41,18 @@ class ContributionRegistryTool(BaseTool):
security = ClassSecurityInfo() security = ClassSecurityInfo()
security.declarePrivate('findPortalTypeName') security.declareProtected(Permissions.AccessContentsInformation,
def findPortalTypeName(self, file_name='', mime_type=None, data=None): 'findPortalTypeName')
from Products.ERP5Type.Document import newTempIngestionFile def findPortalTypeName(self, context=None, **kw):
ingestion_file = newTempIngestionFile(self, 'id') # if a context is passed, ignore other arguments
ingestion_file._edit(file_name=file_name, mime_type=mime_type, data=data) if context is None:
# Build a temp object edited with provided parameters
from Products.ERP5Type.Document import newTempFile
context = newTempFile(self, 'id')
context.edit(**kw)
for predicate in self.objectValues(sort_on='int_index'): for predicate in self.objectValues(sort_on='int_index'):
result = predicate.test(ingestion_file) result = predicate.test(context)
if result: if result:
return result return result
......
...@@ -29,12 +29,7 @@ ...@@ -29,12 +29,7 @@
import cStringIO import cStringIO
import re import re
import string
import socket import socket
try:
from hashlib import md5 as md5_new
except ImportError:
from md5 import new as md5_new
import urllib2, urllib import urllib2, urllib
import urlparse import urlparse
from cgi import parse_header from cgi import parse_header
...@@ -46,13 +41,11 @@ from Products.CMFCore.utils import getToolByName, _checkPermission ...@@ -46,13 +41,11 @@ from Products.CMFCore.utils import getToolByName, _checkPermission
from Products.ERP5Type.Tool.BaseTool import BaseTool from Products.ERP5Type.Tool.BaseTool import BaseTool
from Products.ERP5Type import Permissions from Products.ERP5Type import Permissions
from Products.ERP5 import _dtmldir from Products.ERP5 import _dtmldir
from Products.ERP5.Document.Url import no_crawl_protocol_list, no_host_protocol_list from Products.ERP5.Document.Url import no_crawl_protocol_list
from AccessControl import Unauthorized from AccessControl import Unauthorized
from zLOG import LOG
from DateTime import DateTime from DateTime import DateTime
from Acquisition import aq_base import warnings
from zExceptions import BadRequest
# Install openers # Install openers
import ContributionOpener import ContributionOpener
...@@ -83,7 +76,7 @@ class ContributionTool(BaseTool): ...@@ -83,7 +76,7 @@ class ContributionTool(BaseTool):
Configuration Scripts: Configuration Scripts:
- ContributionTool_getPropertyDictFromFileName: receives file name and a - ContributionTool_getPropertyDictFromFilename: receives file name and a
dict derived from filename by regular expression, and does any necesary dict derived from filename by regular expression, and does any necesary
operations (e.g. mapping document type id onto a real portal_type). operations (e.g. mapping document type id onto a real portal_type).
...@@ -98,8 +91,7 @@ class ContributionTool(BaseTool): ...@@ -98,8 +91,7 @@ class ContributionTool(BaseTool):
meta_type = 'ERP5 Contribution Tool' meta_type = 'ERP5 Contribution Tool'
portal_type = 'Contribution Tool' portal_type = 'Contribution Tool'
# Regular expressions
simple_normaliser = re.compile('#.*')
# Declarative Security # Declarative Security
security = ClassSecurityInfo() security = ClassSecurityInfo()
...@@ -108,153 +100,141 @@ class ContributionTool(BaseTool): ...@@ -108,153 +100,141 @@ class ContributionTool(BaseTool):
manage_overview = DTMLFile( 'explainContributionTool', _dtmldir ) manage_overview = DTMLFile( 'explainContributionTool', _dtmldir )
security.declareProtected(Permissions.AddPortalContent, 'newContent') security.declareProtected(Permissions.AddPortalContent, 'newContent')
def newContent(self, id=None, portal_type=None, url=None, container=None, def newContent(self, **kw):
container_path=None,
discover_metadata=1, temp_object=0,
user_login=None, data=None, file_name=None, **kw):
""" """
The newContent method is overriden to implement smart content The newContent method is overriden to implement smart content
creation by detecting the portal type based on whatever information creation by detecting the portal type based on whatever information
was provided and finding out the most appropriate module to store was provided and finding out the most appropriate module to store
the content. the content.
user_login is the name under which the content will be created explicit named parameters was:
XXX - this is a security hole which needs to be fixed by id - ignored argument
making sure only Manager can use this parameter portal_type - explicit portal_type parameter, must be honoured
url - Identifier of external resource. Content will be downloaded
container -- if specified, it is possible to define from it
where to contribute the content. Else, ContributionTool container - if specified, it is possible to define
tries to guess. where to contribute the content. Else, ContributionTool
tries to guess.
container_path -- if specified, defines the container path container_path - if specified, defines the container path
and has precedence over container and has precedence over container
discover_metadata - Enable metadata extraction and discovery
url -- if specified, content is download from the URL. (default True)
temp_object - build tempObject or not (default False)
NOTE: user_login - is the name under which the content will be created
We always generate ID. So, we must prevent using the one XXX - this is a security hole which needs to be fixed by
which we were provided. making sure only Manager can use this parameter
data - Binary representation of content
filename - explicit filename of content
""" """
if file_name is not None: kw.pop('id', None) # Never use hardcoded ids anymore longer
kw['file_name'] = file_name
if data is not None: # Useful for metadata discovery, keep it as it as been provided
# This is only used to make sure input_parameter_dict = kw.copy()
# we can pass file as parameter to ZPublisher # But file and data are exceptions.
# whenever we ingest email # They are potentialy too big to be keept into memory.
kw['data'] = data # We want to keep only one reference of thoses values
# on futur created document only !
if 'file' in input_parameter_dict:
del input_parameter_dict['file']
if 'data' in input_parameter_dict:
del input_parameter_dict['data']
# pop: remove keys which are not document properties
url = kw.pop('url', None)
container = kw.pop('container', None)
container_path = kw.pop('container_path', None)
discover_metadata = kw.pop('discover_metadata', True)
user_login = kw.pop('user_login', None)
# check file_name argument for backward compatibility.
if 'file_name' in kw:
if 'filename' not in kw:
kw['filename'] = kw['file_name']
del(kw['file_name'])
filename = kw.get('filename', None)
portal_type = kw.get('portal_type')
temp_object = kw.get('temp_object', False)
document = None document = None
portal = self.getPortalObject()
# Try to find the file_name # Try to find the filename
content_type = None content_type = None
if not url: if not url:
# check if file was provided # check if file was provided
file = kw.get('file', None) file_object = kw.get('file')
if file is not None and file_name is None: if file_object is not None:
file_name = file.filename if not filename:
filename = file_object.filename
else: else:
# some channels supply data and file-name separately # some channels supply data and file-name separately
# this is the case for example for email ingestion # this is the case for example for email ingestion
# in this case, we build a file wrapper for it # in this case, we build a file wrapper for it
data = kw.get('data', None) data = kw.get('data')
if data is not None: if data is not None and filename:
file_name = kw.get('file_name', None) file_object = cStringIO.StringIO()
if file_name is not None: file_object.write(data)
file = cStringIO.StringIO() file_object.seek(0)
file.write(data) kw['file'] = file_object
file.seek(0) del kw['data']
kw['file'] = file else:
del kw['data'] raise TypeError, 'data and filename must be provided'
del kw['file_name']
else: else:
# build a new file from the url file_object, filename, content_type = self._openURL(url)
url_file = urllib2.urlopen(url)
data = url_file.read() # time out must be set or ... too long XXX
file = cStringIO.StringIO()
file.write(data)
file.seek(0)
# if a content-disposition header is present,
# try first to read the suggested filename from it.
header_info = url_file.info()
content_disposition = header_info.getheader('content-disposition', '')
file_name = parse_header(content_disposition)[1].get('filename')
if not file_name:
# Now read the filename from url.
# In case of http redirection, the real url must be read
# from file object returned by urllib2.urlopen.
# It can happens when the header 'Location' is present in request.
# See http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.30
url = url_file.geturl()
# Create a file name based on the URL and quote it
file_name = urlparse.urlsplit(url)[-3]
file_name = os.path.basename(file_name)
file_name = urllib.quote(file_name, safe='')
file_name = file_name.replace('%', '')
# For URLs, we want an id by default equal to the encoded URL
if id is None:
id = self.encodeURL(url)
content_type = header_info.gettype()
if content_type: if content_type:
kw['content_type'] = content_type kw['content_type'] = content_type
kw['file'] = file kw['file'] = file_object
# If the portal_type was provided, we can go faster # If the portal_type was provided, we can go faster
if portal_type and container is None: if portal_type and container is None:
# We know the portal_type, let us find the default module # We know the portal_type, let us find the default module
# and use it as container # and use it as container
try: try:
container = self.getDefaultModule(portal_type) container = portal.getDefaultModule(portal_type)
except ValueError: except ValueError:
container = None container = None
if portal_type and container is not None:
# We could simplify things here and return a document immediately
# NOTE: we use the module ID generator rather than the provided ID
#document = module.newContent(portal_type=portal_type, **kw)
#if discover_metadata:
# document.activate().discoverMetadata(file_name=file_name, user_login=user_login)
#return document
pass # XXX - This needs to be implemented once the rest is stable
# From here, there is no hope unless a file was provided # From here, there is no hope unless a file was provided
if file is None: if file_object is None:
raise ValueError, "could not determine portal type" raise ValueError, "No data provided"
if portal_type is None:
# Guess it with help of portal_contribution_registry
registry = getToolByName(portal, 'portal_contribution_registry')
portal_type = registry.findPortalTypeName(filename=filename,
content_type=content_type)
# #
# Check if same file is already exists. if it exists, then update it. # Check if same file is already exists. if it exists, then update it.
# #
if portal_type is None: property_dict = self.getMatchedFilenamePatternDict(filename)
portal_type = self._guessPortalType(file_name, content_type, data) reference = property_dict.get('reference', None)
property_dict = self.getMatchedFileNamePatternDict(file_name) version = property_dict.get('version', None)
reference = property_dict.get('reference', None) language = property_dict.get('language', None)
version = property_dict.get('version', None) if portal_type and reference and version and language:
language = property_dict.get('language', None) portal_catalog = getToolByName(portal, 'portal_catalog')
if portal_type and reference and version and language: document = portal_catalog.getResultValue(portal_type=portal_type,
portal_catalog = getToolByName(self, 'portal_catalog') reference=reference,
document = portal_catalog.getResultValue(portal_type=portal_type, version=version,
reference=reference, language=language)
version=version,
language=language)
if document is not None:
# document is already uploaded. So overrides file.
if not _checkPermission(Permissions.ModifyPortalContent, document):
raise Unauthorized, "[DMS] You are not allowed to update the existing document which has the same coordinates (id %s)" % document.getId()
document.edit(file=kw['file'])
return document
if document is not None:
# document is already uploaded. So overrides file.
if not _checkPermission(Permissions.ModifyPortalContent, document):
raise Unauthorized, "[DMS] You are not allowed to update the existing document which has the same coordinates (id %s)" % document.getId()
document.edit(file=kw['file'])
return document
# Temp objects use the standard newContent from Folder # Temp objects use the standard newContent from Folder
if temp_object: if temp_object:
# For temp_object creation, use the standard method # For temp_object creation, use the standard method
return BaseTool.newContent(self, id=id, portal_type=portal_type, kw['portal_type'] = portal_type
temp_object=temp_object, **kw) return BaseTool.newContent(self, **kw)
# Then put the file inside ourselves for a short while # Then put the file inside ourselves for a short while
if container_path is not None: if container_path is not None:
container = self.getPortalObject().restrictedTraverse(container_path) container = self.getPortalObject().restrictedTraverse(container_path)
document = self._setObject(file_name, None, portal_type=portal_type, document = self._setObject(filename, None, portal_type=portal_type,
user_login=user_login, id=id, user_login=user_login, container=container,
container=container,
discover_metadata=discover_metadata, discover_metadata=discover_metadata,
filename=filename,
input_parameter_dict=input_parameter_dict
) )
object_id = document.getId() object_id = document.getId()
document = self._getOb(object_id) # Call _getOb to purge cache document = self._getOb(object_id) # Call _getOb to purge cache
...@@ -264,18 +244,12 @@ class ContributionTool(BaseTool): ...@@ -264,18 +244,12 @@ class ContributionTool(BaseTool):
if modified_kw is not None: if modified_kw is not None:
kw.update(modified_kw) kw.update(modified_kw)
kw['filename'] = filename # Override filename property
# Then edit the document contents (so that upload can happen) # Then edit the document contents (so that upload can happen)
document._edit(**kw) document._edit(**kw)
# if no content_type has been set, guess it
if 'content_type' not in kw and getattr(document, 'guessMimeType', None) is not None:
# For File force to setup the mime_type
document.guessMimeType(fname=file_name)
if url: if url:
document.fromURL(url) document.fromURL(url)
# Notify workflows
#document.notifyWorkflowCreated()
# Allow reindexing, reindex it and return the document # Allow reindexing, reindex it and return the document
try: try:
delattr(document, 'isIndexable') delattr(document, 'isIndexable')
...@@ -293,17 +267,19 @@ class ContributionTool(BaseTool): ...@@ -293,17 +267,19 @@ class ContributionTool(BaseTool):
""" """
pass pass
security.declareProtected(Permissions.ModifyPortalContent,'getMatchedFileNamePatternDict') security.declareProtected(Permissions.ModifyPortalContent,
def getMatchedFileNamePatternDict(self, file_name): 'getMatchedFilenamePatternDict')
def getMatchedFilenamePatternDict(self, filename):
""" """
Get matched group dict of file name parsing regular expression. Get matched group dict of file name parsing regular expression.
""" """
property_dict = {} property_dict = {}
if file_name is None: if filename is None:
return property_dict return property_dict
regex_text = self.portal_preferences.getPreferredDocumentFileNameRegularExpression() regex_text = self.portal_preferences.\
getPreferredDocumentFilenameRegularExpression()
if regex_text in ('', None): if regex_text in ('', None):
return property_dict return property_dict
...@@ -311,42 +287,55 @@ class ContributionTool(BaseTool): ...@@ -311,42 +287,55 @@ class ContributionTool(BaseTool):
pattern = re.compile(regex_text) pattern = re.compile(regex_text)
if pattern is not None: if pattern is not None:
try: try:
property_dict = pattern.match(file_name).groupdict() property_dict = pattern.match(filename).groupdict()
except AttributeError: # no match except AttributeError: # no match
pass pass
return property_dict return property_dict
security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromFileName') # backward compatibility
def getPropertyDictFromFileName(self, file_name): security.declareProtected(Permissions.ModifyPortalContent,
'getMatchedFileNamePatternDict')
def getMatchedFileNamePatternDict(self, filename):
"""
(deprecated) use getMatchedFilenamePatternDict() instead.
"""
warnings.warn('getMatchedFileNamePatternDict() is deprecated. '
'use getMatchedFilenamePatternDict() instead.')
return self.getMatchedFilenamePatternDict(filename)
security.declareProtected(Permissions.ModifyPortalContent,
'getPropertyDictFromFilename')
def getPropertyDictFromFilename(self, filename):
""" """
Gets properties from filename. File name is parsed with a regular expression Gets properties from filename. File name is parsed with a regular expression
set in preferences. The regexp should contain named groups. set in preferences. The regexp should contain named groups.
""" """
if file_name is None: if filename is None:
return {} return {}
property_dict = self.getMatchedFileNamePatternDict(file_name) property_dict = self.getMatchedFilenamePatternDict(filename)
method = self._getTypeBasedMethod('getPropertyDictFromFileName', method = self._getTypeBasedMethod('getPropertyDictFromFilename',
fallback_script_id = 'ContributionTool_getPropertyDictFromFileName') fallback_script_id='ContributionTool_getPropertyDictFromFilename')
property_dict = method(file_name, property_dict) property_dict = method(filename, property_dict)
if property_dict.get('portal_type', None) is not None:
# we have to return portal_type as a tuple
# because we should allow for having multiple candidate types
property_dict['portal_type'] = (property_dict['portal_type'],)
else:
# we have to find candidates by file extenstion
basename, extension = os.path.splitext(file_name)
if extension:
extension = extension.lstrip('.') # remove first dot
property_dict['portal_type'] =\
self.ContributionTool_getCandidateTypeListByExtension(extension)
return property_dict return property_dict
# backward compatibility
security.declareProtected(Permissions.ModifyPortalContent,
'getPropertyDictFromFileName')
def getPropertyDictFromFileName(self, filename):
"""
(deprecated) use getPropertyDictFromFilename() instead.
"""
warnings.warn('getPropertyDictFromFileName() is deprecated. '
'use getPropertyDictFromFilename() instead.')
return self.getPropertyDictFromFilename(filename)
# WebDAV virtual folder support # WebDAV virtual folder support
def _setObject(self, name, ob, portal_type=None, user_login=None, def _setObject(self, id, ob, portal_type=None, user_login=None,
container=None, id=None, discover_metadata=1): container=None, discover_metadata=True, filename=None,
input_parameter_dict=None):
""" """
portal_contribution_registry will find appropriate portal type portal_contribution_registry will find appropriate portal type
name by file_name and content itself. name by filename and content itself.
The ContributionTool instance must be configured in such The ContributionTool instance must be configured in such
way that _verifyObjectPaste will return TRUE. way that _verifyObjectPaste will return TRUE.
...@@ -362,9 +351,8 @@ class ContributionTool(BaseTool): ...@@ -362,9 +351,8 @@ class ContributionTool(BaseTool):
# redefine parameters # redefine parameters
portal_type = ob.getPortalType() portal_type = ob.getPortalType()
container = ob.getParentValue() container = ob.getParentValue()
id = ob.getId()
if not portal_type: if not portal_type:
document = BaseTool.newContent(self, id=name, document = BaseTool.newContent(self, id=id,
portal_type=portal_type, portal_type=portal_type,
is_indexable=0) is_indexable=0)
else: else:
...@@ -379,33 +367,27 @@ class ContributionTool(BaseTool): ...@@ -379,33 +367,27 @@ class ContributionTool(BaseTool):
module = self.getDefaultModule(portal_type) module = self.getDefaultModule(portal_type)
else: else:
module = container module = container
if id is None: # There is no preexisting document - we can therefore
new_id = module.generateNewId() # set the new object
else: document = module.newContent(portal_type=portal_type, is_indexable=0)
new_id = id # We can now discover metadata
existing_document = module._getOb(new_id, None) if discover_metadata:
if existing_document is None: # Metadata disovery is done as an activity by default
# There is no preexisting document - we can therefore # If we need to discoverMetadata synchronously, it must
# set the new object # be for user interface and should thus be handled by
document = module.newContent(id=new_id, # ZODB scripts
portal_type=portal_type, document.activate(after_path_and_method_id=(document.getPath(),
is_indexable=0) ('convertToBaseFormat', 'Document_tryToConvertToBaseFormat'))) \
# We can now discover metadata .discoverMetadata(filename=filename,
if discover_metadata: user_login=user_login,
# Metadata disovery is done as an activity by default input_parameter_dict=input_parameter_dict)
# If we need to discoverMetadata synchronously, it must
# be for user interface and should thus be handled by
# ZODB scripts
document.activate(after_path_and_method_id=(document.getPath(),
('convertToBaseFormat', 'Document_tryToConvertToBaseFormat'))) \
.discoverMetadata(file_name=name, user_login=user_login)
else:
document = existing_document
# Keep the document close to us - this is only useful for # Keep the document close to us - this is only useful for
# file upload from webdav # file upload from webdav
if not hasattr(self, '_v_document_cache'): volatile_cache = getattr(self, '_v_document_cache', None)
if volatile_cache is None:
self._v_document_cache = {} self._v_document_cache = {}
self._v_document_cache[document.getId()] = document.getRelativeUrl() volatile_cache = self._v_document_cache
volatile_cache[document.getId()] = document.getRelativeUrl()
# Return document to newContent method # Return document to newContent method
return document return document
...@@ -417,10 +399,11 @@ class ContributionTool(BaseTool): ...@@ -417,10 +399,11 @@ class ContributionTool(BaseTool):
""" """
# Use the document cache if possible and return result immediately # Use the document cache if possible and return result immediately
# this is only useful for webdav # this is only useful for webdav
if hasattr(self, '_v_document_cache'): volatile_cache = getattr(self, '_v_document_cache', None)
document_url = self._v_document_cache.get(id, None) if volatile_cache is not None:
document_url = volatile_cache.get(id)
if document_url is not None: if document_url is not None:
del self._v_document_cache[id] del volatile_cache[id]
return self.getPortalObject().unrestrictedTraverse(document_url) return self.getPortalObject().unrestrictedTraverse(document_url)
# Try first to return the real object inside # Try first to return the real object inside
...@@ -475,66 +458,11 @@ class ContributionTool(BaseTool): ...@@ -475,66 +458,11 @@ class ContributionTool(BaseTool):
def wrapper(o_list): def wrapper(o_list):
for o in o_list: for o in o_list:
o = o.getObject() o = o.getObject()
id = '%s-%s' % (o.getUid(), o.getStandardFileName(),) id = '%s-%s' % (o.getUid(), o.getStandardFilename(),)
yield o.asContext(id=id) yield o.asContext(id=id)
return wrapper(object_list) return wrapper(object_list)
# Crawling methods
security.declareProtected(Permissions.View, 'normaliseURL')
def normaliseURL(self, url, base_url=None):
"""
Returns a normalised version of the url so
that we do not download twice the same content.
URL normalisation is an important part in crawlers.
The current implementation is obviously simplistic.
Refer to http://en.wikipedia.org/wiki/Web_crawler
and study Harvestman for more ideas.
"""
url = self.simple_normaliser.sub('', url)
url_split = url.split(':')
url_protocol = url_split[0]
if url_protocol in no_host_protocol_list:
return url
if base_url and len(url_split) == 1:
# Make relative URL absolute
url = '%s/%s' % (base_url, url)
return url
security.declareProtected(Permissions.View, 'encodeURL')
def encodeURL(self, url):
"""
Returns the URL as an ID. ID should be chosen in such
way that it is optimal with HBTreeFolder (ie. so that
distribution of access time on a cluster is possible)
NOTE: alternate approach is based on a url table
and catalog lookup. It is faster ? Not sure. Since
we must anyway insert objects in btrees and this
is simimar in cost to accessing them.
"""
# Produce an MD5 from the URL
hex_md5 = md5_new(url).hexdigest()
# Take the first part in the URL which is not empty
# LOG("encodeURL", 0, url)
url_segment = url.split(':')[1]
url_segment_list = url_segment.split('/')
url_domain = None
for url_part in url_segment_list:
if url_part:
url_domain = url_part
break
# Return encoded url
if url_domain:
url_domain = urllib.quote(url_domain, safe='')
url_domain = url_domain.replace('%', '')
return "%s-%s" % (url_domain, hex_md5)
return hex_md5
url = urllib.quote(url, safe='')
url = url.replace('_', '__')
url = url.replace('%', '_')
return url
security.declareProtected(Permissions.AddPortalContent, 'crawlContent') security.declareProtected(Permissions.AddPortalContent, 'crawlContent')
def crawlContent(self, content, container=None): def crawlContent(self, content, container=None):
""" """
...@@ -543,6 +471,8 @@ class ContributionTool(BaseTool): ...@@ -543,6 +471,8 @@ class ContributionTool(BaseTool):
XXX: missing is the conversion of content local href to something XXX: missing is the conversion of content local href to something
valid. valid.
""" """
portal = self.getPortalObject()
url_registry_tool = portal.portal_url_registry
depth = content.getCrawlingDepth() depth = content.getCrawlingDepth()
if depth < 0: if depth < 0:
# Do nothing if crawling depth is reached # Do nothing if crawling depth is reached
...@@ -554,32 +484,34 @@ class ContributionTool(BaseTool): ...@@ -554,32 +484,34 @@ class ContributionTool(BaseTool):
if depth < 0: if depth < 0:
# Do nothing if crawling depth is reached # Do nothing if crawling depth is reached
return return
base_url = content.getContentBaseURL() url_list = content.getContentNormalisedURLList()
url_list = map(lambda url: self.normaliseURL(url, base_url), set(content.getContentURLList()))
for url in set(url_list): for url in set(url_list):
# LOG('trying to crawl', 0, url) # LOG('trying to crawl', 0, url)
# Some url protocols should not be crawled # Some url protocols should not be crawled
if url.split(':')[0] in no_crawl_protocol_list: if urlparse.urlsplit(url)[0] in no_crawl_protocol_list:
continue continue
if container is None: if container is None:
#if content.getParentValue() #if content.getParentValue()
# in place of not ? # in place of not ?
container = content.getParentValue() container = content.getParentValue()
# Calculate the id under which content will be stored try:
id = self.encodeURL(url) url_registry_tool.getReferenceFromURL(url, context=container)
# Try to access the document if it already exists except KeyError:
document = container.get(id, None) pass
if document is None: else:
# XXX - This call is not working due to missing group_method_id # url already crawled
# therefore, multiple call happen in parallel and eventually fail continue
# (the same URL is created multiple times) # XXX - This call is not working due to missing group_method_id
# LOG('activate newContentFromURL', 0, url) # therefore, multiple call happen in parallel and eventually fail
self.activate(activity="SQLQueue").newContentFromURL(container_path=container.getRelativeUrl(), # (the same URL is created multiple times)
id=id, url=url, crawling_depth=depth) # LOG('activate newContentFromURL', 0, url)
elif depth and document.getCrawlingDepth() < depth: self.activate(activity="SQLQueue").newContentFromURL(
# Update the crawling depth if necessary container_path=container.getRelativeUrl(),
document._setCrawlingDepth(depth) url=url, crawling_depth=depth)
document.activate().crawlContent() # Url is not known yet but register right now to avoid
# creation of duplicated crawled content
# An activity will later setup the good reference for it.
url_registry_tool.registerURL(url, None, context=container)
security.declareProtected(Permissions.AddPortalContent, 'updateContentFromURL') security.declareProtected(Permissions.AddPortalContent, 'updateContentFromURL')
def updateContentFromURL(self, content, repeat=MAX_REPEAT, crawling_depth=0): def updateContentFromURL(self, content, repeat=MAX_REPEAT, crawling_depth=0):
...@@ -595,10 +527,7 @@ class ContributionTool(BaseTool): ...@@ -595,10 +527,7 @@ class ContributionTool(BaseTool):
# Step 1: download new content # Step 1: download new content
try: try:
url = content.asURL() url = content.asURL()
data = urllib2.urlopen(url).read() file_object, filename, content_type = self._openURL(url)
file = cStringIO.StringIO()
file.write(data)
file.seek(0)
except urllib2.HTTPError, error: except urllib2.HTTPError, error:
if repeat == 0: if repeat == 0:
# XXX - Call the extendBadURLList method,--NOT Implemented-- # XXX - Call the extendBadURLList method,--NOT Implemented--
...@@ -615,28 +544,28 @@ class ContributionTool(BaseTool): ...@@ -615,28 +544,28 @@ class ContributionTool(BaseTool):
content.activate(at_date=DateTime() + 1).updateContentFromURL(repeat=repeat - 1) content.activate(at_date=DateTime() + 1).updateContentFromURL(repeat=repeat - 1)
return return
# Step 2: compare and update if necessary (md5) content._edit(file=file_object, content_type=content_type)
# md5 stuff to compare contents # Please make sure that if content is the same
new_content_md5 = md5_new(data).hexdigest()
content_md5 = content.getContentMd5()
if content_md5 == new_content_md5:
return
content._edit(file=file)# Please make sure that if content is the same
# we do not update it # we do not update it
# This feature must be implemented by Base or File # This feature must be implemented by Base or File
# not here (look at _edit in Base) # not here (look at _edit in Base)
# Step 3: convert to base format # Step 2: convert to base format
content.convertToBaseFormat() if content.isSupportBaseDataConversion():
content.activate().Document_tryToConvertToBaseFormat()
# Step 3: run discoverMetadata
content.activate(after_path_and_method_id=(content.getPath(),
('convertToBaseFormat', 'Document_tryToConvertToBaseFormat'))) \
.discoverMetadata(filename=filename)
# Step 4: activate populate (unless interaction workflow does it) # Step 4: activate populate (unless interaction workflow does it)
content.activate().populateContent() content.activate().populateContent()
# Step 5: activate crawlContent # Step 5: activate crawlContent
depth = content.getCrawlingDepth() depth = content.getCrawlingDepth()
if depth > 0: if depth > 0:
content.activate().crawlContent() content.activate().crawlContent()
content.setContentMd5(new_content_md5)
security.declareProtected(Permissions.AddPortalContent, 'newContentFromURL') security.declareProtected(Permissions.AddPortalContent, 'newContentFromURL')
def newContentFromURL(self, container_path=None, id=None, repeat=MAX_REPEAT, repeat_interval=1, batch_mode=True, **kw): def newContentFromURL(self, container_path=None, id=None, repeat=MAX_REPEAT,
repeat_interval=1, batch_mode=True, url=None, **kw):
""" """
A wrapper method for newContent which provides extra safety A wrapper method for newContent which provides extra safety
in case or errors (ie. download, access, conflict, etc.). in case or errors (ie. download, access, conflict, etc.).
...@@ -646,17 +575,13 @@ class ContributionTool(BaseTool): ...@@ -646,17 +575,13 @@ class ContributionTool(BaseTool):
the at_date parameter and some standard values. the at_date parameter and some standard values.
NOTE: implementation needs to be done. NOTE: implementation needs to be done.
id parameter is ignored
""" """
document = None document = None
# First of all, make sure do not try to create an existing document if not url:
if container_path is not None and id is not None: raise TypeError, 'url parameter is mandatory'
container = self.restrictedTraverse(container_path)
document = container.get(id, None)
if document is not None:
# Document aleardy exists: no need to keep on crawling
return document
try: try:
document = self.newContent(container_path=container_path, id=id, **kw) document = self.newContent(container_path=container_path, url=url, **kw)
if document.isIndexContent() and document.getCrawlingDepth() >= 0: if document.isIndexContent() and document.getCrawlingDepth() >= 0:
# If this is an index document, keep on crawling even if crawling_depth is 0 # If this is an index document, keep on crawling even if crawling_depth is 0
document.activate().crawlContent() document.activate().crawlContent()
...@@ -672,7 +597,7 @@ class ContributionTool(BaseTool): ...@@ -672,7 +597,7 @@ class ContributionTool(BaseTool):
if repeat > 0: if repeat > 0:
# Catch any HTTP error # Catch any HTTP error
self.activate(at_date=DateTime() + repeat_interval).newContentFromURL( self.activate(at_date=DateTime() + repeat_interval).newContentFromURL(
container_path=container_path, id=id, container_path=container_path, url=url,
repeat=repeat - 1, repeat=repeat - 1,
repeat_interval=repeat_interval, **kw) repeat_interval=repeat_interval, **kw)
except urllib2.URLError, error: except urllib2.URLError, error:
...@@ -685,28 +610,57 @@ class ContributionTool(BaseTool): ...@@ -685,28 +610,57 @@ class ContributionTool(BaseTool):
if repeat > 0: if repeat > 0:
self.activate(at_date=DateTime() + repeat_interval, self.activate(at_date=DateTime() + repeat_interval,
activity="SQLQueue").newContentFromURL( activity="SQLQueue").newContentFromURL(
container_path=container_path, id=id, container_path=container_path, url=url,
repeat=repeat - 1, repeat=repeat - 1,
repeat_interval=repeat_interval, **kw) repeat_interval=repeat_interval, **kw)
return document return document
def _guessPortalType(self, name, typ, body): security.declareProtected(Permissions.AccessContentsInformation,
'guessMimeTypeFromFilename')
def guessMimeTypeFromFilename(self, filename):
""" """
Call Portal Contribution Registry get mime type from file name
to know which portal_type should be used
""" """
findPortalTypeName = None if not filename:
registry = getToolByName(self, 'portal_contribution_registry', None) return
if registry is not None: portal = self.getPortalObject()
findPortalTypeName = registry.findPortalTypeName content_type = portal.mimetypes_registry.lookupExtension(filename)
else: return content_type
# Keep backward compatibility
registry = getToolByName(self, 'content_type_registry', None) def _openURL(self, url):
if registry is None: """Download content from url,
return None read filename and content_type
findPortalTypeName = registry.findTypeName return file_object, filename, content_type tuple
"""
portal_type = findPortalTypeName(name, typ, body) # Quote path part of url
return portal_type url_tuple = urlparse.urlsplit(url)
quoted_path = urllib.quote(url_tuple[2])
url = urlparse.urlunsplit((url_tuple[0], url_tuple[1], quoted_path,
url_tuple[3], url_tuple[4]))
# build a new file from the url
url_file = urllib2.urlopen(url)
data = url_file.read() # time out must be set or ... too long XXX
file_object = cStringIO.StringIO()
file_object.write(data)
file_object.seek(0)
# if a content-disposition header is present,
# try first to read the suggested filename from it.
header_info = url_file.info()
content_disposition = header_info.getheader('content-disposition', '')
filename = parse_header(content_disposition)[1].get('filename')
if not filename:
# Now read the filename from url.
# In case of http redirection, the real url must be read
# from file object returned by urllib2.urlopen.
# It can happens when the header 'Location' is present in request.
# See http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.30
url = url_file.geturl()
# Create a file name based on the URL and quote it
filename = urlparse.urlsplit(url)[-3]
filename = os.path.basename(filename)
filename = urllib.quote(filename, safe='')
filename = filename.replace('%', '')
content_type = header_info.gettype()
return file_object, filename, content_type
InitializeClass(ContributionTool) InitializeClass(ContributionTool)
...@@ -50,7 +50,7 @@ from Tool import CategoryTool, SimulationTool, RuleTool, IdTool, TemplateTool,\ ...@@ -50,7 +50,7 @@ from Tool import CategoryTool, SimulationTool, RuleTool, IdTool, TemplateTool,\
TrashTool, ContributionTool, NotificationTool, PasswordTool,\ TrashTool, ContributionTool, NotificationTool, PasswordTool,\
GadgetTool, ContributionRegistryTool, IntrospectionTool,\ GadgetTool, ContributionRegistryTool, IntrospectionTool,\
AcknowledgementTool, SolverTool, SolverProcessTool,\ AcknowledgementTool, SolverTool, SolverProcessTool,\
ConversionTool, RoundingTool ConversionTool, RoundingTool, UrlRegistryTool
import ERP5Site import ERP5Site
from Document import PythonScript from Document import PythonScript
object_classes = ( ERP5Site.ERP5Site, object_classes = ( ERP5Site.ERP5Site,
...@@ -78,6 +78,7 @@ portal_tools = ( CategoryTool.CategoryTool, ...@@ -78,6 +78,7 @@ portal_tools = ( CategoryTool.CategoryTool,
SolverProcessTool.SolverProcessTool, SolverProcessTool.SolverProcessTool,
ConversionTool.ConversionTool, ConversionTool.ConversionTool,
RoundingTool.RoundingTool, RoundingTool.RoundingTool,
UrlRegistryTool.UrlRegistryTool,
) )
content_classes = () content_classes = ()
content_constructors = () content_constructors = ()
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
<value> <value>
<dictionary> <dictionary>
<item> <item>
<key> <string>file_extension</string> </key> <key> <string>extension_from_filename</string> </key>
<value> <value>
<list> <list>
<string>sxd</string> <string>sxd</string>
...@@ -32,7 +32,7 @@ ...@@ -32,7 +32,7 @@
<key> <string>criterion_property</string> </key> <key> <string>criterion_property</string> </key>
<value> <value>
<tuple> <tuple>
<string>file_extension</string> <string>extension_from_filename</string>
</tuple> </tuple>
</value> </value>
</item> </item>
...@@ -46,7 +46,7 @@ ...@@ -46,7 +46,7 @@
</item> </item>
<item> <item>
<key> <string>int_index</string> </key> <key> <string>int_index</string> </key>
<value> <int>60</int> </value> <value> <int>10</int> </value>
</item> </item>
<item> <item>
<key> <string>portal_type</string> </key> <key> <string>portal_type</string> </key>
...@@ -60,7 +60,7 @@ ...@@ -60,7 +60,7 @@
</item> </item>
<item> <item>
<key> <string>title</string> </key> <key> <string>title</string> </key>
<value> <string>Drawing</string> </value> <value> <string>Drawing by extension</string> </value>
</item> </item>
</dictionary> </dictionary>
</pickle> </pickle>
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
<value> <value>
<dictionary> <dictionary>
<item> <item>
<key> <string>file_extension</string> </key> <key> <string>extension_from_filename</string> </key>
<value> <value>
<list> <list>
<string>gif</string> <string>gif</string>
...@@ -35,7 +35,7 @@ ...@@ -35,7 +35,7 @@
<key> <string>criterion_property</string> </key> <key> <string>criterion_property</string> </key>
<value> <value>
<tuple> <tuple>
<string>file_extension</string> <string>extension_from_filename</string>
</tuple> </tuple>
</value> </value>
</item> </item>
...@@ -49,7 +49,7 @@ ...@@ -49,7 +49,7 @@
</item> </item>
<item> <item>
<key> <string>int_index</string> </key> <key> <string>int_index</string> </key>
<value> <int>20</int> </value> <value> <int>10</int> </value>
</item> </item>
<item> <item>
<key> <string>portal_type</string> </key> <key> <string>portal_type</string> </key>
...@@ -63,7 +63,7 @@ ...@@ -63,7 +63,7 @@
</item> </item>
<item> <item>
<key> <string>title</string> </key> <key> <string>title</string> </key>
<value> <string>Image</string> </value> <value> <string>Image by extension</string> </value>
</item> </item>
</dictionary> </dictionary>
</pickle> </pickle>
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
<value> <value>
<dictionary> <dictionary>
<item> <item>
<key> <string>file_extension</string> </key> <key> <string>extension_from_filename</string> </key>
<value> <value>
<list> <list>
<string>pdf</string> <string>pdf</string>
...@@ -31,7 +31,7 @@ ...@@ -31,7 +31,7 @@
<key> <string>criterion_property</string> </key> <key> <string>criterion_property</string> </key>
<value> <value>
<tuple> <tuple>
<string>file_extension</string> <string>extension_from_filename</string>
</tuple> </tuple>
</value> </value>
</item> </item>
...@@ -45,7 +45,7 @@ ...@@ -45,7 +45,7 @@
</item> </item>
<item> <item>
<key> <string>int_index</string> </key> <key> <string>int_index</string> </key>
<value> <int>30</int> </value> <value> <int>10</int> </value>
</item> </item>
<item> <item>
<key> <string>portal_type</string> </key> <key> <string>portal_type</string> </key>
...@@ -59,7 +59,7 @@ ...@@ -59,7 +59,7 @@
</item> </item>
<item> <item>
<key> <string>title</string> </key> <key> <string>title</string> </key>
<value> <string>PDF</string> </value> <value> <string>PDF by extension</string> </value>
</item> </item>
</dictionary> </dictionary>
</pickle> </pickle>
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
<value> <value>
<dictionary> <dictionary>
<item> <item>
<key> <string>mime_type</string> </key> <key> <string>content_type</string> </key>
<value> <value>
<list> <list>
<string>application/pdf</string> <string>application/pdf</string>
...@@ -31,7 +31,7 @@ ...@@ -31,7 +31,7 @@
<key> <string>criterion_property</string> </key> <key> <string>criterion_property</string> </key>
<value> <value>
<tuple> <tuple>
<string>mime_type</string> <string>content_type</string>
</tuple> </tuple>
</value> </value>
</item> </item>
...@@ -45,7 +45,7 @@ ...@@ -45,7 +45,7 @@
</item> </item>
<item> <item>
<key> <string>int_index</string> </key> <key> <string>int_index</string> </key>
<value> <int>30</int> </value> <value> <int>20</int> </value>
</item> </item>
<item> <item>
<key> <string>portal_type</string> </key> <key> <string>portal_type</string> </key>
...@@ -59,7 +59,7 @@ ...@@ -59,7 +59,7 @@
</item> </item>
<item> <item>
<key> <string>title</string> </key> <key> <string>title</string> </key>
<value> <string>PDF</string> </value> <value> <string>PDF by mimetype</string> </value>
</item> </item>
</dictionary> </dictionary>
</pickle> </pickle>
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
<value> <value>
<dictionary> <dictionary>
<item> <item>
<key> <string>file_extension</string> </key> <key> <string>extension_from_filename</string> </key>
<value> <value>
<list> <list>
<string>ppt</string> <string>ppt</string>
...@@ -34,7 +34,7 @@ ...@@ -34,7 +34,7 @@
<key> <string>criterion_property</string> </key> <key> <string>criterion_property</string> </key>
<value> <value>
<tuple> <tuple>
<string>file_extension</string> <string>extension_from_filename</string>
</tuple> </tuple>
</value> </value>
</item> </item>
...@@ -48,7 +48,7 @@ ...@@ -48,7 +48,7 @@
</item> </item>
<item> <item>
<key> <string>int_index</string> </key> <key> <string>int_index</string> </key>
<value> <int>50</int> </value> <value> <int>10</int> </value>
</item> </item>
<item> <item>
<key> <string>portal_type</string> </key> <key> <string>portal_type</string> </key>
...@@ -62,7 +62,7 @@ ...@@ -62,7 +62,7 @@
</item> </item>
<item> <item>
<key> <string>title</string> </key> <key> <string>title</string> </key>
<value> <string>Presentation</string> </value> <value> <string>Presentation by extension</string> </value>
</item> </item>
</dictionary> </dictionary>
</pickle> </pickle>
......
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_identity_criterion</string> </key>
<value>
<dictionary>
<item>
<key> <string>content_type_from_content</string> </key>
<value>
<list>
<string>application/vnd.ms-excel</string>
<string>application/vnd.ms-office</string>
<string>application/msexcel</string>
<string>application/vnd.oasis.opendocument.spreadsheet</string>
<string>application/vnd.oasis.opendocument.spreadsheet-template</string>
</list>
</value>
</item>
</dictionary>
</value>
</item>
<item>
<key> <string>_range_criterion</string> </key>
<value>
<dictionary/>
</value>
</item>
<item>
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>content_type_from_content</string>
</tuple>
</value>
</item>
<item>
<key> <string>destination_portal_type</string> </key>
<value> <string>Spreadsheet</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>spreadsheet_by_content</string> </value>
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>70</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
<value> <string>Contribution Predicate</string> </value>
</item>
<item>
<key> <string>test_method_id</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Spreadsheet by content</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
<value> <value>
<dictionary> <dictionary>
<item> <item>
<key> <string>file_extension</string> </key> <key> <string>extension_from_filename</string> </key>
<value> <value>
<list> <list>
<string>xls</string> <string>xls</string>
...@@ -35,7 +35,7 @@ ...@@ -35,7 +35,7 @@
<key> <string>criterion_property</string> </key> <key> <string>criterion_property</string> </key>
<value> <value>
<tuple> <tuple>
<string>file_extension</string> <string>extension_from_filename</string>
</tuple> </tuple>
</value> </value>
</item> </item>
...@@ -49,7 +49,7 @@ ...@@ -49,7 +49,7 @@
</item> </item>
<item> <item>
<key> <string>int_index</string> </key> <key> <string>int_index</string> </key>
<value> <int>40</int> </value> <value> <int>10</int> </value>
</item> </item>
<item> <item>
<key> <string>portal_type</string> </key> <key> <string>portal_type</string> </key>
...@@ -63,7 +63,7 @@ ...@@ -63,7 +63,7 @@
</item> </item>
<item> <item>
<key> <string>title</string> </key> <key> <string>title</string> </key>
<value> <string>Spreadsheet</string> </value> <value> <string>Spreadsheet by extension</string> </value>
</item> </item>
</dictionary> </dictionary>
</pickle> </pickle>
......
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_identity_criterion</string> </key>
<value>
<dictionary>
<item>
<key> <string>content_type</string> </key>
<value>
<list>
<string>text/plain</string>
</list>
</value>
</item>
</dictionary>
</value>
</item>
<item>
<key> <string>_range_criterion</string> </key>
<value>
<dictionary/>
</value>
</item>
<item>
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>content_type</string>
</tuple>
</value>
</item>
<item>
<key> <string>destination_portal_type</string> </key>
<value> <string>Text</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>text_by_conent_type</string> </value>
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>20</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
<value> <string>Contribution Predicate</string> </value>
</item>
<item>
<key> <string>test_method_id</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Text by content type</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_identity_criterion</string> </key>
<value>
<dictionary>
<item>
<key> <string>content_type_from_content</string> </key>
<value>
<list>
<string>text/plain</string>
</list>
</value>
</item>
</dictionary>
</value>
</item>
<item>
<key> <string>_range_criterion</string> </key>
<value>
<dictionary/>
</value>
</item>
<item>
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>content_type_from_content</string>
</tuple>
</value>
</item>
<item>
<key> <string>destination_portal_type</string> </key>
<value> <string>Text</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>text_by_content</string> </value>
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>70</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
<value> <string>Contribution Predicate</string> </value>
</item>
<item>
<key> <string>test_method_id</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Text by mimetype from data</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
<value> <value>
<dictionary> <dictionary>
<item> <item>
<key> <string>file_extension</string> </key> <key> <string>extension_from_filename</string> </key>
<value> <value>
<list> <list>
<string>txt</string> <string>txt</string>
...@@ -36,7 +36,7 @@ ...@@ -36,7 +36,7 @@
<key> <string>criterion_property</string> </key> <key> <string>criterion_property</string> </key>
<value> <value>
<tuple> <tuple>
<string>file_extension</string> <string>extension_from_filename</string>
</tuple> </tuple>
</value> </value>
</item> </item>
...@@ -64,7 +64,7 @@ ...@@ -64,7 +64,7 @@
</item> </item>
<item> <item>
<key> <string>title</string> </key> <key> <string>title</string> </key>
<value> <string>Text</string> </value> <value> <string>Text by extension</string> </value>
</item> </item>
</dictionary> </dictionary>
</pickle> </pickle>
......
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_identity_criterion</string> </key>
<value>
<dictionary>
<item>
<key> <string>content_type_from_content</string> </key>
<value>
<list>
<string>text/html</string>
</list>
</value>
</item>
</dictionary>
</value>
</item>
<item>
<key> <string>_range_criterion</string> </key>
<value>
<dictionary/>
</value>
</item>
<item>
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>content_type_from_content</string>
</tuple>
</value>
</item>
<item>
<key> <string>destination_portal_type</string> </key>
<value> <string>Web Page</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>web_page_by_content</string> </value>
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>70</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
<value> <string>Contribution Predicate</string> </value>
</item>
<item>
<key> <string>test_method_id</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Web Page by mimetype from data</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
<value> <value>
<dictionary> <dictionary>
<item> <item>
<key> <string>file_extension</string> </key> <key> <string>extension_from_filename</string> </key>
<value> <value>
<list> <list>
<string>html</string> <string>html</string>
...@@ -33,7 +33,7 @@ ...@@ -33,7 +33,7 @@
<key> <string>criterion_property</string> </key> <key> <string>criterion_property</string> </key>
<value> <value>
<tuple> <tuple>
<string>file_extension</string> <string>extension_from_filename</string>
</tuple> </tuple>
</value> </value>
</item> </item>
...@@ -47,7 +47,7 @@ ...@@ -47,7 +47,7 @@
</item> </item>
<item> <item>
<key> <string>int_index</string> </key> <key> <string>int_index</string> </key>
<value> <int>90</int> </value> <value> <int>10</int> </value>
</item> </item>
<item> <item>
<key> <string>portal_type</string> </key> <key> <string>portal_type</string> </key>
...@@ -61,7 +61,7 @@ ...@@ -61,7 +61,7 @@
</item> </item>
<item> <item>
<key> <string>title</string> </key> <key> <string>title</string> </key>
<value> <string>Web Page</string> </value> <value> <string>Web Page by extension</string> </value>
</item> </item>
</dictionary> </dictionary>
</pickle> </pickle>
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
<value> <value>
<dictionary> <dictionary>
<item> <item>
<key> <string>mime_type</string> </key> <key> <string>content_type</string> </key>
<value> <value>
<list> <list>
<string>text/html</string> <string>text/html</string>
...@@ -31,7 +31,7 @@ ...@@ -31,7 +31,7 @@
<key> <string>criterion_property</string> </key> <key> <string>criterion_property</string> </key>
<value> <value>
<tuple> <tuple>
<string>mime_type</string> <string>content_type</string>
</tuple> </tuple>
</value> </value>
</item> </item>
...@@ -45,13 +45,7 @@ ...@@ -45,13 +45,7 @@
</item> </item>
<item> <item>
<key> <string>int_index</string> </key> <key> <string>int_index</string> </key>
<value> <int>90</int> </value> <value> <int>20</int> </value>
</item>
<item>
<key> <string>membership_criterion_base_category</string> </key>
<value>
<tuple/>
</value>
</item> </item>
<item> <item>
<key> <string>portal_type</string> </key> <key> <string>portal_type</string> </key>
...@@ -65,7 +59,7 @@ ...@@ -65,7 +59,7 @@
</item> </item>
<item> <item>
<key> <string>title</string> </key> <key> <string>title</string> </key>
<value> <string>Web Page</string> </value> <value> <string>Web Page by mimetype</string> </value>
</item> </item>
</dictionary> </dictionary>
</pickle> </pickle>
......
...@@ -24,6 +24,22 @@ ...@@ -24,6 +24,22 @@
</tuple> </tuple>
</value> </value>
</item> </item>
<item>
<key> <string>_Add_portal_content_Permission</string> </key>
<value>
<tuple>
<string>Manager</string>
</tuple>
</value>
</item>
<item>
<key> <string>_Delete_objects_Permission</string> </key>
<value>
<tuple>
<string>Manager</string>
</tuple>
</value>
</item>
<item> <item>
<key> <string>_Modify_portal_content_Permission</string> </key> <key> <string>_Modify_portal_content_Permission</string> </key>
<value> <value>
...@@ -252,6 +268,22 @@ It\'s the lowest priority one; ie. managers can create higher priority preferenc ...@@ -252,6 +268,22 @@ It\'s the lowest priority one; ie. managers can create higher priority preferenc
<key> <string>preferred_date_order</string> </key> <key> <string>preferred_date_order</string> </key>
<value> <string>ymd</string> </value> <value> <string>ymd</string> </value>
</item> </item>
<item>
<key> <string>preferred_document_file_name_regular_expression</string> </key>
<value> <string encoding="cdata"><![CDATA[
(?P<reference>[A-Z&é@{]{3,7})-(?P<language>[a-z]{2})-(?P<version>[0-9]{3})
]]></string> </value>
</item>
<item>
<key> <string>preferred_document_reference_regular_expression</string> </key>
<value> <string encoding="cdata"><![CDATA[
(?P<reference>[A-Z&é@{]{3,7})(-(?P<language>[a-z]{2}))?(-(?P<version>[0-9]{3}))?
]]></string> </value>
</item>
<item> <item>
<key> <string>preferred_event_assessment_form_id</string> </key> <key> <string>preferred_event_assessment_form_id</string> </key>
<value> <value>
......
...@@ -58,8 +58,8 @@ from zExceptions import Unauthorized\n ...@@ -58,8 +58,8 @@ from zExceptions import Unauthorized\n
format = None\n format = None\n
# Always force download of document even if format is supported\n # Always force download of document even if format is supported\n
# by browser\n # by browser\n
file_name = context.getStandardFileName(format)\n filename = context.getStandardFilename(format)\n
response.setHeader(\'Content-disposition\', \'attachment; filename="%s"\' % file_name)\n response.setHeader(\'Content-disposition\', \'attachment; filename="%s"\' % filename)\n
\n \n
try:\n try:\n
return context.index_html(request, response, format)\n return context.index_html(request, response, format)\n
...@@ -111,7 +111,7 @@ except Unauthorized:\n ...@@ -111,7 +111,7 @@ except Unauthorized:\n
<string>None</string> <string>None</string>
<string>format</string> <string>format</string>
<string>context</string> <string>context</string>
<string>file_name</string> <string>filename</string>
<string>msg</string> <string>msg</string>
<string>dict</string> <string>dict</string>
</tuple> </tuple>
......
...@@ -222,12 +222,16 @@ ...@@ -222,12 +222,16 @@
<value> <value>
<list> <list>
<tuple> <tuple>
<string>file_extension</string> <string>extension_from_filename</string>
<string>file_extension</string> <string>extension_from_filename</string>
</tuple> </tuple>
<tuple> <tuple>
<string>mime_type</string> <string>content_type</string>
<string>mime_type</string> <string>content_type</string>
</tuple>
<tuple>
<string>content_type_from_content</string>
<string>content_type_from_content</string>
</tuple> </tuple>
</list> </list>
</value> </value>
......
...@@ -352,6 +352,10 @@ ...@@ -352,6 +352,10 @@
<key> <string>css_class</string> </key> <key> <string>css_class</string> </key>
<value> <string></string> </value> <value> <string></string> </value>
</item> </item>
<item>
<key> <string>default_display_style</string> </key>
<value> <string>table</string> </value>
</item>
<item> <item>
<key> <string>default_params</string> </key> <key> <string>default_params</string> </key>
<value> <value>
...@@ -362,6 +366,12 @@ ...@@ -362,6 +366,12 @@
<key> <string>description</string> </key> <key> <string>description</string> </key>
<value> <string></string> </value> <value> <string></string> </value>
</item> </item>
<item>
<key> <string>display_style_list</string> </key>
<value>
<list/>
</value>
</item>
<item> <item>
<key> <string>domain_root_list</string> </key> <key> <string>domain_root_list</string> </key>
<value> <value>
...@@ -396,10 +406,18 @@ ...@@ -396,10 +406,18 @@
<list/> <list/>
</value> </value>
</item> </item>
<item>
<key> <string>global_search_column</string> </key>
<value> <string></string> </value>
</item>
<item> <item>
<key> <string>hidden</string> </key> <key> <string>hidden</string> </key>
<value> <int>0</int> </value> <value> <int>0</int> </value>
</item> </item>
<item>
<key> <string>hide_rows_on_no_search_criterion</string> </key>
<value> <int>0</int> </value>
</item>
<item> <item>
<key> <string>lines</string> </key> <key> <string>lines</string> </key>
<value> <int>20</int> </value> <value> <int>20</int> </value>
...@@ -425,6 +443,10 @@ ...@@ -425,6 +443,10 @@
</list> </list>
</value> </value>
</item> </item>
<item>
<key> <string>page_navigation_mode</string> </key>
<value> <string>slider</string> </value>
</item>
<item> <item>
<key> <string>page_template</string> </key> <key> <string>page_template</string> </key>
<value> <string></string> </value> <value> <string></string> </value>
...@@ -445,6 +467,10 @@ ...@@ -445,6 +467,10 @@
<key> <string>report_tree</string> </key> <key> <string>report_tree</string> </key>
<value> <int>0</int> </value> <value> <int>0</int> </value>
</item> </item>
<item>
<key> <string>row_css_method</string> </key>
<value> <string></string> </value>
</item>
<item> <item>
<key> <string>search</string> </key> <key> <string>search</string> </key>
<value> <int>0</int> </value> <value> <int>0</int> </value>
...@@ -490,10 +516,22 @@ ...@@ -490,10 +516,22 @@
<key> <string>stat_method</string> </key> <key> <string>stat_method</string> </key>
<value> <string></string> </value> <value> <string></string> </value>
</item> </item>
<item>
<key> <string>style_columns</string> </key>
<value>
<list/>
</value>
</item>
<item> <item>
<key> <string>title</string> </key> <key> <string>title</string> </key>
<value> <string>Contribution Predicates</string> </value> <value> <string>Contribution Predicates</string> </value>
</item> </item>
<item>
<key> <string>untranslatable_columns</string> </key>
<value>
<list/>
</value>
</item>
<item> <item>
<key> <string>url_columns</string> </key> <key> <string>url_columns</string> </key>
<value> <value>
......
40819 40820
\ No newline at end of file \ No newline at end of file
...@@ -22,8 +22,12 @@ portal_contribution_registry/image_extension ...@@ -22,8 +22,12 @@ portal_contribution_registry/image_extension
portal_contribution_registry/pdf_extension portal_contribution_registry/pdf_extension
portal_contribution_registry/pdf_mimetype portal_contribution_registry/pdf_mimetype
portal_contribution_registry/presentation_extension portal_contribution_registry/presentation_extension
portal_contribution_registry/spreadsheet_by_content
portal_contribution_registry/spreadsheet_extension portal_contribution_registry/spreadsheet_extension
portal_contribution_registry/text_by_conent_type
portal_contribution_registry/text_by_content
portal_contribution_registry/text_extension portal_contribution_registry/text_extension
portal_contribution_registry/web_page_by_content
portal_contribution_registry/webpage_extension portal_contribution_registry/webpage_extension
portal_contribution_registry/webpage_mimetype portal_contribution_registry/webpage_mimetype
portal_domains/base_day_domain portal_domains/base_day_domain
......
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Jean-Paul Smets-Solanes <jp@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from zope.interface import Interface
class IDiscoverable(Interface):
  """
  Discoverable interface specification

  Documents which implement IDiscoverable provide methods
  to discover and update metadata properties from content,
  user input, filename, etc.
  """

  def getContentInformation():
    """
    Returns a dictionary of possible metadata which can be extracted from the
    document content (ex. title from an HTML file, creation date from a PDF
    document, etc.)
    """

  def getPropertyDictFromUserLogin(user_login=None):
    """
    Based on the user_login, find out all properties which
    can be discovered to later update document metadata.

    user_login -- optional user login ID
    """

  def getPropertyDictFromContent():
    """
    Based on the result of getContentInformation, find out all
    properties which can be discovered to later update document metadata.
    """

  def getPropertyDictFromFilename(filename):
    """
    Based on the file name, find out all properties which
    can be discovered to later update document metadata.

    filename -- file name to use in the discovery process
    """

  def getPropertyDictFromInput():
    """
    Based on the user input, find out all properties which
    can be discovered to later update document metadata.
    """

  def discoverMetadata(filename=None, user_login=None):
    """
    Updates the document metadata by discovering metadata from
    the user login, the document content, the file name and the
    user input. The order of discovery should be set in system
    preferences.

    filename -- optional file name (ex. AA-BBB-CCC-223-en.doc)
    user_login -- optional user login ID

    XXX - it is unclear if this method should also trigger finishIngestion
    and whether this should be documented here or not
    """

  def finishIngestion():
    """
    Finish the ingestion process (ex. allocate a reference number
    automatically if no reference was defined.)

    XXX - it is unclear if this method should be part of the interface
    """

  def getExtensionFromFilename():
    """
    Return the calculated value of the extension read from the
    filename (ex. 'pdf' for 'document.pdf').
    """

  def getContentTypeFromContent():
    """
    Return the calculated value of the content type read from
    the document content itself.
    """
...@@ -87,7 +87,7 @@ class IDocument(Interface): ...@@ -87,7 +87,7 @@ class IDocument(Interface):
input - data supplied with http request or set on the object during (2) (e.g. input - data supplied with http request or set on the object during (2) (e.g.
discovered from email text) discovered from email text)
file_name - data which might be encoded in file name filename - data which might be encoded in filename
user_login - information about user who is contributing the file user_login - information about user who is contributing the file
content - data which might be derived from document content content - data which might be derived from document content
......
...@@ -52,11 +52,11 @@ class IDownloadable(Interface): ...@@ -52,11 +52,11 @@ class IDownloadable(Interface):
kw -- optional conversion parameters kw -- optional conversion parameters
""" """
def getStandardFileName(format=None): def getStandardFilename(format=None):
""" """
Returns a standard file name for the document to download. Returns a standard file name for the document to download.
This method is the reverse of This method is the reverse of
IMetadataDiscoverable.getPropertyDictFromFileName. IDiscoverable.getPropertyDictFromFilename.
format -- extension of returned file name format -- extension of returned file name
""" """
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Nicolas Delaby <nicolas@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from zope.interface import Interface
class IUrl(Interface):
  """
  Url interface specification

  Documents which implement IUrl are identified by a URL
  (RFC 1738) and provide accessors for its components.
  """

  def asURL():
    """
    Returns a text representation of the Url if defined
    or None else.
    """

  def fromURL(url):
    """
    Analyses a URL and splits it into two parts. URLs
    normally follow RFC 1738. However, we accept URLs
    without the protocol a.k.a. scheme part (http, mailto, etc.). In this
    case only the url_string a.k.a. scheme-specific-part is taken
    into account. asURL will then generate the full URL.
    """

  def getURLServer():
    """
    Returns the server part of a URL
    """

  def getURLPort():
    """
    Returns the port part of a URL
    """

  def getURLPath():
    """
    Returns the path part of a URL
    """

  def asNormalisedURL(base_url=None):
    """
    Returns a normalised version of the url so
    that we do not download twice the same content.
    This normalisation must refer to the same resource !
    Refer to http://en.wikipedia.org/wiki/URL_normalization .

    base_url -- specify a default URL and a default target
      for all links on a page.
      If url is a relative link, we try to compute an
      absolute url with the help of base_url.
    """
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Nicolas Delaby <nicolas@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from zope.interface import Interface
class IUrlRegistryTool(Interface):
  """Tool to register URLs.

  This tool aims to maintain consistency in the URL management of
  crawlable sources, i.e. to keep the association between an external
  resource identifier (URL) and the document generated inside ERP5
  consistent. Multiple URLs can be associated to the same reference.

  A System Preference can be used to configure the global namespace.
  This enables isolation of URL mappings for different groups.
  This is a configurable tool supporting different scopes for mappings:
  it is possible to restrict the crawling of a URL to happen only once
  in the context of the whole portal, or to restrict the crawling of a
  URL to the scope of an external source or a module only (i.e. allow
  crawling the same URL multiple times per portal).
  """

  def clearUrlRegistryTool(context=None):
    """Unregister all urls in all namespaces.
    Only available for Manager.
    context - a context to access the container of mappings.
    """

  def registerURL(url, reference, context=None):
    """Register the url:reference mapping.
    This method is aimed to be called from an interaction workflow
    which triggers on _setReference, in order to keep the association
    between url and reference up to date.
    url - external resource identifier
    reference - reference of the downloaded resource (ERP5 Object instance)
    context - a context to access the container of mappings.
              If not passed, mappings are stored on the tool itself.
    """

  def getReferenceList(context=None):
    """Return all references registered by portal_url_registry
    according to the given context.
    context - a context to access the container of mappings.
    """

  def getReferenceFromURL(url, context=None):
    """Return the reference of the document matching the provided url.
    url - external resource identifier
    context - a context to access the container of mappings.
              If not passed, mappings are stored on the tool itself.
    """

  def getURLListFromReference(reference, context=None):
    """Return the list of urls associated to the given reference
    and context.
    reference - reference of the downloaded resource (ERP5 Object instance)
    context - a context to access the container of mappings.
    """

  def updateUrlRegistryTool():
    """Rebuild all url mappings for the active preference.
    """
...@@ -139,10 +139,21 @@ class CachedConvertableMixin: ...@@ -139,10 +139,21 @@ class CachedConvertableMixin:
cached_value = data cached_value = data
conversion_md5 = md5_new(str(data.data)).hexdigest() conversion_md5 = md5_new(str(data.data)).hexdigest()
size = len(data.data) size = len(data.data)
else: elif isinstance(data, (str, unicode,)):
cached_value = data cached_value = data
conversion_md5 = md5_new(cached_value).hexdigest() conversion_md5 = md5_new(cached_value).hexdigest()
size = len(cached_value) size = len(cached_value)
elif isinstance(data, dict):
# Dict instance are used to store computed metadata
# from actual content.
# So this value is intimely related to cache of conversion.
# As it should be cleared each time the document is edited.
# Also may be a proper API should be used
cached_value = data
conversion_md5 = None
size = len(cached_value)
else:
raise NotImplementedError, 'Not able to store type:%r' % type(data)
if date is None: if date is None:
date = DateTime() date = DateTime()
stored_data_dict = {'content_md5': self.getContentMd5(), stored_data_dict = {'content_md5': self.getContentMd5(),
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
############################################################################## ##############################################################################
# #
# Copyright (c) 2009 Nexedi SA and Contributors. All Rights Reserved. # Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Ivan Tyagov <ivan@nexedi.com> # Ivan Tyagov <ivan@nexedi.com>
# #
# WARNING: This program as such is intended to be used by professional # WARNING: This program as such is intended to be used by professional
...@@ -27,8 +27,13 @@ ...@@ -27,8 +27,13 @@
# #
############################################################################## ##############################################################################
from AccessControl import ClassSecurityInfo, getSecurityManager from AccessControl import ClassSecurityInfo
from Products.ERP5Type import Permissions from Products.ERP5Type import Permissions
from Products.ERP5Type.Utils import normaliseUrl
from Products.ERP5Type.DateUtils import convertDateToHour,\
number_of_hours_in_day, number_of_hours_in_year
from urlparse import urlsplit, urlunsplit
from lxml import html as etree_html
class CrawlableMixin: class CrawlableMixin:
""" """
...@@ -80,3 +85,81 @@ class CrawlableMixin: ...@@ -80,3 +85,81 @@ class CrawlableMixin:
method = self._getTypeBasedMethod('isUpdatable', method = self._getTypeBasedMethod('isUpdatable',
fallback_script_id = 'Document_isUpdatable') fallback_script_id = 'Document_isUpdatable')
return method() return method()
security.declareProtected(Permissions.AccessContentsInformation,
'getContentURLList')
def getContentURLList(self):
"""
Returns a list of URLs referenced by the content of this document.
Default implementation consists in analysing the document
converted to HTML. Subclasses may overload this method
if necessary. However, it is better to extend the conversion
methods in order to produce valid HTML, which is useful to
many people, rather than overload this method which is only
useful for crawling.
"""
html_content = self.asEntireHTML()
html_tree = etree_html.fromstring(html_content)
base_href = self.getContentBaseURL()
if base_href:
html_tree.make_links_absolute(base_href)
href_list = []
for elemnt, attribute_name, link, position in html_tree.iterlinks():
# For now take into acount only a and img tags
if attribute_name not in ('href',):
continue
if isinstance(link, unicode):
link = link.encode('utf-8')
href_list.append(link)
return href_list
security.declareProtected(Permissions.AccessContentsInformation,
'getContentBaseURL')
def getContentBaseURL(self):
"""
Returns the content base URL based on the actual content or
on its URL.
"""
raw_url = self.asURL() or ''
splitted_url = urlsplit(raw_url)
path_part = splitted_url[2]
path_part = '/'.join(path_part.split('/')[:-1])
base_url = urlunsplit((splitted_url[0], splitted_url[1], path_part, None,
None))
if isinstance(base_url, unicode):
base_url = base_url.encode('utf-8')
return base_url
  security.declareProtected(Permissions.AccessContentsInformation,
                            'getContentNormalisedURLList')
  def getContentNormalisedURLList(self):
    """
    Call the url normaliser for each url returned by getContentURLList
    and return only the urls associated to the same domain as this
    document (plus relative links, which have no domain at all).
    """
    reference_domain = urlsplit(normaliseUrl(self.asURL() or ''))[1]
    # in www.example.com or www.3.example.com
    # keep only the example.com part
    # NOTE(review): the join uses '' (not '.') as separator, so the
    # comparison key is e.g. 'examplecom'; this is consistent on both
    # sides of the comparison below, so the same-domain test still works.
    reference_domain = ''.join(reference_domain.split('.')[-2:])
    if isinstance(reference_domain, unicode):
      reference_domain = reference_domain.encode('utf-8')
    url_list = []
    base_url = self.getContentBaseURL()
    for url in self.getContentURLList():
      try:
        url = normaliseUrl(url, base_url=base_url)
      except UnicodeDecodeError:
        # Ignore wrong encoding errors
        # Web is not a kind world
        continue
      if not url:
        continue
      url_domain = urlsplit(url)[1]
      if isinstance(url_domain, unicode):
        url_domain = url_domain.encode('utf-8')
      if url_domain and ''.join(url_domain.split('.')[-2:]) != reference_domain:
        continue
      # if domain is empty (relative link) or domain is same, then OK
      url_list.append(url)
    return url_list
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Ivan Tyagov <ivan@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from AccessControl import ClassSecurityInfo, getSecurityManager
from Products.ERP5Type import Permissions
from Products.ERP5Type.Utils import convertToUpperCase
from Products.CMFCore.utils import getToolByName
from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
import os
import re
try:
import magic
except ImportError:
magic = None
VALID_ORDER_KEY_LIST = ('user_login', 'content', 'filename', 'input')
CONTENT_INFORMATION_FORMAT = '_idiscoverable_content_information'
class DiscoverableMixin(CachedConvertableMixin):
  """
  Implements IDiscoverable.

  This class provides the methods used for metadata extraction
  (discovery) on documents. It inherits from CachedConvertableMixin
  to access the conversion cache storage API, as computed metadata
  needs to be stored in the same backend as conversions.
  """
  security = ClassSecurityInfo()

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromUserLogin')
  def getPropertyDictFromUserLogin(self, user_login=None):
    """
    Based on the user_login, find out as many properties as needed.
    Returns properties which should be set on the document.

    user_login - login string of a person; defaults to the currently
                 logged-in user when None.
    """
    if user_login is None:
      user_login = str(getSecurityManager().getUser())
    method = self._getTypeBasedMethod('getPropertyDictFromUserLogin',
        fallback_script_id='Document_getPropertyDictFromUserLogin')
    return method(user_login)

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromContent')
  def getPropertyDictFromContent(self):
    """
    Based on the document content, find out as many properties as needed.
    Returns properties which should be set on the document.
    """
    # access data through convert
    mime, content = self.convert(None)
    if not content:
      # if document is empty, we will not find anything in its content
      return {}
    method = self._getTypeBasedMethod('getPropertyDictFromContent',
        fallback_script_id='Document_getPropertyDictFromContent')
    return method()

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromFilename')
  def getPropertyDictFromFilename(self, filename):
    """
    Based on the file name, find out as many properties as needed.
    Returns properties which should be set on the document.
    """
    # Delegated to the Contribution Tool, which applies the
    # preference-configured filename regular expression.
    return self.portal_contributions.getPropertyDictFromFilename(filename)

  # Backward compatibility alias for the pre-"filename" naming convention.
  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromFileName')
  getPropertyDictFromFileName = getPropertyDictFromFilename

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromInput')
  def getPropertyDictFromInput(self, input_parameter_dict):
    """
    Pass the user-provided input dictionary to the type based
    'getPropertyDictFromInput' script and return its result.
    Returns properties which should be set on the document.
    """
    method = self._getTypeBasedMethod('getPropertyDictFromInput')
    return method(input_parameter_dict)

  ### Metadata discovery and ingestion methods
  security.declareProtected(Permissions.ModifyPortalContent,
                            'discoverMetadata')
  def discoverMetadata(self, filename=None, user_login=None,
                       input_parameter_dict=None):
    """
    This is the main metadata discovery function - controls the process
    of discovering data from various sources. The discovery itself is
    delegated to scripts or uses preference-configurable regexps. The
    method returns either self or the document which has been
    merged in the discovery process.

    filename - this parameter is a file name of the form "AA-BBB-CCC-223-en"
    user_login - this is a login string of a person; can be None if the user is
                 currently logged in, then we'll get him from session
    input_parameter_dict - arguments provided to Create this content by user.
    """
    # Preference is made of a sequence of 'user_login', 'content', 'filename', 'input'
    method = self._getTypeBasedMethod('getPreferredDocumentMetadataDiscoveryOrderList')
    order_list = list(method())
    order_list.reverse()
    # build a dictionary according to the order
    # (reversed so that earlier entries in the preference have the
    # highest priority: they overwrite values set by later ones)
    kw = {}
    for order_id in order_list:
      result = None
      if order_id not in VALID_ORDER_KEY_LIST:
        # Prevent security attack or bad preferences
        raise AttributeError, "%s is not in valid order key list" % order_id
      method_id = 'getPropertyDictFrom%s' % convertToUpperCase(order_id)
      method = getattr(self, method_id)
      if order_id == 'filename':
        if filename is not None:
          result = method(filename)
      elif order_id == 'user_login':
        if user_login is not None:
          result = method(user_login)
      elif order_id == 'input':
        if input_parameter_dict is not None:
          result = method(input_parameter_dict)
      else:
        result = method()
      if result is not None:
        for key, value in result.iteritems():
          if value not in (None, ''):
            kw[key]=value
    # Prepare the content edit parameters
    portal_type = kw.pop('portal_type', None)
    if portal_type and portal_type != self.getPortalType():
      # Reingestion is required to update portal_type
      return self.migratePortalType(portal_type)
    # Try not to invoke an automatic transition here
    self._edit(**kw)
    if not portal_type:
      # If no portal_type was discovered, pass self
      # through to portal_contribution_registry
      # to guess destination portal_type against all properties.
      # If returned portal_type is different, then reingest.
      registry = getToolByName(self.getPortalObject(),
                               'portal_contribution_registry')
      portal_type = registry.findPortalTypeName(context=self)
      if portal_type != self.getPortalType():
        return self.migratePortalType(portal_type)
    # Finish ingestion by calling method
    self.finishIngestion() # XXX - is this really the right place ?
    self.reindexObject() # XXX - is this really the right place ?
    # Revision merge is tightly coupled
    # to metadata discovery - refer to the documentation of mergeRevision method
    merged_doc = self.mergeRevision() # XXX - is this really the right place ?
    merged_doc.reindexObject() # XXX - is this really the right place ?
    return merged_doc # XXX - is this really the right place ?

  security.declareProtected(Permissions.ModifyPortalContent, 'finishIngestion')
  def finishIngestion(self):
    """
    Finish the ingestion process by calling the appropriate script. This
    script can for example allocate a reference number automatically if
    no reference was defined.
    """
    method = self._getTypeBasedMethod('finishIngestion',
        fallback_script_id='Document_finishIngestion')
    return method()

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getContentTypeFromContent')
  def getContentTypeFromContent(self):
    """
    Return the content_type obtained by metadata extraction from the
    actual content. This method is called by portal_contribution_registry.
    Returns None when the document is empty or when the optional
    python-magic library is not installed.
    """
    mime, content = self.convert(None)
    if not content:
      return
    if magic is not None:
      # This will be delegated soon to external web service
      # like cloudooo
      # ERP5 will no longer handle data itself.
      mimedetector = magic.Magic(mime=True)
      return mimedetector.from_buffer(content)

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getExtensionFromFilename')
  def getExtensionFromFilename(self, filename=None):
    """
    Return the extension read from the filename, in lower case and
    without the leading dot. Falls back to getStandardFilename() when
    no filename is passed.
    """
    if not filename:
      filename = self.getStandardFilename()
    basename, extension = os.path.splitext(filename)
    if extension:
      extension = extension[1:].lower() # remove first dot
    return extension

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getContentInformation')
  def getContentInformation(self):
    """
    Call the private implementation, then store the result in the
    conversion cache storage, so the (possibly expensive) extraction
    runs only once per document revision.
    """
    format = CONTENT_INFORMATION_FORMAT
    # XXX How to know if an instance implements an interface ?
    try:
      mime, cached_value = self.getConversion(format=format)
      return cached_value
    except KeyError:
      value = self._getContentInformation()
      self.setConversion(value, format=format)
      return value

  def _getContentInformation(self):
    """
    Returns the content information from the HTML conversion.
    The default implementation tries to build a dictionary
    from the HTML conversion of the document and extract
    the document title.
    """
    result = {}
    html = self.asEntireHTML()
    if not html:
      return result
    # NOTE(review): self.title_parser is expected to be defined by the
    # concrete document class - confirm against subclasses.
    title_list = re.findall(self.title_parser, str(html))
    if title_list:
      result['title'] = title_list[0]
    return result
...@@ -31,6 +31,7 @@ from Products.ERP5Type import Permissions ...@@ -31,6 +31,7 @@ from Products.ERP5Type import Permissions
from Products.ERP5Type.Utils import fill_args_from_request from Products.ERP5Type.Utils import fill_args_from_request
from Products.CMFCore.utils import getToolByName, _setCacheHeaders,\ from Products.CMFCore.utils import getToolByName, _setCacheHeaders,\
_ViewEmulator _ViewEmulator
import warnings
_MARKER = [] _MARKER = []
...@@ -108,15 +109,31 @@ class DownloadableMixin: ...@@ -108,15 +109,31 @@ class DownloadableMixin:
return str(data) return str(data)
security.declareProtected(Permissions.AccessContentsInformation, security.declareProtected(Permissions.AccessContentsInformation,
'getStandardFileName') 'getStandardFilename')
def getStandardFileName(self, format=None): def getStandardFilename(self, format=None):
"""Returns the document coordinates as a standard file name. This """Returns the document coordinates as a standard file name. This
method is the reverse of getPropertyDictFromFileName. method is the reverse of getPropertyDictFromFileName.
""" """
method = self._getTypeBasedMethod('getStandardFileName', method = self._getTypeBasedMethod('getStandardFilename',
fallback_script_id='Document_getStandardFilename')
if method is None:
# backward compatibility
method = self._getTypeBasedMethod('getStandardFileName',
fallback_script_id='Document_getStandardFileName') fallback_script_id='Document_getStandardFileName')
return method(format=format) return method(format=format)
# backward compatibility
security.declareProtected(Permissions.AccessContentsInformation,
'getStandardFileName')
def getStandardFileName(self, format=None):
"""(deprecated) use getStandardFilename() instead."""
warnings.warn('getStandardFileName() is deprecated. '
'use getStandardFilename() instead.')
return self.getStandardFilename(format=format)
method = self._getTypeBasedMethod('getStandardFilename',
fallback_script_id='Document_getStandardFilename')
return method(format=format)
def manage_FTPget(self): def manage_FTPget(self):
"""Return body for ftp. and WebDAV """Return body for ftp. and WebDAV
""" """
......
...@@ -43,6 +43,7 @@ from zExceptions import BadRequest ...@@ -43,6 +43,7 @@ from zExceptions import BadRequest
from Products.ERP5Type.tests.backportUnittest import skip from Products.ERP5Type.tests.backportUnittest import skip
from Products.ERP5Type.Tool.ClassTool import _aq_reset from Products.ERP5Type.Tool.ClassTool import _aq_reset
from Products.ERP5Type.Workflow import addWorkflowByType from Products.ERP5Type.Workflow import addWorkflowByType
from Products.CMFCore.WorkflowCore import WorkflowException
def getDummyTypeBaseMethod(self): def getDummyTypeBaseMethod(self):
""" Use a type Base method """ Use a type Base method
...@@ -1248,6 +1249,43 @@ class TestBase(ERP5TypeTestCase, ZopeTestCase.Functional): ...@@ -1248,6 +1249,43 @@ class TestBase(ERP5TypeTestCase, ZopeTestCase.Functional):
self.assertFalse(person.isIndexable) self.assertFalse(person.isIndexable)
self.assertEquals(0, len(self.portal.portal_catalog(uid=person.getUid()))) self.assertEquals(0, len(self.portal.portal_catalog(uid=person.getUid())))
  def test_metaWorkflowTransition(self):
    """Test meta transitions: jump from one state to another without an
    explicitly defined transition.
    """
    module = self.portal.person_module
    person = module.newContent(portal_type='Person')
    self.assertEquals(person.getValidationState(), 'draft')
    # 'invalidate' is not directly reachable from 'draft'
    self.assertFalse(self.portal.portal_workflow.isTransitionPossible(person,
                                                              'invalidate'))
    # test low-level implementation
    self.portal.portal_workflow.validation_workflow._executeMetaTransition(
      person, 'invalidated')
    self.assertEquals(person.getValidationState(), 'invalidated')
    validation_history = person.workflow_history['validation_workflow']
    self.assertEquals(len(validation_history), 2)
    self.assertEquals(validation_history[-1]['comment'],
                      'Jump from \'draft\' to \'invalidated\'')
    person = module.newContent(portal_type='Person')
    self.assertEquals(person.getValidationState(), 'draft')
    # test high-level implementation
    self.portal.portal_workflow._jumpToStateFor(person, 'invalidated')
    self.assertEquals(person.getValidationState(), 'invalidated')
    person = module.newContent(portal_type='Person')
    self.assertEquals(person.getValidationState(), 'draft')
    # same, but restricted to an explicit workflow id
    self.portal.portal_workflow._jumpToStateFor(person, 'invalidated',
                                                wf_id='validation_workflow')
    self.assertEquals(person.getValidationState(), 'invalidated')
    person = module.newContent(portal_type='Person')
    self.assertEquals(person.getValidationState(), 'draft')
    # jumping via a workflow which does not manage 'invalidated' must fail
    self.assertRaises(WorkflowException,
                      self.portal.portal_workflow._jumpToStateFor,
                      person, 'invalidated', wf_id='edit_workflow')
    self.assertEquals(person.getValidationState(), 'draft')
class TestERP5PropertyManager(unittest.TestCase): class TestERP5PropertyManager(unittest.TestCase):
"""Tests for ERP5PropertyManager. """Tests for ERP5PropertyManager.
""" """
......
...@@ -36,7 +36,7 @@ from Products.CMFCore.WorkflowCore import WorkflowException ...@@ -36,7 +36,7 @@ from Products.CMFCore.WorkflowCore import WorkflowException
from Products.ERP5Type.tests.utils import DummyMailHost, FileUpload from Products.ERP5Type.tests.utils import DummyMailHost, FileUpload
from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase,\ from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase,\
_getConversionServerDict _getConversionServerDict
from Products.ERP5OOo.tests.testIngestion import FILE_NAME_REGULAR_EXPRESSION from Products.ERP5OOo.tests.testIngestion import FILENAME_REGULAR_EXPRESSION
from Products.ERP5OOo.tests.testIngestion import REFERENCE_REGULAR_EXPRESSION from Products.ERP5OOo.tests.testIngestion import REFERENCE_REGULAR_EXPRESSION
from Products.ERP5Type.tests.backportUnittest import expectedFailure from Products.ERP5Type.tests.backportUnittest import expectedFailure
...@@ -443,7 +443,7 @@ class TestCRMMailIngestion(BaseTestCRM): ...@@ -443,7 +443,7 @@ class TestCRMMailIngestion(BaseTestCRM):
data=self._readTestData(filename) data=self._readTestData(filename)
return self.portal.portal_contributions.newContent( return self.portal.portal_contributions.newContent(
container_path='event_module', container_path='event_module',
file_name='postfix_mail.eml', filename='postfix_mail.eml',
data=data) data=data)
def test_findTypeByName_MailMessage(self): def test_findTypeByName_MailMessage(self):
...@@ -451,7 +451,7 @@ class TestCRMMailIngestion(BaseTestCRM): ...@@ -451,7 +451,7 @@ class TestCRMMailIngestion(BaseTestCRM):
self.assertEquals( self.assertEquals(
'Mail Message', 'Mail Message',
self.portal.portal_contribution_registry.findPortalTypeName( self.portal.portal_contribution_registry.findPortalTypeName(
file_name='postfix_mail.eml', mime_type='message/rfc822', data='Test' filename='postfix_mail.eml', content_type='message/rfc822', data='Test'
)) ))
def test_Base_getEntityListFromFromHeader(self): def test_Base_getEntityListFromFromHeader(self):
...@@ -767,7 +767,7 @@ class TestCRMMailSend(BaseTestCRM): ...@@ -767,7 +767,7 @@ class TestCRMMailSend(BaseTestCRM):
conversion_dict = _getConversionServerDict() conversion_dict = _getConversionServerDict()
default_pref.setPreferredOoodocServerAddress(conversion_dict['hostname']) default_pref.setPreferredOoodocServerAddress(conversion_dict['hostname'])
default_pref.setPreferredOoodocServerPortNumber(conversion_dict['port']) default_pref.setPreferredOoodocServerPortNumber(conversion_dict['port'])
default_pref.setPreferredDocumentFileNameRegularExpression(FILE_NAME_REGULAR_EXPRESSION) default_pref.setPreferredDocumentFileNameRegularExpression(FILENAME_REGULAR_EXPRESSION)
default_pref.setPreferredDocumentReferenceRegularExpression(REFERENCE_REGULAR_EXPRESSION) default_pref.setPreferredDocumentReferenceRegularExpression(REFERENCE_REGULAR_EXPRESSION)
if default_pref.getPreferenceState() == 'disabled': if default_pref.getPreferenceState() == 'disabled':
default_pref.enable() default_pref.enable()
......
...@@ -120,36 +120,36 @@ return predicate.getDestinationPortalType() ...@@ -120,36 +120,36 @@ return predicate.getDestinationPortalType()
tool = self.portal.portal_contribution_registry tool = self.portal.portal_contribution_registry
# Test extension matching # Test extension matching
self.assertEqual(tool.findPortalTypeName(file_name='test.txt'), 'Text') self.assertEqual(tool.findPortalTypeName(filename='test.txt'), 'Text')
self.assertEqual(tool.findPortalTypeName(file_name='test.odt'), 'Text') self.assertEqual(tool.findPortalTypeName(filename='test.odt'), 'Text')
self.assertEqual(tool.findPortalTypeName(file_name='001.jpg'), 'Image') self.assertEqual(tool.findPortalTypeName(filename='001.jpg'), 'Image')
self.assertEqual(tool.findPortalTypeName(file_name='002.PNG'), 'Image') self.assertEqual(tool.findPortalTypeName(filename='002.png'), 'Image')
self.assertEqual(tool.findPortalTypeName(file_name='002.PNG'), 'Image') self.assertEqual(tool.findPortalTypeName(filename='002.PNG'), 'Image')
self.assertEqual(tool.findPortalTypeName(file_name='index.html'), 'Web Page') self.assertEqual(tool.findPortalTypeName(filename='index.html'), 'Web Page')
# Unknown extension # Unknown extension
self.assertEqual(tool.findPortalTypeName(file_name='index.xxx'), 'File') self.assertEqual(tool.findPortalTypeName(filename='index.xxx'), 'File')
# Test mimetype matching # Test mimetype matching
self.assertEqual(tool.findPortalTypeName(mime_type='text/html'), 'Web Page') self.assertEqual(tool.findPortalTypeName(content_type='text/html'), 'Web Page')
# Unknown mimetype # Unknown mimetype
self.assertEqual(tool.findPortalTypeName(mime_type='application/octet-stream'), 'File') self.assertEqual(tool.findPortalTypeName(content_type='application/octet-stream'), 'File')
# Test both of extension and mimetype # Test both of extension and mimetype
self.assertNotEqual(tool.findPortalTypeName(file_name='message.eml'), self.assertNotEqual(tool.findPortalTypeName(filename='message.eml'),
'Mail Message') 'Mail Message')
self.assertNotEqual(tool.findPortalTypeName(mime_type='message/rfc822'), self.assertNotEqual(tool.findPortalTypeName(content_type='message/rfc822'),
'Mail Message') 'Mail Message')
self.assertEqual(tool.findPortalTypeName(file_name='message.eml', self.assertEqual(tool.findPortalTypeName(filename='message.eml',
mime_type='message/rfc822'), content_type='message/rfc822'),
'Mail Message') 'Mail Message')
# Test test script # Test test script
data = """\ data = """\
Subject: Fax Subject: Fax
""" """
self.assertEqual(tool.findPortalTypeName(file_name='message.eml', self.assertEqual(tool.findPortalTypeName(filename='message.eml',
mime_type='message/rfc822', content_type='message/rfc822',
data=data), data=data),
'Fax Message') 'Fax Message')
......
...@@ -37,7 +37,8 @@ from AccessControl.SecurityManagement import newSecurityManager ...@@ -37,7 +37,8 @@ from AccessControl.SecurityManagement import newSecurityManager
from Testing import ZopeTestCase from Testing import ZopeTestCase
from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase,\ from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase,\
_getConversionServerDict _getConversionServerDict
from Products.ERP5Type.tests.utils import FileUpload from Products.ERP5Type.tests.utils import FileUpload, createZODBPythonScript
LANGUAGE_LIST = ('en', 'fr', 'de', 'bg',) LANGUAGE_LIST = ('en', 'fr', 'de', 'bg',)
...@@ -568,8 +569,21 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional): ...@@ -568,8 +569,21 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
def test_PreviewOOoDocumentWithEmbeddedImage(self): def test_PreviewOOoDocumentWithEmbeddedImage(self):
"""Tests html preview of an OOo document with images as extensible content. """Tests html preview of an OOo document with images as extensible content.
For this test, Presentation_checkConversionFormatPermission does not allow
access to original format for Unauthenticated users.
Chack that user can still access to other format.
""" """
portal = self.portal portal = self.portal
script_id = 'Presentation_checkConversionFormatPermission'
python_code = """from AccessControl import getSecurityManager
user = getSecurityManager().getUser()
if (not user or not user.getId()) and not format:
return False
return True
"""
createZODBPythonScript(portal.portal_skins.custom, script_id,
'format, **kw', python_code)
request = portal.REQUEST request = portal.REQUEST
request['PARENTS'] = [self.app] request['PARENTS'] = [self.app]
self.getPortalObject().aq_parent.acl_users._doAddUser( self.getPortalObject().aq_parent.acl_users._doAddUser(
...@@ -611,7 +625,7 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional): ...@@ -611,7 +625,7 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
# then publish the document and access it anonymously by reference through # then publish the document and access it anonymously by reference through
# the web site # the web site
document.publish() document.publish()
transaction.commit() transaction.commit()
self.tic() self.tic()
...@@ -620,7 +634,7 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional): ...@@ -620,7 +634,7 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
self.assertTrue(response.getHeader('content-type').startswith('text/html')) self.assertTrue(response.getHeader('content-type').startswith('text/html'))
html = response.getBody() html = response.getBody()
self.assertTrue('<img' in html, html) self.assertTrue('<img' in html, html)
# find the img src # find the img src
img_list = etree.HTML(html).findall('.//img') img_list = etree.HTML(html).findall('.//img')
self.assertEquals(1, len(img_list)) self.assertEquals(1, len(img_list))
...@@ -633,6 +647,22 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional): ...@@ -633,6 +647,22 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
png = response.getBody() png = response.getBody()
self.assertTrue(png.startswith('\x89PNG')) self.assertTrue(png.startswith('\x89PNG'))
# Now purge cache and let Anonymous user converting the document.
self.login()
document.edit() # Reset cache key
transaction.commit()
self.tic()
response = self.publish('%s/%s/asEntireHTML' % (
website.absolute_url_path(), document_reference))
self.assertTrue(response.getHeader('content-type').startswith('text/html'))
html = response.getBody()
self.assertTrue('<img' in html, html)
# find the img src
img_list = etree.HTML(html).findall('.//img')
self.assertEquals(1, len(img_list))
src = img_list[0].get('src')
def test_ImageConversionThroughWebSite(self): def test_ImageConversionThroughWebSite(self):
"""Check that conversion parameters pass in url """Check that conversion parameters pass in url
are hounoured to display an image in context of a website are hounoured to display an image in context of a website
......
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Nicolas Delaby <nicolas@erp5.org>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
import unittest
from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase,\
_getConversionServerDict
import transaction
# Regular expressions installed on the System Preference (see
# TestWebCrawler.setSystemPreference) so that contributed documents can be
# parsed into (reference, language, version) parts.  The reference part
# deliberately includes non-alphanumeric characters (&, é, @, {) to exercise
# non-trivial matching.
FILENAME_REGULAR_EXPRESSION = "(?P<reference>[A-Z&é@{]{3,7})-(?P<language>[a-z]{2})-(?P<version>[0-9]{3})"
# Same pattern, but language and version are optional for bare references.
REFERENCE_REGULAR_EXPRESSION = "(?P<reference>[A-Z&é@{]{3,7})(-(?P<language>[a-z]{2}))?(-(?P<version>[0-9]{3}))?"
class TestWebCrawler(ERP5TypeTestCase):
  """
  Test Crawling mechanism

  Builds a small self-hosted Web Site, then lets portal_contributions
  crawl it and checks that documents are created exactly once, with
  Portal Url Registry preventing duplicate contributions.
  """
  # Paths of extra documents to clean up (unused in the visible tests).
  _path_to_delete_list = []
  # Id of the System Preference created/configured by setSystemPreference().
  system_pref_id = 'my_preference'

  def getTitle(self):
    """
    Return the title of the current test set.
    """
    return "ERP5 Live DMS - Web Crawling"

  def getBusinessTemplateList(self):
    """
    Return the list of required business templates.
    """
    return ('erp5_base',
            'erp5_ingestion',
            'erp5_ingestion_mysql_innodb_catalog',
            'erp5_web',
            'erp5_dms')

  def afterSetUp(self):
    """
    Initialize the ERP5 site.
    """
    self.login()
    self.portal = self.getPortal()
    self.setSystemPreference()
    self.bootstrapWebSite()
    transaction.commit()
    self.tic()

  def beforeTearDown(self):
    # Remove every document created during the test so each test method
    # starts from a clean portal.
    portal = self.portal
    module_id_list = [
        'web_page_module',
        'web_site_module',
        'external_source_module',
        'document_module',
        ]
    # delete created documents by test
    for module_id in module_id_list:
      module = portal[module_id]
      module.manage_delObjects(list(module.objectIds()))
    # Unindex deleted documents
    transaction.commit()
    self.tic()

  def setSystemPreference(self):
    # Create (or reuse) a global System Preference that points to the
    # conversion server and installs the filename/reference regexps
    # defined at module level.
    portal_preferences = self.portal.portal_preferences
    system_preference = portal_preferences._getOb(self.system_pref_id, None)
    if system_preference is None:
      system_preference = portal_preferences.newContent(id=self.system_pref_id,
                                          portal_type='System Preference')
    conversion_dict = _getConversionServerDict()
    system_preference.\
          setPreferredOoodocServerAddress(conversion_dict['hostname'])
    system_preference.\
          setPreferredOoodocServerPortNumber(conversion_dict['port'])
    system_preference.setPreferredDocumentFilenameRegularExpression(
                                           FILENAME_REGULAR_EXPRESSION)
    system_preference.setPreferredDocumentReferenceRegularExpression(
                                           REFERENCE_REGULAR_EXPRESSION)
    # Only enable when not already the active global preference, to keep
    # the call idempotent across tests.
    if system_preference.getPreferenceState() != 'global':
      system_preference.enable()

  def bootstrapWebSite(self):
    """Create 1 Website
         live_test_web_site/section1/section1a
                           /section2
       create 2 web pages
         W-REFERENCE.PAGE
         W-REFERENCE.HOMEPAGE
       the website use light version of erp5_web_layout
       It keep just displaying sections and subsection
       And default Web page
    """
    web_site_portal_type = 'Web Site'
    web_section_portal_type = 'Web Section'
    web_page_portal_type = 'Web Page'
    web_site_module = self.portal.getDefaultModule(web_site_portal_type)
    web_page_module = self.portal.getDefaultModule(web_page_portal_type)
    # Home page links to the other page by reference (relative link).
    text_content = """<p><a href="W-REFERENCE.PAGE">Page</a></p>"""
    web_page_id = 'live_test_home'
    home_page = web_page_module.newContent(portal_type=web_page_portal_type,
                                           title='Home Page',
                                           text_content=text_content,
                                           reference='W-REFERENCE.HOMEPAGE',
                                           version='001',
                                           language='en',
                                           id=web_page_id)
    home_page.submit()
    home_page.publish()
    web_site_id = 'live_test_web_site'
    web_site = web_site_module.newContent(portal_type=web_site_portal_type,
                                id=web_site_id,
                                title='Live Test Web Site',
                                visible=True,
                                default_page_displayed=True,
                                site_map_section_parent=True,
                                authorization_forced=True,
                                aggregate_value=home_page,
                                available_language_set=['en'],
                                container_layout='erp5_web_layout_test',
                                content_layout='erp5_web_content_layout_test')
    web_site.publish()
    # Second page links back to the home page with an absolute URL, so the
    # crawler must follow both absolute and relative links.
    text_content = """<p>
    <a href="%s/W-REFERENCE.HOMEPAGE">absolute link to HOME PAGE</a>
    </p>""" % web_site.absolute_url()
    section1a_page = web_page_module.newContent(
                                      portal_type=web_page_portal_type,
                                      title='Home Page',
                                      text_content=text_content,
                                      reference='W-REFERENCE.PAGE',
                                      version='001',
                                      language='en')
    section1a_page.submit()
    section1a_page.publish()
    web_section1 = web_site.newContent(portal_type=web_section_portal_type,
                                       title='Section 1',
                                       id='section1',
                                       aggregate_value=section1a_page)
    # NOTE(review): web_section2 and web_section1a below are created for the
    # site structure only; the variables themselves are never read.
    web_section2 = web_site.newContent(portal_type=web_section_portal_type,
                                       title='Section 2',
                                       id='section2',
                                       aggregate_value=section1a_page)
    web_section1a = web_section1.newContent(
                                       portal_type=web_section_portal_type,
                                       title='Section 1a',
                                       id='section 1a', #add a space in id
                                       aggregate_value=section1a_page)

  def test_01_check_URLTransformations(self):
    """Check crawlable functionalities regarding URL handling
      getContentBaseURL
      asNormalisedURL
      getContentNormalisedURLList
    """
    web_page_portal_type = 'Web Page'
    web_page_module = self.portal.getDefaultModule(web_page_portal_type)
    web_page = web_page_module.newContent(portal_type=web_page_portal_type)
    # No URL set yet: base URL is empty.
    self.assertEquals(web_page.getContentBaseURL(), '')
    web_page.fromURL('http://www.example.com')
    self.assertEquals(web_page.getContentBaseURL(), 'http://www.example.com')
    # Base URL of a nested path is its parent "directory".
    web_page.fromURL('http://www.example.com/section/sub_section')
    self.assertEquals(web_page.getContentBaseURL(),
                      'http://www.example.com/section')
    # A <base> tag in the content overrides the URL-derived base; external
    # links, scripts, images, badly-encoded and malformed links must all be
    # excluded from the normalised URL list.
    text_content = """<html>
    <head>
    <base href="http://www.example.com"/>
    </head>
    <body>
    <p><a href="http://www.notexample.com/">External link</a></p>
    <p><a href="http://www.example.com//I don't care I put what/ I want/">
    Funny link</a></p>
    <p><a href="http://www.example.com/section">Internal link</a></p>
    <p><a href="section2">Relative Internal link</a></p>
    <p><a href="http://www.example.com/?title=%E9+crit">With Encoding issue
    This link will be discarded</a></p>
    <img src="my_image_link"/>
    <script src="should_not_be_followed.js"/>
    <p><a href="http://http://www.example.com/section">Not a link</a></p>
    </body>
    </html>"""
    web_page.edit(text_content=text_content)
    self.assertEquals(web_page.getContentBaseURL(), "http://www.example.com")
    self.assertEquals(web_page.getContentNormalisedURLList(),
            ["http://www.example.com/I don't care I put what/ I want/",
             'http://www.example.com/section',
             'http://www.example.com/section2',])
    # relative links without base tag
    text_content = """<html>
    <head>
    </head>
    <body>
    <p><a href="section2">Relative Internal link</a></p>
    </body>
    </html>"""
    web_page.edit(text_content=text_content)
    # The URL fragment (#fffff) is ignored for the base URL but preserved
    # by asNormalisedURL.
    web_page.fromURL('http://www.example.com/#fffff')
    self.assertEquals(web_page.getContentBaseURL(), "http://www.example.com")
    self.assertEquals(web_page.getContentNormalisedURLList(),
                      ['http://www.example.com/section2',])
    self.assertEquals(web_page.asNormalisedURL(),
                      'http://www.example.com/#fffff')

  def test_02_crawlWebSite(self):
    """Call portal_contribution to crawl website hosted by itself.
    """
    web_site = self.portal.web_site_module.live_test_web_site
    external_source_portal_type = 'URL Crawler'
    web_crawler_module = self.portal.getDefaultModule(
                                          external_source_portal_type)
    web_crawler = web_crawler_module.newContent(
                                portal_type=external_source_portal_type,
                                crawling_depth=5)
    web_crawler.fromURL(web_site.absolute_url())
    transaction.commit()
    self.tic()
    web_crawler.crawlContent()
    transaction.commit()
    self.tic()
    # 6 = 1 website
    # + 3 Web Sections
    # + 1 absolute link to home_page
    # + 1 relative link from home_page to another web page
    self.assertEquals(len(web_crawler), 6)
    self.assertEquals(len(self.portal.portal_url_registry._getMappingDict()),
                      6)
    date_before = web_crawler.getModificationDate()
    web_crawler.crawlContent()
    transaction.commit()
    self.tic()
    # Nothing happens, portal_url_registry keep crawling twice
    # the same url
    self.assertEquals(len(web_crawler), 6)
    self.assertEquals(len(self.portal.portal_url_registry._getMappingDict()),
                      6)
    # not modified
    self.assertEquals(date_before, web_crawler.getModificationDate())
    new_web_crawler = web_crawler_module.newContent(
                                portal_type=external_source_portal_type,
                                crawling_depth=5)
    new_web_crawler.fromURL(web_site.absolute_url())
    transaction.commit()
    self.tic()
    new_web_crawler.crawlContent()
    transaction.commit()
    self.tic()
    # check that portal_url_registry
    # block contribution of existing content
    self.assertFalse(len(new_web_crawler))
    # set another namespace on preference
    preference = self.portal.portal_preferences[self.system_pref_id]
    preference.setPreferredIngestionNamespace('NEW')
    transaction.commit()
    self.tic()
    new_web_crawler.crawlContent()
    transaction.commit()
    self.tic()
    # NOTE(review): this re-asserts the first crawler's size; given the new
    # namespace was just set, asserting len(new_web_crawler) looks like the
    # intent — confirm before changing.
    self.assertEquals(len(web_crawler), 6)
def test_suite():
  """Return the unittest suite containing all web-crawler tests."""
  # TestSuite(tests) adds each entry via addTest, so passing the suite
  # built by makeSuite is equivalent to the explicit addTest form.
  return unittest.TestSuite([unittest.makeSuite(TestWebCrawler)])
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment