Commit de2545fc authored by Nicolas Delaby's avatar Nicolas Delaby

Refactoring of DMS.

- file_name becomes filename
- filename values are not stored in source_reference
Contribution Tool will not honour id arguments.
Contribution Tool can create any kind of document.
Portal Contribution Registry can read the extension and content_type, and can read the content_type from data,
to guess the best Portal Type to use.

All discoverable methods (IDiscoverable) can change the portal_type of document.
  (migratePortalType)
User can change portal_type of document through UI with simple Action.
Crawling will not hardcode ids of documents depending on their URLs, thanks to the
Portal Url Registry





git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@40971 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 4627391c
......@@ -40,18 +40,15 @@ from Products.ERP5Type import Permissions, PropertySheet, interfaces
from Products.ERP5Type.XMLObject import XMLObject
from Products.ERP5Type.DateUtils import convertDateToHour,\
number_of_hours_in_day, number_of_hours_in_year
from Products.ERP5Type.Utils import convertToUpperCase, fill_args_from_request
from Products.ERP5Type.Utils import convertToUpperCase, fill_args_from_request,\
deprecated
from Products.ERP5Type.TransactionalVariable import getTransactionalVariable
from Products.ERP5Type.Cache import getReadOnlyTransactionCache
from Products.ERP5.Document.Url import UrlMixIn
from Products.ERP5.Tool.ContributionTool import MAX_REPEAT
from Products.ERP5Type.UnrestrictedMethod import unrestricted_apply
from Products.ZSQLCatalog.SQLCatalog import SQLQuery
from AccessControl import Unauthorized
import zope.interface
from Products.PythonScripts.Utility import allow_class
import tempfile
from subprocess import Popen, PIPE
# Mixin Import
from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
......@@ -60,9 +57,10 @@ from Products.ERP5.mixin.downloadable import DownloadableMixin
from Products.ERP5.mixin.document import DocumentMixin
from Products.ERP5.mixin.extensible_traversable import DocumentExtensibleTraversableMixin
from Products.ERP5.mixin.crawlable import CrawlableMixin
from Products.ERP5.mixin.discoverable import DiscoverableMixin
from Products.ERP5.mixin.url import UrlMixin
_MARKER = []
VALID_ORDER_KEY_LIST = ('user_login', 'content', 'file_name', 'input')
# these property ids are unchangeable
FIXED_PROPERTY_IDS = ('id', 'uid', 'rid', 'sid')
......@@ -88,8 +86,9 @@ class DocumentProxyError(Exception):pass
class NotConvertedError(Exception):pass
allow_class(NotConvertedError)
class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedConvertableMixin,
CrawlableMixin, TextConvertableMixin, DownloadableMixin, DocumentMixin):
class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixin,
CachedConvertableMixin, CrawlableMixin, TextConvertableMixin,
DownloadableMixin, DocumentMixin, DiscoverableMixin):
"""Document is an abstract class with all methods related to document
management in ERP5. This includes searchable text, explicit relations,
implicit relations, metadata, versions, languages, etc.
......@@ -144,7 +143,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
input - data supplied with http request or set on the object during (2) (e.g.
discovered from email text)
file_name - data which might be encoded in file name
filename - data which might be encoded in filename
user_login - information about user who is contributing the file
content - data which might be derived from document content
......@@ -156,7 +155,7 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
Methods for discovering metadata are:
getPropertyDictFromInput
getPropertyDictFromFileName
getPropertyDictFromFilename
getPropertyDictFromUserLogin
getPropertyDictFromContent
......@@ -266,10 +265,15 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
interfaces.IVersionable,
interfaces.IDownloadable,
interfaces.ICrawlable,
interfaces.IDocument
interfaces.IDocument,
interfaces.IDiscoverable,
interfaces.IUrl,
)
# Regular expressions
# XXX those regex are weak, fast but not reliable.
# this is a valid url that the regexes are not able to parse:
# http://www.example.com//I don't care i put what/ i want/
href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
......@@ -639,141 +643,14 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
if not reference:
return
catalog = self.getPortalObject().portal_catalog
res = catalog(reference=self.getReference(), sort_on=(('creation_date','ascending'),))
result_list = catalog.unrestrictedSearchResults(
reference=self.getReference(),
sort_on=(('creation_date',
'ascending'),))
# XXX this should be security-unaware - delegate to script with proxy roles
return res[0].getLanguage() # XXX what happens if it is empty?
### Property getters
# Property Getters are document dependent so that we can
# handle the weird cases in which needed properties change with the type of document
# and the usual cases in which accessing content changes with the meta type
security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromUserLogin')
def getPropertyDictFromUserLogin(self, user_login=None):
"""
Based on the user_login, find out as many properties as needed.
returns properties which should be set on the document
"""
if user_login is None:
user_login = str(getSecurityManager().getUser())
method = self._getTypeBasedMethod('getPropertyDictFromUserLogin',
fallback_script_id='Document_getPropertyDictFromUserLogin')
return method(user_login)
security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromContent')
def getPropertyDictFromContent(self):
"""
Based on the document content, find out as many properties as needed.
returns properties which should be set on the document
"""
# access data through convert
mime, content = self.convert(None)
if not content:
# if document is empty, we will not find anything in its content
return {}
method = self._getTypeBasedMethod('getPropertyDictFromContent',
fallback_script_id='Document_getPropertyDictFromContent')
return method()
security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromFileName')
def getPropertyDictFromFileName(self, file_name):
"""
Based on the file name, find out as many properties as needed.
returns properties which should be set on the document
"""
return self.portal_contributions.getPropertyDictFromFileName(file_name)
security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromInput')
def getPropertyDictFromInput(self):
"""
Get properties which were supplied explicitly to the ingestion method
(discovered or supplied before the document was created).
The implementation consists in saving document properties
into _backup_input by supposing that original input parameters were
set on the document by ContributionTool.newContent as soon
as the document was created.
"""
kw = getattr(self, '_backup_input', {})
if kw:
return kw
for id in self.propertyIds():
# We should not consider file data
if id not in ('data', 'categories_list', 'uid', 'id',
'text_content', 'base_data',) \
and self.hasProperty(id):
kw[id] = self.getProperty(id)
self._backup_input = kw # We could use volatile and pass kw in activate
# if we are guaranteed that _backup_input does not
# disappear within a given transaction
return kw
### Metadata discovery and ingestion methods
security.declareProtected(Permissions.ModifyPortalContent, 'discoverMetadata')
def discoverMetadata(self, file_name=None, user_login=None):
"""
This is the main metadata discovery function - controls the process
of discovering data from various sources. The discovery itself is
delegated to scripts or uses preference-configurable regexps. The
method returns either self or the document which has been
merged in the discovery process.
file_name - this parameter is a file name of the form "AA-BBB-CCC-223-en"
user_login - this is a login string of a person; can be None if the user is
currently logged in, then we'll get him from session
"""
# Preference is made of a sequence of 'user_login', 'content', 'file_name', 'input'
method = self._getTypeBasedMethod('getPreferredDocumentMetadataDiscoveryOrderList',
fallback_script_id = 'Document_getPreferredDocumentMetadataDiscoveryOrderList')
order_list = list(method())
order_list.reverse()
# build a dictionary according to the order
kw = {}
for order_id in order_list:
result = None
if order_id not in VALID_ORDER_KEY_LIST:
# Prevent security attack or bad preferences
raise AttributeError, "%s is not in valid order key list" % order_id
method_id = 'getPropertyDictFrom%s' % convertToUpperCase(order_id)
method = getattr(self, method_id)
if order_id == 'file_name':
if file_name is not None:
result = method(file_name)
elif order_id == 'user_login':
if user_login is not None:
result = method(user_login)
else:
result = method()
if result is not None:
for key, value in result.iteritems():
if value not in (None, ''):
kw[key]=value
if file_name is not None:
# filename is often undefined....
kw['source_reference'] = file_name
# Prepare the content edit parameters - portal_type should not be changed
kw.pop('portal_type', None)
# Try not to invoke an automatic transition here
self._edit(**kw)
# Finish ingestion by calling method
self.finishIngestion() # XXX - is this really the right place ?
self.reindexObject() # XXX - is this really the right place ?
# Revision merge is tightly coupled
# to metadata discovery - refer to the documentation of mergeRevision method
merged_doc = self.mergeRevision() # XXX - is this really the right place ?
merged_doc.reindexObject() # XXX - is this really the right place ?
return merged_doc # XXX - is this really the right place ?
security.declareProtected(Permissions.ModifyPortalContent, 'finishIngestion')
def finishIngestion(self):
"""
Finish the ingestion process by calling the appropriate script. This
script can for example allocate a reference number automatically if
no reference was defined.
"""
method = self._getTypeBasedMethod('finishIngestion', fallback_script_id='Document_finishIngestion')
return method()
if result_list:
return result_list[0].getLanguage()
return
security.declareProtected(Permissions.View, 'asSubjectText')
def asSubjectText(self, **kw):
......@@ -827,32 +704,13 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
return self._stripHTML(self._asHTML(**kw))
security.declarePrivate('_guessEncoding')
@deprecated
def _guessEncoding(self, string, mime='text/html'):
"""
Try to guess the encoding for this string.
Returns None if no encoding can be guessed.
Deprecated method
"""
try:
import chardet
except ImportError:
chardet = None
if chardet is not None and (mime == 'text/html'\
or os.sys.platform != 'linux2'):
# chardet works fine on html document and its platform independent
return chardet.detect(string).get('encoding', None)
else:
# file command provide better result
# for text/plain documents
# store the content into tempfile
file_descriptor, path = tempfile.mkstemp()
file_object = os.fdopen(file_descriptor, 'w')
file_object.write(string)
file_object.close()
# run file command against tempfile to and read encoded
command_result = Popen(['file', '-b', '--mime-encoding', path],
stdout=PIPE).communicate()[0]
# return detected encoding
return command_result.strip()
contribution_tool = self.getPortalObject().portal_contributions
return contribution_tool.guessEncodingFromText(string, content_type=mime)
def _stripHTML(self, html, charset=None):
"""
......@@ -866,22 +724,6 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
stripped_html = html
return stripped_html
security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
def getContentInformation(self):
"""
Returns the content information from the HTML conversion.
The default implementation tries to build a dictionary
from the HTML conversion of the document and extract
the document title.
"""
result = {}
html = self.asEntireHTML()
if not html: return result
title_list = re.findall(self.title_parser, str(html))
if title_list:
result['title'] = title_list[0]
return result
security.declareProtected(Permissions.AccessContentsInformation,
'getMetadataMappingDict')
def getMetadataMappingDict(self):
......@@ -918,21 +760,6 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
method = None
if method is not None: method()
# Crawling API
security.declareProtected(Permissions.AccessContentsInformation, 'getContentURLList')
def getContentURLList(self):
"""
Returns a list of URLs referenced by the content of this document.
Default implementation consists in analysing the document
converted to HTML. Subclasses may overload this method
if necessary. However, it is better to extend the conversion
methods in order to produce valid HTML, which is useful to
many people, rather than overload this method which is only
useful for crawling.
"""
html_content = self.asStrippedHTML()
return re.findall(self.href_parser, str(html_content))
security.declareProtected(Permissions.ModifyPortalContent, 'updateContentFromURL')
def updateContentFromURL(self, repeat=MAX_REPEAT, crawling_depth=0):
"""
......@@ -963,18 +790,3 @@ class Document(DocumentExtensibleTraversableMixin, XMLObject, UrlMixIn, CachedCo
if hasattr(aq_base(container), 'isIndexContent'):
return container.isIndexContent(self)
return False
security.declareProtected(Permissions.AccessContentsInformation, 'getContentBaseURL')
def getContentBaseURL(self):
"""
Returns the content base URL based on the actual content or
on its URL.
"""
base_url = self.asURL()
base_url_list = base_url.split('/')
if len(base_url_list):
if base_url_list[-1] and base_url_list[-1].find('.') > 0:
# Cut the trailing part in http://www.some.site/at/trailing.html
# but not in http://www.some.site/at
base_url = '/'.join(base_url_list[:-1])
return base_url
......@@ -114,22 +114,14 @@ class PDFDocument(Image):
"""
if not self.hasData():
return ''
tmp = tempfile.NamedTemporaryFile()
tmp.write(self.getData())
tmp.seek(0)
try:
command = ['pdftotext', '-layout', '-enc', 'UTF-8',
'-nopgbrk', tmp.name, '-']
try:
command_result = Popen(command, stdout=PIPE).communicate()[0]
except OSError, e:
if e.errno == errno.ENOENT:
raise ConversionError('pdftotext was not found')
raise
finally:
tmp.close()
if command_result:
return command_result
mime_type = 'text/plain'
portal_transforms = self.getPortalObject().portal_transforms
filename = self.getStandardFilename(format='txt')
result = portal_transforms.convertToData(mime_type, str(self.getData()),
context=self, filename=filename,
mimetype=self.getContentType())
if result:
return result
else:
# Try to use OCR
# As high dpi images are required, it may take some times to convert the
......@@ -145,13 +137,12 @@ class PDFDocument(Image):
frame=page_number, display='identical')
if not src_mimetype.endswith('png'):
continue
content = '%s' % png_data
mime_type = 'text/plain'
content = str(png_data)
if content is not None:
portal_transforms = getToolByName(self, 'portal_transforms')
filename = self.getStandardFilename(format='png')
result = portal_transforms.convertToData(mime_type, content,
context=self,
filename=self.getTitleOrId(),
filename=filename,
mimetype=src_mimetype)
if result is None:
raise ConversionError('PDFDocument conversion error. '
......
......@@ -45,6 +45,9 @@ try:
from string import Template
except ImportError:
from Products.ERP5Type.patches.string import Template
from Products.ERP5Type.Utils import guessEncodingFromText
from lxml import html as etree_html
class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
TextContent, File):
......@@ -147,7 +150,7 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
kw['format'] = format
if not self.hasConversion(**kw):
portal_transforms = getToolByName(portal, 'portal_transforms')
filename = self.getSourceReference(self.getTitleOrId())
filename = self.getStandardFilename(format=format)
if mime_type == 'text/html':
mime_type = 'text/x-html-safe'
result = portal_transforms.convertToData(mime_type, text_content,
......@@ -183,9 +186,13 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
"""
if self.hasTextContent():
html = self._asHTML()
base_list = re.findall(self.base_parser, str(html))
if base_list:
return base_list[0]
# a document can be entirely stripped by safe_html
# so its html conversion can be empty
if html.strip():
html_tree = etree_html.fromstring(html)
base_list = [href for href in html_tree.xpath('//base/@href') if href]
if base_list:
return str(base_list[0])
return Document.getContentBaseURL(self)
security.declareProtected(Permissions.ModifyPortalContent, 'setBaseData')
......@@ -270,14 +277,14 @@ class TextDocument(CachedConvertableMixin, BaseConvertableFileMixin,
return encoded content_type and message if encoding
is not utf-8
"""
codec = document._guessEncoding(text_content, content_type)
codec = guessEncodingFromText(text_content, content_type)
if codec is not None:
try:
text_content = text_content.decode(codec).encode('utf-8')
except (UnicodeDecodeError, LookupError):
message = 'Conversion to base format with codec %r fails' % codec
# try again with another guesser based on file command
codec = document._guessEncoding(text_content, 'text/plain')
codec = guessEncodingFromText(text_content, 'text/plain')
if codec is not None:
try:
text_content = text_content.decode(codec).encode('utf-8')
......
......@@ -29,7 +29,7 @@
from AccessControl import ClassSecurityInfo
from Products.ERP5Type.Globals import InitializeClass
from Products.ERP5Type.Tool.BaseTool import BaseTool
from Products.ERP5Type import Permissions
class ContributionRegistryTool(BaseTool):
......@@ -41,14 +41,18 @@ class ContributionRegistryTool(BaseTool):
security = ClassSecurityInfo()
security.declarePrivate('findPortalTypeName')
def findPortalTypeName(self, file_name='', mime_type=None, data=None):
from Products.ERP5Type.Document import newTempIngestionFile
ingestion_file = newTempIngestionFile(self, 'id')
ingestion_file._edit(file_name=file_name, mime_type=mime_type, data=data)
security.declareProtected(Permissions.AccessContentsInformation,
'findPortalTypeName')
def findPortalTypeName(self, context=None, **kw):
# if a context is passed, ignore other arguments
if context is None:
# Build a temp object edited with provided parameters
from Products.ERP5Type.Document import newTempFile
context = newTempFile(self, 'id')
context.edit(**kw)
for predicate in self.objectValues(sort_on='int_index'):
result = predicate.test(ingestion_file)
result = predicate.test(context)
if result:
return result
......
......@@ -29,12 +29,7 @@
import cStringIO
import re
import string
import socket
try:
from hashlib import md5 as md5_new
except ImportError:
from md5 import new as md5_new
import urllib2, urllib
import urlparse
from cgi import parse_header
......@@ -46,13 +41,11 @@ from Products.CMFCore.utils import getToolByName, _checkPermission
from Products.ERP5Type.Tool.BaseTool import BaseTool
from Products.ERP5Type import Permissions
from Products.ERP5 import _dtmldir
from Products.ERP5.Document.Url import no_crawl_protocol_list, no_host_protocol_list
from Products.ERP5.Document.Url import no_crawl_protocol_list
from AccessControl import Unauthorized
from zLOG import LOG
from DateTime import DateTime
from Acquisition import aq_base
from zExceptions import BadRequest
import warnings
# Install openers
import ContributionOpener
......@@ -83,7 +76,7 @@ class ContributionTool(BaseTool):
Configuration Scripts:
- ContributionTool_getPropertyDictFromFileName: receives file name and a
- ContributionTool_getPropertyDictFromFilename: receives file name and a
dict derived from the filename by regular expression, and does any necessary
operations (e.g. mapping document type id onto a real portal_type).
......@@ -98,8 +91,7 @@ class ContributionTool(BaseTool):
meta_type = 'ERP5 Contribution Tool'
portal_type = 'Contribution Tool'
# Regular expressions
simple_normaliser = re.compile('#.*')
# Declarative Security
security = ClassSecurityInfo()
......@@ -108,153 +100,141 @@ class ContributionTool(BaseTool):
manage_overview = DTMLFile( 'explainContributionTool', _dtmldir )
security.declareProtected(Permissions.AddPortalContent, 'newContent')
def newContent(self, id=None, portal_type=None, url=None, container=None,
container_path=None,
discover_metadata=1, temp_object=0,
user_login=None, data=None, file_name=None, **kw):
def newContent(self, **kw):
"""
The newContent method is overriden to implement smart content
creation by detecting the portal type based on whatever information
was provided and finding out the most appropriate module to store
the content.
user_login is the name under which the content will be created
XXX - this is a security hole which needs to be fixed by
making sure only Manager can use this parameter
container -- if specified, it is possible to define
where to contribute the content. Else, ContributionTool
tries to guess.
container_path -- if specified, defines the container path
and has precedence over container
url -- if specified, content is download from the URL.
NOTE:
We always generate ID. So, we must prevent using the one
which we were provided.
explicit named parameters were:
id - ignored argument
portal_type - explicit portal_type parameter, must be honoured
url - Identifier of external resource. Content will be downloaded
from it
container - if specified, it is possible to define
where to contribute the content. Else, ContributionTool
tries to guess.
container_path - if specified, defines the container path
and has precedence over container
discover_metadata - Enable metadata extraction and discovery
(default True)
temp_object - build tempObject or not (default False)
user_login - is the name under which the content will be created
XXX - this is a security hole which needs to be fixed by
making sure only Manager can use this parameter
data - Binary representation of content
filename - explicit filename of content
"""
if file_name is not None:
kw['file_name'] = file_name
if data is not None:
# This is only used to make sure
# we can pass file as parameter to ZPublisher
# whenever we ingest email
kw['data'] = data
kw.pop('id', None) # Never use hardcoded ids anymore longer
# Useful for metadata discovery, keep it as it as been provided
input_parameter_dict = kw.copy()
# But file and data are exceptions.
# They are potentially too big to be kept in memory.
# We want to keep only one reference to those values,
# on the future created document only!
if 'file' in input_parameter_dict:
del input_parameter_dict['file']
if 'data' in input_parameter_dict:
del input_parameter_dict['data']
# pop: remove keys which are not document properties
url = kw.pop('url', None)
container = kw.pop('container', None)
container_path = kw.pop('container_path', None)
discover_metadata = kw.pop('discover_metadata', True)
user_login = kw.pop('user_login', None)
# check file_name argument for backward compatibility.
if 'file_name' in kw:
if 'filename' not in kw:
kw['filename'] = kw['file_name']
del(kw['file_name'])
filename = kw.get('filename', None)
portal_type = kw.get('portal_type')
temp_object = kw.get('temp_object', False)
document = None
# Try to find the file_name
portal = self.getPortalObject()
# Try to find the filename
content_type = None
if not url:
# check if file was provided
file = kw.get('file', None)
if file is not None and file_name is None:
file_name = file.filename
file_object = kw.get('file')
if file_object is not None:
if not filename:
filename = file_object.filename
else:
# some channels supply data and file-name separately
# this is the case for example for email ingestion
# in this case, we build a file wrapper for it
data = kw.get('data', None)
if data is not None:
file_name = kw.get('file_name', None)
if file_name is not None:
file = cStringIO.StringIO()
file.write(data)
file.seek(0)
kw['file'] = file
del kw['data']
del kw['file_name']
data = kw.get('data')
if data is not None and filename:
file_object = cStringIO.StringIO()
file_object.write(data)
file_object.seek(0)
kw['file'] = file_object
del kw['data']
else:
raise TypeError, 'data and filename must be provided'
else:
# build a new file from the url
url_file = urllib2.urlopen(url)
data = url_file.read() # time out must be set or ... too long XXX
file = cStringIO.StringIO()
file.write(data)
file.seek(0)
# if a content-disposition header is present,
# try first to read the suggested filename from it.
header_info = url_file.info()
content_disposition = header_info.getheader('content-disposition', '')
file_name = parse_header(content_disposition)[1].get('filename')
if not file_name:
# Now read the filename from url.
# In case of http redirection, the real url must be read
# from file object returned by urllib2.urlopen.
# It can happens when the header 'Location' is present in request.
# See http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.30
url = url_file.geturl()
# Create a file name based on the URL and quote it
file_name = urlparse.urlsplit(url)[-3]
file_name = os.path.basename(file_name)
file_name = urllib.quote(file_name, safe='')
file_name = file_name.replace('%', '')
# For URLs, we want an id by default equal to the encoded URL
if id is None:
id = self.encodeURL(url)
content_type = header_info.gettype()
file_object, filename, content_type = self._openURL(url)
if content_type:
kw['content_type'] = content_type
kw['file'] = file
kw['file'] = file_object
# If the portal_type was provided, we can go faster
if portal_type and container is None:
# We know the portal_type, let us find the default module
# and use it as container
try:
container = self.getDefaultModule(portal_type)
container = portal.getDefaultModule(portal_type)
except ValueError:
container = None
if portal_type and container is not None:
# We could simplify things here and return a document immediately
# NOTE: we use the module ID generator rather than the provided ID
#document = module.newContent(portal_type=portal_type, **kw)
#if discover_metadata:
# document.activate().discoverMetadata(file_name=file_name, user_login=user_login)
#return document
pass # XXX - This needs to be implemented once the rest is stable
# From here, there is no hope unless a file was provided
if file is None:
raise ValueError, "could not determine portal type"
if file_object is None:
raise ValueError, "No data provided"
if portal_type is None:
# Guess it with help of portal_contribution_registry
registry = getToolByName(portal, 'portal_contribution_registry')
portal_type = registry.findPortalTypeName(filename=filename,
content_type=content_type)
#
# Check if same file is already exists. if it exists, then update it.
#
if portal_type is None:
portal_type = self._guessPortalType(file_name, content_type, data)
property_dict = self.getMatchedFileNamePatternDict(file_name)
reference = property_dict.get('reference', None)
version = property_dict.get('version', None)
language = property_dict.get('language', None)
if portal_type and reference and version and language:
portal_catalog = getToolByName(self, 'portal_catalog')
document = portal_catalog.getResultValue(portal_type=portal_type,
reference=reference,
version=version,
language=language)
if document is not None:
# document is already uploaded. So overrides file.
if not _checkPermission(Permissions.ModifyPortalContent, document):
raise Unauthorized, "[DMS] You are not allowed to update the existing document which has the same coordinates (id %s)" % document.getId()
document.edit(file=kw['file'])
return document
property_dict = self.getMatchedFilenamePatternDict(filename)
reference = property_dict.get('reference', None)
version = property_dict.get('version', None)
language = property_dict.get('language', None)
if portal_type and reference and version and language:
portal_catalog = getToolByName(portal, 'portal_catalog')
document = portal_catalog.getResultValue(portal_type=portal_type,
reference=reference,
version=version,
language=language)
if document is not None:
# document is already uploaded. So overrides file.
if not _checkPermission(Permissions.ModifyPortalContent, document):
raise Unauthorized, "[DMS] You are not allowed to update the existing document which has the same coordinates (id %s)" % document.getId()
document.edit(file=kw['file'])
return document
# Temp objects use the standard newContent from Folder
if temp_object:
# For temp_object creation, use the standard method
return BaseTool.newContent(self, id=id, portal_type=portal_type,
temp_object=temp_object, **kw)
kw['portal_type'] = portal_type
return BaseTool.newContent(self, **kw)
# Then put the file inside ourselves for a short while
if container_path is not None:
container = self.getPortalObject().restrictedTraverse(container_path)
document = self._setObject(file_name, None, portal_type=portal_type,
user_login=user_login, id=id,
container=container,
document = self._setObject(filename, None, portal_type=portal_type,
user_login=user_login, container=container,
discover_metadata=discover_metadata,
filename=filename,
input_parameter_dict=input_parameter_dict
)
object_id = document.getId()
document = self._getOb(object_id) # Call _getOb to purge cache
......@@ -264,18 +244,12 @@ class ContributionTool(BaseTool):
if modified_kw is not None:
kw.update(modified_kw)
kw['filename'] = filename # Override filename property
# Then edit the document contents (so that upload can happen)
document._edit(**kw)
# if no content_type has been set, guess it
if 'content_type' not in kw and getattr(document, 'guessMimeType', None) is not None:
# For File force to setup the mime_type
document.guessMimeType(fname=file_name)
if url:
document.fromURL(url)
# Notify workflows
#document.notifyWorkflowCreated()
# Allow reindexing, reindex it and return the document
try:
delattr(document, 'isIndexable')
......@@ -293,17 +267,19 @@ class ContributionTool(BaseTool):
"""
pass
security.declareProtected(Permissions.ModifyPortalContent,'getMatchedFileNamePatternDict')
def getMatchedFileNamePatternDict(self, file_name):
security.declareProtected(Permissions.ModifyPortalContent,
'getMatchedFilenamePatternDict')
def getMatchedFilenamePatternDict(self, filename):
"""
Get matched group dict of file name parsing regular expression.
"""
property_dict = {}
if file_name is None:
if filename is None:
return property_dict
regex_text = self.portal_preferences.getPreferredDocumentFileNameRegularExpression()
regex_text = self.portal_preferences.\
getPreferredDocumentFilenameRegularExpression()
if regex_text in ('', None):
return property_dict
......@@ -311,42 +287,55 @@ class ContributionTool(BaseTool):
pattern = re.compile(regex_text)
if pattern is not None:
try:
property_dict = pattern.match(file_name).groupdict()
property_dict = pattern.match(filename).groupdict()
except AttributeError: # no match
pass
return property_dict
security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromFileName')
def getPropertyDictFromFileName(self, file_name):
# backward compatibility
security.declareProtected(Permissions.ModifyPortalContent,
'getMatchedFileNamePatternDict')
def getMatchedFileNamePatternDict(self, filename):
"""
(deprecated) use getMatchedFilenamePatternDict() instead.
"""
warnings.warn('getMatchedFileNamePatternDict() is deprecated. '
'use getMatchedFilenamePatternDict() instead.')
return self.getMatchedFilenamePatternDict(filename)
security.declareProtected(Permissions.ModifyPortalContent,
'getPropertyDictFromFilename')
def getPropertyDictFromFilename(self, filename):
  """
  Extract document properties from a file name.

  The file name is parsed with the regular expression configured in
  preferences (its named groups become properties); the resulting
  dictionary is then post-processed by a type based method which may
  add or override values.
  """
  if filename is None:
    return {}
  matched_property_dict = self.getMatchedFilenamePatternDict(filename)
  type_based_method = self._getTypeBasedMethod(
    'getPropertyDictFromFilename',
    fallback_script_id='ContributionTool_getPropertyDictFromFilename')
  return type_based_method(filename, matched_property_dict)
# backward compatibility
security.declareProtected(Permissions.ModifyPortalContent,
'getPropertyDictFromFileName')
def getPropertyDictFromFileName(self, filename):
  """
  (deprecated) use getPropertyDictFromFilename() instead.

  Backward-compatibility alias kept for scripts still using the old
  file_name-based naming; it simply delegates to the new method.
  """
  # Emit the warning with the proper DeprecationWarning category so
  # that deprecation filters and test runners can detect it.
  warnings.warn('getPropertyDictFromFileName() is deprecated. '
                'use getPropertyDictFromFilename() instead.',
                DeprecationWarning)
  return self.getPropertyDictFromFilename(filename)
# WebDAV virtual folder support
def _setObject(self, name, ob, portal_type=None, user_login=None,
container=None, id=None, discover_metadata=1):
def _setObject(self, id, ob, portal_type=None, user_login=None,
container=None, discover_metadata=True, filename=None,
input_parameter_dict=None):
"""
portal_contribution_registry will find appropriate portal type
name by file_name and content itself.
name by filename and content itself.
The ContributionTool instance must be configured in such
way that _verifyObjectPaste will return TRUE.
......@@ -362,9 +351,8 @@ class ContributionTool(BaseTool):
# redefine parameters
portal_type = ob.getPortalType()
container = ob.getParentValue()
id = ob.getId()
if not portal_type:
document = BaseTool.newContent(self, id=name,
document = BaseTool.newContent(self, id=id,
portal_type=portal_type,
is_indexable=0)
else:
......@@ -379,33 +367,27 @@ class ContributionTool(BaseTool):
module = self.getDefaultModule(portal_type)
else:
module = container
if id is None:
new_id = module.generateNewId()
else:
new_id = id
existing_document = module._getOb(new_id, None)
if existing_document is None:
# There is no preexisting document - we can therefore
# set the new object
document = module.newContent(id=new_id,
portal_type=portal_type,
is_indexable=0)
# We can now discover metadata
if discover_metadata:
# Metadata discovery is done as an activity by default
# If we need to discoverMetadata synchronously, it must
# be for user interface and should thus be handled by
# ZODB scripts
document.activate(after_path_and_method_id=(document.getPath(),
('convertToBaseFormat', 'Document_tryToConvertToBaseFormat'))) \
.discoverMetadata(file_name=name, user_login=user_login)
else:
document = existing_document
# There is no preexisting document - we can therefore
# set the new object
document = module.newContent(portal_type=portal_type, is_indexable=0)
# We can now discover metadata
if discover_metadata:
# Metadata discovery is done as an activity by default
# If we need to discoverMetadata synchronously, it must
# be for user interface and should thus be handled by
# ZODB scripts
document.activate(after_path_and_method_id=(document.getPath(),
('convertToBaseFormat', 'Document_tryToConvertToBaseFormat'))) \
.discoverMetadata(filename=filename,
user_login=user_login,
input_parameter_dict=input_parameter_dict)
# Keep the document close to us - this is only useful for
# file upload from webdav
if not hasattr(self, '_v_document_cache'):
volatile_cache = getattr(self, '_v_document_cache', None)
if volatile_cache is None:
self._v_document_cache = {}
self._v_document_cache[document.getId()] = document.getRelativeUrl()
volatile_cache = self._v_document_cache
volatile_cache[document.getId()] = document.getRelativeUrl()
# Return document to newContent method
return document
......@@ -417,10 +399,11 @@ class ContributionTool(BaseTool):
"""
# Use the document cache if possible and return result immediately
# this is only useful for webdav
if hasattr(self, '_v_document_cache'):
document_url = self._v_document_cache.get(id, None)
volatile_cache = getattr(self, '_v_document_cache', None)
if volatile_cache is not None:
document_url = volatile_cache.get(id)
if document_url is not None:
del self._v_document_cache[id]
del volatile_cache[id]
return self.getPortalObject().unrestrictedTraverse(document_url)
# Try first to return the real object inside
......@@ -475,66 +458,11 @@ class ContributionTool(BaseTool):
def wrapper(o_list):
for o in o_list:
o = o.getObject()
id = '%s-%s' % (o.getUid(), o.getStandardFileName(),)
id = '%s-%s' % (o.getUid(), o.getStandardFilename(),)
yield o.asContext(id=id)
return wrapper(object_list)
# Crawling methods
security.declareProtected(Permissions.View, 'normaliseURL')
def normaliseURL(self, url, base_url=None):
  """
  Return a normalised version of ``url`` so that the same content is
  not downloaded twice. URL normalisation is an important part of a
  crawler; this implementation is deliberately simplistic. Refer to
  http://en.wikipedia.org/wiki/Web_crawler and study Harvestman for
  more ideas.
  """
  normalised = self.simple_normaliser.sub('', url)
  parts = normalised.split(':')
  if parts[0] in no_host_protocol_list:
    # Protocols without a host part are returned untouched.
    return normalised
  if base_url and len(parts) == 1:
    # No protocol prefix: treat as a relative URL and make it absolute.
    normalised = '%s/%s' % (base_url, normalised)
  return normalised
security.declareProtected(Permissions.View, 'encodeURL')
def encodeURL(self, url):
  """
  Return the URL encoded as an id.

  The id is chosen in such a way that it is optimal with HBTreeFolder
  (ie. so that distribution of access time on a cluster is possible):
  it is built as '<domain>-<md5 of the url>', falling back to the bare
  md5 digest when no domain can be extracted.
  NOTE: an alternate approach is based on a url table and catalog
  lookup. Is it faster? Not sure, since we must anyway insert objects
  in btrees and this is similar in cost to accessing them.
  """
  # Produce an MD5 from the URL
  hex_md5 = md5_new(url).hexdigest()
  # Take the first part in the URL which is not empty.
  # NOTE(review): assumes the URL contains ':' (raises IndexError
  # otherwise) -- confirm callers always pass absolute URLs.
  url_segment = url.split(':')[1]
  url_segment_list = url_segment.split('/')
  url_domain = None
  for url_part in url_segment_list:
    if url_part:
      url_domain = url_part
      break
  # Return encoded url.
  # (Dead code that followed the returns below -- a leftover of an
  # older quoting-based implementation -- has been removed.)
  if url_domain:
    url_domain = urllib.quote(url_domain, safe='')
    url_domain = url_domain.replace('%', '')
    return "%s-%s" % (url_domain, hex_md5)
  return hex_md5
security.declareProtected(Permissions.AddPortalContent, 'crawlContent')
def crawlContent(self, content, container=None):
"""
......@@ -543,6 +471,8 @@ class ContributionTool(BaseTool):
XXX: missing is the conversion of content local href to something
valid.
"""
portal = self.getPortalObject()
url_registry_tool = portal.portal_url_registry
depth = content.getCrawlingDepth()
if depth < 0:
# Do nothing if crawling depth is reached
......@@ -554,32 +484,34 @@ class ContributionTool(BaseTool):
if depth < 0:
# Do nothing if crawling depth is reached
return
base_url = content.getContentBaseURL()
url_list = map(lambda url: self.normaliseURL(url, base_url), set(content.getContentURLList()))
url_list = content.getContentNormalisedURLList()
for url in set(url_list):
# LOG('trying to crawl', 0, url)
# Some url protocols should not be crawled
if url.split(':')[0] in no_crawl_protocol_list:
if urlparse.urlsplit(url)[0] in no_crawl_protocol_list:
continue
if container is None:
#if content.getParentValue()
# in place of not ?
container = content.getParentValue()
# Calculate the id under which content will be stored
id = self.encodeURL(url)
# Try to access the document if it already exists
document = container.get(id, None)
if document is None:
# XXX - This call is not working due to missing group_method_id
# therefore, multiple call happen in parallel and eventually fail
# (the same URL is created multiple times)
# LOG('activate newContentFromURL', 0, url)
self.activate(activity="SQLQueue").newContentFromURL(container_path=container.getRelativeUrl(),
id=id, url=url, crawling_depth=depth)
elif depth and document.getCrawlingDepth() < depth:
# Update the crawling depth if necessary
document._setCrawlingDepth(depth)
document.activate().crawlContent()
try:
url_registry_tool.getReferenceFromURL(url, context=container)
except KeyError:
pass
else:
# url already crawled
continue
# XXX - This call is not working due to missing group_method_id
# therefore, multiple call happen in parallel and eventually fail
# (the same URL is created multiple times)
# LOG('activate newContentFromURL', 0, url)
self.activate(activity="SQLQueue").newContentFromURL(
container_path=container.getRelativeUrl(),
url=url, crawling_depth=depth)
# Url is not known yet but register right now to avoid
# creation of duplicated crawled content
# An activity will later setup the good reference for it.
url_registry_tool.registerURL(url, None, context=container)
security.declareProtected(Permissions.AddPortalContent, 'updateContentFromURL')
def updateContentFromURL(self, content, repeat=MAX_REPEAT, crawling_depth=0):
......@@ -595,10 +527,7 @@ class ContributionTool(BaseTool):
# Step 1: download new content
try:
url = content.asURL()
data = urllib2.urlopen(url).read()
file = cStringIO.StringIO()
file.write(data)
file.seek(0)
file_object, filename, content_type = self._openURL(url)
except urllib2.HTTPError, error:
if repeat == 0:
# XXX - Call the extendBadURLList method,--NOT Implemented--
......@@ -615,28 +544,28 @@ class ContributionTool(BaseTool):
content.activate(at_date=DateTime() + 1).updateContentFromURL(repeat=repeat - 1)
return
# Step 2: compare and update if necessary (md5)
# md5 stuff to compare contents
new_content_md5 = md5_new(data).hexdigest()
content_md5 = content.getContentMd5()
if content_md5 == new_content_md5:
return
content._edit(file=file)# Please make sure that if content is the same
content._edit(file=file_object, content_type=content_type)
# Please make sure that if content is the same
# we do not update it
# This feature must be implemented by Base or File
# not here (look at _edit in Base)
# Step 3: convert to base format
content.convertToBaseFormat()
# Step 2: convert to base format
if content.isSupportBaseDataConversion():
content.activate().Document_tryToConvertToBaseFormat()
# Step 3: run discoverMetadata
content.activate(after_path_and_method_id=(content.getPath(),
('convertToBaseFormat', 'Document_tryToConvertToBaseFormat'))) \
.discoverMetadata(filename=filename)
# Step 4: activate populate (unless interaction workflow does it)
content.activate().populateContent()
# Step 5: activate crawlContent
depth = content.getCrawlingDepth()
if depth > 0:
content.activate().crawlContent()
content.setContentMd5(new_content_md5)
security.declareProtected(Permissions.AddPortalContent, 'newContentFromURL')
def newContentFromURL(self, container_path=None, id=None, repeat=MAX_REPEAT, repeat_interval=1, batch_mode=True, **kw):
def newContentFromURL(self, container_path=None, id=None, repeat=MAX_REPEAT,
repeat_interval=1, batch_mode=True, url=None, **kw):
"""
A wrapper method for newContent which provides extra safety
in case or errors (ie. download, access, conflict, etc.).
......@@ -646,17 +575,13 @@ class ContributionTool(BaseTool):
the at_date parameter and some standard values.
NOTE: implementation needs to be done.
id parameter is ignored
"""
document = None
# First of all, make sure do not try to create an existing document
if container_path is not None and id is not None:
container = self.restrictedTraverse(container_path)
document = container.get(id, None)
if document is not None:
# Document already exists: no need to keep on crawling
return document
if not url:
raise TypeError, 'url parameter is mandatory'
try:
document = self.newContent(container_path=container_path, id=id, **kw)
document = self.newContent(container_path=container_path, url=url, **kw)
if document.isIndexContent() and document.getCrawlingDepth() >= 0:
# If this is an index document, keep on crawling even if crawling_depth is 0
document.activate().crawlContent()
......@@ -672,7 +597,7 @@ class ContributionTool(BaseTool):
if repeat > 0:
# Catch any HTTP error
self.activate(at_date=DateTime() + repeat_interval).newContentFromURL(
container_path=container_path, id=id,
container_path=container_path, url=url,
repeat=repeat - 1,
repeat_interval=repeat_interval, **kw)
except urllib2.URLError, error:
......@@ -685,28 +610,57 @@ class ContributionTool(BaseTool):
if repeat > 0:
self.activate(at_date=DateTime() + repeat_interval,
activity="SQLQueue").newContentFromURL(
container_path=container_path, id=id,
container_path=container_path, url=url,
repeat=repeat - 1,
repeat_interval=repeat_interval, **kw)
return document
def _guessPortalType(self, name, typ, body):
security.declareProtected(Permissions.AccessContentsInformation,
'guessMimeTypeFromFilename')
def guessMimeTypeFromFilename(self, filename):
"""
Call Portal Contribution Registry
to know which portal_type should be used
get mime type from file name
"""
findPortalTypeName = None
registry = getToolByName(self, 'portal_contribution_registry', None)
if registry is not None:
findPortalTypeName = registry.findPortalTypeName
else:
# Keep backward compatibility
registry = getToolByName(self, 'content_type_registry', None)
if registry is None:
return None
findPortalTypeName = registry.findTypeName
portal_type = findPortalTypeName(name, typ, body)
return portal_type
if not filename:
return
portal = self.getPortalObject()
content_type = portal.mimetypes_registry.lookupExtension(filename)
return content_type
def _openURL(self, url):
  """Download content from ``url``.

  Return a (file_object, filename, content_type) tuple: ``file_object``
  is a seekable in-memory copy of the downloaded data, ``filename`` is
  read from the Content-Disposition header when present (otherwise it
  is derived from the possibly redirected URL), and ``content_type``
  comes from the response headers.
  """
  # Quote the path part of url so urllib2 accepts unescaped characters.
  url_tuple = urlparse.urlsplit(url)
  quoted_path = urllib.quote(url_tuple[2])
  url = urlparse.urlunsplit((url_tuple[0], url_tuple[1], quoted_path,
                             url_tuple[3], url_tuple[4]))
  # build a new file from the url
  url_file = urllib2.urlopen(url)
  try:
    data = url_file.read() # time out must be set or ... too long XXX
    header_info = url_file.info()
    content_disposition = header_info.getheader('content-disposition', '')
    # In case of http redirection, the real url must be read from the
    # file object returned by urllib2.urlopen. It can happen when the
    # header 'Location' is present in the request.
    # See http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.30
    real_url = url_file.geturl()
  finally:
    # Close the network connection explicitly instead of leaking the
    # socket until garbage collection.
    url_file.close()
  file_object = cStringIO.StringIO()
  file_object.write(data)
  file_object.seek(0)
  # if a content-disposition header is present,
  # try first to read the suggested filename from it.
  filename = parse_header(content_disposition)[1].get('filename')
  if not filename:
    # Create a file name based on the last URL path segment and quote it.
    filename = urlparse.urlsplit(real_url)[-3]
    filename = os.path.basename(filename)
    filename = urllib.quote(filename, safe='')
    filename = filename.replace('%', '')
  content_type = header_info.gettype()
  return file_object, filename, content_type
InitializeClass(ContributionTool)
......@@ -50,7 +50,7 @@ from Tool import CategoryTool, SimulationTool, RuleTool, IdTool, TemplateTool,\
TrashTool, ContributionTool, NotificationTool, PasswordTool,\
GadgetTool, ContributionRegistryTool, IntrospectionTool,\
AcknowledgementTool, SolverTool, SolverProcessTool,\
ConversionTool, RoundingTool
ConversionTool, RoundingTool, UrlRegistryTool
import ERP5Site
from Document import PythonScript
object_classes = ( ERP5Site.ERP5Site,
......@@ -78,6 +78,7 @@ portal_tools = ( CategoryTool.CategoryTool,
SolverProcessTool.SolverProcessTool,
ConversionTool.ConversionTool,
RoundingTool.RoundingTool,
UrlRegistryTool.UrlRegistryTool,
)
content_classes = ()
content_constructors = ()
......
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>file_extension</string> </key>
<key> <string>extension_from_filename</string> </key>
<value>
<list>
<string>sxd</string>
......@@ -32,7 +32,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>file_extension</string>
<string>extension_from_filename</string>
</tuple>
</value>
</item>
......@@ -46,7 +46,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>60</int> </value>
<value> <int>10</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -60,7 +60,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Drawing</string> </value>
<value> <string>Drawing by extension</string> </value>
</item>
</dictionary>
</pickle>
......
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>file_extension</string> </key>
<key> <string>extension_from_filename</string> </key>
<value>
<list>
<string>gif</string>
......@@ -35,7 +35,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>file_extension</string>
<string>extension_from_filename</string>
</tuple>
</value>
</item>
......@@ -49,7 +49,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>20</int> </value>
<value> <int>10</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -63,7 +63,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Image</string> </value>
<value> <string>Image by extension</string> </value>
</item>
</dictionary>
</pickle>
......
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>file_extension</string> </key>
<key> <string>extension_from_filename</string> </key>
<value>
<list>
<string>pdf</string>
......@@ -31,7 +31,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>file_extension</string>
<string>extension_from_filename</string>
</tuple>
</value>
</item>
......@@ -45,7 +45,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>30</int> </value>
<value> <int>10</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -59,7 +59,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>PDF</string> </value>
<value> <string>PDF by extension</string> </value>
</item>
</dictionary>
</pickle>
......
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>mime_type</string> </key>
<key> <string>content_type</string> </key>
<value>
<list>
<string>application/pdf</string>
......@@ -31,7 +31,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>mime_type</string>
<string>content_type</string>
</tuple>
</value>
</item>
......@@ -45,7 +45,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>30</int> </value>
<value> <int>20</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -59,7 +59,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>PDF</string> </value>
<value> <string>PDF by mimetype</string> </value>
</item>
</dictionary>
</pickle>
......
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>file_extension</string> </key>
<key> <string>extension_from_filename</string> </key>
<value>
<list>
<string>ppt</string>
......@@ -34,7 +34,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>file_extension</string>
<string>extension_from_filename</string>
</tuple>
</value>
</item>
......@@ -48,7 +48,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>50</int> </value>
<value> <int>10</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -62,7 +62,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Presentation</string> </value>
<value> <string>Presentation by extension</string> </value>
</item>
</dictionary>
</pickle>
......
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_identity_criterion</string> </key>
<value>
<dictionary>
<item>
<key> <string>content_type_from_content</string> </key>
<value>
<list>
<string>application/vnd.ms-excel</string>
<string>application/vnd.ms-office</string>
<string>application/msexcel</string>
<string>application/vnd.oasis.opendocument.spreadsheet</string>
<string>application/vnd.oasis.opendocument.spreadsheet-template</string>
</list>
</value>
</item>
</dictionary>
</value>
</item>
<item>
<key> <string>_range_criterion</string> </key>
<value>
<dictionary/>
</value>
</item>
<item>
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>content_type_from_content</string>
</tuple>
</value>
</item>
<item>
<key> <string>destination_portal_type</string> </key>
<value> <string>Spreadsheet</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>spreadsheet_by_content</string> </value>
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>70</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
<value> <string>Contribution Predicate</string> </value>
</item>
<item>
<key> <string>test_method_id</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Spreadsheet by content</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>file_extension</string> </key>
<key> <string>extension_from_filename</string> </key>
<value>
<list>
<string>xls</string>
......@@ -35,7 +35,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>file_extension</string>
<string>extension_from_filename</string>
</tuple>
</value>
</item>
......@@ -49,7 +49,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>40</int> </value>
<value> <int>10</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -63,7 +63,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Spreadsheet</string> </value>
<value> <string>Spreadsheet by extension</string> </value>
</item>
</dictionary>
</pickle>
......
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_identity_criterion</string> </key>
<value>
<dictionary>
<item>
<key> <string>content_type</string> </key>
<value>
<list>
<string>text/plain</string>
</list>
</value>
</item>
</dictionary>
</value>
</item>
<item>
<key> <string>_range_criterion</string> </key>
<value>
<dictionary/>
</value>
</item>
<item>
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>content_type</string>
</tuple>
</value>
</item>
<item>
<key> <string>destination_portal_type</string> </key>
<value> <string>Text</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>text_by_conent_type</string> </value>
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>20</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
<value> <string>Contribution Predicate</string> </value>
</item>
<item>
<key> <string>test_method_id</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Text by content type</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_identity_criterion</string> </key>
<value>
<dictionary>
<item>
<key> <string>content_type_from_content</string> </key>
<value>
<list>
<string>text/plain</string>
</list>
</value>
</item>
</dictionary>
</value>
</item>
<item>
<key> <string>_range_criterion</string> </key>
<value>
<dictionary/>
</value>
</item>
<item>
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>content_type_from_content</string>
</tuple>
</value>
</item>
<item>
<key> <string>destination_portal_type</string> </key>
<value> <string>Text</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>text_by_content</string> </value>
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>70</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
<value> <string>Contribution Predicate</string> </value>
</item>
<item>
<key> <string>test_method_id</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Text by mimetype from data</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>file_extension</string> </key>
<key> <string>extension_from_filename</string> </key>
<value>
<list>
<string>txt</string>
......@@ -36,7 +36,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>file_extension</string>
<string>extension_from_filename</string>
</tuple>
</value>
</item>
......@@ -64,7 +64,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Text</string> </value>
<value> <string>Text by extension</string> </value>
</item>
</dictionary>
</pickle>
......
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="ContributionPredicate" module="Products.ERP5Type.Document.ContributionPredicate"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_identity_criterion</string> </key>
<value>
<dictionary>
<item>
<key> <string>content_type_from_content</string> </key>
<value>
<list>
<string>text/html</string>
</list>
</value>
</item>
</dictionary>
</value>
</item>
<item>
<key> <string>_range_criterion</string> </key>
<value>
<dictionary/>
</value>
</item>
<item>
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>content_type_from_content</string>
</tuple>
</value>
</item>
<item>
<key> <string>destination_portal_type</string> </key>
<value> <string>Web Page</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>web_page_by_content</string> </value>
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>70</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
<value> <string>Contribution Predicate</string> </value>
</item>
<item>
<key> <string>test_method_id</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Web Page by mimetype from data</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>file_extension</string> </key>
<key> <string>extension_from_filename</string> </key>
<value>
<list>
<string>html</string>
......@@ -33,7 +33,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>file_extension</string>
<string>extension_from_filename</string>
</tuple>
</value>
</item>
......@@ -47,7 +47,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>90</int> </value>
<value> <int>10</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -61,7 +61,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Web Page</string> </value>
<value> <string>Web Page by extension</string> </value>
</item>
</dictionary>
</pickle>
......
......@@ -11,7 +11,7 @@
<value>
<dictionary>
<item>
<key> <string>mime_type</string> </key>
<key> <string>content_type</string> </key>
<value>
<list>
<string>text/html</string>
......@@ -31,7 +31,7 @@
<key> <string>criterion_property</string> </key>
<value>
<tuple>
<string>mime_type</string>
<string>content_type</string>
</tuple>
</value>
</item>
......@@ -45,13 +45,7 @@
</item>
<item>
<key> <string>int_index</string> </key>
<value> <int>90</int> </value>
</item>
<item>
<key> <string>membership_criterion_base_category</string> </key>
<value>
<tuple/>
</value>
<value> <int>20</int> </value>
</item>
<item>
<key> <string>portal_type</string> </key>
......@@ -65,7 +59,7 @@
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Web Page</string> </value>
<value> <string>Web Page by mimetype</string> </value>
</item>
</dictionary>
</pickle>
......
......@@ -24,6 +24,22 @@
</tuple>
</value>
</item>
<item>
<key> <string>_Add_portal_content_Permission</string> </key>
<value>
<tuple>
<string>Manager</string>
</tuple>
</value>
</item>
<item>
<key> <string>_Delete_objects_Permission</string> </key>
<value>
<tuple>
<string>Manager</string>
</tuple>
</value>
</item>
<item>
<key> <string>_Modify_portal_content_Permission</string> </key>
<value>
......@@ -252,6 +268,22 @@ It\'s the lowest priority one; ie. managers can create higher priority preferenc
<key> <string>preferred_date_order</string> </key>
<value> <string>ymd</string> </value>
</item>
<item>
<key> <string>preferred_document_file_name_regular_expression</string> </key>
<value> <string encoding="cdata"><![CDATA[
(?P<reference>[A-Z&é@{]{3,7})-(?P<language>[a-z]{2})-(?P<version>[0-9]{3})
]]></string> </value>
</item>
<item>
<key> <string>preferred_document_reference_regular_expression</string> </key>
<value> <string encoding="cdata"><![CDATA[
(?P<reference>[A-Z&é@{]{3,7})(-(?P<language>[a-z]{2}))?(-(?P<version>[0-9]{3}))?
]]></string> </value>
</item>
<item>
<key> <string>preferred_event_assessment_form_id</string> </key>
<value>
......
......@@ -58,8 +58,8 @@ from zExceptions import Unauthorized\n
format = None\n
# Always force download of document even if format is supported\n
# by browser\n
file_name = context.getStandardFileName(format)\n
response.setHeader(\'Content-disposition\', \'attachment; filename="%s"\' % file_name)\n
filename = context.getStandardFilename(format)\n
response.setHeader(\'Content-disposition\', \'attachment; filename="%s"\' % filename)\n
\n
try:\n
return context.index_html(request, response, format)\n
......@@ -111,7 +111,7 @@ except Unauthorized:\n
<string>None</string>
<string>format</string>
<string>context</string>
<string>file_name</string>
<string>filename</string>
<string>msg</string>
<string>dict</string>
</tuple>
......
......@@ -222,12 +222,16 @@
<value>
<list>
<tuple>
<string>file_extension</string>
<string>file_extension</string>
<string>extension_from_filename</string>
<string>extension_from_filename</string>
</tuple>
<tuple>
<string>mime_type</string>
<string>mime_type</string>
<string>content_type</string>
<string>content_type</string>
</tuple>
<tuple>
<string>content_type_from_content</string>
<string>content_type_from_content</string>
</tuple>
</list>
</value>
......
......@@ -352,6 +352,10 @@
<key> <string>css_class</string> </key>
<value> <string></string> </value>
</item>
<item>
<key> <string>default_display_style</string> </key>
<value> <string>table</string> </value>
</item>
<item>
<key> <string>default_params</string> </key>
<value>
......@@ -362,6 +366,12 @@
<key> <string>description</string> </key>
<value> <string></string> </value>
</item>
<item>
<key> <string>display_style_list</string> </key>
<value>
<list/>
</value>
</item>
<item>
<key> <string>domain_root_list</string> </key>
<value>
......@@ -396,10 +406,18 @@
<list/>
</value>
</item>
<item>
<key> <string>global_search_column</string> </key>
<value> <string></string> </value>
</item>
<item>
<key> <string>hidden</string> </key>
<value> <int>0</int> </value>
</item>
<item>
<key> <string>hide_rows_on_no_search_criterion</string> </key>
<value> <int>0</int> </value>
</item>
<item>
<key> <string>lines</string> </key>
<value> <int>20</int> </value>
......@@ -425,6 +443,10 @@
</list>
</value>
</item>
<item>
<key> <string>page_navigation_mode</string> </key>
<value> <string>slider</string> </value>
</item>
<item>
<key> <string>page_template</string> </key>
<value> <string></string> </value>
......@@ -445,6 +467,10 @@
<key> <string>report_tree</string> </key>
<value> <int>0</int> </value>
</item>
<item>
<key> <string>row_css_method</string> </key>
<value> <string></string> </value>
</item>
<item>
<key> <string>search</string> </key>
<value> <int>0</int> </value>
......@@ -490,10 +516,22 @@
<key> <string>stat_method</string> </key>
<value> <string></string> </value>
</item>
<item>
<key> <string>style_columns</string> </key>
<value>
<list/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Contribution Predicates</string> </value>
</item>
<item>
<key> <string>untranslatable_columns</string> </key>
<value>
<list/>
</value>
</item>
<item>
<key> <string>url_columns</string> </key>
<value>
......
40819
\ No newline at end of file
40820
\ No newline at end of file
......@@ -22,8 +22,12 @@ portal_contribution_registry/image_extension
portal_contribution_registry/pdf_extension
portal_contribution_registry/pdf_mimetype
portal_contribution_registry/presentation_extension
portal_contribution_registry/spreadsheet_by_content
portal_contribution_registry/spreadsheet_extension
portal_contribution_registry/text_by_conent_type
portal_contribution_registry/text_by_content
portal_contribution_registry/text_extension
portal_contribution_registry/web_page_by_content
portal_contribution_registry/webpage_extension
portal_contribution_registry/webpage_mimetype
portal_domains/base_day_domain
......
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Jean-Paul Smets-Solanes <jp@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from zope.interface import Interface
class IDiscoverable(Interface):
  """
  Discoverable interface specification

  Documents which implement IDiscoverable provide methods to
  discover and update metadata properties from content, user
  input, filename, etc.
  """

  def getContentInformation():
    """
    Returns a dictionary of possible metadata which can be extracted from the
    document content (ex. title from an HTML file, creation date from a PDF
    document, etc.)
    """

  def getPropertyDictFromUserLogin(user_login=None):
    """
    Based on the user_login, find out all properties which
    can be discovered to later update document metadata.

    user_login -- optional user login ID; implementations may fall back
                  to the currently logged-in user when it is None
    """

  def getPropertyDictFromContent():
    """
    Based on the result of getContentInformation, find out all
    properties which can be discovered to later update document metadata.
    """

  def getPropertyDictFromFilename(filename):
    """
    Based on the file name, find out all properties which
    can be discovered to later update document metadata.

    filename -- file name to use in the discovery process
    """

  def getPropertyDictFromInput():
    """
    Based on the user input, find out all properties which
    can be discovered to later update document metadata.
    """

  def discoverMetadata(filename=None, user_login=None):
    """
    Updates the document metadata by discovering metadata from
    the user login, the document content, the file name and the
    user input. The order of discovery should be set in system
    preferences.

    filename -- optional file name (ex. AA-BBB-CCC-223-en.doc)
    user_login -- optional user login ID

    XXX - it is unclear if this method should also trigger finishIngestion
    and whether this should be documented here or not
    """

  def finishIngestion():
    """
    Finish the ingestion process (ex. allocate a reference number
    automatically if no reference was defined.)

    XXX - it is unclear if this method should be part of the interface
    """

  def getExtensionFromFilename():
    """Return the calculated value of the extension read from the filename.
    """

  def getContentTypeFromContent():
    """Return the calculated value of the content type read from the content.
    """
......@@ -87,7 +87,7 @@ class IDocument(Interface):
input - data supplied with http request or set on the object during (2) (e.g.
discovered from email text)
file_name - data which might be encoded in file name
filename - data which might be encoded in filename
user_login - information about user who is contributing the file
content - data which might be derived from document content
......
......@@ -52,11 +52,11 @@ class IDownloadable(Interface):
kw -- optional conversion parameters
"""
def getStandardFileName(format=None):
def getStandardFilename(format=None):
"""
Returns a standard file name for the document to download.
This method is the reverse of
IMetadataDiscoverable.getPropertyDictFromFileName.
IDiscoverable.getPropertyDictFromFilename.
format -- extension of returned file name
"""
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Nicolas Delaby <nicolas@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from zope.interface import Interface
class IUrl(Interface):
  """
  Url interface specification

  Documents which implement IUrl expose their location as a URL
  (RFC 1738) and provide helpers to parse and normalise it.
  """

  def asURL():
    """
    Returns a text representation of the Url if defined,
    or None otherwise.
    """

  def fromURL(url):
    """
    Analyses a URL and splits it into two parts. URLs
    normally follow RFC 1738. However, we accept URLs
    without the protocol a.k.a. scheme part (http, mailto, etc.). In this
    case only the url_string a.k.a. scheme-specific-part is taken
    into account. asURL will then generate the full URL.
    """

  def getURLServer():
    """
    Returns the server (host) part of a URL.
    """

  def getURLPort():
    """
    Returns the port part of a URL.
    """

  def getURLPath():
    """
    Returns the path part of a URL.
    """

  def asNormalisedURL(base_url=None):
    """
    Returns a normalised version of the url so
    that we do not download twice the same content.
    This normalisation must refer to the same resource !
    Refer to http://en.wikipedia.org/wiki/URL_normalization .

    base_url -- Specify a default URL and a default target
                for all links on a page.
                If url is a relative link, we try to compute an
                absolute url with the help of base_url.
    """
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Nicolas Delaby <nicolas@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from zope.interface import Interface
class IUrlRegistryTool(Interface):
  """Tool to register URLs.

  This tool aims to maintain consistency in URL management
  of crawlable sources, i.e. consistency between an external
  resource identifier and the generated document inside ERP5.

  Multiple URLs can be associated to the same reference.
  A System Preference can be used to configure the global namespace:
  this enables isolation of url mappings for different groups.

  This is a configurable tool supporting different scopes for mappings.
  It is thus possible to restrict the crawling of a URL
  to only once per portal, or to restrict the crawling of a url
  to the scope of an external_source or a module only
  (crawling the same URL multiple times within one portal).
  """

  def clearUrlRegistryTool(context=None):
    """Unregister all urls in all namespaces.
    Only available for Manager.

    context -- a context to access the container of mappings.
    """

  def registerURL(url, reference, context=None):
    """Register the mapping url:reference.

    This method is aimed to be called from an interaction_workflow
    which triggers on _setReference, in order to keep the association
    between url and reference up to date.

    url -- external resource identifier
    reference -- reference of the downloaded resource (ERP5 Object instance)
    context -- a context to access the container of mappings.
               If not passed, mappings are stored on the tool itself.
    """

  def getReferenceList(context=None):
    """Return all references registered by portal_url_registry
    according to the given context.

    context -- a context to access the container of mappings.
    """

  def getReferenceFromURL(url, context=None):
    """Return the reference of the document according to the provided url.

    url -- external resource identifier
    context -- a context to access the container of mappings.
               If not passed, mappings are stored on the tool itself.
    """

  def getURLListFromReference(reference, context=None):
    """Return the list of urls associated to the given reference
    and context.

    reference -- reference of the downloaded resource (ERP5 Object instance)
    context -- a context to access the container of mappings.
    """

  def updateUrlRegistryTool():
    """Rebuild all url mappings for the active preference.
    """
......@@ -139,10 +139,21 @@ class CachedConvertableMixin:
cached_value = data
conversion_md5 = md5_new(str(data.data)).hexdigest()
size = len(data.data)
else:
elif isinstance(data, (str, unicode,)):
cached_value = data
conversion_md5 = md5_new(cached_value).hexdigest()
size = len(cached_value)
elif isinstance(data, dict):
# Dict instance are used to store computed metadata
# from actual content.
# So this value is intimely related to cache of conversion.
# As it should be cleared each time the document is edited.
# Also may be a proper API should be used
cached_value = data
conversion_md5 = None
size = len(cached_value)
else:
raise NotImplementedError, 'Not able to store type:%r' % type(data)
if date is None:
date = DateTime()
stored_data_dict = {'content_md5': self.getContentMd5(),
......
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2009 Nexedi SA and Contributors. All Rights Reserved.
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Ivan Tyagov <ivan@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
......@@ -27,8 +27,13 @@
#
##############################################################################
from AccessControl import ClassSecurityInfo, getSecurityManager
from AccessControl import ClassSecurityInfo
from Products.ERP5Type import Permissions
from Products.ERP5Type.Utils import normaliseUrl
from Products.ERP5Type.DateUtils import convertDateToHour,\
number_of_hours_in_day, number_of_hours_in_year
from urlparse import urlsplit, urlunsplit
from lxml import html as etree_html
class CrawlableMixin:
"""
......@@ -80,3 +85,81 @@ class CrawlableMixin:
method = self._getTypeBasedMethod('isUpdatable',
fallback_script_id = 'Document_isUpdatable')
return method()
security.declareProtected(Permissions.AccessContentsInformation,
'getContentURLList')
def getContentURLList(self):
"""
Returns a list of URLs referenced by the content of this document.
Default implementation consists in analysing the document
converted to HTML. Subclasses may overload this method
if necessary. However, it is better to extend the conversion
methods in order to produce valid HTML, which is useful to
many people, rather than overload this method which is only
useful for crawling.
"""
html_content = self.asEntireHTML()
html_tree = etree_html.fromstring(html_content)
base_href = self.getContentBaseURL()
if base_href:
html_tree.make_links_absolute(base_href)
href_list = []
for elemnt, attribute_name, link, position in html_tree.iterlinks():
# For now take into acount only a and img tags
if attribute_name not in ('href',):
continue
if isinstance(link, unicode):
link = link.encode('utf-8')
href_list.append(link)
return href_list
security.declareProtected(Permissions.AccessContentsInformation,
'getContentBaseURL')
def getContentBaseURL(self):
"""
Returns the content base URL based on the actual content or
on its URL.
"""
raw_url = self.asURL() or ''
splitted_url = urlsplit(raw_url)
path_part = splitted_url[2]
path_part = '/'.join(path_part.split('/')[:-1])
base_url = urlunsplit((splitted_url[0], splitted_url[1], path_part, None,
None))
if isinstance(base_url, unicode):
base_url = base_url.encode('utf-8')
return base_url
security.declareProtected(Permissions.AccessContentsInformation,
'getContentNormalisedURLList')
def getContentNormalisedURLList(self):
"""
Call url normalizer for each url returned by getContentURLList
Return only url associated to the same Domain
"""
reference_domain = urlsplit(normaliseUrl(self.asURL() or ''))[1]
# in www.example.com or www.3.example.com
# keep only the example.com part
reference_domain = ''.join(reference_domain.split('.')[-2:])
if isinstance(reference_domain, unicode):
reference_domain = reference_domain.encode('utf-8')
url_list = []
base_url = self.getContentBaseURL()
for url in self.getContentURLList():
try:
url = normaliseUrl(url, base_url=base_url)
except UnicodeDecodeError:
# Ignore wrong encoding errors
# Web is not a kind world
continue
if not url:
continue
url_domain = urlsplit(url)[1]
if isinstance(url_domain, unicode):
url_domain = url_domain.encode('utf-8')
if url_domain and ''.join(url_domain.split('.')[-2:]) != reference_domain:
continue
# if domain is empty (relative link) or domain is same, then OK
url_list.append(url)
return url_list
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Ivan Tyagov <ivan@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from AccessControl import ClassSecurityInfo, getSecurityManager
from Products.ERP5Type import Permissions
from Products.ERP5Type.Utils import convertToUpperCase
from Products.CMFCore.utils import getToolByName
from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
import os
import re
try:
import magic
except ImportError:
magic = None
VALID_ORDER_KEY_LIST = ('user_login', 'content', 'filename', 'input')
CONTENT_INFORMATION_FORMAT = '_idiscoverable_content_information'
class DiscoverableMixin(CachedConvertableMixin):
  """
  Implements IDiscoverable.

  This class provides methods useful for metadata extraction.
  It inherits from CachedConvertableMixin to access the
  cache storage API, as computed data needs to be stored
  in the same backend.
  """
  security = ClassSecurityInfo()

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromUserLogin')
  def getPropertyDictFromUserLogin(self, user_login=None):
    """
    Based on the user_login, find out as many properties as needed.
    Returns properties which should be set on the document.

    user_login -- optional login string; defaults to the currently
                  logged-in user
    """
    if user_login is None:
      # Fall back to the user of the current security context.
      user_login = str(getSecurityManager().getUser())
    method = self._getTypeBasedMethod('getPropertyDictFromUserLogin',
        fallback_script_id='Document_getPropertyDictFromUserLogin')
    return method(user_login)

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromContent')
  def getPropertyDictFromContent(self):
    """
    Based on the document content, find out as many properties as needed.
    Returns properties which should be set on the document.
    """
    # access data through convert
    mime, content = self.convert(None)
    if not content:
      # if document is empty, we will not find anything in its content
      return {}
    method = self._getTypeBasedMethod('getPropertyDictFromContent',
        fallback_script_id='Document_getPropertyDictFromContent')
    return method()

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromFilename')
  def getPropertyDictFromFilename(self, filename):
    """
    Based on the file name, find out as many properties as needed.
    Returns properties which should be set on the document.

    filename -- file name to parse (delegated to portal_contributions)
    """
    return self.portal_contributions.getPropertyDictFromFilename(filename)

  # Backward-compatibility alias for the pre-rename API spelling.
  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromFileName')
  getPropertyDictFromFileName = getPropertyDictFromFilename

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromInput')
  def getPropertyDictFromInput(self, input_parameter_dict):
    """
    Fetch argument_dict, then pass this dictionary
    to the type-based getPropertyDictFromInput script.
    """
    method = self._getTypeBasedMethod('getPropertyDictFromInput')
    return method(input_parameter_dict)

  ### Metadata discovery and ingestion methods
  security.declareProtected(Permissions.ModifyPortalContent,
                            'discoverMetadata')
  def discoverMetadata(self, filename=None, user_login=None,
                       input_parameter_dict=None):
    """
    This is the main metadata discovery function - controls the process
    of discovering data from various sources. The discovery itself is
    delegated to scripts or uses preference-configurable regexps. The
    method returns either self or the document which has been
    merged in the discovery process.

    filename -- this parameter is a file name of the form "AA-BBB-CCC-223-en"
    user_login -- this is a login string of a person; can be None if the user
                  is currently logged in, then we'll get him from session
    input_parameter_dict -- arguments provided to create this content by user.
    """
    # Preference is made of a sequence of 'user_login', 'content',
    # 'filename', 'input'.  Later entries in the preferred order take
    # precedence, hence the reversal before iterating.
    method = self._getTypeBasedMethod('getPreferredDocumentMetadataDiscoveryOrderList')
    order_list = list(method())
    order_list.reverse()
    # build a dictionary according to the order
    kw = {}
    for order_id in order_list:
      result = None
      if order_id not in VALID_ORDER_KEY_LIST:
        # Prevent security attack or bad preferences
        raise AttributeError, "%s is not in valid order key list" % order_id
      # Dispatch to getPropertyDictFromUserLogin / ...FromContent /
      # ...FromFilename / ...FromInput according to the order key.
      method_id = 'getPropertyDictFrom%s' % convertToUpperCase(order_id)
      method = getattr(self, method_id)
      if order_id == 'filename':
        if filename is not None:
          result = method(filename)
      elif order_id == 'user_login':
        if user_login is not None:
          result = method(user_login)
      elif order_id == 'input':
        if input_parameter_dict is not None:
          result = method(input_parameter_dict)
      else:
        result = method()
      if result is not None:
        # Empty values never override previously discovered ones.
        for key, value in result.iteritems():
          if value not in (None, ''):
            kw[key]=value
    # Prepare the content edit parameters
    portal_type = kw.pop('portal_type', None)
    if portal_type and portal_type != self.getPortalType():
      # Reingestion is required to update portal_type
      return self.migratePortalType(portal_type)
    # Try not to invoke an automatic transition here
    self._edit(**kw)
    if not portal_type:
      # If no portal_type was discovered, pass self
      # through to portal_contribution_registry
      # to guess destination portal_type against all properties.
      # If returned portal_type is different, then reingest.
      registry = getToolByName(self.getPortalObject(),
                               'portal_contribution_registry')
      portal_type = registry.findPortalTypeName(context=self)
      if portal_type != self.getPortalType():
        return self.migratePortalType(portal_type)
    # Finish ingestion by calling method
    self.finishIngestion() # XXX - is this really the right place ?
    self.reindexObject() # XXX - is this really the right place ?
    # Revision merge is tightly coupled
    # to metadata discovery - refer to the documentation of mergeRevision method
    merged_doc = self.mergeRevision() # XXX - is this really the right place ?
    merged_doc.reindexObject() # XXX - is this really the right place ?
    return merged_doc # XXX - is this really the right place ?

  security.declareProtected(Permissions.ModifyPortalContent, 'finishIngestion')
  def finishIngestion(self):
    """
    Finish the ingestion process by calling the appropriate script. This
    script can for example allocate a reference number automatically if
    no reference was defined.
    """
    method = self._getTypeBasedMethod('finishIngestion',
        fallback_script_id='Document_finishIngestion')
    return method()

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getContentTypeFromContent')
  def getContentTypeFromContent(self):
    """
    Return content_type read from metadata extraction of content.
    This method is called by portal_contribution_registry.
    Returns None when the document has no content or when the
    python-magic library is not available.
    """
    mime, content = self.convert(None)
    if not content:
      return
    if magic is not None:
      # This will be delegated soon to external web service
      # like cloudooo
      # ERP5 will no longer handle data itself.
      mimedetector = magic.Magic(mime=True)
      return mimedetector.from_buffer(content)

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getExtensionFromFilename')
  def getExtensionFromFilename(self, filename=None):
    """
    Return extension read from filename in lower case.

    filename -- optional file name; defaults to getStandardFilename()
    """
    if not filename:
      filename = self.getStandardFilename()
    basename, extension = os.path.splitext(filename)
    if extension:
      extension = extension[1:].lower() # remove first dot
    return extension

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getContentInformation')
  def getContentInformation(self):
    """
    Call the private implementation, then store the result in the
    conversion cache storage under the CONTENT_INFORMATION_FORMAT key.
    """
    format = CONTENT_INFORMATION_FORMAT
    # EAFP: a KeyError from getConversion means the value is not cached yet.
    try:
      mime, cached_value = self.getConversion(format=format)
      return cached_value
    except KeyError:
      value = self._getContentInformation()
      self.setConversion(value, format=format)
      return value

  def _getContentInformation(self):
    """
    Returns the content information from the HTML conversion.
    The default implementation tries to build a dictionary
    from the HTML conversion of the document and extract
    the document title.
    """
    result = {}
    html = self.asEntireHTML()
    if not html:
      return result
    # NOTE(review): self.title_parser is expected to be a regexp defined
    # on the concrete document class - confirm against subclasses.
    title_list = re.findall(self.title_parser, str(html))
    if title_list:
      result['title'] = title_list[0]
    return result
......@@ -31,6 +31,7 @@ from Products.ERP5Type import Permissions
from Products.ERP5Type.Utils import fill_args_from_request
from Products.CMFCore.utils import getToolByName, _setCacheHeaders,\
_ViewEmulator
import warnings
_MARKER = []
......@@ -108,15 +109,31 @@ class DownloadableMixin:
return str(data)
security.declareProtected(Permissions.AccessContentsInformation,
'getStandardFileName')
def getStandardFileName(self, format=None):
'getStandardFilename')
def getStandardFilename(self, format=None):
"""Returns the document coordinates as a standard file name. This
method is the reverse of getPropertyDictFromFileName.
"""
method = self._getTypeBasedMethod('getStandardFileName',
method = self._getTypeBasedMethod('getStandardFilename',
fallback_script_id='Document_getStandardFilename')
if method is None:
# backward compatibility
method = self._getTypeBasedMethod('getStandardFileName',
fallback_script_id='Document_getStandardFileName')
return method(format=format)
# backward compatibility
security.declareProtected(Permissions.AccessContentsInformation,
'getStandardFileName')
def getStandardFileName(self, format=None):
"""(deprecated) use getStandardFilename() instead."""
warnings.warn('getStandardFileName() is deprecated. '
'use getStandardFilename() instead.')
return self.getStandardFilename(format=format)
method = self._getTypeBasedMethod('getStandardFilename',
fallback_script_id='Document_getStandardFilename')
return method(format=format)
def manage_FTPget(self):
"""Return body for ftp. and WebDAV
"""
......
......@@ -43,6 +43,7 @@ from zExceptions import BadRequest
from Products.ERP5Type.tests.backportUnittest import skip
from Products.ERP5Type.Tool.ClassTool import _aq_reset
from Products.ERP5Type.Workflow import addWorkflowByType
from Products.CMFCore.WorkflowCore import WorkflowException
def getDummyTypeBaseMethod(self):
""" Use a type Base method
......@@ -1248,6 +1249,43 @@ class TestBase(ERP5TypeTestCase, ZopeTestCase.Functional):
self.assertFalse(person.isIndexable)
self.assertEquals(0, len(self.portal.portal_catalog(uid=person.getUid())))
  def test_metaWorkflowTransition(self):
    """Test meta transition: jump from one state to another without
    an explicitly defined transition.
    """
    module = self.portal.person_module
    person = module.newContent(portal_type='Person')
    self.assertEquals(person.getValidationState(), 'draft')
    # 'invalidate' is not reachable from 'draft' through a regular transition
    self.assertFalse(self.portal.portal_workflow.isTransitionPossible(person,
                                                                  'invalidate'))
    # test low-level implementation
    self.portal.portal_workflow.validation_workflow._executeMetaTransition(
      person, 'invalidated')
    self.assertEquals(person.getValidationState(), 'invalidated')
    # the jump must be recorded in the workflow history with a comment
    validation_history = person.workflow_history['validation_workflow']
    self.assertEquals(len(validation_history), 2)
    self.assertEquals(validation_history[-1]['comment'],
                      'Jump from \'draft\' to \'invalidated\'')
    person = module.newContent(portal_type='Person')
    self.assertEquals(person.getValidationState(), 'draft')
    # test high-level implementation
    self.portal.portal_workflow._jumpToStateFor(person, 'invalidated')
    self.assertEquals(person.getValidationState(), 'invalidated')

    # the workflow can also be selected explicitly with wf_id
    person = module.newContent(portal_type='Person')
    self.assertEquals(person.getValidationState(), 'draft')
    self.portal.portal_workflow._jumpToStateFor(person, 'invalidated',
                                                wf_id='validation_workflow')
    self.assertEquals(person.getValidationState(), 'invalidated')

    # jumping to a state which does not exist in the given workflow
    # must fail and leave the document state unchanged
    person = module.newContent(portal_type='Person')
    self.assertEquals(person.getValidationState(), 'draft')
    self.assertRaises(WorkflowException,
                      self.portal.portal_workflow._jumpToStateFor,
                      person, 'invalidated', wf_id='edit_workflow')
    self.assertEquals(person.getValidationState(), 'draft')
class TestERP5PropertyManager(unittest.TestCase):
"""Tests for ERP5PropertyManager.
"""
......
......@@ -36,7 +36,7 @@ from Products.CMFCore.WorkflowCore import WorkflowException
from Products.ERP5Type.tests.utils import DummyMailHost, FileUpload
from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase,\
_getConversionServerDict
from Products.ERP5OOo.tests.testIngestion import FILE_NAME_REGULAR_EXPRESSION
from Products.ERP5OOo.tests.testIngestion import FILENAME_REGULAR_EXPRESSION
from Products.ERP5OOo.tests.testIngestion import REFERENCE_REGULAR_EXPRESSION
from Products.ERP5Type.tests.backportUnittest import expectedFailure
......@@ -443,7 +443,7 @@ class TestCRMMailIngestion(BaseTestCRM):
data=self._readTestData(filename)
return self.portal.portal_contributions.newContent(
container_path='event_module',
file_name='postfix_mail.eml',
filename='postfix_mail.eml',
data=data)
def test_findTypeByName_MailMessage(self):
......@@ -451,7 +451,7 @@ class TestCRMMailIngestion(BaseTestCRM):
self.assertEquals(
'Mail Message',
self.portal.portal_contribution_registry.findPortalTypeName(
file_name='postfix_mail.eml', mime_type='message/rfc822', data='Test'
filename='postfix_mail.eml', content_type='message/rfc822', data='Test'
))
def test_Base_getEntityListFromFromHeader(self):
......@@ -767,7 +767,7 @@ class TestCRMMailSend(BaseTestCRM):
conversion_dict = _getConversionServerDict()
default_pref.setPreferredOoodocServerAddress(conversion_dict['hostname'])
default_pref.setPreferredOoodocServerPortNumber(conversion_dict['port'])
default_pref.setPreferredDocumentFileNameRegularExpression(FILE_NAME_REGULAR_EXPRESSION)
default_pref.setPreferredDocumentFileNameRegularExpression(FILENAME_REGULAR_EXPRESSION)
default_pref.setPreferredDocumentReferenceRegularExpression(REFERENCE_REGULAR_EXPRESSION)
if default_pref.getPreferenceState() == 'disabled':
default_pref.enable()
......
......@@ -120,36 +120,36 @@ return predicate.getDestinationPortalType()
tool = self.portal.portal_contribution_registry
# Test extension matching
self.assertEqual(tool.findPortalTypeName(file_name='test.txt'), 'Text')
self.assertEqual(tool.findPortalTypeName(file_name='test.odt'), 'Text')
self.assertEqual(tool.findPortalTypeName(file_name='001.jpg'), 'Image')
self.assertEqual(tool.findPortalTypeName(file_name='002.PNG'), 'Image')
self.assertEqual(tool.findPortalTypeName(file_name='002.PNG'), 'Image')
self.assertEqual(tool.findPortalTypeName(file_name='index.html'), 'Web Page')
self.assertEqual(tool.findPortalTypeName(filename='test.txt'), 'Text')
self.assertEqual(tool.findPortalTypeName(filename='test.odt'), 'Text')
self.assertEqual(tool.findPortalTypeName(filename='001.jpg'), 'Image')
self.assertEqual(tool.findPortalTypeName(filename='002.png'), 'Image')
self.assertEqual(tool.findPortalTypeName(filename='002.PNG'), 'Image')
self.assertEqual(tool.findPortalTypeName(filename='index.html'), 'Web Page')
# Unknown extension
self.assertEqual(tool.findPortalTypeName(file_name='index.xxx'), 'File')
self.assertEqual(tool.findPortalTypeName(filename='index.xxx'), 'File')
# Test mimetype matching
self.assertEqual(tool.findPortalTypeName(mime_type='text/html'), 'Web Page')
self.assertEqual(tool.findPortalTypeName(content_type='text/html'), 'Web Page')
# Unknown mimetype
self.assertEqual(tool.findPortalTypeName(mime_type='application/octet-stream'), 'File')
self.assertEqual(tool.findPortalTypeName(content_type='application/octet-stream'), 'File')
# Test both of extension and mimetype
self.assertNotEqual(tool.findPortalTypeName(file_name='message.eml'),
self.assertNotEqual(tool.findPortalTypeName(filename='message.eml'),
'Mail Message')
self.assertNotEqual(tool.findPortalTypeName(mime_type='message/rfc822'),
self.assertNotEqual(tool.findPortalTypeName(content_type='message/rfc822'),
'Mail Message')
self.assertEqual(tool.findPortalTypeName(file_name='message.eml',
mime_type='message/rfc822'),
self.assertEqual(tool.findPortalTypeName(filename='message.eml',
content_type='message/rfc822'),
'Mail Message')
# Test test script
data = """\
Subject: Fax
"""
self.assertEqual(tool.findPortalTypeName(file_name='message.eml',
mime_type='message/rfc822',
self.assertEqual(tool.findPortalTypeName(filename='message.eml',
content_type='message/rfc822',
data=data),
'Fax Message')
......
......@@ -37,7 +37,8 @@ from AccessControl.SecurityManagement import newSecurityManager
from Testing import ZopeTestCase
from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase,\
_getConversionServerDict
from Products.ERP5Type.tests.utils import FileUpload
from Products.ERP5Type.tests.utils import FileUpload, createZODBPythonScript
LANGUAGE_LIST = ('en', 'fr', 'de', 'bg',)
......@@ -568,8 +569,21 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
def test_PreviewOOoDocumentWithEmbeddedImage(self):
"""Tests html preview of an OOo document with images as extensible content.
For this test, Presentation_checkConversionFormatPermission does not allow
access to original format for Unauthenticated users.
Chack that user can still access to other format.
"""
portal = self.portal
script_id = 'Presentation_checkConversionFormatPermission'
python_code = """from AccessControl import getSecurityManager
user = getSecurityManager().getUser()
if (not user or not user.getId()) and not format:
return False
return True
"""
createZODBPythonScript(portal.portal_skins.custom, script_id,
'format, **kw', python_code)
request = portal.REQUEST
request['PARENTS'] = [self.app]
self.getPortalObject().aq_parent.acl_users._doAddUser(
......@@ -611,7 +625,7 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
# then publish the document and access it anonymously by reference through
# the web site
document.publish()
transaction.commit()
self.tic()
......@@ -620,7 +634,7 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
self.assertTrue(response.getHeader('content-type').startswith('text/html'))
html = response.getBody()
self.assertTrue('<img' in html, html)
# find the img src
img_list = etree.HTML(html).findall('.//img')
self.assertEquals(1, len(img_list))
......@@ -633,6 +647,22 @@ class TestERP5WebWithDms(ERP5TypeTestCase, ZopeTestCase.Functional):
png = response.getBody()
self.assertTrue(png.startswith('\x89PNG'))
# Now purge cache and let Anonymous user converting the document.
self.login()
document.edit() # Reset cache key
transaction.commit()
self.tic()
response = self.publish('%s/%s/asEntireHTML' % (
website.absolute_url_path(), document_reference))
self.assertTrue(response.getHeader('content-type').startswith('text/html'))
html = response.getBody()
self.assertTrue('<img' in html, html)
# find the img src
img_list = etree.HTML(html).findall('.//img')
self.assertEquals(1, len(img_list))
src = img_list[0].get('src')
def test_ImageConversionThroughWebSite(self):
"""Check that conversion parameters pass in url
are hounoured to display an image in context of a website
......
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Nicolas Delaby <nicolas@erp5.org>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
import unittest
from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase,\
_getConversionServerDict
import transaction
# Regular expressions registered on the System Preference (see
# TestWebCrawler.setSystemPreference) so that contributed filenames and
# references of the form "REF-en-001" are split into the named groups
# reference / language / version.
FILENAME_REGULAR_EXPRESSION = "(?P<reference>[A-Z&é@{]{3,7})-(?P<language>[a-z]{2})-(?P<version>[0-9]{3})"
REFERENCE_REGULAR_EXPRESSION = "(?P<reference>[A-Z&é@{]{3,7})(-(?P<language>[a-z]{2}))?(-(?P<version>[0-9]{3}))?"
class TestWebCrawler(ERP5TypeTestCase):
  """
  Test Crawling mechanism

  Builds a small self-hosted Web Site fixture (bootstrapWebSite), then
  checks the URL normalisation helpers of Web Pages (test_01) and the
  crawling of that site through a 'URL Crawler' document, including the
  duplicate-URL blocking done by portal_url_registry (test_02).
  """
  # NOTE(review): mutable class-level list, shared by all instances and
  # never used in this file -- confirm it is still needed.
  _path_to_delete_list = []
  # Id of the System Preference created/reused by setSystemPreference().
  system_pref_id = 'my_preference'

  def getTitle(self):
    """
    Return the title of the current test set.
    """
    return "ERP5 Live DMS - Web Crawling"

  def getBusinessTemplateList(self):
    """
    Return the list of required business templates.
    """
    return ('erp5_base',
            'erp5_ingestion',
            'erp5_ingestion_mysql_innodb_catalog',
            'erp5_web',
            'erp5_dms')

  def afterSetUp(self):
    """
    Initialize the ERP5 site.
    """
    self.login()
    self.portal = self.getPortal()
    self.setSystemPreference()
    self.bootstrapWebSite()
    # Make the fixture visible to the catalog before any test runs.
    transaction.commit()
    self.tic()

  def beforeTearDown(self):
    """Delete every document created by the test, then reindex."""
    portal = self.portal
    module_id_list = [
      'web_page_module',
      'web_site_module',
      'external_source_module',
      'document_module',
      ]
    # delete created documents by test
    for module_id in module_id_list:
      module = portal[module_id]
      module.manage_delObjects(list(module.objectIds()))
    # Unindex deleted documents
    transaction.commit()
    self.tic()

  def setSystemPreference(self):
    """Create (or reuse) the System Preference holding the conversion
    server address/port and the filename/reference regular expressions,
    and make sure it is globally enabled.
    """
    portal_preferences = self.portal.portal_preferences
    system_preference = portal_preferences._getOb(self.system_pref_id, None)
    if system_preference is None:
      system_preference = portal_preferences.newContent(id=self.system_pref_id,
                                              portal_type='System Preference')
    conversion_dict = _getConversionServerDict()
    system_preference.\
        setPreferredOoodocServerAddress(conversion_dict['hostname'])
    system_preference.\
        setPreferredOoodocServerPortNumber(conversion_dict['port'])
    system_preference.setPreferredDocumentFilenameRegularExpression(
        FILENAME_REGULAR_EXPRESSION)
    system_preference.setPreferredDocumentReferenceRegularExpression(
        REFERENCE_REGULAR_EXPRESSION)
    # Enable only when the preference is not already global.
    if system_preference.getPreferenceState() != 'global':
      system_preference.enable()

  def bootstrapWebSite(self):
    """Create 1 Website
         live_test_web_site/section1/section1a
                           /section2
       create 2 web pages
         W-REFERENCE.PAGE
         W-REFERENCE.HOMEPAGE
       the website use light version of erp5_web_layout
       It keep just displaying sections and subsection
       And default Web page
    """
    web_site_portal_type = 'Web Site'
    web_section_portal_type = 'Web Section'
    web_page_portal_type = 'Web Page'
    web_site_module = self.portal.getDefaultModule(web_site_portal_type)
    web_page_module = self.portal.getDefaultModule(web_page_portal_type)
    # Home page holds a relative link (by reference) to the other page.
    text_content = """<p><a href="W-REFERENCE.PAGE">Page</a></p>"""
    web_page_id = 'live_test_home'
    home_page = web_page_module.newContent(portal_type=web_page_portal_type,
                                           title='Home Page',
                                           text_content=text_content,
                                           reference='W-REFERENCE.HOMEPAGE',
                                           version='001',
                                           language='en',
                                           id=web_page_id)
    home_page.submit()
    home_page.publish()
    web_site_id = 'live_test_web_site'
    web_site = web_site_module.newContent(portal_type=web_site_portal_type,
                                          id=web_site_id,
                                          title='Live Test Web Site',
                                          visible=True,
                                          default_page_displayed=True,
                                          site_map_section_parent=True,
                                          authorization_forced=True,
                                          aggregate_value=home_page,
                                          available_language_set=['en'],
                                container_layout='erp5_web_layout_test',
                                content_layout='erp5_web_content_layout_test')
    web_site.publish()
    # Second page links back to the home page with an absolute URL.
    text_content = """<p>
    <a href="%s/W-REFERENCE.HOMEPAGE">absolute link to HOME PAGE</a>
    </p>""" % web_site.absolute_url()
    section1a_page = web_page_module.newContent(
                                          portal_type=web_page_portal_type,
                                          title='Home Page',
                                          text_content=text_content,
                                          reference='W-REFERENCE.PAGE',
                                          version='001',
                                          language='en')
    section1a_page.submit()
    section1a_page.publish()
    web_section1 = web_site.newContent(portal_type=web_section_portal_type,
                                       title='Section 1',
                                       id='section1',
                                       aggregate_value=section1a_page)
    web_section2 = web_site.newContent(portal_type=web_section_portal_type,
                                       title='Section 2',
                                       id='section2',
                                       aggregate_value=section1a_page)
    web_section1a = web_section1.newContent(
                                       portal_type=web_section_portal_type,
                                       title='Section 1a',
                                       id='section 1a', #add a space in id
                                       aggregate_value=section1a_page)

  def test_01_check_URLTransformations(self):
    """Check crawlable functionalities regarding URL handling
       getContentBaseURL
       asNormalisedURL
       getContentNormalisedURLList
    """
    web_page_portal_type = 'Web Page'
    web_page_module = self.portal.getDefaultModule(web_page_portal_type)
    web_page = web_page_module.newContent(portal_type=web_page_portal_type)
    # No URL yet: base URL is empty.
    self.assertEquals(web_page.getContentBaseURL(), '')
    web_page.fromURL('http://www.example.com')
    self.assertEquals(web_page.getContentBaseURL(), 'http://www.example.com')
    # Base URL is the URL with its last path segment dropped.
    web_page.fromURL('http://www.example.com/section/sub_section')
    self.assertEquals(web_page.getContentBaseURL(),
                      'http://www.example.com/section')
    # A <base> tag overrides the URL-derived base; external links, images,
    # scripts, badly-encoded and malformed links must all be filtered out.
    text_content = """<html>
    <head>
    <base href="http://www.example.com"/>
    </head>
    <body>
    <p><a href="http://www.notexample.com/">External link</a></p>
    <p><a href="http://www.example.com//I don't care I put what/ I want/">
    Funny link</a></p>
    <p><a href="http://www.example.com/section">Internal link</a></p>
    <p><a href="section2">Relative Internal link</a></p>
    <p><a href="http://www.example.com/?title=%E9+crit">With Encoding issue
    This link will be discarded</a></p>
    <img src="my_image_link"/>
    <script src="should_not_be_followed.js"/>
    <p><a href="http://http://www.example.com/section">Not a link</a></p>
    </body>
    </html>"""
    web_page.edit(text_content=text_content)
    self.assertEquals(web_page.getContentBaseURL(), "http://www.example.com")
    self.assertEquals(web_page.getContentNormalisedURLList(),
              ["http://www.example.com/I don't care I put what/ I want/",
               'http://www.example.com/section',
               'http://www.example.com/section2',])
    # relative links without base tag
    text_content = """<html>
    <head>
    </head>
    <body>
    <p><a href="section2">Relative Internal link</a></p>
    </body>
    </html>"""
    web_page.edit(text_content=text_content)
    web_page.fromURL('http://www.example.com/#fffff')
    self.assertEquals(web_page.getContentBaseURL(), "http://www.example.com")
    self.assertEquals(web_page.getContentNormalisedURLList(),
                      ['http://www.example.com/section2',])
    self.assertEquals(web_page.asNormalisedURL(),
                      'http://www.example.com/#fffff')

  def test_02_crawlWebSite(self):
    """Call portal_contribution to crawl website hosted by itself.
    """
    web_site = self.portal.web_site_module.live_test_web_site
    external_source_portal_type = 'URL Crawler'
    web_crawler_module = self.portal.getDefaultModule(
                                              external_source_portal_type)
    web_crawler = web_crawler_module.newContent(
                                  portal_type=external_source_portal_type,
                                  crawling_depth=5)
    web_crawler.fromURL(web_site.absolute_url())
    transaction.commit()
    self.tic()
    web_crawler.crawlContent()
    transaction.commit()
    self.tic()
    # 6 = 1 website
    #   + 3 Web Sections
    #   + 1 absolute link to home_page
    #   + 1 relative link from home_page to another web page
    self.assertEquals(len(web_crawler), 6)
    self.assertEquals(len(self.portal.portal_url_registry._getMappingDict()),
                      6)
    date_before = web_crawler.getModificationDate()
    web_crawler.crawlContent()
    transaction.commit()
    self.tic()
    # Nothing happens: portal_url_registry prevents crawling
    # the same url twice
    self.assertEquals(len(web_crawler), 6)
    self.assertEquals(len(self.portal.portal_url_registry._getMappingDict()),
                      6)
    # not modified
    self.assertEquals(date_before, web_crawler.getModificationDate())
    new_web_crawler = web_crawler_module.newContent(
                                  portal_type=external_source_portal_type,
                                  crawling_depth=5)
    new_web_crawler.fromURL(web_site.absolute_url())
    transaction.commit()
    self.tic()
    new_web_crawler.crawlContent()
    transaction.commit()
    self.tic()
    # check that portal_url_registry
    # block contribution of existing content
    self.assertFalse(len(new_web_crawler))
    # set another namespace on preference
    preference = self.portal.portal_preferences[self.system_pref_id]
    preference.setPreferredIngestionNamespace('NEW')
    transaction.commit()
    self.tic()
    new_web_crawler.crawlContent()
    transaction.commit()
    self.tic()
    # NOTE(review): after switching the ingestion namespace this probably
    # meant to assert on new_web_crawler (which should now be allowed to
    # crawl again) rather than re-checking web_crawler -- confirm intent.
    self.assertEquals(len(web_crawler), 6)
def test_suite():
  """Return the suite holding every test of TestWebCrawler."""
  return unittest.TestSuite((unittest.makeSuite(TestWebCrawler),))
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment