diff --git a/product/ERP5/Tool/ContributionTool.py b/product/ERP5/Tool/ContributionTool.py
index ea9dd252c478f08acb5e8eef52cada5b5f98bf92..f13b3503c52e702b89f256d13e01b1a230fe34f8 100644
--- a/product/ERP5/Tool/ContributionTool.py
+++ b/product/ERP5/Tool/ContributionTool.py
@@ -51,7 +51,7 @@ urllib2.install_opener(opener)
 
 # A temporary hack until urllib2 supports timeout setting - XXX
 import socket
-socket.setdefaulttimeout(60) # 1 minute timeout
+socket.setdefaulttimeout(600) # 10 minutes timeout
 
 # Global parameters
 TEMP_NEW_OBJECT_KEY = '_v_new_object'
@@ -98,7 +98,7 @@ class ContributionTool(BaseTool):
   manage_overview = DTMLFile( 'explainContributionTool', _dtmldir )
 
   security.declarePrivate('findTypeName')
-  def findTypeName(self, file_name, document):
+  def findTypeName(self, file_name, document, container=None):
     """
       Finds the appropriate portal type based on the file name
       or if necessary the content of the document.
@@ -140,6 +140,30 @@ class ContributionTool(BaseTool):
       return document.portal_type
     valid_portal_type_list = [document.portal_type] + extra_valid_portal_type_list
+    # LOG('valid_portal_type_list', 0, str(valid_portal_type_list))
+
+    # If a container is defined, filter valid portal types with allowedContentTypes
+    if container is not None:
+      allowed_type_list = map(lambda x: x.id, container.allowedContentTypes())
+      # LOG('allowed_type_list', 0, str(allowed_type_list))
+      valid_portal_type_list = filter(lambda x: x in allowed_type_list, valid_portal_type_list)
+      # LOG('filtered valid_portal_type_list', 0, str(valid_portal_type_list))
+
+    # Check if there is any intersection with index portal types
+    # If not, we do not need to even check if content is an index
+    is_index_candidate = False
+    for index_type in self.getPortalCrawlerIndexTypeList():
+      if index_type in valid_portal_type_list:
+        is_index_candidate = True
+        candidate_index_type = index_type
+
+    if is_index_candidate and document.isIndexContent(container=container):
+      # If this document has to be created inside an External Source (container)
+      # we need to analyse its content to determine whether it is or not
+      # an index document. Index documents should not be searchable as documents
+      # and should not be considered in the depth calculation of the crawling
+      # process
+      return candidate_index_type # We suppose that there is only one index type in allowed content types
 
     # Check if the filename tells which portal_type this is
     portal_type_list = self.getPropertyDictFromFileName(file_name).get('portal_type', [])
 
@@ -151,7 +175,7 @@ class ContributionTool(BaseTool):
     # if we have only one, then this is it
     # LOG('findTypeName single portal_type_list', 0, portal_type_list[0])
     return portal_type_list[0]
-    
+
     # If it is still None, we need to read the document
     # to check which of the candidates is suitable
     # Let us give a chance to getPropertyDictFromContent to
@@ -207,7 +231,7 @@ class ContributionTool(BaseTool):
     # Try to find the file_name
     file_name = None
     mime_type = None
-    if url is None:
+    if not url:
       # check if file was provided
       file = kw.get('file', None)
       if file is not None:
@@ -238,7 +262,7 @@ class ContributionTool(BaseTool):
       file_name = urllib.quote(file_name, safe='')
       file_name = file_name.replace('%', '')
       # For URLs, we want an id by default equal to the encoded URL
-      if id is None: id = self._encodeURL(url)
+      if id is None: id = self.encodeURL(url)
       if hasattr(url_file, 'headers'):
         headers = url_file.headers
         if hasattr(headers, 'type'):
@@ -260,7 +284,7 @@ class ContributionTool(BaseTool):
         #return document
         pass # XXX - This needs to be implemented once the rest is stable
 
-    # From here, there is no hope unless a file was provided 
+    # From here, there is no hope unless a file was provided
     if file is None:
       raise ValueError, "could not determine portal type"
 
@@ -274,6 +298,9 @@ class ContributionTool(BaseTool):
     if ob is None:
       raise ValueError, "Could not determine the document type"
 
+    # Prevent any reindexing operations
+    ob.isIndexable = 0
+
     # Then put the file inside ourselves for a short while
     BaseTool._setObject(self, file_name, ob)
     document = BaseTool._getOb(self, file_name)
@@ -281,7 +308,8 @@ class ContributionTool(BaseTool):
     try:
       # Then edit the document contents (so that upload can happen)
       document._edit(**kw)
-      if url: document.fromURL(url)
+      if url:
+        document.fromURL(url)
     finally:
       # Remove the object from ourselves
       BaseTool._delObject(self, file_name)
@@ -297,7 +325,8 @@ class ContributionTool(BaseTool):
 
     # Notify workflows
     #document.notifyWorkflowCreated()
-    # Reindex it and return the document
+    # Allow reindexing, reindex it and return the document
+    delattr(document, 'isIndexable')
     document.reindexObject()
     return document
 
@@ -380,7 +409,7 @@ class ContributionTool(BaseTool):
       # portal_type based on the document content
      # (ex. a Memo is a kind of Text which can be identified
       # by the fact it includes some specific content)
-      portal_type = self.findTypeName(name, ob.__of__(self))
+      portal_type = self.findTypeName(name, ob.__of__(self), container=container)
       if portal_type is None: portal_type = ob.portal_type
       ob._setPortalTypeName(portal_type) # This is redundant with finishConstruction
                                          # but necessary to move objects to appropriate
@@ -413,9 +442,9 @@ class ContributionTool(BaseTool):
         document.activate().discoverMetadata(file_name=name, user_login=user_login)
       else:
         if document.isExternalDocument():
-          document = existing_document 
+          document = existing_document
           # If this is an external document, update its content
-          document.activate().updateContentFromURL()
+          # document.activate().updateContentFromURL() # XXX I think this is no longer useful with alarms
           # XXX - Make sure this does not increase ZODB
           # XXX - what to do also with parameters (put again edit_kw) ?
           # Providing some information to the use about the fact
@@ -423,7 +452,7 @@ class ContributionTool(BaseTool):
       else:
         # We may have to implement additional revision support
         # to support in place contribution (ie. for a given ID)
-        # but is this really useful ? 
+        # but is this really useful ?
         raise NotImplementedError
 
     # Keep the document close to us - this is only useful for
@@ -448,17 +477,31 @@ class ContributionTool(BaseTool):
       del self._v_document_cache[id]
       return self.getPortalObject().unrestrictedTraverse(document_url)
 
-    # Try first to return an object listed by listDAVObjects
+    # Try first to return the real object inside
+    # This is much safer than trying to access objects displayed by listDAVObjects
+    # because the behaviour of catalog is unpredictable if a string is passed
+    # for a UID. For example
+    # select path from catalog where uid = "001193.html";
+    # will return the same as
+    # select path from catalog where uid = 1193;
+    # This was the source of an error in which the contribution tool
+    # was creating a web page and was returning a Base Category
+    # when
+    # o = folder._getOb(id)
+    # was called in DocumentConstructor
+    result = BaseTool._getOb(self, id, default=default)
+    if result is not _marker:
+      return result
+
+    # Return an object listed by listDAVObjects
     uid = str(id).split('-')[-1]
     object = self.getPortalObject().portal_catalog.unrestrictedGetResultValue(uid=uid)
     if object is not None:
       return object.getObject() # Make sure this does not break security. XXX
 
-    # Fallback to default method
-    if default is _marker:
-      return BaseTool._getOb(self, id)
-    else:
-      return BaseTool._getOb(self, id, default=default)
+    # Raise an AttributeError the same way as in OFS.ObjectManager._getOb
+    raise AttributeError, id
+
 
   def listDAVObjects(self):
     """
@@ -487,7 +530,8 @@ class ContributionTool(BaseTool):
     return wrapper(object_list)
 
   # Crawling methods
-  def _normaliseURL(self, url, base_url=None):
+  security.declareProtected(Permissions.View, 'normaliseURL')
+  def normaliseURL(self, url, base_url=None):
     """
       Returns a normalised version of the url so that
      we do not download twice the same content.
@@ -506,7 +550,8 @@ class ContributionTool(BaseTool):
       url = '%s/%s' % (base_url, url)
     return url
 
-  def _encodeURL(self, url):
+  security.declareProtected(Permissions.View, 'encodeURL')
+  def encodeURL(self, url):
     """
       Returns the URL as an ID. ID should be chosen in such way
       that it is optimal with HBTreeFolder (ie. so that
@@ -520,7 +565,7 @@ class ContributionTool(BaseTool):
     # Produce an MD5 from the URL
     hex_md5 = md5.md5(url).hexdigest()
     # Take the first part in the URL which is not empty
-    # LOG("_encodeURL", 0, url)
+    # LOG("encodeURL", 0, url)
     url_segment = url.split(':')[1]
     url_segment_list = url_segment.split('/')
     url_domain = None
@@ -548,11 +593,18 @@ class ContributionTool(BaseTool):
       valid.
     """
     depth = content.getCrawlingDepth()
-    if depth <= 0:
+    if depth < 0:
+      # Do nothing if crawling depth is reached
+      # (this is not duplicate code but a way to prevent
+      # calling isIndexContent unnecessarily)
+      return
+    if not content.isIndexContent(): # Decrement depth only if it is a content document
+      depth = depth - 1
+    if depth < 0:
       # Do nothing if crawling depth is reached
       return
     base_url = content.getContentBaseURL()
-    url_list = map(lambda url: self._normaliseURL(url, base_url), set(content.getContentURLList()))
+    url_list = map(lambda url: self.normaliseURL(url, base_url), set(content.getContentURLList()))
     for url in set(url_list):
       # LOG('trying to crawl', 0, url)
       # Some url protocols should not be crawled
@@ -563,7 +615,7 @@ class ContributionTool(BaseTool):
       # in place of not ?
       container = content.getParentValue()
       # Calculate the id under which content will be stored
-      id = self._encodeURL(url)
+      id = self.encodeURL(url)
       # Try to access the document if it already exists
       document = container.get(id, None)
       if document is None:
@@ -572,50 +624,65 @@ class ContributionTool(BaseTool):
         # (the same URL is created multiple times)
        # LOG('activate newContentFromURL', 0, url)
         self.activate(activity="SQLQueue").newContentFromURL(container_path=container.getRelativeUrl(),
-                                                             id=id, url=url, crawling_depth=depth - 1)
-      else:
-        # Update depth to the max. of the two values
-        new_depth = max(depth - 1, document.getCrawlingDepth())
-        document._setCrawlingDepth(new_depth)
-        # And activate updateContentFromURL on existing document
-        next_date = document.getNextAlarmDate() # This should prevent doing the update too often
-        # LOG('activate updateContentFromURL', 0, url)
-        document.activate(at_date=next_date).updateContentFromURL(crawling_depth=depth - 1)
+                                                             id=id, url=url, crawling_depth=depth)
+      elif depth and document.getCrawlingDepth() < depth:
+        # Update the crawling depth if necessary
+        document._setCrawlingDepth(depth)
+        document.activate().crawlContent()
 
   security.declareProtected(Permissions.AddPortalContent, 'updateContentFromURL')
   def updateContentFromURL(self, content, repeat=MAX_REPEAT, crawling_depth=0):
     """
       Updates an existing content.
     """
-    # Step 0: update crawling_depth if required
-    if crawling_depth > content.getCrawlingDepth():
-      content._setCrawlingDepth(crawling_depth)
-    # Step 1: download new content
-    try:
-      url = content.asURL()
-      data = urllib2.urlopen(url).read()
-      file = cStringIO.StringIO()
-      file.write(data)
-      file.seek(0)
-    except socket.error, msg: # repeat multiple times in case of socket error
-      content.updateContentFromURL(repeat=repeat - 1)
-    # Step 2: compare and update if necessary (md5)
-    # do here some md5 stuff to compare contents... 
-    if 1:
-      # content._edit(file=file) # Commented for testing
+    # First, test if the document is updatable according to
+    # its workflow states (if it has a workflow associated with it)
+    if content.isUpdatable():
+      # Step 0: update crawling_depth if required
+      if crawling_depth > content.getCrawlingDepth():
+        content._setCrawlingDepth(crawling_depth)
+      # Step 1: download new content
+      try:
+        url = content.asURL()
+        data = urllib2.urlopen(url).read()
+        file = cStringIO.StringIO()
+        file.write(data)
+        file.seek(0)
+      except urllib2.HTTPError, error:
+        if repeat == 0:
+          # XXX - Call the extendBadURLList method -- NOT implemented --
+          # IDEA: add the URL in question to a "bad_url_list", then during crawling,
+          # instead of looping over the list of URLs extracted from the web page,
+          # add an extra check to verify that the URL is not in the bad_url_list
+          raise
+        content.activate(at_date=DateTime() + 1).updateContentFromURL(repeat=repeat - 1)
+        return
+      except urllib2.URLError, error:
+        if repeat == 0:
+          # XXX - Call the extendBadURLList method -- NOT implemented --
+          raise
+        content.activate(at_date=DateTime() + 1).updateContentFromURL(repeat=repeat - 1)
+        return
+
+      # Step 2: compare and update if necessary (md5)
+      # md5 stuff to compare contents
+      new_content_md5 = md5.md5(data).hexdigest()
+      content_md5 = content.getContentMd5()
+      if content_md5 == new_content_md5:
+        return
+      content._edit(file=file) # Please make sure that if content is the same
+                               # we do not update it
+                               # This feature must be implemented by Base or File
+                               # not here (look at _edit in Base)
       # Step 3: convert to base format
-      # content.convertToBaseFormat() # Commented for testing
+      content.convertToBaseFormat()
       # Step 4: activate populate (unless interaction workflow does it)
-      # content.activate().populateContent() # Commented for testing
+      content.activate().populateContent()
       # Step 5: activate crawlContent
-      content.activate().crawlContent()
-    else:
-      # XXX
-      # We must handle the case for which content type has changed in between
-      pass
-    # Step 6: activate updateContentFromURL at next period
-    next_date = content.getNextAlarmDate()
-    content.activate(at_date=next_date).updateContentFromURL()
+      depth = content.getCrawlingDepth()
+      if depth > 0:
+        content.activate().crawlContent()
+      content.setContentMd5(new_content_md5)
 
   security.declareProtected(Permissions.AddPortalContent, 'newContentFromURL')
   def newContentFromURL(self, container_path=None, id=None, repeat=MAX_REPEAT, **kw):
@@ -638,25 +705,33 @@ class ContributionTool(BaseTool):
       return
     try:
       document = self.newContent(container_path=container_path, id=id, **kw)
-      if document.getCrawlingDepth() > 0: document.activate().crawlContent()
-      document.activate(at_date=document.getNextAlarmDate()).updateContentFromURL()
+      if document.isIndexContent() and document.getCrawlingDepth() >= 0:
+        # If this is an index document, keep on crawling even if crawling_depth is 0
+        document.activate().crawlContent()
+      elif document.getCrawlingDepth() > 0:
+        # Otherwise, keep on crawling only while crawling_depth is strictly positive
+        document.activate().crawlContent()
     except urllib2.HTTPError, error:
+      if repeat == 0:
+        # here we must call the extendBadURLList method -- NOT implemented --
+        # which would add this url to the bad URL list, so that next time we avoid
+        # crawling bad URLs
+        raise
       # Catch any HTTP error
       self.activate(at_date=DateTime() + 1).newContentFromURL(
                           container_path=container_path, id=id,
                          repeat=repeat - 1, **kw)
     except urllib2.URLError, error:
-      if error.reason.args[0] == -3:
-        # Temporary failure in name resolution - try again in 1 day
-        self.activate(at_date=DateTime() + 1,
-                      activity="SQLQueue").newContentFromURL(
-                        container_path=container_path, id=id,
-                        repeat=repeat - 1, **kw)
-      else:
-        # Unknown errror - to be extended
-        raise
-    except:
-      # Pass exception to Zope (ex. conflict errors)
-      raise
+      if repeat == 0:
+        # XXX - Call the extendBadURLList method -- NOT implemented --
+        raise
+      print error.reason
+      #if getattr(error.reason, 'args', None):
+      #  if error.reason.args[0] == socket.EAI_AGAIN:
+      #    # Temporary failure in name resolution - try again in 1 day
+      self.activate(at_date=DateTime() + 1,
+                    activity="SQLQueue").newContentFromURL(
+                      container_path=container_path, id=id,
+                      repeat=repeat - 1, **kw)
 
 InitializeClass(ContributionTool)
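
Note on the crawling depth change in crawlContent: index pages no longer consume crawling depth, only content pages do, so crawling stops once a content page has used the last remaining level. The snippet below is a minimal, hedged sketch of that rule in plain Python; it is not ERP5 code, and next_crawling_depth, depth and is_index_content are illustrative stand-ins for getCrawlingDepth() / isIndexContent() in the patch.

def next_crawling_depth(depth, is_index_content):
    # Return the depth to propagate to linked documents, or None to stop crawling.
    if depth < 0:
        return None            # crawling depth already exhausted
    if not is_index_content:
        depth -= 1             # only content documents consume depth
    if depth < 0:
        return None            # stop once a content page used the last level
    return depth

assert next_crawling_depth(0, is_index_content=True) == 0      # index page keeps crawling at depth 0
assert next_crawling_depth(0, is_index_content=False) is None  # content page at depth 0 stops
assert next_crawling_depth(2, is_index_content=False) == 1     # content page consumes one level

This mirrors the two "if depth < 0: return" guards above: the first avoids calling isIndexContent() at all when depth is already negative, the second stops after the decrement.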
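A second note on updateContentFromURL: the new body computes an MD5 of the downloaded data and skips the edit/convert/populate steps when it matches the digest stored on the document (getContentMd5 / setContentMd5). The following is a hedged, standalone sketch of that comparison only; it uses hashlib rather than the old md5 module, and content_changed is an illustrative helper name, not part of the ERP5 API.

import hashlib

def content_changed(previous_md5, new_data):
    # Return (changed, new_md5); callers re-edit and re-convert only when changed is True.
    new_md5 = hashlib.md5(new_data).hexdigest()
    return new_md5 != previous_md5, new_md5

changed, digest = content_changed(None, b"<html>hello</html>")
assert changed  # first fetch: no stored digest, so it always counts as changed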