diff --git a/product/ERP5OOo/Document/DMSFile.py b/product/ERP5OOo/Document/DMSFile.py deleted file mode 100644 index ef3306f688f0c665087fd4279adabe5558e165d0..0000000000000000000000000000000000000000 --- a/product/ERP5OOo/Document/DMSFile.py +++ /dev/null @@ -1,295 +0,0 @@ - -############################################################################## -# -# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved. -# -# WARNING: This program as such is intended to be used by professional -# programmers who take the whole responsability of assessing all potential -# consequences resulting from its eventual inadequacies and bugs -# End users who are looking for a ready-to-use solution with commercial -# garantees and support are strongly adviced to contract a Free Software -# Service Company -# -# This program is Free Software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
-# -############################################################################## - -from AccessControl import ClassSecurityInfo -from Products.CMFCore.WorkflowCore import WorkflowMethod -from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface -from Products.ERP5Type.Cache import CachingMethod -from Products.ERP5.Document.File import File -from Products.ERP5Type.XMLObject import XMLObject -# to overwrite WebDAV methods -from Products.CMFDefault.File import File as CMFFile - -import mimetypes, re -from DateTime import DateTime -mimetypes.init() - - -rs=[] -rs.append(re.compile('<HEAD>.*</HEAD>',re.DOTALL|re.MULTILINE|re.IGNORECASE)) -rs.append(re.compile('<!DOCTYPE[^>]*>')) -rs.append(re.compile('<.?(HTML|BODY)[^>]*>',re.DOTALL|re.MULTILINE|re.IGNORECASE)) - -def stripHtml(txt): - for r in rs: - txt=r.sub('',txt) - return txt - - -class CachingMixin: - # time of generation of various formats - cached_time={} - # generated files (cache) - cached_data={} - # mime types for cached formats XXX to be refactored - cached_mime={} - - # Declarative security - security = ClassSecurityInfo() - security.declareObjectProtected(Permissions.AccessContentsInformation) - - security.declareProtected(Permissions.ModifyPortalContent,'clearCache') - def clearCache(self): - """ - Clear cache (invoked by interaction workflow upon file upload - needed here to overwrite class attribute with instance attrs - """ - self.cached_time={} - self.cached_data={} - self.cached_mime={} - - security.declareProtected(Permissions.View,'hasFileCache') - def hasFileCache(self,format): - """ - Checks whether we have a version in this format - """ - return self.cached_data.has_key(format) - - def getCacheTime(self,format): - """ - Checks when if ever was the file produced - """ - return self.cached_time.get(format,0) - - def cacheUpdate(self,format): - self.cached_time[format]=DateTime() - - def cacheSet(self,format,mime=None,data=None): - if mime is not None: - 
self.cached_mime[format]=mime - if data is not None: - self.cached_data[format]=data - self.cacheUpdate(format) - self._p_changed=1 - - def cacheGet(self,format): - ''' - we could be much cooler here - pass testing and updating methods to this function - so that it does it all by itself; this'd eliminate the need for cacheSet public method - ''' - return self.cached_mime.get(format,''),self.cached_data.get(format,'') - - security.declareProtected(Permissions.View,'getCacheInfo') - def getCacheInfo(self): - """ - Get cache details as string (for debugging) - """ - s='CACHE INFO:<br/><table><tr><td>format</td><td>size</td><td>time</td><td>is changed</td></tr>' - #self.log('getCacheInfo',self.cached_time) - #self.log('getCacheInfo',self.cached_data) - for f in self.cached_time.keys(): - t=self.cached_time[f] - data=self.cached_data.get(f) - if data: - if isinstance(data,str): - ln=len(data) - else: - ln=0 - while data is not None: - ln+=len(data.data) - data=data.next - else: - ln='no data!!!' 
- s+='<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' % (f,str(ln),str(t),'-') - s+='</table>' - return s - -class DMSFile(XMLObject,File): - """ - Special base class, different from File only in that it can contain things - (like Role Definition, for example) - will be merged with File when WebDAV issues are solved - """ - # CMF Type Definition - meta_type = 'ERP5 DMS File' - portal_type = 'DMS File' - isPortalContent = 1 - isRADContent = 1 - __dav_collection__=0 - - # Declarative security - security = ClassSecurityInfo() - security.declareObjectProtected(Permissions.AccessContentsInformation) - - # Default Properties - property_sheets = ( PropertySheet.Base - , PropertySheet.CategoryCore - , PropertySheet.DublinCore - , PropertySheet.Version - , PropertySheet.Reference - , PropertySheet.DMSFile - ) - - - # make sure to call the right edit methods - _edit=File._edit - edit=File.edit - - searchable_attrs=('title','description','id','reference','version', - 'short_title','keywords','subject','source_reference','source_project_title') - - ### Content indexing methods - security.declareProtected(Permissions.View, 'getSearchableText') - def getSearchableText(self, md=None): - """ - Used by the catalog for basic full text indexing - """ - searchable_text = ' '.join(map(lambda x: self.getProperty(x) or ' ',self.searchable_attrs)) - return searchable_text - - def get_size(self): - ''' - has to be overwritted here, otherwise WebDAV fails - ''' - try: - return len(self.data) - except (AttributeError, TypeError): - return 0 - - getcontentlength=get_size - - security.declareProtected(Permissions.View,'hasFile') - def hasFile(self): - """ - Checks whether we have an initial file - """ - _marker=[] - if getattr(self,'data',_marker) is not _marker: # XXX - use propertysheet accessors - return getattr(self,'data') is not None - return False - - security.declarePrivate('_unpackData') - def _unpackData(self,data): - """ - Unpack Pdata into string - """ - if 
isinstance(data,str): - return data - else: - data_list=[] - while data is not None: - data_list.append(data.data) - data=data.next - return ''.join(data_list) - - SearchableText=getSearchableText - - security.declareProtected(Permissions.ModifyPortalContent, 'guessMimeType') - def guessMimeType(self,fname=''): - '''get mime type from file name''' - if fname=='':fname=self.getOriginalFilename() - if fname: - content_type,enc=mimetypes.guess_type(fname) - if content_type is not None: - self.content_type=content_type - return content_type - - security.declareProtected(Permissions.ModifyPortalContent, 'setPropertyListFromFilename') - def setPropertyListFromFilename(self,fname): - rx_parse=re.compile(self.portal_preferences.getPreferredDmsFilenameRegexp()) - if rx_parse is None: - self.setReference(fname) - return - m=rx_parse.match(fname) - if m is None: - self.setReference(fname) - return - for k,v in m.groupdict().items(): - self.setProperty(k,v) - - security.declareProtected(Permissions.View, 'getWikiSuccessorReferenceList') - def getWikiSuccessorReferenceList(self): - ''' - find references in text_content, return matches - with this we can then find objects - ''' - if self.getTextContent() is None: - return [] - rx_search=re.compile(self.portal_preferences.getPreferredDmsReferenceRegexp()) - try: - res=rx_search.finditer(self.getTextContent()) - except AttributeError: - return [] - res=[(r.group(),r.groupdict()) for r in res] - return res - - security.declareProtected(Permissions.View, 'getWikiSuccessorValueList') - def getWikiSuccessorValueList(self): - ''' - getWikiSuccessorValueList - the way to find objects is on - implementation level - ''' - # XXX results should be cached as volatile attributes - lst=[] - for ref in self.getWikiSuccessorReferenceList(): - r=ref[1] - res=self.DMS_findDocument(**r) - if len(res)>0: - lst.append(res[0].getObject()) - return lst - - security.declareProtected(Permissions.View, 'getWikiPredecessorValueList') - def 
getWikiPredecessorValueList(self): - ''' - it is mostly implementation level - depends on what parameters we use to identify - document, and on how a doc must reference me to be my predecessor (reference only, - or with a language, etc - ''' - # XXX results should be cached as volatile attributes - lst=self.DMS_findPredecessors() - lst=[r.getObject() for r in lst] - di=dict.fromkeys(lst) # make it unique - ref=self.getReference() - return [o for o in di.keys() if o.getReference()!=ref] # every object has its own reference in SearchableText - - security.declareProtected(Permissions.ModifyPortalContent,'PUT') - def PUT(self,REQUEST,RESPONSE): - CMFFile.PUT(self,REQUEST,RESPONSE) - self.DMS_ingestFile(fname=self.getId()) - - # BG copied from File in case - index_html = CMFFile.index_html - #PUT = CMFFile.PUT - security.declareProtected('FTP access', 'manage_FTPget', 'manage_FTPstat', 'manage_FTPlist') - manage_FTPget = CMFFile.manage_FTPget - manage_FTPlist = CMFFile.manage_FTPlist - manage_FTPstat = CMFFile.manage_FTPstat - - -# vim: syntax=python shiftwidth=2 - diff --git a/product/ERP5OOo/Document/ExternalDocument.py b/product/ERP5OOo/Document/ExternalDocument.py deleted file mode 100644 index 2bc5a6bc40c5fff0c232699789efc7fda6c1b4d8..0000000000000000000000000000000000000000 --- a/product/ERP5OOo/Document/ExternalDocument.py +++ /dev/null @@ -1,160 +0,0 @@ - -############################################################################## -# -# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved. 
-# -# WARNING: This program as such is intended to be used by professional -# programmers who take the whole responsability of assessing all potential -# consequences resulting from its eventual inadequacies and bugs -# End users who are looking for a ready-to-use solution with commercial -# garantees and support are strongly adviced to contract a Free Software -# Service Company -# -# This program is Free Software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
-# -############################################################################## - -from AccessControl import ClassSecurityInfo -from Products.CMFCore.WorkflowCore import WorkflowMethod -from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface -from Products.ERP5OOo.Document.DMSFile import DMSFile - -import mimetypes, re, urllib -from htmlentitydefs import name2codepoint -from DateTime import DateTime - - -class SpiderException(Exception): - - def __init__(self,code, msg): - msg="%i: %s" % (code, msg) - Exception.__init__(self,msg) - -class Opener(urllib.FancyURLopener): - - def http_error_default(self, url, fp, code, msg, headers): - raise SpiderException(code, msg) - -class ExternalDocument(DMSFile): - """ - caching sources from outside - This is basically an abstract class - classes deriving from it should overwrite method _processData (this - is the one that does something with character data obtained from source) - Spidering method supports http, ftp and file protocols, and possibly many others - """ - # CMF Type Definition - meta_type = 'ERP5 External Document' - portal_type = 'External Document' - isPortalContent = 1 - isRADContent = 1 - - # Declarative security - security = ClassSecurityInfo() - security.declareObjectProtected(Permissions.AccessContentsInformation) - - # Default Properties - property_sheets = ( PropertySheet.Base - , PropertySheet.CategoryCore - , PropertySheet.DublinCore - , PropertySheet.Version - , PropertySheet.Reference - , PropertySheet.DMSFile - , PropertySheet.Document - , PropertySheet.Url - , PropertySheet.ExternalDocument - ) - - protocols=(('Web page','http'),('FTP site','ftp'),('Local file','file'),) - - searchable_attrs=DMSFile.searchable_attrs+('text_content',) - - security.declareProtected(Permissions.View, 'getProtocolList') - def getProtocolList(self): - """ - """ - return [x[1] for x in self.protocols] - - security.declareProtected(Permissions.View, 'getProtocolItemList') - def 
getProtocolItemList(self): - """ - """ - return self.protocols - - security.declarePrivate(Permissions.View, '_spiderSource') - def _spiderSource(self): - """ - FancyURLopener can open various protocols - """ - op=Opener() - f=op.open(self.getQualifiedUrl()) - s=f.read() - inf=f.info() - return s, inf - - security.declarePrivate('_processData') - def _processData(self,s, inf): - raise Exception('this should be implemented in subclass') - - security.declareProtected(Permissions.ModifyPortalContent,'resetTopObject') - def resetTopObject(self): - ''' - abstract function for maintaining interface - call before beginning recursive spidering - used mostly in web pages - ''' - pass - - security.declareProtected(Permissions.View, 'getProtocolItemList') - def spiderSource(self): - """ - spiders external datasource - sets status message - returned value tells us if it succeeded or failed - """ - try: - s,inf=self._spiderSource() - except Exception,e: - self.log(e,level=1) - self.setExternalProcessingStatusMessage("Tried on %s: %s" % (self._time(),str(e))) - return False - chars=len(s) - if chars==0: - self.setExternalProcessingStatusMessage("Tried on %s: got empty string" % self._time()) - return False - try: - s=self._processData(s,inf) - except Exception,e: - self.log(e,level=1) - self.setExternalProcessingStatusMessage("Spidered on %s, %i chars, but could not process; reason: %s" % (self._time(), chars, str(e))) - return False - self.setTextContent(s) - self.setExternalProcessingStatusMessage("Spidered on %s, %i chars, recorded %i chars" % (self._time(), chars, len(s))) - return True - - security.declareProtected(Permissions.View, 'getProtocolItemList') - def getQualifiedUrl(self): - """ - this should be in the Url, not here - otherwise why does the url have a property 'url_protocol'? 
- """ - return (self.getUrlProtocol() or '')+'://'+(self.getUrlString() or '') - - def _time(self): - return DateTime().strftime('%Y/%m/%d %H:%M:%S') - - -# vim: syntax=python shiftwidth=2 - diff --git a/product/ERP5OOo/Document/ExternalLibraryFile.py b/product/ERP5OOo/Document/ExternalLibraryFile.py deleted file mode 100644 index bd08cf286aec18c212c5f1f4a33b1e1fa9055fce..0000000000000000000000000000000000000000 --- a/product/ERP5OOo/Document/ExternalLibraryFile.py +++ /dev/null @@ -1,136 +0,0 @@ - -############################################################################## -# -# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved. -# -# WARNING: This program as such is intended to be used by professional -# programmers who take the whole responsability of assessing all potential -# consequences resulting from its eventual inadequacies and bugs -# End users who are looking for a ready-to-use solution with commercial -# garantees and support are strongly adviced to contract a Free Software -# Service Company -# -# This program is Free Software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
-# -############################################################################## - -from AccessControl import ClassSecurityInfo -from Products.CMFCore.WorkflowCore import WorkflowMethod -from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface -from Products.ERP5OOo.Document.ExternalDocument import ExternalDocument, SpiderException - -from xml import sax - -def stripName(s): - return s[4:].replace('-','_').encode() - -class BookInfo(object): - id=title=description='' - -class Handler(sax.handler.ContentHandler): - stack=[] - attrs=None - c='' - d=None - results=[] - - def startElement(self,name,attrs): - name=stripName(name) - self.stack.append(name) - self.attrs=attrs - if hasattr(self,'start_'+name): - getattr(self,'start_'+name)() - - def endElement(self,name): - name=stripName(name) - if hasattr(self,'end_'+name): - getattr(self,'end_'+name)() - self.stack.pop() - self.attrs=None - self.c='' - - def characters(self,c): - self.c+=c.strip().encode('utf-8') - - def start_Record(self): - self.d=BookInfo() - self.results.append(self.d) - - def end_ID(self): - self.d.id=self.c - - def end_Title(self): - self.d.title+=self.c - - def end_Author(self): - self.d.description+=self.c+'; ' - - def end_Label_Information(self): - self.d.description+=self.c+'; ' - -def parseLibraryFile(s): - h=Handler() - sax.parseString(s,h) - return h.results - - -class ExternalLibraryFile(ExternalDocument): - """ - get AU library data - """ - # CMF Type Definition - meta_type = 'ERP5 External Library File' - portal_type = 'External Library File' - isPortalContent = 1 - isRADContent = 1 - - # Declarative security - security = ClassSecurityInfo() - security.declareObjectProtected(Permissions.AccessContentsInformation) - - # Default Properties - property_sheets = ( PropertySheet.Base - , PropertySheet.CategoryCore - , PropertySheet.DublinCore - , PropertySheet.Version - , PropertySheet.Reference - , PropertySheet.DMSFile - , PropertySheet.Document - , PropertySheet.Url 
- , PropertySheet.ExternalDocument - ) - - def _processData(self,s,inf): - # remove current subobjects - self.manage_delObjects([i.getId() for i in self.searchFolder(portal_type='Book')]) - # parse xml file and iterate over results - lista=parseLibraryFile(s) - for i,o in enumerate(lista): - n=self.newContent(portal_type='Book') - self.log(n.getRelativeUrl()) - n.setTitle(o.title) - n.setDescription(o.description) - # copy attributes - for atr in self.portal_types[self.getPortalType()].getInstanceBaseCategoryList(): - n.setProperty(atr,self.getProperty(atr)) - # partial commits (otherwise packet may exceed mysql max size) - # XXX this should probably be deferred as portal_activities - if i % 50 ==0: - get_transaction().commit() - self.log(len(lista)) - return 'k'*len(lista) # a hack to have number of objects in status message - - -# vim: filetype=python syntax=python shiftwidth=2 diff --git a/product/ERP5OOo/Document/ExternalWebPage.py b/product/ERP5OOo/Document/ExternalWebPage.py deleted file mode 100644 index b749264699f01634e2765f7d67f103695db13504..0000000000000000000000000000000000000000 --- a/product/ERP5OOo/Document/ExternalWebPage.py +++ /dev/null @@ -1,215 +0,0 @@ - - -############################################################################## -# -# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved. -# -# WARNING: This program as such is intended to be used by professional -# programmers who take the whole responsability of assessing all potential -# consequences resulting from its eventual inadequacies and bugs -# End users who are looking for a ready-to-use solution with commercial -# garantees and support are strongly adviced to contract a Free Software -# Service Company -# -# This program is Free Software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -# -############################################################################## - -from AccessControl import ClassSecurityInfo -from Products.CMFCore.WorkflowCore import WorkflowMethod -from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface -from Products.ERP5OOo.Document.DMSFile import stripHtml -from Products.ERP5OOo.Document.ExternalDocument import ExternalDocument, SpiderException - -import mimetypes, re, urllib -from htmlentitydefs import name2codepoint - -rx=[] -rx.append(re.compile('<!--.*?-->',re.DOTALL|re.MULTILINE)) # clear comments (sometimes JavaScript code in comments contains > chars) -rx.append(re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE)) # clear tags -rx.append(re.compile('\s+')) # compress multiple spaces - -def clearHtml(s): - for r in rx: - s=r.sub(" ",s) - return s - - -tgtencoding='utf-8' -encodings=['iso-8859-2','iso-8859-15','windows-1250'] -rx_charset=re.compile('<meta.*charset="?([\w\d\-]*)',re.DOTALL|re.MULTILINE|re.IGNORECASE) - -def recode(s): - """ - maybe it can be useful system-wide - """ - _encodings=encodings[:] # local copy - _encodings.insert(0,tgtencoding) # if not declared or declared wrongly, we try - m=rx_charset.search(s) - if m and len(m.groups())>0: - enc=m.groups()[0].lower() - if enc==tgtencoding: - return s - if enc in _encodings: - _encodings.remove(enc) - _encodings.insert(0,enc) # we'll start from what we've found - for enc in _encodings: - try: - return s.decode(enc).encode('utf-8') - except UnicodeDecodeError, LookupError: - pass - 
raise CanNotDecode('sorry') - -def _convertEntities(txt,rx,mapper=None): - def repl(code): - if mapper: - code=mapper.get(code) - if code is None: - return '' - return unichr(int(code)).encode(tgtencoding) - res=re.split(rx,txt) - res[1::2]=map(repl,res[1::2]) # Isn't it beautiful? :) - return ''.join(res) - -rx_chars=re.compile('&#(\d{3});') -rx_ents=re.compile('&(\w{1,6});') - -def convertEntities(txt): - txt=_convertEntities(txt,rx_chars) - txt=_convertEntities(txt,rx_ents, name2codepoint) - return txt - -class ExternalWebPage(ExternalDocument): - """ - caching sources from outside - """ - # CMF Type Definition - meta_type = 'ERP5 External Web Page' - portal_type = 'External Web Page' - isPortalContent = 1 - isRADContent = 1 - - # Declarative security - security = ClassSecurityInfo() - security.declareObjectProtected(Permissions.AccessContentsInformation) - - # Default Properties - property_sheets = ( PropertySheet.Base - , PropertySheet.CategoryCore - , PropertySheet.DublinCore - , PropertySheet.Version - , PropertySheet.Reference - , PropertySheet.DMSFile - , PropertySheet.Document - , PropertySheet.Url - , PropertySheet.ExternalDocument - ) - - def _findTopObject(self): - ''' - find the top object from which the spidering begun - we search upwards untill we find or reach portal object - the top object is the one that is maintaining the dictionary - I think we have to do it instead of using simple acquisition - because we have to find a non-empty one - ''' - ob=self - if hasattr(self,'urldict') and len(self.urldict)>0: - return self - else: - while 1: - ob=ob.aq_parent - if ob==self.getPortalObject(): - return self - if hasattr(ob,'urldict') and len(ob.urldict)>0: - return ob - - security.declareProtected(Permissions.ModifyPortalContent,'addUrl') - def addUrl(self,url): - ''' - record url that has already been spidered - ''' - self.urldict[url]=1 - self._p_changed=1 - - security.declareProtected(Permissions.ModifyPortalContent,'checkUrl') - def 
checkUrl(self,url): - ''' - check if the url has already been spidered - ''' - return self.urldict.has_key(url) - - security.declareProtected(Permissions.ModifyPortalContent,'resetTopObject') - def resetTopObject(self): - ''' - reset the url dictionary - remember do it before you start recursive spidering - ''' - self.urldict={} - self._p_changed=1 - - def _processData(self,s, inf): - # since this is a web page, we don't want anything else - # XXX we should find another way - like this, we end up with empty draft objects - if (inf.getmaintype(),inf.getsubtype())!=('text','html'): - raise SpiderException(100,'this is %s/%s' % (inf.getmaintype(),inf.getsubtype())) - top=self._findTopObject() - # remove current subobjects - self.manage_delObjects([i.getId() for i in self.searchFolder(portal_type='External Web Page')]) - if self.getOptionRecursively()>0 and self.getRecursionDepth()>0: - # first find links in text - rx=re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE) - for ref in re.findall(rx, s): - # eliminate anchors and specials, select internal links - if ref.startswith('javascript') or ref.startswith('mailto'): - continue - ref=re.sub('#.*','',ref) - if ref=='':continue - #baseref='/'.join(self.getQualifiedUrl().split('/')) - baseref=self.getQualifiedUrl() - if not ref.startswith('http'): - # complete relative paths - ref=baseref+'/'+ref - # eliminate multiple slashes - rx=re.compile('([^:]{1})\/{2,}') - ref=re.sub(rx,'\1/',ref) - # create subobjects - if ref.startswith(baseref) and not top.checkUrl(ref): - # record my url in top object - top.addUrl(ref) - n=self.newContent(portal_type='External Web Page') - # set coordinates - n.setUrlProtocol('http') - n.setUrlString(ref) - n.setOptionRecursively(1) - n.setRecursionDepth(self.getRecursionDepth()-1) - # copy attributes - for atr in self.portal_types[self.getPortalType()].getInstanceBaseCategoryList(): - n.setProperty(atr,self.getProperty(atr)) - 
n.activate(activity='SQLQueue').ExternalDocument_spiderAndSetState() - # process self - # here we check encoding and convert to UTF8 - try: - s=recode(s) - except CanNotDecode: - self.setExternalProcessingStatusMessage("Spidered on %s, %i chars, but could not decode" % (self._time(), chars)) - return False - s=stripHtml(s) # remove headers, doctype and the like - s=clearHtml(s) # remove tags - s=convertEntities(s) # convert charrefs and named entities - return s - - -# vim: filetype=python syntax=python shiftwidth=2 diff --git a/product/ERP5OOo/Document/ImageDocument.py b/product/ERP5OOo/Document/ImageDocument.py deleted file mode 100644 index 35aee2a3a134dfe3d5702a8c207e2b6d44930ede..0000000000000000000000000000000000000000 --- a/product/ERP5OOo/Document/ImageDocument.py +++ /dev/null @@ -1,79 +0,0 @@ - -############################################################################## -# -# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved. -# -# WARNING: This program as such is intended to be used by professional -# programmers who take the whole responsability of assessing all potential -# consequences resulting from its eventual inadequacies and bugs -# End users who are looking for a ready-to-use solution with commercial -# garantees and support are strongly adviced to contract a Free Software -# Service Company -# -# This program is Free Software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
-# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -# -############################################################################## - -from AccessControl import ClassSecurityInfo -from Products.CMFCore.WorkflowCore import WorkflowMethod -from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface -#from Products.ERP5Type.Cache import CachingMethod -from Products.ERP5.Document.Image import Image -from Products.ERP5OOo.Document.DMSFile import DMSFile -#from Products.ERP5Type.XMLObject import XMLObject -# to overwrite WebDAV methods -#from Products.CMFDefault.File import File as CMFFile - - -#class ImageDocument(Image): -class ImageDocument(DMSFile,Image): - """ - o - """ - # CMF Type Definition - meta_type = 'ERP5 Image Document' - portal_type = 'Image Document' - isPortalContent = 1 - isRADContent = 1 - - # Declarative security - security = ClassSecurityInfo() - security.declareObjectProtected(Permissions.AccessContentsInformation) - - # Default Properties - property_sheets = ( PropertySheet.Base - , PropertySheet.CategoryCore - , PropertySheet.DublinCore - , PropertySheet.Version - , PropertySheet.Reference - , PropertySheet.DMSFile - ) - - def __init__(self,id,**kw): - #print 'ImageDocument __init__' - Image.__init__(self,id,**kw) - DMSFile.__init__(self,id,**kw) - - # make sure to call the right edit methods - _edit=Image._edit - edit=Image.edit - index_html=Image.index_html - - def clearCache(self): - pass # this is handled by ERP5.Document.Image._edit method - - -# vim: syntax=python shiftwidth=2 - diff --git a/product/ERP5OOo/Document/PdfDocument.py b/product/ERP5OOo/Document/PdfDocument.py deleted file mode 100644 index 72c75fea9641f02638a8593ce5f995bb5aa82ac6..0000000000000000000000000000000000000000 --- a/product/ERP5OOo/Document/PdfDocument.py +++ /dev/null @@ -1,156 +0,0 @@ 
- -############################################################################## -# -# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved. -# -# WARNING: This program as such is intended to be used by professional -# programmers who take the whole responsability of assessing all potential -# consequences resulting from its eventual inadequacies and bugs -# End users who are looking for a ready-to-use solution with commercial -# garantees and support are strongly adviced to contract a Free Software -# Service Company -# -# This program is Free Software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
#
##############################################################################

from AccessControl import ClassSecurityInfo
from Products.CMFCore.WorkflowCore import WorkflowMethod
from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5.Document.Image import Image
from Products.ERP5OOo.Document.DMSFile import DMSFile, CachingMixin, stripHtml
from zLOG import LOG

import tempfile, os, glob, zipfile, cStringIO, re


class PdfDocument(DMSFile, CachingMixin):
  """
  PdfDocument - same as file, but with renditions produced by external
  command line tools:

    - plain text (used for full text indexing) through ``pdftotext``
    - simplified html through ``pdftohtml``
    - page images through ImageMagick ``convert``, delivered as a zip

  Generated renditions are remembered through the cache API used below
  (hasFileCache / cacheSet / cacheGet / cacheUpdate).
  """
  # CMF Type Definition
  meta_type = 'ERP5 Pdf Document'
  portal_type = 'Pdf Document'
  isPortalContent = 1
  isRADContent = 1

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  # Default Properties
  property_sheets = ( PropertySheet.Base
                    , PropertySheet.CategoryCore
                    , PropertySheet.DublinCore
                    , PropertySheet.Version
                    , PropertySheet.Reference
                    , PropertySheet.DMSFile
                    , PropertySheet.Document
                    )

  def getTargetFile(self, format):
    """
    Return the cached entry for `format`, generating it first if needed.

    We need to make our own, because Photo's methods are not sufficient
    (we have to zip etc). The rendition is built by _makeFile and stored
    with mime type 'application/zip'; the value returned is whatever
    cacheGet returns for that format.
    """
    if not self.hasFileCache(format):
      self.cacheSet(format, data=self._makeFile(format), mime='application/zip')
    return self.cacheGet(format)

  def _makeFile(self, format):
    """
    Convert the pdf to `format` with ImageMagick and zip the result.

    format -- image file extension handed to ``convert`` (e.g. 'png').
              Only letters and digits are accepted, because the value
              ends up in file names and on a command line.

    Returns the zip archive as a string. When ``convert`` writes one file
    per page ('page-<n>.<format>') each one is stored in the archive as
    '<format>/page-<n>.<format>'; any other output file is stored under
    its own base name.

    Raises ValueError if `format` is not purely alphanumeric.
    """
    if not re.match(r'^[A-Za-z0-9]+$', format):
      raise ValueError('unsupported target format: %r' % (format,))
    import subprocess # local import, not part of the module level imports

    # Work below INSTANCE_HOME/tmp as before, but without overwriting the
    # process-wide tempfile.tempdir setting.
    instance_home = os.getenv('INSTANCE_HOME')
    if instance_home:
      base_dir = os.path.join(instance_home, 'tmp')
      if not os.path.exists(base_dir):
        os.mkdir(base_dir, int('775', 8)) # i.e. mode 0775
    else:
      base_dir = tempfile.gettempdir()
    # because if we run zope as root, we have /root/tmp here and convert
    # goes berserk
    os.putenv('TMPDIR', '/tmp')

    # A private directory keeps our files apart from anything else and
    # makes it possible to clean up everything that convert produced.
    work_dir = tempfile.mkdtemp(dir=base_dir)
    try:
      source_path = os.path.join(work_dir, 'source.pdf')
      target_path = os.path.join(work_dir, 'page.%s' % format)
      source_file = open(source_path, 'wb')
      try:
        source_file.write(self._unpackData(self.data))
      finally:
        source_file.close()

      # Argument list instead of a shell command line: no quoting issues.
      try:
        status = subprocess.call(['convert', source_path, target_path])
      except OSError:
        status = 'not started'
      if status != 0:
        self.log('PdfDocument', 'convert failed (status: %s)' % (status,))

      # pack it
      page_re = re.compile(r'^page-(\d+)\.%s$' % re.escape(format))
      archive = cStringIO.StringIO()
      zip_file = zipfile.ZipFile(archive, 'w')
      try:
        for base in sorted(os.listdir(work_dir)):
          if base == 'source.pdf':
            continue
          found = page_re.match(base)
          if found:
            # whole page number, not only its last digit
            arcname = '%s/page-%s.%s' % (format, found.group(1), format)
          else:
            arcname = base
          zip_file.write(os.path.join(work_dir, base), arcname)
      finally:
        # closing writes the zip central directory into the buffer
        zip_file.close()
      return archive.getvalue()
    finally:
      for base in os.listdir(work_dir):
        os.remove(os.path.join(work_dir, base))
      os.rmdir(work_dir)

  def _runPdfTool(self, argv, trailing=()):
    """
    Run an external tool on the pdf data and return what it prints.

    argv     -- command and leading options, as a list
    trailing -- arguments which must come after the file name

    The data is written to a named temporary file whose path is inserted
    between `argv` and `trailing`. If the tool can not be started an
    empty string is returned; failures are logged, not raised.
    """
    import subprocess # local import, not part of the module level imports
    pdf_file = tempfile.NamedTemporaryFile()
    try:
      pdf_file.write(self._unpackData(self.data))
      pdf_file.flush() # the tool opens the file by name
      command = list(argv) + [pdf_file.name] + list(trailing)
      try:
        process = subprocess.Popen(command, stdout=subprocess.PIPE)
      except OSError:
        self.log('PdfDocument', 'could not run %s' % command[0])
        return ''
      output = process.communicate()[0]
      if process.returncode != 0:
        self.log('PdfDocument',
                 '%s exited with status %s' % (command[0], process.returncode))
      return output
    finally:
      pdf_file.close()

  searchable_attrs = DMSFile.searchable_attrs + ('text_content',)

  ### Content indexing methods
  security.declareProtected(Permissions.View, 'getSearchableText')
  def getSearchableText(self, md=None, force=0):
    """
    Used by the catalog for basic full text indexing.

    The text content is extracted with pdftotext (newlines replaced by
    spaces) and stored with setTextContent. Extraction happens only when
    the document has data and either `force` is 1, no 'txt' entry is
    cached, or the text content is None. The 'txt' cache entry is a mere
    marker ('-'): the text itself is not stored twice.

    Returns the result of DMSFile.getSearchableText.
    """
    if hasattr(self, 'data') and (force == 1
                                  or not self.hasFileCache('txt')
                                  or self.getTextContent() is None):
      self.log('PdfDocument', 'regenerating txt')
      text = self._runPdfTool(
        ['pdftotext', '-layout', '-enc', 'UTF-8', '-nopgbrk'], ['-'])
      self.setTextContent(text.replace('\n', ' '))
      self.cacheSet('txt', data='-') # we don't need to store it twice, just mark we have it
    return DMSFile.getSearchableText(self, md)

  SearchableText = getSearchableText

  security.declareProtected(Permissions.View, 'getHtmlRepresentation')
  def getHtmlRepresentation(self, force=0):
    """
    Get simplified html version to display.

    Returns the string 'no data' when the document has no data.
    Otherwise the html is produced with pdftohtml, cleaned by stripHtml
    and cached under 'html'; it is regenerated when `force` is 1 or when
    nothing is cached yet.
    """
    if not hasattr(self, 'data'):
      return 'no data'
    if force == 1 or not self.hasFileCache('html'):
      self.log('PdfDocument', 'regenerating html')
      html = self._runPdfTool(
        ['pdftohtml', '-enc', 'UTF-8', '-stdout', '-noframes', '-i'])
      self.cacheSet('html', data=stripHtml(html))
      self.cacheUpdate('html')
    return self.cacheGet('html')[1]

# vim: syntax=python shiftwidth=2

# --- product/ERP5OOo/Permissions.py (empty file) ---
# --- product/ERP5OOo/PropertySheet/DMSFile.py ---
##############################################################################
#
## Copyright (c) 2002 Nexedi SARL and Contributors. All Rights Reserved.
#
## WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# # consequences resulting from its eventual inadequacies and bugs
# # End users who are looking for a ready-to-use solution with commercial
# # garantees and support are strongly adviced to contract a Free Software
# # Service Company
# #
# # This program is Free Software; you can redistribute it and/or
# # modify it under the terms of the GNU General Public License
# # as published by the Free Software Foundation; either version 2
# # of the License, or (at your option) any later version.
# #
# # This program is distributed in the hope that it will be useful,
# # but WITHOUT ANY WARRANTY; without even the implied warranty of
# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# # GNU General Public License for more details.
# #
# # You should have received a copy of the GNU General Public License
# # along with this program; if not, write to the Free Software
# # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# #
# ##############################################################################

class DMSFile:
  """
  Property sheet declaring one writable string property (content_type),
  a list of categories and the unique_coordinates constraint.
  """

  _properties = (
    dict(id='content_type',
         description='content type',
         type='string',
         default='application/unknown',
         mode='w'),
  )

  _categories = (
    'destination',
    'similar',
    'predecessor',
    'successor',
    'source_project',
    'publication_section',
    'classification',
    'contributor',
    'function',
    'group',
    'site',
  )

  _constraints = (
    dict(id='unique_coordinates',
         description='coordinate triplet must be complete and unique',
         type='DocumentCoordinatesConstraint'),
  )


# vim: shiftwidth=2

# --- product/ERP5OOo/PropertySheet/ExternalDocument.py ---

class ExternalDocument:
  """
  Property sheet declaring a processing status message and two
  properties controlling recursive spidering.
  """

  _properties = (
    dict(id='external_processing_status_message',
         description='message about status',
         type='string',
         mode='w'),
    dict(id='option_recursively',
         description='do we want recursive spidering (meaningless in some classes)',
         type='int',
         mode='w'),
    dict(id='recursion_depth',
         description='how deep should recursive spidering be (0 - no recursion) (meaningless in some classes)',
         type='int',
         default=5,
         mode='w'),
  )

# --- product/ERP5OOo/PropertySheet/OOoDocument.py ---
##############################################################################
#
## Copyright (c) 2002 Nexedi SARL and Contributors. All Rights Reserved.
#
## WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# # consequences resulting from its eventual inadequacies and bugs
# # End users who are looking for a ready-to-use solution with commercial
# # garantees and support are strongly adviced to contract a Free Software
# # Service Company
# #
# # This program is Free Software; you can redistribute it and/or
# # modify it under the terms of the GNU General Public License
# # as published by the Free Software Foundation; either version 2
# # of the License, or (at your option) any later version.
# #
# # This program is distributed in the hope that it will be useful,
# # but WITHOUT ANY WARRANTY; without even the implied warranty of
# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# # GNU General Public License for more details.
# #
# # You should have received a copy of the GNU General Public License
# # along with this program; if not, write to the Free Software
# # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# #
# ##############################################################################

class OOoDocument:
  """
  Property sheet declaring the mime type of the OOo version and a
  processing status message.
  """
  # XXX - I think that most of these properties are useless because
  # they already exist or should be implemented in a different way

  _properties = (
    # Please check if this property is already defined in Zope File class -
    # I think it is called content_type
    dict(id='mime_type',
         description='mime type of OOo version',
         type='string',
         mode=''),
    dict(id='external_processing_status_message',
         description='message about status',
         type='string',
         mode='w'),
  )


# vim: shiftwidth=2