Commit e9821d76 authored by Jean-Paul Smets's avatar Jean-Paul Smets

Code review and refactoring based on Document API.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@13628 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent ed729538
############################################################################## ##############################################################################
# #
# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved. # Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
...@@ -31,23 +30,20 @@ from Products.CMFCore.WorkflowCore import WorkflowMethod ...@@ -31,23 +30,20 @@ from Products.CMFCore.WorkflowCore import WorkflowMethod
from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
from Products.ERP5Type.Cache import CachingMethod from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5.Document.Image import Image from Products.ERP5.Document.Image import Image
from Products.ERP5.Document.File import File, stripHtml
from Products.ERP5.Document.Document import ConversionCacheMixin from Products.ERP5.Document.Document import ConversionCacheMixin
from Products.CMFCore.utils import getToolByName from Products.CMFCore.utils import getToolByName
from zLOG import LOG from zLOG import LOG
import tempfile, os, glob, zipfile, cStringIO, re import tempfile, os, cStringIO
class PDFDocument(File, ConversionCacheMixin): class PDFDocument(Image, ConversionCacheMixin):
""" """
PdfDocument - same as file, but has its own getSearchableText method PDFDocument is a subclass of Image which is able to
(converts via pdftotext) extract text content from a PDF file either as text
in effect it has two separate caches - from CachingMixin for txt and html or as HTML.
and for image formats from Image
""" """
# CMF Type Definition # CMF Type Definition
meta_type = 'ERP5 PDF' meta_type = 'ERP5 PDF Document'
portal_type = 'PDF' portal_type = 'PDF'
isPortalContent = 1 isPortalContent = 1
isRADContent = 1 isRADContent = 1
...@@ -58,17 +54,20 @@ class PDFDocument(File, ConversionCacheMixin): ...@@ -58,17 +54,20 @@ class PDFDocument(File, ConversionCacheMixin):
# Default Properties # Default Properties
property_sheets = ( PropertySheet.Base property_sheets = ( PropertySheet.Base
, PropertySheet.XMLObject
, PropertySheet.CategoryCore , PropertySheet.CategoryCore
, PropertySheet.DublinCore , PropertySheet.DublinCore
, PropertySheet.Version , PropertySheet.Version
, PropertySheet.Reference , PropertySheet.Reference
, PropertySheet.Document , PropertySheet.Document
, PropertySheet.TextDocument
, PropertySheet.Data , PropertySheet.Data
, PropertySheet.ExternalDocument
, PropertySheet.Url
, PropertySheet.Periodicity
) )
security.declareProtected(Permissions.View, 'index_html')
def index_html(self, REQUEST, RESPONSE, format=None, force=0): def index_html(self, REQUEST, RESPONSE, display=None, format='', quality=75, resolution=None):
""" """
Returns data in the appropriate format (graphical) Returns data in the appropriate format (graphical)
it is always a zip because multi-page pdfs are converted into a zip it is always a zip because multi-page pdfs are converted into a zip
...@@ -77,111 +76,63 @@ class PDFDocument(File, ConversionCacheMixin): ...@@ -77,111 +76,63 @@ class PDFDocument(File, ConversionCacheMixin):
if format is None: if format is None:
RESPONSE.setHeader('Content-Type', 'application/pdf') RESPONSE.setHeader('Content-Type', 'application/pdf')
return self._unpackData(self.data) return self._unpackData(self.data)
if format in ('html', 'txt', 'text'):
mime, data = self.convert(format)
RESPONSE.setHeader('Content-Length', len(data))
RESPONSE.setHeader('Content-Type', '%s;charset=UTF-8' % mime)
RESPONSE.setHeader('Accept-Ranges', 'bytes')
return data
return Image.index_html(self, REQUEST, RESPONSE, display=display,
format=format, quality=quality, resolution=resolution)
# Conversion API
security.declareProtected(Permissions.ModifyPortalContent, 'convert')
def convert(self, format, **kw):
"""
Implementation of conversion for PDF files
"""
if format == 'html': if format == 'html':
RESPONSE.setHeader('Content-Type', 'text/html;charset=UTF-8') if not self.hasConversion(format=format):
return self.getHtmlRepresentation(force) data = self._convertToHTML()
if format == 'txt': self.setConversion(data, mime='text/html', format=format)
RESPONSE.setHeader('Content-Type', 'text/plain;charset=UTF-8') return self.getConversion(format=format)
self._convertToText(force) elif format in ('txt', 'text'):
return self.getTextContent() if not self.hasConversion(format='txt'):
mime = 'image/'+format.lower() data = self._convertToText()
if force or not self.hasConversion(format = format): self.setConversion(data, mime='text/plain', format='txt')
self.setConversion(self._makeFile(format), 'application/zip', format=format) return self.getConversion(format=format)
RESPONSE.setHeader('Content-Type', 'application/zip')
return self.getConversion(format = format)
def _makeFile(self,format):
tempfile.tempdir = os.path.join(os.getenv('INSTANCE_HOME'), 'tmp')
os.putenv('TMPDIR', '/tmp') # because if we run zope as root, we have /root/tmp here and convert goes crazy
if not os.path.exists(tempfile.tempdir):
os.mkdir(tempfile.tempdir, 0775)
fr = tempfile.mktemp(suffix='.pdf')
to = tempfile.mktemp(suffix = '.' + format)
file_fr = open(fr, 'w')
file_fr.write(self._unpackData(self.data))
file_fr.close()
cmd = 'convert %s %s' % (fr, to)
os.system(cmd)
# pack it
f = cStringIO.StringIO()
z = zipfile.ZipFile(f, 'a')
for fname in glob.glob(to.replace('.', '*')):
base = os.path.basename(fname)
pg = re.match('.*?(\d*)\.'+format, base).groups()
if pg:
pg = pg[0]
arcname = '%s/page-%s.%s' % (format, pg, format)
else: else:
arcname = base return Image.convert(self, format, **kw)
z.write(fname, arcname)
z.close()
f.seek(0)
return f.read()
searchable_property_list = File.searchable_property_list + ('text_content',) security.declareProtected(Permissions.ModifyPortalContent, 'populateContent')
def populateContent(self):
### Content indexing methods
security.declareProtected(Permissions.View, 'getSearchableText')
def getSearchableText(self, md=None, force=0):
""" """
Used by the catalog for basic full text indexing Convert each page to an Image and populate the
conditionally convert pdf to text PDF directory with converted images. May be useful
to provide online PDF reader
""" """
self._convertToText(force) raise NotImplementedError
return File.getSearchableText(self, md)
security.declarePrivate('_convertToText') security.declarePrivate('_convertToText')
def _convertToText(self, force): def _convertToText(self):
""" """
Private implementation method. Convert the PDF text content to text with pdftotext
If we don't have txt cache or we are forced to convert, we try to do it
using system pdftotext utility. We set the result as text_content property.
We mark it in cache as done, even if we fail, so we don't keep trying if it
doesn't work.
""" """
if hasattr(self, 'data') and (force == 1 or not self.hasConversion(format = 'txt')):
# XXX-JPS accessing attribute data is bad
self.log('PdfDocument', 'regenerating txt')
try:
try:
tmp = tempfile.NamedTemporaryFile() tmp = tempfile.NamedTemporaryFile()
tmp.write(self._unpackData(self.data)) tmp.write(self._unpackData(self.data))
tmp.seek(0) tmp.seek(0)
cmd = 'pdftotext -layout -enc UTF-8 -nopgbrk %s -' % tmp.name cmd = 'pdftotext -layout -enc UTF-8 -nopgbrk %s -' % tmp.name
r = os.popen(cmd) r = os.popen(cmd)
self.setTextContent(r.read().replace('\n', ' ')) h = r.read()
tmp.close() tmp.close()
r.close() r.close()
except Exception, e: return h
self.log(str(e))
msg = 'Conversion to text failed: ' + str(e)
else:
msg = 'Converted to text'
finally:
self.processFile(comment=msg)
# we don't need to store it twice, just mark we have it (or rather we already tried)
# we try only once
self.setConversion('empty', format = 'txt')
SearchableText=getSearchableText security.declarePrivate('_convertToHTML')
def _convertToHTML(self):
security.declarePrivate('_convertToBase') """
def _convertToBase(self): Convert the PDF text content to HTML with pdftohtml
self._convertToText(force=1) """
security.declareProtected(Permissions.View, 'getHtmlRepresentation')
def getHtmlRepresentation(self, force=0):
'''
get simplified html version to display
If we fail to convert, we set workflow message and put error message
as html preview so that the user knows what's going on
'''
portal_workflow = getToolByName(self, 'portal_workflow')
if not hasattr(self, 'data'):
return 'no data'
if force==1 or not self.hasConversion(format = 'html'):
try:
self.log('PDF', 'regenerating html')
tmp = tempfile.NamedTemporaryFile() tmp = tempfile.NamedTemporaryFile()
tmp.write(self._unpackData(self.data)) tmp.write(self._unpackData(self.data))
tmp.seek(0) tmp.seek(0)
...@@ -190,13 +141,27 @@ class PDFDocument(File, ConversionCacheMixin): ...@@ -190,13 +141,27 @@ class PDFDocument(File, ConversionCacheMixin):
h = r.read() h = r.read()
tmp.close() tmp.close()
r.close() r.close()
h = stripHtml(h) h = h.replace('<BODY bgcolor="#A0A0A0"', '<BODY ') # Quick hack to remove bg color - XXX
except Exception, e: return h
msg = 'Could not convert to html: ' + str(e)
h = msg
portal_workflow.doActionFor(self, 'process', comment=msg)
self.setConversion(h, format = 'html')
return self.getConversion(format = 'html')[1]
# vim: syntax=python shiftwidth=2
security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
def getContentInformation(self):
"""
Returns the information about the PDF document with
pdfinfo.
"""
tmp = tempfile.NamedTemporaryFile()
tmp.write(self._unpackData(self.data))
tmp.seek(0)
cmd = 'pdfinfo -meta -box %s' % tmp.name
r = os.popen(cmd)
h = r.read()
tmp.close()
r.close()
result = {}
for line in h.splitlines():
item_list = line.split(':')
key = item_list[0].strip()
value = ':'.join(item_list[1:]).strip()
result[key] = value
return result
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment