OOoUtils.py 14.6 KB
Newer Older
Kevin Deldycke's avatar
Kevin Deldycke committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
##############################################################################
#
# Copyright (c) 2003-2005 Nexedi SARL and Contributors. All Rights Reserved.
#                         Kevin DELDYCKE    <kevin@nexedi.com>
#                         Guillaume MICHON  <guillaume@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs.
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company.
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

from Products.PythonScripts.Utility import allow_class
from ZPublisher.HTTPRequest import FileUpload
from xml.dom.ext.reader import PyExpat
from xml.dom import Node
from AccessControl import ClassSecurityInfo
35 36 37
from Globals import InitializeClass, get_request
from zipfile import ZipFile, ZIP_DEFLATED
from StringIO import StringIO
Kevin Deldycke's avatar
Kevin Deldycke committed
38 39
from zLOG import LOG
import imghdr
40
import random
Kevin Deldycke's avatar
Kevin Deldycke committed
41 42 43 44 45



class CorruptedOOoFile(Exception):
  """
  Raised when a file cannot be opened as a zip archive or fails the zip
  integrity test while loading an OpenOffice document.
  """

46 47 48 49 50 51 52 53 54 55 56 57
# Maps an OpenOffice 1.x file extension to the MIME type used for the
# 'Content-type' header when a document is rendered.
OOo_mimeType_dict = {
  # Writer
  'sxw': 'application/vnd.sun.xml.writer',
  'stw': 'application/vnd.sun.xml.writer.template',
  'sxg': 'application/vnd.sun.xml.writer.global',
  # Calc
  'sxc': 'application/vnd.sun.xml.calc',
  'stc': 'application/vnd.sun.xml.calc.template',
  # Impress
  'sxi': 'application/vnd.sun.xml.impress',
  'sti': 'application/vnd.sun.xml.impress.template',
  # Draw
  'sxd': 'application/vnd.sun.xml.draw',
  'std': 'application/vnd.sun.xml.draw.template',
  # Math
  'sxm': 'application/vnd.sun.xml.math',
}
Kevin Deldycke's avatar
Kevin Deldycke committed
58

59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
class OOoBuilder:
  """
  Tool that allows to reinject new files in a ZODB OOo document.

  The document content is copied into an in-memory buffer when the builder
  is created; replace() and render() only work on that buffer.
  """
  # Declarative security
  security = ClassSecurityInfo()

  security.declarePrivate('__init__')
  def __init__(self, document):
    """
    Keep a seekable in-memory copy of the zipped document.

    document -- object whose `data` attribute holds the raw content of
                the zipped OOo file
    """
    self._document = StringIO(document.data)

  security.declarePublic('replace')
  def replace(self, filename, stream):
    """
    Replaces the content of filename by stream in the archive.
    Creates a new file if filename was not already there.

    filename -- name of the archive member to write
    stream   -- content written under that name

    NOTE(review): the archive is opened in append mode, so a member that
    already exists is presumably shadowed by a newly appended entry rather
    than physically removed -- confirm against the zipfile documentation.
    """
    try:
      zf = ZipFile(self._document, mode='a', compression=ZIP_DEFLATED)
    except RuntimeError:
      # Compression could not be set up: fall back to the default
      # (stored) mode instead of failing
      zf = ZipFile(self._document, mode='a')
    try:
      zf.writestr(filename, stream)
    finally:
      # Always close, even when the write fails: closing is what finalizes
      # the archive held in the buffer
      zf.close()

  security.declarePublic('render')
  def render(self, name='', extension='sxw'):
    """
    returns the OOo document

    Sets the 'Content-type' header of the current response from extension
    (unknown extensions fall back to the writer MIME type) and, when name
    is given, a 'Content-Disposition' header so that the document is
    delivered as an attachment called "<name>.<extension>".
    """
    response = get_request().response
    mime_type = OOo_mimeType_dict.get(extension, 'application/vnd.sun.xml.writer')
    response.setHeader('Content-type', mime_type)
    if name:
      response.setHeader('Content-Disposition', 'attachment; filename=%s.%s' % (name, extension))
    # Rewind first: earlier writes leave the buffer position at the end
    self._document.seek(0)
    return self._document.read()
    
# Register OOoBuilder with the framework. Presumably InitializeClass applies
# the declarative security set up in the class body and allow_class makes the
# class usable from restricted Python Scripts -- TODO confirm against the
# Zope documentation.
InitializeClass(OOoBuilder)
allow_class(OOoBuilder)
Kevin Deldycke's avatar
Kevin Deldycke committed
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116

class OOoParser:
  """
    General purpose tools to parse and handle OpenOffice v1.x documents.
  """


  # Declarative security
  security = ClassSecurityInfo()


  security.declarePrivate('__init__')
  def __init__(self):
    """
      Create an empty parser. No document is loaded until openFile()
      is called.
    """
    # PyExpat reader used to turn XML strings into DOM trees
    self.reader = PyExpat.Reader()

    # Name of the loaded file, set by openFile()
    self.filename = None

    # Raw content of every archive member, keyed by member name
    self.oo_files = {}
    # Image members of the archive, filled on demand by getPicturesMapping()
    self.pictures = {}
    # Namespace prefix -> URI table, filled by openFile()
    self.ns = {}

    # DOM trees of content.xml and styles.xml, set by openFile()
    self.oo_content_dom = None
    self.oo_styles_dom = None
Kevin Deldycke's avatar
Kevin Deldycke committed
118 119


120
  security.declareProtected('Import/Export objects', 'openFile')
121
  def openFile(self, file_descriptor):
Kevin Deldycke's avatar
Kevin Deldycke committed
122 123 124 125 126
    """
      Load all files in the zipped OpenOffice document
    """
    # Try to unzip the Open Office doc
    try:
127
      oo_unzipped = ZipFile(file_descriptor, mode="r")
Kevin Deldycke's avatar
Kevin Deldycke committed
128 129 130 131 132 133
    except:
      raise CorruptedOOoFile
    # Test the integrity of the file
    if oo_unzipped.testzip() != None:
      raise CorruptedOOoFile

Kevin Deldycke's avatar
Kevin Deldycke committed
134 135 136
    # Get the filename
    self.filename = file_descriptor.filename

Kevin Deldycke's avatar
Kevin Deldycke committed
137 138 139
    # List and load the content of the zip file
    for name in oo_unzipped.namelist():
      self.oo_files[name] = oo_unzipped.read(name)
140
    oo_unzipped.close()
Kevin Deldycke's avatar
Kevin Deldycke committed
141 142 143 144 145 146 147 148 149 150 151 152 153 154

    # Get the main content and style definitions
    self.oo_content_dom = self.reader.fromString(self.oo_files["content.xml"])
    self.oo_styles_dom  = self.reader.fromString(self.oo_files["styles.xml"])

    # Create a namespace table
    doc_ns = self.oo_styles_dom.getElementsByTagName("office:document-styles")
    for i in range(doc_ns[0].attributes.length):
        if doc_ns[0].attributes.item(i).nodeType == Node.ATTRIBUTE_NODE:
            name = doc_ns[0].attributes.item(i).name
            if name[:5] == "xmlns":
                self.ns[name[6:]] = doc_ns[0].attributes.item(i).value


Kevin Deldycke's avatar
Kevin Deldycke committed
155 156 157 158 159 160 161 162
  security.declarePublic('getFilename')
  def getFilename(self):
    """
      Return the name of the OpenOffice file
    """
    return self.filename


163 164
  security.declarePublic('getPicturesMapping')
  def getPicturesMapping(self):
Kevin Deldycke's avatar
Kevin Deldycke committed
165 166 167 168 169 170 171 172 173 174 175 176
    """
      Return a dictionnary of all pictures in the document
    """
    if len(self.pictures) <= 0:
      for file_name in self.oo_files:
        raw_data = self.oo_files[file_name]
        pict_type = imghdr.what(None, raw_data)
        if pict_type != None:
          self.pictures[file_name] = raw_data
    return self.pictures


177 178
  security.declarePublic('getContentDom')
  def getContentDom(self):
Kevin Deldycke's avatar
Kevin Deldycke committed
179 180 181 182 183 184
    """
      Return the DOM tree of the main OpenOffice content
    """
    return self.oo_content_dom


185 186
  security.declarePublic('getSpreadsheetsDom')
  def getSpreadsheetsDom(self, include_embedded=False):
187 188 189 190
    """
      Return a list of DOM tree spreadsheets (optionnaly included embedded ones)
    """
    spreadsheets = []
191
    spreadsheets = self.getPlainSpreadsheetsDom()
192
    if include_embedded == True:
193
      spreadsheets += self.getEmbeddedSpreadsheetsDom()
194 195 196
    return spreadsheets


197
  security.declarePublic('getSpreadsheetsMapping')
198
  def getSpreadsheetsMapping(self, include_embedded=False, no_empty_lines=False, normalize=True):
199 200 201
    """
      Return a list of table-like spreadsheets (optionnaly included embedded ones)
    """
202
    tables = {}
203
    tables = self.getPlainSpreadsheetsMapping(no_empty_lines, normalize)
204
    if include_embedded == True:
205
      embedded_tables = self.getEmbeddedSpreadsheetsMapping(no_empty_lines, normalize)
206 207
      tables = self._getTableListUnion(tables, embedded_tables)
    return tables
208 209


210 211
  security.declarePublic('getPlainSpreadsheetsDom')
  def getPlainSpreadsheetsDom(self):
212 213 214 215 216 217 218 219 220 221
    """
      Retrieve every spreadsheets from the document and get they DOM tree
    """
    spreadsheets = []
    # List all spreadsheets
    for table in self.oo_content_dom.getElementsByTagName("table:table"):
      spreadsheets.append(table)
    return spreadsheets


222
  security.declarePublic('getPlainSpreadsheetsMapping')
223
  def getPlainSpreadsheetsMapping(self, no_empty_lines=False, normalize=True):
224 225 226
    """
      Return a list of plain spreadsheets from the document and transform them as table
    """
227
    tables = {}
228
    for spreadsheet in self.getPlainSpreadsheetsDom():
229
      new_table = self.getSpreadsheetMapping(spreadsheet, no_empty_lines, normalize)
230
      if new_table != None:
231
        tables = self._getTableListUnion(tables, new_table)
232 233 234
    return tables


235 236
  security.declarePublic('getEmbeddedSpreadsheetsDom')
  def getEmbeddedSpreadsheetsDom(self):
Kevin Deldycke's avatar
Kevin Deldycke committed
237 238 239 240 241 242 243 244 245 246 247
    """
      Return a list of existing embedded spreadsheets in the file as DOM tree
    """
    spreadsheets = []
    # List all embedded spreadsheets
    emb_objects = self.oo_content_dom.getElementsByTagName("draw:object")
    for embedded in emb_objects:
      document = embedded.getAttributeNS(self.ns["xlink"], "href")
      if document:
        try:
          object_content = self.reader.fromString(self.oo_files[document[3:] + '/content.xml'])
248 249
          for table in object_content.getElementsByTagName("table:table"):
            spreadsheets.append(table)
Kevin Deldycke's avatar
Kevin Deldycke committed
250 251 252 253 254
        except:
          pass
    return spreadsheets


255
  security.declarePublic('getEmbeddedSpreadsheetsMapping')
256
  def getEmbeddedSpreadsheetsMapping(self, no_empty_lines=False, normalize=True):
Kevin Deldycke's avatar
Kevin Deldycke committed
257
    """
258
      Return a list of embedded spreadsheets in the document as table
Kevin Deldycke's avatar
Kevin Deldycke committed
259
    """
260
    tables = {}
261
    for spreadsheet in self.getEmbeddedSpreadsheetsDom():
262
      new_table = self.getSpreadsheetMapping(spreadsheet, no_empty_lines, normalize)
Kevin Deldycke's avatar
Kevin Deldycke committed
263
      if new_table != None:
264
        tables = self._getTableListUnion(tables, new_table)
Kevin Deldycke's avatar
Kevin Deldycke committed
265 266 267
    return tables


268
  security.declarePublic('getSpreadsheetMapping')
269
  def getSpreadsheetMapping(self, spreadsheet=None, no_empty_lines=False, normalize=True):
Kevin Deldycke's avatar
Kevin Deldycke committed
270 271
    """
      This method convert an OpenOffice spreadsheet to a simple table.
272
      This code is based on the oo2pt tool (http://cvs.sourceforge.net/viewcvs.py/collective/CMFReportTool/oo2pt).
Kevin Deldycke's avatar
Kevin Deldycke committed
273
    """
274
    if spreadsheet == None or spreadsheet.nodeName != 'table:table':
Kevin Deldycke's avatar
Kevin Deldycke committed
275 276
      return None

277
    table = []
Kevin Deldycke's avatar
Kevin Deldycke committed
278

279 280 281
    # Get the table name
    table_name = spreadsheet.getAttributeNS(self.ns["table"], "name")

282 283
    # Scan table and store usable informations
    for line in spreadsheet.getElementsByTagName("table:table-row"):
284 285 286 287 288 289

      # TODO : to the same as cell about abusive repeated lines

      line_group_found = line.getAttributeNS(self.ns["table"], "number-rows-repeated")
      if not line_group_found:
        lines_to_repeat = 1
290
      else:
291
        lines_to_repeat = int(line_group_found)
292

293
      for i in range(lines_to_repeat):
294 295
        table_line = []

296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317
        # Get all cells
        cells = line.getElementsByTagName("table:table-cell")
        cell_index_range = range(len(cells))

        for cell_index in cell_index_range:
          cell = cells[cell_index]

          # If the cell as no child, cells have no content
          # And if the cell is the last of the row, we don't need to add it to the line
          # So we can go to the next line (= exit this cells loop)
          #
          # I must do this test because sometimes the following cell group
          #   can be found in OOo documents : <table:table-cell table:number-columns-repeated='246'/>
          # This is bad because it create too much irrevelent content that slow down the process
          # So it's a good idea to break the loop in this case
          if cell.childNodes.length == 0 and cell_index == cell_index_range[-1]:
            break

          # Handle cells group
          cell_group_found = cell.getAttributeNS(self.ns["table"], "number-columns-repeated")
          if not cell_group_found:
            cells_to_repeat = 1
318
          else:
319
            cells_to_repeat = int(cell_group_found)
320

321 322 323
          # Ungroup repeated cells
          for j in range(cells_to_repeat):
            # Get the cell content
324 325 326 327 328 329 330 331 332 333
            cell_text = None
            text_tags = cell.getElementsByTagName("text:p")
            for text in text_tags:
              for k in range(text.childNodes.length):
                child = text.childNodes[k]
                if child.nodeType == Node.TEXT_NODE:
                  if cell_text == None:
                    cell_text = ''
                  cell_text += child.nodeValue

334
            # Add the cell to the line
335
            table_line.append(cell_text)
336

Kevin Deldycke's avatar
Kevin Deldycke committed
337 338 339 340 341 342 343 344 345
        # Delete empty lines if needed
        if no_empty_lines:
          empty_cell = 0
          for table_cell in table_line:
            if table_cell == None:
              empty_cell += 1
          if empty_cell == len(table_line):
            table_line = None

346
        # Add the line to the table
Kevin Deldycke's avatar
Kevin Deldycke committed
347 348
        if table_line != None:
          table.append(table_line)
349

350
    # Reduce the table to the minimum
351 352 353 354 355 356 357 358 359 360
    new_table = self._getReducedTable(table)

    # Get a homogenized table
    if normalize:
      table_size = self._getTableSizeDict(new_table)
      new_table = self._getNormalizedBoundsTable( table  = new_table
                                                , width  = table_size['width']
                                                , height = table_size['height']
                                                )
    return {table_name: new_table}
Kevin Deldycke's avatar
Kevin Deldycke committed
361 362


363 364
  security.declarePrivate('_getReducedTable')
  def _getReducedTable(self, table):
Kevin Deldycke's avatar
Kevin Deldycke committed
365
    """
366
      Reduce the table to its minimum size
Kevin Deldycke's avatar
Kevin Deldycke committed
367 368 369 370 371
    """
    empty_lines = 0
    no_more_empty_lines = 0

    # Eliminate all empty cells at the ends of lines and columns
372
    # Browse the table starting from the bottom for easy empty lines count
373
    for line in range(len(table)-1, -1, -1):
Kevin Deldycke's avatar
Kevin Deldycke committed
374
      empty_cells = 0
375
      line_content = table[line]
Kevin Deldycke's avatar
Kevin Deldycke committed
376
      for cell in range(len(line_content)-1, -1, -1):
377
        if line_content[cell] in ('', None):
Kevin Deldycke's avatar
Kevin Deldycke committed
378 379 380
          empty_cells += 1
        else:
          break
381

Kevin Deldycke's avatar
Kevin Deldycke committed
382 383 384 385
      if (not no_more_empty_lines) and (empty_cells == len(line_content)):
        empty_lines += 1
      else:
        line_size = len(line_content) - empty_cells
386
        table[line] = line_content[:line_size]
Kevin Deldycke's avatar
Kevin Deldycke committed
387 388
        no_more_empty_lines = 1

389 390 391
    table_height = len(table) - empty_lines

    return table[:table_height]
Kevin Deldycke's avatar
Kevin Deldycke committed
392

393 394 395 396 397 398

  security.declarePrivate('_getTableSizeDict')
  def _getTableSizeDict(self, table):
    """
      Get table dimension as dictionnary contain both height and width
    """
Kevin Deldycke's avatar
Kevin Deldycke committed
399
    max_cols = 0
400 401 402 403
    for line_index in range(len(table)):
      line = table[line_index]
      if len(line) > max_cols:
        max_cols = len(line)
Kevin Deldycke's avatar
Kevin Deldycke committed
404

405 406 407
    return { 'width' : max_cols
           , 'height': len(table)
           }
Kevin Deldycke's avatar
Kevin Deldycke committed
408 409


410 411
  security.declarePrivate('_getNormalizedBoundsTable')
  def _getNormalizedBoundsTable(self, table, width=0, height=0):
Kevin Deldycke's avatar
Kevin Deldycke committed
412
    """
413
      Add necessary cells and lines to obtain given bounds
Kevin Deldycke's avatar
Kevin Deldycke committed
414
    """
415 416
    while height > len(table):
      table.append([])
Kevin Deldycke's avatar
Kevin Deldycke committed
417
    for line in range(height):
418 419 420 421
      while width > len(table[line]):
        table[line].append(None)
    return table

Kevin Deldycke's avatar
Kevin Deldycke committed
422

423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439
  security.declarePrivate('_getTableListUnion')
  def _getTableListUnion(self, list1, list2):
    """
      Coerce two dict containing tables structures.
      We need to use this method because a OpenOffice document can hold
        several embedded spreadsheets with the same id. This explain the
        use of random suffix in such extreme case.
    """
    for list2_key in list2.keys():
      # Generate a new table ID if needed
      new_key = list2_key
      while new_key in list1.keys():
        new_key = list2_key + '_' + str(random.randint(1000,9999))
      list1[new_key] = list2[list2_key]
    return list1


440

Kevin Deldycke's avatar
Kevin Deldycke committed
441 442
# Register OOoParser with the framework. Presumably InitializeClass applies
# the declarative security set up in the class body and allow_class makes the
# class usable from restricted Python Scripts -- TODO confirm against the
# Zope documentation.
InitializeClass(OOoParser)
allow_class(OOoParser)