# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2009 Nexedi SA and Contributors. All Rights Reserved.
#                    Jean-Paul Smets-Solanes <jp@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs.
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company.
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

from zope.interface import Interface

class ICrawlable(Interface):
  """
  Crawlable interface specification

  Documents which implement ICrawlable can be crawled by extracting
  the URLs they refer to, and can be processed by an ERP5 crawler
  such as the ContributionTool.
  """

  def crawlContent():
    """
    Initialises the crawling process from the current document.
    The crawling process is delegated to an ERP5 crawler such
    as the ContributionTool.
    """

  def getContentURLList():
    """
    Returns a list of URLs which the current document refers to.
    URLs are returned as is (i.e. relative or absolute, with or
    without the server part).
    """

  def getContentBaseURL():
    """
    Returns the content base URL, derived from the actual content
    or from any other information (e.g. a URL property, system
    preferences, etc.). This information can be used to generate
    normalised URLs.
    """

  def getContentNormalisedURLList():
    """
    Returns a list of URLs which the current document refers to.
    URLs are returned in a normalised way, including server, port
    and absolute path.
    """

  def isIndexContent(container=None, content=None):
    """
    Returns True if the content document acts as an index
    to other documents. Returns False if the content document
    contains relevant content for the end user.

    This method is used by ERP5 crawlers to distinguish between
    URLs which return an index (e.g. the list of emails in a mailing
    list archive) and true content (e.g. the content of an individual
    email in a mailing list archive).

    Either container or content must be None.

    container -- a container document to which the calculation of
                 isIndexContent is delegated; defaults to the
                 parent document

    content -- the content document to assess; defaults to self

    NOTE: Crawlable Documents and External Sources currently
    use the same isIndexContent method, which is unified here
    but with a different signature. This is probably inconsistent
    and the interface should be revised.
    """