# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
#                    Nicolas Delaby <nicolas@erp5.org>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs.
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company.
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################

import unittest
from Products.ERP5Type.tests.ERP5TypeTestCase import ERP5TypeTestCase,\
     _getConversionServerDict
import urlnorm # imported only so the test fails early if the urlnorm
               # library is not available in the Python environment

import transaction

# Regular expressions used by the System Preference below to extract
# reference, language and version from crawled document file names and
# references
FILENAME_REGULAR_EXPRESSION = "(?P<reference>[A-Z&é@{]{3,7})-(?P<language>[a-z]{2})-(?P<version>[0-9]{3})"
REFERENCE_REGULAR_EXPRESSION = "(?P<reference>[A-Z&é@{]{3,7})(-(?P<language>[a-z]{2}))?(-(?P<version>[0-9]{3}))?"
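# For illustration only: a hypothetical file name such as "ABC-en-001"
# (not used by the tests below) would be split by
# FILENAME_REGULAR_EXPRESSION into reference="ABC", language="en" and
# version="001".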

class TestWebCrawler(ERP5TypeTestCase):
  """
    Test Crawling mechanism
  """

  _path_to_delete_list = []
  system_pref_id = 'my_preference'

  def getTitle(self):
    """
      Return the title of the current test set.
    """
    return "ERP5 Live DMS - Web Crawling"

  def getBusinessTemplateList(self):
    """
      Return the list of required business templates.
    """
    return ('erp5_base',
            'erp5_ingestion',
            'erp5_ingestion_mysql_innodb_catalog',
            'erp5_web',
            'erp5_dms')

  def afterSetUp(self):
    """
      Initialize the ERP5 site.
    """
    self.login()
    self.portal = self.getPortal()
    self.setSystemPreference()
    self.bootstrapWebSite()
    transaction.commit()
    self.tic()

  def beforeTearDown(self):
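    """Delete every document created by the test and unindex it."""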
    portal = self.portal
    module_id_list = [
      'web_page_module',
      'web_site_module',
      'external_source_module',
      'document_module',
      ]
    # delete documents created by the test
    for module_id in module_id_list:
      module = portal[module_id]
      module.manage_delObjects(list(module.objectIds()))
    # Unindex deleted documents
    transaction.commit()
    self.tic()

  def setSystemPreference(self):
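    """Create (or reuse) a global System Preference configuring the
    conversion server and the file name / reference regular expressions
    used by the crawler.
    """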
    portal_preferences = self.portal.portal_preferences
    system_preference = portal_preferences._getOb(self.system_pref_id, None)
    if system_preference is None:
      system_preference = portal_preferences.newContent(id=self.system_pref_id,
                                               portal_type='System Preference')
    conversion_dict = _getConversionServerDict()
    system_preference.\
                   setPreferredOoodocServerAddress(conversion_dict['hostname'])
    system_preference.\
                    setPreferredOoodocServerPortNumber(conversion_dict['port'])
    system_preference.setPreferredDocumentFilenameRegularExpression(
                                                   FILENAME_REGULAR_EXPRESSION)
    system_preference.setPreferredDocumentReferenceRegularExpression(
                                                  REFERENCE_REGULAR_EXPRESSION)
    if system_preference.getPreferenceState() != 'global':
      system_preference.enable()


  def bootstrapWebSite(self):
    """Create 1 Website
    live_test_web_site/section1/section1a
                      /section2
    create 2 web pages
      W-REFERENCE.PAGE
      W-REFERENCE.HOMEPAGE

    the website use light version of erp5_web_layout
    It keep just displaying sections and subsection
    And default Web page
    """
    web_site_portal_type = 'Web Site'
    web_section_portal_type = 'Web Section'
    web_page_portal_type = 'Web Page'
    web_site_module = self.portal.getDefaultModule(web_site_portal_type)
    web_page_module = self.portal.getDefaultModule(web_page_portal_type)

    text_content = """<p><a href="W-REFERENCE.PAGE">Page</a></p>"""
    web_page_id = 'live_test_home'
    home_page = web_page_module.newContent(portal_type=web_page_portal_type,
                                          title='Home Page',
                                          text_content=text_content,
                                          reference='W-REFERENCE.HOMEPAGE',
                                          version='001',
                                          language='en',
                                          id=web_page_id)
    home_page.submit()
    home_page.publish()

    web_site_id = 'live_test_web_site'
    web_site = web_site_module.newContent(portal_type=web_site_portal_type,
                      id=web_site_id,
                      title='Live Test Web Site',
                      visible=True,
                      default_page_displayed=True,
                      site_map_section_parent=True,
                      authorization_forced=True,
                      aggregate_value=home_page,
                      available_language_set=['en'],
                      container_layout='erp5_web_layout_test',
                      content_layout='erp5_web_content_layout_test')
    web_site.publish()

    text_content = """<p>
    <a href="%s/W-REFERENCE.HOMEPAGE">absolute link to HOME PAGE</a>
    </p>""" % web_site.absolute_url()
    section1a_page = web_page_module.newContent(
                                              portal_type=web_page_portal_type,
                                              title='Home Page',
                                              text_content=text_content,
                                              reference='W-REFERENCE.PAGE',
                                              version='001',
                                              language='en')
    section1a_page.submit()
    section1a_page.publish()
    web_section1 = web_site.newContent(portal_type=web_section_portal_type,
                                      title='Section 1',
                                      id='section1',
                                      aggregate_value=section1a_page)
    web_section2 = web_site.newContent(portal_type=web_section_portal_type,
                                      title='Section 2',
                                      id='section2',
                                      aggregate_value=section1a_page)
    web_section1a = web_section1.newContent(
                                          portal_type=web_section_portal_type,
                                          title='Section 1a',
                                          id='section 1a', # id intentionally contains a space
                                          aggregate_value=section1a_page)

  def test_01_check_URLTransformations(self):
    """Check crawlable functionalities regarding URL handling

    getContentBaseURL
    asNormalisedURL
    getContentNormalisedURLList
    """
    web_page_portal_type = 'Web Page'
    web_page_module = self.portal.getDefaultModule(web_page_portal_type)
    web_page = web_page_module.newContent(portal_type=web_page_portal_type)
    self.assertEquals(web_page.getContentBaseURL(), '')
    web_page.fromURL('http://www.example.com')
    self.assertEquals(web_page.getContentBaseURL(), 'http://www.example.com')
    web_page.fromURL('http://www.example.com/section/sub_section')
    self.assertEquals(web_page.getContentBaseURL(),
                      'http://www.example.com/section')
    text_content = """<html>
    <head>
      <base href="http://www.example.com"/>
    </head>
    <body>
      <p><a href="http://www.notexample.com/">External link</a></p>
      <p><a href="http://www.example.com//I don't care I put what/ I want/">
          Funny link</a></p>
      <p><a href="http://www.example.com/section">Internal link</a></p>
      <p><a href="section2">Relative Internal link</a></p>
      <p><a href="http://www.example.com/?title=%E9+crit">With Encoding issue
      This link will be discarded</a></p>
      <img src="my_image_link"/>
      <script src="should_not_be_followed.js"/>
      <p><a href="http://http://www.example.com/section">Not a link</a></p>
    </body>
    </html>"""
    web_page.edit(text_content=text_content)
    self.assertEquals(web_page.getContentBaseURL(), "http://www.example.com")
    self.assertEquals(web_page.getContentNormalisedURLList(),
                    ["http://www.example.com/I%20don't%20care%20I%20put%20what/%20I%20want/",
                     'http://www.example.com/section',
                     'http://www.example.com/section2',])
    # relative links without base tag
    text_content = """<html>
    <head>
    </head>
    <body>
      <p><a href="section2">Relative Internal link</a></p>
    </body>
    </html>"""
    web_page.edit(text_content=text_content)
    web_page.fromURL('http://www.example.com/#fffff')
    self.assertEquals(web_page.getContentBaseURL(), "http://www.example.com")
    self.assertEquals(web_page.getContentNormalisedURLList(),
                      ['http://www.example.com/section2',])
    self.assertEquals(web_page.asNormalisedURL(),
                      'http://www.example.com/#fffff')

  def test_02_crawlWebSite(self):
    """Call portal_contribution to crawl website hosted by itself.
    """
    web_site = self.portal.web_site_module.live_test_web_site
    external_source_portal_type = 'URL Crawler'
    web_crawler_module = self.portal.getDefaultModule(
                                                   external_source_portal_type)
    web_crawler = web_crawler_module.newContent(
                                       portal_type=external_source_portal_type,
                                       crawling_depth=5)
    web_crawler.fromURL(web_site.absolute_url())
    transaction.commit()
    self.tic()
    web_crawler.crawlContent()
    transaction.commit()
    self.tic()

    # 6 documents are expected:
    #     1 Web Site
    #   + 3 Web Sections
    #   + 1 web page reached through an absolute link to the home page
    #   + 1 web page reached through a relative link from the home page
    self.assertEquals(len(web_crawler), 6)
    self.assertEquals(len(self.portal.portal_url_registry._getMappingDict()),
                      6)
    date_before = web_crawler.getModificationDate()
    web_crawler.crawlContent()
    transaction.commit()
    self.tic()
    # Nothing happens: portal_url_registry prevents crawling
    # the same URL twice
    self.assertEquals(len(web_crawler), 6)
    self.assertEquals(len(self.portal.portal_url_registry._getMappingDict()),
                      6)
    # not modified
    self.assertEquals(date_before, web_crawler.getModificationDate())

    new_web_crawler = web_crawler_module.newContent(
                                       portal_type=external_source_portal_type,
                                       crawling_depth=5)
    new_web_crawler.fromURL(web_site.absolute_url())
    transaction.commit()
    self.tic()
    new_web_crawler.crawlContent()
    transaction.commit()
    self.tic()
    # check that portal_url_registry blocks contribution
    # of already crawled content
    self.assertFalse(len(new_web_crawler))

    # set another ingestion namespace on the preference
    preference = self.portal.portal_preferences[self.system_pref_id]
    preference.setPreferredIngestionNamespace('NEW')
    transaction.commit()
    self.tic()
    new_web_crawler.crawlContent()
    transaction.commit()
    self.tic()
    self.assertEquals(len(web_crawler), 6)


def test_suite():
  suite = unittest.TestSuite()
  suite.addTest(unittest.makeSuite(TestWebCrawler))
  return suite