From 731b9498e169be3c4cd24885eccf428848746298 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bartek=20G=C3=B3rny?= <bartek@gorny.edu.pl>
Date: Mon, 2 Oct 2006 14:25:02 +0000
Subject: [PATCH] scan only text/html, skip anchors, don't follow mailto

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@10487 20353a03-c40f-0410-a6d1-a30d3c3de9de
---
 product/ERP5OOo/Document/ExternalWebPage.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/product/ERP5OOo/Document/ExternalWebPage.py b/product/ERP5OOo/Document/ExternalWebPage.py
index f4916dd01d..3d3cba8659 100644
--- a/product/ERP5OOo/Document/ExternalWebPage.py
+++ b/product/ERP5OOo/Document/ExternalWebPage.py
@@ -160,7 +160,11 @@ class ExternalWebPage(ExternalDocument):
     self.urldict={}
     self._p_changed=1
 
-  def _processData(self,s):
+  def _processData(self,s, inf):
+    # this document represents a web page, so reject any content that is not text/html
+    # XXX we should find another way - raising here leaves us with empty draft objects
+    if (inf.getmaintype(),inf.getsubtype())!=('text','html'):
+      raise SpiderException(100,'this is %s/%s' % (inf.getmaintype(),inf.getsubtype()))
     top=self._findTopObject()
     # record my url in top object
     top.addUrl(self.getQualifiedUrl())
@@ -170,11 +174,11 @@ class ExternalWebPage(ExternalDocument):
       # first find links in text
       rx=re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
       for ref in re.findall(rx, s):
-        if ref.startswith('javascript'):
+        # skip javascript:/mailto: pseudo-links, strip anchors, then select internal links
+        if ref.startswith('javascript') or ref.startswith('mailto'):
           continue
-        # XXX not sure where to store those already spidered
-        # for now, the only precaution against infinite loop is recursion depth
-        # select internal links
+        ref=re.sub('#.*','',ref)
+        if ref=='':continue
         baseref='/'.join(self.getQualifiedUrl().split('/')[:-1])
         if not ref.startswith('http'):
           # complete relative paths
-- 
2.30.9