From 731b9498e169be3c4cd24885eccf428848746298 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bartek=20G=C3=B3rny?= <bartek@gorny.edu.pl>
Date: Mon, 2 Oct 2006 14:25:02 +0000
Subject: [PATCH] scan only text/html, skip anchors, don't follow mailto

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@10487 20353a03-c40f-0410-a6d1-a30d3c3de9de
---
 product/ERP5OOo/Document/ExternalWebPage.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/product/ERP5OOo/Document/ExternalWebPage.py b/product/ERP5OOo/Document/ExternalWebPage.py
index f4916dd01d..3d3cba8659 100644
--- a/product/ERP5OOo/Document/ExternalWebPage.py
+++ b/product/ERP5OOo/Document/ExternalWebPage.py
@@ -160,7 +160,11 @@ class ExternalWebPage(ExternalDocument):
     self.urldict={}
     self._p_changed=1
 
-  def _processData(self,s):
+  def _processData(self,s, inf):
+    # since this is a web page, we don't want anything else
+    # XXX we should find another way - like this, we end up with empty draft objects
+    if (inf.getmaintype(),inf.getsubtype())!=('text','html'):
+      raise SpiderException(100,'this is %s/%s' % (inf.getmaintype(),inf.getsubtype()))
     top=self._findTopObject()
     # record my url in top object
     top.addUrl(self.getQualifiedUrl())
@@ -170,11 +174,11 @@ class ExternalWebPage(ExternalDocument):
     # first find links in text
     rx=re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
     for ref in re.findall(rx, s):
-      if ref.startswith('javascript'):
+      # eliminate anchors and specials, select internal links
+      if ref.startswith('javascript') or ref.startswith('mailto'):
         continue
-      # XXX not sure where to store those already spidered
-      # for now, the only precaution against infinite loop is recursion depth
-      # select internal links
+      ref=re.sub('#.*','',ref)
+      if ref=='':continue
       baseref='/'.join(self.getQualifiedUrl().split('/')[:-1])
       if not ref.startswith('http'):
         # complete relative paths
-- 
2.30.9