From cee5922b4e057dd98f35828fb433161473b47e3c Mon Sep 17 00:00:00 2001 From: Fabien Morin <fabien@nexedi.com> Date: Tue, 7 Apr 2009 13:28:47 +0000 Subject: [PATCH] simplify Base_showFoundText to make it independent of the erp5_dms bt. Copy the DocumentExtraction extension from erp5_dms to here because it is used by Base_showFoundText. DocumentExtraction should be rewritten and refactored. Copy it here temporarily so that it can be used before it is rewritten. git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@26337 20353a03-c40f-0410-a6d1-a30d3c3de9de --- .../DocumentExtraction.py | 127 ++++++++++++++++++ .../erp5_core/Base_getExcerptText.xml | 31 +++++ .../erp5_core/Base_showFoundText.xml | 72 +--------- product/ERP5/bootstrap/erp5_core/bt/revision | 2 +- .../erp5_core/bt/template_extension_id_list | 3 +- 5 files changed, 168 insertions(+), 67 deletions(-) create mode 100644 product/ERP5/bootstrap/erp5_core/ExtensionTemplateItem/DocumentExtraction.py create mode 100644 product/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_getExcerptText.xml diff --git a/product/ERP5/bootstrap/erp5_core/ExtensionTemplateItem/DocumentExtraction.py b/product/ERP5/bootstrap/erp5_core/ExtensionTemplateItem/DocumentExtraction.py new file mode 100644 index 0000000000..a3f021b194 --- /dev/null +++ b/product/ERP5/bootstrap/erp5_core/ExtensionTemplateItem/DocumentExtraction.py @@ -0,0 +1,127 @@ +############################################################################## +# +# Copyright (c) 2006-2007 Nexedi SA and Contributors. All Rights Reserved. 
+# +# WARNING: This program as such is intended to be used by professional +# programmers who take the whole responsibility of assessing all potential +# consequences resulting from its eventual inadequacies and bugs +# End users who are looking for a ready-to-use solution with commercial +# guarantees and support are strongly advised to contract a Free Software +# Service Company +# +# This program is Free Software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
#
##############################################################################
"""Build short, highlighted excerpts of a text around searched words.

The entry point is getExcerptText(); it is bound to the skin-level
ExternalMethod ``Base_getExcerptText`` (module ``DocumentExtraction``,
function ``getExcerptText``).
"""

import re
import string

# Chars we need to strip from a word before we see if it matches, and from
# the search words to eliminate boolean mode chars.
redundant_chars = '"\'.:;,-+<>()*~'

# string.maketrans only exists on Python 2, str.maketrans only on Python 3;
# both build a table that str.translate accepts on their own interpreter.
try:
  _maketrans = string.maketrans
except AttributeError:
  _maketrans = str.maketrans
tr = _maketrans(redundant_chars, ' ' * len(redundant_chars))

# Compiled once at import time instead of on every call.  The opening tag may
# carry attributes (e.g. <script type="text/javascript">).
_SCRIPT_RE = re.compile(r'<script\b[^>]*>.*?</script>',
                        re.DOTALL | re.IGNORECASE)
_HEAD_RE = re.compile(r'<head\b[^>]*>.*?</head>', re.DOTALL | re.IGNORECASE)
_TAG_RE = re.compile(r'<([^>]+)>', re.DOTALL | re.IGNORECASE)


def _normalize(word):
  """Return word lower-cased, with redundant_chars blanked and edges stripped."""
  return word.translate(tr).strip().lower()


class Done(Exception):
  """Raised by Part.push() when a part holding a match has become full."""


class Word(str):
  """An ordinary (non matching) word of an excerpt."""


class FoundWord(str):
  """A word matching the search; str() wraps it in its highlighting tags."""

  # (opening, closing) markup.  Part.add() overrides this on each instance so
  # that excerpts built with different tags never influence each other.
  tags = ('', '')

  def __str__(self):
    return self.tags[0] + self + self.tags[1]


class Part:
  """One excerpt: a window of words around one or more found words.

  ``chain`` holds the words newest-first while the part is being filled and
  is reversed into reading order once the part is complete.  ``limit`` is the
  maximum chain length: ``trail`` words before a match are kept, and every
  match extends the window by ``trail + 1``.
  """

  def __init__(self, tags, trail):
    self.chain = []
    self.limit = trail
    self.trail = trail
    self.has = False  # True once the part contains at least one FoundWord
    self.tags = tags

  def push(self, w):
    """Append the ordinary word w.

    Raises Done (after putting the chain in reading order) when the window is
    full and already contains a match; otherwise the oldest word is dropped
    so that only the last ``trail`` words are remembered.
    """
    self.chain.insert(0, Word(w))
    if len(self.chain) > self.limit:
      if self.has:
        self.chain.reverse()
        raise Done()
      self.chain.pop()

  def add(self, w):
    """Append the matching word w and widen the window after it."""
    found = FoundWord(w)
    found.tags = self.tags
    self.chain.insert(0, found)
    self.limit += self.trail + 1
    self.has = True

  def __str__(self):
    return '...%s...' % ' '.join(map(str, self.chain))


def generateParts(context, text, sw, tags, trail, maxlines):
  """Yield the Part objects found in text, at most maxlines of them.

  context  -- unused, kept for the calling convention
  text     -- sequence of words (must support len())
  sw       -- string of searched words; None or empty yields nothing
  tags     -- (opening, closing) markup put around every found word
  trail    -- number of context words kept around a found word
  maxlines -- maximum number of parts to yield
  """
  # A frozenset gives constant-time membership tests for every word of text.
  wanted = frozenset(_normalize(sw or '').split())
  if not wanted:
    return
  par = Part(tags, trail)
  i = 0
  last = len(text) - 1
  for counter, aw in enumerate(text):
    if i == maxlines:
      # A plain return ends the generator; raising StopIteration here is
      # turned into RuntimeError by PEP 479.
      return
    if _normalize(aw) in wanted:
      par.add(aw)
    else:
      try:
        par.push(aw)
      except Done:
        i += 1
        yield par
        par = Part(tags, trail)
    if counter == last and par.has:
      par.chain.reverse()
      yield par  # return the last marked part


def getExcerptText(context, txt, sw, tags, trail, maxlines):
  """
  Returns an excerpt of text found in the txt string

  The result is a list of Part objects; str(part) gives the excerpt with the
  found words wrapped in tags.  HTML markup is removed first (in case it is a
  web page - we show result without formatting), including the content of
  <script> and <head> elements.
  """
  if txt is None:
    txt = ''
  txt = str(txt)
  txt = _SCRIPT_RE.sub('', txt)
  txt = _HEAD_RE.sub('', txt)
  txt = _TAG_RE.sub('', txt)
  txt = txt.replace('-', ' - ')  # to find hyphenated occurrences
  # Very rough tokenization; split() without argument collapses any run of
  # whitespace, so no empty word uses up room in the context window.
  text = txt.split()
  return list(generateParts(context, text, sw, tags, trail, maxlines))


if __name__ == '__main__':
  sw = 'pricing priority right acting proportion'
  f = open('offer.txt')
  try:
    txt = ' '.join([l.strip() for l in f.readlines()])
  finally:
    f.close()

  # configuration

  tags = ('<b>', '</b>')
  trail = 5
  maxlines = 5
  for p in getExcerptText(None, txt, sw, tags, trail, maxlines):
    print(p)


# vim: filetype=python syntax=python shiftwidth=2
a/product/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_showFoundText.xml +++ b/product/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_showFoundText.xml @@ -61,67 +61,21 @@ containing searched words as well highlighting the searched \n words in the text itself.\n """\n -is_gadget_mode = context.REQUEST.get(\'is_gadget_mode\', 0)\n -\n -if is_gadget_mode:\n - # in gadget mode less space is available thus show less text\n - max_text_length = 100\n - max_lines = 1\n -\n -def getRandomDocumentTextExcerpt():\n - # try to get somewhat arbitrary choice of searchable attrs\n - if isinstance(document_text, str) and document_text!=\'\':\n - start = min(len(document_text) - 300, 200)\n - return \'... %s ...\' %document_text[start:start + max_text_length]\n -\n -# get search words from listbox selection\n -argument_names = (\'advanced_search_text\', \n - \'title\',\n - \'reference\',\n - \'searchabletext\', \n - \'searchabletext_any\',\n - \'searchabletext_all\', \n - \'searchabletext_phrase\',)\n -\n -if document_text is None:\n - # convert object to text (if possible)\n - if getattr(context, \'asText\', None) is not None and \\\n - getattr(context, \'hasBaseData\', None) is not None:\n - if context.hasBaseData():\n - # document is successfully converted\n - document_text = context.asText()\n - else:\n - # document not converted (due to a conversion error), return message to user\n - return context.Base_translateString(\'Document is not converted or missing content.\')\n -\n \n if selection is not None:\n params = selection.getParams()\n else:\n - params = context.portal_selections.getSelectionParamsFor(\'web_search_result_selection\')\n -\n -params = [params.get(name, \'\') for name in argument_names]\n -params = [(hasattr(par, \'sort\') and \'\'.join(par) or par) for par in params]\n -search_string = \' \'.join(params)\n + params = context.portal_selections.getSelectionParamsFor(\'search_result_selection\')\n +search_words = 
params.get(\'your_search_text\')\n \n -if search_string.strip() == \'\':\n - # listbox uses its own method, not searching\n - return getRandomDocumentTextExcerpt()\n -\n -search_argument_list = context.Base_parseSearchString(search_string)\n -search_words = search_argument_list.get(\'searchabletext\', None)\n -\n -if search_words in (\'\', None,):\n - # the searched words are empty (e.g. because we used only parameters \n - # without pure searchable text)\n - return getRandomDocumentTextExcerpt()\n +if document_text is None:\n + document_text = context.getSearchableText()\n \n -# get fragments of text containing searched words\n found_text_fragments = context.Base_getExcerptText(\n context, \\\n document_text, \\\n search_words, \\\n - tags = (\'<div style="font-weight:bold;display:inline;">\', \'</div>\'), \\\n + tags = (\'<em>\', \'</em>\'), \\\n trail = 5, \\\n maxlines = max_lines)\n result = \' \'.join(map(str, found_text_fragments))\n @@ -173,22 +127,10 @@ return result\n <string>selection</string> <string>max_lines</string> <string>max_text_length</string> - <string>_getattr_</string> - <string>context</string> - <string>is_gadget_mode</string> - <string>getRandomDocumentTextExcerpt</string> - <string>argument_names</string> <string>None</string> - <string>getattr</string> + <string>_getattr_</string> <string>params</string> - <string>append</string> - <string>$append0</string> - <string>_getiter_</string> - <string>name</string> - <string>par</string> - <string>hasattr</string> - <string>search_string</string> - <string>search_argument_list</string> + <string>context</string> <string>search_words</string> <string>found_text_fragments</string> <string>map</string> diff --git a/product/ERP5/bootstrap/erp5_core/bt/revision b/product/ERP5/bootstrap/erp5_core/bt/revision index de12c71a3f..43da71c576 100644 --- a/product/ERP5/bootstrap/erp5_core/bt/revision +++ b/product/ERP5/bootstrap/erp5_core/bt/revision @@ -1 +1 @@ -1146 \ No newline at end of file +1148 \ No 
newline at end of file diff --git a/product/ERP5/bootstrap/erp5_core/bt/template_extension_id_list b/product/ERP5/bootstrap/erp5_core/bt/template_extension_id_list index f350421735..3c722df392 100644 --- a/product/ERP5/bootstrap/erp5_core/bt/template_extension_id_list +++ b/product/ERP5/bootstrap/erp5_core/bt/template_extension_id_list @@ -1 +1,2 @@ -StandardSecurity \ No newline at end of file +StandardSecurity +DocumentExtraction \ No newline at end of file -- 2.30.9