[Erp5-report] r26337 - in /erp5/trunk/products/ERP5/bootstrap/erp5_core: ExtensionTemplateI...
nobody at svn.erp5.org
nobody at svn.erp5.org
Tue Apr 7 15:28:49 CEST 2009
Author: fabien
Date: Tue Apr 7 15:28:47 2009
New Revision: 26337
URL: http://svn.erp5.org?rev=26337&view=rev
Log:
Simplify Base_showFoundText so that it no longer depends on the erp5_dms bt.
Copy the DocumentExtraction extension from erp5_dms here because it is used by Base_showFoundText.
DocumentExtraction should be rewritten and refactored. It is copied here temporarily so that it can be used until it is rewritten.
Added:
erp5/trunk/products/ERP5/bootstrap/erp5_core/ExtensionTemplateItem/DocumentExtraction.py
erp5/trunk/products/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_getExcerptText.xml
Modified:
erp5/trunk/products/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_showFoundText.xml
erp5/trunk/products/ERP5/bootstrap/erp5_core/bt/revision
erp5/trunk/products/ERP5/bootstrap/erp5_core/bt/template_extension_id_list
Added: erp5/trunk/products/ERP5/bootstrap/erp5_core/ExtensionTemplateItem/DocumentExtraction.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/bootstrap/erp5_core/ExtensionTemplateItem/DocumentExtraction.py?rev=26337&view=auto
==============================================================================
--- erp5/trunk/products/ERP5/bootstrap/erp5_core/ExtensionTemplateItem/DocumentExtraction.py (added)
+++ erp5/trunk/products/ERP5/bootstrap/erp5_core/ExtensionTemplateItem/DocumentExtraction.py [utf8] Tue Apr 7 15:28:47 2009
@@ -1,0 +1,127 @@
+##############################################################################
+#
+# Copyright (c) 2006-2007 Nexedi SA and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+##############################################################################
+
+import string, re
+
+redundant_chars='"\'.:;,-+<>()*~' # chars we need to strip from a word before we see if it matches, and from the searchwords to eliminate boolean mode chars
+tr=string.maketrans(redundant_chars,' '*len(redundant_chars))
+
+class Done(Exception):
+ pass
+
+class Word(str):pass
+
+class FoundWord(str):
+
+ def __str__(self):
+ return self.tags[0]+self+self.tags[1]
+
+class Part:
+
+ def __init__(self,tags,trail):
+ self.chain=[]
+ self.limit=trail
+ self.trail=trail
+ self.has=False
+ self.tags=tags
+
+ def push(self,w):
+ self.chain.insert(0,Word(w))
+ if len(self.chain)>self.limit:
+ if self.has:
+ self.chain.reverse()
+ raise Done()
+ self.chain.pop()
+
+ def add(self,w):
+ self.chain.insert(0,FoundWord(w))
+ self.limit+=self.trail+1
+ self.has=True
+
+ def __str__(self):
+ return '...%s...' % ' '.join(map(str,self.chain))
+
+
+
+def generateParts(context,text,sw,tags,trail,maxlines):
+ par=Part(tags,trail)
+ sw=sw.translate(tr).strip().lower().split()
+ test=lambda w:w.translate(tr).strip().lower() in sw
+ i=0
+ length=len(text)
+ for counter,aw in enumerate(text):
+ if i==maxlines:
+ raise StopIteration
+ if test(aw):
+ par.add(aw)
+ else:
+ try:
+ par.push(aw)
+ except Done:
+ i+=1
+ yield par
+ par=Part(tags,trail)
+ if counter==length-1:
+ if par.has:
+ par.chain.reverse()
+ yield par # return the last marked part
+
+
+def getExcerptText(context, txt, sw, tags, trail, maxlines):
+ """
+ Returns an excerpt of text found in the txt string
+ """
+ txt = str(txt)
+ # initialize class
+ FoundWord.tags=tags
+ # strip html tags (in case it is a web page - we show result without formatting)
+ r = re.compile('<script>.*?</script>',re.DOTALL|re.IGNORECASE)
+ r = re.compile('<head>.*?</head>',re.DOTALL|re.IGNORECASE)
+ txt = re.sub(r,'',txt)
+ r = re.compile('<([^>]+)>',re.DOTALL|re.IGNORECASE)
+ txt = re.sub(r,'',txt)
+ r = re.compile('\s+')
+ txt = re.sub(r,' ',txt)
+ txt = txt.replace('-',' - ') # to find hyphenated occurrences
+ text = ' '.join(txt.split('\n')).split(' ') # very rough tokenization
+ return [p for p in generateParts(context,text,sw,tags,trail,maxlines)]
+
+
+if __name__=='__main__':
+ sw='pricing priority right acting proportion'
+ txt=' '.join([l.strip() for l in open('offer.txt').readlines()])
+
+ # configuration
+
+ tags=('<b>','</b>')
+ trail=5
+ maxlines=5
+ for p in cutFound(None,txt,sw,tags,trail,maxlines):
+ print p
+
+
+# vim: filetype=python syntax=python shiftwidth=2
Added: erp5/trunk/products/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_getExcerptText.xml
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_getExcerptText.xml?rev=26337&view=auto
==============================================================================
--- erp5/trunk/products/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_getExcerptText.xml (added)
+++ erp5/trunk/products/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_getExcerptText.xml [utf8] Tue Apr 7 15:28:47 2009
@@ -1,0 +1,31 @@
+<?xml version="1.0"?>
+<ZopeData>
+ <record id="1" aka="AAAAAAAAAAE=">
+ <pickle>
+ <tuple>
+ <global name="ExternalMethod" module="Products.ExternalMethod.ExternalMethod"/>
+ <tuple/>
+ </tuple>
+ </pickle>
+ <pickle>
+ <dictionary>
+ <item>
+ <key> <string>_function</string> </key>
+ <value> <string>getExcerptText</string> </value>
+ </item>
+ <item>
+ <key> <string>_module</string> </key>
+ <value> <string>DocumentExtraction</string> </value>
+ </item>
+ <item>
+ <key> <string>id</string> </key>
+ <value> <string>Base_getExcerptText</string> </value>
+ </item>
+ <item>
+ <key> <string>title</string> </key>
+ <value> <string></string> </value>
+ </item>
+ </dictionary>
+ </pickle>
+ </record>
+</ZopeData>
Modified: erp5/trunk/products/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_showFoundText.xml
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_showFoundText.xml?rev=26337&r1=26336&r2=26337&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_showFoundText.xml [utf8] (original)
+++ erp5/trunk/products/ERP5/bootstrap/erp5_core/SkinTemplateItem/portal_skins/erp5_core/Base_showFoundText.xml [utf8] Tue Apr 7 15:28:47 2009
@@ -61,67 +61,21 @@
containing searched words as well highlighting the searched \n
words in the text itself.\n
"""\n
-is_gadget_mode = context.REQUEST.get(\'is_gadget_mode\', 0)\n
-\n
-if is_gadget_mode:\n
- # in gadget mode less space is available thus show less text\n
- max_text_length = 100\n
- max_lines = 1\n
-\n
-def getRandomDocumentTextExcerpt():\n
- # try to get somewhat arbitrary choice of searchable attrs\n
- if isinstance(document_text, str) and document_text!=\'\':\n
- start = min(len(document_text) - 300, 200)\n
- return \'... %s ...\' %document_text[start:start + max_text_length]\n
-\n
-# get search words from listbox selection\n
-argument_names = (\'advanced_search_text\', \n
- \'title\',\n
- \'reference\',\n
- \'searchabletext\', \n
- \'searchabletext_any\',\n
- \'searchabletext_all\', \n
- \'searchabletext_phrase\',)\n
-\n
-if document_text is None:\n
- # convert object to text (if possible)\n
- if getattr(context, \'asText\', None) is not None and \\\n
- getattr(context, \'hasBaseData\', None) is not None:\n
- if context.hasBaseData():\n
- # document is successfully converted\n
- document_text = context.asText()\n
- else:\n
- # document not converted (due to a conversion error), return message to user\n
- return context.Base_translateString(\'Document is not converted or missing content.\')\n
-\n
\n
if selection is not None:\n
params = selection.getParams()\n
else:\n
- params = context.portal_selections.getSelectionParamsFor(\'web_search_result_selection\')\n
+ params = context.portal_selections.getSelectionParamsFor(\'search_result_selection\')\n
+search_words = params.get(\'your_search_text\')\n
\n
-params = [params.get(name, \'\') for name in argument_names]\n
-params = [(hasattr(par, \'sort\') and \'\'.join(par) or par) for par in params]\n
-search_string = \' \'.join(params)\n
+if document_text is None:\n
+ document_text = context.getSearchableText()\n
\n
-if search_string.strip() == \'\':\n
- # listbox uses its own method, not searching\n
- return getRandomDocumentTextExcerpt()\n
-\n
-search_argument_list = context.Base_parseSearchString(search_string)\n
-search_words = search_argument_list.get(\'searchabletext\', None)\n
-\n
-if search_words in (\'\', None,):\n
- # the searched words are empty (e.g. because we used only parameters \n
- # without pure searchable text)\n
- return getRandomDocumentTextExcerpt()\n
-\n
-# get fragments of text containing searched words\n
found_text_fragments = context.Base_getExcerptText(\n
context, \\\n
document_text, \\\n
search_words, \\\n
- tags = (\'<div style="font-weight:bold;display:inline;">\', \'</div>\'), \\\n
+ tags = (\'<em>\', \'</em>\'), \\\n
trail = 5, \\\n
maxlines = max_lines)\n
result = \' \'.join(map(str, found_text_fragments))\n
@@ -173,22 +127,10 @@
<string>selection</string>
<string>max_lines</string>
<string>max_text_length</string>
+ <string>None</string>
<string>_getattr_</string>
+ <string>params</string>
<string>context</string>
- <string>is_gadget_mode</string>
- <string>getRandomDocumentTextExcerpt</string>
- <string>argument_names</string>
- <string>None</string>
- <string>getattr</string>
- <string>params</string>
- <string>append</string>
- <string>$append0</string>
- <string>_getiter_</string>
- <string>name</string>
- <string>par</string>
- <string>hasattr</string>
- <string>search_string</string>
- <string>search_argument_list</string>
<string>search_words</string>
<string>found_text_fragments</string>
<string>map</string>
Modified: erp5/trunk/products/ERP5/bootstrap/erp5_core/bt/revision
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/bootstrap/erp5_core/bt/revision?rev=26337&r1=26336&r2=26337&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/bootstrap/erp5_core/bt/revision [utf8] (original)
+++ erp5/trunk/products/ERP5/bootstrap/erp5_core/bt/revision [utf8] Tue Apr 7 15:28:47 2009
@@ -1,1 +1,1 @@
-1146
+1148
Modified: erp5/trunk/products/ERP5/bootstrap/erp5_core/bt/template_extension_id_list
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/bootstrap/erp5_core/bt/template_extension_id_list?rev=26337&r1=26336&r2=26337&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/bootstrap/erp5_core/bt/template_extension_id_list [utf8] (original)
+++ erp5/trunk/products/ERP5/bootstrap/erp5_core/bt/template_extension_id_list [utf8] Tue Apr 7 15:28:47 2009
@@ -1,1 +1,2 @@
-StandardSecurity
+StandardSecurity
+DocumentExtraction
More information about the Erp5-report
mailing list