[Erp5-report] r9072 - /erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/cutFound.py
nobody at svn.erp5.org
nobody at svn.erp5.org
Mon Aug 7 16:53:34 CEST 2006
Author: bartek
Date: Mon Aug 7 16:53:30 2006
New Revision: 9072
URL: http://svn.erp5.org?rev=9072&view=rev
Log:
strip html tags from found text displayed in listbox
Modified:
erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/cutFound.py
Modified: erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/cutFound.py
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/cutFound.py?rev=9072&r1=9071&r2=9072&view=diff
==============================================================================
--- erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/cutFound.py (original)
+++ erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/cutFound.py Mon Aug 7 16:53:30 2006
@@ -1,4 +1,4 @@
-import string
+import string, re
# Punctuation that is blanked out of a word before we check whether it matches.
redundant_chars='"\'.:;,-'
# string.maketrans only exists on Python 2; on Python 3 the table builder
# for text strings is str.maketrans. Pick whichever is available so the
# module can be imported on both.
try:
  _maketrans = string.maketrans
except AttributeError:
  _maketrans = str.maketrans
# Translation table mapping every char of redundant_chars to a space;
# meant to be passed to the translate() method of a string.
tr = _maketrans(redundant_chars, ' ' * len(redundant_chars))
@@ -59,7 +59,16 @@
# Patterns used by cutFound to reduce an HTML page to plain text.
# <script> and <head> elements are removed together with their content
# (the opening tag may carry attributes); any other tag is removed on its
# own, leaving the text it encloses in place.
_SCRIPT_RE = re.compile(r'<script\b[^>]*>.*?</script\s*>', re.DOTALL | re.IGNORECASE)
_HEAD_RE = re.compile(r'<head\b[^>]*>.*?</head\s*>', re.DOTALL | re.IGNORECASE)
_TAG_RE = re.compile(r'<([^>]+)>', re.DOTALL | re.IGNORECASE)
_WHITESPACE_RE = re.compile(r'\s+')


def cutFound(context,txt,sw,tags,trail,maxlines):
  """
  Strip HTML markup from txt, tokenize it and return the parts produced
  by generateParts as a list.

  txt is the text to process; it may be an HTML page, in which case the
  result is built from its unformatted text.
  tags is stored on the FoundWord class and also handed to generateParts.
  context, sw, trail and maxlines are passed to generateParts unchanged.
  """
  # initialize class
  FoundWord.tags = tags
  # strip html tags (in case it is a web page - we show result without formatting)
  # Script and head content must go before the generic tag pattern runs,
  # otherwise only the tags would vanish and their content would stay.
  txt = _SCRIPT_RE.sub('', txt)
  txt = _HEAD_RE.sub('', txt)
  txt = _TAG_RE.sub('', txt)
  # Collapse every whitespace run (newlines included) into a single space.
  txt = _WHITESPACE_RE.sub(' ', txt)
  # very rough tokenization: no newline is left at this point, so
  # splitting on single spaces is all that is needed
  text = txt.split(' ')
  return list(generateParts(context, text, sw, tags, trail, maxlines))
More information about the Erp5-report mailing list