[Erp5-report] r14368 - in /erp5/trunk/bt5/erp5_dms: ExtensionTemplateItem/ SkinTemplateItem...
nobody at svn.erp5.org
nobody at svn.erp5.org
Fri May 4 15:54:41 CEST 2007
Author: jp
Date: Fri May 4 15:54:40 2007
New Revision: 14368
URL: http://svn.erp5.org?rev=14368&view=rev
Log:
Renamed extension files. Fixed bug in document extraction (whenever text is a data stream rather than a string)
Added:
erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentExtraction.py
erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentManagement.py
erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentSearch.py
Modified:
erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getLastWorkflowStateEntryDate.xml
erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getPropertyDictFromMail.xml
erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/cutFound.xml
erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/findAddress.xml
erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/parseSearchString.xml
erp5/trunk/bt5/erp5_dms/bt/template_extension_id_list
Added: erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentExtraction.py
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentExtraction.py?rev=14368&view=auto
==============================================================================
--- erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentExtraction.py (added)
+++ erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentExtraction.py Fri May 4 15:54:40 2007
@@ -1,0 +1,127 @@
+##############################################################################
+#
+# Copyright (c) 2006-2007 Nexedi SA and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsibility of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# guarantees and support are strongly advised to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+##############################################################################
+
+import string, re
+
# Characters stripped from a word before it is compared with the search
# words.  The same characters are removed from the search words themselves,
# which also eliminates boolean mode operators.
redundant_chars = '"\'.:;,-+<>()*~'

# Translation table replacing every redundant character with a space.
# string.maketrans() only exists on Python 2; str.maketrans() is its
# Python 3 counterpart (the right-hand side of `or` is never evaluated on
# Python 2).
tr = (getattr(string, 'maketrans', None) or str.maketrans)(
  redundant_chars, ' ' * len(redundant_chars))
+
class Done(Exception):
  """Raised by Part.push() when a part that holds a match exceeds its limit."""
+
class Word(str):
  """Ordinary (non-matching) word of an excerpt; adds nothing to str."""
+
class FoundWord(str):
  """
  Word of the text that matched one of the search words.

  str() of an instance returns the word wrapped in the (opening, closing)
  pair stored in the class attribute ``tags``.
  """

  # Highlighting tags.  cutFound() overwrites this class attribute on every
  # call; the empty default keeps str() usable before that has happened
  # (it used to fail with AttributeError).
  tags = ('', '')

  def __str__(self):
    return self.tags[0] + self + self.tags[1]
+
class Part:
  """
  Excerpt under construction: a window of words around matched words.

  ``chain`` stores the words newest first; it is reversed into reading
  order once the part is complete.
  """

  def __init__(self, tags, trail):
    self.tags = tags
    self.trail = trail
    self.limit = trail   # maximum chain length, grows with every match
    self.has = False     # becomes True as soon as a match was added
    self.chain = []

  def push(self, w):
    """
    Add the ordinary word w.

    Raises Done (after putting the chain into reading order) when the
    part already holds a match and has outgrown its limit.
    """
    self.chain.insert(0, Word(w))
    if len(self.chain) <= self.limit:
      return
    if not self.has:
      # No match so far: drop the oldest word to keep a sliding window.
      self.chain.pop()
      return
    self.chain.reverse()
    raise Done()

  def add(self, w):
    """Add the matched word w and allow ``trail`` + 1 more words."""
    self.chain.insert(0, FoundWord(w))
    self.has = True
    self.limit += self.trail + 1

  def __str__(self):
    rendered = [str(word) for word in self.chain]
    return '...%s...' % ' '.join(rendered)
+
+
+
def generateParts(context, text, sw, tags, trail, maxlines):
  """
  Generate the excerpts (Part instances) of text containing search words.

  context  -- not used by this function
  text     -- iterable of words (tokens) to scan
  sw       -- search string; it is stripped of redundant characters,
              lowercased and split into the words to look for
  tags     -- passed to every Part
  trail    -- passed to every Part (number of surrounding words)
  maxlines -- scanning stops once this many full parts were produced

  A part still being built when the text ends is yielded as well,
  provided it contains at least one match.
  """
  wanted = set(sw.translate(tr).strip().lower().split())
  par = Part(tags, trail)
  produced = 0
  for aw in text:
    if produced == maxlines:
      # A plain return is the way to end a generator: `raise StopIteration`
      # is turned into RuntimeError by PEP 479 (Python 3.7+).
      return
    if aw.translate(tr).strip().lower() in wanted:
      par.add(aw)
      continue
    try:
      par.push(aw)
    except Done:
      # push() already put the chain into reading order
      produced += 1
      yield par
      par = Part(tags, trail)
  if par.has:
    # text exhausted: return the last marked part
    par.chain.reverse()
    yield par
+
+
# Patterns used by cutFound() to reduce (possibly HTML) content to plain text.
_cut_script_re = re.compile(r'<script\b[^>]*>.*?</script\s*>', re.DOTALL | re.IGNORECASE)
_cut_head_re = re.compile(r'<head\b[^>]*>.*?</head\s*>', re.DOTALL | re.IGNORECASE)
_cut_tag_re = re.compile(r'<([^>]+)>', re.DOTALL | re.IGNORECASE)
_cut_space_re = re.compile(r'\s+')


def cutFound(context, txt, sw, tags, trail, maxlines):
  """
  Return the excerpts of txt that contain words of the search string sw.

  context  -- passed through to generateParts()
  txt      -- text to search; converted with str() first, so it does not
              have to be a string
  sw       -- search string
  tags     -- (opening, closing) pair put around every matched word
  trail    -- passed to generateParts()
  maxlines -- passed to generateParts()

  Returns the list of parts produced by generateParts().
  """
  txt = str(txt)
  # initialize class
  FoundWord.tags = tags
  # Strip html (in case it is a web page - we show result without
  # formatting).  Scripts and the head go first, contents included; the
  # script pattern used to be overwritten before it was ever applied, so
  # script code ended up in the excerpts.
  for pattern in (_cut_script_re, _cut_head_re, _cut_tag_re):
    txt = pattern.sub('', txt)
  txt = _cut_space_re.sub(' ', txt)
  txt = txt.replace('-', ' - ')  # to find hyphenated occurrences
  # Very rough tokenization; all whitespace, newlines included, has been
  # collapsed to single spaces above.
  text = txt.split(' ')
  return list(generateParts(context, text, sw, tags, trail, maxlines))
+
+
if __name__ == '__main__':
  # Manual test: print the excerpts of offer.txt matching some search words.
  sw = 'pricing priority right acting proportion'
  offer = open('offer.txt')
  try:
    txt = ' '.join([l.strip() for l in offer.readlines()])
  finally:
    # the file used to be left open
    offer.close()

  # configuration

  tags = ('<b>', '</b>')
  trail = 5
  maxlines = 5
  for p in cutFound(None, txt, sw, tags, trail, maxlines):
    # single-argument print(...) behaves the same on Python 2 and 3
    print(p)
+
+
+# vim: filetype=python syntax=python shiftwidth=2
Added: erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentManagement.py
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentManagement.py?rev=14368&view=auto
==============================================================================
--- erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentManagement.py (added)
+++ erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentManagement.py Fri May 4 15:54:40 2007
@@ -1,0 +1,128 @@
+##############################################################################
+#
+# Copyright (c) 2006-2007 Nexedi SA and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsibility of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# guarantees and support are strongly advised to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+##############################################################################
+
+import zipfile, cStringIO, re
+import xmlrpclib, base64
+from Products.CMFCore.utils import getToolByName
+
def extractContent(data):
  """
  Extract content.xml from ODF data directly by unzipping
  (no need for oood here).

  data -- the raw content of the ODF (zip) file

  Returns the raw content of the content.xml member, or '' when data is
  not a zip archive or the archive has no content.xml member.
  """
  # XXX probably not used - to really get text content it should
  # strip xml too
  try:
    from cStringIO import StringIO as make_buffer
  except ImportError:
    # Python 3 has no cStringIO
    from io import BytesIO as make_buffer
  cs = make_buffer(data)
  try:
    try:
      z = zipfile.ZipFile(cs)
    except zipfile.BadZipfile:
      return ''
    try:
      try:
        return z.read('content.xml')
      except KeyError:
        # a zip archive, but without content.xml
        return ''
    finally:
      z.close()
  finally:
    # the buffer is now released on every path, errors included
    cs.close()
+
+###### XXX these methods repeat what is in OOoDocument class
+# maybe redundant, but we need to access them from Script (Python)
+
def convertToOdf(self, name, data):
  """
  Convert data into ODF format.

  To be used in ingestion, when there is no ERP5 object to work with yet
  (and we for example have to figure out portal_type).

  The data is sent base64-encoded to the run_convert method of the proxy
  returned by mkProxy(); the decoded 'data' entry of the answer is returned.
  """
  proxy = mkProxy(self)
  encoded = base64.encodestring(data)
  answer = proxy.run_convert(name, encoded)
  return base64.decodestring(answer['data'])
+
def mkProxy(self):
  """
  Return an XML-RPC proxy for the conversion server whose address and
  port number are read from portal_preferences.

  Raises Exception when either of the two preferences is None.
  """
  preferences = getToolByName(self, 'portal_preferences')
  address = preferences.getPreferredDmsOoodocServerAddress()
  port = preferences.getPreferredDmsOoodocServerPortNumber()
  if address is not None and port is not None:
    return xmlrpclib.ServerProxy('http://%s:%d' % (address, port), allow_none=True)
  raise Exception('you should set conversion server coordinates in preferences')
+
def generateFile(self, name, data, format):
  """
  Call run_generate(name, data, None, format) on the proxy returned by
  mkProxy() and return the base64-decoded 'data' entry of the answer.
  """
  answer = mkProxy(self).run_generate(name, data, None, format)
  return base64.decodestring(answer['data'])
+
def getAttrFromFilename(self, fname):
  """
  Parse the file name fname with the regular expression specified in
  preferences (getPreferredDmsFilenameRegexp).

  Returns the dictionary of the named groups of the match, or an empty
  dictionary when the file name does not match.
  """
  pattern = self.portal_preferences.getPreferredDmsFilenameRegexp()
  found = re.compile(pattern).match(fname)
  if found is not None:
    return found.groupdict()
  return {}
+
def getLastWorkflowDate(self, state_name='simulation_state', state=('released','public')):
  """
  Return the 'time' of the most recent workflow history entry that was
  recorded by an action and left the document in one of the given states.

  state_name -- key of the state variable in the history entries; only
                histories whose first entry has a value for it are examined
  state      -- collection of accepted state values

  Entries are examined from the newest to the oldest; only those whose
  'action' ends with 'action' are considered.  Returns 0 when no entry
  qualifies.

  XXX something more generic could be made out of it - or there is
  already an API for this and this function can go away.
  """
  for wflow in self.workflow_history.values():
    if len(wflow) == 0:
      continue  # empty history
    if wflow[0].get(state_name) is None:
      continue  # not the right one
    for ch in reversed(wflow):
      act = ch.get('action', '')
      if act is None or not act.endswith('action'):
        continue
      if ch.get(state_name, '') in state:
        return ch['time']
  return 0
+
+#############################################################################
+# Mail management
+
# Runs of digits, ASCII letters, '.', '-' and '_' on both sides of an '@'.
# Raw string: '\-' is not a valid escape sequence in an ordinary literal.
_address_re = re.compile(r'[0-9A-Za-z.\-_]+@[0-9A-Za-z.\-_]+')


def findAddress(txt):
  """
  Find an email address in a string.

  Returns the first address found in txt, or None when there is none.
  """
  found = _address_re.search(txt)
  if found is None:
    return None
  return found.group()
+
# name:value - word characters in the name, word characters and '/' in the
# value.  Raw string: '\w' is not a valid escape in an ordinary literal.
_param_re = re.compile(r'^([\w_]+):([\w_/]+)$')


def extractParams(txt):
  """
  Extract the parameters given in a mail body.

  The text is split on whitespace (not only on line ends) and every
  resulting token of the form
    name:value
  becomes an entry of the returned dictionary.  Tokens of any other form
  are ignored; when a name occurs several times the last value wins.
  """
  res = {}
  for token in txt.split():
    found = _param_re.match(token)
    if found is not None:
      name, value = found.groups()
      res[name] = value
  return res
Added: erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentSearch.py
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentSearch.py?rev=14368&view=auto
==============================================================================
--- erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentSearch.py (added)
+++ erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentSearch.py Fri May 4 15:54:40 2007
@@ -1,0 +1,126 @@
+##############################################################################
+#
+# Copyright (c) 2006-2007 Nexedi SA and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsibility of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# guarantees and support are strongly advised to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+##############################################################################
+
+"""
+RULES
+
+Single arguments:
+ - arg:value translates into arg='value' in query
+ - quotes are cleared
+ - if value contains spaces, punctuation or anything else it has to be put in quotes
+ - file is source_reference (original file name)
+ - language, version, reference
+
+Multiple arguments:
+ - arg:xxx works the same way
+ - arg:(xxx,yyy) ORs both
+ - arg:all translates into empty tuple, which implies all available values
+ - state (simulation_state), type (portal_type)
+
+Everything else is treated as SearchableText
+"""
+
+# XXX score:
+# pythonicity: high
+# obfuscation level: brain-twisting
+
+# how to customize:
+# (1) think for two hours
+# (2) type for 20 seconds
+
+import re
+import sys
+sys.path.append('/usr/lib/zope/lib/python/')
+from DateTime import DateTime
+
def dateRangeProc(s):
  """
  Process a date range given as a number followed by a unit:
  w (weeks, 7 days), m (months, 30 days) or y (years, 365 days).
  Whatever follows the unit letter is ignored ('3mth' means 3 months).

  Returns ('creation_from', DateTime() minus that many days), or an empty
  tuple when s does not start with such a range.
  """
  # \d+ : the number may have several digits ('12m' used to be rejected)
  found = re.match(r'(\d+)([wmy]).*', s)
  if found is None:
    return ()
  count, unit = found.groups()
  days_per_unit = {'w': 7, 'm': 30, 'y': 365}
  return ('creation_from', DateTime() - int(count) * days_per_unit[unit])
+
+# parsing defined here
# Initial value of the simulation_state parameter; per the rules above an
# empty tuple implies all available values.
simulation_states = ()
# Splits a search string around its arg:value tokens.  The value is either
# double-quoted, parenthesised, or a run of word characters and ( ) , / - .
# Raw string: '\w', '\(' ... are not valid escapes in an ordinary literal.
r = re.compile(r'(\w+:"[^"]+"|\w+:\([^)]+\)|\w+:[\(\),\w/\-.]+)')
filetyper = lambda s: ('source_reference', '%%.%s' % s)
filestripper = lambda s: ('source_reference', s.replace('"', ''))
#addarchived=lambda s: ('simulation_state',simulation_states+('archived',))
state = lambda s: ('simulation_state', parsestates(s))
# NOTE: this name shadows the type() builtin for the rest of the module.
type = lambda s: ('portal_type', parsestates(s))
# Search argument -> either the name of the query parameter receiving the
# value unchanged, or a function returning a (parameter name, value) pair.
paramsmap = dict(file=filestripper, type=type, reference='reference',
                 filetype=filetyper, state=state, language='language',
                 version='version', created=dateRangeProc)
+
def parsestates(s):
  """
  Parse the value of a multiple-value search argument.

  'all'      -> empty tuple
  '(x,y)'    -> list of the comma-separated items, quotes removed and
                empty items dropped
  other text -> the text itself, quotes removed

  Raises IndexError for an empty string (s[0] is evaluated).
  """
  # the debugging `print s` that used to be here has been removed
  if s == 'all':
    return ()
  if s[0] == '(' and s[-1] == ')':
    return [i.replace('"', '').replace("'", '') for i in s[1:-1].split(',') if i != '']
  return s.replace('"', '').replace("'", '')
+
def analyze(params):
  """
  Initialise params and return the function which processes one segment
  of a split search string, storing the outcome in params.

  params -- dictionary to fill; 'SearchableText' is reset to '' and
            'simulation_state' to simulation_states

  The returned function takes a segment s and
    - appends it to params['SearchableText'] when it contains no colon,
    - handles it as arg:value when it contains exactly one colon,
    - ignores it when it contains more than one colon.
  """
  params['SearchableText'] = ''
  params['simulation_state'] = simulation_states

  def cutter(s):
    ss = s.split(':')
    if len(ss) == 1:
      params['SearchableText'] += ss[0]
      return
    if len(ss) != 2:
      return
    key, value = ss
    handler = paramsmap.get(key)
    if handler is None:
      # unknown argument: passed on under its own name
      params[key] = value
    elif callable(handler):
      # Explicit dispatch instead of catching TypeError from calling a
      # non-callable, which would also hide genuine TypeErrors.
      try:
        ps = handler(value)
        params[ps[0]] = ps[1]
      except IndexError:
        # the handler rejected the value (e.g. returned an empty tuple)
        return
    else:
      # plain renaming: handler is the name of the query parameter
      params[handler] = value

  return cutter
+
def parseSearchString(searchstring):
  """
  Translate a search string into a dictionary of query parameters.

  The string is split around its arg:value tokens (pattern r) and every
  piece is handed to the function returned by analyze().  Returns the
  resulting dictionary, with 'SearchableText' stripped of surrounding
  whitespace.
  """
  params = {}
  process = analyze(params)
  # Explicit loop: map() was only used for its side effects, and it is
  # lazy on Python 3.  The debugging `print l` has been removed.
  for segment in r.split(searchstring):
    process(segment)
  params['SearchableText'] = params['SearchableText'].strip()
  return params
+
if __name__ == '__main__':
  # Manual test: print the parameters parsed from a sample search string.
  #searchstring='byle cisnie zego file:"ble ble.doc" filetype:doc type:Text poza tym reference:abc-def'
  #searchstring='byle "cisnie zego" state:draft file:"ble ble.doc" type:("Site","Text") poza tym reference:abc-def dupa:kwas/zbita'
  searchstring = 'byleco created:3mth'
  # single-argument print(...) behaves the same on Python 2 and 3
  print(parseSearchString(searchstring))
Modified: erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getLastWorkflowStateEntryDate.xml
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getLastWorkflowStateEntryDate.xml?rev=14368&r1=14367&r2=14368&view=diff
==============================================================================
--- erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getLastWorkflowStateEntryDate.xml (original)
+++ erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getLastWorkflowStateEntryDate.xml Fri May 4 15:54:40 2007
@@ -24,7 +24,7 @@
</item>
<item>
<key> <string>_module</string> </key>
- <value> <string>documentUtils</string> </value>
+ <value> <string>DocumentManagement</string> </value>
</item>
<item>
<key> <string>_owner</string> </key>
Modified: erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getPropertyDictFromMail.xml
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getPropertyDictFromMail.xml?rev=14368&r1=14367&r2=14368&view=diff
==============================================================================
--- erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getPropertyDictFromMail.xml (original)
+++ erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getPropertyDictFromMail.xml Fri May 4 15:54:40 2007
@@ -24,7 +24,7 @@
</item>
<item>
<key> <string>_module</string> </key>
- <value> <string>mailUtils</string> </value>
+ <value> <string>DocumentManagement</string> </value>
</item>
<item>
<key> <string>_owner</string> </key>
Modified: erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/cutFound.xml
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/cutFound.xml?rev=14368&r1=14367&r2=14368&view=diff
==============================================================================
--- erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/cutFound.xml (original)
+++ erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/cutFound.xml Fri May 4 15:54:40 2007
@@ -24,7 +24,7 @@
</item>
<item>
<key> <string>_module</string> </key>
- <value> <string>cutFound</string> </value>
+ <value> <string>DocumentExtraction</string> </value>
</item>
<item>
<key> <string>id</string> </key>
Modified: erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/findAddress.xml
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/findAddress.xml?rev=14368&r1=14367&r2=14368&view=diff
==============================================================================
--- erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/findAddress.xml (original)
+++ erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/findAddress.xml Fri May 4 15:54:40 2007
@@ -24,7 +24,7 @@
</item>
<item>
<key> <string>_module</string> </key>
- <value> <string>mailUtils</string> </value>
+ <value> <string>DocumentManagement</string> </value>
</item>
<item>
<key> <string>id</string> </key>
Modified: erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/parseSearchString.xml
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/parseSearchString.xml?rev=14368&r1=14367&r2=14368&view=diff
==============================================================================
--- erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/parseSearchString.xml (original)
+++ erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/parseSearchString.xml Fri May 4 15:54:40 2007
@@ -24,7 +24,7 @@
</item>
<item>
<key> <string>_module</string> </key>
- <value> <string>searchUtils</string> </value>
+ <value> <string>DocumentSearch</string> </value>
</item>
<item>
<key> <string>id</string> </key>
Modified: erp5/trunk/bt5/erp5_dms/bt/template_extension_id_list
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/bt/template_extension_id_list?rev=14368&r1=14367&r2=14368&view=diff
==============================================================================
--- erp5/trunk/bt5/erp5_dms/bt/template_extension_id_list (original)
+++ erp5/trunk/bt5/erp5_dms/bt/template_extension_id_list Fri May 4 15:54:40 2007
@@ -1,5 +1,3 @@
-searchUtils
-mailUtils
-cutFound
-asSecurityGroupId
-documentUtils
+DocumentSearch
+DocumentExtraction
+DocumentManagement
More information about the Erp5-report
mailing list