[Erp5-report] r14368 - in /erp5/trunk/bt5/erp5_dms: ExtensionTemplateItem/ SkinTemplateItem...

nobody at svn.erp5.org nobody at svn.erp5.org
Fri May 4 15:54:41 CEST 2007


Author: jp
Date: Fri May  4 15:54:40 2007
New Revision: 14368

URL: http://svn.erp5.org?rev=14368&view=rev
Log:
Renamed extension files. Fixed bug in document extraction (whenever text is a data stream rather than a string)

Added:
    erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentExtraction.py
    erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentManagement.py
    erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentSearch.py
Modified:
    erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getLastWorkflowStateEntryDate.xml
    erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getPropertyDictFromMail.xml
    erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/cutFound.xml
    erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/findAddress.xml
    erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/parseSearchString.xml
    erp5/trunk/bt5/erp5_dms/bt/template_extension_id_list

Added: erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentExtraction.py
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentExtraction.py?rev=14368&view=auto
==============================================================================
--- erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentExtraction.py (added)
+++ erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentExtraction.py Fri May  4 15:54:40 2007
@@ -1,0 +1,127 @@
+##############################################################################
+#
+# Copyright (c) 2006-2007 Nexedi SA and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
+
+import string, re
+
+# chars we need to strip from a word before we see if it matches,
+# and from the searchwords to eliminate boolean mode chars
+redundant_chars='"\'.:;,-+<>()*~'
+# translation table which replaces every redundant char by a space
+tr=string.maketrans(redundant_chars,' '*len(redundant_chars))
+
+class Done(Exception):
+  pass
+
+class Word(str):pass
+
+class FoundWord(str):
+  
+  def __str__(self):
+    return self.tags[0]+self+self.tags[1]
+
+class Part:
+  """
+  One excerpt: a bounded chain of words collected around found words
+
+  chain -- collected words, newest first while the part is being built
+  limit -- maximum length of chain; starts at trail, grows with every add
+  trail -- value by which (plus one) the limit grows for each found word
+  has   -- True as soon as at least one found word was added
+  tags  -- stored as given; not read by the methods of this class
+  """
+
+  def __init__(self,tags,trail):
+    self.chain=[]
+    self.limit=trail
+    self.trail=trail
+    self.has=False
+    self.tags=tags
+
+  def push(self,w):
+    """
+    Add an ordinary word w in front of the chain
+
+    Once the chain is longer than limit: if the part holds a found word,
+    the chain is reversed (so that the oldest word comes first) and Done
+    is raised; otherwise the oldest word is dropped
+    """
+    self.chain.insert(0,Word(w))
+    if len(self.chain)>self.limit:
+      if self.has:
+        self.chain.reverse()
+        raise Done()
+      self.chain.pop() # oldest word is at the end of the chain
+
+  def add(self,w):
+    """
+    Add a found word w in front of the chain and enlarge the limit
+    by trail+1, leaving room for more words after it
+    """
+    self.chain.insert(0,FoundWord(w))
+    self.limit+=self.trail+1
+    self.has=True
+
+  def __str__(self):
+    # words are joined in the current order of the chain
+    return '...%s...' % ' '.join(map(str,self.chain))
+
+
+
+def generateParts(context,text,sw,tags,trail,maxlines):
+  par=Part(tags,trail)
+  sw=sw.translate(tr).strip().lower().split()
+  test=lambda w:w.translate(tr).strip().lower() in sw
+  i=0
+  length=len(text)
+  for counter,aw in enumerate(text):
+    if i==maxlines:
+      raise StopIteration
+    if test(aw):
+      par.add(aw)
+    else:
+      try:
+        par.push(aw)
+      except Done:
+        i+=1
+        yield par
+        par=Part(tags,trail)
+      if counter==length-1:
+        if par.has:
+          par.chain.reverse()
+          yield par # return the last marked part
+
+
+def cutFound(context, txt, sw, tags, trail, maxlines):
+  """
+  Returns an excerpt of text found in the txt string
+  """
+  txt = str(txt)
+  # initialize class
+  FoundWord.tags=tags
+  # strip html tags (in case it is a web page - we show result without formatting)
+  r = re.compile('<script>.*?</script>',re.DOTALL|re.IGNORECASE)
+  r = re.compile('<head>.*?</head>',re.DOTALL|re.IGNORECASE)
+  txt = re.sub(r,'',txt)
+  r = re.compile('<([^>]+)>',re.DOTALL|re.IGNORECASE)
+  txt = re.sub(r,'',txt)
+  r = re.compile('\s+')
+  txt = re.sub(r,' ',txt)
+  txt = txt.replace('-',' - ') # to find hyphenated occurrences
+  text = ' '.join(txt.split('\n')).split(' ') # very rough tokenization
+  return [p for p in generateParts(context,text,sw,tags,trail,maxlines)]
+
+
+if __name__=='__main__':
+  sw='pricing priority right acting proportion'
+  txt=' '.join([l.strip() for l in open('offer.txt').readlines()])
+
+  # configuration
+
+  tags=('<b>','</b>')
+  trail=5
+  maxlines=5
+  for p in cutFound(None,txt,sw,tags,trail,maxlines):
+    print p
+
+
+# vim: filetype=python syntax=python shiftwidth=2 

Added: erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentManagement.py
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentManagement.py?rev=14368&view=auto
==============================================================================
--- erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentManagement.py (added)
+++ erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentManagement.py Fri May  4 15:54:40 2007
@@ -1,0 +1,128 @@
+##############################################################################
+#
+# Copyright (c) 2006-2007 Nexedi SA and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
+
+import zipfile, cStringIO, re
+import xmlrpclib, base64
+from Products.CMFCore.utils import getToolByName
+
+def extractContent(data):
+  """
+  extract text content from ODF data
+  directly by unzipping (no need for oood here)
+  """
+  # XXX probably not used - to really get text content it should
+  # strip xml too
+  cs = cStringIO.StringIO()
+  cs.write(data)
+  try:
+    z = zipfile.ZipFile(cs)
+  except zipfile.BadZipfile:
+    cs.close()
+    return ''
+  s = z.read('content.xml')
+  cs.close()
+  z.close()
+  return s
+
+###### XXX these methods repeat what is in OOoDocument class
+# maybe redundant, but we need to access them from Script (Python)
+
+def convertToOdf(self, name, data):
+  """
+  convert data into ODF format
+  to be used in ingestion when we don't yet have an ERP5 object
+  to work with (and we for example have to figure out portal_type)
+  """
+  sp = mkProxy(self)
+  kw = sp.run_convert(name,base64.encodestring(data))
+  odf = base64.decodestring(kw['data'])
+  return odf
+
+def mkProxy(self):
+  pref = getToolByName(self,'portal_preferences')
+  adr = pref.getPreferredDmsOoodocServerAddress()
+  nr = pref.getPreferredDmsOoodocServerPortNumber()
+  if adr is None or nr is None:
+    raise Exception('you should set conversion server coordinates in preferences')
+  sp = xmlrpclib.ServerProxy('http://%s:%d' % (adr,nr), allow_none=True)
+  return sp
+
+def generateFile(self, name, data, format):
+  sp = mkProxy(self)
+  kw = sp.run_generate(name, data, None, format)
+  res = base64.decodestring(kw['data'])
+  return res
+
+def getAttrFromFilename(self, fname):
+  """
+  parse file name using regexp specified in preferences
+  """
+  rx_parse = re.compile(self.portal_preferences.getPreferredDmsFilenameRegexp())
+  m = rx_parse.match(fname)
+  if m is None:
+    return {}
+  return m.groupdict()
+
+def getLastWorkflowDate(self, state_name='simulation_state', state=('released','public')):
+  '''we can make something more generic out of it
+  or JP says "there is an API for it" and we trash this one'''
+  for name,wflow in self.workflow_history.items():
+    if len(wflow) == 0: continue # empty history
+    if wflow[0].get(state_name) is None: continue # not the right one
+    for i in range(len(wflow)):
+      ch = wflow[-1-i]
+      act = ch.get('action', '')
+      if act is not None and act.endswith('action'):
+        if ch.get(state_name, '') in state:
+          return ch['time']
+  return 0
+
+#############################################################################
+# Mail management
+
+def findAddress(txt):
+  """
+  find email address in a string
+  """
+  validchars='0-9A-Za-z.\-_'
+  r=re.compile('[%s]+@[%s]+' % (validchars,validchars))
+  m=r.search(txt)
+  return m and m.group()
+
+def extractParams(txt):
+  """
+  extract parameters given in mail body
+  We assume that parameters are given as lines of the format:
+  name:value
+  """
+  r=re.compile('^([\w_]+):([\w_/]+)$')
+  res=[]
+  for line in txt.split():
+    found=r.findall(line.strip())
+    if len(found)==1:
+      res.append(found[0])
+  return dict(res)

Added: erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentSearch.py
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentSearch.py?rev=14368&view=auto
==============================================================================
--- erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentSearch.py (added)
+++ erp5/trunk/bt5/erp5_dms/ExtensionTemplateItem/DocumentSearch.py Fri May  4 15:54:40 2007
@@ -1,0 +1,126 @@
+##############################################################################
+#
+# Copyright (c) 2006-2007 Nexedi SA and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
+
+"""
+RULES
+
+Single arguments:
+    - arg:value translates into arg='value' in query
+    - quotes are cleared
+    - if value contains spaces, punctuation or anything else it has to be put in quotes
+    - file is source_reference (original file name)
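+    - language, version, reference
+    - filetype:ext translates into source_reference='%.ext'
+    - created:3m (a digit followed by w, m or y) translates into creation_from,
+      the date that many weeks, months or years ago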
+
+Multiple arguments:
+    - arg:xxx works the same way
+    - arg:(xxx,yyy) ORs both
+    - arg:all translates into empty tuple, which implies all available values
+    - state (simulation_state), type (portal_type)
+
+Everything else is treated as SearchableText
+"""
+
+# XXX score:
+# pythonicity: high
+# obfuscation level: brain-twisting
+
+# how to customize:
+# (1) think for two hours
+# (2) type for 20 seconds
+
+import re
+import sys
+sys.path.append('/usr/lib/zope/lib/python/')
+from DateTime import DateTime
+
+def dateRangeProc(s):
+    """
+    process date range (can be given in months or years)
+    """
+    m=re.match('(\d)([wmy]).*',s)
+    try:
+        dif=0
+        gr=m.groups()
+        if gr[1]=='w':dif=int(gr[0])*7
+        if gr[1]=='m':dif=int(gr[0])*30
+        if gr[1]=='y':dif=int(gr[0])*365
+        return ('creation_from',DateTime()-dif)
+    except AttributeError, IndexError:
+        return ()
+
+# parsing defined here
+simulation_states=()
+r=re.compile('(\w+:"[^"]+"|\w+:\([^)]+\)|\w+:[\(\),\w/\-.]+)')
+filetyper=lambda s:('source_reference','%%.%s' % s)
+filestripper=lambda s: ('source_reference',s.replace('"',''))
+#addarchived=lambda s: ('simulation_state',simulation_states+('archived',))
+state=lambda s:('simulation_state',parsestates(s))
+type=lambda s:('portal_type',parsestates(s))
+paramsmap=dict(file=filestripper,type=type,reference='reference',filetype=filetyper,state=state,\
+        language='language',version='version',created=dateRangeProc)
+
+def parsestates(s):
+    print s
+    if s=='all':
+        return ()
+    if s[0]=='(' and s[-1]==')':
+        return [i.replace('"','').replace("'","") for i in s[1:-1].split(',') if i!='']
+    return s.replace('"','').replace("'","")
+
+def analyze(params):
+    params['SearchableText']=''
+    params['simulation_state']=simulation_states
+    def cutter(s):
+        ss=s.split(':')
+        if len(ss)==1:
+            params['SearchableText']+=ss[0]
+        if len(ss)==2:
+            try:
+                ps=paramsmap.get(ss[0])(ss[1])
+                params[ps[0]]=ps[1]
+            except TypeError:
+                if paramsmap.has_key(ss[0]):
+                    params[paramsmap.get(ss[0])]=ss[1]
+                else:
+                    params[ss[0]]=ss[1]
+            except IndexError:
+                return
+    return cutter
+
+def parseSearchString(searchstring):
+    params={}
+    l=r.split(searchstring)
+    print l
+    map(analyze(params),l)
+    params['SearchableText']=params['SearchableText'].strip()
+    return params
+
+if __name__=='__main__':
+    #searchstring='byle cisnie zego file:"ble ble.doc" filetype:doc type:Text poza tym reference:abc-def'
+    #searchstring='byle "cisnie zego" state:draft file:"ble ble.doc" type:("Site","Text") poza tym reference:abc-def dupa:kwas/zbita'
+    searchstring='byleco created:3mth'
+    print parseSearchString(searchstring)

Modified: erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getLastWorkflowStateEntryDate.xml
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getLastWorkflowStateEntryDate.xml?rev=14368&r1=14367&r2=14368&view=diff
==============================================================================
--- erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getLastWorkflowStateEntryDate.xml (original)
+++ erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getLastWorkflowStateEntryDate.xml Fri May  4 15:54:40 2007
@@ -24,7 +24,7 @@
         </item>
         <item>
             <key> <string>_module</string> </key>
-            <value> <string>documentUtils</string> </value>
+            <value> <string>DocumentManagement</string> </value>
         </item>
         <item>
             <key> <string>_owner</string> </key>

Modified: erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getPropertyDictFromMail.xml
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getPropertyDictFromMail.xml?rev=14368&r1=14367&r2=14368&view=diff
==============================================================================
--- erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getPropertyDictFromMail.xml (original)
+++ erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/Document_getPropertyDictFromMail.xml Fri May  4 15:54:40 2007
@@ -24,7 +24,7 @@
         </item>
         <item>
             <key> <string>_module</string> </key>
-            <value> <string>mailUtils</string> </value>
+            <value> <string>DocumentManagement</string> </value>
         </item>
         <item>
             <key> <string>_owner</string> </key>

Modified: erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/cutFound.xml
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/cutFound.xml?rev=14368&r1=14367&r2=14368&view=diff
==============================================================================
--- erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/cutFound.xml (original)
+++ erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/cutFound.xml Fri May  4 15:54:40 2007
@@ -24,7 +24,7 @@
         </item>
         <item>
             <key> <string>_module</string> </key>
-            <value> <string>cutFound</string> </value>
+            <value> <string>DocumentExtraction</string> </value>
         </item>
         <item>
             <key> <string>id</string> </key>

Modified: erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/findAddress.xml
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/findAddress.xml?rev=14368&r1=14367&r2=14368&view=diff
==============================================================================
--- erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/findAddress.xml (original)
+++ erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/findAddress.xml Fri May  4 15:54:40 2007
@@ -24,7 +24,7 @@
         </item>
         <item>
             <key> <string>_module</string> </key>
-            <value> <string>mailUtils</string> </value>
+            <value> <string>DocumentManagement</string> </value>
         </item>
         <item>
             <key> <string>id</string> </key>

Modified: erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/parseSearchString.xml
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/parseSearchString.xml?rev=14368&r1=14367&r2=14368&view=diff
==============================================================================
--- erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/parseSearchString.xml (original)
+++ erp5/trunk/bt5/erp5_dms/SkinTemplateItem/portal_skins/erp5_dms/parseSearchString.xml Fri May  4 15:54:40 2007
@@ -24,7 +24,7 @@
         </item>
         <item>
             <key> <string>_module</string> </key>
-            <value> <string>searchUtils</string> </value>
+            <value> <string>DocumentSearch</string> </value>
         </item>
         <item>
             <key> <string>id</string> </key>

Modified: erp5/trunk/bt5/erp5_dms/bt/template_extension_id_list
URL: http://svn.erp5.org/erp5/trunk/bt5/erp5_dms/bt/template_extension_id_list?rev=14368&r1=14367&r2=14368&view=diff
==============================================================================
--- erp5/trunk/bt5/erp5_dms/bt/template_extension_id_list (original)
+++ erp5/trunk/bt5/erp5_dms/bt/template_extension_id_list Fri May  4 15:54:40 2007
@@ -1,5 +1,3 @@
-searchUtils
-mailUtils
-cutFound
-asSecurityGroupId
-documentUtils
+DocumentSearch
+DocumentExtraction
+DocumentManagement




More information about the Erp5-report mailing list