[Erp5-report] r11808 - /erp5/trunk/products/ERP5/Document/

nobody at svn.erp5.org nobody at svn.erp5.org
Sun Dec 31 17:00:37 CET 2006


Author: jp
Date: Sun Dec 31 17:00:30 2006
New Revision: 11808

URL: http://svn.erp5.org?rev=11808&view=rev
Log:
Moved from ERP5OOo to ERP5. Code is still very early. Do not use in production.

Added:
    erp5/trunk/products/ERP5/Document/ExternalDocument.py
    erp5/trunk/products/ERP5/Document/ExternalLibraryFile.py
    erp5/trunk/products/ERP5/Document/ExternalWebPage.py
    erp5/trunk/products/ERP5/Document/TextDocument.py

Added: erp5/trunk/products/ERP5/Document/ExternalDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/ExternalDocument.py?rev=11808&view=auto
==============================================================================
--- erp5/trunk/products/ERP5/Document/ExternalDocument.py (added)
+++ erp5/trunk/products/ERP5/Document/ExternalDocument.py Sun Dec 31 17:00:30 2006
@@ -1,0 +1,159 @@
+##############################################################################
+#
+# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
+
+from AccessControl import ClassSecurityInfo
+from Products.CMFCore.WorkflowCore import WorkflowMethod
+from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
+from Products.ERP5.Document.Document import Document
+
+import mimetypes, re, urllib
+from htmlentitydefs import name2codepoint
+from DateTime import DateTime
+
+
+class SpiderException(Exception):
+
+  def __init__(self,code, msg):
+    msg="%i: %s" % (code, msg)
+    Exception.__init__(self,msg)
+
+class Opener(urllib.FancyURLopener):
+
+  def http_error_default(self, url, fp, code, msg, headers):
+    raise SpiderException(code, msg)
+
+class ExternalDocument(Document):
+  """
+  caching sources from outside
+  This is basically an abstract class
+  classes deriving from it should overwrite method _processData (this
+  is the one that does something with character data obtained from source)
+  Spidering method supports http, ftp and file protocols, and possibly many others
+  """
+  # CMF Type Definition
+  meta_type = 'ERP5 External Document'
+  portal_type = 'External Document'
+  isPortalContent = 1
+  isRADContent = 1
+
+  # Declarative security
+  security = ClassSecurityInfo()
+  security.declareObjectProtected(Permissions.AccessContentsInformation)
+
+  # Default Properties
+  property_sheets = ( PropertySheet.Base
+                    , PropertySheet.CategoryCore
+                    , PropertySheet.DublinCore
+                    , PropertySheet.Version
+                    , PropertySheet.Reference
+                    , PropertySheet.Document
+                    , PropertySheet.TextDocument
+                    , PropertySheet.Url
+                    , PropertySheet.ExternalDocument
+                    )
+
+  protocols=(('Web page','http'),('FTP site','ftp'),('Local file','file'),)
+
+  searchable_property_list = Document.searchable_property_list + ('text_content',)
+
+  security.declareProtected(Permissions.View, 'getProtocolList')
+  def getProtocolList(self):
+    """
+    """
+    return [x[1] for x in self.protocols]
+
+  security.declareProtected(Permissions.View, 'getProtocolItemList')
+  def getProtocolItemList(self):
+    """
+    """
+    return self.protocols
+
+  security.declarePrivate(Permissions.View, '_spiderSource')
+  def _spiderSource(self):
+    """
+    FancyURLopener can open various protocols
+    """
+    op=Opener()
+    f=op.open(self.getQualifiedUrl())
+    s=f.read()
+    inf=f.info()
+    return s, inf
+
+  security.declarePrivate('_processData')
+  def _processData(self,s, inf):
+    raise Exception('this should be implemented in subclass')
+
+  security.declareProtected(Permissions.ModifyPortalContent,'resetTopObject')
+  def resetTopObject(self):
+    '''
+    abstract function for maintaining interface
+    call before beginning recursive spidering
+    used mostly in web pages
+    '''
+    pass
+
+  security.declareProtected(Permissions.View, 'getProtocolItemList')
+  def spiderSource(self):
+    """
+    spiders external datasource
+    sets status message
+    returned value tells us if it succeeded or failed
+    """
+    try:
+      s,inf=self._spiderSource()
+    except Exception,e:
+      self.log(e,level=1)
+      self.setExternalProcessingStatusMessage("Tried on %s: %s" % (self._time(),str(e)))
+      return False
+    chars=len(s)
+    if chars==0:
+      self.setExternalProcessingStatusMessage("Tried on %s: got empty string" % self._time())
+      return False
+    try:
+      s=self._processData(s,inf)
+    except Exception,e:
+      self.log(e,level=1)
+      self.setExternalProcessingStatusMessage("Spidered on %s, %i chars, but could not process; reason: %s" % (self._time(), chars, str(e)))
+      return False
+    self.setTextContent(s)
+    self.setExternalProcessingStatusMessage("Spidered on %s, %i chars, recorded %i chars" % (self._time(), chars, len(s)))
+    return True
+
+  security.declareProtected(Permissions.View, 'getProtocolItemList')
+  def getQualifiedUrl(self):
+    """
+    this should be in the Url, not here
+    otherwise why does the url have a property 'url_protocol'?
+    """
+    return (self.getUrlProtocol() or '')+'://'+(self.getUrlString() or '')
+
+  def _time(self):
+    return DateTime().strftime('%Y/%m/%d %H:%M:%S')
+
+
+# vim: syntax=python shiftwidth=2 
+

Added: erp5/trunk/products/ERP5/Document/ExternalLibraryFile.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/ExternalLibraryFile.py?rev=11808&view=auto
==============================================================================
--- erp5/trunk/products/ERP5/Document/ExternalLibraryFile.py (added)
+++ erp5/trunk/products/ERP5/Document/ExternalLibraryFile.py Sun Dec 31 17:00:30 2006
@@ -1,0 +1,135 @@
+##############################################################################
+#
+# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
+
+from AccessControl import ClassSecurityInfo
+from Products.CMFCore.WorkflowCore import WorkflowMethod
+from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
+from Products.ERP5.Document.ExternalDocument import ExternalDocument, SpiderException
+
+from xml import sax
+
+def stripName(s):
+  return s[4:].replace('-','_').encode()
+
+class BookInfo(object):
+  id=title=description=''
+
+class Handler(sax.handler.ContentHandler):
+  stack=[]
+  attrs=None
+  c=''
+  d=None
+  results=[]
+
+  def startElement(self,name,attrs):
+    name=stripName(name)
+    self.stack.append(name)
+    self.attrs=attrs
+    if hasattr(self,'start_'+name):
+      getattr(self,'start_'+name)()
+
+  def endElement(self,name):
+    name=stripName(name)
+    if hasattr(self,'end_'+name):
+      getattr(self,'end_'+name)()
+    self.stack.pop()
+    self.attrs=None
+    self.c=''
+
+  def characters(self,c):
+    self.c+=c.strip().encode('utf-8')
+
+  def start_Record(self):
+    self.d=BookInfo()
+    self.results.append(self.d)
+
+  def end_ID(self):
+    self.d.id=self.c
+
+  def end_Title(self):
+    self.d.title+=self.c
+
+  def end_Author(self):
+    self.d.description+=self.c+'; '
+
+  def end_Label_Information(self):
+    self.d.description+=self.c+'; '
+
+def parseLibraryFile(s):
+  h=Handler()
+  sax.parseString(s,h)
+  return h.results
+
+
+class ExternalLibraryFile(ExternalDocument):
+  """
+  get AU library data
+  """
+  # CMF Type Definition
+  meta_type = 'ERP5 External Library File'
+  portal_type = 'External Library File'
+  isPortalContent = 1
+  isRADContent = 1
+
+  # Declarative security
+  security = ClassSecurityInfo()
+  security.declareObjectProtected(Permissions.AccessContentsInformation)
+
+  # Default Properties
+  property_sheets = ( PropertySheet.Base
+                    , PropertySheet.CategoryCore
+                    , PropertySheet.DublinCore
+                    , PropertySheet.Version
+                    , PropertySheet.Reference
+                    , PropertySheet.Document
+                    , PropertySheet.TextDocument
+                    , PropertySheet.Url
+                    , PropertySheet.ExternalDocument
+                    )
+
+  def _processData(self,s,inf):
+    # remove current subobjects
+    self.manage_delObjects([i.getId() for i in self.searchFolder(portal_type='Book')])
+    # parse xml file and iterate over results
+    lista=parseLibraryFile(s)
+    for i,o in enumerate(lista):
+      n=self.newContent(portal_type='Book')
+      self.log(n.getRelativeUrl())
+      n.setTitle(o.title)
+      n.setDescription(o.description)
+      # copy attributes
+      for atr in self.portal_types[self.getPortalType()].getInstanceBaseCategoryList():
+        n.setProperty(atr,self.getProperty(atr))
+      # partial commits (otherwise packet may exceed mysql max size)
+      # XXX this should probably be deferred as portal_activities
+      if i % 50 ==0:
+        get_transaction().commit()
+    self.log(len(lista))
+    return 'k'*len(lista) # a hack to have number of objects in status message
+
+
+# vim: filetype=python syntax=python shiftwidth=2 

Added: erp5/trunk/products/ERP5/Document/ExternalWebPage.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/ExternalWebPage.py?rev=11808&view=auto
==============================================================================
--- erp5/trunk/products/ERP5/Document/ExternalWebPage.py (added)
+++ erp5/trunk/products/ERP5/Document/ExternalWebPage.py Sun Dec 31 17:00:30 2006
@@ -1,0 +1,213 @@
+##############################################################################
+#
+# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
+
+from AccessControl import ClassSecurityInfo
+from Products.CMFCore.WorkflowCore import WorkflowMethod
+from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
+from Products.ERP5.Document.File import stripHtml
+from Products.ERP5.Document.ExternalDocument import ExternalDocument, SpiderException
+
+import mimetypes, re, urllib
+from htmlentitydefs import name2codepoint
+
+rx=[]
+rx.append(re.compile('<!--.*?-->',re.DOTALL|re.MULTILINE)) # clear comments (sometimes JavaScript code in comments contains > chars)
+rx.append(re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE)) # clear tags
+rx.append(re.compile('\s+')) # compress multiple spaces
+
+def clearHtml(s):
+  for r in rx:
+    s=r.sub(" ",s)
+  return s
+
+
+# encoding in which the recoded text is returned
+tgtencoding='utf-8'
+# encodings tried, in this order, after the target encoding and the one
+# declared by the page itself
+encodings=['iso-8859-2','iso-8859-15','windows-1250']
+# captures the value following "charset=" inside a <meta ...> tag
+rx_charset=re.compile('<meta.*charset="?([\w\d\-]*)',re.DOTALL|re.MULTILINE|re.IGNORECASE)
+
+def recode(s):
+  """
+  maybe it can be useful system-wide
+  """
+  _encodings=encodings[:] # local copy
+  _encodings.insert(0,tgtencoding) # if not declared or declared wrongly, we try
+  m=rx_charset.search(s)
+  if m and len(m.groups())>0:
+    enc=m.groups()[0].lower()
+    if enc==tgtencoding:
+      return s
+    if enc in _encodings:
+      _encodings.remove(enc)
+    _encodings.insert(0,enc) # we'll start from what we've found
+  for enc in _encodings:
+    try:
+      return s.decode(enc).encode('utf-8')
+    except UnicodeDecodeError, LookupError:
+      pass
+  raise CanNotDecode('sorry')
+
+def _convertEntities(txt,rx,mapper=None):
+  def repl(code):
+    if mapper:
+      code=mapper.get(code)
+    if code is None:
+      return ''
+    return unichr(int(code)).encode(tgtencoding)
+  res=re.split(rx,txt)
+  res[1::2]=map(repl,res[1::2]) # Isn't it beautiful? :)
+  return ''.join(res)
+
+# numeric character references made of exactly three digits
+# NOTE(review): shorter or longer references (e.g. 2 or 4 digits) are not
+# matched - confirm this is intended
+rx_chars=re.compile('&#(\d{3});')
+# named entities with a name of 1 to 6 word characters
+# NOTE(review): longer entity names are not matched - confirm this is intended
+rx_ents=re.compile('&(\w{1,6});')
+
+def convertEntities(txt):
+  txt=_convertEntities(txt,rx_chars)
+  txt=_convertEntities(txt,rx_ents, name2codepoint)
+  return txt
+
+class ExternalWebPage(ExternalDocument):
+  """
+  caching sources from outside
+  """
+  # CMF Type Definition
+  meta_type = 'ERP5 External Web Page'
+  portal_type = 'External Web Page'
+  isPortalContent = 1
+  isRADContent = 1
+
+  # Declarative security
+  security = ClassSecurityInfo()
+  security.declareObjectProtected(Permissions.AccessContentsInformation)
+
+  # Default Properties
+  property_sheets = ( PropertySheet.Base
+                    , PropertySheet.CategoryCore
+                    , PropertySheet.DublinCore
+                    , PropertySheet.Version
+                    , PropertySheet.Reference
+                    , PropertySheet.Document
+                    , PropertySheet.TextDocument
+                    , PropertySheet.Url
+                    , PropertySheet.ExternalDocument
+                    )
+
+  def _findTopObject(self):
+    '''
+    find the top object from which the spidering begun
+    we search upwards untill we find or reach portal object
+    the top object is the one that is maintaining the dictionary
+    I think we have to do it instead of using simple acquisition
+    because we have to find a non-empty one
+    '''
+    ob=self
+    if hasattr(self,'urldict') and len(self.urldict)>0:
+      return self
+    else:
+      while 1:
+        ob=ob.aq_parent
+        if ob==self.getPortalObject():
+          return self
+        if hasattr(ob,'urldict') and len(ob.urldict)>0:
+          return ob
+
+  security.declareProtected(Permissions.ModifyPortalContent,'addUrl')
+  def addUrl(self,url):
+    '''
+    record url that has already been spidered
+    '''
+    self.urldict[url]=1
+    self._p_changed=1
+
+  security.declareProtected(Permissions.ModifyPortalContent,'checkUrl')
+  def checkUrl(self,url):
+    '''
+    check if the url has already been spidered
+    '''
+    return self.urldict.has_key(url)
+
+  security.declareProtected(Permissions.ModifyPortalContent,'resetTopObject')
+  def resetTopObject(self):
+    '''
+    reset the url dictionary
+    remember do it before you start recursive spidering
+    '''
+    self.urldict={}
+    self._p_changed=1
+
+  def _processData(self,s, inf):
+    # since this is a web page, we don't want anything else
+    # XXX we should find another way - like this, we end up with empty draft objects
+    if (inf.getmaintype(),inf.getsubtype())!=('text','html'):
+      raise SpiderException(100,'this is %s/%s' % (inf.getmaintype(),inf.getsubtype()))
+    top=self._findTopObject()
+    # remove current subobjects
+    self.manage_delObjects([i.getId() for i in self.searchFolder(portal_type='External Web Page')])
+    if self.getOptionRecursively()>0 and self.getRecursionDepth()>0:
+      # first find links in text
+      rx=re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
+      for ref in re.findall(rx, s):
+        # eliminate anchors and specials, select internal links
+        if ref.startswith('javascript') or ref.startswith('mailto'):
+          continue
+        ref=re.sub('#.*','',ref)
+        if ref=='':continue
+        #baseref='/'.join(self.getQualifiedUrl().split('/'))
+        baseref=self.getQualifiedUrl()
+        if not ref.startswith('http'):
+          # complete relative paths
+          ref=baseref+'/'+ref
+        # eliminate multiple slashes
+        rx=re.compile('([^:]{1})\/{2,}')
+        ref=re.sub(rx,'\1/',ref)
+        # create subobjects
+        if ref.startswith(baseref) and not top.checkUrl(ref):
+          # record my url in top object
+          top.addUrl(ref)
+          n=self.newContent(portal_type='External Web Page')
+          # set coordinates
+          n.setUrlProtocol('http')
+          n.setUrlString(ref)
+          n.setOptionRecursively(1)
+          n.setRecursionDepth(self.getRecursionDepth()-1)
+          # copy attributes
+          for atr in self.portal_types[self.getPortalType()].getInstanceBaseCategoryList():
+            n.setProperty(atr,self.getProperty(atr))
+          n.activate(activity='SQLQueue').ExternalDocument_spiderAndSetState()
+    # process self
+    # here we check encoding and convert to UTF8
+    try:
+      s=recode(s)
+    except CanNotDecode:
+      self.setExternalProcessingStatusMessage("Spidered on %s, %i chars, but could not decode" % (self._time(), chars))
+      return False
+    s=stripHtml(s) # remove headers, doctype and the like
+    s=clearHtml(s) # remove tags
+    s=convertEntities(s) # convert charrefs and named entities
+    return s
+
+
+# vim: filetype=python syntax=python shiftwidth=2 

Added: erp5/trunk/products/ERP5/Document/TextDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/TextDocument.py?rev=11808&view=auto
==============================================================================
--- erp5/trunk/products/ERP5/Document/TextDocument.py (added)
+++ erp5/trunk/products/ERP5/Document/TextDocument.py Sun Dec 31 17:00:30 2006
@@ -1,0 +1,96 @@
+##############################################################################
+#
+# Copyright (c) 2002 Nexedi SARL and Contributors. All Rights Reserved.
+#                    Jean-Paul Smets-Solanes <jp at nexedi.com>
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
+
+from AccessControl import ClassSecurityInfo
+
+from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
+from Products.ERP5.Document.Document import Document
+from Products.ERP5Type.WebDAVSupport import TextContent
+
+class TextDocument(Document, TextContent):
+    """
+        A Document contains text which can be formatted using
+        *Structured Text* or *HTML*. Text can be automatically translated
+        through the use of 'message catalogs'.
+
+        Document inherits from XMLObject and can
+        be synchronized accross multiple sites.
+
+        Version Management: the notion of version depends on the
+        type of application. For example, in the case (1) of Transformation
+        (BOM), all versions are considered as equal and may be kept
+        indefinitely for both archive and usage purpose. In the case (2)
+        of Person data, the new version replaces the previous one
+        in place and is not needed for archive. In the case (3) of
+        a web page, the new version replaces the previous one,
+        the previous one being kept in place for archive.
+
+        Subcontent: documents may include subcontent (files, images, etc.)
+        so that publication of rich content can be path independent.
+    """
+
+    meta_type = 'ERP5 Text Document'
+    portal_type = 'Text Document'
+    add_permission = Permissions.AddPortalContent
+    isPortalContent = 1
+    isRADContent = 1
+
+    # Declarative security
+    security = ClassSecurityInfo()
+    security.declareObjectProtected(Permissions.AccessContentsInformation)
+
+    # Declarative properties
+    property_sheets = ( PropertySheet.Base
+                      , PropertySheet.XMLObject
+                      , PropertySheet.CategoryCore
+                      , PropertySheet.DublinCore
+                      , PropertySheet.Version
+                      , PropertySheet.Document
+                      , PropertySheet.TextDocument
+                      )
+
+    # Declarative interfaces
+    __implements__ = ()
+
+    # Patch
+    PUT = TextContent.PUT # XXX-JPS - Here wa have a security issue - ask seb what to do
+
+    ### Content indexing methods
+    security.declareProtected(Permissions.View, 'getSearchableText')
+    def getSearchableText(self, md=None):
+        """\
+        Used by the catalog for basic full text indexing
+        We should try to do some kind of file conversion here so that getTextContent
+        returns something more readable.
+        """
+        searchable_text = "%s %s %s %s" %  (self.getTitle(), self.getDescription(),
+                                            self.getId(), self.getTextContent())
+        return searchable_text
+
+    # Compatibility with CMF Catalog / CPS sites
+    SearchableText = getSearchableText # XXX-JPS - Here wa have a security issue - ask seb what to do




More information about the Erp5-report mailing list