[Erp5-report] r11808 - /erp5/trunk/products/ERP5/Document/
nobody at svn.erp5.org
nobody at svn.erp5.org
Sun Dec 31 17:00:37 CET 2006
Author: jp
Date: Sun Dec 31 17:00:30 2006
New Revision: 11808
URL: http://svn.erp5.org?rev=11808&view=rev
Log:
Moved from ERP5OOo to ERP5. Code is still very early. Do not use in production.
Added:
erp5/trunk/products/ERP5/Document/ExternalDocument.py
erp5/trunk/products/ERP5/Document/ExternalLibraryFile.py
erp5/trunk/products/ERP5/Document/ExternalWebPage.py
erp5/trunk/products/ERP5/Document/TextDocument.py
Added: erp5/trunk/products/ERP5/Document/ExternalDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/ExternalDocument.py?rev=11808&view=auto
==============================================================================
--- erp5/trunk/products/ERP5/Document/ExternalDocument.py (added)
+++ erp5/trunk/products/ERP5/Document/ExternalDocument.py Sun Dec 31 17:00:30 2006
@@ -1,0 +1,159 @@
+##############################################################################
+#
+# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+##############################################################################
+
+from AccessControl import ClassSecurityInfo
+from Products.CMFCore.WorkflowCore import WorkflowMethod
+from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
+from Products.ERP5.Document.Document import Document
+
+import mimetypes, re, urllib
+from htmlentitydefs import name2codepoint
+from DateTime import DateTime
+
+
+class SpiderException(Exception):
+
+ def __init__(self,code, msg):
+ msg="%i: %s" % (code, msg)
+ Exception.__init__(self,msg)
+
+class Opener(urllib.FancyURLopener):
+
+ def http_error_default(self, url, fp, code, msg, headers):
+ raise SpiderException(code, msg)
+
+class ExternalDocument(Document):
+ """
+ caching sources from outside
+ This is basically an abstract class
+ classes deriving from it should overwrite method _processData (this
+ is the one that does something with character data obtained from source)
+ Spidering method supports http, ftp and file protocols, and possibly many others
+ """
+ # CMF Type Definition
+ meta_type = 'ERP5 External Document'
+ portal_type = 'External Document'
+ isPortalContent = 1
+ isRADContent = 1
+
+ # Declarative security
+ security = ClassSecurityInfo()
+ security.declareObjectProtected(Permissions.AccessContentsInformation)
+
+ # Default Properties
+ property_sheets = ( PropertySheet.Base
+ , PropertySheet.CategoryCore
+ , PropertySheet.DublinCore
+ , PropertySheet.Version
+ , PropertySheet.Reference
+ , PropertySheet.Document
+ , PropertySheet.TextDocument
+ , PropertySheet.Url
+ , PropertySheet.ExternalDocument
+ )
+
+ protocols=(('Web page','http'),('FTP site','ftp'),('Local file','file'),)
+
+ searchable_property_list = Document.searchable_property_list + ('text_content',)
+
+ security.declareProtected(Permissions.View, 'getProtocolList')
+ def getProtocolList(self):
+ """
+ """
+ return [x[1] for x in self.protocols]
+
+ security.declareProtected(Permissions.View, 'getProtocolItemList')
+ def getProtocolItemList(self):
+ """
+ """
+ return self.protocols
+
+ security.declarePrivate(Permissions.View, '_spiderSource')
+ def _spiderSource(self):
+ """
+ FancyURLopener can open various protocols
+ """
+ op=Opener()
+ f=op.open(self.getQualifiedUrl())
+ s=f.read()
+ inf=f.info()
+ return s, inf
+
+ security.declarePrivate('_processData')
+ def _processData(self,s, inf):
+ raise Exception('this should be implemented in subclass')
+
+ security.declareProtected(Permissions.ModifyPortalContent,'resetTopObject')
+ def resetTopObject(self):
+ '''
+ abstract function for maintaining interface
+ call before beginning recursive spidering
+ used mostly in web pages
+ '''
+ pass
+
+ security.declareProtected(Permissions.View, 'getProtocolItemList')
+ def spiderSource(self):
+ """
+ spiders external datasource
+ sets status message
+ returned value tells us if it succeeded or failed
+ """
+ try:
+ s,inf=self._spiderSource()
+ except Exception,e:
+ self.log(e,level=1)
+ self.setExternalProcessingStatusMessage("Tried on %s: %s" % (self._time(),str(e)))
+ return False
+ chars=len(s)
+ if chars==0:
+ self.setExternalProcessingStatusMessage("Tried on %s: got empty string" % self._time())
+ return False
+ try:
+ s=self._processData(s,inf)
+ except Exception,e:
+ self.log(e,level=1)
+ self.setExternalProcessingStatusMessage("Spidered on %s, %i chars, but could not process; reason: %s" % (self._time(), chars, str(e)))
+ return False
+ self.setTextContent(s)
+ self.setExternalProcessingStatusMessage("Spidered on %s, %i chars, recorded %i chars" % (self._time(), chars, len(s)))
+ return True
+
+ security.declareProtected(Permissions.View, 'getProtocolItemList')
+ def getQualifiedUrl(self):
+ """
+ this should be in the Url, not here
+ otherwise why does the url have a property 'url_protocol'?
+ """
+ return (self.getUrlProtocol() or '')+'://'+(self.getUrlString() or '')
+
+ def _time(self):
+ return DateTime().strftime('%Y/%m/%d %H:%M:%S')
+
+
+# vim: syntax=python shiftwidth=2
+
Added: erp5/trunk/products/ERP5/Document/ExternalLibraryFile.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/ExternalLibraryFile.py?rev=11808&view=auto
==============================================================================
--- erp5/trunk/products/ERP5/Document/ExternalLibraryFile.py (added)
+++ erp5/trunk/products/ERP5/Document/ExternalLibraryFile.py Sun Dec 31 17:00:30 2006
@@ -1,0 +1,135 @@
+##############################################################################
+#
+# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+##############################################################################
+
+from AccessControl import ClassSecurityInfo
+from Products.CMFCore.WorkflowCore import WorkflowMethod
+from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
+from Products.ERP5.Document.ExternalDocument import ExternalDocument, SpiderException
+
+from xml import sax
+
+def stripName(s):
+ return s[4:].replace('-','_').encode()
+
+class BookInfo(object):
+ id=title=description=''
+
+class Handler(sax.handler.ContentHandler):
+ stack=[]
+ attrs=None
+ c=''
+ d=None
+ results=[]
+
+ def startElement(self,name,attrs):
+ name=stripName(name)
+ self.stack.append(name)
+ self.attrs=attrs
+ if hasattr(self,'start_'+name):
+ getattr(self,'start_'+name)()
+
+ def endElement(self,name):
+ name=stripName(name)
+ if hasattr(self,'end_'+name):
+ getattr(self,'end_'+name)()
+ self.stack.pop()
+ self.attrs=None
+ self.c=''
+
+ def characters(self,c):
+ self.c+=c.strip().encode('utf-8')
+
+ def start_Record(self):
+ self.d=BookInfo()
+ self.results.append(self.d)
+
+ def end_ID(self):
+ self.d.id=self.c
+
+ def end_Title(self):
+ self.d.title+=self.c
+
+ def end_Author(self):
+ self.d.description+=self.c+'; '
+
+ def end_Label_Information(self):
+ self.d.description+=self.c+'; '
+
+def parseLibraryFile(s):
+ h=Handler()
+ sax.parseString(s,h)
+ return h.results
+
+
+class ExternalLibraryFile(ExternalDocument):
+ """
+ get AU library data
+
+ The spidered data is parsed with parseLibraryFile and every parsed
+ record is stored as a 'Book' subobject of this document.
+ """
+ # CMF Type Definition
+ meta_type = 'ERP5 External Library File'
+ portal_type = 'External Library File'
+ isPortalContent = 1
+ isRADContent = 1
+
+ # Declarative security
+ security = ClassSecurityInfo()
+ security.declareObjectProtected(Permissions.AccessContentsInformation)
+
+ # Default Properties
+ property_sheets = ( PropertySheet.Base
+ , PropertySheet.CategoryCore
+ , PropertySheet.DublinCore
+ , PropertySheet.Version
+ , PropertySheet.Reference
+ , PropertySheet.Document
+ , PropertySheet.TextDocument
+ , PropertySheet.Url
+ , PropertySheet.ExternalDocument
+ )
+
+ def _processData(self,s,inf):
+ # Replaces the 'Book' subobjects of this document by the records parsed
+ # from the XML string s; inf is not used here.
+ # Returns a string of 'k' characters, one per parsed record.
+ # NOTE(review): indentation was lost in this archived copy, so the exact
+ # nesting of the partial commit and of the last self.log call has to be
+ # checked against the repository version.
+ # remove current subobjects
+ self.manage_delObjects([i.getId() for i in self.searchFolder(portal_type='Book')])
+ # parse xml file and iterate over results
+ lista=parseLibraryFile(s)
+ for i,o in enumerate(lista):
+ n=self.newContent(portal_type='Book')
+ self.log(n.getRelativeUrl())
+ n.setTitle(o.title)
+ n.setDescription(o.description)
+ # copy attributes
+ for atr in self.portal_types[self.getPortalType()].getInstanceBaseCategoryList():
+ n.setProperty(atr,self.getProperty(atr))
+ # partial commits (otherwise packet may exceed mysql max size)
+ # XXX this should probably be deferred as portal_activities
+ # (i % 50 is also 0 for the very first record)
+ if i % 50 ==0:
+ get_transaction().commit()
+ self.log(len(lista))
+ return 'k'*len(lista) # a hack to have number of objects in status message
+
+
+# vim: filetype=python syntax=python shiftwidth=2
Added: erp5/trunk/products/ERP5/Document/ExternalWebPage.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/ExternalWebPage.py?rev=11808&view=auto
==============================================================================
--- erp5/trunk/products/ERP5/Document/ExternalWebPage.py (added)
+++ erp5/trunk/products/ERP5/Document/ExternalWebPage.py Sun Dec 31 17:00:30 2006
@@ -1,0 +1,213 @@
+##############################################################################
+#
+# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+##############################################################################
+
+from AccessControl import ClassSecurityInfo
+from Products.CMFCore.WorkflowCore import WorkflowMethod
+from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
+from Products.ERP5.Document.File import stripHtml
+from Products.ERP5.Document.ExternalDocument import ExternalDocument, SpiderException
+
+import mimetypes, re, urllib
+from htmlentitydefs import name2codepoint
+
+rx=[]
+rx.append(re.compile('<!--.*?-->',re.DOTALL|re.MULTILINE)) # clear comments (sometimes JavaScript code in comments contains > chars)
+rx.append(re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE)) # clear tags
+rx.append(re.compile('\s+')) # compress multiple spaces
+
+def clearHtml(s):
+ for r in rx:
+ s=r.sub(" ",s)
+ return s
+
+
+tgtencoding='utf-8'
+encodings=['iso-8859-2','iso-8859-15','windows-1250']
+rx_charset=re.compile('<meta.*charset="?([\w\d\-]*)',re.DOTALL|re.MULTILINE|re.IGNORECASE)
+
+def recode(s):
+ """
+ maybe it can be useful system-wide
+ """
+ _encodings=encodings[:] # local copy
+ _encodings.insert(0,tgtencoding) # if not declared or declared wrongly, we try
+ m=rx_charset.search(s)
+ if m and len(m.groups())>0:
+ enc=m.groups()[0].lower()
+ if enc==tgtencoding:
+ return s
+ if enc in _encodings:
+ _encodings.remove(enc)
+ _encodings.insert(0,enc) # we'll start from what we've found
+ for enc in _encodings:
+ try:
+ return s.decode(enc).encode('utf-8')
+ except UnicodeDecodeError, LookupError:
+ pass
+ raise CanNotDecode('sorry')
+
+def _convertEntities(txt,rx,mapper=None):
+ def repl(code):
+ if mapper:
+ code=mapper.get(code)
+ if code is None:
+ return ''
+ return unichr(int(code)).encode(tgtencoding)
+ res=re.split(rx,txt)
+ res[1::2]=map(repl,res[1::2]) # Isn't it beautiful? :)
+ return ''.join(res)
+
+# numeric character references made of exactly three decimal digits (&#233;)
+# NOTE(review): shorter or longer references (&#38;, &#8211;) do not match
+rx_chars=re.compile('&#(\d{3});')
+# named entities whose name has one to six word characters (&eacute;)
+# NOTE(review): longer entity names (&epsilon;) do not match
+rx_ents=re.compile('&(\w{1,6});')
+
+def convertEntities(txt):
+ txt=_convertEntities(txt,rx_chars)
+ txt=_convertEntities(txt,rx_ents, name2codepoint)
+ return txt
+
+class ExternalWebPage(ExternalDocument):
+ """
+ caching sources from outside
+ """
+ # CMF Type Definition
+ meta_type = 'ERP5 External Web Page'
+ portal_type = 'External Web Page'
+ isPortalContent = 1
+ isRADContent = 1
+
+ # Declarative security
+ security = ClassSecurityInfo()
+ security.declareObjectProtected(Permissions.AccessContentsInformation)
+
+ # Default Properties
+ property_sheets = ( PropertySheet.Base
+ , PropertySheet.CategoryCore
+ , PropertySheet.DublinCore
+ , PropertySheet.Version
+ , PropertySheet.Reference
+ , PropertySheet.Document
+ , PropertySheet.TextDocument
+ , PropertySheet.Url
+ , PropertySheet.ExternalDocument
+ )
+
+ def _findTopObject(self):
+ '''
+ find the top object from which the spidering begun
+ we search upwards untill we find or reach portal object
+ the top object is the one that is maintaining the dictionary
+ I think we have to do it instead of using simple acquisition
+ because we have to find a non-empty one
+ '''
+ ob=self
+ if hasattr(self,'urldict') and len(self.urldict)>0:
+ return self
+ else:
+ while 1:
+ ob=ob.aq_parent
+ if ob==self.getPortalObject():
+ return self
+ if hasattr(ob,'urldict') and len(ob.urldict)>0:
+ return ob
+
+ security.declareProtected(Permissions.ModifyPortalContent,'addUrl')
+ def addUrl(self,url):
+ '''
+ record url that has already been spidered
+ '''
+ self.urldict[url]=1
+ self._p_changed=1
+
+ security.declareProtected(Permissions.ModifyPortalContent,'checkUrl')
+ def checkUrl(self,url):
+ '''
+ check if the url has already been spidered
+ '''
+ return self.urldict.has_key(url)
+
+ security.declareProtected(Permissions.ModifyPortalContent,'resetTopObject')
+ def resetTopObject(self):
+ '''
+ reset the url dictionary
+ remember do it before you start recursive spidering
+ '''
+ self.urldict={}
+ self._p_changed=1
+
+ def _processData(self,s, inf):
+ # since this is a web page, we don't want anything else
+ # XXX we should find another way - like this, we end up with empty draft objects
+ if (inf.getmaintype(),inf.getsubtype())!=('text','html'):
+ raise SpiderException(100,'this is %s/%s' % (inf.getmaintype(),inf.getsubtype()))
+ top=self._findTopObject()
+ # remove current subobjects
+ self.manage_delObjects([i.getId() for i in self.searchFolder(portal_type='External Web Page')])
+ if self.getOptionRecursively()>0 and self.getRecursionDepth()>0:
+ # first find links in text
+ rx=re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
+ for ref in re.findall(rx, s):
+ # eliminate anchors and specials, select internal links
+ if ref.startswith('javascript') or ref.startswith('mailto'):
+ continue
+ ref=re.sub('#.*','',ref)
+ if ref=='':continue
+ #baseref='/'.join(self.getQualifiedUrl().split('/'))
+ baseref=self.getQualifiedUrl()
+ if not ref.startswith('http'):
+ # complete relative paths
+ ref=baseref+'/'+ref
+ # eliminate multiple slashes
+ rx=re.compile('([^:]{1})\/{2,}')
+ ref=re.sub(rx,'\1/',ref)
+ # create subobjects
+ if ref.startswith(baseref) and not top.checkUrl(ref):
+ # record my url in top object
+ top.addUrl(ref)
+ n=self.newContent(portal_type='External Web Page')
+ # set coordinates
+ n.setUrlProtocol('http')
+ n.setUrlString(ref)
+ n.setOptionRecursively(1)
+ n.setRecursionDepth(self.getRecursionDepth()-1)
+ # copy attributes
+ for atr in self.portal_types[self.getPortalType()].getInstanceBaseCategoryList():
+ n.setProperty(atr,self.getProperty(atr))
+ n.activate(activity='SQLQueue').ExternalDocument_spiderAndSetState()
+ # process self
+ # here we check encoding and convert to UTF8
+ try:
+ s=recode(s)
+ except CanNotDecode:
+ self.setExternalProcessingStatusMessage("Spidered on %s, %i chars, but could not decode" % (self._time(), chars))
+ return False
+ s=stripHtml(s) # remove headers, doctype and the like
+ s=clearHtml(s) # remove tags
+ s=convertEntities(s) # convert charrefs and named entities
+ return s
+
+
+# vim: filetype=python syntax=python shiftwidth=2
Added: erp5/trunk/products/ERP5/Document/TextDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/TextDocument.py?rev=11808&view=auto
==============================================================================
--- erp5/trunk/products/ERP5/Document/TextDocument.py (added)
+++ erp5/trunk/products/ERP5/Document/TextDocument.py Sun Dec 31 17:00:30 2006
@@ -1,0 +1,96 @@
+##############################################################################
+#
+# Copyright (c) 2002 Nexedi SARL and Contributors. All Rights Reserved.
+# Jean-Paul Smets-Solanes <jp at nexedi.com>
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+##############################################################################
+
+from AccessControl import ClassSecurityInfo
+
+from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
+from Products.ERP5.Document.Document import Document
+from Products.ERP5Type.WebDAVSupport import TextContent
+
+class TextDocument(Document, TextContent):
+ """
+ A Document contains text which can be formatted using
+ *Structured Text* or *HTML*. Text can be automatically translated
+ through the use of 'message catalogs'.
+
+ Document inherits from XMLObject and can
+ be synchronized accross multiple sites.
+
+ Version Management: the notion of version depends on the
+ type of application. For example, in the case (1) of Transformation
+ (BOM), all versions are considered as equal and may be kept
+ indefinitely for both archive and usage purpose. In the case (2)
+ of Person data, the new version replaces the previous one
+ in place and is not needed for archive. In the case (3) of
+ a web page, the new version replaces the previous one,
+ the previous one being kept in place for archive.
+
+ Subcontent: documents may include subcontent (files, images, etc.)
+ so that publication of rich content can be path independent.
+ """
+
+ meta_type = 'ERP5 Text Document'
+ portal_type = 'Text Document'
+ add_permission = Permissions.AddPortalContent
+ isPortalContent = 1
+ isRADContent = 1
+
+ # Declarative security
+ security = ClassSecurityInfo()
+ security.declareObjectProtected(Permissions.AccessContentsInformation)
+
+ # Declarative properties
+ property_sheets = ( PropertySheet.Base
+ , PropertySheet.XMLObject
+ , PropertySheet.CategoryCore
+ , PropertySheet.DublinCore
+ , PropertySheet.Version
+ , PropertySheet.Document
+ , PropertySheet.TextDocument
+ )
+
+ # Declarative interfaces
+ __implements__ = ()
+
+ # Patch
+ PUT = TextContent.PUT # XXX-JPS - Here wa have a security issue - ask seb what to do
+
+ ### Content indexing methods
+ security.declareProtected(Permissions.View, 'getSearchableText')
+ def getSearchableText(self, md=None):
+ """\
+ Used by the catalog for basic full text indexing
+ We should try to do some kind of file conversion here so that getTextContent
+ returns something more readable.
+ """
+ searchable_text = "%s %s %s %s" % (self.getTitle(), self.getDescription(),
+ self.getId(), self.getTextContent())
+ return searchable_text
+
+ # Compatibility with CMF Catalog / CPS sites
+ SearchableText = getSearchableText # XXX-JPS - Here wa have a security issue - ask seb what to do
More information about the Erp5-report
mailing list