[Erp5-report] r13631 - /erp5/trunk/products/ERP5/Document/
nobody at svn.erp5.org
nobody at svn.erp5.org
Mon Mar 26 13:53:34 CEST 2007
Author: jp
Date: Mon Mar 26 13:53:32 2007
New Revision: 13631
URL: http://svn.erp5.org?rev=13631&view=rev
Log:
Code review and refactoring based on Document API.
Modified:
erp5/trunk/products/ERP5/Document/Document.py
erp5/trunk/products/ERP5/Document/Image.py
erp5/trunk/products/ERP5/Document/TextDocument.py
Modified: erp5/trunk/products/ERP5/Document/Document.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/Document.py?rev=13631&r1=13630&r2=13631&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/Document.py (original)
+++ erp5/trunk/products/ERP5/Document/Document.py Mon Mar 26 13:53:32 2007
@@ -42,6 +42,9 @@
from Products.ERP5Type.Message import Message
from Products.ERP5Type.Utils import convertToUpperCase, convertToMixedCase
from Products.ERP5.Document.Url import UrlMixIn
+from Products.ERP5.Tool.ContributionTool import MAX_REPEAT
+
+from zLOG import LOG
_MARKER = []
VALID_ORDER_KEY_LIST = ('user_login', 'content', 'file_name', 'input')
@@ -50,6 +53,47 @@
items = kw.items()
items.sort()
return tuple(items)
+
+class SnapshotMixin:
+ """
+ This class provides a generic API to store in the ZODB
+ PDF snapshots of objects and documents with the
+ goal to keep a facsimile copy of documents as they
+ were at a given date.
+ """
+
+ # Declarative security
+ security = ClassSecurityInfo()
+ security.declareObjectProtected(Permissions.AccessContentsInformation)
+
+ security.declareProtected(Permissions.ModifyPortalContent, 'createSnapshot')
+ def createSnapshot(self):
+ """
+ Create a snapshot (PDF). This is the normal way to modifiy
+ snapshot_data. Once a snapshot is taken, a new snapshot
+ can not be taken.
+
+ NOTE: use getSnapshotData and hasSnapshotData accessors
+ to access a snapshot.
+
+ NOTE2: implementation of createSnapshot should probably
+ be delegated to a types base method since this it
+ is configuration dependent.
+ """
+ if self.hasSnapshotData():
+ raise ConversionError('This document already has a snapshot.')
+ self._setSnapshotData(self.convert(format='pdf'))
+
+ security.declareProtected(Permissions.ManagePortal, 'deleteSnapshot')
+ def deleteSnapshot(self):
+ """
+ Deletes the snapshot - in theory this should never be done.
+ It is there for programmers and system administrators.
+ """
+ try:
+ del(self.snapshot_data)
+ except AttributeError:
+ pass
class ConversionError(Exception):pass
@@ -107,8 +151,7 @@
security.declareProtected(Permissions.View, 'getCacheTime')
def getCacheTime(self, **format):
"""
- Checks when
-if ever was the file produced
+ Checks when if ever was the file produced
"""
self.updateConversionCache()
return self._cached_time.get(makeSortedTuple(format), 0)
@@ -148,7 +191,7 @@
tformat = makeSortedTuple(format)
return self._cached_mime.get(tformat, ''), self._cached_data.get(tformat, '')
- security.declareProtected(Permissions.View, 'getConversionCacheInfo')
+ security.declareProtected(Permissions.ViewManagementScreens, 'getConversionCacheInfo')
def getConversionCacheInfo(self):
"""
Get cache details as string (for debugging)
@@ -173,7 +216,7 @@
return s
-class Document(XMLObject, UrlMixIn):
+class Document(XMLObject, UrlMixIn, ConversionCacheMixin, SnapshotMixin):
"""
Document is an abstract class with all methods
related to document management in ERP5. This includes
@@ -239,7 +282,7 @@
input - data supplied with http request or set on the object during (2) (e.g.
discovered from email text)
- file_name - data which might be encoded in file name
+ file_name - data which might be encoded in file name
user_login - information about user who is contributing the file
content - data which might be derived from document content
@@ -335,6 +378,8 @@
# Regular expressions
href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
+ body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
+ title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
# Declarative security
security = ClassSecurityInfo()
@@ -347,25 +392,25 @@
, PropertySheet.DublinCore
, PropertySheet.Version
, PropertySheet.Document
- , PropertySheet.Snapshot
, PropertySheet.ExternalDocument
, PropertySheet.Url
, PropertySheet.Periodicity
+ , PropertySheet.Snapshot
)
# Declarative interfaces
__implements__ = ()
- searchable_property_list = ('title', 'description', 'id', 'reference',
- 'version', 'short_title', 'keyword',
- 'subject', 'source_reference', 'source_project_title')
+ searchable_property_list = ('asText', 'title', 'description', 'id', 'reference',
+ 'version', 'short_title',
+ 'subject', 'source_reference', 'source_project_title',)
data = '' # some day this will be in property sheets
base_format = 'base storage format'
### Content processing methods
security.declareProtected(Permissions.View, 'index_html')
- def index_html(self, REQUEST, RESPONSE, format=None, force=0, **kw):
+ def index_html(self, REQUEST, RESPONSE, format=None, **kw):
"""
We follow here the standard Zope API for files and images
and extend it to support format conversion. The idea
@@ -382,16 +427,14 @@
Should return appropriate format (calling convert
if necessary) and set headers.
- format - the format specied in the form of an extension
+ format -- the format specied in the form of an extension
string (ex. jpeg, html, text, txt, etc.)
- force - convert doc even if it has a cached version which seems to be up2date
- **kw can be various things - e.g. resolution
+
+ **kw -- can be various things - e.g. resolution
TODO:
- implement guards API so that conversion to certain
formats require certain permission
- - force parameter should be somehow restricted
- to prevent denial of service attack
"""
pass
@@ -421,6 +464,7 @@
else:
val = list(val)
return val
+
searchable_text = reduce(add, map(lambda x: getPropertyListOrValue(x),
self.searchable_property_list))
searchable_text = ' '.join(searchable_text)
@@ -429,23 +473,30 @@
# Compatibility with CMF Catalog
SearchableText = getSearchableText
+ security.declareProtected(Permissions.AccessContentsInformation, 'isExternalDocument')
+ def isExternalDocument(self):
+ """
+ Return true if this document was obtained from an external source
+ """
+ return bool(self.getUrlString())
+
### Relation getters
+ security.declareProtected(Permissions.View, 'getSearchableReferenceList')
def getSearchableReferenceList(self):
"""
- Public Method
-
This method returns a list of dictionaries which can
be used to find objects by reference. It uses for
that a regular expression defined at system level
preferences.
"""
- text = self.getSearchableText()
+ text = self.getSearchableText() # XXX getSearchableText or asText ?
regexp = self.portal_preferences.getPreferredDocumentReferenceRegularExpression()
try:
rx_search = re.compile(regexp)
except TypeError: # no regexp in preference
- self.log('please set document reference regexp in preferences')
- return []
+ LOG('ERP5/Document/Document.getSearchableReferenceList', 0,
+ 'Document regular expression must be set in portal preferences')
+ return ()
res = rx_search.finditer(text)
res = [(r.group(), r.groupdict()) for r in res]
return res
@@ -785,8 +836,8 @@
Based on the document content, find out as many properties as needed.
returns properties which should be set on the document
"""
- # XXX this method should first make sure we have text content
- # or do a conversion
+ if not self.hasBaseData():
+ self.convertToBaseFormat()
method = self._getTypeBasedMethod('getPropertyDictFromContent',
fallback_script_id='Document_getPropertyDictFromContent')
return method()
@@ -821,7 +872,7 @@
### Metadata disovery and ingestion methods
security.declareProtected(Permissions.ModifyPortalContent, 'discoverMetadata')
- def discoverMetadata(self, file_name, user_login=None): # XXX Was filename always there ? at least make it optional
+ def discoverMetadata(self, file_name=None, user_login=None):
"""
This is the main metadata discovery function - controls the process
of discovering data from various sources. The discovery itself is
@@ -843,88 +894,39 @@
order_list.reverse()
# Start with everything until content - build a dictionary according to the order
- content_index = order_list.index('content')
kw = {}
- first_list = order_list[:content_index]
- for order_id in first_list:
+ for order_id in order_list:
if order_id not in VALID_ORDER_KEY_LIST:
# Prevent security attack or bad preferences
raise AttributeError, "%s is not in valid order key list" % order_id
method_id = 'getPropertyDictFrom%s' % convertToUpperCase(order_id)
method = getattr(self, method_id)
if order_id == 'file_name':
- result = method(file_name)
+ if file_name is not None: result = method(file_name)
elif order_id == 'user_login':
- result = method(user_login)
+ if user_login is not None: result = method(user_login)
else:
result = method()
if result is not None:
+ # LOG('discoverMetadata %s' % order_id, 0, repr(result))
kw.update(result)
- # Edit content
+ # Prepare the content edit parameters - portal_type should not be changed
try:
del(kw['portal_type'])
except KeyError:
pass
- self.edit(**kw)
-
- # Finish in second stage
- self.activate().finishMetadataDiscovery(user_login=user_login)
-
- security.declareProtected(Permissions.ModifyPortalContent, 'finishMetadataDiscovery')
- def finishMetadataDiscovery(self, user_login):
- """
- This is called by portal_activities, to leave time-consuming procedures
- for later. It converts what needs conversion to base, and
- does things that can be done only after it is converted).
- """
- self.convertToBaseFormat()
- # Get the order from preferences
- # Preference is made of a sequence of 'user_login', 'content', 'file_name', 'input'
- method = self._getTypeBasedMethod('getPreferredDocumentMetadataDiscoveryOrderList',
- fallback_script_id = 'Document_getPreferredDocumentMetadataDiscoveryOrderList')
- order_list = list(method())
- order_list.reverse()
-
- # do content and everything after content
- content_index = order_list.index('content')
- second_list = order_list[content_index:]
-
- kw = {}
- for order_id in second_list:
- if order_id not in VALID_ORDER_KEY_LIST:
- # Prevent security attack or bad preferences
- raise AttributeError, "%s is not in valid order key list" % order_id
- method_id = 'getPropertyDictFrom%s' % convertToUpperCase(order_id)
- method = getattr(self, method_id)
- if order_id == 'file_name':
- result = method(self.getSourceReference())
- elif order_id == 'user_login':
- result = method(user_login)
- else:
- result = method()
- if result is not None:
- kw.update(result)
-
- # Edit content
- try:
- del(kw['portal_type'])
- except KeyError:
- pass
- self.edit(**kw)
-
- # Erase backup attributes
- if hasattr(self, '_backup_input'):
- delattr(self, '_backup_input')
-
- # Finish ingestion by calling method
- self.finishIngestion()
+ self._edit(**kw) # Try not to invoke an automatic transition here
+ self.finishIngestion() # Finish ingestion by calling method
+ self.reindexObject()
security.declareProtected(Permissions.ModifyPortalContent, 'finishIngestion')
def finishIngestion(self):
"""
- Finish the ingestion process by calling the appropriate script
+ Finish the ingestion process by calling the appropriate script. This
+ script can for example allocate a reference number automatically if
+ no reference was defined.
"""
return self._getTypeBasedMethod('finishIngestion',
fallback_script_id='Document_finishIngestion')
@@ -939,39 +941,86 @@
string (ex. jpeg, html, text, txt, etc.)
**kw can be various things - e.g. resolution
+ Default implementation returns an empty string (html, text)
+ or raises an error.
+
TODO:
- implement guards API so that conversion to certain
formats require certain permission
"""
- pass
+ if format == 'html':
+ return 'text/html', ''
+ if format in ('text', 'txt'):
+ return 'text/plain', ''
+ raise NotImplementedError
security.declareProtected(Permissions.View, 'asText')
def asText(self):
"""
Converts the content of the document to a textual representation.
"""
- return self.convert(format='txt')
+ mime, data = self.convert(format='txt')
+ return data
security.declareProtected(Permissions.View, 'asHTML')
def asHTML(self):
"""
Returns a complete HTML representation of the document
- (with body tags, etc.).
- """
- return self.convert(format='html')
+ (with body tags, etc.). Adds if necessary a base
+ tag so that the document can be displayed in an iframe
+ or standalone.
+ """
+ if self.hasConversion(format='base-html'):
+ mime, data = self.getConversion(format='base-html')
+ return data
+ mime, html = self.convert(format='html')
+ if self.getUrlString():
+ # If a URL is defined, add the base tag
+ # if base is defined yet.
+ html = str(html)
+ if not html.find('<base') >= 0:
+ base = '<base href="%s">' % self.getContentBaseURL()
+ html = html.replace('<head>', '<head>%s' % base)
+ # We do not implement cache yet since it increases ZODB
+ # for probably no reason. More research needed
+ # self.setConversion(html, mime='text/html', format='base-html')
+ return html
security.declareProtected(Permissions.View, 'asStrippedHTML')
def asStrippedHTML(self):
"""
- Returns a stipped HTML representation of the document
- (without body tags, etc.) which can be used to inline
+ Returns a stripped HTML representation of the document
+ (without html and body tags, etc.) which can be used to inline
a preview of the document.
"""
- return self.convert(format='html')
+ if self.hasConversion(format='stripped-html'):
+ mime, data = self.getConversion(format='stripped-html')
+ return data
+ mime, html = self.convert(format='html')
+ body_list = re.findall(self.body_parser, str(html))
+ if len(body_list):
+ return body_list[0]
+ return html
+
+ security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
+ def getContentInformation(self):
+ """
+ Returns the content information from the HTML conversion.
+ The default implementation tries to build a dictionnary
+ from the HTML conversion of the document and extract
+ the document title.
+ """
+ result = {}
+ html = self.asHTML()
+ if not html: return result
+ title_list = re.findall(self.title_parser, str(html))
+ if title_list:
+ result['title'] = title_list[0]
+ return result
# Base format support
security.declareProtected(Permissions.ModifyPortalContent, 'convertToBaseFormat')
- def convertToBaseFormat(self, REQUEST=None):
+ def convertToBaseFormat(self):
"""
Converts the content of the document to a base format
which is later used for all conversions. This method
@@ -990,7 +1039,8 @@
Use accessors (getBaseData, setBaseData, hasBaseData, etc.)
"""
try:
- msg = self._convertToBaseFormat()
+ msg = self._convertToBaseFormat() # Call implemetation method
+ self.clearConversionCache() # Conversion cache is now invalid
if msg is None:
msg = 'Converted to %s.' % self.base_format
self.convertFile(comment=msg) # Invoke workflow method
@@ -1025,36 +1075,6 @@
"""
raise NotImplementedError
- # Snapshot methods - XXX since this can be useful beyond
- # documents, it should be moved to MixIn class which may
- # be used, for example, to take a snapshot of an invoice.
- security.declareProtected(Permissions.ModifyPortalContent, 'createSnapshot')
- def createSnapshot(self):
- """
- Create a snapshot (PDF). This is the normal way to modifiy
- snapshot_data. Once a snapshot is taken, a new snapshot
- can not be taken.
-
- NOTE: use getSnapshotData and hasSnapshotData accessors
- to access a snapshot.
-
- NOTE2: implementation of createSnapshot should probably
- be delegated to a types base method since this it
- is configuration dependent.
- """
- pass
-
- security.declareProtected(Permissions.ManagePortal, 'deleteSnapshot')
- def deleteSnapshot(self):
- """
- Deletes the snapshot - in theory this should never be done.
- It is there for programmers and system administrators.
- """
- try:
- del(self.snapshot_data)
- except AttributeError:
- pass
-
# Transformation API
security.declareProtected(Permissions.ModifyPortalContent, 'populateContent')
def populateContent(self):
@@ -1091,12 +1111,12 @@
return re.findall(self.href_parser, str(html_content))
security.declareProtected(Permissions.ModifyPortalContent, 'updateContentFromURL')
- def updateContentFromURL(self):
+ def updateContentFromURL(self, repeat=MAX_REPEAT, crawling_depth=0):
"""
Download and update content of this document from its source URL.
Implementation is handled by ContributionTool.
"""
- self.portal_contributions.updateContentFromURL(self)
+ self.portal_contributions.updateContentFromURL(self, repeat=repeat, crawling_depth=crawling_depth)
security.declareProtected(Permissions.ModifyPortalContent, 'crawlContent')
def crawlContent(self):
@@ -1111,7 +1131,7 @@
Returns the content base URL based on the actual content or
on its URL.
"""
- # XXX TODO - try to retrive base URL from content
+ # XXX TODO - try to retrieve base URL from content
# If no base_url defined, define the base URL from our URL
base_url = self.asURL()
base_url_list = base_url.split('/')
@@ -1123,7 +1143,7 @@
def getNextAlarmDate(self):
"""
This method is only there to have something to test.
- Serious refactory of Alarm, Periodicity and CalendarPeriod
+ Serious refactoring of Alarm, Periodicity and CalendarPeriod
classes is needed.
"""
- return DateTime() + .01
+ return DateTime() + .1
Modified: erp5/trunk/products/ERP5/Document/Image.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/Image.py?rev=13631&r1=13630&r2=13631&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/Image.py (original)
+++ erp5/trunk/products/ERP5/Document/Image.py Mon Mar 26 13:53:32 2007
@@ -94,12 +94,16 @@
# Default Properties
property_sheets = ( PropertySheet.Base
+ , PropertySheet.XMLObject
, PropertySheet.CategoryCore
, PropertySheet.DublinCore
, PropertySheet.Version
, PropertySheet.Reference
, PropertySheet.Document
, PropertySheet.Data
+ , PropertySheet.ExternalDocument
+ , PropertySheet.Url
+ , PropertySheet.Periodicity
)
#
@@ -188,7 +192,7 @@
if not self.hasConversion(display=display, format=format,
quality=quality, resolution=resolution):
# Generate photo on-the-fly
- self._makeDisplayPhoto(display, 1, format=format, quality=quality, resolution=resolution)
+ self._makeDisplayPhoto(display, format=format, quality=quality, resolution=resolution)
mime, image = self.getConversion(display=display, format=format,
quality=quality ,resolution=resolution)
width, height = (image.width, image.height)
@@ -290,10 +294,11 @@
if not self.hasConversion(display=display, format=format,
quality=quality,resolution=resolution):
# Generate photo on-the-fly
- self._makeDisplayPhoto(display, 1, format=format, quality=quality,resolution=resolution)
+ self._makeDisplayPhoto(display, format=format, quality=quality,resolution=resolution)
# Return resized image
mime, image = self.getConversion(display=display, format=format,
quality=quality ,resolution=resolution)
+ RESPONSE.setHeader('Content-Type', mime)
return image.index_html(REQUEST, RESPONSE)
# Return original image
@@ -307,7 +312,6 @@
def _resize(self, display, width, height, quality=75, format='', resolution=None):
"""Resize and resample photo."""
newimg = StringIO()
- os.putenv('TMPDIR', '/tmp') # because if we run zope as root, we have /root/tmp here and convert goes crazy
if sys.platform == 'win32':
from win32pipe import popen2
@@ -324,8 +328,9 @@
imgout, imgin = popen2('convert -quality %s -geometry %sx%s - -'
% (quality, width, height))
else:
- LOG('Resolution',0,str(resolution))
- cmd = 'convert -density %sx%s -quality %s -geometry %sx%s - -' % (resolution, resolution, quality, width, height)
+ # LOG('Resolution',0,str(resolution))
+ cmd = 'convert -density %sx%s -quality %s -geometry %sx%s - -' % (resolution,
+ resolution, quality, width, height)
imgout, imgin = popen2(cmd)
imgin.write(str(self.getData()))
@@ -357,9 +362,9 @@
quality=quality,resolution=resolution))
return image
- def _makeDisplayPhoto(self, display, force=0, format='', quality=75, resolution=None):
+ def _makeDisplayPhoto(self, display, format='', quality=75, resolution=None):
"""Create given display."""
- if not self.hasConversion(display=display, format=format, quality=quality,resolution=resolution) or force:
+ if not self.hasConversion(display=display, format=format, quality=quality,resolution=resolution):
image = self._getDisplayPhoto(display, format=format, quality=quality, resolution=resolution)
self.setConversion(image, mime=image.content_type,
display=display, format=format,
Modified: erp5/trunk/products/ERP5/Document/TextDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/TextDocument.py?rev=13631&r1=13630&r2=13631&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/TextDocument.py (original)
+++ erp5/trunk/products/ERP5/Document/TextDocument.py Mon Mar 26 13:53:32 2007
@@ -125,7 +125,11 @@
if format is None:
# The default is to use ERP5 Forms to render the page
return self.view()
- return self.convert(format=format)
+ mime, data = self.convert(format=format)
+ RESPONSE.setHeader('Content-Length', len(data))
+ RESPONSE.setHeader('Content-Type', mime)
+ RESPONSE.setHeader('Accept-Ranges', 'bytes')
+ return data
security.declareProtected(Permissions.View, 'convert')
def convert(self, format, **kw):
@@ -136,30 +140,14 @@
_setCacheHeaders(self, {'format' : format})
# Return the raw content
if format == 'raw':
- return self.getTextContent()
+ return 'text/plain', self.getTextContent()
mime_type = getToolByName(self, 'mimetypes_registry').lookupExtension('name.%s' % format)
src_mimetype = self.getTextFormat()
if not src_mimetype.startswith('text/'):
src_mimetype = 'text/%s' % src_mimetype
- return getToolByName(self, 'portal_transforms').convertTo(mime_type,
+ return mime_type, getToolByName(self, 'portal_transforms').convertTo(mime_type,
self.getTextContent(), object=self, mimetype=src_mimetype)
def __call__(self):
_setCacheHeaders(self, {})
return Document.__call__(self)
-
- ### Content indexing methods
- security.declareProtected(Permissions.View, 'getSearchableText')
- def getSearchableText(self, md=None):
- """\
- Used by the catalog for basic full text indexing
- We should try to do some kind of file conversion here so that getTextContent
- returns something more readable.
- """
- searchable_text = "%s %s %s %s %s" % (self.getTitle(), self.getShortTitle(),
- self.getDescription(),
- self.getId(), self.getTextContent())
- return searchable_text
-
- # Compatibility with CMF Catalog / CPS sites
- SearchableText = getSearchableText # XXX-JPS - Here wa have a security issue - ask seb what to do
More information about the Erp5-report
mailing list