extend processing of office files: error handling, handle description property as comments
This commit is contained in:
parent
027b661110
commit
19b50d9e8f
7 changed files with 31 additions and 17 deletions
|
@ -231,7 +231,7 @@ Extracting Document Properties from MS Office Files
|
||||||
>>> path = os.path.join(dataDir, 'office')
|
>>> path = os.path.join(dataDir, 'office')
|
||||||
>>> fn = os.path.join(path, 'example.docx')
|
>>> fn = os.path.join(path, 'example.docx')
|
||||||
>>> os.path.getsize(fn)
|
>>> os.path.getsize(fn)
|
||||||
20337...
|
23561...
|
||||||
|
|
||||||
>>> officeFile = addAndConfigureObject(resources, Resource, 'test.docx',
|
>>> officeFile = addAndConfigureObject(resources, Resource, 'test.docx',
|
||||||
... title=u'Example Word File', resourceType=tOfficeFile,
|
... title=u'Example Word File', resourceType=tOfficeFile,
|
||||||
|
@ -241,7 +241,7 @@ Extracting Document Properties from MS Office Files
|
||||||
|
|
||||||
>>> content = aOfficeFile.data
|
>>> content = aOfficeFile.data
|
||||||
>>> len(content)
|
>>> len(content)
|
||||||
17409
|
20327
|
||||||
|
|
||||||
Clean up:
|
Clean up:
|
||||||
>>> shutil.copy(fn + '.sav', fn)
|
>>> shutil.copy(fn + '.sav', fn)
|
||||||
|
|
|
@ -40,8 +40,8 @@ class ExternalCollectionView(ConceptView):
|
||||||
def update(self):
|
def update(self):
|
||||||
if 'update' in self.request.form:
|
if 'update' in self.request.form:
|
||||||
cta = adapted(self.context)
|
cta = adapted(self.context)
|
||||||
if cta is not None:
|
cta.request = self.request
|
||||||
cta.update()
|
cta.update()
|
||||||
if cta.updateMessage is not None:
|
if cta.updateMessage is not None:
|
||||||
self.request.form['message'] = cta.updateMessage
|
self.request.form['message'] = cta.updateMessage
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -101,6 +101,7 @@ class ExternalCollectionAdapter(AdapterBase):
|
||||||
adobj = adapted(obj)
|
adobj = adapted(obj)
|
||||||
directory = provider.getDirectory(self)
|
directory = provider.getDirectory(self)
|
||||||
adobj.storageParams=dict(subdirectory=directory)
|
adobj.storageParams=dict(subdirectory=directory)
|
||||||
|
adobj.request = self.request
|
||||||
adobj.externalAddress = addr
|
adobj.externalAddress = addr
|
||||||
# collect error information
|
# collect error information
|
||||||
if adobj.processingErrors:
|
if adobj.processingErrors:
|
||||||
|
@ -209,6 +210,7 @@ class DirectoryCollectionProvider(object):
|
||||||
contentType=contentType,
|
contentType=contentType,
|
||||||
)
|
)
|
||||||
adobj = adapted(obj)
|
adobj = adapted(obj)
|
||||||
|
adobj.request = client.request
|
||||||
adobj.externalAddress = addr # must be set last
|
adobj.externalAddress = addr # must be set last
|
||||||
# collect error information
|
# collect error information
|
||||||
if adobj.processingErrors:
|
if adobj.processingErrors:
|
||||||
|
|
|
@ -52,16 +52,12 @@ class OfficeFile(ExternalFileAdapter):
|
||||||
|
|
||||||
implements(IOfficeFile)
|
implements(IOfficeFile)
|
||||||
|
|
||||||
_adapterAttributes = ExternalFileAdapter._adapterAttributes + (
|
|
||||||
'processingErrors',)
|
|
||||||
|
|
||||||
propertyMap = {u'Revision:': 'version'}
|
propertyMap = {u'Revision:': 'version'}
|
||||||
propFileName = 'docProps/custom.xml'
|
propFileName = 'docProps/custom.xml'
|
||||||
|
corePropFileName = 'docProps/core.xml'
|
||||||
fileExtensions = ('.docm', '.docx', 'dotm', 'dotx', 'pptx', 'potx', 'ppsx',
|
fileExtensions = ('.docm', '.docx', 'dotm', 'dotx', 'pptx', 'potx', 'ppsx',
|
||||||
'.xlsm', '.xlsx', '.xltm', '.xltx')
|
'.xlsm', '.xlsx', '.xltm', '.xltx')
|
||||||
|
|
||||||
processingErrors = []
|
|
||||||
|
|
||||||
@Lazy
|
@Lazy
|
||||||
def logger(self):
|
def logger(self):
|
||||||
return getLogger('loops.integrator.office.base.OfficeFile')
|
return getLogger('loops.integrator.office.base.OfficeFile')
|
||||||
|
@ -91,20 +87,32 @@ class OfficeFile(ExternalFileAdapter):
|
||||||
from logging import getLogger
|
from logging import getLogger
|
||||||
self.logger.warn(e)
|
self.logger.warn(e)
|
||||||
return []
|
return []
|
||||||
|
if self.corePropFileName not in zf.namelist():
|
||||||
|
self.logger.warn('Core properties not found in file %s.' %
|
||||||
|
self.externalAddress)
|
||||||
if self.propFileName not in zf.namelist():
|
if self.propFileName not in zf.namelist():
|
||||||
self.logger.warn('Custom properties not found in file %s.' %
|
self.logger.warn('Custom properties not found in file %s.' %
|
||||||
self.externalAddress)
|
self.externalAddress)
|
||||||
propsXml = zf.read(self.propFileName)
|
propsXml = zf.read(self.propFileName)
|
||||||
|
corePropsXml = zf.read(self.corePropFileName)
|
||||||
|
# TODO: read core.xml, return both trees in dictionary
|
||||||
zf.close()
|
zf.close()
|
||||||
return etree.fromstring(propsXml)
|
return {'custom': etree.fromstring(propsXml),
|
||||||
|
'core': etree.fromstring(corePropsXml)}
|
||||||
|
|
||||||
def getDocProperty(self, pname):
|
def getDocProperty(self, pname):
|
||||||
for p in self.docPropertyDom:
|
for p in self.docPropertyDom['custom']:
|
||||||
name = p.attrib.get('name')
|
name = p.attrib.get('name')
|
||||||
if name == pname:
|
if name == pname:
|
||||||
return p[0].text
|
return p[0].text
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def getCoreProperty(self, pname):
|
||||||
|
for p in self.docPropertyDom['core']:
|
||||||
|
if p.tag.endswith(pname):
|
||||||
|
return p.text
|
||||||
|
return None
|
||||||
|
|
||||||
def processDocument(self):
|
def processDocument(self):
|
||||||
changed = False
|
changed = False
|
||||||
docVersion = None
|
docVersion = None
|
||||||
|
@ -112,11 +120,14 @@ class OfficeFile(ExternalFileAdapter):
|
||||||
strType = ('{http://schemas.openxmlformats.org/'
|
strType = ('{http://schemas.openxmlformats.org/'
|
||||||
'officeDocument/2006/docPropsVTypes}lpwstr')
|
'officeDocument/2006/docPropsVTypes}lpwstr')
|
||||||
attributes = {}
|
attributes = {}
|
||||||
dom = self.docPropertyDom
|
# get dc:description from core.xml
|
||||||
|
desc = self.getCoreProperty('description')
|
||||||
|
if desc is not None:
|
||||||
|
attributes['comments'] = desc
|
||||||
|
dom = self.docPropertyDom['custom']
|
||||||
for p in dom:
|
for p in dom:
|
||||||
name = p.attrib.get('name')
|
name = p.attrib.get('name')
|
||||||
value = p[0].text
|
value = p[0].text
|
||||||
#print '***', self.externalAddress, name, value, p[0].tag
|
|
||||||
attr = self.propertyMap.get(name)
|
attr = self.propertyMap.get(name)
|
||||||
if attr == 'version':
|
if attr == 'version':
|
||||||
docVersion = value
|
docVersion = value
|
||||||
|
|
BIN
integrator/testdata/office/example.docx
vendored
BIN
integrator/testdata/office/example.docx
vendored
Binary file not shown.
BIN
integrator/testdata/office/example.docx.sav
vendored
BIN
integrator/testdata/office/example.docx.sav
vendored
Binary file not shown.
|
@ -1,5 +1,5 @@
|
||||||
#
|
#
|
||||||
# Copyright (c) 2011 Helmut Merz helmutm@cy55.de
|
# Copyright (c) 2012 Helmut Merz helmutm@cy55.de
|
||||||
#
|
#
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License as published by
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
@ -18,8 +18,6 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Definition of the Concept class.
|
Definition of the Concept class.
|
||||||
|
|
||||||
$Id$
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
|
@ -352,7 +350,10 @@ class ExternalFileAdapter(FileAdapter):
|
||||||
implements(IExternalFile)
|
implements(IExternalFile)
|
||||||
|
|
||||||
_adapterAttributes = (FileAdapter._adapterAttributes
|
_adapterAttributes = (FileAdapter._adapterAttributes
|
||||||
+ ('storageParams', 'externalAddress', 'uniqueAddress'))
|
+ ('storageParams', 'externalAddress', 'uniqueAddress',
|
||||||
|
'processingErrors'))
|
||||||
|
|
||||||
|
processingErrors = []
|
||||||
|
|
||||||
def getStorageParams(self):
|
def getStorageParams(self):
|
||||||
params = getattr(self.context, '_storageParams', None)
|
params = getattr(self.context, '_storageParams', None)
|
||||||
|
|
Loading…
Add table
Reference in a new issue