aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjvoisin <julien.voisin@dustri.org>2020-03-08 12:17:56 +0100
committerjvoisin <julien.voisin@dustri.org>2020-03-08 14:06:27 +0100
commitd7a03d907baac73dd69c0ce77f3610e7d4bad5f4 (patch)
tree1f4ac10ff3355a162047c08c3ec79ed04f1ecbc2
parenta23dc001cd74866204c868ef1dbf7ef7ca99b5aa (diff)
downloadexternal_mat2-d7a03d907baac73dd69c0ce77f3610e7d4bad5f4.tar.gz
external_mat2-d7a03d907baac73dd69c0ce77f3610e7d4bad5f4.tar.bz2
external_mat2-d7a03d907baac73dd69c0ce77f3610e7d4bad5f4.zip
Vastly improve ppt compatibility
-rw-r--r--.gitlab-ci.yml2
-rw-r--r--README.md2
-rw-r--r--libmat2/archive.py9
-rw-r--r--libmat2/office.py86
-rw-r--r--tests/data/narrated_powerpoint_presentation.pptxbin0 -> 4383613 bytes
-rw-r--r--tests/test_libmat2.py10
6 files changed, 103 insertions, 6 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 4ad5c98..38cbe20 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -16,7 +16,7 @@ linting:bandit:
script: # TODO: remove B405 and B314
- bandit ./mat2 --format txt --skip B101
- bandit -r ./nautilus/ --format txt --skip B101
- - bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108
+ - bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108,B311
linting:codespell:
image: $CONTAINER_REGISTRY:linting
diff --git a/README.md b/README.md
index 5f902fe..c81daff 100644
--- a/README.md
+++ b/README.md
@@ -152,6 +152,8 @@ Copyright 2016 Marie-Rose for mat2's logo
The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3,
and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx
+The `narrated_powerpoint_presentation.pptx` file is in the public domain.
+
# Thanks
mat2 wouldn't exist without:
diff --git a/libmat2/archive.py b/libmat2/archive.py
index de80a35..f6db491 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -82,6 +82,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# pylint: disable=unused-argument,no-self-use
return {} # pragma: no cover
+ def _final_checks(self) -> bool:
+ """ This method is invoked after the file has been cleaned,
+ allowing to run final verifications.
+ """
+ # pylint: disable=unused-argument,no-self-use
+ return True
+
@staticmethod
@abc.abstractmethod
def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
@@ -223,6 +230,8 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
if abort:
os.remove(self.output_filename)
return False
+ if not self._final_checks():
+ return False # pragma: no cover
return True
diff --git a/libmat2/office.py b/libmat2/office.py
index 7f70b72..d122fc8 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -1,3 +1,4 @@
+import random
import uuid
import logging
import os
@@ -75,6 +76,14 @@ class MSOfficeParser(ZipParser):
def __init__(self, filename):
super().__init__(filename)
+ # MSOffice documents are using various counters for cross-references,
+ # we collect them all, to make sure that they're effectively counters,
+ # and not unique id used for fingerprinting.
+ self.__counters = {
+ 'cNvPr': set(),
+ 'rid': set(),
+ }
+
self.files_to_keep = set(map(re.compile, { # type: ignore
r'^\[Content_Types\]\.xml$',
r'^_rels/\.rels$',
@@ -84,8 +93,14 @@ class MSOfficeParser(ZipParser):
r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$',
r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$',
r'^(?:word|ppt)/tableStyles\.xml$',
+ r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$',
+ r'^ppt/slides/slide[0-9]*\.xml$',
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
r'^(?:word|ppt)/stylesWithEffects\.xml$',
+ r'^ppt/presentation\.xml$',
+ # TODO: check if p:bgRef can be randomized
+ r'^ppt/slideMasters/slideMaster[0-9]+\.xml',
+ r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels',
}))
self.files_to_omit = set(map(re.compile, { # type: ignore
r'^customXml/',
@@ -95,6 +110,7 @@ class MSOfficeParser(ZipParser):
r'^(?:word|ppt)/theme',
r'^(?:word|ppt)/people\.xml$',
r'^(?:word|ppt)/numbering\.xml$',
+ r'^(?:word|ppt)/tags/',
# View properties like view mode, last viewed slide etc
r'^(?:word|ppt)/viewProps\.xml$',
# Additional presentation-wide properties like printing properties,
@@ -146,7 +162,7 @@ class MSOfficeParser(ZipParser):
"""
try:
tree, namespace = _parse_xml(full_path)
- except ET.ParseError as e:
+ except ET.ParseError as e: # pragma: no cover
logging.error("Unable to parse %s: %s", full_path, e)
return False
@@ -206,7 +222,7 @@ class MSOfficeParser(ZipParser):
def __remove_revisions(full_path: str) -> bool:
try:
tree, namespace = _parse_xml(full_path)
- except ET.ParseError as e:
+ except ET.ParseError as e: # pragma: no cover
logging.error("Unable to parse %s: %s", full_path, e)
return False
@@ -272,14 +288,71 @@ class MSOfficeParser(ZipParser):
tree.write(full_path, xml_declaration=True)
return True
+ def _final_checks(self) -> bool:
+ for k, v in self.__counters.items():
+ if v and len(v) != max(v):
+ # TODO: make this an error and return False
+ # once the ability to correct the counters is implemented
+ logging.warning("%s contains invalid %s: %s", self.filename, k, v)
+ return True
+ return True
+
+ def __collect_counters(self, full_path: str):
+ with open(full_path, encoding='utf-8') as f:
+ content = f.read()
+ # "relationship Id"
+ for i in re.findall(r'(?:\s|r:)[iI][dD]="rId([0-9]+)"(?:\s|/)', content):
+ self.__counters['rid'].add(int(i))
+ # "connector for Non-visual property"
+ for i in re.findall(r'<p:cNvPr id="([0-9]+)"', content):
+ self.__counters['cNvPr'].add(int(i))
+
+
+ @staticmethod
+ def __randomize_creationId(full_path: str) -> bool:
+ try:
+ tree, namespace = _parse_xml(full_path)
+ except ET.ParseError as e: # pragma: no cover
+ logging.error("Unable to parse %s: %s", full_path, e)
+ return False
+
+ if 'p14' not in namespace.keys():
+ return True # pragma: no cover
+
+ for item in tree.iterfind('.//p14:creationId', namespace):
+ item.set('val', '%s' % random.randint(0, 2**32))
+ tree.write(full_path, xml_declaration=True)
+ return True
+
+ @staticmethod
+ def __randomize_sldMasterId(full_path: str) -> bool:
+ try:
+ tree, namespace = _parse_xml(full_path)
+ except ET.ParseError as e: # pragma: no cover
+ logging.error("Unable to parse %s: %s", full_path, e)
+ return False
+
+ if 'p' not in namespace.keys():
+ return True # pragma: no cover
+
+ for item in tree.iterfind('.//p:sldMasterId', namespace):
+ item.set('id', '%s' % random.randint(0, 2**32))
+ tree.write(full_path, xml_declaration=True)
+ return True
+
def _specific_cleanup(self, full_path: str) -> bool:
- # pylint: disable=too-many-return-statements
+ # pylint: disable=too-many-return-statements,too-many-branches
if os.stat(full_path).st_size == 0: # Don't process empty files
return True
if not full_path.endswith('.xml'):
return True
+ if self.__randomize_creationId(full_path) is False:
+ return False
+
+ self.__collect_counters(full_path)
+
if full_path.endswith('/[Content_Types].xml'):
# this file contains references to files that we might
# remove, and MS Office doesn't like dangling references
@@ -288,7 +361,7 @@ class MSOfficeParser(ZipParser):
elif full_path.endswith('/word/document.xml'):
# this file contains the revisions
if self.__remove_revisions(full_path) is False:
- return False
+ return False # pragma: no cover
elif full_path.endswith('/docProps/app.xml'):
# This file must be present and valid,
# so we're removing as much as we can.
@@ -310,9 +383,12 @@ class MSOfficeParser(ZipParser):
f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
uid = str(uuid.uuid4()).encode('utf-8')
f.write(b'<a:tblStyleLst def="{%s}" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>' % uid)
+ elif full_path.endswith('ppt/presentation.xml'):
+ if self.__randomize_sldMasterId(full_path) is False:
+ return False # pragma: no cover
if self.__remove_rsid(full_path) is False:
- return False
+ return False # pragma: no cover
if self.__remove_nsid(full_path) is False:
return False # pragma: no cover
diff --git a/tests/data/narrated_powerpoint_presentation.pptx b/tests/data/narrated_powerpoint_presentation.pptx
new file mode 100644
index 0000000..ef04132
--- /dev/null
+++ b/tests/data/narrated_powerpoint_presentation.pptx
Binary files differ
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 9e208ec..a6c3a9a 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -777,3 +777,13 @@ class TestNoSandbox(unittest.TestCase):
os.remove('./tests/data/clean.png')
os.remove('./tests/data/clean.cleaned.png')
os.remove('./tests/data/clean.cleaned.cleaned.png')
+
+class TestComplexOfficeFiles(unittest.TestCase):
+ def test_complex_pptx(self):
+ target = './tests/data/clean.pptx'
+ shutil.copy('./tests/data/narrated_powerpoint_presentation.pptx', target)
+ p = office.MSOfficeParser(target)
+ self.assertTrue(p.remove_all())
+
+ os.remove(target)
+ os.remove(p.output_filename)