# -*- coding: utf-8 -*-
"""Compound ZIP parser plugin for OpenXML files."""
import re
import zipfile
from xml.parsers import expat
from defusedxml import ElementTree
from dfdatetime import time_elements as dfdatetime_time_elements
from plaso.containers import events
from plaso.parsers import czip
from plaso.parsers.czip_plugins import interface
[docs]
class OpenXMLEventData(events.EventData):
"""OXML event data.
Attributes:
application (str): name of application that created document.
application_version (str): version of application that created document.
author (str): name of author.
creation_time (dfdatetime.DateTimeValues): creation date and time of
the document.
digital_signature (str): digital signature.
edit_duration (int): total editing time.
hyperlinks_changed (bool): True if hyperlinks have changed.
last_printed_time (dfdatetime.DateTimeValues): date and time the document
was last printed.
last_saved_by (str): name of user that last saved the document.
links_up_to_date (bool): True if the links are up to date.
modification_time (dfdatetime.DateTimeValues): modification date and time
of the document.
number_of_characters (int): number of characters without spaces in
the document.
number_of_characters_with_spaces (int): number of characters including
spaces in the document.
number_of_clips (int): number of multi-media clips in the document.
number_of_hidden_slides (int): number of hidden slides in the document.
number_of_lines (int): number of lines in the document.
number_of_pages (int): number of pages in the document.
number_of_paragraphs (int): number of paragraphs in the document.
number_of_slides (int): number of slides in the document.
number_of_words (int): number of words in the document.
revision_number (int): revision number.
scale (bool): True if scaling of the thumbnail is desired or false if
cropping is desired.
security_flags (int): security flags.
shared_doc (bool): True if document is shared.
template (str): name of the template used to created the document.
"""
DATA_TYPE = 'openxml:metadata'
[docs]
def __init__(self):
"""Initializes event data."""
super(OpenXMLEventData, self).__init__(data_type=self.DATA_TYPE)
self.application = None
self.application_version = None
self.author = None
self.creation_time = None
self.digital_signature = None
self.edit_duration = None
self.hyperlinks_changed = None
self.last_printed_time = None
self.last_saved_by = None
self.links_up_to_date = None
self.modification_time = None
self.number_of_characters = None
self.number_of_characters_with_spaces = None
self.number_of_clips = None
self.number_of_hidden_slides = None
self.number_of_lines = None
self.number_of_pages = None
self.number_of_paragraphs = None
self.number_of_slides = None
self.number_of_words = None
self.revision_number = None
self.scale = None
self.security_flags = None
self.shared_doc = None
self.template = None
[docs]
class OpenXMLPlugin(interface.CompoundZIPPlugin):
"""Parse metadata from OXML files."""
NAME = 'oxml'
DATA_FORMAT = 'OpenXML (OXML) file'
REQUIRED_PATHS = frozenset([
'[Content_Types].xml', '_rels/.rels', 'docProps/core.xml'])
_PROPERTY_NAMES = {
'creator': 'author',
'lastModifiedBy': 'last_saved_by',
'Total_Time': 'total_edit_time',
'Pages': 'number_of_pages',
'CharactersWithSpaces': 'number_of_characters_with_spaces',
'Paragraphs': 'number_of_paragraphs',
'Characters': 'number_of_characters',
'Lines': 'number_of_lines',
'revision': 'revision_number',
'Words': 'number_of_words',
'Application': 'application',
'Shared_Doc': 'shared'}
def _GetPropertyValue(self, parser_mediator, properties, name):
"""Retrieves a property value.
Args:
parser_mediator (ParserMediator): mediates interactions between parsers
and other components, such as storage and dfVFS.
properties (dict[str, object]): properties.
name (str): name of the property.
Returns:
str: property value.
"""
property_value = properties.get(name, None)
if isinstance(property_value, bytes):
try:
# TODO: get encoding form XML metadata.
property_value = property_value.decode('utf-8')
except UnicodeDecodeError:
parser_mediator.ProduceExtractionWarning(
'unable to decode property: {0:s}'.format(name))
return property_value
def _FormatPropertyName(self, name):
"""Formats a camel case property name as snake case.
Args:
name (str): property name in camel case.
Returns:
str: property name in snake case.
"""
# TODO: Add Unicode support.
fix_key = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', name)
return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', fix_key).lower()
def _ParseBooleanValue(self, properties, name):
"""Parses a boolean property value.
Args:
properties (dict[str, object]): properties.
name (str): name of the property.
Returns:
bool: boolean value or None not available.
"""
string_value = properties.get(name, None)
if string_value:
if string_value == 'false':
return False
if string_value == 'true':
return True
return None
def _ParseIntegerValue(self, properties, name):
"""Parses an integer property value.
Args:
properties (dict[str, object]): properties.
name (str): name of the property.
Returns:
int: integer value or None not available.
"""
string_value = properties.get(name, None)
if string_value:
try:
return int(string_value, 10)
except (TypeError, ValueError):
pass
return None
def _ParseISO8601DateTimeString(self, parser_mediator, properties, name):
"""Parses an ISO8601 date and time string.
Args:
parser_mediator (ParserMediator): mediates interactions between parsers
and other components, such as storage and dfVFS.
properties (dict[str, object]): properties.
name (str): name of the property.
Returns:
dfdatetime.TimeElementsInMicroseconds: date and time value or None if
not available.
"""
iso8601_string = properties.get(name, None)
if not iso8601_string:
return None
# Date and time strings are in ISO8601 format either with 1 second
# or 100th nano second precision. For example:
# 2012-11-07T23:29:00Z
# 2012-03-05T20:40:00.0000000Z
date_time = dfdatetime_time_elements.TimeElementsInMicroseconds()
try:
date_time.CopyFromStringISO8601(iso8601_string)
except ValueError as exception:
parser_mediator.ProduceExtractionWarning((
'Unable to parse value: {0:s} ISO8601 string: {1:s} with error: '
'{2!s}').format(name, iso8601_string, exception))
return None
return date_time
def _ParsePropertiesXMLFile(self, xml_data):
"""Parses a properties XML file.
Args:
xml_data (bytes): data of a _rels/.rels XML file.
Returns:
dict[str, object]: properties.
Raises:
zipfile.BadZipfile: if the properties XML file cannot be read.
"""
xml_root = ElementTree.fromstring(xml_data)
properties = {}
for xml_element in xml_root.iter():
if not xml_element.text:
continue
# The property name is formatted as: {URL}name
# For example: {http://purl.org/dc/terms/}modified
_, _, name = xml_element.tag.partition('}')
# Do not including the 'lpstr' attribute because it is very verbose.
if name == 'lpstr':
continue
property_name = self._PROPERTY_NAMES.get(name, None)
if not property_name:
property_name = self._FormatPropertyName(name)
properties[property_name] = xml_element.text
return properties
def _ParseRelationshipsXMLFile(self, xml_data):
"""Parses the relationships XML file (_rels/.rels).
Args:
xml_data (bytes): data of a _rels/.rels XML file.
Returns:
list[str]: property file paths. The path is relative to the root of
the ZIP file.
Raises:
zipfile.BadZipfile: if the relationship XML file cannot be read.
"""
xml_root = ElementTree.fromstring(xml_data)
property_files = []
for xml_element in xml_root.iter():
type_attribute = xml_element.get('Type')
if 'properties' in repr(type_attribute):
target_attribute = xml_element.get('Target')
property_files.append(target_attribute)
return property_files
def _ParseZIPFile(self, parser_mediator, zip_file):
"""Parses an OXML file-like object.
Args:
parser_mediator (ParserMediator): mediates interactions between parsers
and other components, such as storage and dfVFS.
zip_file (zipfile.ZipFile): the zip file containing OXML content. It is
not be closed in this method, but will be closed by the parser logic
in czip.py.
"""
try:
xml_data = zip_file.read('_rels/.rels')
property_files = self._ParseRelationshipsXMLFile(xml_data)
except (IndexError, IOError, KeyError, LookupError, OverflowError,
ValueError, ElementTree.ParseError, expat.ExpatError,
zipfile.BadZipfile) as exception:
parser_mediator.ProduceExtractionWarning((
'Unable to parse relationships XML file: _rels/.rels with error: '
'{0!s}').format(exception))
return
metadata = {}
for path in property_files:
try:
xml_data = zip_file.read(path)
properties = self._ParsePropertiesXMLFile(xml_data)
except (IndexError, IOError, KeyError, LookupError, OverflowError,
ValueError, ElementTree.ParseError, expat.ExpatError,
zipfile.BadZipfile) as exception:
parser_mediator.ProduceExtractionWarning((
'Unable to parse properties XML file: {0:s} with error: '
'{1!s}').format(path, exception))
continue
metadata.update(properties)
event_data = OpenXMLEventData()
event_data.application = self._GetPropertyValue(
parser_mediator, metadata, 'application')
event_data.application_version = self._GetPropertyValue(
parser_mediator, metadata, 'app_version')
event_data.author = self._GetPropertyValue(
parser_mediator, metadata, 'author')
event_data.creation_time = self._ParseISO8601DateTimeString(
parser_mediator, metadata, 'created')
event_data.digital_signature = self._GetPropertyValue(
parser_mediator, metadata, 'dig_sig')
event_data.edit_duration = self._ParseIntegerValue(metadata, 'total_time')
event_data.hyperlinks_changed = self._ParseBooleanValue(
metadata, 'hyperlinks_changed')
# event_data.i4 = self._ParseIntegerValue(
# parser_mediator, metadata, 'i4')
event_data.last_printed_time = self._ParseISO8601DateTimeString(
parser_mediator, metadata, 'last_printed')
event_data.last_saved_by = self._GetPropertyValue(
parser_mediator, metadata, 'last_saved_by')
event_data.links_up_to_date = self._ParseBooleanValue(
metadata, 'links_up_to_date')
event_data.modification_time = self._ParseISO8601DateTimeString(
parser_mediator, metadata, 'modified')
event_data.number_of_characters = self._ParseIntegerValue(
metadata, 'number_of_characters')
event_data.number_of_characters_with_spaces = self._ParseIntegerValue(
metadata, 'number_of_characters_with_spaces')
event_data.number_of_clips = self._ParseIntegerValue(metadata, 'mm_clips')
event_data.number_of_hidden_slides = self._ParseIntegerValue(
metadata, 'hidden_slides')
event_data.number_of_lines = self._ParseIntegerValue(
metadata, 'number_of_lines')
event_data.number_of_pages = self._ParseIntegerValue(
metadata, 'number_of_pages')
event_data.number_of_paragraphs = self._ParseIntegerValue(
metadata, 'number_of_paragraphs')
event_data.number_of_slides = self._ParseIntegerValue(metadata, 'slides')
event_data.number_of_words = self._ParseIntegerValue(
metadata, 'number_of_words')
event_data.revision_number = self._ParseIntegerValue(
metadata, 'revision_number')
event_data.scale = self._ParseBooleanValue(metadata, 'scale_crop')
event_data.security_flags = self._ParseIntegerValue(
metadata, 'doc_security')
event_data.shared_doc = self._GetPropertyValue(
parser_mediator, metadata, 'shared_doc')
event_data.template = self._GetPropertyValue(
parser_mediator, metadata, 'template')
parser_mediator.ProduceEventData(event_data)
czip.CompoundZIPParser.RegisterPlugin(OpenXMLPlugin)