Source code for plaso.parsers.czip_plugins.oxml

"""Compound ZIP parser plugin for OpenXML files."""

import re
import zipfile

from xml.parsers import expat

from defusedxml import ElementTree

from dfdatetime import time_elements as dfdatetime_time_elements

from plaso.containers import events
from plaso.parsers import czip
from plaso.parsers.czip_plugins import interface


class OpenXMLEventData(events.EventData):
    """Event data holding the metadata of an OpenXML (OXML) document.

    Attributes:
      application (str): name of application that created document.
      application_version (str): version of application that created document.
      author (str): name of author.
      creation_time (dfdatetime.DateTimeValues): creation date and time of
          the document.
      digital_signature (str): digital signature.
      edit_duration (int): total editing time.
      hyperlinks_changed (bool): True if hyperlinks have changed.
      last_printed_time (dfdatetime.DateTimeValues): date and time
          the document was last printed.
      last_saved_by (str): name of user that last saved the document.
      links_up_to_date (bool): True if the links are up to date.
      modification_time (dfdatetime.DateTimeValues): modification date and
          time of the document.
      number_of_characters (int): number of characters without spaces in
          the document.
      number_of_characters_with_spaces (int): number of characters including
          spaces in the document.
      number_of_clips (int): number of multi-media clips in the document.
      number_of_hidden_slides (int): number of hidden slides in the document.
      number_of_lines (int): number of lines in the document.
      number_of_pages (int): number of pages in the document.
      number_of_paragraphs (int): number of paragraphs in the document.
      number_of_slides (int): number of slides in the document.
      number_of_words (int): number of words in the document.
      revision_number (int): revision number.
      scale (bool): True if scaling of the thumbnail is desired or false if
          cropping is desired.
      security_flags (int): security flags.
      shared_doc (bool): True if document is shared.
      template (str): name of the template used to create the document.
    """

    DATA_TYPE = "openxml:metadata"

    def __init__(self):
        """Initializes event data with every attribute unset (None)."""
        super().__init__(data_type=self.DATA_TYPE)

        # The names are listed, and therefore assigned, in the same order as
        # they are documented in the class docstring.
        for attribute_name in (
            "application",
            "application_version",
            "author",
            "creation_time",
            "digital_signature",
            "edit_duration",
            "hyperlinks_changed",
            "last_printed_time",
            "last_saved_by",
            "links_up_to_date",
            "modification_time",
            "number_of_characters",
            "number_of_characters_with_spaces",
            "number_of_clips",
            "number_of_hidden_slides",
            "number_of_lines",
            "number_of_pages",
            "number_of_paragraphs",
            "number_of_slides",
            "number_of_words",
            "revision_number",
            "scale",
            "security_flags",
            "shared_doc",
            "template",
        ):
            setattr(self, attribute_name, None)
class OpenXMLPlugin(interface.CompoundZIPPlugin):
    """Parse metadata from OXML files."""

    NAME = "oxml"
    DATA_FORMAT = "OpenXML (OXML) file"

    REQUIRED_PATHS = frozenset(
        ["[Content_Types].xml", "_rels/.rels", "docProps/core.xml"]
    )

    # Maps XML element names onto the property names used by _ParseZIPFile.
    # Element names without an entry are converted with _FormatPropertyName.
    # NOTE(review): "Total_Time" and "Shared_Doc" contain an underscore while
    # _ParseZIPFile reads "total_time" and "shared_doc", which is what
    # _FormatPropertyName yields for "TotalTime" and "SharedDoc". These two
    # entries therefore look unreachable for such element names - confirm
    # before removing them.
    _PROPERTY_NAMES = {
        "creator": "author",
        "lastModifiedBy": "last_saved_by",
        "Total_Time": "total_edit_time",
        "Pages": "number_of_pages",
        "CharactersWithSpaces": "number_of_characters_with_spaces",
        "Paragraphs": "number_of_paragraphs",
        "Characters": "number_of_characters",
        "Lines": "number_of_lines",
        "revision": "revision_number",
        "Words": "number_of_words",
        "Application": "application",
        "Shared_Doc": "shared",
    }

    # Exceptions that reading a ZIP member and parsing it as XML are expected
    # to raise; they are reported as extraction warnings by _ParseZIPFile.
    _XML_FILE_ERRORS = (
        IndexError,
        KeyError,
        LookupError,
        OSError,
        OverflowError,
        ValueError,
        ElementTree.ParseError,
        expat.ExpatError,
        zipfile.BadZipfile,
    )

    def _GetPropertyValue(self, parser_mediator, properties, name):
        """Retrieves a property value.

        Byte string values are decoded as UTF-8. If decoding fails an
        extraction warning is produced and the undecoded value is returned.

        Args:
          parser_mediator (ParserMediator): mediates interactions between
              parsers and other components, such as storage and dfVFS.
          properties (dict[str, object]): properties.
          name (str): name of the property.

        Returns:
          str: property value or None if the property is not set.
        """
        property_value = properties.get(name)
        if isinstance(property_value, bytes):
            try:
                # TODO: get encoding from XML metadata.
                property_value = property_value.decode("utf-8")
            except UnicodeDecodeError:
                parser_mediator.ProduceExtractionWarning(
                    f"unable to decode property: {name:s}"
                )

        return property_value

    def _FormatPropertyName(self, name):
        """Formats a camel case property name as snake case.

        Args:
          name (str): property name in camel case.

        Returns:
          str: property name in snake case.
        """
        # TODO: Add Unicode support.
        # First pass: split before a capitalized word ("AppVersion" ->
        # "App_Version"). Second pass: split a lower case letter or digit
        # from a following capital.
        fix_key = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", name)
        return re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", fix_key).lower()

    def _ParseBooleanValue(self, properties, name):
        """Parses a boolean property value.

        Args:
          properties (dict[str, object]): properties.
          name (str): name of the property.

        Returns:
          bool: boolean value or None if not available or if the value is
              neither "true" nor "false".
        """
        string_value = properties.get(name)
        if string_value:
            if string_value == "false":
                return False
            if string_value == "true":
                return True

        return None

    def _ParseIntegerValue(self, properties, name):
        """Parses an integer property value.

        Args:
          properties (dict[str, object]): properties.
          name (str): name of the property.

        Returns:
          int: integer value or None if not available or if the value is not
              a base 10 integer.
        """
        string_value = properties.get(name)
        if string_value:
            try:
                return int(string_value, 10)
            except (TypeError, ValueError):
                pass

        return None

    def _ParseISO8601DateTimeString(self, parser_mediator, properties, name):
        """Parses an ISO8601 date and time string.

        Args:
          parser_mediator (ParserMediator): mediates interactions between
              parsers and other components, such as storage and dfVFS.
          properties (dict[str, object]): properties.
          name (str): name of the property.

        Returns:
          dfdatetime.TimeElementsInMicroseconds: date and time value or None
              if not available or if the string could not be parsed, in which
              case an extraction warning is produced.
        """
        iso8601_string = properties.get(name)
        if not iso8601_string:
            return None

        # Date and time strings are in ISO8601 format either with 1 second
        # or 100th nano second precision. For example:
        # 2012-11-07T23:29:00Z
        # 2012-03-05T20:40:00.0000000Z
        date_time = dfdatetime_time_elements.TimeElementsInMicroseconds()

        try:
            date_time.CopyFromStringISO8601(iso8601_string)
        except ValueError as exception:
            parser_mediator.ProduceExtractionWarning(
                (
                    f"Unable to parse value: {name:s} ISO8601 string: "
                    f"{iso8601_string:s} with error: {exception!s}"
                )
            )
            return None

        return date_time

    def _ParsePropertiesXMLFile(self, xml_data):
        """Parses a properties XML file.

        Errors raised by ElementTree.fromstring are not handled here and
        propagate to the caller.

        Args:
          xml_data (bytes): data of a properties XML file.

        Returns:
          dict[str, object]: properties, where the key is the property name
              and the value the text of the corresponding XML element.
        """
        xml_root = ElementTree.fromstring(xml_data)

        properties = {}
        for xml_element in xml_root.iter():
            if not xml_element.text:
                continue

            # The property name is formatted as: {URL}name
            # For example: {http://purl.org/dc/terms/}modified
            _, _, name = xml_element.tag.partition("}")

            # Do not include the 'lpstr' attribute because it is very verbose.
            if name == "lpstr":
                continue

            property_name = self._PROPERTY_NAMES.get(name)
            if not property_name:
                property_name = self._FormatPropertyName(name)

            properties[property_name] = xml_element.text

        return properties

    def _ParseRelationshipsXMLFile(self, xml_data):
        """Parses the relationships XML file (_rels/.rels).

        Errors raised by ElementTree.fromstring are not handled here and
        propagate to the caller.

        Args:
          xml_data (bytes): data of a _rels/.rels XML file.

        Returns:
          list[str]: property file paths. The path is relative to the root of
              the ZIP file.
        """
        xml_root = ElementTree.fromstring(xml_data)

        property_files = []
        for xml_element in xml_root.iter():
            type_attribute = xml_element.get("Type")
            if not type_attribute or "properties" not in type_attribute:
                continue

            # A relationship without a Target cannot name a ZIP member. Skip
            # it so that callers only ever receive usable string paths; a None
            # path would otherwise raise a TypeError when it is formatted into
            # the extraction warning in _ParseZIPFile.
            target_attribute = xml_element.get("Target")
            if target_attribute:
                property_files.append(target_attribute)

        return property_files

    def _ParseZIPFile(self, parser_mediator, zip_file):
        """Parses an OXML file-like object.

        Args:
          parser_mediator (ParserMediator): mediates interactions between
              parsers and other components, such as storage and dfVFS.
          zip_file (zipfile.ZipFile): the zip file containing OXML content. It
              is not closed in this method, but will be closed by the parser
              logic in czip.py.
        """
        try:
            xml_data = zip_file.read("_rels/.rels")
            property_files = self._ParseRelationshipsXMLFile(xml_data)
        except self._XML_FILE_ERRORS as exception:
            parser_mediator.ProduceExtractionWarning(
                (
                    f"Unable to parse relationships XML file: _rels/.rels with error: "
                    f"{exception!s}"
                )
            )
            return

        # Properties of all property files are merged; a later file overrides
        # a same-named property of an earlier one.
        metadata = {}
        for path in property_files:
            try:
                xml_data = zip_file.read(path)
                properties = self._ParsePropertiesXMLFile(xml_data)
            except self._XML_FILE_ERRORS as exception:
                parser_mediator.ProduceExtractionWarning(
                    (
                        f"Unable to parse properties XML file: {path:s} with error: "
                        f"{exception!s}"
                    )
                )
                continue

            metadata.update(properties)

        event_data = OpenXMLEventData()
        event_data.application = self._GetPropertyValue(
            parser_mediator, metadata, "application"
        )
        event_data.application_version = self._GetPropertyValue(
            parser_mediator, metadata, "app_version"
        )
        event_data.author = self._GetPropertyValue(parser_mediator, metadata, "author")
        event_data.creation_time = self._ParseISO8601DateTimeString(
            parser_mediator, metadata, "created"
        )
        event_data.digital_signature = self._GetPropertyValue(
            parser_mediator, metadata, "dig_sig"
        )
        event_data.edit_duration = self._ParseIntegerValue(metadata, "total_time")
        event_data.hyperlinks_changed = self._ParseBooleanValue(
            metadata, "hyperlinks_changed"
        )
        event_data.last_printed_time = self._ParseISO8601DateTimeString(
            parser_mediator, metadata, "last_printed"
        )
        event_data.last_saved_by = self._GetPropertyValue(
            parser_mediator, metadata, "last_saved_by"
        )
        event_data.links_up_to_date = self._ParseBooleanValue(
            metadata, "links_up_to_date"
        )
        event_data.modification_time = self._ParseISO8601DateTimeString(
            parser_mediator, metadata, "modified"
        )
        event_data.number_of_characters = self._ParseIntegerValue(
            metadata, "number_of_characters"
        )
        event_data.number_of_characters_with_spaces = self._ParseIntegerValue(
            metadata, "number_of_characters_with_spaces"
        )
        event_data.number_of_clips = self._ParseIntegerValue(metadata, "mm_clips")
        event_data.number_of_hidden_slides = self._ParseIntegerValue(
            metadata, "hidden_slides"
        )
        event_data.number_of_lines = self._ParseIntegerValue(
            metadata, "number_of_lines"
        )
        event_data.number_of_pages = self._ParseIntegerValue(
            metadata, "number_of_pages"
        )
        event_data.number_of_paragraphs = self._ParseIntegerValue(
            metadata, "number_of_paragraphs"
        )
        event_data.number_of_slides = self._ParseIntegerValue(metadata, "slides")
        event_data.number_of_words = self._ParseIntegerValue(
            metadata, "number_of_words"
        )
        event_data.revision_number = self._ParseIntegerValue(
            metadata, "revision_number"
        )
        event_data.scale = self._ParseBooleanValue(metadata, "scale_crop")
        event_data.security_flags = self._ParseIntegerValue(metadata, "doc_security")
        # shared_doc is a boolean attribute of the event data, so it is parsed
        # the same way as the other boolean properties instead of being stored
        # as the raw "true" or "false" string.
        event_data.shared_doc = self._ParseBooleanValue(metadata, "shared_doc")
        event_data.template = self._GetPropertyValue(
            parser_mediator, metadata, "template"
        )

        parser_mediator.ProduceEventData(event_data)
# Register the OXML plugin with the compound ZIP parser at import time.
czip.CompoundZIPParser.RegisterPlugin(OpenXMLPlugin)