Source code for plaso.engine.worker

# -*- coding: utf-8 -*-
"""The event extraction worker."""

import copy
import os
import re
import time

import pysigscan

from dfvfs.analyzer import analyzer as dfvfs_analyzer
from dfvfs.helpers import volume_scanner as dfvfs_volume_scanner
from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.lib import errors as dfvfs_errors
from dfvfs.path import factory as path_spec_factory
from dfvfs.resolver import resolver as path_spec_resolver

from plaso.analyzers import hashing_analyzer
from plaso.analyzers import manager as analyzers_manager
from plaso.containers import event_sources
from plaso.containers import events
from plaso.engine import extractors
from plaso.engine import logger
from plaso.lib import definitions
from plaso.lib import errors


class EventExtractionWorkerVolumeScanner(dfvfs_volume_scanner.VolumeScanner):
  """Volume scanner used by the event extraction worker."""

  def GetBasePathSpecs(self, source_path_spec, options=None):  # pylint: disable=arguments-renamed
    """Determines the base path specifications.

    Args:
      source_path_spec (dfvfs.PathSpec): source path specification.
      options (Optional[VolumeScannerOptions]): volume scanner options. If None
          the default volume scanner options are used, which are defined in the
          VolumeScannerOptions class.

    Returns:
      list[PathSpec]: path specifications.

    Raises:
      dfvfs.ScannerError: if the source path does not exist, or if the source
          path is not a file or directory, or if the format of or within the
          source file is not supported.
    """
    if not options:
      options = dfvfs_volume_scanner.VolumeScannerOptions()
      options.partitions = ['all']
      options.scan_mode = options.SCAN_MODE_ALL
      options.snapshots = ['none']
      options.snapshots_only = False
      options.volumes = ['all']

    scan_context = self._ScanSourcePathSpec(source_path_spec)

    return self._GetBasePathSpecs(scan_context, options)
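

# Example (sketch): the scanner above is normally driven by
# _ProcessStorageMediaImageType further down. The helper below shows the same
# call pattern for a standalone raw storage media image; the OS/RAW path
# specification chain and the helper itself are illustrative assumptions and
# not part of the upstream module.
def _example_scan_storage_media_image(image_path):
  """Sketch: resolves the base path specifications inside an image file."""
  os_path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_OS, location=image_path)
  raw_path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_RAW, parent=os_path_spec)

  volume_scanner = EventExtractionWorkerVolumeScanner()
  # GetBasePathSpecs raises dfvfs.ScannerError when the format of or within
  # the source is not supported.
  return volume_scanner.GetBasePathSpecs(raw_path_spec)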


class EventExtractionWorker(object):
  """Event extraction worker.

  The event extraction worker determines which parsers are suitable for
  parsing a particular file entry or data stream. The parsers extract relevant
  data from file system and/or file content data. All extracted data is passed
  to the parser mediator for further processing.

  Attributes:
    last_activity_timestamp (int): timestamp received that indicates the last
        time activity was observed.
    processing_status (str): human readable status indication such as:
        'Extracting', 'Hashing'.
  """

  # NTFS metadata files that need special handling.
  _METADATA_FILE_LOCATIONS_NTFS = frozenset([
      '\\$AttrDef',
      '\\$BadClus',
      '\\$Bitmap',
      '\\$Boot',
      '\\$Extend\\$ObjId',
      '\\$Extend\\$Quota',
      '\\$Extend\\$Reparse',
      '\\$Extend\\$RmMetadata\\$Repair',
      '\\$Extend\\$RmMetadata\\$TxfLog\\$Tops',
      '\\$Extend\\$UsnJrnl',
      '\\$LogFile',
      '\\$MFT',
      '\\$MFTMirr',
      '\\$Secure',
      '\\$UpCase',
      '\\$Volume',
  ])

  # TSK metadata files that need special handling.
  _METADATA_FILE_LOCATIONS_TSK = frozenset([
      # NTFS
      '/$AttrDef',
      '/$BadClus',
      '/$Bitmap',
      '/$Boot',
      '/$Extend/$ObjId',
      '/$Extend/$Quota',
      '/$Extend/$Reparse',
      '/$Extend/$RmMetadata/$Repair',
      '/$Extend/$RmMetadata/$TxfLog/$Tops',
      '/$Extend/$UsnJrnl',
      '/$LogFile',
      '/$MFT',
      '/$MFTMirr',
      '/$Secure',
      '/$UpCase',
      '/$Volume',
      # HFS+/HFSX
      '/$ExtentsFile',
      '/$CatalogFile',
      '/$BadBlockFile',
      '/$AllocationFile',
      '/$AttributesFile',
  ])

  # TODO: make this filtering solution more generic. Also see:
  # https://github.com/log2timeline/plaso/issues/467
  _CHROME_CACHE_DATA_FILE_RE = re.compile(r'^[fF]_[0-9a-fA-F]{6}$')
  _FIREFOX_CACHE_DATA_FILE_RE = re.compile(r'^[0-9a-fA-F]{5}[dm][0-9]{2}$')
  _FIREFOX_CACHE2_DATA_FILE_RE = re.compile(r'^[0-9a-fA-F]{40}$')

  _TYPES_WITH_ROOT_METADATA = frozenset([
      dfvfs_definitions.TYPE_INDICATOR_GZIP])

  def __init__(self, force_parser=False, parser_filter_expression=None):
    """Initializes an event extraction worker.

    Args:
      force_parser (Optional[bool]): True if a specified parser should be
          forced to be used to extract events.
      parser_filter_expression (Optional[str]): parser filter expression,
          where None represents all parsers and plugins.

          A parser filter expression is a comma separated value string that
          denotes which parsers and plugins should be used. See
          filters/parser_filter.py for details of the expression syntax.

          This function does not support presets, and requires a parser filter
          expression where presets have been expanded.
    """
    super(EventExtractionWorker, self).__init__()
    self._abort = False
    self._analyzers = []
    self._analyzers_profiler = None
    self._achive_type_scanner = self._CreateArchiveTypeScanner([])
    self._archive_types = []
    self._event_data_extractor = extractors.EventDataExtractor(
        force_parser=force_parser,
        parser_filter_expression=parser_filter_expression)
    self._force_parser = force_parser
    self._hasher_file_size_limit = None
    self._path_spec_extractor = extractors.PathSpecExtractor()
    self._process_compressed_streams = None
    self._processing_profiler = None

    self.last_activity_timestamp = 0.0
    self.processing_status = definitions.STATUS_INDICATOR_IDLE
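
  # Example (sketch): an assumed parser filter expression limiting extraction
  # to the file entry metadata and MFT parsers, with presets already expanded:
  #
  #   worker = EventExtractionWorker(
  #       parser_filter_expression='filestat,mft')
  #
  # Setting force_parser=True makes _CanSkipContentExtraction always return
  # False, so content extraction is never skipped for the forced parser.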

  def _AnalyzeDataStream(
      self, file_entry, data_stream_name, display_name, event_data_stream):
    """Analyzes the contents of a specific data stream of a file entry.

    The results of the analyzers are set in the event data stream as
    attributes that are added to produced event objects. Note that some file
    systems allow directories to have data streams, such as NTFS.

    Args:
      file_entry (dfvfs.FileEntry): file entry whose data stream is to be
          analyzed.
      data_stream_name (str): name of the data stream.
      display_name (str): human readable representation of the file entry
          currently being analyzed.
      event_data_stream (EventDataStream): event data stream attribute
          container.

    Raises:
      RuntimeError: if the file-like object cannot be retrieved from the
          file entry.
    """
    logger.debug('[AnalyzeDataStream] analyzing file: {0:s}'.format(
        display_name))

    if self._processing_profiler:
      self._processing_profiler.StartTiming('analyzing')

    try:
      file_object = file_entry.GetFileObject(data_stream_name=data_stream_name)
      if not file_object:
        raise RuntimeError((
            'Unable to retrieve file-like object for file entry: '
            '{0:s}.').format(display_name))

      self._AnalyzeFileObject(file_object, display_name, event_data_stream)

    finally:
      if self._processing_profiler:
        self._processing_profiler.StopTiming('analyzing')

    logger.debug('[AnalyzeDataStream] completed analyzing file: {0:s}'.format(
        display_name))

  def _AnalyzeFileObject(self, file_object, display_name, event_data_stream):
    """Processes a file-like object with analyzers.

    Args:
      file_object (dfvfs.FileIO): file-like object to process.
      display_name (str): human readable representation of the file entry
          currently being analyzed.
      event_data_stream (EventDataStream): event data stream attribute
          container.
    """
    maximum_read_size = max(
        analyzer_object.SIZE_LIMIT for analyzer_object in self._analyzers)

    hashers_only = True
    for analyzer_object in self._analyzers:
      if not isinstance(analyzer_object, hashing_analyzer.HashingAnalyzer):
        hashers_only = False
        break

    file_size = file_object.get_size()

    if (hashers_only and self._hasher_file_size_limit and
        file_size > self._hasher_file_size_limit):
      return

    file_object.seek(0, os.SEEK_SET)

    data = file_object.read(maximum_read_size)
    while data:
      if self._abort:
        break

      for analyzer_object in self._analyzers:
        if self._abort:
          break

        if (not analyzer_object.INCREMENTAL_ANALYZER and
            file_size > analyzer_object.SIZE_LIMIT):
          continue

        if (isinstance(analyzer_object, hashing_analyzer.HashingAnalyzer) and
            self._hasher_file_size_limit and
            file_size > self._hasher_file_size_limit):
          continue

        self.processing_status = analyzer_object.PROCESSING_STATUS_HINT

        if self._analyzers_profiler:
          self._analyzers_profiler.StartTiming(analyzer_object.NAME)

        try:
          analyzer_object.Analyze(data)
        finally:
          if self._analyzers_profiler:
            self._analyzers_profiler.StopTiming(analyzer_object.NAME)

        self.last_activity_timestamp = time.time()

      data = file_object.read(maximum_read_size)

    for analyzer_object in self._analyzers:
      for result in analyzer_object.GetResults():
        logger.debug((
            '[AnalyzeFileObject] attribute {0:s}:{1!s} calculated for '
            'file: {2:s}.').format(
                result.attribute_name, result.attribute_value, display_name))

        setattr(event_data_stream, result.attribute_name,
                result.attribute_value)

      analyzer_object.Reset()

    self.processing_status = definitions.STATUS_INDICATOR_RUNNING

  def _CanSkipDataStream(self, file_entry, data_stream):
    """Determines if analysis and extraction of a data stream can be skipped.

    This is used to prevent Plaso trying to run analyzers or extract content
    from a pipe or socket it encounters while processing a mounted filesystem.

    Args:
      file_entry (dfvfs.FileEntry): file entry to consider for skipping.
      data_stream (dfvfs.DataStream): data stream to consider for skipping.

    Returns:
      bool: True if the data stream can be skipped.
    """
    if file_entry.IsFile():
      return False

    if data_stream.IsDefault():
      return True

    return False

  def _CanSkipContentExtraction(self, file_entry):
    """Determines if content extraction of a file entry can be skipped.

    Args:
      file_entry (dfvfs.FileEntry): file entry for which to determine if
          content extraction can be skipped.

    Returns:
      bool: True if content extraction can be skipped.
    """
    if self._force_parser:
      return False

    # TODO: make this filtering solution more generic. Also see:
    # https://github.com/log2timeline/plaso/issues/467
    location = getattr(file_entry.path_spec, 'location', None)
    if not location:
      return False

    data_stream_name = getattr(file_entry.path_spec, 'data_stream', None)
    if data_stream_name:
      return False

    file_system = file_entry.GetFileSystem()

    path_segments = file_system.SplitPath(location)
    if not path_segments:
      return False

    if self._CHROME_CACHE_DATA_FILE_RE.match(path_segments[-1]):
      location_segments = path_segments[:-1]
      location_segments.append('index')
      location = file_system.JoinPath(location_segments)
      index_path_spec = path_spec_factory.Factory.NewPathSpec(
          file_entry.type_indicator, location=location,
          parent=file_entry.path_spec.parent)
      if file_system.FileEntryExistsByPathSpec(index_path_spec):
        # TODO: improve this check if "index" is a Chrome Cache index file.
        return True

    elif self._FIREFOX_CACHE_DATA_FILE_RE.match(path_segments[-1]):
      location_segments = path_segments[:-4]
      location_segments.append('_CACHE_MAP_')
      location = file_system.JoinPath(location_segments)
      cache_map_path_spec = path_spec_factory.Factory.NewPathSpec(
          file_entry.type_indicator, location=location,
          parent=file_entry.path_spec.parent)
      if file_system.FileEntryExistsByPathSpec(cache_map_path_spec):
        # TODO: improve this check if "_CACHE_MAP_" is a Firefox Cache
        # version 1 cache map file.
        return True

    elif self._FIREFOX_CACHE2_DATA_FILE_RE.match(path_segments[-1]):
      location_segments = path_segments[:-2]
      location_segments.append('index')
      location = file_system.JoinPath(location_segments)
      index_path_spec = path_spec_factory.Factory.NewPathSpec(
          file_entry.type_indicator, location=location,
          parent=file_entry.path_spec.parent)
      if file_system.FileEntryExistsByPathSpec(index_path_spec):
        # TODO: improve this check if "index" is a Firefox Cache version 2
        # index file.
        return True

    elif len(path_segments) == 1 and path_segments[0].lower() in (
        'hiberfil.sys', 'pagefile.sys', 'swapfile.sys'):
      return True

    elif (len(path_segments) == 4 and
          path_segments[0].lower() == 'private' and
          path_segments[1].lower() == 'var' and
          path_segments[2].lower() == 'vm' and
          path_segments[3].lower() in (
              'sleepimage', 'swapfile0', 'swapfile1')):
      return True

    return False

  def _CreateArchiveTypeScanner(self, archive_types):
    """Creates a signature scanner for archive types.

    Args:
      archive_types (list[str]): archive types to scan for.

    Returns:
      pysigscan.scanner: signature scanner.
""" scanner_object = pysigscan.scanner() scanner_object.set_scan_buffer_size(65536) if 'iso9660' in archive_types: scanner_object.add_signature( 'iso9660', 32769, b'CD001', pysigscan.signature_flags.RELATIVE_FROM_START) if 'modi' in archive_types: scanner_object.add_signature( 'modi_sparseimage', 0, b'sprs', pysigscan.signature_flags.RELATIVE_FROM_START) scanner_object.add_signature( 'modi_udif', 512, b'koly', pysigscan.signature_flags.RELATIVE_FROM_END) if 'tar' in archive_types: scanner_object.add_signature( 'tar', 257, b'ustar\x00', pysigscan.signature_flags.RELATIVE_FROM_START) scanner_object.add_signature( 'tar_old', 257, b'ustar\x20\x20\x00', pysigscan.signature_flags.RELATIVE_FROM_START) if 'vhdi' in archive_types: scanner_object.add_signature( 'vhd', 512, b'conectix', pysigscan.signature_flags.RELATIVE_FROM_END) scanner_object.add_signature( 'vhdx', 0, b'vhdxfile', pysigscan.signature_flags.RELATIVE_FROM_START) # Note that ZIP is also a compound format. scanner_object.add_signature( 'zip', 0, b'PK\x03\x04', pysigscan.signature_flags.RELATIVE_FROM_START) return scanner_object def _ExtractContentFromDataStream( self, parser_mediator, file_entry, data_stream_name): """Extracts content from a data stream. Args: parser_mediator (ParserMediator): mediates interactions between parsers and other components, such as storage and dfVFS. file_entry (dfvfs.FileEntry): file entry to extract its content. data_stream_name (str): name of the data stream whose content is to be extracted. """ self.processing_status = definitions.STATUS_INDICATOR_EXTRACTING if self._processing_profiler: self._processing_profiler.StartTiming('extracting') self._event_data_extractor.ParseDataStream( parser_mediator, file_entry, data_stream_name) if self._processing_profiler: self._processing_profiler.StopTiming('extracting') self.processing_status = definitions.STATUS_INDICATOR_RUNNING self.last_activity_timestamp = time.time() def _ExtractMetadataFromFileEntry( self, parser_mediator, file_entry, data_stream): """Extracts metadata from a file entry. Args: parser_mediator (ParserMediator): mediates interactions between parsers and other components, such as storage and dfVFS. file_entry (dfvfs.FileEntry): file entry to extract metadata from. data_stream (dfvfs.DataStream): data stream or None if the file entry has no data stream. """ # Do not extract metadata from the root file entry when it is virtual. if file_entry.IsRoot() and file_entry.type_indicator not in ( self._TYPES_WITH_ROOT_METADATA): return # We always want to extract the file entry metadata but we only want # to parse it once per file entry, so we only use it if we are # processing the default data stream of regular files. if data_stream and not data_stream.IsDefault(): return display_name = parser_mediator.GetDisplayName() logger.debug( '[ExtractMetadataFromFileEntry] processing file entry: {0:s}'.format( display_name)) self.processing_status = definitions.STATUS_INDICATOR_EXTRACTING if self._processing_profiler: self._processing_profiler.StartTiming('extracting') self._event_data_extractor.ParseFileEntryMetadata( parser_mediator, file_entry) if self._processing_profiler: self._processing_profiler.StopTiming('extracting') self.processing_status = definitions.STATUS_INDICATOR_RUNNING def _GetCompressedStreamTypes(self, parser_mediator, path_spec): """Determines if a data stream contains a compressed stream such as: gzip. Args: parser_mediator (ParserMediator): mediates interactions between parsers and other components, such as storage and dfVFS. 
      path_spec (dfvfs.PathSpec): path specification of the data stream.

    Returns:
      list[str]: dfVFS compressed stream type indicators found in the data
          stream.
    """
    try:
      type_indicators = (
          dfvfs_analyzer.Analyzer.GetCompressedStreamTypeIndicators(
              path_spec, resolver_context=parser_mediator.resolver_context))
    except IOError as exception:
      type_indicators = []

      warning_message = (
          'analyzer failed to determine compressed stream type indicators '
          'with error: {0!s}').format(exception)
      parser_mediator.ProduceExtractionWarning(
          warning_message, path_spec=path_spec)

    return type_indicators

  def _GetStorageMediaImageTypes(self, parser_mediator, path_spec):
    """Determines if a data stream contains a storage media image such as: DMG.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfVFS.
      path_spec (dfvfs.PathSpec): path specification of the data stream.

    Returns:
      list[str]: dfVFS storage media image type indicators found in the data
          stream.
    """
    try:
      type_indicators = (
          dfvfs_analyzer.Analyzer.GetStorageMediaImageTypeIndicators(
              path_spec, resolver_context=parser_mediator.resolver_context))
    except IOError as exception:
      type_indicators = []

      warning_message = (
          'analyzer failed to determine storage media image type indicators '
          'with error: {0!s}').format(exception)
      parser_mediator.ProduceExtractionWarning(
          warning_message, path_spec=path_spec)

    return type_indicators

  def _IsMetadataFile(self, file_entry):
    """Determines if the file entry is a metadata file.

    Args:
      file_entry (dfvfs.FileEntry): a file entry object.

    Returns:
      bool: True if the file entry is a metadata file.
    """
    if (file_entry.type_indicator == dfvfs_definitions.TYPE_INDICATOR_NTFS and
        file_entry.path_spec.location in self._METADATA_FILE_LOCATIONS_NTFS):
      return True

    if (file_entry.type_indicator == dfvfs_definitions.TYPE_INDICATOR_TSK and
        file_entry.path_spec.location in self._METADATA_FILE_LOCATIONS_TSK):
      return True

    return False

  def _ProcessArchiveType(self, parser_mediator, path_spec, type_indicator):
    """Processes a data stream containing an archive type such as: TAR or ZIP.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfVFS.
      path_spec (dfvfs.PathSpec): path specification.
      type_indicator (str): dfVFS type indicator found in the data stream.
    """
    self.processing_status = definitions.STATUS_INDICATOR_COLLECTING

    archive_path_spec = path_spec_factory.Factory.NewPathSpec(
        type_indicator, location='/', parent=path_spec)

    try:
      path_spec_generator = self._path_spec_extractor.ExtractPathSpecs(
          archive_path_spec, resolver_context=parser_mediator.resolver_context)

      for generated_path_spec in path_spec_generator:
        if self._abort:
          break

        event_source = event_sources.FileEntryEventSource(
            file_entry_type=dfvfs_definitions.FILE_ENTRY_TYPE_FILE,
            path_spec=generated_path_spec)
        parser_mediator.ProduceEventSource(event_source)

        self.last_activity_timestamp = time.time()

    except (IOError, errors.MaximumRecursionDepth) as exception:
      warning_message = (
          'unable to process archive file with error: {0!s}').format(exception)
      parser_mediator.ProduceExtractionWarning(
          warning_message, path_spec=generated_path_spec)

  def _ProcessStorageMediaImageType(
      self, parser_mediator, path_spec, type_indicator):
    """Processes a data stream containing a storage media image type.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfVFS.
      path_spec (dfvfs.PathSpec): path specification.
      type_indicator (str): dfVFS type indicator found in the data stream.
    """
    self.processing_status = definitions.STATUS_INDICATOR_COLLECTING

    if type_indicator in (
        dfvfs_definitions.TYPE_INDICATOR_MODI,
        dfvfs_definitions.TYPE_INDICATOR_VHDI):
      storage_media_image_path_spec = path_spec_factory.Factory.NewPathSpec(
          type_indicator, parent=path_spec)
    else:
      storage_media_image_path_spec = path_spec_factory.Factory.NewPathSpec(
          type_indicator, location='/', parent=path_spec)

    volume_scanner = EventExtractionWorkerVolumeScanner()

    try:
      base_path_specs = volume_scanner.GetBasePathSpecs(
          storage_media_image_path_spec)

      for base_path_spec in base_path_specs:
        if self._abort:
          break

        event_source = event_sources.FileEntryEventSource(
            file_entry_type=dfvfs_definitions.FILE_ENTRY_TYPE_FILE,
            path_spec=base_path_spec)
        parser_mediator.ProduceEventSource(event_source)

        self.last_activity_timestamp = time.time()

    except (IOError, dfvfs_errors.ScannerError) as exception:
      warning_message = (
          'unable to process storage media image with error: {0!s}').format(
              exception)
      parser_mediator.ProduceExtractionWarning(
          warning_message, path_spec=storage_media_image_path_spec)

  def _ProcessCompressedStreamTypes(
      self, parser_mediator, path_spec, type_indicators):
    """Processes a data stream containing compressed stream types such as: bz2.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfVFS.
      path_spec (dfvfs.PathSpec): path specification.
      type_indicators (list[str]): dfVFS compressed stream type indicators
          found in the data stream.
    """
    number_of_type_indicators = len(type_indicators)
    if number_of_type_indicators == 0:
      return

    self.processing_status = definitions.STATUS_INDICATOR_COLLECTING

    if number_of_type_indicators > 1:
      display_name = parser_mediator.GetDisplayName()
      logger.debug((
          'Found multiple format type indicators: {0:s} for '
          'compressed stream file: {1:s}').format(
              ', '.join(type_indicators), display_name))

    for type_indicator in type_indicators:
      if type_indicator == dfvfs_definitions.TYPE_INDICATOR_BZIP2:
        compressed_stream_path_spec = path_spec_factory.Factory.NewPathSpec(
            dfvfs_definitions.TYPE_INDICATOR_COMPRESSED_STREAM,
            compression_method=dfvfs_definitions.COMPRESSION_METHOD_BZIP2,
            parent=path_spec)

      elif type_indicator == dfvfs_definitions.TYPE_INDICATOR_GZIP:
        compressed_stream_path_spec = path_spec_factory.Factory.NewPathSpec(
            dfvfs_definitions.TYPE_INDICATOR_GZIP, parent=path_spec)

      elif type_indicator == dfvfs_definitions.TYPE_INDICATOR_XZ:
        compressed_stream_path_spec = path_spec_factory.Factory.NewPathSpec(
            dfvfs_definitions.TYPE_INDICATOR_COMPRESSED_STREAM,
            compression_method=dfvfs_definitions.COMPRESSION_METHOD_XZ,
            parent=path_spec)

      else:
        compressed_stream_path_spec = None

        warning_message = (
            'unsupported compressed stream format type indicator: '
            '{0:s}').format(type_indicator)
        parser_mediator.ProduceExtractionWarning(
            warning_message, path_spec=path_spec)

      if compressed_stream_path_spec:
        event_source = event_sources.FileEntryEventSource(
            file_entry_type=dfvfs_definitions.FILE_ENTRY_TYPE_FILE,
            path_spec=compressed_stream_path_spec)
        parser_mediator.ProduceEventSource(event_source)

        self.last_activity_timestamp = time.time()

  def _ProcessDirectory(self, parser_mediator, file_entry):
    """Processes a directory file entry.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfVFS.
      file_entry (dfvfs.FileEntry): file entry of the directory.
""" self.processing_status = definitions.STATUS_INDICATOR_COLLECTING if self._processing_profiler: self._processing_profiler.StartTiming('collecting') for sub_file_entry in file_entry.sub_file_entries: if self._abort: break try: if not sub_file_entry.IsAllocated(): continue except dfvfs_errors.BackEndError as exception: warning_message = ( 'unable to process directory entry: {0:s} with error: ' '{1!s}').format(sub_file_entry.name, exception) parser_mediator.ProduceExtractionWarning( warning_message, path_spec=file_entry.path_spec) continue # For TSK-based file entries only, ignore the virtual /$OrphanFiles # directory. if sub_file_entry.type_indicator == dfvfs_definitions.TYPE_INDICATOR_TSK: if file_entry.IsRoot() and sub_file_entry.name == '$OrphanFiles': continue event_source = event_sources.FileEntryEventSource( file_entry_type=sub_file_entry.entry_type, path_spec=sub_file_entry.path_spec) parser_mediator.ProduceEventSource(event_source) self.last_activity_timestamp = time.time() if self._processing_profiler: self._processing_profiler.StopTiming('collecting') self.processing_status = definitions.STATUS_INDICATOR_RUNNING def _ProcessFileEntry(self, parser_mediator, file_entry): """Processes a file entry. Args: parser_mediator (ParserMediator): mediates interactions between parsers and other components, such as storage and dfVFS. file_entry (dfvfs.FileEntry): file entry. """ display_name = parser_mediator.GetDisplayName() logger.debug( '[ProcessFileEntry] processing file entry: {0:s}'.format(display_name)) if self._IsMetadataFile(file_entry): self._ProcessMetadataFile(parser_mediator, file_entry) else: file_entry_processed = False for data_stream in file_entry.data_streams: if self._abort: break if self._CanSkipDataStream(file_entry, data_stream): logger.debug(( '[ProcessFileEntry] Skipping datastream {0:s} for {1:s}: ' '{2:s}').format( data_stream.name, file_entry.type_indicator, display_name)) continue self._ProcessFileEntryDataStream( parser_mediator, file_entry, data_stream) file_entry_processed = True if not file_entry_processed: # For when the file entry does not contain a data stream. self._ProcessFileEntryDataStream(parser_mediator, file_entry, None) logger.debug( '[ProcessFileEntry] done processing file entry: {0:s}'.format( display_name)) def _ProcessFileEntryDataStream( self, parser_mediator, file_entry, data_stream): """Processes a specific data stream of a file entry. Args: parser_mediator (ParserMediator): mediates interactions between parsers and other components, such as storage and dfVFS. file_entry (dfvfs.FileEntry): file entry containing the data stream. data_stream (dfvfs.DataStream): data stream or None if the file entry has no data stream. """ display_name = parser_mediator.GetDisplayName() data_stream_name = getattr(data_stream, 'name', '') or '' logger.debug(( '[ProcessFileEntryDataStream] processing data stream: "{0:s}" of ' 'file entry: {1:s}').format(data_stream_name, display_name)) event_data_stream = None if data_stream: display_name = parser_mediator.GetDisplayName() path_spec = copy.deepcopy(file_entry.path_spec) if not data_stream.IsDefault(): path_spec.data_stream = data_stream.name event_data_stream = events.EventDataStream() event_data_stream.path_spec = path_spec if self._analyzers: # Since AnalyzeDataStream generates event data stream attributes it # needs to be called before producing events. 
        self._AnalyzeDataStream(
            file_entry, data_stream.name, display_name, event_data_stream)

      parser_mediator.ProduceEventDataStream(event_data_stream)

    self._ExtractMetadataFromFileEntry(
        parser_mediator, file_entry, data_stream)

    # Not every file entry has a data stream. In such cases we want to
    # extract the metadata only.
    if not data_stream:
      return

    # Determine if the content of the file entry should not be extracted.
    skip_content_extraction = self._CanSkipContentExtraction(file_entry)
    if skip_content_extraction:
      display_name = parser_mediator.GetDisplayName()
      logger.debug(
          'Skipping content extraction of: {0:s}'.format(display_name))
      self.processing_status = definitions.STATUS_INDICATOR_IDLE
      return

    # TODO: merge with previous deepcopy
    path_spec = copy.deepcopy(file_entry.path_spec)
    if data_stream and not data_stream.IsDefault():
      path_spec.data_stream = data_stream.name

    compressed_stream_types = []
    if self._process_compressed_streams:
      compressed_stream_types = self._GetCompressedStreamTypes(
          parser_mediator, path_spec)

    if compressed_stream_types:
      self._ProcessCompressedStreamTypes(
          parser_mediator, path_spec, compressed_stream_types)

    else:
      results = []

      try:
        file_object = file_entry.GetFileObject(
            data_stream_name=data_stream_name)
        if file_object:
          scan_state = pysigscan.scan_state()
          self._achive_type_scanner.scan_file_object(scan_state, file_object)

          results = [
              scan_result.identifier
              for scan_result in iter(scan_state.scan_results)]

      except IOError:
        pass

      if results == ['iso9660']:
        self._ProcessStorageMediaImageType(
            parser_mediator, path_spec, dfvfs_definitions.TYPE_INDICATOR_TSK)

      elif results in (['modi_sparseimage'], ['modi_udif']):
        self._ProcessStorageMediaImageType(
            parser_mediator, path_spec, dfvfs_definitions.TYPE_INDICATOR_MODI)

      elif results in (['tar'], ['tar_old']):
        self._ProcessArchiveType(
            parser_mediator, path_spec, dfvfs_definitions.TYPE_INDICATOR_TAR)

      elif results in (['vhd'], ['vhdx']):
        self._ProcessStorageMediaImageType(
            parser_mediator, path_spec, dfvfs_definitions.TYPE_INDICATOR_VHDI)

      elif results == ['zip']:
        if 'zip' in self._archive_types:
          self._ProcessArchiveType(
              parser_mediator, path_spec, dfvfs_definitions.TYPE_INDICATOR_ZIP)

        # Note that ZIP is also a compound format.
        self._ExtractContentFromDataStream(
            parser_mediator, file_entry, data_stream.name)

      else:
        if len(results) > 1:
          display_name = parser_mediator.GetDisplayName()
          logger.debug((
              'Found multiple format type indicators: {0!s} for archive '
              'file: {1:s}').format(results, display_name))

        self._ExtractContentFromDataStream(
            parser_mediator, file_entry, data_stream.name)

  def _ProcessMetadataFile(self, parser_mediator, file_entry):
    """Processes a metadata file.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfVFS.
      file_entry (dfvfs.FileEntry): file entry of the metadata file.
""" self.processing_status = definitions.STATUS_INDICATOR_EXTRACTING self._event_data_extractor.ParseFileEntryMetadata( parser_mediator, file_entry) for data_stream in file_entry.data_streams: if self._abort: break path_spec = copy.deepcopy(file_entry.path_spec) if not data_stream.IsDefault(): path_spec.data_stream = data_stream.name event_data_stream = events.EventDataStream() event_data_stream.path_spec = path_spec parser_mediator.ProduceEventDataStream(event_data_stream) self.last_activity_timestamp = time.time() self._event_data_extractor.ParseMetadataFile( parser_mediator, file_entry, data_stream.name) def _SetArchiveTypes(self, archive_types_string): """Sets the archive types. Args: archive_types_string (str): comma separated archive types for which embedded file entries should be processed. """ if not archive_types_string or archive_types_string == 'none': return self._archive_types = [ archive_type.lower() for archive_type in archive_types_string.split(',')] self._achive_type_scanner = self._CreateArchiveTypeScanner( self._archive_types) def _SetHashers(self, hasher_names_string): """Sets the hasher names. Args: hasher_names_string (str): comma separated names of the hashers to enable, where 'none' disables the hashing analyzer. """ if not hasher_names_string or hasher_names_string == 'none': return analyzer_object = analyzers_manager.AnalyzersManager.GetAnalyzerInstance( 'hashing') analyzer_object.SetHasherNames(hasher_names_string) self._analyzers.append(analyzer_object) def _SetYaraRules(self, yara_rules_string): """Sets the Yara rules. Args: yara_rules_string (str): unparsed Yara rule definitions. """ if not yara_rules_string: return analyzer_object = analyzers_manager.AnalyzersManager.GetAnalyzerInstance( 'yara') analyzer_object.SetRules(yara_rules_string) self._analyzers.append(analyzer_object)

  def GetAnalyzerNames(self):
    """Gets the names of the active analyzers.

    Returns:
      list[str]: names of active analyzers.
    """
    return [analyzer_instance.NAME for analyzer_instance in self._analyzers]

  def ProcessFileEntry(self, parser_mediator, file_entry):
    """Processes a file entry.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfVFS.
      file_entry (dfvfs.FileEntry): file entry.
    """
    self.last_activity_timestamp = time.time()
    self.processing_status = definitions.STATUS_INDICATOR_RUNNING

    parser_mediator.SetFileEntry(file_entry)

    try:
      if file_entry.IsDirectory():
        self._ProcessDirectory(parser_mediator, file_entry)

      self._ProcessFileEntry(parser_mediator, file_entry)

    finally:
      parser_mediator.ResetFileEntry()

      self.last_activity_timestamp = time.time()
      self.processing_status = definitions.STATUS_INDICATOR_IDLE

  def ProcessPathSpec(self, parser_mediator, path_spec):
    """Processes a path specification.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfVFS.
      path_spec (dfvfs.PathSpec): path specification.
    """
    file_entry = path_spec_resolver.Resolver.OpenFileEntry(
        path_spec, resolver_context=parser_mediator.resolver_context)

    if file_entry is None:
      display_name = parser_mediator.GetDisplayNameForPathSpec(path_spec)
      logger.warning('Unable to open file entry: {0:s}'.format(display_name))
      self.processing_status = definitions.STATUS_INDICATOR_IDLE
      return

    self.ProcessFileEntry(parser_mediator, file_entry)

  # TODO: move the functionality of this method into the constructor.
  def SetExtractionConfiguration(self, configuration):
    """Sets the extraction configuration settings.

    Args:
      configuration (ExtractionConfiguration): extraction configuration.
    """
    self._SetArchiveTypes(configuration.archive_types_string)
    self._hasher_file_size_limit = configuration.hasher_file_size_limit
    self._SetHashers(configuration.hasher_names_string)
    self._process_compressed_streams = configuration.process_compressed_streams
    self._SetYaraRules(configuration.yara_rules_string)
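
  # Example (sketch): any object exposing the five attributes read above will
  # do; in plaso this is typically an ExtractionConfiguration from
  # plaso.engine.configurations. The values below are illustrative only:
  #
  #   configuration = configurations.ExtractionConfiguration()
  #   configuration.archive_types_string = 'tar,zip'
  #   configuration.hasher_names_string = 'sha256'
  #   configuration.hasher_file_size_limit = 32 * 1024 * 1024
  #   configuration.process_compressed_streams = True
  #   worker.SetExtractionConfiguration(configuration)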

  def SetAnalyzersProfiler(self, analyzers_profiler):
    """Sets the analyzers profiler.

    Args:
      analyzers_profiler (AnalyzersProfiler): analyzers profiler.
    """
    self._analyzers_profiler = analyzers_profiler

  def SetProcessingProfiler(self, processing_profiler):
    """Sets the processing profiler.

    Args:
      processing_profiler (ProcessingProfiler): processing profiler.
    """
    self._processing_profiler = processing_profiler

  def SignalAbort(self):
    """Signals the extraction worker to abort."""
    self._abort = True
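

# Example (sketch): how a single path specification flows through the worker.
# Constructing the ParserMediator and the dfVFS path specification is handled
# elsewhere in the plaso engine, so both are taken as parameters here; the
# helper itself is illustrative and not part of the upstream module.
def _example_process_path_spec(parser_mediator, path_spec, configuration):
  """Sketch: extracts event data from a single path specification."""
  worker = EventExtractionWorker()
  worker.SetExtractionConfiguration(configuration)

  # ProcessPathSpec resolves the path specification via dfVFS and dispatches
  # to ProcessFileEntry; all results are produced through the parser mediator.
  worker.ProcessPathSpec(parser_mediator, path_spec)

  return worker.GetAnalyzerNames()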