Source code for plaso.parsers.opera

"""Parsers for Opera Browser history files."""

import os

from urllib import parse as urlparse

from defusedxml import ElementTree
from dfdatetime import posix_time as dfdatetime_posix_time
from dfdatetime import time_elements as dfdatetime_time_elements
from dfvfs.helpers import text_file

from plaso.containers import events
from plaso.lib import errors
from plaso.parsers import interface
from plaso.parsers import manager



[docs]
class OperaGlobalHistoryEventData(events.EventData):
  """Opera global history entry data.

  Attributes:
    description (str): description of the visit, such as 'First and Only
        Visit' or 'Last Visit'.
    last_visited_time (dfdatetime.DateTimeValues): date and time the URL was
        last visited.
    popularity_index (int): popularity index.
    title (str): title.
    url (str): URL.
  """

  DATA_TYPE = 'opera:history:entry'

  def __init__(self):
    """Initializes event data."""
    super().__init__(data_type=self.DATA_TYPE)
    # All values start unset and are filled in by the parser.
    self.description = None
    self.last_visited_time = None
    self.popularity_index = None
    self.title = None
    self.url = None





[docs]
class OperaTypedHistoryEventData(events.EventData):
  """Opera typed history entry data.

  Attributes:
    entry_selection (str): human-readable description of how the entry was
        filled in, such as 'Filled from autocomplete.' or 'Manually typed.'.
    entry_type (str): type of the entry as stored in the file, such as
        'selected' or 'text'.
    last_typed_time (dfdatetime.DateTimeValues): date and time the URL was
        last typed.
    url (str): typed URL or hostname.
  """

  DATA_TYPE = 'opera:history:typed_entry'

  def __init__(self):
    """Initializes event data."""
    super().__init__(data_type=self.DATA_TYPE)
    # All values start unset and are filled in by the parser.
    self.entry_selection = None
    self.entry_type = None
    self.last_typed_time = None
    self.url = None





[docs]
class OperaTypedHistoryParser(interface.FileObjectParser):
  """Parses the Opera typed_history.xml file."""

  NAME = 'opera_typed_history'
  DATA_FORMAT = 'Opera typed history (typed_history.xml) file'

  # Number of bytes read from the start of the file to check the signature.
  _HEADER_READ_SIZE = 128

  def ParseFileObject(self, parser_mediator, file_object):
    """Parses an Opera typed history file-like object.

    The start of the file is checked for an XML declaration followed, on the
    next line, by a typed_history root element. Every typed_history_item
    element with a usable last_typed attribute is turned into event data.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfVFS.
      file_object (dfvfs.FileIO): file-like object.

    Raises:
      WrongParser: when the file cannot be parsed.
    """
    data = file_object.read(self._HEADER_READ_SIZE)
    if not data.startswith(b'<?xml'):
      raise errors.WrongParser(
          'Not an Opera typed history file [not a XML]')

    # The root element is expected on the line after the XML declaration.
    _, _, data = data.partition(b'\n')
    if not data.startswith(b'<typed_history'):
      raise errors.WrongParser(
          'Not an Opera typed history file [wrong XML root key]')

    # For ElementTree to work we need to work on a file object seeked
    # to the beginning.
    file_object.seek(0, os.SEEK_SET)

    # A file can pass the header check and still contain malformed XML,
    # report that as the documented WrongParser instead of leaking the
    # XML library exception to the caller.
    try:
      xml = ElementTree.parse(file_object)
    except ElementTree.ParseError as exception:
      raise errors.WrongParser(
          f'Unable to parse Opera typed history XML with error: '
          f'{exception!s}') from exception

    for history_item in xml.iterfind('typed_history_item'):
      last_typed_time = history_item.get('last_typed')
      if last_typed_time is None:
        parser_mediator.ProduceExtractionWarning('missing last typed time.')
        continue

      date_time = dfdatetime_time_elements.TimeElements()

      try:
        date_time.CopyFromStringISO8601(last_typed_time)
      except ValueError as exception:
        parser_mediator.ProduceExtractionWarning(
            f'unsupported last typed time: {last_typed_time!s} '
            f'with error: {exception!s}.')
        continue

      event_data = OperaTypedHistoryEventData()
      event_data.entry_type = history_item.get('type')
      event_data.last_typed_time = date_time
      event_data.url = history_item.get('content')

      # Any other entry type leaves the selection description unset.
      if event_data.entry_type == 'selected':
        event_data.entry_selection = 'Filled from autocomplete.'
      elif event_data.entry_type == 'text':
        event_data.entry_selection = 'Manually typed.'

      parser_mediator.ProduceEventData(event_data)





[docs]
class OperaGlobalHistoryParser(interface.FileObjectParser):
  """Parses the Opera global_history.dat file.

  Each record consists of 4 consecutive text lines: title, URL, timestamp
  and popularity index.
  """

  NAME = 'opera_global'
  DATA_FORMAT = 'Opera global history (global_history.dat) file'

  _ENCODING = 'utf-8'

  # Maximum number of characters read per line while validating the first
  # record.
  _MAXIMUM_LINE_SIZE = 512

  _SUPPORTED_URL_SCHEMES = frozenset(['file', 'http', 'https', 'ftp'])

  def _GetDescription(self, popularity_index):
    """Determines the description of a visit.

    Args:
      popularity_index (int): popularity index or None if not available.

    Returns:
      str: 'First and Only Visit' for a negative popularity index,
          'Last Visit' otherwise, or None if the popularity index is not
          available.
    """
    # None cannot be ordered against an integer in Python 3, so an
    # unavailable popularity index is handled before the comparison.
    if popularity_index is None:
      return None

    if popularity_index < 0:
      return 'First and Only Visit'

    return 'Last Visit'

  def _IsValidUrl(self, url):
    """Checks if a URL is considered valid.

    Args:
      url (str): URL.

    Returns:
      bool: True if the URL is valid, meaning its scheme is one of the
          supported URL schemes.
    """
    parsed_url = urlparse.urlparse(url)
    return parsed_url.scheme in self._SUPPORTED_URL_SCHEMES

  def _ParseRecord(self, parser_mediator, text_file_object):
    """Parses an Opera global history record.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfVFS.
      text_file_object (dfvfs.TextFile): text file.

    Returns:
      bool: True if the record was successfully parsed, False if a line
          could not be decoded or no more data is available.
    """
    try:
      title = text_file_object.readline()
    except UnicodeDecodeError:
      parser_mediator.ProduceExtractionWarning(
          'unable to read and decode title')
      return False

    # An empty title line is treated as the end of the records.
    if not title:
      return False

    try:
      url = text_file_object.readline()
    except UnicodeDecodeError:
      parser_mediator.ProduceExtractionWarning(
          'unable to read and decode url')
      return False

    try:
      timestamp = text_file_object.readline()
    except UnicodeDecodeError:
      parser_mediator.ProduceExtractionWarning(
          'unable to read and decode timestamp')
      return False

    try:
      popularity_index = text_file_object.readline()
    except UnicodeDecodeError:
      parser_mediator.ProduceExtractionWarning(
          'unable to read and decode popularity index')
      return False

    title = title.strip()

    timestamp = timestamp.strip()
    try:
      timestamp = int(timestamp, 10)
    except ValueError:
      parser_mediator.ProduceExtractionWarning(
          f'unable to convert timestamp: {timestamp!s}')
      timestamp = None

    popularity_index = popularity_index.strip()
    try:
      popularity_index = int(popularity_index, 10)
    except ValueError:
      parser_mediator.ProduceExtractionWarning(
          f'unable to convert popularity index: {popularity_index!s}')
      popularity_index = None

    event_data = OperaGlobalHistoryEventData()
    event_data.popularity_index = popularity_index
    event_data.url = url.strip()

    # A timestamp that could not be converted (None) or that is 0 leaves
    # the last visited time unset.
    if timestamp:
      event_data.last_visited_time = dfdatetime_posix_time.PosixTime(
          timestamp=timestamp)

    # The title is only kept when it differs from the URL.
    if title != event_data.url:
      event_data.title = title

    # The description stays unset when the popularity index could not be
    # converted, the record itself is still produced.
    event_data.description = self._GetDescription(popularity_index)

    parser_mediator.ProduceEventData(event_data)

    return True

  def _ParseAndValidateRecord(self, parser_mediator, text_file_object):
    """Parses and validates an Opera global history record.

    In contrast to _ParseRecord this method does not produce extraction
    warnings, any value that cannot be read or converted makes the record
    invalid.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfVFS.
      text_file_object (dfvfs.TextFile): text file.

    Returns:
      bool: True if the record was successfully parsed.
    """
    try:
      title = text_file_object.readline(size=self._MAXIMUM_LINE_SIZE)
      url = text_file_object.readline(size=self._MAXIMUM_LINE_SIZE)
      timestamp = text_file_object.readline(size=self._MAXIMUM_LINE_SIZE)
      popularity_index = text_file_object.readline(size=self._MAXIMUM_LINE_SIZE)
    except UnicodeDecodeError:
      return False

    # A line of the maximum size that does not end with a newline exceeds
    # the supported line size.
    if len(title) == self._MAXIMUM_LINE_SIZE and title[-1] != '\n':
      return False

    if len(url) == self._MAXIMUM_LINE_SIZE and url[-1] != '\n':
      return False

    if len(timestamp) == self._MAXIMUM_LINE_SIZE and timestamp[-1] != '\n':
      return False

    if (len(popularity_index) == self._MAXIMUM_LINE_SIZE and
        popularity_index[-1] != '\n'):
      return False

    title = title.strip()
    url = url.strip()
    timestamp = timestamp.strip()
    popularity_index = popularity_index.strip()

    if not title or not url or not timestamp or not popularity_index:
      return False

    if not self._IsValidUrl(url):
      return False

    try:
      timestamp = int(timestamp, 10)
    except (TypeError, ValueError):
      return False

    try:
      popularity_index = int(popularity_index, 10)
    except (TypeError, ValueError):
      return False

    event_data = OperaGlobalHistoryEventData()
    event_data.last_visited_time = dfdatetime_posix_time.PosixTime(
        timestamp=timestamp)
    event_data.popularity_index = popularity_index
    event_data.url = url

    # The title is only kept when it differs from the URL.
    if title != url:
      event_data.title = title

    event_data.description = self._GetDescription(popularity_index)

    parser_mediator.ProduceEventData(event_data)

    return True

  def ParseFileObject(self, parser_mediator, file_object):
    """Parses an Opera global history file-like object.

    The first record is strictly validated to determine if the file is an
    Opera global history file, the remaining records are parsed leniently
    until no more records can be read.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfVFS.
      file_object (dfvfs.FileIO): file-like object.

    Raises:
      WrongParser: when the file cannot be parsed.
    """
    # Fall back to the code page of the parser mediator when the parser
    # does not define an encoding.
    encoding = self._ENCODING
    if not encoding:
      encoding = parser_mediator.GetCodePage()

    text_file_object = text_file.TextFile(file_object, encoding=encoding)
    if not self._ParseAndValidateRecord(parser_mediator, text_file_object):
      raise errors.WrongParser('Unable to parse as Opera global_history.dat.')

    while self._ParseRecord(parser_mediator, text_file_object):
      pass




# Register both Opera history parsers with the parsers manager.
manager.ParsersManager.RegisterParsers([
    OperaTypedHistoryParser,
    OperaGlobalHistoryParser,
])