# Source code for plaso.parsers.text_plugins.bitbucket_access

"""Text parser plugin for Atlassian Bitbucket access log files.

This is for the atlassian-bitbucket-access.log file, one of multiple log
files produced by a Bitbucket DC/Server installation.

The standard HTTP/SSH access log format is pipe-delimited:
  ip_address | protocol | request_identifier | user | timestamp | "request" |
  "referer" "user-agent" | status | bytes_read | bytes_written | labels |
  request_time | session_identifier |

The Mesh/gRPC access log format has two additional fields:
  ip_address | grpc | request_identifier | mesh_execution_identifier | user |
  timestamp | "action" | - | status | bytes_read | bytes_written | mesh_in |
  mesh_out | duration_ns | labels | session_identifier |

Also see:
  https://support.atlassian.com/bitbucket-data-center/kb/how-to-read-the-bitbucket-data-center-log-formats/
"""

import pyparsing

from dfdatetime import time_elements as dfdatetime_time_elements

from plaso.containers import events
from plaso.lib import errors
from plaso.parsers import text_parser
from plaso.parsers.text_plugins import interface


class BitbucketAccessEventData(events.EventData):
    """Event data for a single Atlassian Bitbucket access log entry.

    Attributes:
      http_request_method (str): HTTP request method (GET, POST, etc.), 'SSH'
          for SSH requests, or None for gRPC/Mesh action lines.
      http_request_uri (str): HTTP request URI, SSH command or gRPC action.
      http_request_user_agent (str): HTTP request user agent, or None for
          SSH/gRPC requests.
      http_response_bytes_read (int): number of bytes read from the client.
      http_response_bytes_written (int): number of bytes written to the
          client.
      http_response_code (int): HTTP response status code.
      http_version (str): HTTP request version, or None for SSH/gRPC requests.
      labels (str): request classification labels, such as push, fetch, clone,
          access-token:id:..., async, protocol:2 or refs.
      mesh_execution_identifier (str): Mesh execution identifier, only present
          for gRPC/Mesh log lines.
      protocol (str): protocol used, such as http, https, ssh or grpc.
      recorded_time (dfdatetime.DateTimeValues): date and time the log entry
          was recorded.
      remote_address (str): remote IP address(es), including X-Forwarded-For
          proxies. Multiple addresses are comma-separated.
      request_identifier (str): unique request identifier, correlatable with
          the audit log.
      request_time (int): time taken to process the request, in milliseconds
          (HTTP/SSH) or nanoseconds (gRPC/Mesh), or None if not available.
      session_identifier (str): session identifier.
      ssh_repository_path (str): SSH repository path for SSH requests, or
          None.
      user_name (str): name of the authenticated user.
    """

    DATA_TYPE = "atlassian:bitbucket:access"

    def __init__(self):
        """Initializes event data with every attribute unset."""
        super().__init__(data_type=self.DATA_TYPE)

        # Request line details.
        self.http_request_method = None
        self.http_request_uri = None
        self.http_request_user_agent = None

        # Response details.
        self.http_response_bytes_read = None
        self.http_response_bytes_written = None
        self.http_response_code = None
        self.http_version = None

        # Request classification and correlation.
        self.labels = None
        self.mesh_execution_identifier = None
        self.protocol = None
        self.recorded_time = None
        self.remote_address = None
        self.request_identifier = None
        self.request_time = None
        self.session_identifier = None
        self.ssh_repository_path = None
        self.user_name = None
class BitbucketAccessTextPlugin(interface.TextPlugin):
    """Text parser plugin for Atlassian Bitbucket access log files.

    Two line layouts are supported: the HTTP/SSH access log line and the
    gRPC/Mesh access log line. Both are pipe-delimited and end with a trailing
    pipe followed by the end of the line.
    """

    NAME = "bitbucket_access"
    DATA_FORMAT = "Atlassian Bitbucket access log (atlassian-bitbucket-access.log) file"

    ENCODING = "utf-8"

    _INTEGER = pyparsing.Word(pyparsing.nums).set_parse_action(
        lambda tokens: int(tokens[0], 10)
    )

    _TWO_DIGITS = pyparsing.Word(pyparsing.nums, exact=2).set_parse_action(
        lambda tokens: int(tokens[0], 10)
    )

    _THREE_DIGITS = pyparsing.Word(pyparsing.nums, exact=3).set_parse_action(
        lambda tokens: int(tokens[0], 10)
    )

    _FOUR_DIGITS = pyparsing.Word(pyparsing.nums, exact=4).set_parse_action(
        lambda tokens: int(tokens[0], 10)
    )

    # Pipe as field separator.
    _SEPARATOR = pyparsing.Suppress(pyparsing.Literal("|"))

    # Integer or dash (used for optional numeric fields).
    _INT_OR_DASH = pyparsing.Literal("-") | _INTEGER

    # Remote address: IPv4, IPv6, or comma-separated list of addresses.
    _REMOTE_ADDRESS = pyparsing.Combine(
        pyparsing.Word(pyparsing.alphanums + ".:,")
    ).set_results_name("remote_address")

    # Request identifier: alphanumeric token with @/* prefix and x separators,
    # or '-'.
    _REQUEST_IDENTIFIER = pyparsing.Word(
        pyparsing.alphanums + "@*-_x"
    ) | pyparsing.Literal("-")

    # User name: alphanumeric with common separators, or '-'.
    _USER_NAME = (
        pyparsing.Word(pyparsing.alphanums + "-_./") | pyparsing.Literal("-")
    ).set_results_name("user_name")

    # Session identifier: short alphanumeric token or '-'.
    _SESSION_IDENTIFIER = pyparsing.Word(
        pyparsing.alphanums + "-_"
    ) | pyparsing.Literal("-")

    # Date and time format: 2020-09-08 07:53:45,084
    _DATE_TIME = (
        _FOUR_DIGITS
        + pyparsing.Suppress("-")
        + _TWO_DIGITS
        + pyparsing.Suppress("-")
        + _TWO_DIGITS
        + _TWO_DIGITS
        + pyparsing.Suppress(":")
        + _TWO_DIGITS
        + pyparsing.Suppress(":")
        + _TWO_DIGITS
        + pyparsing.Suppress(",")
        + _THREE_DIGITS
    ).set_results_name("date_time")

    # HTTP request methods.
    _HTTP_METHODS = [
        "CONNECT",
        "DELETE",
        "GET",
        "HEAD",
        "OPTIONS",
        "PATCH",
        "POST",
        "PUT",
        "TRACE",
    ]

    # Characters accepted in a request URI and in a referer; shared so that
    # both expressions cannot drift apart.
    _URI_CHARACTERS = pyparsing.alphanums + "/-_.?=%&:+<>#~[]@!,()"

    # Request URI characters.
    _REQUEST_URI = pyparsing.Word(_URI_CHARACTERS)

    # HTTP request: "METHOD /uri HTTP/1.1"
    _HTTP_REQUEST = pyparsing.Group(
        pyparsing.Suppress('"')
        + pyparsing.one_of(_HTTP_METHODS).set_results_name("http_method")
        + _REQUEST_URI.set_results_name("request_url")
        + pyparsing.Word(pyparsing.alphanums + "/.").set_results_name("http_version")
        + pyparsing.Suppress('"')
    ).set_results_name("http_request")

    # SSH request: "SSH - git-upload-pack '/repo.git'"
    _SSH_REQUEST = pyparsing.Group(
        pyparsing.Suppress('"')
        + pyparsing.Literal("SSH").set_results_name("http_method")
        + pyparsing.Suppress(pyparsing.Literal("-"))
        + pyparsing.Word(pyparsing.alphanums + "-_").set_results_name("request_url")
        + pyparsing.QuotedString("'").set_results_name("ssh_repo")
        + pyparsing.Suppress('"')
    ).set_results_name("http_request")

    # gRPC action: "ServiceName/MethodName"
    _GRPC_REQUEST = pyparsing.Group(
        pyparsing.Suppress('"')
        + pyparsing.SkipTo('"').set_results_name("request_url")
        + pyparsing.Suppress('"')
    ).set_results_name("http_request")

    # Referer and user agent fields: "" "user-agent-string"
    _REFERER = (
        pyparsing.Suppress('"')
        + pyparsing.Optional(pyparsing.Word(_URI_CHARACTERS)).set_results_name(
            "referer"
        )
        + pyparsing.Suppress('"')
    )

    _USER_AGENT = (
        pyparsing.Suppress('"')
        + pyparsing.SkipTo('"').set_results_name("user_agent")
        + pyparsing.Suppress('"')
    )

    # Labels field: arbitrary text up to the next pipe separator.
    _LABELS = pyparsing.SkipTo(pyparsing.Literal("|")).set_results_name("labels")

    # HTTP/SSH access log line:
    # ip_address | protocol | request_identifier | user | timestamp | "request" |
    # "referer" "user-agent" | status | bytes_read | bytes_written | labels |
    # request_time | session_identifier |
    _HTTP_ACCESS_LOG_LINE = (
        _REMOTE_ADDRESS
        + _SEPARATOR
        + pyparsing.Word(pyparsing.alphanums).set_results_name("protocol")
        + _SEPARATOR
        + _REQUEST_IDENTIFIER.set_results_name("request_identifier")
        + _SEPARATOR
        + _USER_NAME
        + _SEPARATOR
        + _DATE_TIME
        + _SEPARATOR
        + (_HTTP_REQUEST | _SSH_REQUEST | _GRPC_REQUEST)
        + _SEPARATOR
        + _REFERER
        + _USER_AGENT
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("status_code")
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("bytes_read")
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("bytes_written")
        + _SEPARATOR
        + _LABELS
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("request_time")
        + _SEPARATOR
        + _SESSION_IDENTIFIER.set_results_name("session_identifier")
        + _SEPARATOR
        + pyparsing.Suppress(pyparsing.LineEnd())
    )

    # gRPC/Mesh access log line:
    # ip_address | grpc | request_identifier | mesh_execution_identifier | user |
    # timestamp | "action" | - | status | bytes_read | bytes_written | mesh_in |
    # mesh_out | duration_ns | session_identifier |
    #
    # NOTE(review): the module docstring lists an additional labels field
    # between duration_ns and session_identifier, which this grammar does not
    # parse - confirm the layout against real Mesh access log samples.
    _GRPC_ACCESS_LOG_LINE = (
        _REMOTE_ADDRESS
        + _SEPARATOR
        + pyparsing.Literal("grpc").set_results_name("protocol")
        + _SEPARATOR
        + _REQUEST_IDENTIFIER.set_results_name("request_identifier")
        + _SEPARATOR
        # The Mesh execution identifier uses the same token shape as the
        # request identifier.
        + _REQUEST_IDENTIFIER.set_results_name("mesh_execution_identifier")
        + _SEPARATOR
        + _USER_NAME
        + _SEPARATOR
        + _DATE_TIME
        + _SEPARATOR
        + _GRPC_REQUEST
        + _SEPARATOR
        + pyparsing.Suppress(pyparsing.Literal("-"))
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("status_code")
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("bytes_read")
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("bytes_written")
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("mesh_in")
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("mesh_out")
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("duration_ns")
        + _SEPARATOR
        + _SESSION_IDENTIFIER.set_results_name("session_identifier")
        + _SEPARATOR
        + pyparsing.Suppress(pyparsing.LineEnd())
    )

    # The gRPC structure is listed first: it is the more specific of the two
    # since its protocol field must be the literal "grpc".
    _LINE_STRUCTURES = [
        ("grpc_access_log", _GRPC_ACCESS_LOG_LINE),
        ("http_access_log", _HTTP_ACCESS_LOG_LINE),
    ]

    VERIFICATION_GRAMMAR = _GRPC_ACCESS_LOG_LINE | _HTTP_ACCESS_LOG_LINE

    VERIFICATION_LITERALS = [" | grpc | ", " | http | ", " | https | ", " | ssh | "]

    def _GetStrippedValue(self, structure, name, default_value=None):
        """Retrieves a token value from a Pyparsing structure and strips '' or '-'.

        Args:
          structure (pyparsing.ParseResults): tokens from a parsed log line.
          name (str): name of the token.
          default_value (Optional[object]): default value.

        Returns:
          object: value in the token or default value if the token is not
              available in the structure, is an empty string or is the '-'
              placeholder.
        """
        value = self._GetValueFromStructure(structure, name)
        # Note that an integer 0 is a legitimate value and is not stripped.
        if value in (None, "", "-"):
            return default_value
        return value

    def _ParseRecord(self, parser_mediator, key, structure):
        """Parses a pyparsing structure.

        Args:
          parser_mediator (ParserMediator): mediates interactions between
              parsers and other components, such as storage and dfVFS.
          key (str): name of the parsed structure.
          structure (pyparsing.ParseResults): tokens from a parsed log line.

        Raises:
          ParseError: if the structure cannot be parsed.
        """
        if key not in ("grpc_access_log", "http_access_log"):
            raise errors.ParseError(f"Unsupported structure: {key:s}")

        time_elements_structure = self._GetValueFromStructure(structure, "date_time")

        http_request = self._GetValueFromStructure(structure, "http_request")

        # The gRPC/Mesh grammar has no labels token, hence the empty default.
        labels = self._GetValueFromStructure(
            structure, "labels", default_value=""
        ).strip()

        # The processing time token is named differently per line layout:
        # "request_time" for HTTP/SSH lines and "duration_ns" for gRPC/Mesh
        # lines. Both are stored in the request_time attribute.
        if key == "grpc_access_log":
            request_time_name = "duration_ns"
        else:
            request_time_name = "request_time"

        event_data = BitbucketAccessEventData()
        event_data.http_response_bytes_read = self._GetStrippedValue(
            structure, "bytes_read"
        )
        event_data.http_response_bytes_written = self._GetStrippedValue(
            structure, "bytes_written"
        )
        event_data.http_response_code = self._GetStrippedValue(structure, "status_code")
        event_data.http_request_user_agent = (
            self._GetStringValueFromStructure(structure, "user_agent") or None
        )
        event_data.labels = None if not labels or labels == "-" else labels
        event_data.mesh_execution_identifier = self._GetStrippedValue(
            structure, "mesh_execution_identifier"
        )
        event_data.protocol = self._GetValueFromStructure(structure, "protocol")
        event_data.recorded_time = self._ParseTimeElements(time_elements_structure)
        event_data.remote_address = self._GetValueFromStructure(
            structure, "remote_address"
        )
        event_data.request_identifier = self._GetStrippedValue(
            structure, "request_identifier"
        )
        event_data.request_time = self._GetStrippedValue(structure, request_time_name)
        event_data.session_identifier = self._GetStrippedValue(
            structure, "session_identifier"
        )
        event_data.user_name = self._GetStrippedValue(structure, "user_name")

        if http_request:
            # Tokens absent from the matched request variant (for example
            # http_version on an SSH request) resolve to None.
            event_data.http_request_method = self._GetValueFromStructure(
                http_request, "http_method"
            )
            event_data.http_request_uri = self._GetValueFromStructure(
                http_request, "request_url"
            )
            event_data.http_version = (
                self._GetValueFromStructure(http_request, "http_version") or None
            )
            event_data.ssh_repository_path = (
                self._GetValueFromStructure(http_request, "ssh_repo") or None
            )

        parser_mediator.ProduceEventData(event_data)

    def _ParseTimeElements(self, time_elements_structure):
        """Parses date and time elements of a log line.

        Args:
          time_elements_structure (pyparsing.ParseResults): date and time
              elements.

        Returns:
          dfdatetime.TimeElements: date and time value.

        Raises:
          ParseError: if a valid date and time value cannot be derived.
        """
        try:
            date_time = dfdatetime_time_elements.TimeElementsInMilliseconds(
                time_elements_tuple=time_elements_structure
            )
        except (IndexError, TypeError, ValueError) as exception:
            # Chain the original exception so the root cause stays visible.
            raise errors.ParseError(
                f"Unable to parse time elements with error: {exception!s}"
            ) from exception

        # The timestamp in the log line carries no time zone information.
        date_time.is_local_time = True
        return date_time

    def CheckRequiredFormat(self, parser_mediator, text_reader):
        """Check if the log record has the minimal structure required by the plugin.

        Args:
          parser_mediator (ParserMediator): mediates interactions between
              parsers and other components, such as storage and dfVFS.
          text_reader (EncodedTextReader): text reader.

        Returns:
          bool: True if this is the correct plugin, False otherwise.
        """
        try:
            structure = self._VerifyString(text_reader.lines)
        except errors.ParseError:
            return False

        time_elements_structure = self._GetValueFromStructure(structure, "date_time")

        try:
            self._ParseTimeElements(time_elements_structure)
        except errors.ParseError:
            return False

        return True
# Register the plugin class with the text log parser when this module is
# imported.
text_parser.TextLogParser.RegisterPlugin(BitbucketAccessTextPlugin)