"""Text parser plugin for Atlassian Bitbucket access log files.
This is for the atlassian-bitbucket-access.log file, one of multiple log
files produced by a Bitbucket DC/Server installation.
The standard HTTP/SSH access log format is pipe-delimited:
ip_address | protocol | request_identifier | user | timestamp | "request" |
"referer" "user-agent" | status | bytes_read | bytes_written | labels |
request_time | session_identifier |
The Mesh/gRPC access log format has two additional fields:
ip_address | grpc | request_identifier | mesh_execution_identifier | user |
timestamp | "action" | - | status | bytes_read | bytes_written | mesh_in |
mesh_out | duration_ns | labels | session_identifier |
Also see:
https://support.atlassian.com/bitbucket-data-center/kb/how-to-read-the-bitbucket-data-center-log-formats/
"""
import pyparsing
from dfdatetime import time_elements as dfdatetime_time_elements
from plaso.containers import events
from plaso.lib import errors
from plaso.parsers import text_parser
from plaso.parsers.text_plugins import interface
[docs]
class BitbucketAccessEventData(events.EventData):
  """Event data of a single Atlassian Bitbucket access log entry.

  Attributes:
    http_request_method (str): request method: an HTTP method such as GET or
        POST, 'SSH' for SSH requests, or None for gRPC/Mesh action lines.
    http_request_uri (str): requested resource: the HTTP request URI, the SSH
        command or the gRPC action.
    http_request_user_agent (str): user agent of the HTTP request, or None
        for SSH/gRPC requests.
    http_response_bytes_read (int): number of bytes read from the client.
    http_response_bytes_written (int): number of bytes written to the client.
    http_response_code (int): status code of the HTTP response.
    http_version (str): version of the HTTP request, or None for SSH/gRPC
        requests.
    labels (str): labels classifying the request, for example push, fetch,
        clone, access-token:id:..., async, protocol:2 or refs.
    mesh_execution_identifier (str): identifier of the Mesh execution, which
        is only present for gRPC/Mesh log lines.
    protocol (str): protocol of the request, such as http, https, ssh or grpc.
    recorded_time (dfdatetime.DateTimeValues): date and time at which the log
        entry was recorded.
    remote_address (str): remote IP address or addresses, including
        X-Forwarded-For proxies. Multiple addresses are comma-separated.
    request_identifier (str): unique identifier of the request, which can be
        correlated with the audit log.
    request_time (int): time it took to process the request, in milliseconds
        for HTTP/SSH or in nanoseconds for gRPC/Mesh, or None if not
        available.
    session_identifier (str): identifier of the session.
    ssh_repository_path (str): repository path of an SSH request, or None.
    user_name (str): name of the authenticated user.
  """

  DATA_TYPE = 'atlassian:bitbucket:access'

  def __init__(self):
    """Initializes the event data with every attribute set to None."""
    super().__init__(data_type=self.DATA_TYPE)
    self.http_request_method = None
    self.http_request_uri = None
    self.http_request_user_agent = None
    self.http_response_bytes_read = None
    self.http_response_bytes_written = None
    self.http_response_code = None
    self.http_version = None
    self.labels = None
    self.mesh_execution_identifier = None
    self.protocol = None
    self.recorded_time = None
    self.remote_address = None
    self.request_identifier = None
    self.request_time = None
    self.session_identifier = None
    self.ssh_repository_path = None
    self.user_name = None
[docs]
class BitbucketAccessTextPlugin(interface.TextPlugin):
  """Text parser plugin for Atlassian Bitbucket access log files.

  Two pipe-delimited line structures are recognized:
  * 'grpc_access_log' for gRPC/Mesh lines, whose protocol field is the
    literal 'grpc' and which carry a Mesh execution identifier;
  * 'http_access_log' for HTTP and SSH lines.

  Every matched line is turned into one BitbucketAccessEventData.
  """

  NAME = 'bitbucket_access'
  DATA_FORMAT = (
      'Atlassian Bitbucket access log (atlassian-bitbucket-access.log) file')

  ENCODING = 'utf-8'

  # Numeric tokens are converted to int at parse time.
  _INTEGER = pyparsing.Word(pyparsing.nums).set_parse_action(
      lambda tokens: int(tokens[0], 10))

  _TWO_DIGITS = pyparsing.Word(pyparsing.nums, exact=2).set_parse_action(
      lambda tokens: int(tokens[0], 10))

  _THREE_DIGITS = pyparsing.Word(pyparsing.nums, exact=3).set_parse_action(
      lambda tokens: int(tokens[0], 10))

  _FOUR_DIGITS = pyparsing.Word(pyparsing.nums, exact=4).set_parse_action(
      lambda tokens: int(tokens[0], 10))

  # Pipe as field separator.
  _SEPARATOR = pyparsing.Suppress(pyparsing.Literal('|'))

  # Integer or dash (used for optional numeric fields).
  _INT_OR_DASH = pyparsing.Literal('-') | _INTEGER

  # Remote address: IPv4, IPv6, or comma-separated list of addresses.
  _REMOTE_ADDRESS = pyparsing.Combine(
      pyparsing.Word(pyparsing.alphanums + '.:,')).set_results_name(
          'remote_address')

  # Request identifier: alphanumeric token with @/* prefix and x separators,
  # or '-'.
  _REQUEST_IDENTIFIER = (
      pyparsing.Word(pyparsing.alphanums + '@*-_x') |
      pyparsing.Literal('-'))

  # User name: alphanumeric with common separators, or '-'.
  _USER_NAME = (
      pyparsing.Word(pyparsing.alphanums + '-_./') |
      pyparsing.Literal('-')).set_results_name('user_name')

  # Session identifier: short alphanumeric token or '-'.
  _SESSION_IDENTIFIER = (
      pyparsing.Word(pyparsing.alphanums + '-_') |
      pyparsing.Literal('-'))

  # Date and time format: 2020-09-08 07:53:45,084
  # Yields 7 integers: year, month, day of month, hours, minutes, seconds
  # and milliseconds.
  _DATE_TIME = (
      _FOUR_DIGITS + pyparsing.Suppress('-') +
      _TWO_DIGITS + pyparsing.Suppress('-') +
      _TWO_DIGITS + _TWO_DIGITS +
      pyparsing.Suppress(':') + _TWO_DIGITS +
      pyparsing.Suppress(':') + _TWO_DIGITS +
      pyparsing.Suppress(',') + _THREE_DIGITS).set_results_name('date_time')

  # HTTP request methods.
  _HTTP_METHODS = [
      'CONNECT', 'DELETE', 'GET', 'HEAD', 'OPTIONS', 'PATCH', 'POST', 'PUT',
      'TRACE']

  # Request URI characters.
  _REQUEST_URI = pyparsing.Word(pyparsing.alphanums + '/-_.?=%&:+<>#~[]@!,()')

  # HTTP request: "METHOD /uri HTTP/1.1"
  _HTTP_REQUEST = pyparsing.Group(
      pyparsing.Suppress('"') +
      pyparsing.one_of(_HTTP_METHODS).set_results_name('http_method') +
      _REQUEST_URI.set_results_name('request_url') +
      pyparsing.Word(pyparsing.alphanums + '/.').set_results_name(
          'http_version') +
      pyparsing.Suppress('"')).set_results_name('http_request')

  # SSH request: "SSH - git-upload-pack '/repo.git'"
  _SSH_REQUEST = pyparsing.Group(
      pyparsing.Suppress('"') +
      pyparsing.Literal('SSH').set_results_name('http_method') +
      pyparsing.Suppress(pyparsing.Literal('-')) +
      pyparsing.Word(pyparsing.alphanums + '-_').set_results_name(
          'request_url') +
      pyparsing.QuotedString("'").set_results_name('ssh_repo') +
      pyparsing.Suppress('"')).set_results_name('http_request')

  # gRPC action: "ServiceName/MethodName"
  # Everything up to the closing double quote is taken as the request URL.
  _GRPC_REQUEST = pyparsing.Group(
      pyparsing.Suppress('"') +
      pyparsing.SkipTo('"').set_results_name('request_url') +
      pyparsing.Suppress('"')).set_results_name('http_request')

  # Referer and user agent fields: "" "user-agent-string"
  _REFERER = (
      pyparsing.Suppress('"') +
      pyparsing.Optional(pyparsing.Word(
          pyparsing.alphanums + '/-_.?=%&:+<>#~[]@!,()')).set_results_name(
              'referer') +
      pyparsing.Suppress('"'))

  _USER_AGENT = (
      pyparsing.Suppress('"') +
      pyparsing.SkipTo('"').set_results_name('user_agent') +
      pyparsing.Suppress('"'))

  # Labels field: arbitrary text up to the next pipe separator.
  _LABELS = pyparsing.SkipTo(pyparsing.Literal('|')).set_results_name('labels')

  # HTTP/SSH access log line:
  # ip_address | protocol | request_identifier | user | timestamp | "request" |
  # "referer" "user-agent" | status | bytes_read | bytes_written | labels |
  # request_time | session_identifier |
  _HTTP_ACCESS_LOG_LINE = (
      _REMOTE_ADDRESS + _SEPARATOR +
      pyparsing.Word(pyparsing.alphanums).set_results_name('protocol') +
      _SEPARATOR +
      _REQUEST_IDENTIFIER.set_results_name('request_identifier') + _SEPARATOR +
      _USER_NAME + _SEPARATOR +
      _DATE_TIME + _SEPARATOR +
      (_HTTP_REQUEST | _SSH_REQUEST | _GRPC_REQUEST) +
      _SEPARATOR +
      _REFERER + _USER_AGENT + _SEPARATOR +
      _INT_OR_DASH.set_results_name('status_code') + _SEPARATOR +
      _INT_OR_DASH.set_results_name('bytes_read') + _SEPARATOR +
      _INT_OR_DASH.set_results_name('bytes_written') + _SEPARATOR +
      _LABELS + _SEPARATOR +
      _INT_OR_DASH.set_results_name('request_time') + _SEPARATOR +
      _SESSION_IDENTIFIER.set_results_name('session_identifier') + _SEPARATOR +
      pyparsing.Suppress(pyparsing.LineEnd()))

  # gRPC/Mesh access log line:
  # ip_address | grpc | request_identifier | mesh_execution_identifier | user |
  # timestamp | "action" | - | status | bytes_read | bytes_written | mesh_in |
  # mesh_out | duration_ns | session_identifier |
  #
  # NOTE(review): the module docstring additionally lists a labels field
  # between duration_ns and session_identifier, which this grammar does not
  # parse. Confirm the actual layout against sample Mesh log lines.
  _GRPC_ACCESS_LOG_LINE = (
      _REMOTE_ADDRESS + _SEPARATOR +
      pyparsing.Literal('grpc').set_results_name('protocol') + _SEPARATOR +
      _REQUEST_IDENTIFIER.set_results_name('request_identifier') + _SEPARATOR +
      (pyparsing.Word(pyparsing.alphanums + '@*-_x') |
       pyparsing.Literal('-')).set_results_name('mesh_execution_identifier') +
      _SEPARATOR +
      _USER_NAME + _SEPARATOR +
      _DATE_TIME + _SEPARATOR +
      _GRPC_REQUEST + _SEPARATOR +
      pyparsing.Suppress(pyparsing.Literal('-')) + _SEPARATOR +
      _INT_OR_DASH.set_results_name('status_code') + _SEPARATOR +
      _INT_OR_DASH.set_results_name('bytes_read') + _SEPARATOR +
      _INT_OR_DASH.set_results_name('bytes_written') + _SEPARATOR +
      _INT_OR_DASH.set_results_name('mesh_in') + _SEPARATOR +
      _INT_OR_DASH.set_results_name('mesh_out') + _SEPARATOR +
      _INT_OR_DASH.set_results_name('duration_ns') + _SEPARATOR +
      _SESSION_IDENTIFIER.set_results_name('session_identifier') + _SEPARATOR +
      pyparsing.Suppress(pyparsing.LineEnd()))

  # The gRPC/Mesh structure is listed before the HTTP/SSH structure.
  _LINE_STRUCTURES = [
      ('grpc_access_log', _GRPC_ACCESS_LOG_LINE),
      ('http_access_log', _HTTP_ACCESS_LOG_LINE)]

  VERIFICATION_GRAMMAR = _GRPC_ACCESS_LOG_LINE | _HTTP_ACCESS_LOG_LINE

  VERIFICATION_LITERALS = [
      ' | grpc | ', ' | http | ', ' | https | ', ' | ssh | ']

  def _GetStrippedValue(self, structure, name, default_value=None):
    """Retrieves a token value from a Pyparsing structure and strips '' or '-'.

    Args:
      structure (pyparsing.ParseResults): tokens from a parsed log line.
      name (str): name of the token.
      default_value (Optional[object]): default value.

    Returns:
      object: value in the token or default value if the token is not
          available in the structure, is an empty string or is a dash.
    """
    value = self._GetValueFromStructure(structure, name)
    # An integer 0 does not compare equal to None, '' or '-' and is therefore
    # returned as a regular value.
    if value in (None, '', '-'):
      return default_value

    return value

  def _ParseRecord(self, parser_mediator, key, structure):
    """Parses a pyparsing structure.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfVFS.
      key (str): name of the parsed structure.
      structure (pyparsing.ParseResults): tokens from a parsed log line.

    Raises:
      ParseError: if the structure cannot be parsed.
    """
    if key not in ('grpc_access_log', 'http_access_log'):
      raise errors.ParseError(f'Unsupported structure: {key:s}')

    time_elements_structure = self._GetValueFromStructure(
        structure, 'date_time')

    http_request = self._GetValueFromStructure(structure, 'http_request')

    # gRPC/Mesh lines have no labels token, hence the empty string default.
    labels = self._GetValueFromStructure(
        structure, 'labels', default_value='').strip()

    # The processing time token is named 'duration_ns' in the gRPC/Mesh
    # grammar and 'request_time' in the HTTP/SSH grammar. Reading only
    # 'request_time' would leave the value unset for every gRPC/Mesh line.
    if key == 'grpc_access_log':
      request_time_name = 'duration_ns'
    else:
      request_time_name = 'request_time'

    event_data = BitbucketAccessEventData()
    event_data.http_response_bytes_read = self._GetStrippedValue(
        structure, 'bytes_read')
    event_data.http_response_bytes_written = self._GetStrippedValue(
        structure, 'bytes_written')
    event_data.http_response_code = self._GetStrippedValue(
        structure, 'status_code')
    event_data.http_request_user_agent = self._GetStringValueFromStructure(
        structure, 'user_agent') or None
    event_data.labels = None if not labels or labels == '-' else labels
    event_data.mesh_execution_identifier = self._GetStrippedValue(
        structure, 'mesh_execution_identifier')
    event_data.protocol = self._GetValueFromStructure(structure, 'protocol')
    event_data.recorded_time = self._ParseTimeElements(time_elements_structure)
    event_data.remote_address = self._GetValueFromStructure(
        structure, 'remote_address')
    event_data.request_identifier = self._GetStrippedValue(
        structure, 'request_identifier')
    event_data.request_time = self._GetStrippedValue(
        structure, request_time_name)
    event_data.session_identifier = self._GetStrippedValue(
        structure, 'session_identifier')
    event_data.user_name = self._GetStrippedValue(structure, 'user_name')

    if http_request:
      # The request group only has an 'http_method' token for HTTP and SSH
      # requests, an 'http_version' token for HTTP requests and an 'ssh_repo'
      # token for SSH requests.
      event_data.http_request_method = self._GetValueFromStructure(
          http_request, 'http_method')
      event_data.http_request_uri = self._GetValueFromStructure(
          http_request, 'request_url')
      event_data.http_version = self._GetValueFromStructure(
          http_request, 'http_version') or None
      event_data.ssh_repository_path = self._GetValueFromStructure(
          http_request, 'ssh_repo') or None

    parser_mediator.ProduceEventData(event_data)

  def _ParseTimeElements(self, time_elements_structure):
    """Parses date and time elements of a log line.

    Args:
      time_elements_structure (pyparsing.ParseResults): date and time elements,
          as produced by the date and time grammar: year, month, day of month,
          hours, minutes, seconds and milliseconds.

    Returns:
      dfdatetime.TimeElements: date and time value.

    Raises:
      ParseError: if a valid date and time value cannot be derived.
    """
    try:
      date_time = dfdatetime_time_elements.TimeElementsInMilliseconds(
          time_elements_tuple=time_elements_structure)

      # The date and time grammar has no time zone token, so the value is
      # flagged as local time.
      date_time.is_local_time = True

      return date_time

    except (IndexError, TypeError, ValueError) as exception:
      # Chain the original exception so its traceback is preserved.
      raise errors.ParseError(
          f'Unable to parse time elements with error: {exception!s}'
      ) from exception

  def CheckRequiredFormat(self, parser_mediator, text_reader):
    """Check if the log record has the minimal structure required by the plugin.

    The text must match the verification grammar and the date and time of the
    matched line must be a valid date and time value.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfVFS.
      text_reader (EncodedTextReader): text reader.

    Returns:
      bool: True if this is the correct plugin, False otherwise.
    """
    try:
      structure = self._VerifyString(text_reader.lines)
    except errors.ParseError:
      return False

    time_elements_structure = self._GetValueFromStructure(
        structure, 'date_time')

    try:
      self._ParseTimeElements(time_elements_structure)
    except errors.ParseError:
      return False

    return True
# Register the plugin class with the text log parser.
text_parser.TextLogParser.RegisterPlugin(BitbucketAccessTextPlugin)