"""Text parser plugin for Atlassian Bitbucket access log files.

This is for the atlassian-bitbucket-access.log file, one of multiple log
files produced by a Bitbucket DC/Server installation.

The standard HTTP/SSH access log format is pipe-delimited:

  ip_address | protocol | request_identifier | user | timestamp | "request" |
  "referer" "user-agent" | status | bytes_read | bytes_written | labels |
  request_time | session_identifier |

The Mesh/gRPC access log format has two additional fields:

  ip_address | grpc | request_identifier | mesh_execution_identifier | user |
  timestamp | "action" | - | status | bytes_read | bytes_written | mesh_in |
  mesh_out | duration_ns | labels | session_identifier |

Also see:
  https://support.atlassian.com/bitbucket-data-center/kb/how-to-read-the-bitbucket-data-center-log-formats/
"""
import pyparsing
from dfdatetime import time_elements as dfdatetime_time_elements
from plaso.containers import events
from plaso.lib import errors
from plaso.parsers import text_parser
from plaso.parsers.text_plugins import interface
class BitbucketAccessEventData(events.EventData):
    """Bitbucket access log event data.

    Attributes:
      http_request_method (str): HTTP request method (GET, POST, etc.), or
          'SSH' for SSH requests, or None for gRPC/Mesh action lines.
      http_request_uri (str): HTTP request URI, SSH command, or gRPC action.
      http_request_user_agent (str): HTTP request user agent, or None for
          SSH/gRPC requests.
      http_response_bytes_read (int): number of bytes read from the client.
      http_response_bytes_written (int): number of bytes written to the
          client.
      http_response_code (int): HTTP response status code.
      http_version (str): HTTP request version, or None for SSH/gRPC requests.
      labels (str): request classification labels, such as push, fetch, clone,
          access-token:id:..., async, protocol:2 or refs.
      mesh_execution_identifier (str): Mesh execution identifier, only present
          for gRPC/Mesh log lines.
      protocol (str): protocol used, such as http, https, ssh or grpc.
      recorded_time (dfdatetime.DateTimeValues): date and time the log entry
          was recorded.
      remote_address (str): remote IP address(es), including X-Forwarded-For
          proxies. Multiple addresses are comma-separated.
      request_identifier (str): unique request identifier, correlatable with
          audit log.
      request_time (int): time taken to process the request in milliseconds
          (HTTP/SSH) or nanoseconds (gRPC/Mesh), or None if not available.
      session_identifier (str): session identifier.
      ssh_repository_path (str): SSH repository path for SSH requests, or
          None.
      user_name (str): the name of the authenticated user.
    """

    DATA_TYPE = "atlassian:bitbucket:access"

    def __init__(self):
        """Initializes event data."""
        super().__init__(data_type=self.DATA_TYPE)
        # Every attribute starts out as None; the parser plugin fills in the
        # values that are present on a given log line.
        self.http_request_method = None
        self.http_request_uri = None
        self.http_request_user_agent = None
        self.http_response_bytes_read = None
        self.http_response_bytes_written = None
        self.http_response_code = None
        self.http_version = None
        self.labels = None
        self.mesh_execution_identifier = None
        self.protocol = None
        self.recorded_time = None
        self.remote_address = None
        self.request_identifier = None
        self.request_time = None
        self.session_identifier = None
        self.ssh_repository_path = None
        self.user_name = None
class BitbucketAccessTextPlugin(interface.TextPlugin):
    """Text parser plugin for Atlassian Bitbucket access log files.

    Two pipe-delimited line structures are recognized: the HTTP/SSH access
    log line and the gRPC/Mesh access log line. Each matched line produces
    one BitbucketAccessEventData.
    """

    NAME = "bitbucket_access"
    DATA_FORMAT = "Atlassian Bitbucket access log (atlassian-bitbucket-access.log) file"

    ENCODING = "utf-8"

    _INTEGER = pyparsing.Word(pyparsing.nums).set_parse_action(
        lambda tokens: int(tokens[0], 10)
    )

    _TWO_DIGITS = pyparsing.Word(pyparsing.nums, exact=2).set_parse_action(
        lambda tokens: int(tokens[0], 10)
    )

    _THREE_DIGITS = pyparsing.Word(pyparsing.nums, exact=3).set_parse_action(
        lambda tokens: int(tokens[0], 10)
    )

    _FOUR_DIGITS = pyparsing.Word(pyparsing.nums, exact=4).set_parse_action(
        lambda tokens: int(tokens[0], 10)
    )

    # Pipe as field separator.
    _SEPARATOR = pyparsing.Suppress(pyparsing.Literal("|"))

    # Integer or dash (used for optional numeric fields).
    _INT_OR_DASH = pyparsing.Literal("-") | _INTEGER

    # Remote address: IPv4, IPv6, or comma-separated list of addresses.
    _REMOTE_ADDRESS = pyparsing.Combine(
        pyparsing.Word(pyparsing.alphanums + ".:,")
    ).set_results_name("remote_address")

    # Request identifier: alphanumeric token with @/* prefix and x separators,
    # or '-'. Note that '-' is also part of the Word character set.
    _REQUEST_IDENTIFIER = pyparsing.Word(
        pyparsing.alphanums + "@*-_x"
    ) | pyparsing.Literal("-")

    # User name: alphanumeric with common separators, or '-'.
    _USER_NAME = (
        pyparsing.Word(pyparsing.alphanums + "-_./") | pyparsing.Literal("-")
    ).set_results_name("user_name")

    # Session identifier: short alphanumeric token or '-'.
    _SESSION_IDENTIFIER = pyparsing.Word(
        pyparsing.alphanums + "-_"
    ) | pyparsing.Literal("-")

    # Date and time format: 2020-09-08 07:53:45,084
    _DATE_TIME = (
        _FOUR_DIGITS
        + pyparsing.Suppress("-")
        + _TWO_DIGITS
        + pyparsing.Suppress("-")
        + _TWO_DIGITS
        + _TWO_DIGITS
        + pyparsing.Suppress(":")
        + _TWO_DIGITS
        + pyparsing.Suppress(":")
        + _TWO_DIGITS
        + pyparsing.Suppress(",")
        + _THREE_DIGITS
    ).set_results_name("date_time")

    # HTTP request methods.
    _HTTP_METHODS = [
        "CONNECT",
        "DELETE",
        "GET",
        "HEAD",
        "OPTIONS",
        "PATCH",
        "POST",
        "PUT",
        "TRACE",
    ]

    # Request URI characters.
    _REQUEST_URI = pyparsing.Word(pyparsing.alphanums + "/-_.?=%&:+<>#~[]@!,()")

    # HTTP request: "METHOD /uri HTTP/1.1"
    _HTTP_REQUEST = pyparsing.Group(
        pyparsing.Suppress('"')
        + pyparsing.one_of(_HTTP_METHODS).set_results_name("http_method")
        + _REQUEST_URI.set_results_name("request_url")
        + pyparsing.Word(pyparsing.alphanums + "/.").set_results_name("http_version")
        + pyparsing.Suppress('"')
    ).set_results_name("http_request")

    # SSH request: "SSH - git-upload-pack '/repo.git'"
    _SSH_REQUEST = pyparsing.Group(
        pyparsing.Suppress('"')
        + pyparsing.Literal("SSH").set_results_name("http_method")
        + pyparsing.Suppress(pyparsing.Literal("-"))
        + pyparsing.Word(pyparsing.alphanums + "-_").set_results_name("request_url")
        + pyparsing.QuotedString("'").set_results_name("ssh_repo")
        + pyparsing.Suppress('"')
    ).set_results_name("http_request")

    # gRPC action: "ServiceName/MethodName"
    _GRPC_REQUEST = pyparsing.Group(
        pyparsing.Suppress('"')
        + pyparsing.SkipTo('"').set_results_name("request_url")
        + pyparsing.Suppress('"')
    ).set_results_name("http_request")

    # Referer and user agent fields: "" "user-agent-string"
    _REFERER = (
        pyparsing.Suppress('"')
        + pyparsing.Optional(
            pyparsing.Word(pyparsing.alphanums + "/-_.?=%&:+<>#~[]@!,()")
        ).set_results_name("referer")
        + pyparsing.Suppress('"')
    )

    _USER_AGENT = (
        pyparsing.Suppress('"')
        + pyparsing.SkipTo('"').set_results_name("user_agent")
        + pyparsing.Suppress('"')
    )

    # Labels field: arbitrary text up to the next pipe separator.
    _LABELS = pyparsing.SkipTo(pyparsing.Literal("|")).set_results_name("labels")

    # HTTP/SSH access log line:
    # ip_address | protocol | request_identifier | user | timestamp | "request" |
    # "referer" "user-agent" | status | bytes_read | bytes_written | labels |
    # request_time | session_identifier |
    _HTTP_ACCESS_LOG_LINE = (
        _REMOTE_ADDRESS
        + _SEPARATOR
        + pyparsing.Word(pyparsing.alphanums).set_results_name("protocol")
        + _SEPARATOR
        + _REQUEST_IDENTIFIER.set_results_name("request_identifier")
        + _SEPARATOR
        + _USER_NAME
        + _SEPARATOR
        + _DATE_TIME
        + _SEPARATOR
        + (_HTTP_REQUEST | _SSH_REQUEST | _GRPC_REQUEST)
        + _SEPARATOR
        + _REFERER
        + _USER_AGENT
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("status_code")
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("bytes_read")
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("bytes_written")
        + _SEPARATOR
        + _LABELS
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("request_time")
        + _SEPARATOR
        + _SESSION_IDENTIFIER.set_results_name("session_identifier")
        + _SEPARATOR
        + pyparsing.Suppress(pyparsing.LineEnd())
    )

    # gRPC/Mesh access log line:
    # ip_address | grpc | request_identifier | mesh_execution_identifier | user |
    # timestamp | "action" | - | status | bytes_read | bytes_written | mesh_in |
    # mesh_out | duration_ns | session_identifier |
    #
    # NOTE(review): the module docstring lists a "labels" field between
    # duration_ns and session_identifier; this grammar does not parse one.
    # Confirm against sample Mesh logs which of the two is correct.
    _GRPC_ACCESS_LOG_LINE = (
        _REMOTE_ADDRESS
        + _SEPARATOR
        + pyparsing.Literal("grpc").set_results_name("protocol")
        + _SEPARATOR
        + _REQUEST_IDENTIFIER.set_results_name("request_identifier")
        + _SEPARATOR
        + (
            pyparsing.Word(pyparsing.alphanums + "@*-_x") | pyparsing.Literal("-")
        ).set_results_name("mesh_execution_identifier")
        + _SEPARATOR
        + _USER_NAME
        + _SEPARATOR
        + _DATE_TIME
        + _SEPARATOR
        + _GRPC_REQUEST
        + _SEPARATOR
        + pyparsing.Suppress(pyparsing.Literal("-"))
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("status_code")
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("bytes_read")
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("bytes_written")
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("mesh_in")
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("mesh_out")
        + _SEPARATOR
        + _INT_OR_DASH.set_results_name("duration_ns")
        + _SEPARATOR
        + _SESSION_IDENTIFIER.set_results_name("session_identifier")
        + _SEPARATOR
        + pyparsing.Suppress(pyparsing.LineEnd())
    )

    # The gRPC structure is listed first because the protocol token of the
    # HTTP/SSH structure accepts any alphanumeric word, including "grpc".
    _LINE_STRUCTURES = [
        ("grpc_access_log", _GRPC_ACCESS_LOG_LINE),
        ("http_access_log", _HTTP_ACCESS_LOG_LINE),
    ]

    VERIFICATION_GRAMMAR = _GRPC_ACCESS_LOG_LINE | _HTTP_ACCESS_LOG_LINE

    VERIFICATION_LITERALS = [" | grpc | ", " | http | ", " | https | ", " | ssh | "]

    def _GetStrippedValue(self, structure, name, default_value=None):
        """Retrieves a token value from a Pyparsing structure and strips '' or '-'.

        Args:
          structure (pyparsing.ParseResults): tokens from a parsed log line.
          name (str): name of the token.
          default_value (Optional[object]): default value.

        Returns:
          object: value in the token or default value if the token is not
              available in the structure, is an empty string or is the '-'
              placeholder.
        """
        value = self._GetValueFromStructure(structure, name)
        if value in (None, "", "-"):
            return default_value
        return value

    def _ParseRecord(self, parser_mediator, key, structure):
        """Parses a pyparsing structure.

        Args:
          parser_mediator (ParserMediator): mediates interactions between parsers
              and other components, such as storage and dfVFS.
          key (str): name of the parsed structure.
          structure (pyparsing.ParseResults): tokens from a parsed log line.

        Raises:
          ParseError: if the structure cannot be parsed.
        """
        if key not in ("grpc_access_log", "http_access_log"):
            raise errors.ParseError(f"Unsupported structure: {key:s}")

        time_elements_structure = self._GetValueFromStructure(structure, "date_time")
        http_request = self._GetValueFromStructure(structure, "http_request")

        # Guard against a None value so that .strip() cannot fail on lines
        # without a usable labels token (the gRPC structure defines none).
        labels = self._GetValueFromStructure(structure, "labels", default_value="")
        labels = (labels or "").strip()

        # The HTTP/SSH structure names its duration token "request_time"
        # (milliseconds) while the gRPC/Mesh structure names it "duration_ns"
        # (nanoseconds). Both are stored in the request_time attribute.
        if key == "grpc_access_log":
            request_time_name = "duration_ns"
        else:
            request_time_name = "request_time"

        event_data = BitbucketAccessEventData()
        event_data.http_response_bytes_read = self._GetStrippedValue(
            structure, "bytes_read"
        )
        event_data.http_response_bytes_written = self._GetStrippedValue(
            structure, "bytes_written"
        )
        event_data.http_response_code = self._GetStrippedValue(structure, "status_code")
        event_data.http_request_user_agent = (
            self._GetStringValueFromStructure(structure, "user_agent") or None
        )
        event_data.labels = None if not labels or labels == "-" else labels
        event_data.mesh_execution_identifier = self._GetStrippedValue(
            structure, "mesh_execution_identifier"
        )
        event_data.protocol = self._GetValueFromStructure(structure, "protocol")
        event_data.recorded_time = self._ParseTimeElements(time_elements_structure)
        event_data.remote_address = self._GetValueFromStructure(
            structure, "remote_address"
        )
        event_data.request_identifier = self._GetStrippedValue(
            structure, "request_identifier"
        )
        event_data.request_time = self._GetStrippedValue(structure, request_time_name)
        event_data.session_identifier = self._GetStrippedValue(
            structure, "session_identifier"
        )
        event_data.user_name = self._GetStrippedValue(structure, "user_name")

        # The mesh_in and mesh_out tokens of gRPC/Mesh lines are parsed by the
        # grammar but are not stored, as the event data has no attribute for
        # them.

        if http_request:
            event_data.http_request_method = self._GetValueFromStructure(
                http_request, "http_method"
            )
            event_data.http_request_uri = self._GetValueFromStructure(
                http_request, "request_url"
            )
            event_data.http_version = (
                self._GetValueFromStructure(http_request, "http_version") or None
            )
            event_data.ssh_repository_path = (
                self._GetValueFromStructure(http_request, "ssh_repo") or None
            )

        parser_mediator.ProduceEventData(event_data)

    def _ParseTimeElements(self, time_elements_structure):
        """Parses date and time elements of a log line.

        Args:
          time_elements_structure (pyparsing.ParseResults): date and time
              elements: year, month, day of month, hours, minutes, seconds and
              milliseconds.

        Returns:
          dfdatetime.TimeElementsInMilliseconds: date and time value.

        Raises:
          ParseError: if a valid date and time value cannot be derived.
        """
        try:
            date_time = dfdatetime_time_elements.TimeElementsInMilliseconds(
                time_elements_tuple=time_elements_structure
            )
        except (IndexError, TypeError, ValueError) as exception:
            raise errors.ParseError(
                f"Unable to parse time elements with error: {exception!s}"
            ) from exception

        # The timestamp grammar has no time zone component, hence the value is
        # flagged as local time.
        date_time.is_local_time = True

        return date_time

    def CheckRequiredFormat(self, parser_mediator, text_reader):
        """Check if the log record has the minimal structure required by the plugin.

        Args:
          parser_mediator (ParserMediator): mediates interactions between parsers
              and other components, such as storage and dfVFS.
          text_reader (EncodedTextReader): text reader.

        Returns:
          bool: True if this is the correct plugin, False otherwise.
        """
        try:
            structure = self._VerifyString(text_reader.lines)
        except errors.ParseError:
            return False

        time_elements_structure = self._GetValueFromStructure(structure, "date_time")

        try:
            self._ParseTimeElements(time_elements_structure)
        except errors.ParseError:
            return False

        return True
# Register the plugin with the text log parser when this module is imported.
text_parser.TextLogParser.RegisterPlugin(BitbucketAccessTextPlugin)