Source code for plaso.lib.lexer

# -*- coding: utf-8 -*-
"""An LL(1) lexer. This lexer is very tolerant of errors and can resync.

This lexer is originally copied from the GRR project:
https://code.google.com/p/grr
"""

from __future__ import unicode_literals

import logging
import re

from plaso.lib import errors

# TODO: Update docstrings in this file.
# pylint: disable=missing-type-doc,missing-return-type-doc,missing-return-doc
# pylint: disable=missing-param-doc


[docs]class Token(object): """A token action.""" def __init__(self, state_regex, regex, actions, next_state, flags=re.I): """Initializes the token object. Args: state_regex: If this regular expression matches the current state this rule is considered. regex: A regular expression to try and match from the current point. actions: A command separated list of method names in the Lexer to call. next_state: The next state we transition to if this Token matches. flags: re flags. """ self.state_regex = re.compile( state_regex, re.DOTALL | re.M | re.S | re.U | flags) self.regex = re.compile(regex, re.DOTALL | re.M | re.S | re.U | flags) self.re_str = regex self.actions = [] if actions: self.actions = actions.split(',')
self.next_state = next_state
[docs]class Lexer(object): """A generic feed lexer.""" _CONTINUE_STATE = 'CONTINUE' _INITIAL_STATE = 'INITIAL' _ERROR_TOKEN = 'Error' # A list of Token() instances. tokens = [] def __init__(self, data=''): """Initializes the lexer object. Args: data: optional initial data to be processed by the lexer. """ super(Lexer, self).__init__() self.buffer = data self.error = 0 self.flags = 0 self.processed = 0 self.processed_buffer = '' self.state = self._INITIAL_STATE self.state_stack = [] self.verbose = 0
[docs] def NextToken(self): """Fetch the next token by trying to match any of the regexes in order.""" current_state = self.state for token in self.tokens: # Does the rule apply to us? if not token.state_regex.match(current_state): continue # Try to match the rule m = token.regex.match(self.buffer) if not m: continue # The match consumes the data off the buffer (the handler can put it back # if it likes) # TODO: using joins might be more efficient here. self.processed_buffer += self.buffer[:m.end()] self.buffer = self.buffer[m.end():] self.processed += m.end() next_state = token.next_state for action in token.actions: # Is there a callback to handle this action? callback = getattr(self, action, self.Default) # Allow a callback to skip other callbacks. try: possible_next_state = callback(string=m.group(0), match=m) if possible_next_state == self._CONTINUE_STATE: continue # Override the state from the Token elif possible_next_state: next_state = possible_next_state except errors.ParseError as exception: self.Error(exception) # Update the next state if next_state: self.state = next_state return token # Check that we are making progress - if we are too full, we assume we are # stuck. self.Error('Expected {0:s}'.format(self.state)) self.processed_buffer += self.buffer[:1] self.buffer = self.buffer[1:]
return self._ERROR_TOKEN
[docs] def Close(self): """A convenience function to force us to parse all the data.""" while self.NextToken(): if not self.buffer:
return
[docs] def Default(self, **kwarg): """The default callback handler."""
logging.debug('Default handler: {0:s}'.format(kwarg))
[docs] def Empty(self): """Returns a boolean indicating if the buffer is empty."""
return not self.buffer
[docs] def Error(self, message=None, weight=1): """Log an error down. Args: message: optional error message. weight: optional error weight. """ logging.debug('Error({0:d}): {1:s}'.format(weight, message)) # Keep a count of errors
self.error += weight
[docs] def PushState(self, **unused_kwargs): """Push the current state on the state stack.""" logging.debug('Storing state {0:s}'.format(repr(self.state)))
self.state_stack.append(self.state)
[docs] def PopState(self, **unused_kwargs): """Pop the previous state from the stack.""" try: self.state = self.state_stack.pop() logging.debug('Returned state to {0:s}'.format(self.state)) return self.state except IndexError: self.Error(
'Tried to pop the state but failed - possible recursion error')
[docs] def Feed(self, data): """Feed the buffer with data. Args: data: data to be processed by the lexer. """
self.buffer = ''.join([self.buffer, data])
[docs] def PushBack(self, string='', **unused_kwargs): """Push the match back on the stream. Args: string: optional data. """ self.buffer = string + self.buffer
self.processed_buffer = self.processed_buffer[:-len(string)]
[docs]class SelfFeederMixIn(Lexer): """This mixin is used to make a lexer which feeds itself. Note that self.file_object must be the file object we read from. """ # TODO: fix this, file object either needs to be set or not passed here. def __init__(self, file_object=None): """Initializes the lexer feeder min object. Args: file_object: Optional file-like object. """ super(SelfFeederMixIn, self).__init__() self.file_object = file_object # pylint: disable=arguments-differ
[docs] def Feed(self, size=512): """Feed data into the buffer. Args: size: optional data size to read form the file-like object. """ data = self.file_object.read(size) Lexer.Feed(self, data)
return len(data)
[docs] def NextToken(self): """Retrieves the next token. Returns: The next token (instance of Token) or None. """ # If we don't have enough data - feed ourselves: We assume # that we must have at least one sector in our buffer. if len(self.buffer) < 512: if self.Feed() == 0 and not self.buffer: return None
return Lexer.NextToken(self)
[docs]class Expression(object): """A class representing an expression.""" attribute = None args = None operator = None # The expected number of args number_of_args = 1 def __init__(self): """Initializes the expression object.""" self.args = []
[docs] def __str__(self): """Return a string representation of the expression.""" return 'Expression: ({0:s}) ({1:s}) {2:s}'.format(
self.attribute, self.operator, self.args)
[docs] def AddArg(self, argument): """Adds a new argument to this expression. Args: argument (str): argument to add. Returns: True if the argument is the last argument, False otherwise. Raises: ParseError: If there are too many arguments. """ self.args.append(argument) if len(self.args) > self.number_of_args: raise errors.ParseError('Too many arguments for this expression.') elif len(self.args) == self.number_of_args: return True
return False
[docs] def Compile(self, unused_filter_implementation): """Given a filter implementation, compile this expression.""" raise NotImplementedError(
'{0:s} does not implement Compile.'.format(self.__class__.__name__)) # TODO: rename this function to GetTreeAsString or equivalent.
[docs] def PrintTree(self, depth=''): """Print the tree."""
return '{0:s} {1:s}'.format(depth, self)
[docs] def SetAttribute(self, attribute): """Set the attribute."""
self.attribute = attribute
[docs] def SetOperator(self, operator): """Set the operator."""
self.operator = operator
[docs]class BinaryExpression(Expression): """An expression which takes two other expressions.""" def __init__(self, operator='', part=None): """Initializes the expression object.""" self.operator = operator self.args = [] if part: self.args.append(part) super(BinaryExpression, self).__init__()
[docs] def __str__(self): """Return a string representation of the binary expression.""" return 'Binary Expression: {0:s} {1:s}'.format(
self.operator, [str(x) for x in self.args])
[docs] def AddOperands(self, lhs, rhs): """Add an operand.""" if isinstance(lhs, Expression) and isinstance(rhs, Expression): self.args = [lhs, rhs] else: raise errors.ParseError( 'Expected expression, got {0:s} {1:s} {2:s}'.format(
lhs, self.operator, rhs)) # TODO: rename this function to GetTreeAsString or equivalent.
[docs] def PrintTree(self, depth=''): """Print the tree.""" result = '{0:s}{1:s}\n'.format(depth, self.operator) for part in self.args: result += '{0:s}-{1:s}\n'.format(depth, part.PrintTree(depth + ' '))
return result
[docs] def Compile(self, filter_implementation): """Compile the binary expression into a filter object.""" operator = self.operator.lower() if operator in ('and', '&&'): method = 'AndFilter' elif operator in ('or', '||'): method = 'OrFilter' else: raise errors.ParseError( 'Invalid binary operator {0:s}'.format(operator)) args = [x.Compile(filter_implementation) for x in self.args]
return getattr(filter_implementation, method)(*args)
[docs]class IdentityExpression(Expression): """An Expression which always evaluates to True."""
[docs] def Compile(self, filter_implementation): """Compile the expression."""
return filter_implementation.IdentityFilter()
[docs]class SearchParser(Lexer): """This parser can parse the mini query language and build an AST. Examples of valid syntax: filename contains "foo" and (size > 100k or date before "2011-10") date between 2011 and 2010 files older than 1 year """ expression_cls = Expression binary_expression_cls = BinaryExpression tokens = [ # Double quoted string Token('STRING', '"', 'PopState,StringFinish', None), Token('STRING', r'\\(.)', 'StringEscape', None), Token('STRING', r'[^\\"]+', 'StringInsert', None), # Single quoted string Token('SQ_STRING', '\'', 'PopState,StringFinish', None), Token('SQ_STRING', r'\\(.)', 'StringEscape', None), Token('SQ_STRING', r'[^\\\']+', 'StringInsert', None), # TODO: Implement a unary not operator. # The first thing we see in the initial state takes up to the ATTRIBUTE Token('INITIAL', r'(and|or|\&\&|\|\|)', 'BinaryOperator', None), Token('INITIAL', r'[^\s\(\)]', 'PushState,PushBack', 'ATTRIBUTE'), Token('INITIAL', r'\(', 'BracketOpen', None), Token('INITIAL', r'\)', 'BracketClose', None), Token('ATTRIBUTE', r'[\w._0-9]+', 'StoreAttribute', 'OPERATOR'), Token('OPERATOR', r'[a-z0-9<>=\-\+\!\^\&%]+', 'StoreOperator', 'ARG_LIST'), Token('OPERATOR', r'(!=|[<>=])', 'StoreSpecialOperator', 'ARG_LIST'), Token('ARG_LIST', r'[^\s\'"]+', 'InsertArg', None), # Start a string. Token('.', '"', 'PushState,StringStart', 'STRING'), Token('.', '\'', 'PushState,StringStart', 'SQ_STRING'), # Skip whitespace. Token('.', r'\s+', None, None), ] def __init__(self, data): """Initializes the search parser object.""" # Holds expression self.current_expression = self.expression_cls() self.filter_string = data # The token stack self.stack = [] self.string = None Lexer.__init__(self, data)
[docs] def BinaryOperator(self, string=None, **unused_kwargs): """Set the binary operator."""
self.stack.append(self.binary_expression_cls(string))
[docs] def BracketOpen(self, **unused_kwargs): """Define an open bracket."""
self.stack.append('(')
[docs] def BracketClose(self, **unused_kwargs): """Close the bracket."""
self.stack.append(')')
[docs] def StringStart(self, **unused_kwargs): """Initialize the string."""
self.string = ''
[docs] def StringEscape(self, string, match, **unused_kwargs): """Escape backslashes found inside a string quote. Backslashes followed by anything other than ['"rnbt] will just be included in the string. Args: string: The string that matched. match: the match object (instance of re.MatchObject). Where match.group(1) contains the escaped code. """ if match.group(1) in '\'"rnbt': self.string += string.decode('unicode_escape') else:
self.string += string
[docs] def StringInsert(self, string='', **unused_kwargs): """Add to the string."""
self.string += string
[docs] def StringFinish(self, **unused_kwargs): """Finish the string operation.""" if self.state == 'ATTRIBUTE': return self.StoreAttribute(string=self.string) if self.state == 'ARG_LIST': return self.InsertArg(string=self.string)
return None
[docs] def StoreAttribute(self, string='', **unused_kwargs): """Store the attribute.""" logging.debug('Storing attribute {0:s}'.format(repr(string))) # TODO: Update the expected number_of_args try: self.current_expression.SetAttribute(string) except AttributeError: raise errors.ParseError('Invalid attribute \'{0:s}\''.format(string))
return 'OPERATOR'
[docs] def StoreOperator(self, string='', **unused_kwargs): """Store the operator.""" logging.debug('Storing operator {0:s}'.format(repr(string)))
self.current_expression.SetOperator(string)
[docs] def InsertArg(self, string='', **unused_kwargs): """Insert an argument to the current expression.""" logging.debug('Storing Argument {0:s}'.format(string)) # This expression is complete if self.current_expression.AddArg(string): self.stack.append(self.current_expression) self.current_expression = self.expression_cls() return self.PopState()
return None def _CombineBinaryExpressions(self, operator): """Combine binary expressions.""" for i in range(1, len(self.stack)-1): item = self.stack[i] if (isinstance(item, BinaryExpression) and item.operator == operator and isinstance(self.stack[i-1], Expression) and isinstance(self.stack[i+1], Expression)): lhs = self.stack[i-1] rhs = self.stack[i+1] self.stack[i].AddOperands(lhs, rhs) self.stack[i-1] = None self.stack[i+1] = None self.stack = list(filter(None, self.stack)) def _CombineParenthesis(self): """Combine parenthesis.""" for i in range(len(self.stack)-2): if (self.stack[i] == '(' and self.stack[i+2] == ')' and isinstance(self.stack[i+1], Expression)): self.stack[i] = None self.stack[i+2] = None self.stack = list(filter(None, self.stack))
[docs] def Reduce(self): """Reduce the token stack into an AST.""" # Check for sanity if self.state != 'INITIAL': self.Error('Premature end of expression') length = len(self.stack) while length > 1: # Precedence order self._CombineParenthesis() self._CombineBinaryExpressions('and') self._CombineBinaryExpressions('or') # No change if len(self.stack) == length: break length = len(self.stack) if length != 1: self.Error('Illegal query expression')
return self.stack[0]
[docs] def Error(self, message=None, unused_weight=1): """Raise an error message.""" raise errors.ParseError( '{0:s} in position {1:s}: {2:s} <----> {3:s} )'.format( message, len(self.processed_buffer), self.processed_buffer,
self.buffer))
[docs] def Parse(self): """Parse.""" if not self.filter_string: return IdentityExpression() self.Close()
return self.Reduce()