Commit 0fab423a18 (contained in: lz_db), 2025-11-16 12:31:03 +08:00
1451 changed files with 743213 additions and 0 deletions

View File: lark/__init__.py

@@ -0,0 +1,38 @@
from .exceptions import (
GrammarError,
LarkError,
LexError,
ParseError,
UnexpectedCharacters,
UnexpectedEOF,
UnexpectedInput,
UnexpectedToken,
)
from .lark import Lark
from .lexer import Token
from .tree import ParseTree, Tree
from .utils import logger
from .visitors import Discard, Transformer, Transformer_NonRecursive, Visitor, v_args
__version__: str = "1.2.0"
__all__ = (
"GrammarError",
"LarkError",
"LexError",
"ParseError",
"UnexpectedCharacters",
"UnexpectedEOF",
"UnexpectedInput",
"UnexpectedToken",
"Lark",
"Token",
"ParseTree",
"Tree",
"logger",
"Discard",
"Transformer",
"Transformer_NonRecursive",
"Visitor",
"v_args",
)

View File: lark/__pyinstaller/__init__.py

@@ -0,0 +1,6 @@
# For usage of lark with PyInstaller. See https://pyinstaller-sample-hook.readthedocs.io/en/latest/index.html
import os
def get_hook_dirs():
return [os.path.dirname(__file__)]

View File: lark/__pyinstaller/hook-lark.py

@@ -0,0 +1,14 @@
#-----------------------------------------------------------------------------
# Copyright (c) 2017-2020, PyInstaller Development Team.
#
# Distributed under the terms of the GNU General Public License (version 2
# or later) with exception for distributing the bootloader.
#
# The full license is in the file COPYING.txt, distributed with this software.
#
# SPDX-License-Identifier: (GPL-2.0-or-later WITH Bootloader-exception)
#-----------------------------------------------------------------------------
from PyInstaller.utils.hooks import collect_data_files
datas = collect_data_files('lark')

View File: lark/ast_utils.py

@@ -0,0 +1,59 @@
"""
Module of utilities for transforming a lark.Tree into a custom Abstract Syntax Tree (AST defined in classes)
"""
import inspect, re
import types
from typing import Optional, Callable
from lark import Transformer, v_args
class Ast:
"""Abstract class
Subclasses will be collected by `create_transformer()`
"""
pass
class AsList:
"""Abstract class
Subclasses will be instantiated with the parse results as a single list, instead of as arguments.
"""
class WithMeta:
"""Abstract class
Subclasses will be instantiated with the Meta instance of the tree. (see ``v_args`` for more detail)
"""
pass
def camel_to_snake(name):
return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()
def create_transformer(ast_module: types.ModuleType,
transformer: Optional[Transformer]=None,
decorator_factory: Callable=v_args) -> Transformer:
"""Collects `Ast` subclasses from the given module, and creates a Lark transformer that builds the AST.
For each class, we create a corresponding rule in the transformer, with a matching name.
CamelCase names will be converted into snake_case. Example: "CodeBlock" -> "code_block".
Classes starting with an underscore (`_`) will be skipped.
Parameters:
ast_module: A Python module containing all the subclasses of ``ast_utils.Ast``
transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten.
decorator_factory (Callable): An optional callable accepting two booleans, ``inline`` and ``meta``,
and returning a decorator for the methods of ``transformer`` (default: ``v_args``).
"""
t = transformer or Transformer()
for name, obj in inspect.getmembers(ast_module):
if not name.startswith('_') and inspect.isclass(obj):
if issubclass(obj, Ast):
wrapper = decorator_factory(inline=not issubclass(obj, AsList), meta=issubclass(obj, WithMeta))
obj = wrapper(obj).__get__(t)
setattr(t, camel_to_snake(name), obj)
return t
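A minimal usage sketch of ``create_transformer`` (editor's illustration, not part of the committed file; the module, class, and rule names are hypothetical):

import types
from lark import ast_utils

class CodeBlock(ast_utils.Ast, ast_utils.AsList):
    # AsList: instantiated with all matched children as a single list
    def __init__(self, statements):
        self.statements = statements

ast_module = types.ModuleType("ast_module")   # stand-in for a real module of Ast subclasses
ast_module.CodeBlock = CodeBlock

# Produces a Transformer with a `code_block` method that builds CodeBlock nodes:
to_ast = ast_utils.create_transformer(ast_module)
# ast = to_ast.transform(parser.parse(text))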

View File: lark/common.py

@@ -0,0 +1,86 @@
from copy import deepcopy
import sys
from types import ModuleType
from typing import Callable, Collection, Dict, Optional, TYPE_CHECKING, List
if TYPE_CHECKING:
from .lark import PostLex
from .lexer import Lexer
from .grammar import Rule
from typing import Union, Type
from typing import Literal
if sys.version_info >= (3, 10):
from typing import TypeAlias
else:
from typing_extensions import TypeAlias
from .utils import Serialize
from .lexer import TerminalDef, Token
###{standalone
_ParserArgType: 'TypeAlias' = 'Literal["earley", "lalr", "cyk", "auto"]'
_LexerArgType: 'TypeAlias' = 'Union[Literal["auto", "basic", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]]'
_LexerCallback = Callable[[Token], Token]
ParserCallbacks = Dict[str, Callable]
class LexerConf(Serialize):
__serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type'
__serialize_namespace__ = TerminalDef,
terminals: Collection[TerminalDef]
re_module: ModuleType
ignore: Collection[str]
postlex: 'Optional[PostLex]'
callbacks: Dict[str, _LexerCallback]
g_regex_flags: int
skip_validation: bool
use_bytes: bool
lexer_type: Optional[_LexerArgType]
strict: bool
def __init__(self, terminals: Collection[TerminalDef], re_module: ModuleType, ignore: Collection[str]=(), postlex: 'Optional[PostLex]'=None,
callbacks: Optional[Dict[str, _LexerCallback]]=None, g_regex_flags: int=0, skip_validation: bool=False, use_bytes: bool=False, strict: bool=False):
self.terminals = terminals
self.terminals_by_name = {t.name: t for t in self.terminals}
assert len(self.terminals) == len(self.terminals_by_name)
self.ignore = ignore
self.postlex = postlex
self.callbacks = callbacks or {}
self.g_regex_flags = g_regex_flags
self.re_module = re_module
self.skip_validation = skip_validation
self.use_bytes = use_bytes
self.strict = strict
self.lexer_type = None
def _deserialize(self):
self.terminals_by_name = {t.name: t for t in self.terminals}
def __deepcopy__(self, memo=None):
return type(self)(
deepcopy(self.terminals, memo),
self.re_module,
deepcopy(self.ignore, memo),
deepcopy(self.postlex, memo),
deepcopy(self.callbacks, memo),
deepcopy(self.g_regex_flags, memo),
deepcopy(self.skip_validation, memo),
deepcopy(self.use_bytes, memo),
)
class ParserConf(Serialize):
__serialize_fields__ = 'rules', 'start', 'parser_type'
rules: List['Rule']
callbacks: ParserCallbacks
start: List[str]
parser_type: _ParserArgType
def __init__(self, rules: List['Rule'], callbacks: ParserCallbacks, start: List[str]):
assert isinstance(start, list)
self.rules = rules
self.callbacks = callbacks
self.start = start
###}
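These configuration objects are normally built internally by ``Lark``; a minimal hand-construction sketch (editor's illustration, using the public classes from lark.lexer):

import re
from lark.lexer import TerminalDef, PatternStr
from lark.common import LexerConf

# Two literal terminals; priority defaults to TOKEN_DEFAULT_PRIORITY (0).
terminals = [TerminalDef("PLUS", PatternStr("+")),
             TerminalDef("MINUS", PatternStr("-"))]
conf = LexerConf(terminals, re_module=re, ignore=())
assert conf.terminals_by_name["PLUS"].pattern.to_regexp() == r"\+"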

View File: lark/exceptions.py

@@ -0,0 +1,292 @@
from .utils import logger, NO_VALUE
from typing import Mapping, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set, Optional, Collection, TYPE_CHECKING
if TYPE_CHECKING:
from .lexer import Token
from .parsers.lalr_interactive_parser import InteractiveParser
from .tree import Tree
###{standalone
class LarkError(Exception):
pass
class ConfigurationError(LarkError, ValueError):
pass
def assert_config(value, options: Collection, msg='Got %r, expected one of %s'):
if value not in options:
raise ConfigurationError(msg % (value, options))
class GrammarError(LarkError):
pass
class ParseError(LarkError):
pass
class LexError(LarkError):
pass
T = TypeVar('T')
class UnexpectedInput(LarkError):
"""UnexpectedInput Error.
Used as a base class for the following exceptions:
- ``UnexpectedCharacters``: The lexer encountered an unexpected string
- ``UnexpectedToken``: The parser received an unexpected token
- ``UnexpectedEOF``: The parser expected a token, but the input ended
After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
"""
line: int
column: int
pos_in_stream = None
state: Any
_terminals_by_name = None
interactive_parser: 'InteractiveParser'
def get_context(self, text: str, span: int=40) -> str:
"""Returns a pretty string pinpointing the error in the text,
with span amount of context characters around it.
Note:
The parser doesn't hold a copy of the text it has to parse,
so you have to provide it again
"""
assert self.pos_in_stream is not None, self
pos = self.pos_in_stream
start = max(pos - span, 0)
end = pos + span
if not isinstance(text, bytes):
before = text[start:pos].rsplit('\n', 1)[-1]
after = text[pos:end].split('\n', 1)[0]
return before + after + '\n' + ' ' * len(before.expandtabs()) + '^\n'
else:
before = text[start:pos].rsplit(b'\n', 1)[-1]
after = text[pos:end].split(b'\n', 1)[0]
return (before + after + b'\n' + b' ' * len(before.expandtabs()) + b'^\n').decode("ascii", "backslashreplace")
def match_examples(self, parse_fn: 'Callable[[str], Tree]',
examples: Union[Mapping[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]],
token_type_match_fallback: bool=False,
use_accepts: bool=True
) -> Optional[T]:
"""Allows you to detect what's wrong in the input text by matching
against example errors.
Given a parser instance and a dictionary mapping labels to
collections of malformed syntax examples, it returns the label of
the example that best matches the current error. The function
iterates over the dictionary until it finds a matching error, and
returns the corresponding label.
For an example usage, see `examples/error_reporting_lalr.py`
Parameters:
parse_fn: parse function (usually ``lark_instance.parse``)
examples: dictionary of ``{label: [example_strings]}``.
use_accepts: Recommended to keep this as ``use_accepts=True``.
"""
assert self.state is not None, "Not supported for this exception"
if isinstance(examples, Mapping):
examples = examples.items()
candidate = (None, False)
for i, (label, example) in enumerate(examples):
assert not isinstance(example, str), "Expecting a list"
for j, malformed in enumerate(example):
try:
parse_fn(malformed)
except UnexpectedInput as ut:
if ut.state == self.state:
if (
use_accepts
and isinstance(self, UnexpectedToken)
and isinstance(ut, UnexpectedToken)
and ut.accepts != self.accepts
):
logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
(self.state, self.accepts, ut.accepts, i, j))
continue
if (
isinstance(self, (UnexpectedToken, UnexpectedEOF))
and isinstance(ut, (UnexpectedToken, UnexpectedEOF))
):
if ut.token == self.token: # Try exact match first
logger.debug("Exact Match at example [%s][%s]" % (i, j))
return label
if token_type_match_fallback:
# Fallback to token types match
if (ut.token.type == self.token.type) and not candidate[-1]:
logger.debug("Token Type Fallback at example [%s][%s]" % (i, j))
candidate = label, True
if candidate[0] is None:
logger.debug("Same State match at example [%s][%s]" % (i, j))
candidate = label, False
return candidate[0]
def _format_expected(self, expected):
if self._terminals_by_name:
d = self._terminals_by_name
expected = [d[t_name].user_repr() if t_name in d else t_name for t_name in expected]
return "Expected one of: \n\t* %s\n" % '\n\t* '.join(expected)
class UnexpectedEOF(ParseError, UnexpectedInput):
"""An exception that is raised by the parser, when the input ends while it still expects a token.
"""
expected: 'List[Token]'
def __init__(self, expected, state=None, terminals_by_name=None):
super(UnexpectedEOF, self).__init__()
self.expected = expected
self.state = state
from .lexer import Token
self.token = Token("<EOF>", "") # , line=-1, column=-1, pos_in_stream=-1)
self.pos_in_stream = -1
self.line = -1
self.column = -1
self._terminals_by_name = terminals_by_name
def __str__(self):
message = "Unexpected end-of-input. "
message += self._format_expected(self.expected)
return message
class UnexpectedCharacters(LexError, UnexpectedInput):
"""An exception that is raised by the lexer, when it cannot match the next
string of characters to any of its terminals.
"""
allowed: Set[str]
considered_tokens: Set[Any]
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None,
terminals_by_name=None, considered_rules=None):
super(UnexpectedCharacters, self).__init__()
# TODO considered_tokens and allowed can be figured out using state
self.line = line
self.column = column
self.pos_in_stream = lex_pos
self.state = state
self._terminals_by_name = terminals_by_name
self.allowed = allowed
self.considered_tokens = considered_tokens
self.considered_rules = considered_rules
self.token_history = token_history
if isinstance(seq, bytes):
self.char = seq[lex_pos:lex_pos + 1].decode("ascii", "backslashreplace")
else:
self.char = seq[lex_pos]
self._context = self.get_context(seq)
def __str__(self):
message = "No terminal matches '%s' in the current parser context, at line %d col %d" % (self.char, self.line, self.column)
message += '\n\n' + self._context
if self.allowed:
message += self._format_expected(self.allowed)
if self.token_history:
message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history)
return message
class UnexpectedToken(ParseError, UnexpectedInput):
"""An exception that is raised by the parser, when the token it received
doesn't match any valid step forward.
Parameters:
token: The mismatched token
expected: The set of expected tokens
considered_rules: Which rules were considered, to deduce the expected tokens
state: A value representing the parser state. Do not rely on its value or type.
interactive_parser: An instance of ``InteractiveParser``, that is initialized to the point of failure,
and can be used for debugging and error handling.
Note: These parameters are available as attributes of the instance.
"""
expected: Set[str]
considered_rules: Set[str]
def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None):
super(UnexpectedToken, self).__init__()
# TODO considered_rules and expected can be figured out using state
self.line = getattr(token, 'line', '?')
self.column = getattr(token, 'column', '?')
self.pos_in_stream = getattr(token, 'start_pos', None)
self.state = state
self.token = token
self.expected = expected # XXX deprecate? `accepts` is better
self._accepts = NO_VALUE
self.considered_rules = considered_rules
self.interactive_parser = interactive_parser
self._terminals_by_name = terminals_by_name
self.token_history = token_history
@property
def accepts(self) -> Set[str]:
if self._accepts is NO_VALUE:
self._accepts = self.interactive_parser and self.interactive_parser.accepts()
return self._accepts
def __str__(self):
message = ("Unexpected token %r at line %s, column %s.\n%s"
% (self.token, self.line, self.column, self._format_expected(self.accepts or self.expected)))
if self.token_history:
message += "Previous tokens: %r\n" % self.token_history
return message
class VisitError(LarkError):
"""VisitError is raised when visitors are interrupted by an exception
It provides the following attributes for inspection:
Parameters:
rule: the name of the visit rule that failed
obj: the tree-node or token that was being processed
orig_exc: the exception that caused it to fail
Note: These parameters are available as attributes
"""
obj: 'Union[Tree, Token]'
orig_exc: Exception
def __init__(self, rule, obj, orig_exc):
message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
super(VisitError, self).__init__(message)
self.rule = rule
self.obj = obj
self.orig_exc = orig_exc
class MissingVariableError(LarkError):
pass
###}
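A short sketch of the error helpers above (editor's illustration; the grammar and label are made up):

from lark import Lark, UnexpectedInput

parser = Lark(r'start: "a" "b"', parser="lalr")
try:
    parser.parse("ac")
except UnexpectedInput as e:
    print(e.get_context("ac"))      # pinpoints the offending 'c'
    # Label the error by re-parsing known-bad inputs:
    label = e.match_examples(parser.parse, {"unexpected character": ["ac"]})
    print(label)                    # -> 'unexpected character'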

View File: lark/grammar.py

@@ -0,0 +1,130 @@
from typing import Optional, Tuple, ClassVar, Sequence
from .utils import Serialize
###{standalone
TOKEN_DEFAULT_PRIORITY = 0
class Symbol(Serialize):
__slots__ = ('name',)
name: str
is_term: ClassVar[bool] = NotImplemented
def __init__(self, name: str) -> None:
self.name = name
def __eq__(self, other):
assert isinstance(other, Symbol), other
return self.is_term == other.is_term and self.name == other.name
def __ne__(self, other):
return not (self == other)
def __hash__(self):
return hash(self.name)
def __repr__(self):
return '%s(%r)' % (type(self).__name__, self.name)
fullrepr = property(__repr__)
def renamed(self, f):
return type(self)(f(self.name))
class Terminal(Symbol):
__serialize_fields__ = 'name', 'filter_out'
is_term: ClassVar[bool] = True
def __init__(self, name, filter_out=False):
self.name = name
self.filter_out = filter_out
@property
def fullrepr(self):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out)
def renamed(self, f):
return type(self)(f(self.name), self.filter_out)
class NonTerminal(Symbol):
__serialize_fields__ = 'name',
is_term: ClassVar[bool] = False
class RuleOptions(Serialize):
__serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'template_source', 'empty_indices'
keep_all_tokens: bool
expand1: bool
priority: Optional[int]
template_source: Optional[str]
empty_indices: Tuple[bool, ...]
def __init__(self, keep_all_tokens: bool=False, expand1: bool=False, priority: Optional[int]=None, template_source: Optional[str]=None, empty_indices: Tuple[bool, ...]=()) -> None:
self.keep_all_tokens = keep_all_tokens
self.expand1 = expand1
self.priority = priority
self.template_source = template_source
self.empty_indices = empty_indices
def __repr__(self):
return 'RuleOptions(%r, %r, %r, %r)' % (
self.keep_all_tokens,
self.expand1,
self.priority,
self.template_source
)
class Rule(Serialize):
"""
origin : a symbol
expansion : a list of symbols
order : index of this expansion amongst all rules of the same name
"""
__slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash')
__serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options'
__serialize_namespace__ = Terminal, NonTerminal, RuleOptions
origin: NonTerminal
expansion: Sequence[Symbol]
order: int
alias: Optional[str]
options: RuleOptions
_hash: int
def __init__(self, origin: NonTerminal, expansion: Sequence[Symbol],
order: int=0, alias: Optional[str]=None, options: Optional[RuleOptions]=None):
self.origin = origin
self.expansion = expansion
self.alias = alias
self.order = order
self.options = options or RuleOptions()
self._hash = hash((self.origin, tuple(self.expansion)))
def _deserialize(self):
self._hash = hash((self.origin, tuple(self.expansion)))
def __str__(self):
return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion))
def __repr__(self):
return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options)
def __hash__(self):
return self._hash
def __eq__(self, other):
if not isinstance(other, Rule):
return False
return self.origin == other.origin and self.expansion == other.expansion
###}
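A tiny sketch of the symbol classes (editor's illustration): a ``Rule`` pairs a ``NonTerminal`` origin with an expansion of symbols.

from lark.grammar import NonTerminal, Terminal, Rule

r = Rule(NonTerminal("sum"), [NonTerminal("sum"), Terminal("PLUS"), NonTerminal("atom")])
print(r)                                    # <sum : sum PLUS atom>
assert Terminal("X") != NonTerminal("X")    # equality checks is_term too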

View File: lark/grammars/common.lark

@@ -0,0 +1,59 @@
// Basic terminals for common use
//
// Numbers
//
DIGIT: "0".."9"
HEXDIGIT: "a".."f"|"A".."F"|DIGIT
INT: DIGIT+
SIGNED_INT: ["+"|"-"] INT
DECIMAL: INT "." INT? | "." INT
// float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/
_EXP: ("e"|"E") SIGNED_INT
FLOAT: INT _EXP | DECIMAL _EXP?
SIGNED_FLOAT: ["+"|"-"] FLOAT
NUMBER: FLOAT | INT
SIGNED_NUMBER: ["+"|"-"] NUMBER
//
// Strings
//
_STRING_INNER: /.*?/
_STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/
ESCAPED_STRING : "\"" _STRING_ESC_INNER "\""
//
// Names (Variables)
//
LCASE_LETTER: "a".."z"
UCASE_LETTER: "A".."Z"
LETTER: UCASE_LETTER | LCASE_LETTER
WORD: LETTER+
CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*
//
// Whitespace
//
WS_INLINE: (" "|/\t/)+
WS: /[ \t\f\r\n]/+
CR : /\r/
LF : /\n/
NEWLINE: (CR? LF)+
// Comments
SH_COMMENT: /#[^\n]*/
CPP_COMMENT: /\/\/[^\n]*/
C_COMMENT: "/*" /(.|\n)*?/ "*/"
SQL_COMMENT: /--[^\n]*/
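These terminals are meant to be pulled into user grammars with ``%import``; a quick sketch (editor's illustration):

from lark import Lark

parser = Lark(r"""
    start: SIGNED_NUMBER+
    %import common.SIGNED_NUMBER
    %import common.WS
    %ignore WS
""")
print(parser.parse("3.14 -42 1e3").children)   # three SIGNED_NUMBER tokens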

View File: lark/grammars/lark.lark

@@ -0,0 +1,62 @@
# Lark grammar of Lark's syntax
# Note: Lark is not bootstrapped; its parser is implemented in load_grammar.py
start: (_item? _NL)* _item?
_item: rule
| token
| statement
rule: RULE rule_params priority? ":" expansions
token: TOKEN token_params priority? ":" expansions
rule_params: ["{" RULE ("," RULE)* "}"]
token_params: ["{" TOKEN ("," TOKEN)* "}"]
priority: "." NUMBER
statement: "%ignore" expansions -> ignore
| "%import" import_path ["->" name] -> import
| "%import" import_path name_list -> multi_import
| "%override" rule -> override_rule
| "%declare" name+ -> declare
!import_path: "."? name ("." name)*
name_list: "(" name ("," name)* ")"
?expansions: alias (_VBAR alias)*
?alias: expansion ["->" RULE]
?expansion: expr*
?expr: atom [OP | "~" NUMBER [".." NUMBER]]
?atom: "(" expansions ")"
| "[" expansions "]" -> maybe
| value
?value: STRING ".." STRING -> literal_range
| name
| (REGEXP | STRING) -> literal
| name "{" value ("," value)* "}" -> template_usage
name: RULE
| TOKEN
_VBAR: _NL? "|"
OP: /[+*]|[?](?![a-z])/
RULE: /!?[_?]?[a-z][_a-z0-9]*/
TOKEN: /_?[A-Z][_A-Z0-9]*/
STRING: _STRING "i"?
REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/])*?\/[imslux]*/
_NL: /(\r?\n)+\s*/
%import common.ESCAPED_STRING -> _STRING
%import common.SIGNED_INT -> NUMBER
%import common.WS_INLINE
COMMENT: /\s*/ "//" /[^\n]/* | /\s*/ "#" /[^\n]/*
%ignore WS_INLINE
%ignore COMMENT
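Although Lark's own parser is hand-written in load_grammar.py, this grammar can itself be loaded to parse ``.lark`` files; a sketch (editor's illustration, assuming the file ships as ``grammars/lark.lark`` inside the package):

from lark import Lark

meta_parser = Lark.open_from_package("lark", "grammars/lark.lark", parser="lalr")
tree = meta_parser.parse('start: "a" "b"\n')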

View File: lark/grammars/python.lark

@@ -0,0 +1,302 @@
// Python 3 grammar for Lark
// This grammar should parse all python 3.x code successfully.
// Adapted from: https://docs.python.org/3/reference/grammar.html
// Start symbols for the grammar:
// single_input is a single interactive statement;
// file_input is a module or sequence of commands read from an input file;
// eval_input is the input for the eval() functions.
// NB: compound_stmt in single_input is followed by extra NEWLINE!
//
single_input: _NEWLINE | simple_stmt | compound_stmt _NEWLINE
file_input: (_NEWLINE | stmt)*
eval_input: testlist _NEWLINE*
decorator: "@" dotted_name [ "(" [arguments] ")" ] _NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef | async_funcdef)
async_funcdef: "async" funcdef
funcdef: "def" name "(" [parameters] ")" ["->" test] ":" suite
parameters: paramvalue ("," paramvalue)* ["," SLASH ("," paramvalue)*] ["," [starparams | kwparams]]
| starparams
| kwparams
SLASH: "/" // Otherwise it will completely disappear and be indistinguishable in the result
starparams: (starparam | starguard) poststarparams
starparam: "*" typedparam
starguard: "*"
poststarparams: ("," paramvalue)* ["," kwparams]
kwparams: "**" typedparam ","?
?paramvalue: typedparam ("=" test)?
?typedparam: name (":" test)?
lambdef: "lambda" [lambda_params] ":" test
lambdef_nocond: "lambda" [lambda_params] ":" test_nocond
lambda_params: lambda_paramvalue ("," lambda_paramvalue)* ["," [lambda_starparams | lambda_kwparams]]
| lambda_starparams
| lambda_kwparams
?lambda_paramvalue: name ("=" test)?
lambda_starparams: "*" [name] ("," lambda_paramvalue)* ["," [lambda_kwparams]]
lambda_kwparams: "**" name ","?
?stmt: simple_stmt | compound_stmt
?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE
?small_stmt: (expr_stmt | assign_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
expr_stmt: testlist_star_expr
assign_stmt: annassign | augassign | assign
annassign: testlist_star_expr ":" test ["=" test]
assign: testlist_star_expr ("=" (yield_expr|testlist_star_expr))+
augassign: testlist_star_expr augassign_op (yield_expr|testlist)
!augassign_op: "+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//="
?testlist_star_expr: test_or_star_expr
| test_or_star_expr ("," test_or_star_expr)+ ","? -> tuple
| test_or_star_expr "," -> tuple
// For normal and annotated assignments, additional restrictions enforced by the interpreter
del_stmt: "del" exprlist
pass_stmt: "pass"
?flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: "break"
continue_stmt: "continue"
return_stmt: "return" [testlist]
yield_stmt: yield_expr
raise_stmt: "raise" [test ["from" test]]
import_stmt: import_name | import_from
import_name: "import" dotted_as_names
// note below: the ("." | "...") is necessary because "..." is tokenized as ELLIPSIS
import_from: "from" (dots? dotted_name | dots) "import" ("*" | "(" import_as_names ")" | import_as_names)
!dots: "."+
import_as_name: name ["as" name]
dotted_as_name: dotted_name ["as" name]
import_as_names: import_as_name ("," import_as_name)* [","]
dotted_as_names: dotted_as_name ("," dotted_as_name)*
dotted_name: name ("." name)*
global_stmt: "global" name ("," name)*
nonlocal_stmt: "nonlocal" name ("," name)*
assert_stmt: "assert" test ["," test]
?compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | match_stmt
| with_stmt | funcdef | classdef | decorated | async_stmt
async_stmt: "async" (funcdef | with_stmt | for_stmt)
if_stmt: "if" test ":" suite elifs ["else" ":" suite]
elifs: elif_*
elif_: "elif" test ":" suite
while_stmt: "while" test ":" suite ["else" ":" suite]
for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite]
try_stmt: "try" ":" suite except_clauses ["else" ":" suite] [finally]
| "try" ":" suite finally -> try_finally
finally: "finally" ":" suite
except_clauses: except_clause+
except_clause: "except" [test ["as" name]] ":" suite
// NB compile.c makes sure that the default except clause is last
with_stmt: "with" with_items ":" suite
with_items: with_item ("," with_item)*
with_item: test ["as" name]
match_stmt: "match" test ":" _NEWLINE _INDENT case+ _DEDENT
case: "case" pattern ["if" test] ":" suite
?pattern: sequence_item_pattern "," _sequence_pattern -> sequence_pattern
| as_pattern
?as_pattern: or_pattern ("as" NAME)?
?or_pattern: closed_pattern ("|" closed_pattern)*
?closed_pattern: literal_pattern
| NAME -> capture_pattern
| "_" -> any_pattern
| attr_pattern
| "(" as_pattern ")"
| "[" _sequence_pattern "]" -> sequence_pattern
| "(" (sequence_item_pattern "," _sequence_pattern)? ")" -> sequence_pattern
| "{" (mapping_item_pattern ("," mapping_item_pattern)* ","?)?"}" -> mapping_pattern
| "{" (mapping_item_pattern ("," mapping_item_pattern)* ",")? "**" NAME ","? "}" -> mapping_star_pattern
| class_pattern
literal_pattern: inner_literal_pattern
?inner_literal_pattern: "None" -> const_none
| "True" -> const_true
| "False" -> const_false
| STRING -> string
| number
attr_pattern: NAME ("." NAME)+ -> value
name_or_attr_pattern: NAME ("." NAME)* -> value
mapping_item_pattern: (literal_pattern|attr_pattern) ":" as_pattern
_sequence_pattern: (sequence_item_pattern ("," sequence_item_pattern)* ","?)?
?sequence_item_pattern: as_pattern
| "*" NAME -> star_pattern
class_pattern: name_or_attr_pattern "(" [arguments_pattern ","?] ")"
arguments_pattern: pos_arg_pattern ["," keyws_arg_pattern]
| keyws_arg_pattern -> no_pos_arguments
pos_arg_pattern: as_pattern ("," as_pattern)*
keyws_arg_pattern: keyw_arg_pattern ("," keyw_arg_pattern)*
keyw_arg_pattern: NAME "=" as_pattern
suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT
?test: or_test ("if" or_test "else" test)?
| lambdef
| assign_expr
assign_expr: name ":=" test
?test_nocond: or_test | lambdef_nocond
?or_test: and_test ("or" and_test)*
?and_test: not_test_ ("and" not_test_)*
?not_test_: "not" not_test_ -> not_test
| comparison
?comparison: expr (comp_op expr)*
star_expr: "*" expr
?expr: or_expr
?or_expr: xor_expr ("|" xor_expr)*
?xor_expr: and_expr ("^" and_expr)*
?and_expr: shift_expr ("&" shift_expr)*
?shift_expr: arith_expr (_shift_op arith_expr)*
?arith_expr: term (_add_op term)*
?term: factor (_mul_op factor)*
?factor: _unary_op factor | power
!_unary_op: "+"|"-"|"~"
!_add_op: "+"|"-"
!_shift_op: "<<"|">>"
!_mul_op: "*"|"@"|"/"|"%"|"//"
// <> isn't actually a valid comparison operator in Python. It's here for the
// sake of a __future__ import described in PEP 401 (which really works :-)
!comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not"
?power: await_expr ("**" factor)?
?await_expr: AWAIT? atom_expr
AWAIT: "await"
?atom_expr: atom_expr "(" [arguments] ")" -> funccall
| atom_expr "[" subscriptlist "]" -> getitem
| atom_expr "." name -> getattr
| atom
?atom: "(" yield_expr ")"
| "(" _tuple_inner? ")" -> tuple
| "(" comprehension{test_or_star_expr} ")" -> tuple_comprehension
| "[" _exprlist? "]" -> list
| "[" comprehension{test_or_star_expr} "]" -> list_comprehension
| "{" _dict_exprlist? "}" -> dict
| "{" comprehension{key_value} "}" -> dict_comprehension
| "{" _exprlist "}" -> set
| "{" comprehension{test} "}" -> set_comprehension
| name -> var
| number
| string_concat
| "(" test ")"
| "..." -> ellipsis
| "None" -> const_none
| "True" -> const_true
| "False" -> const_false
?string_concat: string+
_tuple_inner: test_or_star_expr (("," test_or_star_expr)+ [","] | ",")
?test_or_star_expr: test
| star_expr
?subscriptlist: subscript
| subscript (("," subscript)+ [","] | ",") -> subscript_tuple
?subscript: test | ([test] ":" [test] [sliceop]) -> slice
sliceop: ":" [test]
?exprlist: (expr|star_expr)
| (expr|star_expr) (("," (expr|star_expr))+ [","]|",")
?testlist: test | testlist_tuple
testlist_tuple: test (("," test)+ [","] | ",")
_dict_exprlist: (key_value | "**" expr) ("," (key_value | "**" expr))* [","]
key_value: test ":" test
_exprlist: test_or_star_expr ("," test_or_star_expr)* [","]
classdef: "class" name ["(" [arguments] ")"] ":" suite
arguments: argvalue ("," argvalue)* ("," [ starargs | kwargs])?
| starargs
| kwargs
| comprehension{test}
starargs: stararg ("," stararg)* ("," argvalue)* ["," kwargs]
stararg: "*" test
kwargs: "**" test ("," argvalue)*
?argvalue: test ("=" test)?
comprehension{comp_result}: comp_result comp_fors [comp_if]
comp_fors: comp_for+
comp_for: [ASYNC] "for" exprlist "in" or_test
ASYNC: "async"
?comp_if: "if" test_nocond
// not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: name
yield_expr: "yield" [testlist]
| "yield" "from" test -> yield_from
number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER
string: STRING | LONG_STRING
// Other terminals
_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+
%ignore /[\t \f]+/ // WS
%ignore /\\[\t \f]*\r?\n/ // LINE_CONT
%ignore COMMENT
%declare _INDENT _DEDENT
// Python terminals
!name: NAME | "match" | "case"
NAME: /[^\W\d]\w*/
COMMENT: /#[^\n]*/
STRING: /([ubf]?r?|r[ubf])("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/i
LONG_STRING: /([ubf]?r?|r[ubf])(""".*?(?<!\\)(\\\\)*?"""|'''.*?(?<!\\)(\\\\)*?''')/is
_SPECIAL_DEC: "0".."9" ("_"? "0".."9" )*
DEC_NUMBER: "1".."9" ("_"? "0".."9" )*
| "0" ("_"? "0" )* /(?![1-9])/
HEX_NUMBER.2: "0" ("x" | "X") ("_"? ("0".."9" | "a".."f" | "A".."F"))+
OCT_NUMBER.2: "0" ("o" | "O") ("_"? "0".."7" )+
BIN_NUMBER.2: "0" ("b" | "B") ("_"? "0".."1" )+
_EXP: ("e"|"E") ["+" | "-"] _SPECIAL_DEC
DECIMAL: "." _SPECIAL_DEC | _SPECIAL_DEC "." _SPECIAL_DEC?
FLOAT_NUMBER.2: _SPECIAL_DEC _EXP | DECIMAL _EXP?
IMAG_NUMBER.2: (_SPECIAL_DEC | FLOAT_NUMBER) ("J" | "j")
// Comma-separated list (with an optional trailing comma)
cs_list{item}: item ("," item)* ","?
_cs_list{item}: item ("," item)* ","?
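A loading sketch for this grammar (editor's illustration, following the pattern of lark's python_parser example; the path, options, and sample input are assumptions):

from lark import Lark
from lark.indenter import PythonIndenter

python_parser = Lark.open_from_package(
    "lark", "python.lark", ["grammars"],
    parser="lalr", postlex=PythonIndenter(), start="file_input")
tree = python_parser.parse("x = 1\nprint(x)\n")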

View File: lark/grammars/unicode.lark

@@ -0,0 +1,7 @@
// TODO: LETTER, WORD, etc.
//
// Whitespace
//
WS_INLINE: /[ \t\xa0]/+
WS: /[ \t\xa0\f\r\n]/+

View File: lark/indenter.py

@@ -0,0 +1,143 @@
"Provides a post-lexer for implementing Python-style indentation."
from abc import ABC, abstractmethod
from typing import List, Iterator
from .exceptions import LarkError
from .lark import PostLex
from .lexer import Token
###{standalone
class DedentError(LarkError):
pass
class Indenter(PostLex, ABC):
"""This is a postlexer that "injects" indent/dedent tokens based on indentation.
It keeps track of the current indentation, as well as the current level of parentheses.
Inside parentheses, the indentation is ignored, and no indent/dedent tokens get generated.
Note: This is an abstract class. To use it, inherit and implement all its abstract methods:
- tab_len
- NL_type
- OPEN_PAREN_types, CLOSE_PAREN_types
- INDENT_type, DEDENT_type
See also: the ``postlex`` option in `Lark`.
"""
paren_level: int
indent_level: List[int]
def __init__(self) -> None:
self.paren_level = 0
self.indent_level = [0]
assert self.tab_len > 0
def handle_NL(self, token: Token) -> Iterator[Token]:
if self.paren_level > 0:
return
yield token
indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces
indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len
if indent > self.indent_level[-1]:
self.indent_level.append(indent)
yield Token.new_borrow_pos(self.INDENT_type, indent_str, token)
else:
while indent < self.indent_level[-1]:
self.indent_level.pop()
yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token)
if indent != self.indent_level[-1]:
raise DedentError('Unexpected dedent to column %s. Expected dedent to %s' % (indent, self.indent_level[-1]))
def _process(self, stream):
for token in stream:
if token.type == self.NL_type:
yield from self.handle_NL(token)
else:
yield token
if token.type in self.OPEN_PAREN_types:
self.paren_level += 1
elif token.type in self.CLOSE_PAREN_types:
self.paren_level -= 1
assert self.paren_level >= 0
while len(self.indent_level) > 1:
self.indent_level.pop()
yield Token(self.DEDENT_type, '')
assert self.indent_level == [0], self.indent_level
def process(self, stream):
self.paren_level = 0
self.indent_level = [0]
return self._process(stream)
# XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
@property
def always_accept(self):
return (self.NL_type,)
@property
@abstractmethod
def NL_type(self) -> str:
"The name of the newline token"
raise NotImplementedError()
@property
@abstractmethod
def OPEN_PAREN_types(self) -> List[str]:
"The names of the tokens that open a parenthesis"
raise NotImplementedError()
@property
@abstractmethod
def CLOSE_PAREN_types(self) -> List[str]:
"""The names of the tokens that close a parenthesis
"""
raise NotImplementedError()
@property
@abstractmethod
def INDENT_type(self) -> str:
"""The name of the token that starts an indentation in the grammar.
See also: %declare
"""
raise NotImplementedError()
@property
@abstractmethod
def DEDENT_type(self) -> str:
"""The name of the token that end an indentation in the grammar.
See also: %declare
"""
raise NotImplementedError()
@property
@abstractmethod
def tab_len(self) -> int:
"""How many spaces does a tab equal"""
raise NotImplementedError()
class PythonIndenter(Indenter):
"""A postlexer that "injects" _INDENT/_DEDENT tokens based on indentation, according to the Python syntax.
See also: the ``postlex`` option in `Lark`.
"""
NL_type = '_NEWLINE'
OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
INDENT_type = '_INDENT'
DEDENT_type = '_DEDENT'
tab_len = 8
###}
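The classic usage pattern (editor's sketch, adapted from lark's indented-tree example; the grammar is illustrative):

from lark import Lark
from lark.indenter import Indenter

class TreeIndenter(Indenter):
    NL_type = '_NL'
    OPEN_PAREN_types = []
    CLOSE_PAREN_types = []
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    tab_len = 8

parser = Lark(r"""
    ?start: _NL* tree
    tree: NAME _NL [_INDENT tree+ _DEDENT]
    NAME: /\w+/
    _NL: /(\r?\n[\t ]*)+/
    %declare _INDENT _DEDENT
""", parser='lalr', postlex=TreeIndenter())
print(parser.parse("a\n  b\n  c\n").pretty())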

View File: lark/lark.py

@@ -0,0 +1,658 @@
from abc import ABC, abstractmethod
import getpass
import sys, os, pickle
import tempfile
import types
import re
from typing import (
TypeVar, Type, List, Dict, Iterator, Callable, Union, Optional, Sequence,
Tuple, Iterable, IO, Any, TYPE_CHECKING, Collection
)
if TYPE_CHECKING:
from .parsers.lalr_interactive_parser import InteractiveParser
from .tree import ParseTree
from .visitors import Transformer
from typing import Literal
from .parser_frontends import ParsingFrontend
from .exceptions import ConfigurationError, assert_config, UnexpectedInput
from .utils import Serialize, SerializeMemoizer, FS, logger
from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest
from .tree import Tree
from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType
from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import _validate_frontend_args, _get_lexer_callbacks, _deserialize_parsing_frontend, _construct_parsing_frontend
from .grammar import Rule
try:
import regex
_has_regex = True
except ImportError:
_has_regex = False
###{standalone
class PostLex(ABC):
@abstractmethod
def process(self, stream: Iterator[Token]) -> Iterator[Token]:
return stream
always_accept: Iterable[str] = ()
class LarkOptions(Serialize):
"""Specifies the options for Lark
"""
start: List[str]
debug: bool
strict: bool
transformer: 'Optional[Transformer]'
propagate_positions: Union[bool, str]
maybe_placeholders: bool
cache: Union[bool, str]
regex: bool
g_regex_flags: int
keep_all_tokens: bool
tree_class: Optional[Callable[[str, List], Any]]
parser: _ParserArgType
lexer: _LexerArgType
ambiguity: 'Literal["auto", "resolve", "explicit", "forest"]'
postlex: Optional[PostLex]
priority: 'Optional[Literal["auto", "normal", "invert"]]'
lexer_callbacks: Dict[str, Callable[[Token], Token]]
use_bytes: bool
ordered_sets: bool
edit_terminals: Optional[Callable[[TerminalDef], TerminalDef]]
import_paths: 'List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]]'
source_path: Optional[str]
OPTIONS_DOC = r"""
**=== General Options ===**
start
The start symbol. Either a string, or a list of strings for multiple possible starts (Default: "start")
debug
Display debug information and extra warnings. Use only when debugging (Default: ``False``)
When used with Earley, it generates a forest graph as "sppf.png", if 'dot' is installed.
strict
Throw an exception on any potential ambiguity, including shift/reduce conflicts, and regex collisions.
transformer
Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster)
propagate_positions
Propagates positional attributes into the 'meta' attribute of all tree branches.
Sets attributes: (line, column, end_line, end_column, start_pos, end_pos,
container_line, container_column, container_end_line, container_end_column)
Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating.
maybe_placeholders
When ``True``, the ``[]`` operator returns ``None`` when not matched.
When ``False``, ``[]`` behaves like the ``?`` operator, and returns no value at all.
(default= ``True``)
cache
Cache the results of the Lark grammar analysis, for 2x to 3x faster loading. LALR only for now.
- When ``False``, does nothing (default)
- When ``True``, caches to a temporary file in the local directory
- When given a string, caches to the path pointed by the string
regex
When True, uses the ``regex`` module instead of the stdlib ``re``.
g_regex_flags
Flags that are applied to all terminals (both regex and strings)
keep_all_tokens
Prevent the tree builder from automagically removing "punctuation" tokens (Default: ``False``)
tree_class
Lark will produce trees comprised of instances of this class instead of the default ``lark.Tree``.
**=== Algorithm Options ===**
parser
Decides which parser engine to use. Accepts "earley" or "lalr". (Default: "earley").
(there is also a "cyk" option for legacy)
lexer
Decides whether or not to use a lexer stage
- "auto" (default): Choose for me based on the parser
- "basic": Use a basic lexer
- "contextual": Stronger lexer (only works with parser="lalr")
- "dynamic": Flexible and powerful (only with parser="earley")
- "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible.
ambiguity
Decides how to handle ambiguity in the parse. Only relevant if parser="earley"
- "resolve": The parser will automatically choose the simplest derivation
(it chooses consistently: greedy for tokens, non-greedy for rules)
- "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).
- "forest": The parser will return the root of the shared packed parse forest.
**=== Misc. / Domain Specific Options ===**
postlex
Lexer post-processing (Default: ``None``). Only works with the basic and contextual lexers.
priority
How priorities should be evaluated - "auto", ``None``, "normal", "invert" (Default: "auto")
lexer_callbacks
Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
use_bytes
Accept an input of type ``bytes`` instead of ``str``.
ordered_sets
Should Earley use ordered sets to achieve stable output (~10% slower than regular sets; default: ``True``)
edit_terminals
A callback for editing the terminals before parse.
import_paths
A list of either paths or loader functions that specify where grammars are imported from
source_path
Override the source from which the grammar was loaded. Useful for relative imports and unconventional grammar loading
**=== End of Options ===**
"""
if __doc__:
__doc__ += OPTIONS_DOC
# Adding a new option needs to be done in multiple places:
# - In the dictionary below. This is the primary truth of which options `Lark.__init__` accepts
# - In the docstring above. It is used both for the docstring of `LarkOptions` and `Lark`, and in readthedocs
# - As an attribute of `LarkOptions` above
# - Potentially in `_LOAD_ALLOWED_OPTIONS` below this class, when the option doesn't change how the grammar is loaded
# - Potentially in `lark.tools.__init__`, if it makes sense, and it can easily be passed as a cmd argument
_defaults: Dict[str, Any] = {
'debug': False,
'strict': False,
'keep_all_tokens': False,
'tree_class': None,
'cache': False,
'postlex': None,
'parser': 'earley',
'lexer': 'auto',
'transformer': None,
'start': 'start',
'priority': 'auto',
'ambiguity': 'auto',
'regex': False,
'propagate_positions': False,
'lexer_callbacks': {},
'maybe_placeholders': True,
'edit_terminals': None,
'g_regex_flags': 0,
'use_bytes': False,
'ordered_sets': True,
'import_paths': [],
'source_path': None,
'_plugins': {},
}
def __init__(self, options_dict: Dict[str, Any]) -> None:
o = dict(options_dict)
options = {}
for name, default in self._defaults.items():
if name in o:
value = o.pop(name)
if isinstance(default, bool) and name not in ('cache', 'use_bytes', 'propagate_positions'):
value = bool(value)
else:
value = default
options[name] = value
if isinstance(options['start'], str):
options['start'] = [options['start']]
self.__dict__['options'] = options
assert_config(self.parser, ('earley', 'lalr', 'cyk', None))
if self.parser == 'earley' and self.transformer:
raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. '
'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')
if o:
raise ConfigurationError("Unknown options: %s" % o.keys())
def __getattr__(self, name: str) -> Any:
try:
return self.__dict__['options'][name]
except KeyError as e:
raise AttributeError(e)
def __setattr__(self, name: str, value: str) -> None:
assert_config(name, self.options.keys(), "%r isn't a valid option. Expected one of: %s")
self.options[name] = value
def serialize(self, memo = None) -> Dict[str, Any]:
return self.options
@classmethod
def deserialize(cls, data: Dict[str, Any], memo: Dict[int, Union[TerminalDef, Rule]]) -> "LarkOptions":
return cls(data)
# Options that can be passed to the Lark parser, even when it was loaded from cache/standalone.
# These options are only used outside of `load_grammar`.
_LOAD_ALLOWED_OPTIONS = {'postlex', 'transformer', 'lexer_callbacks', 'use_bytes', 'debug', 'g_regex_flags', 'regex', 'propagate_positions', 'tree_class', '_plugins'}
_VALID_PRIORITY_OPTIONS = ('auto', 'normal', 'invert', None)
_VALID_AMBIGUITY_OPTIONS = ('auto', 'resolve', 'explicit', 'forest')
_T = TypeVar('_T', bound="Lark")
class Lark(Serialize):
"""Main interface for the library.
It's mostly a thin wrapper for the many different parsers, and for the tree constructor.
Parameters:
grammar: a string or file-object containing the grammar spec (using Lark's ebnf syntax)
options: a dictionary controlling various aspects of Lark.
Example:
>>> Lark(r'''start: "foo" ''')
Lark(...)
"""
source_path: str
source_grammar: str
grammar: 'Grammar'
options: LarkOptions
lexer: Lexer
parser: 'ParsingFrontend'
terminals: Collection[TerminalDef]
def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
self.options = LarkOptions(options)
re_module: types.ModuleType
# Set regex or re module
use_regex = self.options.regex
if use_regex:
if _has_regex:
re_module = regex
else:
raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
else:
re_module = re
# Some, but not all file-like objects have a 'name' attribute
if self.options.source_path is None:
try:
self.source_path = grammar.name # type: ignore[union-attr]
except AttributeError:
self.source_path = '<string>'
else:
self.source_path = self.options.source_path
# Drain file-like objects to get their contents
try:
read = grammar.read # type: ignore[union-attr]
except AttributeError:
pass
else:
grammar = read()
cache_fn = None
cache_sha256 = None
if isinstance(grammar, str):
self.source_grammar = grammar
if self.options.use_bytes:
if not grammar.isascii():
raise ConfigurationError("Grammar must be ascii only, when use_bytes=True")
if self.options.cache:
if self.options.parser != 'lalr':
raise ConfigurationError("cache only works with parser='lalr' for now")
unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals', '_plugins')
options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
from . import __version__
s = grammar + options_str + __version__ + str(sys.version_info[:2])
cache_sha256 = sha256_digest(s)
if isinstance(self.options.cache, str):
cache_fn = self.options.cache
else:
if self.options.cache is not True:
raise ConfigurationError("cache argument must be bool or str")
try:
username = getpass.getuser()
except Exception:
# The exception raised may be ImportError or OSError in
# the future. For the cache, we don't care about the
# specific reason - we just want a username.
username = "unknown"
cache_fn = tempfile.gettempdir() + "/.lark_cache_%s_%s_%s_%s.tmp" % (username, cache_sha256, *sys.version_info[:2])
old_options = self.options
try:
with FS.open(cache_fn, 'rb') as f:
logger.debug('Loading grammar from cache: %s', cache_fn)
# Remove options that aren't relevant for loading from cache
for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
del options[name]
file_sha256 = f.readline().rstrip(b'\n')
cached_used_files = pickle.load(f)
if file_sha256 == cache_sha256.encode('utf8') and verify_used_files(cached_used_files):
cached_parser_data = pickle.load(f)
self._load(cached_parser_data, **options)
return
except FileNotFoundError:
# The cache file doesn't exist; parse and compose the grammar as normal
pass
except Exception: # We should probably narrow down which errors we catch here.
logger.exception("Failed to load Lark from cache: %r. We will try to carry on.", cache_fn)
# In theory, the Lark instance might have been messed up by the call to `_load`.
# In practice the only relevant thing that might have been overwritten should be `options`
self.options = old_options
# Parse the grammar file and compose the grammars
self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
else:
assert isinstance(grammar, Grammar)
self.grammar = grammar
if self.options.lexer == 'auto':
if self.options.parser == 'lalr':
self.options.lexer = 'contextual'
elif self.options.parser == 'earley':
if self.options.postlex is not None:
logger.info("postlex can't be used with the dynamic lexer, so we use 'basic' instead. "
"Consider using lalr with contextual instead of earley")
self.options.lexer = 'basic'
else:
self.options.lexer = 'dynamic'
elif self.options.parser == 'cyk':
self.options.lexer = 'basic'
else:
assert False, self.options.parser
lexer = self.options.lexer
if isinstance(lexer, type):
assert issubclass(lexer, Lexer) # XXX Is this really important? Maybe just ensure interface compliance
else:
assert_config(lexer, ('basic', 'contextual', 'dynamic', 'dynamic_complete'))
if self.options.postlex is not None and 'dynamic' in lexer:
raise ConfigurationError("Can't use postlex with a dynamic lexer. Use basic or contextual instead")
if self.options.ambiguity == 'auto':
if self.options.parser == 'earley':
self.options.ambiguity = 'resolve'
else:
assert_config(self.options.parser, ('earley', 'cyk'), "%r doesn't support disambiguation. Use one of these parsers instead: %s")
if self.options.priority == 'auto':
self.options.priority = 'normal'
if self.options.priority not in _VALID_PRIORITY_OPTIONS:
raise ConfigurationError("invalid priority option: %r. Must be one of %r" % (self.options.priority, _VALID_PRIORITY_OPTIONS))
if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS:
raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))
if self.options.parser is None:
terminals_to_keep = '*'
elif self.options.postlex is not None:
terminals_to_keep = set(self.options.postlex.always_accept)
else:
terminals_to_keep = set()
# Compile the EBNF grammar into BNF
self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start, terminals_to_keep)
if self.options.edit_terminals:
for t in self.terminals:
self.options.edit_terminals(t)
self._terminals_dict = {t.name: t for t in self.terminals}
# If the user asked to invert the priorities, negate them all here.
if self.options.priority == 'invert':
for rule in self.rules:
if rule.options.priority is not None:
rule.options.priority = -rule.options.priority
for term in self.terminals:
term.priority = -term.priority
# Else, if the user asked to disable priorities, strip them from the
# rules and terminals. This allows the Earley parsers to skip an extra forest walk
# for improved performance, if you don't need them (or didn't specify any).
elif self.options.priority is None:
for rule in self.rules:
if rule.options.priority is not None:
rule.options.priority = None
for term in self.terminals:
term.priority = 0
# TODO Deprecate lexer_callbacks?
self.lexer_conf = LexerConf(
self.terminals, re_module, self.ignore_tokens, self.options.postlex,
self.options.lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes, strict=self.options.strict
)
if self.options.parser:
self.parser = self._build_parser()
elif lexer:
self.lexer = self._build_lexer()
if cache_fn:
logger.debug('Saving grammar to cache: %s', cache_fn)
try:
with FS.open(cache_fn, 'wb') as f:
assert cache_sha256 is not None
f.write(cache_sha256.encode('utf8') + b'\n')
pickle.dump(used_files, f)
self.save(f, _LOAD_ALLOWED_OPTIONS)
except IOError as e:
logger.exception("Failed to save Lark to cache: %r. Reason: %s", cache_fn, e)
if __doc__:
__doc__ += "\n\n" + LarkOptions.OPTIONS_DOC
__serialize_fields__ = 'parser', 'rules', 'options'
def _build_lexer(self, dont_ignore: bool=False) -> BasicLexer:
lexer_conf = self.lexer_conf
if dont_ignore:
from copy import copy
lexer_conf = copy(lexer_conf)
lexer_conf.ignore = ()
return BasicLexer(lexer_conf)
def _prepare_callbacks(self) -> None:
self._callbacks = {}
# we don't need these callbacks if we aren't building a tree
if self.options.ambiguity != 'forest':
self._parse_tree_builder = ParseTreeBuilder(
self.rules,
self.options.tree_class or Tree,
self.options.propagate_positions,
self.options.parser != 'lalr' and self.options.ambiguity == 'explicit',
self.options.maybe_placeholders
)
self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer)
self._callbacks.update(_get_lexer_callbacks(self.options.transformer, self.terminals))
def _build_parser(self) -> "ParsingFrontend":
self._prepare_callbacks()
_validate_frontend_args(self.options.parser, self.options.lexer)
parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
return _construct_parsing_frontend(
self.options.parser,
self.options.lexer,
self.lexer_conf,
parser_conf,
options=self.options
)
def save(self, f, exclude_options: Collection[str] = ()) -> None:
"""Saves the instance into the given file object
Useful for caching and multiprocessing.
"""
if self.options.parser != 'lalr':
raise NotImplementedError("Lark.save() is only implemented for the LALR(1) parser.")
data, m = self.memo_serialize([TerminalDef, Rule])
if exclude_options:
data["options"] = {n: v for n, v in data["options"].items() if n not in exclude_options}
pickle.dump({'data': data, 'memo': m}, f, protocol=pickle.HIGHEST_PROTOCOL)
@classmethod
def load(cls: Type[_T], f) -> _T:
"""Loads an instance from the given file object
Useful for caching and multiprocessing.
"""
inst = cls.__new__(cls)
return inst._load(f)
def _deserialize_lexer_conf(self, data: Dict[str, Any], memo: Dict[int, Union[TerminalDef, Rule]], options: LarkOptions) -> LexerConf:
lexer_conf = LexerConf.deserialize(data['lexer_conf'], memo)
lexer_conf.callbacks = options.lexer_callbacks or {}
lexer_conf.re_module = regex if options.regex else re
lexer_conf.use_bytes = options.use_bytes
lexer_conf.g_regex_flags = options.g_regex_flags
lexer_conf.skip_validation = True
lexer_conf.postlex = options.postlex
return lexer_conf
def _load(self: _T, f: Any, **kwargs) -> _T:
if isinstance(f, dict):
d = f
else:
d = pickle.load(f)
memo_json = d['memo']
data = d['data']
assert memo_json
memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
options = dict(data['options'])
if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults):
raise ConfigurationError("Some options are not allowed when loading a Parser: {}"
.format(set(kwargs) - _LOAD_ALLOWED_OPTIONS))
options.update(kwargs)
self.options = LarkOptions.deserialize(options, memo)
self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
self.source_path = '<deserialized>'
_validate_frontend_args(self.options.parser, self.options.lexer)
self.lexer_conf = self._deserialize_lexer_conf(data['parser'], memo, self.options)
self.terminals = self.lexer_conf.terminals
self._prepare_callbacks()
self._terminals_dict = {t.name: t for t in self.terminals}
self.parser = _deserialize_parsing_frontend(
data['parser'],
memo,
self.lexer_conf,
self._callbacks,
self.options, # Not all, but multiple attributes are used
)
return self
@classmethod
def _load_from_dict(cls, data, memo, **kwargs):
inst = cls.__new__(cls)
return inst._load({'data': data, 'memo': memo}, **kwargs)
@classmethod
def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str]=None, **options) -> _T:
"""Create an instance of Lark with the grammar given by its filename
If ``rel_to`` is provided, the function will find the grammar filename in relation to it.
Example:
>>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr")
Lark(...)
"""
if rel_to:
basepath = os.path.dirname(rel_to)
grammar_filename = os.path.join(basepath, grammar_filename)
with open(grammar_filename, encoding='utf8') as f:
return cls(f, **options)
@classmethod
def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: 'Sequence[str]'=[""], **options) -> _T:
"""Create an instance of Lark with the grammar loaded from within the package `package`.
This allows grammar loading from zipapps.
Imports in the grammar will use the `package` and `search_paths` provided, through `FromPackageLoader`
Example:
Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
"""
package_loader = FromPackageLoader(package, search_paths)
full_path, text = package_loader(None, grammar_path)
options.setdefault('source_path', full_path)
options.setdefault('import_paths', [])
options['import_paths'].append(package_loader)
return cls(text, **options)
def __repr__(self):
return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)
def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
"""Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic'
When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.
:raises UnexpectedCharacters: In case the lexer cannot find a suitable match.
"""
lexer: Lexer
if not hasattr(self, 'lexer') or dont_ignore:
lexer = self._build_lexer(dont_ignore)
else:
lexer = self.lexer
lexer_thread = LexerThread.from_text(lexer, text)
stream = lexer_thread.lex(None)
if self.options.postlex:
return self.options.postlex.process(stream)
return stream
def get_terminal(self, name: str) -> TerminalDef:
"""Get information about a terminal"""
return self._terminals_dict[name]
def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) -> 'InteractiveParser':
"""Start an interactive parsing session.
Parameters:
text (str, optional): Text to be parsed. Required for ``resume_parse()``.
start (str, optional): Start symbol
Returns:
A new InteractiveParser instance.
See Also: ``Lark.parse()``
"""
return self.parser.parse_interactive(text, start=start)
def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
"""Parse the given text, according to the options provided.
Parameters:
text (str): Text to be parsed.
start (str, optional): Required if Lark was given multiple possible start symbols (using the start option).
on_error (function, optional): if provided, will be called on UnexpectedToken error. Return true to resume parsing.
LALR only. See examples/advanced/error_handling.py for an example of how to use on_error.
Returns:
If a transformer is supplied to ``__init__``, returns whatever is the
result of the transformation. Otherwise, returns a Tree instance.
:raises UnexpectedInput: On a parse error, one of these sub-exceptions will be raised:
``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``.
For convenience, these sub-exceptions also inherit from ``ParseError`` and ``LexError``.
"""
return self.parser.parse(text, start=start, on_error=on_error)
###}
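An end-to-end sketch of the ``Lark`` API above (editor's illustration, mirroring the project's hello-world example):

from lark import Lark

parser = Lark(r"""
    start: WORD "," WORD "!"
    %import common.WORD
    %ignore " "
""", parser="lalr")
print(parser.parse("Hello, World!").pretty())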

View File: lark/lexer.py

@@ -0,0 +1,678 @@
# Lexer Implementation
from abc import abstractmethod, ABC
import re
from contextlib import suppress
from typing import (
TypeVar, Type, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any,
ClassVar, TYPE_CHECKING, overload
)
from types import ModuleType
import warnings
try:
import interegular
except ImportError:
pass
if TYPE_CHECKING:
from .common import LexerConf
from .parsers.lalr_parser_state import ParserState
from .utils import classify, get_regexp_width, Serialize, logger
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
from .grammar import TOKEN_DEFAULT_PRIORITY
###{standalone
from copy import copy
try: # For the standalone parser, we need to make sure that has_interegular is False to avoid NameErrors later on
has_interegular = bool(interegular)
except NameError:
has_interegular = False
class Pattern(Serialize, ABC):
"An abstraction over regular expressions."
value: str
flags: Collection[str]
raw: Optional[str]
type: ClassVar[str]
def __init__(self, value: str, flags: Collection[str] = (), raw: Optional[str] = None) -> None:
self.value = value
self.flags = frozenset(flags)
self.raw = raw
def __repr__(self):
return repr(self.to_regexp())
# Pattern Hashing assumes all subclasses have a different priority!
def __hash__(self):
return hash((type(self), self.value, self.flags))
def __eq__(self, other):
return type(self) == type(other) and self.value == other.value and self.flags == other.flags
@abstractmethod
def to_regexp(self) -> str:
raise NotImplementedError()
@property
@abstractmethod
def min_width(self) -> int:
raise NotImplementedError()
@property
@abstractmethod
def max_width(self) -> int:
raise NotImplementedError()
def _get_flags(self, value):
for f in self.flags:
value = ('(?%s:%s)' % (f, value))
return value
class PatternStr(Pattern):
__serialize_fields__ = 'value', 'flags', 'raw'
type: ClassVar[str] = "str"
def to_regexp(self) -> str:
return self._get_flags(re.escape(self.value))
@property
def min_width(self) -> int:
return len(self.value)
@property
def max_width(self) -> int:
return len(self.value)
class PatternRE(Pattern):
__serialize_fields__ = 'value', 'flags', 'raw', '_width'
type: ClassVar[str] = "re"
def to_regexp(self) -> str:
return self._get_flags(self.value)
_width = None
def _get_width(self):
if self._width is None:
self._width = get_regexp_width(self.to_regexp())
return self._width
@property
def min_width(self) -> int:
return self._get_width()[0]
@property
def max_width(self) -> int:
return self._get_width()[1]
class TerminalDef(Serialize):
"A definition of a terminal"
__serialize_fields__ = 'name', 'pattern', 'priority'
__serialize_namespace__ = PatternStr, PatternRE
name: str
pattern: Pattern
priority: int
def __init__(self, name: str, pattern: Pattern, priority: int = TOKEN_DEFAULT_PRIORITY) -> None:
assert isinstance(pattern, Pattern), pattern
self.name = name
self.pattern = pattern
self.priority = priority
def __repr__(self):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
def user_repr(self) -> str:
if self.name.startswith('__'): # We represent a generated terminal
return self.pattern.raw or self.name
else:
return self.name
_T = TypeVar('_T', bound="Token")
class Token(str):
"""A string with meta-information, that is produced by the lexer.
    When parsing text, the resulting chunks of the input that haven't been discarded
    will end up in the tree as Token instances. The Token class inherits from Python's ``str``,
so normal string comparisons and operations will work as expected.
Attributes:
type: Name of the token (as specified in grammar)
value: Value of the token (redundant, as ``token.value == token`` will always be true)
start_pos: The index of the token in the text
line: The line of the token in the text (starting with 1)
column: The column of the token in the text (starting with 1)
end_line: The line where the token ends
end_column: The next column after the end of the token. For example,
if the token is a single character with a column value of 4,
end_column will be 5.
end_pos: the index where the token ends (basically ``start_pos + len(token)``)
"""
__slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')
__match_args__ = ('type', 'value')
type: str
start_pos: Optional[int]
value: Any
line: Optional[int]
column: Optional[int]
end_line: Optional[int]
end_column: Optional[int]
end_pos: Optional[int]
@overload
def __new__(
cls,
type: str,
value: Any,
start_pos: Optional[int] = None,
line: Optional[int] = None,
column: Optional[int] = None,
end_line: Optional[int] = None,
end_column: Optional[int] = None,
end_pos: Optional[int] = None
) -> 'Token':
...
@overload
def __new__(
cls,
type_: str,
value: Any,
start_pos: Optional[int] = None,
line: Optional[int] = None,
column: Optional[int] = None,
end_line: Optional[int] = None,
end_column: Optional[int] = None,
end_pos: Optional[int] = None
) -> 'Token': ...
def __new__(cls, *args, **kwargs):
if "type_" in kwargs:
            warnings.warn("`type_` is deprecated; use `type` instead", DeprecationWarning)
if "type" in kwargs:
raise TypeError("Error: using both 'type' and the deprecated 'type_' as arguments.")
kwargs["type"] = kwargs.pop("type_")
return cls._future_new(*args, **kwargs)
@classmethod
def _future_new(cls, type, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None):
inst = super(Token, cls).__new__(cls, value)
inst.type = type
inst.start_pos = start_pos
inst.value = value
inst.line = line
inst.column = column
inst.end_line = end_line
inst.end_column = end_column
inst.end_pos = end_pos
return inst
@overload
def update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
...
@overload
def update(self, type_: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
...
def update(self, *args, **kwargs):
if "type_" in kwargs:
            warnings.warn("`type_` is deprecated; use `type` instead", DeprecationWarning)
if "type" in kwargs:
raise TypeError("Error: using both 'type' and the deprecated 'type_' as arguments.")
kwargs["type"] = kwargs.pop("type_")
return self._future_update(*args, **kwargs)
def _future_update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
return Token.new_borrow_pos(
type if type is not None else self.type,
value if value is not None else self.value,
self
)
@classmethod
def new_borrow_pos(cls: Type[_T], type_: str, value: Any, borrow_t: 'Token') -> _T:
return cls(type_, value, borrow_t.start_pos, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos)
def __reduce__(self):
return (self.__class__, (self.type, self.value, self.start_pos, self.line, self.column))
def __repr__(self):
return 'Token(%r, %r)' % (self.type, self.value)
def __deepcopy__(self, memo):
return Token(self.type, self.value, self.start_pos, self.line, self.column)
def __eq__(self, other):
if isinstance(other, Token) and self.type != other.type:
return False
return str.__eq__(self, other)
__hash__ = str.__hash__
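# Example (sketch, not part of the library): Token equality and update()
#   tok = Token('NAME', 'foo', start_pos=0, line=1, column=1)
#   tok == 'foo'                       # True -- plain string comparison still works
#   tok.update(value='bar') == 'bar'   # True -- a new Token, positions borrowed from `tok`
#   Token('OTHER', 'foo') == tok       # False -- same value, but different type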
class LineCounter:
"A utility class for keeping track of line & column information"
__slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char'
def __init__(self, newline_char):
self.newline_char = newline_char
self.char_pos = 0
self.line = 1
self.column = 1
self.line_start_pos = 0
def __eq__(self, other):
if not isinstance(other, LineCounter):
return NotImplemented
return self.char_pos == other.char_pos and self.newline_char == other.newline_char
def feed(self, token: Token, test_newline=True):
"""Consume a token and calculate the new line & column.
As an optional optimization, set test_newline=False if token doesn't contain a newline.
"""
if test_newline:
newlines = token.count(self.newline_char)
if newlines:
self.line += newlines
self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
self.char_pos += len(token)
self.column = self.char_pos - self.line_start_pos + 1
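# Example (sketch, not part of the library): feeding text through a LineCounter
#   lc = LineCounter('\n')
#   lc.feed('ab\ncd')                               # Token inherits from str, so any str works here
#   (lc.line, lc.column, lc.char_pos) == (2, 3, 5)  # True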
class UnlessCallback:
def __init__(self, scanner):
self.scanner = scanner
def __call__(self, t):
res = self.scanner.match(t.value, 0)
if res:
_value, t.type = res
return t
class CallChain:
def __init__(self, callback1, callback2, cond):
self.callback1 = callback1
self.callback2 = callback2
self.cond = cond
def __call__(self, t):
t2 = self.callback1(t)
return self.callback2(t) if self.cond(t2) else t2
def _get_match(re_, regexp, s, flags):
m = re_.match(regexp, s, flags)
if m:
return m.group(0)
def _create_unless(terminals, g_regex_flags, re_, use_bytes):
tokens_by_type = classify(terminals, lambda t: type(t.pattern))
assert len(tokens_by_type) <= 2, tokens_by_type.keys()
embedded_strs = set()
callback = {}
for retok in tokens_by_type.get(PatternRE, []):
unless = []
for strtok in tokens_by_type.get(PatternStr, []):
if strtok.priority != retok.priority:
continue
s = strtok.pattern.value
if s == _get_match(re_, retok.pattern.to_regexp(), s, g_regex_flags):
unless.append(strtok)
if strtok.pattern.flags <= retok.pattern.flags:
embedded_strs.add(strtok)
if unless:
callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))
new_terminals = [t for t in terminals if t not in embedded_strs]
return new_terminals, callback
class Scanner:
def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):
self.terminals = terminals
self.g_regex_flags = g_regex_flags
self.re_ = re_
self.use_bytes = use_bytes
self.match_whole = match_whole
self.allowed_types = {t.name for t in self.terminals}
self._mres = self._build_mres(terminals, len(terminals))
def _build_mres(self, terminals, max_size):
# Python sets an unreasonable group limit (currently 100) in its re module
# Worse, the only way to know we reached it is by catching an AssertionError!
# This function recursively tries less and less groups until it's successful.
postfix = '$' if self.match_whole else ''
mres = []
while terminals:
pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
if self.use_bytes:
pattern = pattern.encode('latin-1')
try:
mre = self.re_.compile(pattern, self.g_regex_flags)
except AssertionError: # Yes, this is what Python provides us.. :/
return self._build_mres(terminals, max_size // 2)
mres.append(mre)
terminals = terminals[max_size:]
return mres
def match(self, text, pos):
for mre in self._mres:
m = mre.match(text, pos)
if m:
return m.group(0), m.lastgroup
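# Example (sketch, not part of the library): driving a Scanner directly
#   import re
#   scanner = Scanner([TerminalDef('INT', PatternRE('[0-9]+'))], 0, re, use_bytes=False)
#   scanner.match('42 rest', 0)   # -> ('42', 'INT'); returns None when nothing matches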
def _regexp_has_newline(r: str):
r"""Expressions that may indicate newlines in a regexp:
- newlines (\n)
- escaped newline (\\n)
- anything but ([^...])
- any-char (.) when the flag (?s) exists
- spaces (\s)
"""
return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)
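# Examples (sketch):
#   _regexp_has_newline(r'\n')      # True  -- escaped newline
#   _regexp_has_newline(r'[^"]*')   # True  -- a negated class may match a newline
#   _regexp_has_newline(r'[a-z]+')  # False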
class LexerState:
"""Represents the current state of the lexer as it scans the text
(Lexer objects are only instantiated per grammar, not per text)
"""
__slots__ = 'text', 'line_ctr', 'last_token'
text: str
line_ctr: LineCounter
last_token: Optional[Token]
def __init__(self, text: str, line_ctr: Optional[LineCounter]=None, last_token: Optional[Token]=None):
self.text = text
self.line_ctr = line_ctr or LineCounter(b'\n' if isinstance(text, bytes) else '\n')
self.last_token = last_token
def __eq__(self, other):
if not isinstance(other, LexerState):
return NotImplemented
return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token
def __copy__(self):
return type(self)(self.text, copy(self.line_ctr), self.last_token)
class LexerThread:
"""A thread that ties a lexer instance and a lexer state, to be used by the parser
"""
def __init__(self, lexer: 'Lexer', lexer_state: LexerState):
self.lexer = lexer
self.state = lexer_state
@classmethod
def from_text(cls, lexer: 'Lexer', text: str) -> 'LexerThread':
return cls(lexer, LexerState(text))
def lex(self, parser_state):
return self.lexer.lex(self.state, parser_state)
def __copy__(self):
return type(self)(self.lexer, copy(self.state))
_Token = Token
_Callback = Callable[[Token], Token]
class Lexer(ABC):
"""Lexer interface
Method Signatures:
lex(self, lexer_state, parser_state) -> Iterator[Token]
"""
@abstractmethod
def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
return NotImplemented
def make_lexer_state(self, text):
"Deprecated"
return LexerState(text)
def _check_regex_collisions(terminal_to_regexp: Dict[TerminalDef, str], comparator, strict_mode, max_collisions_to_show=8):
if not comparator:
comparator = interegular.Comparator.from_regexes(terminal_to_regexp)
# When in strict mode, we only ever try to provide one example, so taking
# a long time for that should be fine
max_time = 2 if strict_mode else 0.2
# We don't want to show too many collisions.
if comparator.count_marked_pairs() >= max_collisions_to_show:
return
for group in classify(terminal_to_regexp, lambda t: t.priority).values():
for a, b in comparator.check(group, skip_marked=True):
assert a.priority == b.priority
# Mark this pair to not repeat warnings when multiple different BasicLexers see the same collision
comparator.mark(a, b)
# Notify the user
message = f"Collision between Terminals {a.name} and {b.name}. "
try:
example = comparator.get_example_overlap(a, b, max_time).format_multiline()
except ValueError:
# Couldn't find an example within max_time steps.
example = "No example could be found fast enough. However, the collision does still exists"
if strict_mode:
raise LexError(f"{message}\n{example}")
logger.warning("%s The lexer will choose between them arbitrarily.\n%s", message, example)
if comparator.count_marked_pairs() >= max_collisions_to_show:
logger.warning("Found 8 regex collisions, will not check for more.")
return
class AbstractBasicLexer(Lexer):
terminals_by_name: Dict[str, TerminalDef]
@abstractmethod
def __init__(self, conf: 'LexerConf', comparator=None) -> None:
...
@abstractmethod
def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
...
def lex(self, state: LexerState, parser_state: Any) -> Iterator[Token]:
with suppress(EOFError):
while True:
yield self.next_token(state, parser_state)
class BasicLexer(AbstractBasicLexer):
terminals: Collection[TerminalDef]
ignore_types: FrozenSet[str]
newline_types: FrozenSet[str]
user_callbacks: Dict[str, _Callback]
callback: Dict[str, _Callback]
re: ModuleType
def __init__(self, conf: 'LexerConf', comparator=None) -> None:
terminals = list(conf.terminals)
assert all(isinstance(t, TerminalDef) for t in terminals), terminals
self.re = conf.re_module
if not conf.skip_validation:
# Sanitization
terminal_to_regexp = {}
for t in terminals:
regexp = t.pattern.to_regexp()
try:
self.re.compile(regexp, conf.g_regex_flags)
except self.re.error:
raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
if t.pattern.min_width == 0:
raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))
if t.pattern.type == "re":
terminal_to_regexp[t] = regexp
if not (set(conf.ignore) <= {t.name for t in terminals}):
raise LexError("Ignore terminals are not defined: %s" % (set(conf.ignore) - {t.name for t in terminals}))
if has_interegular:
_check_regex_collisions(terminal_to_regexp, comparator, conf.strict)
elif conf.strict:
raise LexError("interegular must be installed for strict mode. Use `pip install 'lark[interegular]'`.")
# Init
self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp()))
self.ignore_types = frozenset(conf.ignore)
terminals.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
self.terminals = terminals
self.user_callbacks = conf.callbacks
self.g_regex_flags = conf.g_regex_flags
self.use_bytes = conf.use_bytes
self.terminals_by_name = conf.terminals_by_name
self._scanner = None
def _build_scanner(self):
terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
assert all(self.callback.values())
for type_, f in self.user_callbacks.items():
if type_ in self.callback:
# Already a callback there, probably UnlessCallback
self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_)
else:
self.callback[type_] = f
self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)
@property
def scanner(self):
if self._scanner is None:
self._build_scanner()
return self._scanner
def match(self, text, pos):
return self.scanner.match(text, pos)
def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
line_ctr = lex_state.line_ctr
while line_ctr.char_pos < len(lex_state.text):
res = self.match(lex_state.text, line_ctr.char_pos)
if not res:
allowed = self.scanner.allowed_types - self.ignore_types
if not allowed:
allowed = {"<END-OF-FILE>"}
raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
state=parser_state, terminals_by_name=self.terminals_by_name)
value, type_ = res
ignored = type_ in self.ignore_types
t = None
if not ignored or type_ in self.callback:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
line_ctr.feed(value, type_ in self.newline_types)
if t is not None:
t.end_line = line_ctr.line
t.end_column = line_ctr.column
t.end_pos = line_ctr.char_pos
if t.type in self.callback:
t = self.callback[t.type](t)
if not ignored:
if not isinstance(t, Token):
raise LexError("Callbacks must return a token (returned %r)" % t)
lex_state.last_token = t
return t
# EOF
raise EOFError(self)
class ContextualLexer(Lexer):
lexers: Dict[int, AbstractBasicLexer]
root_lexer: AbstractBasicLexer
BasicLexer: Type[AbstractBasicLexer] = BasicLexer
def __init__(self, conf: 'LexerConf', states: Dict[int, Collection[str]], always_accept: Collection[str]=()) -> None:
terminals = list(conf.terminals)
terminals_by_name = conf.terminals_by_name
trad_conf = copy(conf)
trad_conf.terminals = terminals
if has_interegular and not conf.skip_validation:
comparator = interegular.Comparator.from_regexes({t: t.pattern.to_regexp() for t in terminals})
else:
comparator = None
lexer_by_tokens: Dict[FrozenSet[str], AbstractBasicLexer] = {}
self.lexers = {}
for state, accepts in states.items():
key = frozenset(accepts)
try:
lexer = lexer_by_tokens[key]
except KeyError:
accepts = set(accepts) | set(conf.ignore) | set(always_accept)
lexer_conf = copy(trad_conf)
lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name]
lexer = self.BasicLexer(lexer_conf, comparator)
lexer_by_tokens[key] = lexer
self.lexers[state] = lexer
assert trad_conf.terminals is terminals
trad_conf.skip_validation = True # We don't need to verify all terminals again
self.root_lexer = self.BasicLexer(trad_conf, comparator)
def lex(self, lexer_state: LexerState, parser_state: 'ParserState') -> Iterator[Token]:
try:
while True:
lexer = self.lexers[parser_state.position]
yield lexer.next_token(lexer_state, parser_state)
except EOFError:
pass
except UnexpectedCharacters as e:
# In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
# This tests the input against the global context, to provide a nicer error.
try:
last_token = lexer_state.last_token # Save last_token. Calling root_lexer.next_token will change this to the wrong token
token = self.root_lexer.next_token(lexer_state, parser_state)
raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_name)
except UnexpectedCharacters:
raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set.
###}

File diff suppressed because it is too large

View File

@@ -0,0 +1,391 @@
"""Provides functions for the automatic building and shaping of the parse-tree."""
from typing import List
from .exceptions import GrammarError, ConfigurationError
from .lexer import Token
from .tree import Tree
from .visitors import Transformer_InPlace
from .visitors import _vargs_meta, _vargs_meta_inline
###{standalone
from functools import partial, wraps
from itertools import product
class ExpandSingleChild:
def __init__(self, node_builder):
self.node_builder = node_builder
def __call__(self, children):
if len(children) == 1:
return children[0]
else:
return self.node_builder(children)
class PropagatePositions:
def __init__(self, node_builder, node_filter=None):
self.node_builder = node_builder
self.node_filter = node_filter
def __call__(self, children):
res = self.node_builder(children)
if isinstance(res, Tree):
# Calculate positions while the tree is streaming, according to the rule:
# - nodes start at the start of their first child's container,
# and end at the end of their last child's container.
# Containers are nodes that take up space in text, but have been inlined in the tree.
res_meta = res.meta
first_meta = self._pp_get_meta(children)
if first_meta is not None:
                if not hasattr(res_meta, 'line'):
                    # Set positions only if they weren't already set, e.g. by a rule that has been inlined (`?rule`)
res_meta.line = getattr(first_meta, 'container_line', first_meta.line)
res_meta.column = getattr(first_meta, 'container_column', first_meta.column)
res_meta.start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos)
res_meta.empty = False
res_meta.container_line = getattr(first_meta, 'container_line', first_meta.line)
res_meta.container_column = getattr(first_meta, 'container_column', first_meta.column)
res_meta.container_start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos)
last_meta = self._pp_get_meta(reversed(children))
if last_meta is not None:
if not hasattr(res_meta, 'end_line'):
res_meta.end_line = getattr(last_meta, 'container_end_line', last_meta.end_line)
res_meta.end_column = getattr(last_meta, 'container_end_column', last_meta.end_column)
res_meta.end_pos = getattr(last_meta, 'container_end_pos', last_meta.end_pos)
res_meta.empty = False
res_meta.container_end_line = getattr(last_meta, 'container_end_line', last_meta.end_line)
res_meta.container_end_column = getattr(last_meta, 'container_end_column', last_meta.end_column)
res_meta.container_end_pos = getattr(last_meta, 'container_end_pos', last_meta.end_pos)
return res
def _pp_get_meta(self, children):
for c in children:
if self.node_filter is not None and not self.node_filter(c):
continue
if isinstance(c, Tree):
if not c.meta.empty:
return c.meta
elif isinstance(c, Token):
return c
elif hasattr(c, '__lark_meta__'):
return c.__lark_meta__()
def make_propagate_positions(option):
if callable(option):
return partial(PropagatePositions, node_filter=option)
elif option is True:
return PropagatePositions
elif option is False:
return None
raise ConfigurationError('Invalid option for propagate_positions: %r' % option)
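# Usage sketch (public API; illustrative): with propagate_positions enabled,
# the wrappers above fill in tree.meta while the tree is being built:
#   parser = Lark(grammar, propagate_positions=True)
#   tree = parser.parse(text)
#   tree.meta.line, tree.meta.end_line   # start/end positions computed by PropagatePositions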
class ChildFilter:
def __init__(self, to_include, append_none, node_builder):
self.node_builder = node_builder
self.to_include = to_include
self.append_none = append_none
def __call__(self, children):
filtered = []
for i, to_expand, add_none in self.to_include:
if add_none:
filtered += [None] * add_none
if to_expand:
filtered += children[i].children
else:
filtered.append(children[i])
if self.append_none:
filtered += [None] * self.append_none
return self.node_builder(filtered)
class ChildFilterLALR(ChildFilter):
"""Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"""
def __call__(self, children):
filtered = []
for i, to_expand, add_none in self.to_include:
if add_none:
filtered += [None] * add_none
if to_expand:
if filtered:
filtered += children[i].children
else: # Optimize for left-recursion
filtered = children[i].children
else:
filtered.append(children[i])
if self.append_none:
filtered += [None] * self.append_none
return self.node_builder(filtered)
class ChildFilterLALR_NoPlaceholders(ChildFilter):
"Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"
def __init__(self, to_include, node_builder):
self.node_builder = node_builder
self.to_include = to_include
def __call__(self, children):
filtered = []
for i, to_expand in self.to_include:
if to_expand:
if filtered:
filtered += children[i].children
else: # Optimize for left-recursion
filtered = children[i].children
else:
filtered.append(children[i])
return self.node_builder(filtered)
def _should_expand(sym):
return not sym.is_term and sym.name.startswith('_')
def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indices: List[bool]):
# Prepare empty_indices as: How many Nones to insert at each index?
if _empty_indices:
assert _empty_indices.count(False) == len(expansion)
s = ''.join(str(int(b)) for b in _empty_indices)
empty_indices = [len(ones) for ones in s.split('0')]
assert len(empty_indices) == len(expansion)+1, (empty_indices, len(expansion))
else:
empty_indices = [0] * (len(expansion)+1)
to_include = []
nones_to_add = 0
for i, sym in enumerate(expansion):
nones_to_add += empty_indices[i]
if keep_all_tokens or not (sym.is_term and sym.filter_out):
to_include.append((i, _should_expand(sym), nones_to_add))
nones_to_add = 0
nones_to_add += empty_indices[len(expansion)]
if _empty_indices or len(to_include) < len(expansion) or any(to_expand for i, to_expand,_ in to_include):
if _empty_indices or ambiguous:
return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include, nones_to_add)
else:
# LALR without placeholders
return partial(ChildFilterLALR_NoPlaceholders, [(i, x) for i,x,_ in to_include])
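# Worked example (sketch): for expansion [A, B] and _empty_indices = [True, False, False, True],
# s = '1001' and empty_indices = [1, 0, 1]: insert one None before A and one None after B.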
class AmbiguousExpander:
"""Deal with the case where we're expanding children ('_rule') into a parent but the children
are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself
ambiguous with as many copies as there are ambiguous children, and then copy the ambiguous children
into the right parents in the right places, essentially shifting the ambiguity up the tree."""
def __init__(self, to_expand, tree_class, node_builder):
self.node_builder = node_builder
self.tree_class = tree_class
self.to_expand = to_expand
def __call__(self, children):
def _is_ambig_tree(t):
return hasattr(t, 'data') and t.data == '_ambig'
        # -- When we're repeatedly expanding ambiguities we can end up with nested ambiguities.
        #    All children of an _ambig node should be a derivation of that _ambig node, hence
        #    an _ambig node nested within an _ambig node can simply be expanded into the
        #    parent _ambig node as an alternative derivation.
ambiguous = []
for i, child in enumerate(children):
if _is_ambig_tree(child):
if i in self.to_expand:
ambiguous.append(i)
child.expand_kids_by_data('_ambig')
if not ambiguous:
return self.node_builder(children)
expand = [child.children if i in ambiguous else (child,) for i, child in enumerate(children)]
return self.tree_class('_ambig', [self.node_builder(list(f)) for f in product(*expand)])
def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens):
to_expand = [i for i, sym in enumerate(expansion)
if keep_all_tokens or ((not (sym.is_term and sym.filter_out)) and _should_expand(sym))]
if to_expand:
return partial(AmbiguousExpander, to_expand, tree_class)
class AmbiguousIntermediateExpander:
"""
Propagate ambiguous intermediate nodes and their derivations up to the
current rule.
In general, converts
rule
_iambig
_inter
someChildren1
...
_inter
someChildren2
...
someChildren3
...
to
_ambig
rule
someChildren1
...
someChildren3
...
rule
someChildren2
...
someChildren3
...
rule
childrenFromNestedIambigs
...
someChildren3
...
...
propagating up any nested '_iambig' nodes along the way.
"""
def __init__(self, tree_class, node_builder):
self.node_builder = node_builder
self.tree_class = tree_class
def __call__(self, children):
def _is_iambig_tree(child):
return hasattr(child, 'data') and child.data == '_iambig'
def _collapse_iambig(children):
"""
Recursively flatten the derivations of the parent of an '_iambig'
node. Returns a list of '_inter' nodes guaranteed not
to contain any nested '_iambig' nodes, or None if children does
not contain an '_iambig' node.
"""
# Due to the structure of the SPPF,
# an '_iambig' node can only appear as the first child
if children and _is_iambig_tree(children[0]):
iambig_node = children[0]
result = []
for grandchild in iambig_node.children:
collapsed = _collapse_iambig(grandchild.children)
if collapsed:
for child in collapsed:
child.children += children[1:]
result += collapsed
else:
new_tree = self.tree_class('_inter', grandchild.children + children[1:])
result.append(new_tree)
return result
collapsed = _collapse_iambig(children)
if collapsed:
processed_nodes = [self.node_builder(c.children) for c in collapsed]
return self.tree_class('_ambig', processed_nodes)
return self.node_builder(children)
def inplace_transformer(func):
@wraps(func)
def f(children):
# function name in a Transformer is a rule name.
tree = Tree(func.__name__, children)
return func(tree)
return f
def apply_visit_wrapper(func, name, wrapper):
if wrapper is _vargs_meta or wrapper is _vargs_meta_inline:
raise NotImplementedError("Meta args not supported for internal transformer")
@wraps(func)
def f(children):
return wrapper(func, name, children, None)
return f
class ParseTreeBuilder:
def __init__(self, rules, tree_class, propagate_positions=False, ambiguous=False, maybe_placeholders=False):
self.tree_class = tree_class
self.propagate_positions = propagate_positions
self.ambiguous = ambiguous
self.maybe_placeholders = maybe_placeholders
self.rule_builders = list(self._init_builders(rules))
def _init_builders(self, rules):
propagate_positions = make_propagate_positions(self.propagate_positions)
for rule in rules:
options = rule.options
keep_all_tokens = options.keep_all_tokens
expand_single_child = options.expand1
wrapper_chain = list(filter(None, [
(expand_single_child and not rule.alias) and ExpandSingleChild,
maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders else None),
propagate_positions,
self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens),
self.ambiguous and partial(AmbiguousIntermediateExpander, self.tree_class)
]))
yield rule, wrapper_chain
def create_callback(self, transformer=None):
callbacks = {}
default_handler = getattr(transformer, '__default__', None)
if default_handler:
def default_callback(data, children):
return default_handler(data, children, None)
else:
default_callback = self.tree_class
for rule, wrapper_chain in self.rule_builders:
user_callback_name = rule.alias or rule.options.template_source or rule.origin.name
try:
f = getattr(transformer, user_callback_name)
wrapper = getattr(f, 'visit_wrapper', None)
if wrapper is not None:
f = apply_visit_wrapper(f, user_callback_name, wrapper)
elif isinstance(transformer, Transformer_InPlace):
f = inplace_transformer(f)
except AttributeError:
f = partial(default_callback, user_callback_name)
for w in wrapper_chain:
f = w(f)
if rule in callbacks:
raise GrammarError("Rule '%s' already exists" % (rule,))
callbacks[rule] = f
return callbacks
###}
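A short sketch of how the options handled above surface in the public API (hypothetical grammar; illustrative only):

from lark import Lark

parser = Lark(r"""
start: "go" [WORD]
%import common.WORD
""", maybe_placeholders=True)

print(parser.parse("go").children)  # [None] -- the missing [WORD] becomes a None placeholder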

View File

@@ -0,0 +1,257 @@
from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING
from .exceptions import ConfigurationError, GrammarError, assert_config
from .utils import get_regexp_width, Serialize
from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .tree import Tree
from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType
if TYPE_CHECKING:
from .parsers.lalr_analysis import ParseTableBase
###{standalone
def _wrap_lexer(lexer_class):
future_interface = getattr(lexer_class, '__future_interface__', False)
if future_interface:
return lexer_class
else:
class CustomLexerWrapper(Lexer):
def __init__(self, lexer_conf):
self.lexer = lexer_class(lexer_conf)
def lex(self, lexer_state, parser_state):
return self.lexer.lex(lexer_state.text)
return CustomLexerWrapper
def _deserialize_parsing_frontend(data, memo, lexer_conf, callbacks, options):
parser_conf = ParserConf.deserialize(data['parser_conf'], memo)
cls = (options and options._plugins.get('LALR_Parser')) or LALR_Parser
parser = cls.deserialize(data['parser'], memo, callbacks, options.debug)
parser_conf.callbacks = callbacks
return ParsingFrontend(lexer_conf, parser_conf, options, parser=parser)
_parser_creators: 'Dict[str, Callable[[LexerConf, Any, Any], Any]]' = {}
class ParsingFrontend(Serialize):
__serialize_fields__ = 'lexer_conf', 'parser_conf', 'parser'
lexer_conf: LexerConf
parser_conf: ParserConf
options: Any
def __init__(self, lexer_conf: LexerConf, parser_conf: ParserConf, options, parser=None):
self.parser_conf = parser_conf
self.lexer_conf = lexer_conf
self.options = options
# Set-up parser
if parser: # From cache
self.parser = parser
else:
create_parser = _parser_creators.get(parser_conf.parser_type)
assert create_parser is not None, "{} is not supported in standalone mode".format(
parser_conf.parser_type
)
self.parser = create_parser(lexer_conf, parser_conf, options)
# Set-up lexer
lexer_type = lexer_conf.lexer_type
self.skip_lexer = False
if lexer_type in ('dynamic', 'dynamic_complete'):
assert lexer_conf.postlex is None
self.skip_lexer = True
return
if isinstance(lexer_type, type):
assert issubclass(lexer_type, Lexer)
self.lexer = _wrap_lexer(lexer_type)(lexer_conf)
elif isinstance(lexer_type, str):
create_lexer = {
'basic': create_basic_lexer,
'contextual': create_contextual_lexer,
}[lexer_type]
self.lexer = create_lexer(lexer_conf, self.parser, lexer_conf.postlex, options)
else:
raise TypeError("Bad value for lexer_type: {lexer_type}")
if lexer_conf.postlex:
self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)
def _verify_start(self, start=None):
if start is None:
start_decls = self.parser_conf.start
if len(start_decls) > 1:
raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start_decls)
            start, = start_decls
elif start not in self.parser_conf.start:
raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start))
return start
def _make_lexer_thread(self, text: str) -> Union[str, LexerThread]:
cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread
return text if self.skip_lexer else cls.from_text(self.lexer, text)
def parse(self, text: str, start=None, on_error=None):
chosen_start = self._verify_start(start)
kw = {} if on_error is None else {'on_error': on_error}
stream = self._make_lexer_thread(text)
return self.parser.parse(stream, chosen_start, **kw)
def parse_interactive(self, text: Optional[str]=None, start=None):
# TODO BREAK - Change text from Optional[str] to text: str = ''.
# Would break behavior of exhaust_lexer(), which currently raises TypeError, and after the change would just return []
chosen_start = self._verify_start(start)
if self.parser_conf.parser_type != 'lalr':
raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ")
stream = self._make_lexer_thread(text) # type: ignore[arg-type]
return self.parser.parse_interactive(stream, chosen_start)
def _validate_frontend_args(parser, lexer) -> None:
assert_config(parser, ('lalr', 'earley', 'cyk'))
if not isinstance(lexer, type): # not custom lexer?
expected = {
'lalr': ('basic', 'contextual'),
'earley': ('basic', 'dynamic', 'dynamic_complete'),
'cyk': ('basic', ),
}[parser]
assert_config(lexer, expected, 'Parser %r does not support lexer %%r, expected one of %%s' % parser)
def _get_lexer_callbacks(transformer, terminals):
result = {}
for terminal in terminals:
callback = getattr(transformer, terminal.name, None)
if callback is not None:
result[terminal.name] = callback
return result
class PostLexConnector:
def __init__(self, lexer, postlexer):
self.lexer = lexer
self.postlexer = postlexer
def lex(self, lexer_state, parser_state):
i = self.lexer.lex(lexer_state, parser_state)
return self.postlexer.process(i)
def create_basic_lexer(lexer_conf, parser, postlex, options) -> BasicLexer:
cls = (options and options._plugins.get('BasicLexer')) or BasicLexer
return cls(lexer_conf)
def create_contextual_lexer(lexer_conf: LexerConf, parser, postlex, options) -> ContextualLexer:
cls = (options and options._plugins.get('ContextualLexer')) or ContextualLexer
parse_table: ParseTableBase[int] = parser._parse_table
states: Dict[int, Collection[str]] = {idx:list(t.keys()) for idx, t in parse_table.states.items()}
always_accept: Collection[str] = postlex.always_accept if postlex else ()
return cls(lexer_conf, states, always_accept=always_accept)
def create_lalr_parser(lexer_conf: LexerConf, parser_conf: ParserConf, options=None) -> LALR_Parser:
debug = options.debug if options else False
strict = options.strict if options else False
cls = (options and options._plugins.get('LALR_Parser')) or LALR_Parser
return cls(parser_conf, debug=debug, strict=strict)
_parser_creators['lalr'] = create_lalr_parser
###}
class EarleyRegexpMatcher:
def __init__(self, lexer_conf):
self.regexps = {}
for t in lexer_conf.terminals:
regexp = t.pattern.to_regexp()
try:
width = get_regexp_width(regexp)[0]
except ValueError:
raise GrammarError("Bad regexp in token %s: %s" % (t.name, regexp))
else:
if width == 0:
raise GrammarError("Dynamic Earley doesn't allow zero-width regexps", t)
if lexer_conf.use_bytes:
regexp = regexp.encode('utf-8')
self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)
def match(self, term, text, index=0):
return self.regexps[term.name].match(text, index)
def create_earley_parser__dynamic(lexer_conf: LexerConf, parser_conf: ParserConf, **kw):
if lexer_conf.callbacks:
raise GrammarError("Earley's dynamic lexer doesn't support lexer_callbacks.")
earley_matcher = EarleyRegexpMatcher(lexer_conf)
return xearley.Parser(lexer_conf, parser_conf, earley_matcher.match, **kw)
def _match_earley_basic(term, token):
return term.name == token.type
def create_earley_parser__basic(lexer_conf: LexerConf, parser_conf: ParserConf, **kw):
return earley.Parser(lexer_conf, parser_conf, _match_earley_basic, **kw)
def create_earley_parser(lexer_conf: LexerConf, parser_conf: ParserConf, options) -> earley.Parser:
resolve_ambiguity = options.ambiguity == 'resolve'
debug = options.debug if options else False
tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
extra = {}
if lexer_conf.lexer_type == 'dynamic':
f = create_earley_parser__dynamic
elif lexer_conf.lexer_type == 'dynamic_complete':
extra['complete_lex'] = True
f = create_earley_parser__dynamic
else:
f = create_earley_parser__basic
return f(lexer_conf, parser_conf, resolve_ambiguity=resolve_ambiguity,
debug=debug, tree_class=tree_class, ordered_sets=options.ordered_sets, **extra)
class CYK_FrontEnd:
def __init__(self, lexer_conf, parser_conf, options=None):
self.parser = cyk.Parser(parser_conf.rules)
self.callbacks = parser_conf.callbacks
def parse(self, lexer_thread, start):
tokens = list(lexer_thread.lex(None))
tree = self.parser.parse(tokens, start)
return self._transform(tree)
def _transform(self, tree):
subtrees = list(tree.iter_subtrees())
for subtree in subtrees:
subtree.children = [self._apply_callback(c) if isinstance(c, Tree) else c for c in subtree.children]
return self._apply_callback(tree)
def _apply_callback(self, tree):
return self.callbacks[tree.rule](tree.children)
_parser_creators['earley'] = create_earley_parser
_parser_creators['cyk'] = CYK_FrontEnd
def _construct_parsing_frontend(
parser_type: _ParserArgType,
lexer_type: _LexerArgType,
lexer_conf,
parser_conf,
options
):
assert isinstance(lexer_conf, LexerConf)
assert isinstance(parser_conf, ParserConf)
parser_conf.parser_type = parser_type
lexer_conf.lexer_type = lexer_type
return ParsingFrontend(lexer_conf, parser_conf, options)
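# Sketch (internal API; normally called by Lark itself, names as defined above):
#   frontend = _construct_parsing_frontend('lalr', 'contextual', lexer_conf, parser_conf, options)
#   tree = frontend.parse("input text")   # verifies the start symbol, then runs the LALR parser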

View File

@@ -0,0 +1,340 @@
"""This module implements a CYK parser."""
# Author: https://github.com/ehudt (2018)
#
# Adapted by Erez
from collections import defaultdict
import itertools
from ..exceptions import ParseError
from ..lexer import Token
from ..tree import Tree
from ..grammar import Terminal as T, NonTerminal as NT, Symbol
def match(t, s):
assert isinstance(t, T)
return t.name == s.type
class Rule:
"""Context-free grammar rule."""
def __init__(self, lhs, rhs, weight, alias):
super(Rule, self).__init__()
assert isinstance(lhs, NT), lhs
assert all(isinstance(x, NT) or isinstance(x, T) for x in rhs), rhs
self.lhs = lhs
self.rhs = rhs
self.weight = weight
self.alias = alias
def __str__(self):
return '%s -> %s' % (str(self.lhs), ' '.join(str(x) for x in self.rhs))
def __repr__(self):
return str(self)
def __hash__(self):
return hash((self.lhs, tuple(self.rhs)))
def __eq__(self, other):
return self.lhs == other.lhs and self.rhs == other.rhs
def __ne__(self, other):
return not (self == other)
class Grammar:
"""Context-free grammar."""
def __init__(self, rules):
self.rules = frozenset(rules)
def __eq__(self, other):
return self.rules == other.rules
def __str__(self):
return '\n' + '\n'.join(sorted(repr(x) for x in self.rules)) + '\n'
def __repr__(self):
return str(self)
# Parse tree data structures
class RuleNode:
"""A node in the parse tree, which also contains the full rhs rule."""
def __init__(self, rule, children, weight=0):
self.rule = rule
self.children = children
self.weight = weight
def __repr__(self):
return 'RuleNode(%s, [%s])' % (repr(self.rule.lhs), ', '.join(str(x) for x in self.children))
class Parser:
"""Parser wrapper."""
def __init__(self, rules):
super(Parser, self).__init__()
self.orig_rules = {rule: rule for rule in rules}
rules = [self._to_rule(rule) for rule in rules]
self.grammar = to_cnf(Grammar(rules))
def _to_rule(self, lark_rule):
"""Converts a lark rule, (lhs, rhs, callback, options), to a Rule."""
assert isinstance(lark_rule.origin, NT)
assert all(isinstance(x, Symbol) for x in lark_rule.expansion)
return Rule(
lark_rule.origin, lark_rule.expansion,
weight=lark_rule.options.priority if lark_rule.options.priority else 0,
alias=lark_rule)
def parse(self, tokenized, start): # pylint: disable=invalid-name
"""Parses input, which is a list of tokens."""
assert start
start = NT(start)
table, trees = _parse(tokenized, self.grammar)
# Check if the parse succeeded.
if all(r.lhs != start for r in table[(0, len(tokenized) - 1)]):
raise ParseError('Parsing failed.')
parse = trees[(0, len(tokenized) - 1)][start]
return self._to_tree(revert_cnf(parse))
def _to_tree(self, rule_node):
"""Converts a RuleNode parse tree to a lark Tree."""
orig_rule = self.orig_rules[rule_node.rule.alias]
children = []
for child in rule_node.children:
if isinstance(child, RuleNode):
children.append(self._to_tree(child))
else:
assert isinstance(child.name, Token)
children.append(child.name)
t = Tree(orig_rule.origin, children)
        t.rule = orig_rule
return t
def print_parse(node, indent=0):
if isinstance(node, RuleNode):
print(' ' * (indent * 2) + str(node.rule.lhs))
for child in node.children:
print_parse(child, indent + 1)
else:
print(' ' * (indent * 2) + str(node.s))
def _parse(s, g):
"""Parses sentence 's' using CNF grammar 'g'."""
# The CYK table. Indexed with a 2-tuple: (start pos, end pos)
table = defaultdict(set)
# Top-level structure is similar to the CYK table. Each cell is a dict from
# rule name to the best (lightest) tree for that rule.
trees = defaultdict(dict)
# Populate base case with existing terminal production rules
for i, w in enumerate(s):
for terminal, rules in g.terminal_rules.items():
if match(terminal, w):
for rule in rules:
table[(i, i)].add(rule)
if (rule.lhs not in trees[(i, i)] or
rule.weight < trees[(i, i)][rule.lhs].weight):
trees[(i, i)][rule.lhs] = RuleNode(rule, [T(w)], weight=rule.weight)
# Iterate over lengths of sub-sentences
for l in range(2, len(s) + 1):
# Iterate over sub-sentences with the given length
for i in range(len(s) - l + 1):
# Choose partition of the sub-sentence in [1, l)
for p in range(i + 1, i + l):
span1 = (i, p - 1)
span2 = (p, i + l - 1)
for r1, r2 in itertools.product(table[span1], table[span2]):
for rule in g.nonterminal_rules.get((r1.lhs, r2.lhs), []):
table[(i, i + l - 1)].add(rule)
r1_tree = trees[span1][r1.lhs]
r2_tree = trees[span2][r2.lhs]
rule_total_weight = rule.weight + r1_tree.weight + r2_tree.weight
if (rule.lhs not in trees[(i, i + l - 1)]
or rule_total_weight < trees[(i, i + l - 1)][rule.lhs].weight):
trees[(i, i + l - 1)][rule.lhs] = RuleNode(rule, [r1_tree, r2_tree], weight=rule_total_weight)
return table, trees
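# Example (sketch): for s = [w0, w1, w2] the final cell is table[(0, 2)];
# the parse succeeds iff some rule with lhs == start appears in that cell,
# and trees[(0, 2)][start] is then the lightest RuleNode for the whole input.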
# This section implements context-free grammar converter to Chomsky normal form.
# It also implements a conversion of parse trees from its CNF to the original
# grammar.
# Overview:
# Applies the following operations in this order:
# * TERM: Eliminates non-solitary terminals from all rules
# * BIN: Eliminates rules with more than 2 symbols on their right-hand-side.
# * UNIT: Eliminates non-terminal unit rules
#
# The following grammar characteristics aren't featured:
# * Start symbol appears on RHS
# * Empty rules (epsilon rules)
class CnfWrapper:
"""CNF wrapper for grammar.
Validates that the input grammar is CNF and provides helper data structures.
"""
def __init__(self, grammar):
super(CnfWrapper, self).__init__()
self.grammar = grammar
self.rules = grammar.rules
self.terminal_rules = defaultdict(list)
self.nonterminal_rules = defaultdict(list)
for r in self.rules:
# Validate that the grammar is CNF and populate auxiliary data structures.
assert isinstance(r.lhs, NT), r
if len(r.rhs) not in [1, 2]:
raise ParseError("CYK doesn't support empty rules")
if len(r.rhs) == 1 and isinstance(r.rhs[0], T):
self.terminal_rules[r.rhs[0]].append(r)
elif len(r.rhs) == 2 and all(isinstance(x, NT) for x in r.rhs):
self.nonterminal_rules[tuple(r.rhs)].append(r)
else:
assert False, r
def __eq__(self, other):
return self.grammar == other.grammar
def __repr__(self):
return repr(self.grammar)
class UnitSkipRule(Rule):
"""A rule that records NTs that were skipped during transformation."""
def __init__(self, lhs, rhs, skipped_rules, weight, alias):
super(UnitSkipRule, self).__init__(lhs, rhs, weight, alias)
self.skipped_rules = skipped_rules
def __eq__(self, other):
return isinstance(other, type(self)) and self.skipped_rules == other.skipped_rules
__hash__ = Rule.__hash__
def build_unit_skiprule(unit_rule, target_rule):
skipped_rules = []
if isinstance(unit_rule, UnitSkipRule):
skipped_rules += unit_rule.skipped_rules
skipped_rules.append(target_rule)
if isinstance(target_rule, UnitSkipRule):
skipped_rules += target_rule.skipped_rules
return UnitSkipRule(unit_rule.lhs, target_rule.rhs, skipped_rules,
weight=unit_rule.weight + target_rule.weight, alias=unit_rule.alias)
def get_any_nt_unit_rule(g):
"""Returns a non-terminal unit rule from 'g', or None if there is none."""
for rule in g.rules:
if len(rule.rhs) == 1 and isinstance(rule.rhs[0], NT):
return rule
return None
def _remove_unit_rule(g, rule):
"""Removes 'rule' from 'g' without changing the language produced by 'g'."""
new_rules = [x for x in g.rules if x != rule]
refs = [x for x in g.rules if x.lhs == rule.rhs[0]]
new_rules += [build_unit_skiprule(rule, ref) for ref in refs]
return Grammar(new_rules)
def _split(rule):
"""Splits a rule whose len(rhs) > 2 into shorter rules."""
rule_str = str(rule.lhs) + '__' + '_'.join(str(x) for x in rule.rhs)
rule_name = '__SP_%s' % (rule_str) + '_%d'
yield Rule(rule.lhs, [rule.rhs[0], NT(rule_name % 1)], weight=rule.weight, alias=rule.alias)
for i in range(1, len(rule.rhs) - 2):
yield Rule(NT(rule_name % i), [rule.rhs[i], NT(rule_name % (i + 1))], weight=0, alias='Split')
yield Rule(NT(rule_name % (len(rule.rhs) - 2)), rule.rhs[-2:], weight=0, alias='Split')
def _term(g):
"""Applies the TERM rule on 'g' (see top comment)."""
all_t = {x for rule in g.rules for x in rule.rhs if isinstance(x, T)}
t_rules = {t: Rule(NT('__T_%s' % str(t)), [t], weight=0, alias='Term') for t in all_t}
new_rules = []
for rule in g.rules:
if len(rule.rhs) > 1 and any(isinstance(x, T) for x in rule.rhs):
new_rhs = [t_rules[x].lhs if isinstance(x, T) else x for x in rule.rhs]
new_rules.append(Rule(rule.lhs, new_rhs, weight=rule.weight, alias=rule.alias))
new_rules.extend(v for k, v in t_rules.items() if k in rule.rhs)
else:
new_rules.append(rule)
return Grammar(new_rules)
def _bin(g):
"""Applies the BIN rule to 'g' (see top comment)."""
new_rules = []
for rule in g.rules:
if len(rule.rhs) > 2:
new_rules += _split(rule)
else:
new_rules.append(rule)
return Grammar(new_rules)
def _unit(g):
"""Applies the UNIT rule to 'g' (see top comment)."""
nt_unit_rule = get_any_nt_unit_rule(g)
while nt_unit_rule:
g = _remove_unit_rule(g, nt_unit_rule)
nt_unit_rule = get_any_nt_unit_rule(g)
return g
def to_cnf(g):
"""Creates a CNF grammar from a general context-free grammar 'g'."""
g = _unit(_bin(_term(g)))
return CnfWrapper(g)
def unroll_unit_skiprule(lhs, orig_rhs, skipped_rules, children, weight, alias):
if not skipped_rules:
return RuleNode(Rule(lhs, orig_rhs, weight=weight, alias=alias), children, weight=weight)
else:
weight = weight - skipped_rules[0].weight
return RuleNode(
Rule(lhs, [skipped_rules[0].lhs], weight=weight, alias=alias), [
unroll_unit_skiprule(skipped_rules[0].lhs, orig_rhs,
skipped_rules[1:], children,
skipped_rules[0].weight, skipped_rules[0].alias)
], weight=weight)
def revert_cnf(node):
"""Reverts a parse tree (RuleNode) to its original non-CNF form (Node)."""
if isinstance(node, T):
return node
# Reverts TERM rule.
if node.rule.lhs.name.startswith('__T_'):
return node.children[0]
else:
children = []
for child in map(revert_cnf, node.children):
# Reverts BIN rule.
if isinstance(child, RuleNode) and child.rule.lhs.name.startswith('__SP_'):
children += child.children
else:
children.append(child)
# Reverts UNIT rule.
if isinstance(node.rule, UnitSkipRule):
return unroll_unit_skiprule(node.rule.lhs, node.rule.rhs,
node.rule.skipped_rules, children,
node.rule.weight, node.rule.alias)
else:
return RuleNode(node.rule, children)
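A small worked sketch of the CNF pipeline above (toy grammar; assumes this module is importable as lark.parsers.cyk):

from lark.grammar import Terminal as T, NonTerminal as NT
from lark.parsers.cyk import Grammar, Rule, to_cnf

g = Grammar([Rule(NT('s'), [T('A'), NT('x'), T('B')], weight=0, alias=None),
             Rule(NT('x'), [T('X')], weight=0, alias=None)])
cnf = to_cnf(g)  # TERM lifts A and B into __T_* rules, BIN splits the 3-symbol RHS, UNIT is a no-op here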

View File

@@ -0,0 +1,314 @@
"""This module implements an Earley parser.
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
https://www.sciencedirect.com/science/article/pii/S1571066108001497
That is probably the best reference for understanding the algorithm here.
The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html
"""
from typing import TYPE_CHECKING, Callable, Optional, List, Any
from collections import deque
from ..lexer import Token
from ..tree import Tree
from ..exceptions import UnexpectedEOF, UnexpectedToken
from ..utils import logger, OrderedSet, dedup_list
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Item
from .earley_forest import ForestSumVisitor, SymbolNode, StableSymbolNode, TokenNode, ForestToParseTree
if TYPE_CHECKING:
from ..common import LexerConf, ParserConf
class Parser:
lexer_conf: 'LexerConf'
parser_conf: 'ParserConf'
debug: bool
def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
resolve_ambiguity: bool=True, debug: bool=False,
tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True):
analysis = GrammarAnalyzer(parser_conf)
self.lexer_conf = lexer_conf
self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity
self.debug = debug
self.Tree = tree_class
self.Set = OrderedSet if ordered_sets else set
self.SymbolNode = StableSymbolNode if ordered_sets else SymbolNode
self.FIRST = analysis.FIRST
self.NULLABLE = analysis.NULLABLE
self.callbacks = parser_conf.callbacks
# TODO add typing info
self.predictions = {} # type: ignore[var-annotated]
## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
# the slow 'isupper' in is_terminal.
self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }
self.forest_sum_visitor = None
for rule in parser_conf.rules:
if rule.origin not in self.predictions:
self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]
## Detect if any rules/terminals have priorities set. If the user specified priority = None, then
# the priorities will be stripped from all rules/terminals before they reach us, allowing us to
# skip the extra tree walk. We'll also skip this if the user just didn't specify priorities
# on any rules/terminals.
if self.forest_sum_visitor is None and rule.options.priority is not None:
self.forest_sum_visitor = ForestSumVisitor
# Check terminals for priorities
# Ignore terminal priorities if the basic lexer is used
if self.lexer_conf.lexer_type != 'basic' and self.forest_sum_visitor is None:
for term in self.lexer_conf.terminals:
if term.priority:
self.forest_sum_visitor = ForestSumVisitor
break
self.term_matcher = term_matcher
def predict_and_complete(self, i, to_scan, columns, transitives):
"""The core Earley Predictor and Completer.
        At each stage of the input, we handle any completed items (things
        that matched on the last cycle) and use those to predict what should
        come next in the input stream. The completions and any predicted
        non-terminals are recursively processed until we reach a set of items
        that expect terminals, which can then be added to the scan list for the next scanner cycle."""
        # Held Completions (H in E. Scott's paper).
node_cache = {}
held_completions = {}
column = columns[i]
# R (items) = Ei (column.items)
items = deque(column)
while items:
item = items.pop() # remove an element, A say, from R
### The Earley completer
if item.is_complete: ### (item.s == string)
if item.node is None:
label = (item.s, item.start, i)
item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
item.node.add_family(item.s, item.rule, item.start, None, None)
# create_leo_transitives(item.rule.origin, item.start)
###R Joop Leo right recursion Completer
if item.rule.origin in transitives[item.start]:
transitive = transitives[item.start][item.s]
if transitive.previous in transitives[transitive.column]:
root_transitive = transitives[transitive.column][transitive.previous]
else:
root_transitive = transitive
new_item = Item(transitive.rule, transitive.ptr, transitive.start)
label = (root_transitive.s, root_transitive.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
new_item.node.add_path(root_transitive, item.node)
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
to_scan.add(new_item)
elif new_item not in column:
# Add (B :: aC.B, h, y) to Ei and R
column.add(new_item)
items.append(new_item)
###R Regular Earley completer
else:
# Empty has 0 length. If we complete an empty symbol in a particular
# parse step, we need to be able to use that same empty symbol to complete
# any predictions that result, that themselves require empty. Avoids
# infinite recursion on empty symbols.
                    # held_completions is 'H' in E. Scott's paper.
is_empty_item = item.start == i
if is_empty_item:
held_completions[item.rule.origin] = item.node
originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
for originator in originators:
new_item = originator.advance()
label = (new_item.s, originator.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
to_scan.add(new_item)
elif new_item not in column:
# Add (B :: aC.B, h, y) to Ei and R
column.add(new_item)
items.append(new_item)
### The Earley predictor
elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
new_items = []
for rule in self.predictions[item.expect]:
new_item = Item(rule, 0, i)
new_items.append(new_item)
# Process any held completions (H).
if item.expect in held_completions:
new_item = item.advance()
label = (new_item.s, item.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
new_items.append(new_item)
for new_item in new_items:
if new_item.expect in self.TERMINALS:
to_scan.add(new_item)
elif new_item not in column:
column.add(new_item)
items.append(new_item)
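    # Note (sketch): when predict_and_complete() returns, every item expecting a
    # terminal sits in `to_scan` for the next scanner cycle, and every completed
    # item has contributed a packed family to its SPPF node.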
def _parse(self, lexer, columns, to_scan, start_symbol=None):
def is_quasi_complete(item):
if item.is_complete:
return True
quasi = item.advance()
while not quasi.is_complete:
if quasi.expect not in self.NULLABLE:
return False
if quasi.rule.origin == start_symbol and quasi.expect == start_symbol:
return False
quasi = quasi.advance()
return True
# def create_leo_transitives(origin, start):
# ... # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420
def scan(i, token, to_scan):
"""The core Earley Scanner.
This is a custom implementation of the scanner that uses the
Lark lexer to match tokens. The scan list is built by the
Earley predictor, based on the previously completed tokens.
This ensures that at each phase of the parse we have a custom
lexer context, allowing for more complex ambiguities."""
next_to_scan = self.Set()
next_set = self.Set()
columns.append(next_set)
transitives.append({})
node_cache = {}
for item in self.Set(to_scan):
if match(item.expect, token):
new_item = item.advance()
label = (new_item.s, new_item.start, i)
# 'terminals' may not contain token.type when using %declare
# Additionally, token is not always a Token
# For example, it can be a Tree when using TreeMatcher
term = terminals.get(token.type) if isinstance(token, Token) else None
# Set the priority of the token node to 0 so that the
# terminal priorities do not affect the Tree chosen by
# ForestSumVisitor after the basic lexer has already
# "used up" the terminal priorities
token_node = TokenNode(token, term, priority=0)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)
if new_item.expect in self.TERMINALS:
# add (B ::= Aai+1.B, h, y) to Q'
next_to_scan.add(new_item)
else:
# add (B ::= Aai+1.B, h, y) to Ei+1
next_set.add(new_item)
if not next_set and not next_to_scan:
expect = {i.expect.name for i in to_scan}
raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))
return next_to_scan
# Define parser functions
match = self.term_matcher
terminals = self.lexer_conf.terminals_by_name
# Cache for nodes & tokens created in a particular parse step.
transitives = [{}]
## The main Earley loop.
# Run the Prediction/Completion cycle for any Items in the current Earley set.
# Completions will be added to the SPPF tree, and predictions will be recursively
# processed down to terminals/empty nodes to be added to the scanner for the next
# step.
expects = {i.expect for i in to_scan}
i = 0
for token in lexer.lex(expects):
self.predict_and_complete(i, to_scan, columns, transitives)
to_scan = scan(i, token, to_scan)
i += 1
expects.clear()
expects |= {i.expect for i in to_scan}
self.predict_and_complete(i, to_scan, columns, transitives)
## Column is now the final column in the parse.
assert i == len(columns)-1
return to_scan
def parse(self, lexer, start):
assert start, start
start_symbol = NonTerminal(start)
columns = [self.Set()]
to_scan = self.Set() # The scan buffer. 'Q' in E.Scott's paper.
## Predict for the start_symbol.
# Add predicted items to the first Earley set (for the predictor) if they
# result in a non-terminal, or the scanner if they result in a terminal.
for rule in self.predictions[start_symbol]:
item = Item(rule, 0, 0)
if item.expect in self.TERMINALS:
to_scan.add(item)
else:
columns[0].add(item)
to_scan = self._parse(lexer, columns, to_scan, start_symbol)
# If the parse was successful, the start
# symbol should have been completed in the last step of the Earley cycle, and will be in
# this column. Find the item for the start_symbol, which is the root of the SPPF tree.
solutions = dedup_list(n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0)
if not solutions:
expected_terminals = [t.expect.name for t in to_scan]
raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))
if self.debug:
from .earley_forest import ForestToPyDotVisitor
try:
debug_walker = ForestToPyDotVisitor()
except ImportError:
logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
else:
for i, s in enumerate(solutions):
debug_walker.visit(s, f"sppf{i}.png")
if self.Tree is not None:
# Perform our SPPF -> AST conversion
transformer = ForestToParseTree(self.Tree, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity)
solutions = [transformer.transform(s) for s in solutions]
if len(solutions) > 1:
t: Tree = self.Tree('_ambig', solutions)
t.expand_kids_by_data('_ambig') # solutions may themselves be _ambig nodes
return t
return solutions[0]
# return the root of the SPPF
# TODO return a list of solutions, or join them together somehow
return solutions[0]
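# Usage sketch (hedged; not part of this module): exercising the '_ambig'
# branch above through the public Lark API. The grammar below is made up;
# "1+2+3" has two derivations, so the resulting tree is rooted at '_ambig'.
if __name__ == '__main__':
    from lark import Lark
    _demo = Lark("""
        start: expr
        expr: expr "+" expr | NUMBER
        %import common.NUMBER
    """, parser='earley', ambiguity='explicit')
    print(_demo.parse("1+2+3").pretty())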

View File

@@ -0,0 +1,42 @@
"""This module implements useful building blocks for the Earley parser
"""
class Item:
"An Earley Item, the atom of the algorithm."
__slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash')
def __init__(self, rule, ptr, start):
self.is_complete = len(rule.expansion) == ptr
self.rule = rule # rule
self.ptr = ptr # ptr
self.start = start # j
self.node = None # w
if self.is_complete:
self.s = rule.origin
self.expect = None
self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
else:
self.s = (rule, ptr)
self.expect = rule.expansion[ptr]
self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
self._hash = hash((self.s, self.start, self.rule))
def advance(self):
return Item(self.rule, self.ptr + 1, self.start)
def __eq__(self, other):
return self is other or (self.s == other.s and self.start == other.start and self.rule == other.rule)
def __hash__(self):
return self._hash
def __repr__(self):
before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] )
after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] )
symbol = "{} ::= {}* {}".format(self.rule.origin.name, ' '.join(before), ' '.join(after))
return '%s (%d)' % (symbol, self.start)
# class TransitiveItem(Item):
# ... # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420
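# Usage sketch (hedged; not part of this module): stepping an Item across a
# made-up rule until it completes. Rule, NonTerminal and Terminal are the real
# classes from lark.grammar; the rule itself is hypothetical.
if __name__ == '__main__':
    from lark.grammar import Rule, NonTerminal, Terminal
    rule = Rule(NonTerminal('sum'), [NonTerminal('sum'), Terminal('PLUS'), NonTerminal('atom')])
    item = Item(rule, 0, 0)
    while not item.is_complete:
        print(item)             # e.g. "sum ::= * sum PLUS atom (0)"
        item = item.advance()
    print(item.s, item.expect)  # once complete: s is the origin, expect is None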

View File

@@ -0,0 +1,801 @@
""""This module implements an SPPF implementation
This is used as the primary output mechanism for the Earley parser
in order to store complex ambiguities.
Full reference and more details is here:
https://web.archive.org/web/20190616123959/http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
"""
from typing import Type, AbstractSet
from random import randint
from collections import deque
from operator import attrgetter
from importlib import import_module
from functools import partial
from ..parse_tree_builder import AmbiguousIntermediateExpander
from ..visitors import Discard
from ..utils import logger, OrderedSet
from ..tree import Tree
class ForestNode:
pass
class SymbolNode(ForestNode):
"""
A Symbol Node represents a symbol (or Intermediate LR0).
Symbol nodes are keyed by the symbol (s). For intermediate nodes
s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol
nodes, s will be a string representing the non-terminal origin (i.e.
the left hand side of the rule).
The children of a Symbol or Intermediate Node will always be Packed Nodes;
with each Packed Node child representing a single derivation of a production.
Hence a Symbol Node with a single child is unambiguous.
Parameters:
s: A Symbol, or a tuple of (rule, ptr) for an intermediate node.
start: For dynamic lexers, the index of the start of the substring matched by this symbol (inclusive).
end: For dynamic lexers, the index of the end of the substring matched by this symbol (exclusive).
Properties:
is_intermediate: True if this node is an intermediate node.
priority: The priority of the node's symbol.
"""
Set: Type[AbstractSet] = set # Overridden by StableSymbolNode
__slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate')
def __init__(self, s, start, end):
self.s = s
self.start = start
self.end = end
self._children = self.Set()
self.paths = self.Set()
self.paths_loaded = False
### We use inf here as it can be safely negated without resorting to conditionals,
# unlike None or float('NaN'), and sorts appropriately.
self.priority = float('-inf')
self.is_intermediate = isinstance(s, tuple)
def add_family(self, lr0, rule, start, left, right):
self._children.add(PackedNode(self, lr0, rule, start, left, right))
def add_path(self, transitive, node):
self.paths.add((transitive, node))
def load_paths(self):
for transitive, node in self.paths:
if transitive.next_titem is not None:
vn = type(self)(transitive.next_titem.s, transitive.next_titem.start, self.end)
vn.add_path(transitive.next_titem, node)
self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, vn)
else:
self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, node)
self.paths_loaded = True
@property
def is_ambiguous(self):
"""Returns True if this node is ambiguous."""
return len(self.children) > 1
@property
def children(self):
"""Returns a list of this node's children sorted from greatest to
least priority."""
if not self.paths_loaded:
self.load_paths()
return sorted(self._children, key=attrgetter('sort_key'))
def __iter__(self):
return iter(self._children)
def __repr__(self):
if self.is_intermediate:
rule = self.s[0]
ptr = self.s[1]
before = ( expansion.name for expansion in rule.expansion[:ptr] )
after = ( expansion.name for expansion in rule.expansion[ptr:] )
symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
else:
symbol = self.s.name
return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority)
class StableSymbolNode(SymbolNode):
"A version of SymbolNode that uses OrderedSet for output stability"
Set = OrderedSet
class PackedNode(ForestNode):
"""
A Packed Node represents a single derivation in a symbol node.
Parameters:
rule: The rule associated with this node.
parent: The parent of this node.
left: The left child of this node. ``None`` if one does not exist.
right: The right child of this node. ``None`` if one does not exist.
priority: The priority of this node.
"""
__slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash')
def __init__(self, parent, s, rule, start, left, right):
self.parent = parent
self.s = s
self.start = start
self.rule = rule
self.left = left
self.right = right
self.priority = float('-inf')
self._hash = hash((self.left, self.right))
@property
def is_empty(self):
return self.left is None and self.right is None
@property
def sort_key(self):
"""
Used to sort PackedNode children of SymbolNodes.
A SymbolNode has multiple PackedNodes if it matched
ambiguously. Hence, we use the sort order to identify
the order in which ambiguous children should be considered.
"""
return self.is_empty, -self.priority, self.rule.order
@property
def children(self):
"""Returns a list of this node's children."""
return [x for x in [self.left, self.right] if x is not None]
def __iter__(self):
yield self.left
yield self.right
def __eq__(self, other):
if not isinstance(other, PackedNode):
return False
return self is other or (self.left == other.left and self.right == other.right)
def __hash__(self):
return self._hash
def __repr__(self):
if isinstance(self.s, tuple):
rule = self.s[0]
ptr = self.s[1]
before = ( expansion.name for expansion in rule.expansion[:ptr] )
after = ( expansion.name for expansion in rule.expansion[ptr:] )
symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
else:
symbol = self.s.name
return "({}, {}, {}, {})".format(symbol, self.start, self.priority, self.rule.order)
class TokenNode(ForestNode):
"""
A Token Node represents a matched terminal and is always a leaf node.
Parameters:
token: The Token associated with this node.
term: The TerminalDef matched by the token.
priority: The priority of this node.
"""
__slots__ = ('token', 'term', 'priority', '_hash')
def __init__(self, token, term, priority=None):
self.token = token
self.term = term
if priority is not None:
self.priority = priority
else:
self.priority = term.priority if term is not None else 0
self._hash = hash(token)
def __eq__(self, other):
if not isinstance(other, TokenNode):
return False
return self is other or (self.token == other.token)
def __hash__(self):
return self._hash
def __repr__(self):
return repr(self.token)
class ForestVisitor:
"""
An abstract base class for building forest visitors.
This class performs a controllable depth-first walk of an SPPF.
The visitor will not enter cycles and will backtrack if one is encountered.
Subclasses are notified of cycles through the ``on_cycle`` method.
Behavior for visit events is defined by overriding the
``visit*node*`` functions.
The walk is controlled by the return values of the ``visit*node_in``
methods. Returning a node or an iterable of nodes will schedule them to be visited. The visitor
will begin to backtrack if no nodes are returned.
Parameters:
single_visit: If ``True``, non-Token nodes will only be visited once.
"""
def __init__(self, single_visit=False):
self.single_visit = single_visit
def visit_token_node(self, node):
"""Called when a ``Token`` is visited. ``Token`` nodes are always leaves."""
pass
def visit_symbol_node_in(self, node):
"""Called when a symbol node is visited. Nodes that are returned
will be scheduled to be visited. If ``visit_intermediate_node_in``
is not implemented, this function will be called for intermediate
nodes as well."""
pass
def visit_symbol_node_out(self, node):
"""Called after all nodes returned from a corresponding ``visit_symbol_node_in``
call have been visited. If ``visit_intermediate_node_out``
is not implemented, this function will be called for intermediate
nodes as well."""
pass
def visit_packed_node_in(self, node):
"""Called when a packed node is visited. Nodes that are returned
will be scheduled to be visited. """
pass
def visit_packed_node_out(self, node):
"""Called after all nodes returned from a corresponding ``visit_packed_node_in``
call have been visited."""
pass
def on_cycle(self, node, path):
"""Called when a cycle is encountered.
Parameters:
node: The node that causes a cycle.
path: The list of nodes being visited: nodes that have been
entered but not exited. The first element is the root in a forest
visit, and the last element is the node visited most recently.
``path`` should be treated as read-only.
"""
pass
def get_cycle_in_path(self, node, path):
"""A utility function for use in ``on_cycle`` to obtain a slice of
``path`` that only contains the nodes that make up the cycle."""
index = len(path) - 1
while id(path[index]) != id(node):
index -= 1
return path[index:]
def visit(self, root):
# Visiting is a list of IDs of all symbol/intermediate nodes currently in
# the stack. It serves two purposes: to detect when we 'recurse' in and out
# of a symbol/intermediate so that we can process both up and down. Also,
# since the SPPF can have cycles it allows us to detect if we're trying
# to recurse into a node that's already on the stack (infinite recursion).
visiting = set()
# set of all nodes that have been visited
visited = set()
# a list of nodes that are currently being visited
# used for the `on_cycle` callback
path = []
# We do not use recursion here to walk the Forest due to the limited
# stack size in python. Therefore input_stack is essentially our stack.
input_stack = deque([root])
# It is much faster to cache these as locals since they are called
# many times in large parses.
vpno = getattr(self, 'visit_packed_node_out')
vpni = getattr(self, 'visit_packed_node_in')
vsno = getattr(self, 'visit_symbol_node_out')
vsni = getattr(self, 'visit_symbol_node_in')
vino = getattr(self, 'visit_intermediate_node_out', vsno)
vini = getattr(self, 'visit_intermediate_node_in', vsni)
vtn = getattr(self, 'visit_token_node')
oc = getattr(self, 'on_cycle')
while input_stack:
current = next(reversed(input_stack))
try:
next_node = next(current)
except StopIteration:
input_stack.pop()
continue
except TypeError:
### If the current object is not an iterator, pass through to Token/SymbolNode
pass
else:
if next_node is None:
continue
if id(next_node) in visiting:
oc(next_node, path)
continue
input_stack.append(next_node)
continue
if isinstance(current, TokenNode):
vtn(current.token)
input_stack.pop()
continue
current_id = id(current)
if current_id in visiting:
if isinstance(current, PackedNode):
vpno(current)
elif current.is_intermediate:
vino(current)
else:
vsno(current)
input_stack.pop()
path.pop()
visiting.remove(current_id)
visited.add(current_id)
elif self.single_visit and current_id in visited:
input_stack.pop()
else:
visiting.add(current_id)
path.append(current)
if isinstance(current, PackedNode):
next_node = vpni(current)
elif current.is_intermediate:
next_node = vini(current)
else:
next_node = vsni(current)
if next_node is None:
continue
if not isinstance(next_node, ForestNode):
next_node = iter(next_node)
elif id(next_node) in visiting:
oc(next_node, path)
continue
input_stack.append(next_node)
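# Hypothetical sketch (not part of Lark): a minimal ForestVisitor subclass that
# records the type of every Token reachable from the root, depth-first. The
# class name is made up; usage would be `v.visit(forest_root)`.
class _TokenTypeCollector(ForestVisitor):
    def __init__(self):
        super().__init__(single_visit=True)
        self.token_types = []

    def visit_token_node(self, node):
        # 'node' is the matched Token itself
        self.token_types.append(node.type)

    def visit_symbol_node_in(self, node):
        return node.children    # schedule the packed-node children

    def visit_packed_node_in(self, node):
        return node.children    # schedule left/right, skipping None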
class ForestTransformer(ForestVisitor):
"""The base class for a bottom-up forest transformation. Most users will
want to use ``TreeForestTransformer`` instead as it has a friendlier
interface and covers most use cases.
Transformations are applied via inheritance and overriding of the
``transform*node`` methods.
``transform_token_node`` receives a ``Token`` as an argument.
All other methods receive the node that is being transformed and
a list of the results of the transformations of that node's children.
The return value of these methods are the resulting transformations.
If ``Discard`` is raised in a node's transformation, no data from that node
will be passed to its parent's transformation.
"""
def __init__(self):
super(ForestTransformer, self).__init__()
# results of transformations
self.data = dict()
# used to track parent nodes
self.node_stack = deque()
def transform(self, root):
"""Perform a transformation on an SPPF."""
self.node_stack.append('result')
self.data['result'] = []
self.visit(root)
assert len(self.data['result']) <= 1
if self.data['result']:
return self.data['result'][0]
def transform_symbol_node(self, node, data):
"""Transform a symbol node."""
return node
def transform_intermediate_node(self, node, data):
"""Transform an intermediate node."""
return node
def transform_packed_node(self, node, data):
"""Transform a packed node."""
return node
def transform_token_node(self, node):
"""Transform a ``Token``."""
return node
def visit_symbol_node_in(self, node):
self.node_stack.append(id(node))
self.data[id(node)] = []
return node.children
def visit_packed_node_in(self, node):
self.node_stack.append(id(node))
self.data[id(node)] = []
return node.children
def visit_token_node(self, node):
transformed = self.transform_token_node(node)
if transformed is not Discard:
self.data[self.node_stack[-1]].append(transformed)
def _visit_node_out_helper(self, node, method):
self.node_stack.pop()
transformed = method(node, self.data[id(node)])
if transformed is not Discard:
self.data[self.node_stack[-1]].append(transformed)
del self.data[id(node)]
def visit_symbol_node_out(self, node):
self._visit_node_out_helper(node, self.transform_symbol_node)
def visit_intermediate_node_out(self, node):
self._visit_node_out_helper(node, self.transform_intermediate_node)
def visit_packed_node_out(self, node):
self._visit_node_out_helper(node, self.transform_packed_node)
class ForestSumVisitor(ForestVisitor):
"""
A visitor for prioritizing ambiguous parts of the Forest.
This visitor is used when support for explicit priorities on
rules is requested (whether normal, or invert). It walks the
forest (or subsets thereof) and cascades properties upwards
from the leaves.
It would be ideal to do this during parsing, however this would
require processing each Earley item multiple times. That's
a big performance drawback; so running a forest walk is the
lesser of two evils: there can be significantly more Earley
items created during parsing than there are SPPF nodes in the
final tree.
"""
def __init__(self):
super(ForestSumVisitor, self).__init__(single_visit=True)
def visit_packed_node_in(self, node):
yield node.left
yield node.right
def visit_symbol_node_in(self, node):
return iter(node.children)
def visit_packed_node_out(self, node):
priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options.priority else 0
priority += getattr(node.right, 'priority', 0)
priority += getattr(node.left, 'priority', 0)
node.priority = priority
def visit_symbol_node_out(self, node):
node.priority = max(child.priority for child in node.children)
class PackedData:
"""Used in transformations of packed nodes to distinguish the data
that comes from the left child and the right child.
"""
class _NoData():
pass
NO_DATA = _NoData()
def __init__(self, node, data):
self.left = self.NO_DATA
self.right = self.NO_DATA
if data:
if node.left is not None:
self.left = data[0]
if len(data) > 1:
self.right = data[1]
else:
self.right = data[0]
class ForestToParseTree(ForestTransformer):
"""Used by the earley parser when ambiguity equals 'resolve' or
'explicit'. Transforms an SPPF into an (ambiguous) parse tree.
Parameters:
tree_class: The tree class to use for construction
callbacks: A dictionary of rules to functions that output a tree
prioritizer: A ``ForestVisitor`` that manipulates the priorities of ForestNodes
resolve_ambiguity: If True, ambiguities will be resolved based on
priorities. Otherwise, `_ambig` nodes will be in the resulting tree.
use_cache: If True, the results of packed node transformations will be cached.
"""
def __init__(self, tree_class=Tree, callbacks=dict(), prioritizer=ForestSumVisitor(), resolve_ambiguity=True, use_cache=True):
super(ForestToParseTree, self).__init__()
self.tree_class = tree_class
self.callbacks = callbacks
self.prioritizer = prioritizer
self.resolve_ambiguity = resolve_ambiguity
self._use_cache = use_cache
self._cache = {}
self._on_cycle_retreat = False
self._cycle_node = None
self._successful_visits = set()
def visit(self, root):
if self.prioritizer:
self.prioritizer.visit(root)
super(ForestToParseTree, self).visit(root)
self._cache = {}
def on_cycle(self, node, path):
logger.debug("Cycle encountered in the SPPF at node: %s. "
"As infinite ambiguities cannot be represented in a tree, "
"this family of derivations will be discarded.", node)
self._cycle_node = node
self._on_cycle_retreat = True
def _check_cycle(self, node):
if self._on_cycle_retreat:
if id(node) == id(self._cycle_node) or id(node) in self._successful_visits:
self._cycle_node = None
self._on_cycle_retreat = False
else:
return Discard
def _collapse_ambig(self, children):
new_children = []
for child in children:
if hasattr(child, 'data') and child.data == '_ambig':
new_children += child.children
else:
new_children.append(child)
return new_children
def _call_rule_func(self, node, data):
# called when transforming children of symbol nodes
# data is a list of trees or tokens that correspond to the
# symbol's rule expansion
return self.callbacks[node.rule](data)
def _call_ambig_func(self, node, data):
# called when transforming a symbol node
# data is a list of trees where each tree's data is
# equal to the name of the symbol or one of its aliases.
if len(data) > 1:
return self.tree_class('_ambig', data)
elif data:
return data[0]
return Discard
def transform_symbol_node(self, node, data):
if id(node) not in self._successful_visits:
return Discard
r = self._check_cycle(node)
if r is Discard:
return r
self._successful_visits.remove(id(node))
data = self._collapse_ambig(data)
return self._call_ambig_func(node, data)
def transform_intermediate_node(self, node, data):
if id(node) not in self._successful_visits:
return Discard
r = self._check_cycle(node)
if r is Discard:
return r
self._successful_visits.remove(id(node))
if len(data) > 1:
children = [self.tree_class('_inter', c) for c in data]
return self.tree_class('_iambig', children)
return data[0]
def transform_packed_node(self, node, data):
r = self._check_cycle(node)
if r is Discard:
return r
if self.resolve_ambiguity and id(node.parent) in self._successful_visits:
return Discard
if self._use_cache and id(node) in self._cache:
return self._cache[id(node)]
children = []
assert len(data) <= 2
data = PackedData(node, data)
if data.left is not PackedData.NO_DATA:
if node.left.is_intermediate and isinstance(data.left, list):
children += data.left
else:
children.append(data.left)
if data.right is not PackedData.NO_DATA:
children.append(data.right)
if node.parent.is_intermediate:
return self._cache.setdefault(id(node), children)
return self._cache.setdefault(id(node), self._call_rule_func(node, children))
def visit_symbol_node_in(self, node):
super(ForestToParseTree, self).visit_symbol_node_in(node)
if self._on_cycle_retreat:
return
return node.children
def visit_packed_node_in(self, node):
self._on_cycle_retreat = False
to_visit = super(ForestToParseTree, self).visit_packed_node_in(node)
if not self.resolve_ambiguity or id(node.parent) not in self._successful_visits:
if not self._use_cache or id(node) not in self._cache:
return to_visit
def visit_packed_node_out(self, node):
super(ForestToParseTree, self).visit_packed_node_out(node)
if not self._on_cycle_retreat:
self._successful_visits.add(id(node.parent))
def handles_ambiguity(func):
"""Decorator for methods of subclasses of ``TreeForestTransformer``.
Denotes that the method should receive a list of transformed derivations."""
func.handles_ambiguity = True
return func
class TreeForestTransformer(ForestToParseTree):
"""A ``ForestTransformer`` with a tree ``Transformer``-like interface.
By default, it will construct a tree.
Methods provided via inheritance are called based on the rule/symbol
names of nodes in the forest.
Methods that act on rules will receive a list of the results of the
transformations of the rule's children. By default, trees and tokens.
Methods that act on tokens will receive a token.
Alternatively, methods that act on rules may be annotated with
``handles_ambiguity``. In this case, the function will receive a list
of all the transformations of all the derivations of the rule.
By default, a list of trees where each tree.data is equal to the
rule name or one of its aliases.
Non-tree transformations are made possible by override of
``__default__``, ``__default_token__``, and ``__default_ambig__``.
Note:
Tree shaping features such as inlined rules and token filtering are
not built into the transformation. Positions are also not propagated.
Parameters:
tree_class: The tree class to use for construction
prioritizer: A ``ForestVisitor`` that manipulates the priorities of nodes in the SPPF.
resolve_ambiguity: If True, ambiguities will be resolved based on priorities.
use_cache (bool): If True, caches the results of some transformations,
potentially improving performance when ``resolve_ambiguity==False``.
Only use if you know what you are doing: i.e. All transformation
functions are pure and referentially transparent.
"""
def __init__(self, tree_class=Tree, prioritizer=ForestSumVisitor(), resolve_ambiguity=True, use_cache=False):
super(TreeForestTransformer, self).__init__(tree_class, dict(), prioritizer, resolve_ambiguity, use_cache)
def __default__(self, name, data):
"""Default operation on tree (for override).
Returns a tree with name with data as children.
"""
return self.tree_class(name, data)
def __default_ambig__(self, name, data):
"""Default operation on ambiguous rule (for override).
Wraps data in an '_ambig' node if it contains more than
one element.
"""
if len(data) > 1:
return self.tree_class('_ambig', data)
elif data:
return data[0]
return Discard
def __default_token__(self, node):
"""Default operation on ``Token`` (for override).
Returns ``node``.
"""
return node
def transform_token_node(self, node):
return getattr(self, node.type, self.__default_token__)(node)
def _call_rule_func(self, node, data):
name = node.rule.alias or node.rule.options.template_source or node.rule.origin.name
user_func = getattr(self, name, self.__default__)
if user_func == self.__default__ or hasattr(user_func, 'handles_ambiguity'):
user_func = partial(self.__default__, name)
if not self.resolve_ambiguity:
wrapper = partial(AmbiguousIntermediateExpander, self.tree_class)
user_func = wrapper(user_func)
return user_func(data)
def _call_ambig_func(self, node, data):
name = node.s.name
user_func = getattr(self, name, self.__default_ambig__)
if user_func == self.__default_ambig__ or not hasattr(user_func, 'handles_ambiguity'):
user_func = partial(self.__default_ambig__, name)
return user_func(data)
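# Hypothetical sketch (not part of Lark): resolving an ambiguous rule by hand
# with ``handles_ambiguity``. Assumes the forest root came from
# Lark(..., parser='earley', ambiguity='forest'); the rule name 'expr' and the
# class name are made up.
class _PickFirstDerivation(TreeForestTransformer):
    @handles_ambiguity
    def expr(self, derivations):
        # receives one transformed tree per derivation of 'expr'
        return derivations[0]

# tree = _PickFirstDerivation(resolve_ambiguity=False).transform(forest_root)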
class ForestToPyDotVisitor(ForestVisitor):
"""
A Forest visitor which writes the SPPF to a PNG.
The SPPF can get really large, really quickly because
of the amount of meta-data it stores, so this is probably
only useful for trivial trees and learning how the SPPF
is structured.
"""
def __init__(self, rankdir="TB"):
super(ForestToPyDotVisitor, self).__init__(single_visit=True)
self.pydot = import_module('pydot')
self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir)
def visit(self, root, filename):
super(ForestToPyDotVisitor, self).visit(root)
try:
self.graph.write_png(filename)
except FileNotFoundError as e:
logger.error("Could not write png: ", e)
def visit_token_node(self, node):
graph_node_id = str(id(node))
graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"'))
graph_node_color = 0x808080
graph_node_style = "\"filled,rounded\""
graph_node_shape = "diamond"
graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
self.graph.add_node(graph_node)
def visit_packed_node_in(self, node):
graph_node_id = str(id(node))
graph_node_label = repr(node)
graph_node_color = 0x808080
graph_node_style = "filled"
graph_node_shape = "diamond"
graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
self.graph.add_node(graph_node)
yield node.left
yield node.right
def visit_packed_node_out(self, node):
graph_node_id = str(id(node))
graph_node = self.graph.get_node(graph_node_id)[0]
for child in [node.left, node.right]:
if child is not None:
child_graph_node_id = str(id(child.token if isinstance(child, TokenNode) else child))
child_graph_node = self.graph.get_node(child_graph_node_id)[0]
self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))
else:
#### Try and be above the Python object ID range; probably impl. specific, but maybe this is okay.
child_graph_node_id = str(randint(100000000000000000000000000000,123456789012345678901234567890))
child_graph_node_style = "invis"
child_graph_node = self.pydot.Node(child_graph_node_id, style=child_graph_node_style, label="None")
child_edge_style = "invis"
self.graph.add_node(child_graph_node)
self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node, style=child_edge_style))
def visit_symbol_node_in(self, node):
graph_node_id = str(id(node))
graph_node_label = repr(node)
graph_node_color = 0x808080
graph_node_style = "\"filled\""
if node.is_intermediate:
graph_node_shape = "ellipse"
else:
graph_node_shape = "rectangle"
graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
self.graph.add_node(graph_node)
return iter(node.children)
def visit_symbol_node_out(self, node):
graph_node_id = str(id(node))
graph_node = self.graph.get_node(graph_node_id)[0]
for child in node.children:
child_graph_node_id = str(id(child))
child_graph_node = self.graph.get_node(child_graph_node_id)[0]
self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))
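# Usage sketch (hedged): dumping the SPPF of a parse to a PNG with the visitor
# above. Requires the optional 'pydot' dependency; the grammar and filename
# are made up.
if __name__ == '__main__':
    from lark import Lark
    forest = Lark('start: "a"+', parser='earley', ambiguity='forest').parse("aaa")
    ForestToPyDotVisitor().visit(forest, "sppf.png")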

View File

@@ -0,0 +1,203 @@
"Provides for superficial grammar analysis."
from collections import Counter, defaultdict
from typing import List, Dict, Iterator, FrozenSet, Set
from ..utils import bfs, fzset, classify
from ..exceptions import GrammarError
from ..grammar import Rule, Terminal, NonTerminal, Symbol
from ..common import ParserConf
class RulePtr:
__slots__ = ('rule', 'index')
rule: Rule
index: int
def __init__(self, rule: Rule, index: int):
assert isinstance(rule, Rule)
assert index <= len(rule.expansion)
self.rule = rule
self.index = index
def __repr__(self):
before = [x.name for x in self.rule.expansion[:self.index]]
after = [x.name for x in self.rule.expansion[self.index:]]
return '<%s : %s * %s>' % (self.rule.origin.name, ' '.join(before), ' '.join(after))
@property
def next(self) -> Symbol:
return self.rule.expansion[self.index]
def advance(self, sym: Symbol) -> 'RulePtr':
assert self.next == sym
return RulePtr(self.rule, self.index+1)
@property
def is_satisfied(self) -> bool:
return self.index == len(self.rule.expansion)
def __eq__(self, other) -> bool:
if not isinstance(other, RulePtr):
return NotImplemented
return self.rule == other.rule and self.index == other.index
def __hash__(self) -> int:
return hash((self.rule, self.index))
State = FrozenSet[RulePtr]
# state generation ensures no duplicate LR0ItemSets
class LR0ItemSet:
__slots__ = ('kernel', 'closure', 'transitions', 'lookaheads')
kernel: State
closure: State
transitions: Dict[Symbol, 'LR0ItemSet']
lookaheads: Dict[Symbol, Set[Rule]]
def __init__(self, kernel, closure):
self.kernel = fzset(kernel)
self.closure = fzset(closure)
self.transitions = {}
self.lookaheads = defaultdict(set)
def __repr__(self):
return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure]))
def update_set(set1, set2):
if not set2 or set1 > set2:
return False
copy = set(set1)
set1 |= set2
return set1 != copy
def calculate_sets(rules):
"""Calculate FOLLOW sets.
Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}
# foreach grammar rule X ::= Y(1) ... Y(k)
# if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
# NULLABLE = NULLABLE union {X}
# for i = 1 to k
# if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
# FIRST(X) = FIRST(X) union FIRST(Y(i))
# for j = i+1 to k
# if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
# if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
# until none of NULLABLE,FIRST,FOLLOW changed in last iteration
NULLABLE = set()
FIRST = {}
FOLLOW = {}
for sym in symbols:
FIRST[sym] = {sym} if sym.is_term else set()
FOLLOW[sym] = set()
# Calculate NULLABLE and FIRST
changed = True
while changed:
changed = False
for rule in rules:
if set(rule.expansion) <= NULLABLE:
if update_set(NULLABLE, {rule.origin}):
changed = True
for i, sym in enumerate(rule.expansion):
if set(rule.expansion[:i]) <= NULLABLE:
if update_set(FIRST[rule.origin], FIRST[sym]):
changed = True
else:
break
# Calculate FOLLOW
changed = True
while changed:
changed = False
for rule in rules:
for i, sym in enumerate(rule.expansion):
if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE:
if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
changed = True
for j in range(i+1, len(rule.expansion)):
if set(rule.expansion[i+1:j]) <= NULLABLE:
if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
changed = True
return FIRST, FOLLOW, NULLABLE
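def _demo_calculate_sets():
    # Hypothetical usage sketch (not part of Lark): the sets computed for the
    # toy grammar  s -> a b ;  a -> A | <empty> ;  b -> B.  All names are made
    # up; Rule/NonTerminal/Terminal are already imported at the top of this module.
    s, a, b = NonTerminal('s'), NonTerminal('a'), NonTerminal('b')
    A, B = Terminal('A'), Terminal('B')
    rules = [Rule(s, [a, b]), Rule(a, [A]), Rule(a, []), Rule(b, [B])]
    FIRST, FOLLOW, NULLABLE = calculate_sets(rules)
    assert NULLABLE == {a}       # only 'a' derives the empty string
    assert FIRST[s] == {A, B}    # B is in FIRST(s) because 'a' is nullable
    assert FOLLOW[a] == {B}      # 'b' follows 'a', and FIRST(b) == {B}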
class GrammarAnalyzer:
def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
self.debug = debug
self.strict = strict
root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
for start in parser_conf.start}
rules = parser_conf.rules + list(root_rules.values())
self.rules_by_origin: Dict[NonTerminal, List[Rule]] = classify(rules, lambda r: r.origin)
if len(rules) != len(set(rules)):
duplicates = [item for item, count in Counter(rules).items() if count > 1]
raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates))
for r in rules:
for sym in r.expansion:
if not (sym.is_term or sym in self.rules_by_origin):
raise GrammarError("Using an undefined rule: %s" % sym)
self.start_states = {start: self.expand_rule(root_rule.origin)
for start, root_rule in root_rules.items()}
self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))})
for start, root_rule in root_rules.items()}
lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)])
for start in parser_conf.start}
lr0_rules = parser_conf.rules + list(lr0_root_rules.values())
assert len(lr0_rules) == len(set(lr0_rules))
self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin)
# cache RulePtr(r, 0) in r (no duplicate RulePtr objects)
self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin))
for start, root_rule in lr0_root_rules.items()}
self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
def expand_rule(self, source_rule: NonTerminal, rules_by_origin=None) -> State:
"Returns all init_ptrs accessible by rule (recursive)"
if rules_by_origin is None:
rules_by_origin = self.rules_by_origin
init_ptrs = set()
def _expand_rule(rule: NonTerminal) -> Iterator[NonTerminal]:
assert not rule.is_term, rule
for r in rules_by_origin[rule]:
init_ptr = RulePtr(r, 0)
init_ptrs.add(init_ptr)
if r.expansion: # if not empty rule
new_r = init_ptr.next
if not new_r.is_term:
assert isinstance(new_r, NonTerminal)
yield new_r
for _ in bfs([source_rule], _expand_rule):
pass
return fzset(init_ptrs)

View File

@@ -0,0 +1,332 @@
"""This module builds a LALR(1) transition-table for lalr_parser.py
For now, shift/reduce conflicts are automatically resolved as shifts.
"""
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
from typing import Dict, Set, Iterator, Tuple, List, TypeVar, Generic
from collections import defaultdict
from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger
from ..exceptions import GrammarError
from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet, RulePtr, State
from ..grammar import Rule, Symbol
from ..common import ParserConf
###{standalone
class Action:
def __init__(self, name):
self.name = name
def __str__(self):
return self.name
def __repr__(self):
return str(self)
Shift = Action('Shift')
Reduce = Action('Reduce')
StateT = TypeVar("StateT")
class ParseTableBase(Generic[StateT]):
states: Dict[StateT, Dict[str, Tuple]]
start_states: Dict[str, StateT]
end_states: Dict[str, StateT]
def __init__(self, states, start_states, end_states):
self.states = states
self.start_states = start_states
self.end_states = end_states
def serialize(self, memo):
tokens = Enumerator()
states = {
state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg))
for token, (action, arg) in actions.items()}
for state, actions in self.states.items()
}
return {
'tokens': tokens.reversed(),
'states': states,
'start_states': self.start_states,
'end_states': self.end_states,
}
@classmethod
def deserialize(cls, data, memo):
tokens = data['tokens']
states = {
state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg))
for token, (action, arg) in actions.items()}
for state, actions in data['states'].items()
}
return cls(states, data['start_states'], data['end_states'])
class ParseTable(ParseTableBase['State']):
"""Parse-table whose key is State, i.e. set[RulePtr]
Slower than IntParseTable, but useful for debugging
"""
pass
class IntParseTable(ParseTableBase[int]):
"""Parse-table whose key is int. Best for performance."""
@classmethod
def from_ParseTable(cls, parse_table: ParseTable):
enum = list(parse_table.states)
state_to_idx: Dict['State', int] = {s:i for i,s in enumerate(enum)}
int_states = {}
for s, la in parse_table.states.items():
la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v
for k,v in la.items()}
int_states[ state_to_idx[s] ] = la
start_states = {start:state_to_idx[s] for start, s in parse_table.start_states.items()}
end_states = {start:state_to_idx[s] for start, s in parse_table.end_states.items()}
return cls(int_states, start_states, end_states)
###}
# digraph and traverse, see The Theory and Practice of Compiler Writing
# computes F(x) = G(x) union (union { G(y) | x R y })
# X: nodes
# R: relation (function mapping node -> list of nodes that satisfy the relation)
# G: set valued function
def digraph(X, R, G):
F = {}
S = []
N = dict.fromkeys(X, 0)
for x in X:
# this is always true for the first iteration, but N[x] may be updated in traverse below
if N[x] == 0:
traverse(x, S, N, X, R, G, F)
return F
# x: single node
# S: stack
# N: weights
# X: nodes
# R: relation (see above)
# G: set valued function
# F: set valued function we are computing (map of input -> output)
def traverse(x, S, N, X, R, G, F):
S.append(x)
d = len(S)
N[x] = d
F[x] = G[x]
for y in R[x]:
if N[y] == 0:
traverse(y, S, N, X, R, G, F)
n_x = N[x]
assert n_x > 0
n_y = N[y]
assert n_y != 0
if (n_y > 0) and (n_y < n_x):
N[x] = n_y
F[x].update(F[y])
if N[x] == d:
f_x = F[x]
while True:
z = S.pop()
N[z] = -1
F[z] = f_x
if z == x:
break
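def _demo_digraph():
    # Hypothetical sketch (not part of Lark): with the chain x R y R z,
    # F(x) must accumulate G(x), G(y) and G(z). Note that digraph() aliases
    # and mutates the sets in G, so only pass throwaway sets.
    X = ['x', 'y', 'z']
    R = {'x': ['y'], 'y': ['z'], 'z': []}
    G = {'x': {1}, 'y': {2}, 'z': {3}}
    F = digraph(X, R, G)
    assert F == {'x': {1, 2, 3}, 'y': {2, 3}, 'z': {3}}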
class LALR_Analyzer(GrammarAnalyzer):
lr0_itemsets: Set[LR0ItemSet]
nonterminal_transitions: List[Tuple[LR0ItemSet, Symbol]]
lookback: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Rule]]]
includes: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Symbol]]]
reads: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Symbol]]]
directly_reads: Dict[Tuple[LR0ItemSet, Symbol], Set[Symbol]]
def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
GrammarAnalyzer.__init__(self, parser_conf, debug, strict)
self.nonterminal_transitions = []
self.directly_reads = defaultdict(set)
self.reads = defaultdict(set)
self.includes = defaultdict(set)
self.lookback = defaultdict(set)
def compute_lr0_states(self) -> None:
self.lr0_itemsets = set()
# map of kernels to LR0ItemSets
cache: Dict['State', LR0ItemSet] = {}
def step(state: LR0ItemSet) -> Iterator[LR0ItemSet]:
_, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied)
d = classify(unsat, lambda rp: rp.next)
for sym, rps in d.items():
kernel = fzset({rp.advance(sym) for rp in rps})
new_state = cache.get(kernel, None)
if new_state is None:
closure = set(kernel)
for rp in kernel:
if not rp.is_satisfied and not rp.next.is_term:
closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin)
new_state = LR0ItemSet(kernel, closure)
cache[kernel] = new_state
state.transitions[sym] = new_state
yield new_state
self.lr0_itemsets.add(state)
for _ in bfs(self.lr0_start_states.values(), step):
pass
def compute_reads_relations(self):
# handle start state
for root in self.lr0_start_states.values():
assert len(root.kernel) == 1
for rp in root.kernel:
assert rp.index == 0
self.directly_reads[(root, rp.next)] = {Terminal('$END')}
for state in self.lr0_itemsets:
seen = set()
for rp in state.closure:
if rp.is_satisfied:
continue
s = rp.next
# if s is not a nonterminal
if s not in self.lr0_rules_by_origin:
continue
if s in seen:
continue
seen.add(s)
nt = (state, s)
self.nonterminal_transitions.append(nt)
dr = self.directly_reads[nt]
r = self.reads[nt]
next_state = state.transitions[s]
for rp2 in next_state.closure:
if rp2.is_satisfied:
continue
s2 = rp2.next
# if s2 is a terminal
if s2 not in self.lr0_rules_by_origin:
dr.add(s2)
if s2 in self.NULLABLE:
r.add((next_state, s2))
def compute_includes_lookback(self):
for nt in self.nonterminal_transitions:
state, nonterminal = nt
includes = []
lookback = self.lookback[nt]
for rp in state.closure:
if rp.rule.origin != nonterminal:
continue
# traverse the states for rp(.rule)
state2 = state
for i in range(rp.index, len(rp.rule.expansion)):
s = rp.rule.expansion[i]
nt2 = (state2, s)
state2 = state2.transitions[s]
if nt2 not in self.reads:
continue
for j in range(i + 1, len(rp.rule.expansion)):
if rp.rule.expansion[j] not in self.NULLABLE:
break
else:
includes.append(nt2)
# state2 is at the final state for rp.rule
if rp.index == 0:
for rp2 in state2.closure:
if (rp2.rule == rp.rule) and rp2.is_satisfied:
lookback.add((state2, rp2.rule))
for nt2 in includes:
self.includes[nt2].add(nt)
def compute_lookaheads(self):
read_sets = digraph(self.nonterminal_transitions, self.reads, self.directly_reads)
follow_sets = digraph(self.nonterminal_transitions, self.includes, read_sets)
for nt, lookbacks in self.lookback.items():
for state, rule in lookbacks:
for s in follow_sets[nt]:
state.lookaheads[s].add(rule)
def compute_lalr1_states(self) -> None:
m: Dict[LR0ItemSet, Dict[str, Tuple]] = {}
reduce_reduce = []
for itemset in self.lr0_itemsets:
actions: Dict[Symbol, Tuple] = {la: (Shift, next_state.closure)
for la, next_state in itemset.transitions.items()}
for la, rules in itemset.lookaheads.items():
if len(rules) > 1:
# Try to resolve conflict based on priority
p = [(r.options.priority or 0, r) for r in rules]
p.sort(key=lambda r: r[0], reverse=True)
best, second_best = p[:2]
if best[0] > second_best[0]:
rules = {best[1]}
else:
reduce_reduce.append((itemset, la, rules))
continue
rule, = rules
if la in actions:
if self.strict:
raise GrammarError(f"Shift/Reduce conflict for terminal {la.name}. [strict-mode]\n ")
elif self.debug:
logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
logger.warning(' * %s', rule)
else:
logger.debug('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
logger.debug(' * %s', rule)
else:
actions[la] = (Reduce, rule)
m[itemset] = { k.name: v for k, v in actions.items() }
if reduce_reduce:
msgs = []
for itemset, la, rules in reduce_reduce:
msg = 'Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t- ' + str(r) for r in rules ]))
if self.debug:
msg += '\n collision occurred in state: {%s\n }' % ''.join(['\n\t' + str(x) for x in itemset.closure])
msgs.append(msg)
raise GrammarError('\n\n'.join(msgs))
states = { k.closure: v for k, v in m.items() }
# compute end states
end_states: Dict[str, 'State'] = {}
for state in states:
for rp in state:
for start in self.lr0_start_states:
if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied:
assert start not in end_states
end_states[start] = state
start_states = { start: state.closure for start, state in self.lr0_start_states.items() }
_parse_table = ParseTable(states, start_states, end_states)
if self.debug:
self.parse_table = _parse_table
else:
self.parse_table = IntParseTable.from_ParseTable(_parse_table)
def compute_lalr(self):
self.compute_lr0_states()
self.compute_reads_relations()
self.compute_includes_lookback()
self.compute_lookaheads()
self.compute_lalr1_states()
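# Hypothetical demo (not part of Lark): running the pipeline above on a toy
# right-recursive grammar and counting the resulting LALR(1) states. The rule
# objects and names are made up for this sketch.
if __name__ == '__main__':
    from lark.grammar import NonTerminal, RuleOptions
    start = NonTerminal('start')
    rules = [Rule(start, [Terminal('A'), start], order=0, options=RuleOptions()),
             Rule(start, [Terminal('A')], order=1, options=RuleOptions())]
    analyzer = LALR_Analyzer(ParserConf(rules, {}, ['start']), debug=True)
    analyzer.compute_lalr()
    print(len(analyzer.parse_table.states), 'LALR(1) states')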

View File

@@ -0,0 +1,158 @@
# This module provides a LALR interactive parser, which is used for debugging and error handling
from typing import Iterator, List
from copy import copy
import warnings
from ..exceptions import UnexpectedToken
from ..lexer import Token, LexerThread
from .lalr_parser_state import ParserState
###{standalone
class InteractiveParser:
"""InteractiveParser gives you advanced control over parsing and error handling when parsing with LALR.
For a simpler interface, see the ``on_error`` argument to ``Lark.parse()``.
"""
def __init__(self, parser, parser_state: ParserState, lexer_thread: LexerThread):
self.parser = parser
self.parser_state = parser_state
self.lexer_thread = lexer_thread
self.result = None
@property
def lexer_state(self) -> LexerThread:
warnings.warn("lexer_state will be removed in subsequent releases. Use lexer_thread instead.", DeprecationWarning)
return self.lexer_thread
def feed_token(self, token: Token):
"""Feed the parser with a token, and advance it to the next state, as if it received it from the lexer.
Note that ``token`` has to be an instance of ``Token``.
"""
return self.parser_state.feed_token(token, token.type == '$END')
def iter_parse(self) -> Iterator[Token]:
"""Step through the different stages of the parse, by reading tokens from the lexer
and feeding them to the parser, one per iteration.
Returns an iterator of the tokens it encounters.
When the parse is over, the resulting tree can be found in ``InteractiveParser.result``.
"""
for token in self.lexer_thread.lex(self.parser_state):
yield token
self.result = self.feed_token(token)
def exhaust_lexer(self) -> List[Token]:
"""Try to feed the rest of the lexer state into the interactive parser.
Note that this modifies the instance in place and does not feed an '$END' Token.
"""
return list(self.iter_parse())
def feed_eof(self, last_token=None):
"""Feed a '$END' Token. Borrows from 'last_token' if given."""
eof = Token.new_borrow_pos('$END', '', last_token) if last_token is not None else self.lexer_thread._Token('$END', '', 0, 1, 1)
return self.feed_token(eof)
def __copy__(self):
"""Create a new interactive parser with a separate state.
Calls to feed_token() won't affect the old instance, and vice-versa.
"""
return self.copy()
def copy(self, deepcopy_values=True):
return type(self)(
self.parser,
self.parser_state.copy(deepcopy_values=deepcopy_values),
copy(self.lexer_thread),
)
def __eq__(self, other):
if not isinstance(other, InteractiveParser):
return False
return self.parser_state == other.parser_state and self.lexer_thread == other.lexer_thread
def as_immutable(self):
"""Convert to an ``ImmutableInteractiveParser``."""
p = copy(self)
return ImmutableInteractiveParser(p.parser, p.parser_state, p.lexer_thread)
def pretty(self):
"""Print the output of ``choices()`` in a way that's easier to read."""
out = ["Parser choices:"]
for k, v in self.choices().items():
out.append('\t- %s -> %r' % (k, v))
out.append('stack size: %s' % len(self.parser_state.state_stack))
return '\n'.join(out)
def choices(self):
"""Returns a dictionary of token types, matched to their action in the parser.
Only returns token types that are accepted by the current state.
Updated by ``feed_token()``.
"""
return self.parser_state.parse_conf.parse_table.states[self.parser_state.position]
def accepts(self):
"""Returns the set of possible tokens that will advance the parser into a new valid state."""
accepts = set()
conf_no_callbacks = copy(self.parser_state.parse_conf)
# We don't want to call callbacks here since those might have arbitrary side effects
# and are unnecessarily slow.
conf_no_callbacks.callbacks = {}
for t in self.choices():
if t.isupper(): # is terminal?
new_cursor = self.copy(deepcopy_values=False)
new_cursor.parser_state.parse_conf = conf_no_callbacks
try:
new_cursor.feed_token(self.lexer_thread._Token(t, ''))
except UnexpectedToken:
pass
else:
accepts.add(t)
return accepts
def resume_parse(self):
"""Resume automated parsing from the current state.
"""
return self.parser.parse_from_state(self.parser_state, last_token=self.lexer_thread.state.last_token)
class ImmutableInteractiveParser(InteractiveParser):
"""Same as ``InteractiveParser``, but operations create a new instance instead
of changing it in-place.
"""
result = None
def __hash__(self):
return hash((self.parser_state, self.lexer_thread))
def feed_token(self, token):
c = copy(self)
c.result = InteractiveParser.feed_token(c, token)
return c
def exhaust_lexer(self):
"""Try to feed the rest of the lexer state into the parser.
Note that this returns a new ImmutableInteractiveParser and does not feed an '$END' Token"""
cursor = self.as_mutable()
cursor.exhaust_lexer()
return cursor.as_immutable()
def as_mutable(self):
"""Convert to an ``InteractiveParser``."""
p = copy(self)
return InteractiveParser(p.parser, p.parser_state, p.lexer_thread)
###}
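# Usage sketch (hedged): driving the interactive parser through the public
# Lark API. The grammar is made up; accepts() lists the terminals the parser
# would take at each step.
if __name__ == '__main__':
    from lark import Lark
    ip = Lark('start: "a" "b"', parser='lalr').parse_interactive("ab")
    for token in ip.iter_parse():
        print(token.type, '->', sorted(ip.accepts()))
    print(ip.feed_eof())    # feed '$END' and receive the finished tree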

View File

@@ -0,0 +1,122 @@
"""This module implements a LALR(1) Parser
"""
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
from typing import Dict, Any, Optional
from ..lexer import Token, LexerThread
from ..utils import Serialize
from ..common import ParserConf, ParserCallbacks
from .lalr_analysis import LALR_Analyzer, IntParseTable, ParseTableBase
from .lalr_interactive_parser import InteractiveParser
from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
from .lalr_parser_state import ParserState, ParseConf
###{standalone
class LALR_Parser(Serialize):
def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
analysis = LALR_Analyzer(parser_conf, debug=debug, strict=strict)
analysis.compute_lalr()
callbacks = parser_conf.callbacks
self._parse_table = analysis.parse_table
self.parser_conf = parser_conf
self.parser = _Parser(analysis.parse_table, callbacks, debug)
@classmethod
def deserialize(cls, data, memo, callbacks, debug=False):
inst = cls.__new__(cls)
inst._parse_table = IntParseTable.deserialize(data, memo)
inst.parser = _Parser(inst._parse_table, callbacks, debug)
return inst
def serialize(self, memo: Any = None) -> Dict[str, Any]:
return self._parse_table.serialize(memo)
def parse_interactive(self, lexer: LexerThread, start: str):
return self.parser.parse(lexer, start, start_interactive=True)
def parse(self, lexer, start, on_error=None):
try:
return self.parser.parse(lexer, start)
except UnexpectedInput as e:
if on_error is None:
raise
while True:
if isinstance(e, UnexpectedCharacters):
s = e.interactive_parser.lexer_thread.state
p = s.line_ctr.char_pos
if not on_error(e):
raise e
if isinstance(e, UnexpectedCharacters):
# If the user didn't change the character position, skip the offending character ourselves to avoid an infinite loop
if p == s.line_ctr.char_pos:
s.line_ctr.feed(s.text[p:p+1])
try:
return e.interactive_parser.resume_parse()
except UnexpectedToken as e2:
if (isinstance(e, UnexpectedToken)
and e.token.type == e2.token.type == '$END'
and e.interactive_parser == e2.interactive_parser):
# Prevent infinite loop
raise e2
e = e2
except UnexpectedCharacters as e2:
e = e2
class _Parser:
parse_table: ParseTableBase
callbacks: ParserCallbacks
debug: bool
def __init__(self, parse_table: ParseTableBase, callbacks: ParserCallbacks, debug: bool=False):
self.parse_table = parse_table
self.callbacks = callbacks
self.debug = debug
def parse(self, lexer: LexerThread, start: str, value_stack=None, state_stack=None, start_interactive=False):
parse_conf = ParseConf(self.parse_table, self.callbacks, start)
parser_state = ParserState(parse_conf, lexer, state_stack, value_stack)
if start_interactive:
return InteractiveParser(self, parser_state, parser_state.lexer)
return self.parse_from_state(parser_state)
def parse_from_state(self, state: ParserState, last_token: Optional[Token]=None):
"""Run the main LALR parser loop
Parameters:
state - the initial state. Changed in-place.
last_token - Used only for line information in case of an empty lexer.
"""
try:
token = last_token
for token in state.lexer.lex(state):
assert token is not None
state.feed_token(token)
end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
return state.feed_token(end_token, True)
except UnexpectedInput as e:
try:
e.interactive_parser = InteractiveParser(self, state, state.lexer)
except NameError:
pass
raise e
except Exception as e:
if self.debug:
print("")
print("STATE STACK DUMP")
print("----------------")
for i, s in enumerate(state.state_stack):
print('%d)' % i, s)
print("")
raise
###}
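# Usage sketch (hedged): the on_error hook implemented by LALR_Parser.parse
# above, driven through the public Lark API. The grammar and input are made
# up; returning True from the handler skips the bad character and resumes.
if __name__ == '__main__':
    from lark import Lark
    parser = Lark('start: "a"+', parser='lalr')
    print(parser.parse("aaxa", on_error=lambda e: True))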

View File

@@ -0,0 +1,110 @@
from copy import deepcopy, copy
from typing import Dict, Any, Generic, List
from ..lexer import Token, LexerThread
from ..common import ParserCallbacks
from .lalr_analysis import Shift, ParseTableBase, StateT
from ..exceptions import UnexpectedToken
###{standalone
class ParseConf(Generic[StateT]):
__slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'
parse_table: ParseTableBase[StateT]
callbacks: ParserCallbacks
start: str
start_state: StateT
end_state: StateT
states: Dict[StateT, Dict[str, tuple]]
def __init__(self, parse_table: ParseTableBase[StateT], callbacks: ParserCallbacks, start: str):
self.parse_table = parse_table
self.start_state = self.parse_table.start_states[start]
self.end_state = self.parse_table.end_states[start]
self.states = self.parse_table.states
self.callbacks = callbacks
self.start = start
class ParserState(Generic[StateT]):
__slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'
parse_conf: ParseConf[StateT]
lexer: LexerThread
state_stack: List[StateT]
value_stack: list
def __init__(self, parse_conf: ParseConf[StateT], lexer: LexerThread, state_stack=None, value_stack=None):
self.parse_conf = parse_conf
self.lexer = lexer
self.state_stack = state_stack or [self.parse_conf.start_state]
self.value_stack = value_stack or []
@property
def position(self) -> StateT:
return self.state_stack[-1]
# Necessary for match_examples() to work
def __eq__(self, other) -> bool:
if not isinstance(other, ParserState):
return NotImplemented
return len(self.state_stack) == len(other.state_stack) and self.position == other.position
def __copy__(self):
return self.copy()
def copy(self, deepcopy_values=True) -> 'ParserState[StateT]':
return type(self)(
self.parse_conf,
self.lexer, # XXX copy
copy(self.state_stack),
deepcopy(self.value_stack) if deepcopy_values else copy(self.value_stack),
)
def feed_token(self, token: Token, is_end=False) -> Any:
state_stack = self.state_stack
value_stack = self.value_stack
states = self.parse_conf.states
end_state = self.parse_conf.end_state
callbacks = self.parse_conf.callbacks
while True:
state = state_stack[-1]
try:
action, arg = states[state][token.type]
except KeyError:
expected = {s for s in states[state].keys() if s.isupper()}
raise UnexpectedToken(token, expected, state=self, interactive_parser=None)
assert arg != end_state
if action is Shift:
# shift once and return
assert not is_end
state_stack.append(arg)
value_stack.append(token if token.type not in callbacks else callbacks[token.type](token))
return
else:
# reduce+shift as many times as necessary
rule = arg
size = len(rule.expansion)
if size:
s = value_stack[-size:]
del state_stack[-size:]
del value_stack[-size:]
else:
s = []
value = callbacks[rule](s) if callbacks else s
_action, new_state = states[state_stack[-1]][rule.origin.name]
assert _action is Shift
state_stack.append(new_state)
value_stack.append(value)
if is_end and state_stack[-1] == end_state:
return value_stack[-1]
###}
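# Usage sketch (illustrative; assumes `e` is an UnexpectedToken caught during a
# LALR parse): a ParserState can be copied and probed without disturbing the
# original parse, which is how error-recovery code typically uses it.
#
#     state = e.interactive_parser.parser_state.copy()
#     state.feed_token(candidate_token)   # raises UnexpectedToken if invalid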

View File

@@ -0,0 +1,165 @@
"""This module implements an Earley parser with a dynamic lexer
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
https://www.sciencedirect.com/science/article/pii/S1571066108001497
That is probably the best reference for understanding the algorithm here.
The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
is better documented here:
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
Instead of running a lexer beforehand, or using a costly char-by-char method, this parser
matches terminals with regular expressions on the fly, achieving high performance while
maintaining all of Earley's power in parsing any CFG.
"""
from typing import TYPE_CHECKING, Callable, Optional, List, Any
from collections import defaultdict
from ..tree import Tree
from ..exceptions import UnexpectedCharacters
from ..lexer import Token
from ..grammar import Terminal
from .earley import Parser as BaseParser
from .earley_forest import TokenNode
if TYPE_CHECKING:
from ..common import LexerConf, ParserConf
class Parser(BaseParser):
def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
resolve_ambiguity: bool=True, complete_lex: bool=False, debug: bool=False,
tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True):
BaseParser.__init__(self, lexer_conf, parser_conf, term_matcher, resolve_ambiguity,
debug, tree_class, ordered_sets)
self.ignore = [Terminal(t) for t in lexer_conf.ignore]
self.complete_lex = complete_lex
def _parse(self, stream, columns, to_scan, start_symbol=None):
def scan(i, to_scan):
"""The core Earley Scanner.
This is a custom implementation of the scanner that uses the
Lark lexer to match tokens. The scan list is built by the
Earley predictor, based on the previously completed tokens.
This ensures that at each phase of the parse we have a custom
lexer context, allowing for more complex ambiguities."""
node_cache = {}
# 1) Loop the expectations and ask the lexer to match.
# Since regexp is forward looking on the input stream, and we only
# want to process tokens when we hit the point in the stream at which
# they complete, we push all tokens into a buffer (delayed_matches), to
# be held possibly for a later parse step when we reach the point in the
# input stream at which they complete.
for item in self.Set(to_scan):
m = match(item.expect, stream, i)
if m:
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
delayed_matches[m.end()].append( (item, i, t) )
if self.complete_lex:
s = m.group(0)
for j in range(1, len(s)):
m = match(item.expect, s[:-j])
if m:
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
delayed_matches[i+m.end()].append( (item, i, t) )
# XXX The following 3 lines were commented out for causing a bug. See issue #768
# # Remove any items that successfully matched in this pass from the to_scan buffer.
# # This ensures we don't carry over tokens that already matched, if we're ignoring below.
# to_scan.remove(item)
# 3) Process any ignores. This is typically used for e.g. whitespace.
# We carry over any unmatched items from the to_scan buffer to be matched again after
# the ignore. This should allow us to use ignored symbols in non-terminals to implement
# e.g. mandatory spacing.
for x in self.ignore:
m = match(x, stream, i)
if m:
# Carry over any items still in the scan buffer, to past the end of the ignored items.
delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ])
# If we're ignoring up to the end of the file, carry over the start symbol if it already completed.
delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol])
next_to_scan = self.Set()
next_set = self.Set()
columns.append(next_set)
transitives.append({})
## 4) Process Tokens from delayed_matches.
# This is the core of the Earley scanner. Create an SPPF node for each Token,
# and create the symbol node in the SPPF tree. Advance the item that completed,
# and add the resulting new item to either the Earley set (for processing by the
# completer/predictor) or the to_scan buffer for the next parse step.
for item, start, token in delayed_matches[i+1]:
if token is not None:
token.end_line = text_line
token.end_column = text_column + 1
token.end_pos = i + 1
new_item = item.advance()
label = (new_item.s, new_item.start, i + 1)
token_node = TokenNode(token, terminals[token.type])
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)
else:
new_item = item
if new_item.expect in self.TERMINALS:
# add (B ::= Aai+1.B, h, y) to Q'
next_to_scan.add(new_item)
else:
# add (B ::= Aa+1.B, h, y) to Ei+1
next_set.add(new_item)
del delayed_matches[i+1] # No longer needed, so unburden memory
if not next_set and not delayed_matches and not next_to_scan:
considered_rules = list(sorted(to_scan, key=lambda key: key.rule.origin.name))
raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan},
set(to_scan), state=frozenset(i.s for i in to_scan),
considered_rules=considered_rules
)
return next_to_scan
delayed_matches = defaultdict(list)
match = self.term_matcher
terminals = self.lexer_conf.terminals_by_name
# Cache for nodes & tokens created in a particular parse step.
transitives = [{}]
text_line = 1
text_column = 1
## The main Earley loop.
# Run the Prediction/Completion cycle for any Items in the current Earley set.
# Completions will be added to the SPPF tree, and predictions will be recursively
# processed down to terminals/empty nodes to be added to the scanner for the next
# step.
i = 0
for token in stream:
self.predict_and_complete(i, to_scan, columns, transitives)
to_scan = scan(i, to_scan)
if token == '\n':
text_line += 1
text_column = 1
else:
text_column += 1
i += 1
self.predict_and_complete(i, to_scan, columns, transitives)
## Column is now the final column in the parse.
assert i == len(columns)-1
return to_scan
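# Usage sketch (illustrative): this parser backs Lark's dynamic lexing modes.
# `lexer='dynamic'` performs the scanning implemented above, while
# `lexer='dynamic_complete'` additionally tries every prefix of each match
# (i.e. complete_lex=True).
#
#     from lark import Lark
#     parser = Lark('start: /a+/ "b"', parser='earley', lexer='dynamic_complete')
#     print(parser.parse('aab'))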

View File

@@ -0,0 +1,107 @@
"""This is an experimental tool for reconstructing text from a shaped tree, based on a Lark grammar.
"""
from typing import Dict, Callable, Iterable, Optional
from .lark import Lark
from .tree import Tree, ParseTree
from .visitors import Transformer_InPlace
from .lexer import Token, PatternStr, TerminalDef
from .grammar import Terminal, NonTerminal, Symbol
from .tree_matcher import TreeMatcher, is_discarded_terminal
from .utils import is_id_continue
def is_iter_empty(i):
try:
_ = next(i)
return False
except StopIteration:
return True
class WriteTokensTransformer(Transformer_InPlace):
"Inserts discarded tokens into their correct place, according to the rules of grammar"
tokens: Dict[str, TerminalDef]
term_subs: Dict[str, Callable[[Symbol], str]]
def __init__(self, tokens: Dict[str, TerminalDef], term_subs: Dict[str, Callable[[Symbol], str]]) -> None:
self.tokens = tokens
self.term_subs = term_subs
def __default__(self, data, children, meta):
if not getattr(meta, 'match_tree', False):
return Tree(data, children)
iter_args = iter(children)
to_write = []
for sym in meta.orig_expansion:
if is_discarded_terminal(sym):
try:
v = self.term_subs[sym.name](sym)
except KeyError:
t = self.tokens[sym.name]
if not isinstance(t.pattern, PatternStr):
raise NotImplementedError("Reconstructing regexps not supported yet: %s" % t)
v = t.pattern.value
to_write.append(v)
else:
x = next(iter_args)
if isinstance(x, list):
to_write += x
else:
if isinstance(x, Token):
assert Terminal(x.type) == sym, x
else:
assert NonTerminal(x.data) == sym, (sym, x)
to_write.append(x)
assert is_iter_empty(iter_args)
return to_write
class Reconstructor(TreeMatcher):
"""
A Reconstructor that will, given a full parse Tree, generate source code.
Note:
The reconstructor cannot generate values from regexps. If you need to produce discarded
regexp terminals, such as newlines, use `term_subs` to provide default values for them.
Parameters:
parser: a Lark instance
term_subs: a dictionary of [Terminal name as str] to [output text as str]
"""
write_tokens: WriteTokensTransformer
def __init__(self, parser: Lark, term_subs: Optional[Dict[str, Callable[[Symbol], str]]]=None) -> None:
TreeMatcher.__init__(self, parser)
self.write_tokens = WriteTokensTransformer({t.name:t for t in self.tokens}, term_subs or {})
def _reconstruct(self, tree):
unreduced_tree = self.match_tree(tree, tree.data)
res = self.write_tokens.transform(unreduced_tree)
for item in res:
if isinstance(item, Tree):
# TODO use orig_expansion.rulename to support templates
yield from self._reconstruct(item)
else:
yield item
def reconstruct(self, tree: ParseTree, postproc: Optional[Callable[[Iterable[str]], Iterable[str]]]=None, insert_spaces: bool=True) -> str:
x = self._reconstruct(tree)
if postproc:
x = postproc(x)
y = []
prev_item = ''
for item in x:
if insert_spaces and prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]):
y.append(' ')
y.append(item)
prev_item = item
return ''.join(y)
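if __name__ == '__main__':
    # Minimal round-trip sketch (illustrative; the grammar and input below are
    # assumptions, runnable as `python -m lark.reconstruct`).
    _demo_grammar = """
    start: item ("," item)*
    item: WORD
    %import common.WORD
    """
    _demo_parser = Lark(_demo_grammar, maybe_placeholders=False)
    _demo_tree = _demo_parser.parse("a,b,c")
    # The discarded "," tokens are re-inserted from their literal patterns:
    print(Reconstructor(_demo_parser).reconstruct(_demo_tree))  # -> a,b,c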

View File

@@ -0,0 +1,70 @@
import sys
from argparse import ArgumentParser, FileType
from textwrap import indent
from logging import DEBUG, INFO, WARN, ERROR
from typing import Optional
import warnings
from lark import Lark, logger
try:
from interegular import logger as interegular_logger
has_interegular = True
except ImportError:
has_interegular = False
lalr_argparser = ArgumentParser(add_help=False, epilog='Look at the Lark documentation for more info on the options')
flags = [
('d', 'debug'),
'keep_all_tokens',
'regex',
'propagate_positions',
'maybe_placeholders',
'use_bytes'
]
options = ['start', 'lexer']
lalr_argparser.add_argument('-v', '--verbose', action='count', default=0, help="Increase Logger output level, up to three times")
lalr_argparser.add_argument('-s', '--start', action='append', default=[])
lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('basic', 'contextual'))
lalr_argparser.add_argument('-o', '--out', type=FileType('w', encoding='utf-8'), default=sys.stdout, help='the output file (default=stdout)')
lalr_argparser.add_argument('grammar_file', type=FileType('r', encoding='utf-8'), help='A valid .lark file')
for flag in flags:
if isinstance(flag, tuple):
options.append(flag[1])
lalr_argparser.add_argument('-' + flag[0], '--' + flag[1], action='store_true')
elif isinstance(flag, str):
options.append(flag)
lalr_argparser.add_argument('--' + flag, action='store_true')
else:
raise NotImplementedError("flags must only contain strings or tuples of strings")
def build_lalr(namespace):
logger.setLevel((ERROR, WARN, INFO, DEBUG)[min(namespace.verbose, 3)])
if has_interegular:
interegular_logger.setLevel(logger.getEffectiveLevel())
if len(namespace.start) == 0:
namespace.start.append('start')
kwargs = {n: getattr(namespace, n) for n in options}
return Lark(namespace.grammar_file, parser='lalr', **kwargs), namespace.out
def showwarning_as_comment(message, category, filename, lineno, file=None, line=None):
# Based on warnings._showwarnmsg_impl
text = warnings.formatwarning(message, category, filename, lineno, line)
text = indent(text, '# ')
if file is None:
file = sys.stderr
if file is None:
return
try:
file.write(text)
except OSError:
pass
def make_warnings_comments():
warnings.showwarning = showwarning_as_comment
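# Usage sketch (illustrative; `my.lark` is an assumed path): these helpers are
# shared by the CLI tools in this package and can be reused by custom tools.
#
#     from argparse import ArgumentParser
#     ap = ArgumentParser(parents=[lalr_argparser])
#     ns = ap.parse_args(['my.lark'])
#     lark_inst, out = build_lalr(ns)   # -> a Lark instance and the output stream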

View File

@@ -0,0 +1,202 @@
"Converts Nearley grammars to Lark"
import os.path
import sys
import codecs
import argparse
from lark import Lark, Transformer, v_args
nearley_grammar = r"""
start: (ruledef|directive)+
directive: "@" NAME (STRING|NAME)
| "@" JS -> js_code
ruledef: NAME "->" expansions
| NAME REGEXP "->" expansions -> macro
expansions: expansion ("|" expansion)*
expansion: expr+ js
?expr: item (":" /[+*?]/)?
?item: rule|string|regexp|null
| "(" expansions ")"
rule: NAME
string: STRING
regexp: REGEXP
null: "null"
JS: /{%.*?%}/s
js: JS?
NAME: /[a-zA-Z_$]\w*/
COMMENT: /#[^\n]*/
REGEXP: /\[.*?\]/
STRING: _STRING "i"?
%import common.ESCAPED_STRING -> _STRING
%import common.WS
%ignore WS
%ignore COMMENT
"""
nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='basic')
def _get_rulename(name):
name = {'_': '_ws_maybe', '__': '_ws'}.get(name, name)
return 'n_' + name.replace('$', '__DOLLAR__').lower()
@v_args(inline=True)
class NearleyToLark(Transformer):
def __init__(self):
self._count = 0
self.extra_rules = {}
self.extra_rules_rev = {}
self.alias_js_code = {}
def _new_function(self, code):
name = 'alias_%d' % self._count
self._count += 1
self.alias_js_code[name] = code
return name
def _extra_rule(self, rule):
if rule in self.extra_rules_rev:
return self.extra_rules_rev[rule]
name = 'xrule_%d' % len(self.extra_rules)
assert name not in self.extra_rules
self.extra_rules[name] = rule
self.extra_rules_rev[rule] = name
return name
def rule(self, name):
return _get_rulename(name)
def ruledef(self, name, exps):
return '!%s: %s' % (_get_rulename(name), exps)
def expr(self, item, op):
rule = '(%s)%s' % (item, op)
return self._extra_rule(rule)
def regexp(self, r):
return '/%s/' % r
def null(self):
return ''
def string(self, s):
return self._extra_rule(s)
def expansion(self, *x):
x, js = x[:-1], x[-1]
if js.children:
js_code, = js.children
js_code = js_code[2:-2]
alias = '-> ' + self._new_function(js_code)
else:
alias = ''
return ' '.join(x) + alias
def expansions(self, *x):
return '%s' % ('\n |'.join(x))
def start(self, *rules):
return '\n'.join(filter(None, rules))
def _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, includes):
rule_defs = []
tree = nearley_grammar_parser.parse(g)
for statement in tree.children:
if statement.data == 'directive':
directive, arg = statement.children
if directive in ('builtin', 'include'):
folder = builtin_path if directive == 'builtin' else folder_path
path = os.path.join(folder, arg[1:-1])
if path not in includes:
includes.add(path)
with codecs.open(path, encoding='utf8') as f:
text = f.read()
rule_defs += _nearley_to_lark(text, builtin_path, n2l, js_code, os.path.abspath(os.path.dirname(path)), includes)
else:
assert False, directive
elif statement.data == 'js_code':
code, = statement.children
code = code[2:-2]
js_code.append(code)
elif statement.data == 'macro':
pass # TODO Add support for macros!
elif statement.data == 'ruledef':
rule_defs.append(n2l.transform(statement))
else:
raise Exception("Unknown statement: %s" % statement)
return rule_defs
def create_code_for_nearley_grammar(g, start, builtin_path, folder_path, es6=False):
import js2py
emit_code = []
def emit(x=None):
if x:
emit_code.append(x)
emit_code.append('\n')
js_code = ['function id(x) {return x[0];}']
n2l = NearleyToLark()
rule_defs = _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, set())
lark_g = '\n'.join(rule_defs)
lark_g += '\n'+'\n'.join('!%s: %s' % item for item in n2l.extra_rules.items())
emit('from lark import Lark, Transformer')
emit()
emit('grammar = ' + repr(lark_g))
emit()
for alias, code in n2l.alias_js_code.items():
js_code.append('%s = (%s);' % (alias, code))
if es6:
emit(js2py.translate_js6('\n'.join(js_code)))
else:
emit(js2py.translate_js('\n'.join(js_code)))
emit('class TransformNearley(Transformer):')
for alias in n2l.alias_js_code:
emit(" %s = var.get('%s').to_python()" % (alias, alias))
emit(" __default__ = lambda self, n, c, m: c if c else None")
emit()
emit('parser = Lark(grammar, start="n_%s", maybe_placeholders=False)' % start)
emit('def parse(text):')
emit(' return TransformNearley().transform(parser.parse(text))')
return ''.join(emit_code)
def main(fn, start, nearley_lib, es6=False):
with codecs.open(fn, encoding='utf8') as f:
grammar = f.read()
return create_code_for_nearley_grammar(grammar, start, os.path.join(nearley_lib, 'builtin'), os.path.abspath(os.path.dirname(fn)), es6=es6)
def get_arg_parser():
parser = argparse.ArgumentParser(description='Reads a Nearley grammar (with js functions), and outputs an equivalent lark parser.')
parser.add_argument('nearley_grammar', help='Path to the file containing the nearley grammar')
parser.add_argument('start_rule', help='Rule within the nearley grammar to make the base rule')
parser.add_argument('nearley_lib', help='Path to root directory of nearley codebase (used for including builtins)')
parser.add_argument('--es6', help='Enable experimental ES6 support', action='store_true')
return parser
if __name__ == '__main__':
parser = get_arg_parser()
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)
args = parser.parse_args()
print(main(fn=args.nearley_grammar, start=args.start_rule, nearley_lib=args.nearley_lib, es6=args.es6))
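# Example invocation (paths are assumptions):
#
#     python -m lark.tools.nearley grammar.ne start /path/to/nearley > parser.py
#
# The generated module exposes `parse(text)`, which returns the transformed tree.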

View File

@@ -0,0 +1,32 @@
import sys
import json
from lark.grammar import Rule
from lark.lexer import TerminalDef
from lark.tools import lalr_argparser, build_lalr
import argparse
argparser = argparse.ArgumentParser(prog='python -m lark.tools.serialize', parents=[lalr_argparser],
description="Lark Serialization Tool - Stores Lark's internal state & LALR analysis as a JSON file",
epilog='Look at the Lark documentation for more info on the options')
def serialize(lark_inst, outfile):
data, memo = lark_inst.memo_serialize([TerminalDef, Rule])
outfile.write('{\n')
outfile.write(' "data": %s,\n' % json.dumps(data))
outfile.write(' "memo": %s\n' % json.dumps(memo))
outfile.write('}\n')
def main():
if len(sys.argv)==1:
argparser.print_help(sys.stderr)
sys.exit(1)
ns = argparser.parse_args()
serialize(*build_lalr(ns))
if __name__ == '__main__':
main()
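# Example invocation (file names are assumptions):
#
#     python -m lark.tools.serialize my.lark -o my_parser.json
#
# The output is a JSON object with "data" and "memo" keys, as written by
# serialize() above.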

View File

@@ -0,0 +1,196 @@
###{standalone
#
#
# Lark Stand-alone Generator Tool
# ----------------------------------
# Generates a stand-alone LALR(1) parser
#
# Git: https://github.com/erezsh/lark
# Author: Erez Shinan (erezshin@gmail.com)
#
#
# >>> LICENSE
#
# This tool and its generated code use a separate license from Lark,
# and are subject to the terms of the Mozilla Public License, v. 2.0.
# If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
#
# If you wish to purchase a commercial license for this tool and its
# generated code, you may contact me via email or otherwise.
#
# If MPL2 is incompatible with your free or open-source project,
# contact me and we'll work it out.
#
#
from copy import deepcopy
from abc import ABC, abstractmethod
from types import ModuleType
from typing import (
TypeVar, Generic, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any,
Union, Iterable, IO, TYPE_CHECKING, overload, Sequence,
Pattern as REPattern, ClassVar, Set, Mapping
)
###}
import sys
import token, tokenize
import os
from os import path
from collections import defaultdict
from functools import partial
from argparse import ArgumentParser
import lark
from lark.tools import lalr_argparser, build_lalr, make_warnings_comments
from lark.grammar import Rule
from lark.lexer import TerminalDef
_dir = path.dirname(__file__)
_larkdir = path.join(_dir, path.pardir)
EXTRACT_STANDALONE_FILES = [
'tools/standalone.py',
'exceptions.py',
'utils.py',
'tree.py',
'visitors.py',
'grammar.py',
'lexer.py',
'common.py',
'parse_tree_builder.py',
'parsers/lalr_analysis.py',
'parsers/lalr_parser_state.py',
'parsers/lalr_parser.py',
'parsers/lalr_interactive_parser.py',
'parser_frontends.py',
'lark.py',
'indenter.py',
]
def extract_sections(lines):
section = None
text = []
sections = defaultdict(list)
for line in lines:
if line.startswith('###'):
if line[3] == '{':
section = line[4:].strip()
elif line[3] == '}':
sections[section] += text
section = None
text = []
else:
raise ValueError(line)
elif section:
text.append(line)
return {name: ''.join(text) for name, text in sections.items()}
def strip_docstrings(line_gen):
""" Strip comments and docstrings from a file.
Based on code from: https://stackoverflow.com/questions/1769332/script-to-remove-python-comments-docstrings
"""
res = []
prev_toktype = token.INDENT
last_lineno = -1
last_col = 0
tokgen = tokenize.generate_tokens(line_gen)
for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
if slineno > last_lineno:
last_col = 0
if scol > last_col:
res.append(" " * (scol - last_col))
if toktype == token.STRING and prev_toktype == token.INDENT:
# Docstring
res.append("#--")
elif toktype == tokenize.COMMENT:
# Comment
res.append("##\n")
else:
res.append(ttext)
prev_toktype = toktype
last_col = ecol
last_lineno = elineno
return ''.join(res)
def gen_standalone(lark_inst, output=None, out=sys.stdout, compress=False):
if output is None:
output = partial(print, file=out)
import pickle, zlib, base64
def compressed_output(obj):
s = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
c = zlib.compress(s)
output(repr(base64.b64encode(c)))
def output_decompress(name):
output('%(name)s = pickle.loads(zlib.decompress(base64.b64decode(%(name)s)))' % locals())
output('# The file was automatically generated by Lark v%s' % lark.__version__)
output('__version__ = "%s"' % lark.__version__)
output()
for i, pyfile in enumerate(EXTRACT_STANDALONE_FILES):
with open(os.path.join(_larkdir, pyfile)) as f:
code = extract_sections(f)['standalone']
if i: # if not this file
code = strip_docstrings(partial(next, iter(code.splitlines(True))))
output(code)
data, m = lark_inst.memo_serialize([TerminalDef, Rule])
output('import pickle, zlib, base64')
if compress:
output('DATA = (')
compressed_output(data)
output(')')
output_decompress('DATA')
output('MEMO = (')
compressed_output(m)
output(')')
output_decompress('MEMO')
else:
output('DATA = (')
output(data)
output(')')
output('MEMO = (')
output(m)
output(')')
output('Shift = 0')
output('Reduce = 1')
output("def Lark_StandAlone(**kwargs):")
output(" return Lark._load_from_dict(DATA, MEMO, **kwargs)")
def main():
make_warnings_comments()
parser = ArgumentParser(prog='python -m lark.tools.standalone', description="Lark Stand-alone Generator Tool",
parents=[lalr_argparser], epilog='Look at the Lark documentation for more info on the options')
parser.add_argument('-c', '--compress', action='store_true', default=0, help="Enable compression")
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)
ns = parser.parse_args()
lark_inst, out = build_lalr(ns)
gen_standalone(lark_inst, out=out, compress=ns.compress)
ns.out.close()
ns.grammar_file.close()
if __name__ == '__main__':
main()
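# Typical usage (file names are assumptions):
#
#     python -m lark.tools.standalone my.lark > my_parser.py
#
# The generated module is self-contained:
#
#     from my_parser import Lark_StandAlone
#     parser = Lark_StandAlone()
#     tree = parser.parse('2 + 3')   # input depends on the assumed grammar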

View File

@@ -0,0 +1,267 @@
import sys
from copy import deepcopy
from typing import List, Callable, Iterator, Union, Optional, Generic, TypeVar, TYPE_CHECKING
if TYPE_CHECKING:
from .lexer import TerminalDef, Token
try:
import rich
except ImportError:
pass
from typing import Literal
###{standalone
class Meta:
empty: bool
line: int
column: int
start_pos: int
end_line: int
end_column: int
end_pos: int
orig_expansion: 'List[TerminalDef]'
match_tree: bool
def __init__(self):
self.empty = True
_Leaf_T = TypeVar("_Leaf_T")
Branch = Union[_Leaf_T, 'Tree[_Leaf_T]']
class Tree(Generic[_Leaf_T]):
"""The main tree class.
Creates a new tree, and stores "data" and "children" in attributes of the same name.
Trees can be hashed and compared.
Parameters:
data: The name of the rule or alias
children: List of matched sub-rules and terminals
meta: Line & Column numbers (if ``propagate_positions`` is enabled).
meta attributes: (line, column, end_line, end_column, start_pos, end_pos,
container_line, container_column, container_end_line, container_end_column)
container_* attributes consider all symbols, including those that have been inlined in the tree.
For example, in the rule 'a: _A B _C', the regular attributes will mark the start and end of B,
but the container_* attributes will also include _A and _C in the range. However, rules that
contain 'a' will consider it in full, including _A and _C for all attributes.
"""
data: str
children: 'List[Branch[_Leaf_T]]'
def __init__(self, data: str, children: 'List[Branch[_Leaf_T]]', meta: Optional[Meta]=None) -> None:
self.data = data
self.children = children
self._meta = meta
@property
def meta(self) -> Meta:
if self._meta is None:
self._meta = Meta()
return self._meta
def __repr__(self):
return 'Tree(%r, %r)' % (self.data, self.children)
def _pretty_label(self):
return self.data
def _pretty(self, level, indent_str):
yield f'{indent_str*level}{self._pretty_label()}'
if len(self.children) == 1 and not isinstance(self.children[0], Tree):
yield f'\t{self.children[0]}\n'
else:
yield '\n'
for n in self.children:
if isinstance(n, Tree):
yield from n._pretty(level+1, indent_str)
else:
yield f'{indent_str*(level+1)}{n}\n'
def pretty(self, indent_str: str=' ') -> str:
"""Returns an indented string representation of the tree.
Great for debugging.
"""
return ''.join(self._pretty(0, indent_str))
def __rich__(self, parent:Optional['rich.tree.Tree']=None) -> 'rich.tree.Tree':
"""Returns a tree widget for the 'rich' library.
Example:
::
from rich import print
from lark import Tree
tree = Tree('root', ['node1', 'node2'])
print(tree)
"""
return self._rich(parent)
def _rich(self, parent):
if parent:
tree = parent.add(f'[bold]{self.data}[/bold]')
else:
import rich.tree
tree = rich.tree.Tree(self.data)
for c in self.children:
if isinstance(c, Tree):
c._rich(tree)
else:
tree.add(f'[green]{c}[/green]')
return tree
def __eq__(self, other):
try:
return self.data == other.data and self.children == other.children
except AttributeError:
return False
def __ne__(self, other):
return not (self == other)
def __hash__(self) -> int:
return hash((self.data, tuple(self.children)))
def iter_subtrees(self) -> 'Iterator[Tree[_Leaf_T]]':
"""Depth-first iteration.
Iterates over all the subtrees, never returning to the same node twice (Lark's parse-tree is actually a DAG).
"""
queue = [self]
subtrees = dict()
for subtree in queue:
subtrees[id(subtree)] = subtree
queue += [c for c in reversed(subtree.children)
if isinstance(c, Tree) and id(c) not in subtrees]
del queue
return reversed(list(subtrees.values()))
def iter_subtrees_topdown(self):
"""Breadth-first iteration.
Iterates over all the subtrees, returning nodes in the same order as pretty() does.
"""
stack = [self]
stack_append = stack.append
stack_pop = stack.pop
while stack:
node = stack_pop()
if not isinstance(node, Tree):
continue
yield node
for child in reversed(node.children):
stack_append(child)
def find_pred(self, pred: 'Callable[[Tree[_Leaf_T]], bool]') -> 'Iterator[Tree[_Leaf_T]]':
"""Returns all nodes of the tree that evaluate pred(node) as true."""
return filter(pred, self.iter_subtrees())
def find_data(self, data: str) -> 'Iterator[Tree[_Leaf_T]]':
"""Returns all nodes of the tree whose data equals the given data."""
return self.find_pred(lambda t: t.data == data)
###}
def expand_kids_by_data(self, *data_values):
"""Expand (inline) children with any of the given data values. Returns True if anything changed"""
changed = False
for i in range(len(self.children)-1, -1, -1):
child = self.children[i]
if isinstance(child, Tree) and child.data in data_values:
self.children[i:i+1] = child.children
changed = True
return changed
def scan_values(self, pred: 'Callable[[Branch[_Leaf_T]], bool]') -> Iterator[_Leaf_T]:
"""Return all values in the tree that evaluate pred(value) as true.
This can be used to find all the tokens in the tree.
Example:
>>> all_tokens = tree.scan_values(lambda v: isinstance(v, Token))
"""
for c in self.children:
if isinstance(c, Tree):
for t in c.scan_values(pred):
yield t
else:
if pred(c):
yield c
def __deepcopy__(self, memo):
return type(self)(self.data, deepcopy(self.children, memo), meta=self._meta)
def copy(self) -> 'Tree[_Leaf_T]':
return type(self)(self.data, self.children)
def set(self, data: str, children: 'List[Branch[_Leaf_T]]') -> None:
self.data = data
self.children = children
ParseTree = Tree['Token']
class SlottedTree(Tree):
__slots__ = 'data', 'children', 'rule', '_meta'
def pydot__tree_to_png(tree: Tree, filename: str, rankdir: 'Literal["TB", "LR", "BT", "RL"]'="LR", **kwargs) -> None:
graph = pydot__tree_to_graph(tree, rankdir, **kwargs)
graph.write_png(filename)
def pydot__tree_to_dot(tree: Tree, filename, rankdir="LR", **kwargs):
graph = pydot__tree_to_graph(tree, rankdir, **kwargs)
graph.write(filename)
def pydot__tree_to_graph(tree: Tree, rankdir="LR", **kwargs):
"""Creates a colorful image that represents the tree (data+children, without meta)
Possible values for `rankdir` are "TB", "LR", "BT", "RL", corresponding to
directed graphs drawn from top to bottom, from left to right, from bottom to
top, and from right to left, respectively.
`kwargs` can be any graph attribute (e. g. `dpi=200`). For a list of
possible attributes, see https://www.graphviz.org/doc/info/attrs.html.
"""
import pydot # type: ignore[import-not-found]
graph = pydot.Dot(graph_type='digraph', rankdir=rankdir, **kwargs)
i = [0]
def new_leaf(leaf):
node = pydot.Node(i[0], label=repr(leaf))
i[0] += 1
graph.add_node(node)
return node
def _to_pydot(subtree):
color = hash(subtree.data) & 0xffffff
color |= 0x808080
subnodes = [_to_pydot(child) if isinstance(child, Tree) else new_leaf(child)
for child in subtree.children]
node = pydot.Node(i[0], style="filled", fillcolor="#%x" % color, label=subtree.data)
i[0] += 1
graph.add_node(node)
for subnode in subnodes:
graph.add_edge(pydot.Edge(node, subnode))
return node
_to_pydot(tree)
return graph
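if __name__ == '__main__':
    # Minimal sketch of Tree construction and traversal (illustrative only;
    # runnable as `python -m lark.tree`).
    _demo = Tree('start', [Tree('a', ['x']), 'y'])
    print(_demo.pretty(), end='')
    print([t.data for t in _demo.iter_subtrees_topdown()])  # ['start', 'a']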

View File

@@ -0,0 +1,186 @@
"""Tree matcher based on Lark grammar"""
import re
from collections import defaultdict
from . import Tree, Token
from .common import ParserConf
from .parsers import earley
from .grammar import Rule, Terminal, NonTerminal
def is_discarded_terminal(t):
return t.is_term and t.filter_out
class _MakeTreeMatch:
def __init__(self, name, expansion):
self.name = name
self.expansion = expansion
def __call__(self, args):
t = Tree(self.name, args)
t.meta.match_tree = True
t.meta.orig_expansion = self.expansion
return t
def _best_from_group(seq, group_key, cmp_key):
d = {}
for item in seq:
key = group_key(item)
if key in d:
v1 = cmp_key(item)
v2 = cmp_key(d[key])
if v2 > v1:
d[key] = item
else:
d[key] = item
return list(d.values())
def _best_rules_from_group(rules):
rules = _best_from_group(rules, lambda r: r, lambda r: -len(r.expansion))
rules.sort(key=lambda r: len(r.expansion))
return rules
def _match(term, token):
if isinstance(token, Tree):
name, _args = parse_rulename(term.name)
return token.data == name
elif isinstance(token, Token):
return term == Terminal(token.type)
assert False, (term, token)
def make_recons_rule(origin, expansion, old_expansion):
return Rule(origin, expansion, alias=_MakeTreeMatch(origin.name, old_expansion))
def make_recons_rule_to_term(origin, term):
return make_recons_rule(origin, [Terminal(term.name)], [term])
def parse_rulename(s):
"Parse rule names that may contain a template syntax (like rule{a, b, ...})"
name, args_str = re.match(r'(\w+)(?:{(.+)})?', s).groups()
args = args_str and [a.strip() for a in args_str.split(',')]
return name, args
class ChildrenLexer:
def __init__(self, children):
self.children = children
def lex(self, parser_state):
return self.children
class TreeMatcher:
"""Match the elements of a tree node, based on an ontology
provided by a Lark grammar.
Supports templates and inlined rules (`rule{a, b,..}` and `_rule`)
Initialize with an instance of Lark.
"""
def __init__(self, parser):
# XXX TODO calling compile twice returns different results!
assert not parser.options.maybe_placeholders
# XXX TODO: we just ignore the potential existence of a postlexer
self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set())
self.rules_for_root = defaultdict(list)
self.rules = list(self._build_recons_rules(rules))
self.rules.reverse()
# Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation.
self.rules = _best_rules_from_group(self.rules)
self.parser = parser
self._parser_cache = {}
def _build_recons_rules(self, rules):
"Convert tree-parsing/construction rules to tree-matching rules"
expand1s = {r.origin for r in rules if r.options.expand1}
aliases = defaultdict(list)
for r in rules:
if r.alias:
aliases[r.origin].append(r.alias)
rule_names = {r.origin for r in rules}
nonterminals = {sym for sym in rule_names
if sym.name.startswith('_') or sym in expand1s or sym in aliases}
seen = set()
for r in rules:
recons_exp = [sym if sym in nonterminals else Terminal(sym.name)
for sym in r.expansion if not is_discarded_terminal(sym)]
# Skip self-recursive constructs
if recons_exp == [r.origin] and r.alias is None:
continue
sym = NonTerminal(r.alias) if r.alias else r.origin
rule = make_recons_rule(sym, recons_exp, r.expansion)
if sym in expand1s and len(recons_exp) != 1:
self.rules_for_root[sym.name].append(rule)
if sym.name not in seen:
yield make_recons_rule_to_term(sym, sym)
seen.add(sym.name)
else:
if sym.name.startswith('_') or sym in expand1s:
yield rule
else:
self.rules_for_root[sym.name].append(rule)
for origin, rule_aliases in aliases.items():
for alias in rule_aliases:
yield make_recons_rule_to_term(origin, NonTerminal(alias))
yield make_recons_rule_to_term(origin, origin)
def match_tree(self, tree, rulename):
"""Match the elements of `tree` to the symbols of rule `rulename`.
Parameters:
tree (Tree): the tree node to match
rulename (str): The expected full rule name (including template args)
Returns:
Tree: an unreduced tree that matches `rulename`
Raises:
UnexpectedToken: If no match was found.
Note:
It's the caller's responsibility to match the tree recursively.
"""
if rulename:
# validate
name, _args = parse_rulename(rulename)
assert tree.data == name
else:
rulename = tree.data
# TODO: ambiguity?
try:
parser = self._parser_cache[rulename]
except KeyError:
rules = self.rules + _best_rules_from_group(self.rules_for_root[rulename])
# TODO pass callbacks through dict, instead of alias?
callbacks = {rule: rule.alias for rule in rules}
conf = ParserConf(rules, callbacks, [rulename])
parser = earley.Parser(self.parser.lexer_conf, conf, _match, resolve_ambiguity=True)
self._parser_cache[rulename] = parser
# find a full derivation
unreduced_tree = parser.parse(ChildrenLexer(tree.children), rulename)
assert unreduced_tree.data == rulename
return unreduced_tree
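if __name__ == '__main__':
    # Illustrative sanity checks for the template-aware rule-name parser above.
    assert parse_rulename('foo') == ('foo', None)
    assert parse_rulename('foo{a, b}') == ('foo', ['a', 'b'])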

View File

@@ -0,0 +1,180 @@
"""This module defines utilities for matching and translation tree templates.
A tree templates is a tree that contains nodes that are template variables.
"""
from typing import Union, Optional, Mapping, Dict, Tuple, Iterator
from lark import Tree, Transformer
from lark.exceptions import MissingVariableError
Branch = Union[Tree[str], str]
TreeOrCode = Union[Tree[str], str]
MatchResult = Dict[str, Tree]
_TEMPLATE_MARKER = '$'
class TemplateConf:
"""Template Configuration
Allows customization for different uses of Template
parse() must return a Tree instance.
"""
def __init__(self, parse=None):
self._parse = parse
def test_var(self, var: Union[Tree[str], str]) -> Optional[str]:
"""Given a tree node, if it is a template variable return its name. Otherwise, return None.
This method may be overridden for customization
Parameters:
var: Tree | str - The tree node to test
"""
if isinstance(var, str):
return _get_template_name(var)
if (
isinstance(var, Tree)
and var.data == "var"
and len(var.children) > 0
and isinstance(var.children[0], str)
):
return _get_template_name(var.children[0])
return None
def _get_tree(self, template: TreeOrCode) -> Tree[str]:
if isinstance(template, str):
assert self._parse
template = self._parse(template)
if not isinstance(template, Tree):
raise TypeError("template parser must return a Tree instance")
return template
def __call__(self, template: Tree[str]) -> 'Template':
return Template(template, conf=self)
def _match_tree_template(self, template: TreeOrCode, tree: Branch) -> Optional[MatchResult]:
"""Returns dict of {var: match} if found a match, else None
"""
template_var = self.test_var(template)
if template_var:
if not isinstance(tree, Tree):
raise TypeError(f"Template variables can only match Tree instances. Not {tree!r}")
return {template_var: tree}
if isinstance(template, str):
if template == tree:
return {}
return None
assert isinstance(template, Tree) and isinstance(tree, Tree), f"template={template} tree={tree}"
if template.data == tree.data and len(template.children) == len(tree.children):
res = {}
for t1, t2 in zip(template.children, tree.children):
matches = self._match_tree_template(t1, t2)
if matches is None:
return None
res.update(matches)
return res
return None
class _ReplaceVars(Transformer[str, Tree[str]]):
def __init__(self, conf: TemplateConf, vars: Mapping[str, Tree[str]]) -> None:
super().__init__()
self._conf = conf
self._vars = vars
def __default__(self, data, children, meta) -> Tree[str]:
tree = super().__default__(data, children, meta)
var = self._conf.test_var(tree)
if var:
try:
return self._vars[var]
except KeyError:
raise MissingVariableError(f"No mapping for template variable ({var})")
return tree
class Template:
"""Represents a tree template, tied to a specific configuration
A tree template is a tree that contains nodes that are template variables.
Those variables will match any tree.
(future versions may support annotations on the variables, to allow more complex templates)
"""
def __init__(self, tree: Tree[str], conf: TemplateConf = TemplateConf()):
self.conf = conf
self.tree = conf._get_tree(tree)
def match(self, tree: TreeOrCode) -> Optional[MatchResult]:
"""Match a tree template to a tree.
A tree template without variables will only match ``tree`` if it is equal to the template.
Parameters:
tree (Tree): The tree to match to the template
Returns:
Optional[Dict[str, Tree]]: If match is found, returns a dictionary mapping
template variable names to their matching tree nodes.
If no match was found, returns None.
"""
tree = self.conf._get_tree(tree)
return self.conf._match_tree_template(self.tree, tree)
def search(self, tree: TreeOrCode) -> Iterator[Tuple[Tree[str], MatchResult]]:
"""Search for all occurrences of the tree template inside ``tree``.
"""
tree = self.conf._get_tree(tree)
for subtree in tree.iter_subtrees():
res = self.match(subtree)
if res:
yield subtree, res
def apply_vars(self, vars: Mapping[str, Tree[str]]) -> Tree[str]:
"""Apply vars to the template tree
"""
return _ReplaceVars(self.conf, vars).transform(self.tree)
def translate(t1: Template, t2: Template, tree: TreeOrCode):
"""Search tree and translate each occurrence of t1 into t2.
"""
tree = t1.conf._get_tree(tree) # ensure it's a tree, parse if necessary and possible
for subtree, vars in t1.search(tree):
res = t2.apply_vars(vars)
subtree.set(res.data, res.children)
return tree
class TemplateTranslator:
"""Utility class for translating a collection of patterns
"""
def __init__(self, translations: Mapping[Template, Template]):
assert all(isinstance(k, Template) and isinstance(v, Template) for k, v in translations.items())
self.translations = translations
def translate(self, tree: Tree[str]):
for k, v in self.translations.items():
tree = translate(k, v, tree)
return tree
def _get_template_name(value: str) -> Optional[str]:
return value.lstrip(_TEMPLATE_MARKER) if value.startswith(_TEMPLATE_MARKER) else None
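if __name__ == '__main__':
    # Minimal sketch using hand-built trees (illustrative assumptions; no
    # parser involved). Variables are written as Tree('var', ['$name']) nodes.
    def _var(name):
        return Tree('var', [name])
    t1 = Template(Tree('add', [_var('$a'), _var('$b')]))
    t2 = Template(Tree('add', [_var('$b'), _var('$a')]))  # operands swapped
    expr = Tree('add', [Tree('num', ['1']), Tree('num', ['2'])])
    print(t1.match(expr))           # {'a': Tree('num', ['1']), 'b': Tree('num', ['2'])}
    print(translate(t1, t2, expr))  # Tree('add', [Tree('num', ['2']), Tree('num', ['1'])])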

View File

@@ -0,0 +1,343 @@
import unicodedata
import os
from itertools import product
from collections import deque
from typing import Callable, Iterator, List, Optional, Tuple, Type, TypeVar, Union, Dict, Any, Sequence, Iterable, AbstractSet
###{standalone
import sys, re
import logging
logger: logging.Logger = logging.getLogger("lark")
logger.addHandler(logging.StreamHandler())
# Set to highest level, since we have some warnings amongst the code
# By default, we should not output any log messages
logger.setLevel(logging.CRITICAL)
NO_VALUE = object()
T = TypeVar("T")
def classify(seq: Iterable, key: Optional[Callable] = None, value: Optional[Callable] = None) -> Dict:
d: Dict[Any, Any] = {}
for item in seq:
k = key(item) if (key is not None) else item
v = value(item) if (value is not None) else item
try:
d[k].append(v)
except KeyError:
d[k] = [v]
return d
def _deserialize(data: Any, namespace: Dict[str, Any], memo: Dict) -> Any:
if isinstance(data, dict):
if '__type__' in data: # Object
class_ = namespace[data['__type__']]
return class_.deserialize(data, memo)
elif '@' in data:
return memo[data['@']]
return {key:_deserialize(value, namespace, memo) for key, value in data.items()}
elif isinstance(data, list):
return [_deserialize(value, namespace, memo) for value in data]
return data
_T = TypeVar("_T", bound="Serialize")
class Serialize:
"""Safe-ish serialization interface that doesn't rely on Pickle
Attributes:
__serialize_fields__ (List[str]): Fields (aka attributes) to serialize.
__serialize_namespace__ (list): List of classes that deserialization is allowed to instantiate.
Should include all field types that aren't builtin types.
"""
def memo_serialize(self, types_to_memoize: List) -> Any:
memo = SerializeMemoizer(types_to_memoize)
return self.serialize(memo), memo.serialize()
def serialize(self, memo = None) -> Dict[str, Any]:
if memo and memo.in_types(self):
return {'@': memo.memoized.get(self)}
fields = getattr(self, '__serialize_fields__')
res = {f: _serialize(getattr(self, f), memo) for f in fields}
res['__type__'] = type(self).__name__
if hasattr(self, '_serialize'):
self._serialize(res, memo)
return res
@classmethod
def deserialize(cls: Type[_T], data: Dict[str, Any], memo: Dict[int, Any]) -> _T:
namespace = getattr(cls, '__serialize_namespace__', [])
namespace = {c.__name__:c for c in namespace}
fields = getattr(cls, '__serialize_fields__')
if '@' in data:
return memo[data['@']]
inst = cls.__new__(cls)
for f in fields:
try:
setattr(inst, f, _deserialize(data[f], namespace, memo))
except KeyError as e:
raise KeyError("Cannot find key for class", cls, e)
if hasattr(inst, '_deserialize'):
inst._deserialize()
return inst
class SerializeMemoizer(Serialize):
"A version of serialize that memoizes objects to reduce space"
__serialize_fields__ = 'memoized',
def __init__(self, types_to_memoize: List) -> None:
self.types_to_memoize = tuple(types_to_memoize)
self.memoized = Enumerator()
def in_types(self, value: Serialize) -> bool:
return isinstance(value, self.types_to_memoize)
def serialize(self) -> Dict[int, Any]: # type: ignore[override]
return _serialize(self.memoized.reversed(), None)
@classmethod
def deserialize(cls, data: Dict[int, Any], namespace: Dict[str, Any], memo: Dict[Any, Any]) -> Dict[int, Any]: # type: ignore[override]
return _deserialize(data, namespace, memo)
try:
import regex
_has_regex = True
except ImportError:
_has_regex = False
if sys.version_info >= (3, 11):
import re._parser as sre_parse
import re._constants as sre_constants
else:
import sre_parse
import sre_constants
categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')
def get_regexp_width(expr: str) -> Union[Tuple[int, int], List[int]]:
if _has_regex:
# Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with
# a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex
# match below.
regexp_final = re.sub(categ_pattern, 'A', expr)
else:
if re.search(categ_pattern, expr):
raise ImportError('`regex` module must be installed in order to use Unicode categories.', expr)
regexp_final = expr
try:
# Fixed in next version (past 0.960) of typeshed
return [int(x) for x in sre_parse.parse(regexp_final).getwidth()]
except sre_constants.error:
if not _has_regex:
raise ValueError(expr)
else:
# sre_parse does not support the new features in regex. To not completely fail in that case,
# we manually test for the most important info (whether the empty string is matched)
c = regex.compile(regexp_final)
# Python 3.11.7 introduced sre_parse.MAXWIDTH, which is used instead of MAXREPEAT
# See lark-parser/lark#1376 and python/cpython#109859
MAXWIDTH = getattr(sre_parse, "MAXWIDTH", sre_constants.MAXREPEAT)
if c.match('') is None:
# MAXREPEAT is a non-picklable subclass of int, so it needs to be converted to a plain int to enable caching
return 1, int(MAXWIDTH)
else:
return 0, int(MAXWIDTH)
###}
_ID_START = 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'
_ID_CONTINUE = _ID_START + ('Nd', 'Nl',)
def _test_unicode_category(s: str, categories: Sequence[str]) -> bool:
if len(s) != 1:
return all(_test_unicode_category(char, categories) for char in s)
return s == '_' or unicodedata.category(s) in categories
def is_id_continue(s: str) -> bool:
"""
Checks if all characters in `s` are alphanumeric characters (Unicode standard, so diacritics, Indian vowels, non-Latin
numbers, etc. all pass). Synonymous with Python's `ID_CONTINUE` property. See PEP 3131 for details.
"""
return _test_unicode_category(s, _ID_CONTINUE)
def is_id_start(s: str) -> bool:
"""
Checks if all characters in `s` are alphabetic characters (Unicode standard, so diacritics, Indian vowels, non-Latin
letters, etc. all pass). Synonymous with Python's `ID_START` property. See PEP 3131 for details.
"""
return _test_unicode_category(s, _ID_START)
def dedup_list(l: Sequence[T]) -> List[T]:
"""Given a list (l) will removing duplicates from the list,
preserving the original order of the list. Assumes that
the list entries are hashable."""
return list(dict.fromkeys(l))
class Enumerator(Serialize):
def __init__(self) -> None:
self.enums: Dict[Any, int] = {}
def get(self, item) -> int:
if item not in self.enums:
self.enums[item] = len(self.enums)
return self.enums[item]
def __len__(self):
return len(self.enums)
def reversed(self) -> Dict[int, Any]:
r = {v: k for k, v in self.enums.items()}
assert len(r) == len(self.enums)
return r
def combine_alternatives(lists):
"""
Accepts a list of alternatives, and enumerates all their possible concatenations.
Examples:
>>> combine_alternatives([range(2), [4,5]])
[(0, 4), (0, 5), (1, 4), (1, 5)]
>>> combine_alternatives(["abc", "xy", '$'])
[('a', 'x', '$'), ('a', 'y', '$'), ('b', 'x', '$'), ('b', 'y', '$'), ('c', 'x', '$'), ('c', 'y', '$')]
>>> combine_alternatives([])
[[]]
"""
if not lists:
return [[]]
assert all(l for l in lists), lists
return list(product(*lists))
try:
import atomicwrites
_has_atomicwrites = True
except ImportError:
_has_atomicwrites = False
class FS:
exists = staticmethod(os.path.exists)
@staticmethod
def open(name, mode="r", **kwargs):
if _has_atomicwrites and "w" in mode:
return atomicwrites.atomic_write(name, mode=mode, overwrite=True, **kwargs)
else:
return open(name, mode, **kwargs)
class fzset(frozenset):
def __repr__(self):
return '{%s}' % ', '.join(map(repr, self))
def classify_bool(seq: Iterable, pred: Callable) -> Any:
false_elems = []
true_elems = [elem for elem in seq if pred(elem) or false_elems.append(elem)] # type: ignore[func-returns-value]
return true_elems, false_elems
def bfs(initial: Iterable, expand: Callable) -> Iterator:
open_q = deque(list(initial))
visited = set(open_q)
while open_q:
node = open_q.popleft()
yield node
for next_node in expand(node):
if next_node not in visited:
visited.add(next_node)
open_q.append(next_node)
def bfs_all_unique(initial, expand):
"bfs, but doesn't keep track of visited (aka seen), because there can be no repetitions"
open_q = deque(list(initial))
while open_q:
node = open_q.popleft()
yield node
open_q += expand(node)
def _serialize(value: Any, memo: Optional[SerializeMemoizer]) -> Any:
if isinstance(value, Serialize):
return value.serialize(memo)
elif isinstance(value, list):
return [_serialize(elem, memo) for elem in value]
elif isinstance(value, frozenset):
return list(value) # TODO reversible?
elif isinstance(value, dict):
return {key:_serialize(elem, memo) for key, elem in value.items()}
# assert value is None or isinstance(value, (int, float, str, tuple)), value
return value
def small_factors(n: int, max_factor: int) -> List[Tuple[int, int]]:
"""
Splits n up into smaller factors and summands <= max_factor.
Returns a list of [(a, b), ...]
so that the following code returns n:
n = 1
for a, b in values:
n = n * a + b
Currently, we also keep a + b <= max_factor, but that might change
"""
assert n >= 0
assert max_factor > 2
if n <= max_factor:
return [(n, 0)]
for a in range(max_factor, 1, -1):
r, b = divmod(n, a)
if a + b <= max_factor:
return small_factors(r, max_factor) + [(a, b)]
assert False, "Failed to factorize %s" % n
class OrderedSet(AbstractSet[T]):
"""A minimal OrderedSet implementation, using a dictionary.
(relies on the dictionary being ordered)
"""
def __init__(self, items: Iterable[T] =()):
self.d = dict.fromkeys(items)
def __contains__(self, item: Any) -> bool:
return item in self.d
def add(self, item: T):
self.d[item] = None
def __iter__(self) -> Iterator[T]:
return iter(self.d)
def remove(self, item: T):
del self.d[item]
def __bool__(self):
return bool(self.d)
def __len__(self) -> int:
return len(self.d)
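if __name__ == '__main__':
    # Illustrative check of small_factors' documented invariant
    # (n == fold of `acc = acc * a + b` starting from acc == 1).
    for _n in (0, 5, 100, 12345):
        _acc = 1
        for _a, _b in small_factors(_n, 16):
            _acc = _acc * _a + _b
        assert _acc == _n, (_n, _acc)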

View File

@@ -0,0 +1,596 @@
from typing import TypeVar, Tuple, List, Callable, Generic, Type, Union, Optional, Any, cast
from abc import ABC
from .utils import combine_alternatives
from .tree import Tree, Branch
from .exceptions import VisitError, GrammarError
from .lexer import Token
###{standalone
from functools import wraps, update_wrapper
from inspect import getmembers, getmro
_Return_T = TypeVar('_Return_T')
_Return_V = TypeVar('_Return_V')
_Leaf_T = TypeVar('_Leaf_T')
_Leaf_U = TypeVar('_Leaf_U')
_R = TypeVar('_R')
_FUNC = Callable[..., _Return_T]
_DECORATED = Union[_FUNC, type]
class _DiscardType:
"""When the Discard value is returned from a transformer callback,
that node is discarded and won't appear in the parent.
Note:
This feature is disabled when the transformer is provided to Lark
using the ``transformer`` keyword (aka Tree-less LALR mode).
Example:
::
class T(Transformer):
def ignore_tree(self, children):
return Discard
def IGNORE_TOKEN(self, token):
return Discard
"""
def __repr__(self):
return "lark.visitors.Discard"
Discard = _DiscardType()
# Transformers
class _Decoratable:
"Provides support for decorating methods with @v_args"
@classmethod
def _apply_v_args(cls, visit_wrapper):
mro = getmro(cls)
assert mro[0] is cls
libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)}
for name, value in getmembers(cls):
# Make sure the function isn't inherited (unless it's overwritten)
if name.startswith('_') or (name in libmembers and name not in cls.__dict__):
continue
if not callable(value):
continue
# Skip if v_args already applied (at the function level)
if isinstance(cls.__dict__[name], _VArgsWrapper):
continue
setattr(cls, name, _VArgsWrapper(cls.__dict__[name], visit_wrapper))
return cls
def __class_getitem__(cls, _):
return cls
class Transformer(_Decoratable, ABC, Generic[_Leaf_T, _Return_T]):
"""Transformers work bottom-up (or depth-first), starting with visiting the leaves and working
their way up until ending at the root of the tree.
For each node visited, the transformer will call the appropriate method (callbacks), according to the
node's ``data``, and use the returned value to replace the node, thereby creating a new tree structure.
Transformers can be used to implement map & reduce patterns. Because nodes are reduced from leaf to root,
at any point the callbacks may assume the children have already been transformed (if applicable).
If the transformer cannot find a method with the right name, it will instead call ``__default__``, which by
default creates a copy of the node.
To discard a node, return Discard (``lark.visitors.Discard``).
``Transformer`` can do anything ``Visitor`` can do, but because it reconstructs the tree,
it is slightly less efficient.
A transformer without methods essentially performs a non-memoized partial deepcopy.
All these classes implement the transformer interface:
- ``Transformer`` - Recursively transforms the tree. This is the one you probably want.
- ``Transformer_InPlace`` - Non-recursive. Changes the tree in-place instead of returning new instances
- ``Transformer_InPlaceRecursive`` - Recursive. Changes the tree in-place instead of returning new instances
Parameters:
visit_tokens (bool, optional): Should the transformer visit tokens in addition to rules.
Setting this to ``False`` is slightly faster. Defaults to ``True``.
(For processing ignored tokens, use the ``lexer_callbacks`` options)
"""
__visit_tokens__ = True # For backwards compatibility
def __init__(self, visit_tokens: bool=True) -> None:
self.__visit_tokens__ = visit_tokens
def _call_userfunc(self, tree, new_children=None):
# Assumes tree is already transformed
children = new_children if new_children is not None else tree.children
try:
f = getattr(self, tree.data)
except AttributeError:
return self.__default__(tree.data, children, tree.meta)
else:
try:
wrapper = getattr(f, 'visit_wrapper', None)
if wrapper is not None:
return f.visit_wrapper(f, tree.data, children, tree.meta)
else:
return f(children)
except GrammarError:
raise
except Exception as e:
raise VisitError(tree.data, tree, e)
def _call_userfunc_token(self, token):
try:
f = getattr(self, token.type)
except AttributeError:
return self.__default_token__(token)
else:
try:
return f(token)
except GrammarError:
raise
except Exception as e:
raise VisitError(token.type, token, e)
def _transform_children(self, children):
for c in children:
if isinstance(c, Tree):
res = self._transform_tree(c)
elif self.__visit_tokens__ and isinstance(c, Token):
res = self._call_userfunc_token(c)
else:
res = c
if res is not Discard:
yield res
def _transform_tree(self, tree):
children = list(self._transform_children(tree.children))
return self._call_userfunc(tree, children)
def transform(self, tree: Tree[_Leaf_T]) -> _Return_T:
"Transform the given tree, and return the final result"
res = list(self._transform_children([tree]))
if not res:
return None # type: ignore[return-value]
assert len(res) == 1
return res[0]
def __mul__(
self: 'Transformer[_Leaf_T, Tree[_Leaf_U]]',
other: 'Union[Transformer[_Leaf_U, _Return_V], TransformerChain[_Leaf_U, _Return_V,]]'
) -> 'TransformerChain[_Leaf_T, _Return_V]':
"""Chain two transformers together, returning a new transformer.
"""
return TransformerChain(self, other)
def __default__(self, data, children, meta):
"""Default function that is called if there is no attribute matching ``data``
Can be overridden. Defaults to creating a new copy of the tree node (i.e. ``return Tree(data, children, meta)``)
"""
return Tree(data, children, meta)
def __default_token__(self, token):
"""Default function that is called if there is no attribute matching ``token.type``
Can be overridden. Defaults to returning the token as-is.
"""
return token
def merge_transformers(base_transformer=None, **transformers_to_merge):
"""Merge a collection of transformers into the base_transformer, each into its own 'namespace'.
When called, it will collect the methods from each transformer, and assign them to base_transformer,
with their name prefixed with the given keyword, as ``prefix__methodname``.
This function is especially useful for processing grammars that import other grammars,
thereby creating some of their rules in a 'namespace' (i.e. with a consistent name prefix).
In this case, the key for the transformer should match the name of the imported grammar.
Parameters:
base_transformer (Transformer, optional): The transformer that all other transformers will be added to.
**transformers_to_merge: Keyword arguments, in the form of ``name_prefix = transformer``.
Raises:
AttributeError: In case of a name collision in the merged methods
Example:
::
class TBase(Transformer):
def start(self, children):
return children[0] + 'bar'
class TImportedGrammar(Transformer):
def foo(self, children):
return "foo"
composed_transformer = merge_transformers(TBase(), imported=TImportedGrammar())
t = Tree('start', [ Tree('imported__foo', []) ])
assert composed_transformer.transform(t) == 'foobar'
"""
if base_transformer is None:
base_transformer = Transformer()
for prefix, transformer in transformers_to_merge.items():
for method_name in dir(transformer):
method = getattr(transformer, method_name)
if not callable(method):
continue
if method_name.startswith("_") or method_name == "transform":
continue
prefixed_method = prefix + "__" + method_name
if hasattr(base_transformer, prefixed_method):
raise AttributeError("Cannot merge: method '%s' appears more than once" % prefixed_method)
setattr(base_transformer, prefixed_method, method)
return base_transformer
class InlineTransformer(Transformer): # XXX Deprecated
def _call_userfunc(self, tree, new_children=None):
# Assumes tree is already transformed
children = new_children if new_children is not None else tree.children
try:
f = getattr(self, tree.data)
except AttributeError:
return self.__default__(tree.data, children, tree.meta)
else:
return f(*children)
class TransformerChain(Generic[_Leaf_T, _Return_T]):
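    """A chain of transformers, applied in order. Usually created with the ``*`` operator.

    Example (a minimal sketch; ``ToInts`` and ``EvalExpr`` are hypothetical transformers):
        ::

            chain = ToInts() * EvalExpr()
            # Equivalent to: EvalExpr().transform(ToInts().transform(tree))
            result = chain.transform(tree)
    """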
transformers: 'Tuple[Union[Transformer, TransformerChain], ...]'
def __init__(self, *transformers: 'Union[Transformer, TransformerChain]') -> None:
self.transformers = transformers
def transform(self, tree: Tree[_Leaf_T]) -> _Return_T:
for t in self.transformers:
tree = t.transform(tree)
return cast(_Return_T, tree)
def __mul__(
self: 'TransformerChain[_Leaf_T, Tree[_Leaf_U]]',
other: 'Union[Transformer[_Leaf_U, _Return_V], TransformerChain[_Leaf_U, _Return_V]]'
) -> 'TransformerChain[_Leaf_T, _Return_V]':
return TransformerChain(*self.transformers + (other,))
class Transformer_InPlace(Transformer[_Leaf_T, _Return_T]):
"""Same as Transformer, but non-recursive, and changes the tree in-place instead of returning new instances
Useful for huge trees. Conservative in memory.
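
    Example (a minimal sketch; the ``number`` rule name is illustrative):
        ::

            class IntEval(Transformer_InPlace):
                def number(self, children):
                    return int(children[0])

            IntEval().transform(tree)  # mutates ``tree``, and returns the transformed root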
"""
def _transform_tree(self, tree): # Cancel recursion
return self._call_userfunc(tree)
def transform(self, tree: Tree[_Leaf_T]) -> _Return_T:
for subtree in tree.iter_subtrees():
subtree.children = list(self._transform_children(subtree.children))
return self._transform_tree(tree)
class Transformer_NonRecursive(Transformer[_Leaf_T, _Return_T]):
"""Same as Transformer but non-recursive.
Like Transformer, it doesn't change the original tree.
Useful for huge trees.
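
    Example (a minimal sketch; ``pair`` and ``deep_tree`` are illustrative names):
        ::

            class ToTuples(Transformer_NonRecursive):
                def pair(self, children):
                    return tuple(children)

            # Builds the result bottom-up without Python recursion, so even
            # trees deeper than the recursion limit are handled safely.
            result = ToTuples().transform(deep_tree)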
"""
def transform(self, tree: Tree[_Leaf_T]) -> _Return_T:
# Tree to postfix
rev_postfix = []
q: List[Branch[_Leaf_T]] = [tree]
while q:
t = q.pop()
rev_postfix.append(t)
if isinstance(t, Tree):
q += t.children
# Postfix to tree
stack: List = []
for x in reversed(rev_postfix):
if isinstance(x, Tree):
size = len(x.children)
if size:
args = stack[-size:]
del stack[-size:]
else:
args = []
res = self._call_userfunc(x, args)
if res is not Discard:
stack.append(res)
elif self.__visit_tokens__ and isinstance(x, Token):
res = self._call_userfunc_token(x)
if res is not Discard:
stack.append(res)
else:
stack.append(x)
result, = stack # We should have only one tree remaining
        # There are no guarantees on the type of the value produced by calling a user func for a
        # child. This means the type system can't statically know that the final result is
        # _Return_T, so a cast is required.
return cast(_Return_T, result)
class Transformer_InPlaceRecursive(Transformer):
"Same as Transformer, recursive, but changes the tree in-place instead of returning new instances"
def _transform_tree(self, tree):
tree.children = list(self._transform_children(tree.children))
return self._call_userfunc(tree)
# Visitors
class VisitorBase:
def _call_userfunc(self, tree):
return getattr(self, tree.data, self.__default__)(tree)
def __default__(self, tree):
"""Default function that is called if there is no attribute matching ``tree.data``
Can be overridden. Defaults to doing nothing.
"""
return tree
def __class_getitem__(cls, _):
return cls
class Visitor(VisitorBase, ABC, Generic[_Leaf_T]):
"""Tree visitor, non-recursive (can handle huge trees).
Visiting a node calls its methods (provided by the user via inheritance) according to ``tree.data``
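
    Example (a minimal sketch; ``assign`` is a hypothetical rule name, and ``parse_tree``
    is assumed to be a parsed ``Tree``):
        ::

            class CollectNames(Visitor):
                def __init__(self):
                    self.names = []
                def assign(self, tree):
                    self.names.append(str(tree.children[0]))

            v = CollectNames()
            v.visit(parse_tree)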
"""
def visit(self, tree: Tree[_Leaf_T]) -> Tree[_Leaf_T]:
"Visits the tree, starting with the leaves and finally the root (bottom-up)"
for subtree in tree.iter_subtrees():
self._call_userfunc(subtree)
return tree
def visit_topdown(self, tree: Tree[_Leaf_T]) -> Tree[_Leaf_T]:
"Visit the tree, starting at the root, and ending at the leaves (top-down)"
for subtree in tree.iter_subtrees_topdown():
self._call_userfunc(subtree)
return tree
class Visitor_Recursive(VisitorBase, Generic[_Leaf_T]):
"""Bottom-up visitor, recursive.
Visiting a node calls its methods (provided by the user via inheritance) according to ``tree.data``
Slightly faster than the non-recursive version.
"""
def visit(self, tree: Tree[_Leaf_T]) -> Tree[_Leaf_T]:
"Visits the tree, starting with the leaves and finally the root (bottom-up)"
for child in tree.children:
if isinstance(child, Tree):
self.visit(child)
self._call_userfunc(tree)
return tree
    def visit_topdown(self, tree: Tree[_Leaf_T]) -> Tree[_Leaf_T]:
"Visit the tree, starting at the root, and ending at the leaves (top-down)"
self._call_userfunc(tree)
for child in tree.children:
if isinstance(child, Tree):
self.visit_topdown(child)
return tree
class Interpreter(_Decoratable, ABC, Generic[_Leaf_T, _Return_T]):
"""Interpreter walks the tree starting at the root.
Visits the tree, starting with the root and finally the leaves (top-down)
For each tree node, it calls its methods (provided by user via inheritance) according to ``tree.data``.
Unlike ``Transformer`` and ``Visitor``, the Interpreter doesn't automatically visit its sub-branches.
    The user has to explicitly call ``visit``, ``visit_children``, or use the ``@visit_children_decor`` decorator.
This allows the user to implement branching and loops.
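
    Example (a minimal sketch; ``if_stmt`` and its child layout are hypothetical):
        ::

            class Eval(Interpreter):
                def if_stmt(self, tree):
                    cond, then_block, else_block = tree.children
                    # Only one branch is ever visited -- a Transformer can't do this,
                    # since it always transforms all children first.
                    if self.visit(cond):
                        return self.visit(then_block)
                    return self.visit(else_block)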
"""
def visit(self, tree: Tree[_Leaf_T]) -> _Return_T:
        # There are no guarantees on the type of the value produced by calling a user func for a
        # child. So we only annotate the public method, and use an internal method when
        # visiting child trees.
return self._visit_tree(tree)
def _visit_tree(self, tree: Tree[_Leaf_T]):
f = getattr(self, tree.data)
wrapper = getattr(f, 'visit_wrapper', None)
if wrapper is not None:
return f.visit_wrapper(f, tree.data, tree.children, tree.meta)
else:
return f(tree)
def visit_children(self, tree: Tree[_Leaf_T]) -> List:
return [self._visit_tree(child) if isinstance(child, Tree) else child
for child in tree.children]
def __getattr__(self, name):
return self.__default__
def __default__(self, tree):
return self.visit_children(tree)
_InterMethod = Callable[[Type[Interpreter], _Return_T], _R]
def visit_children_decor(func: _InterMethod) -> _InterMethod:
"See Interpreter"
@wraps(func)
def inner(cls, tree):
values = cls.visit_children(tree)
return func(cls, values)
return inner
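
# A minimal usage sketch for ``visit_children_decor`` (the ``add`` rule name is
# illustrative):
#
#     class Calc(Interpreter):
#         @visit_children_decor
#         def add(self, values):
#             # ``values`` are the already-visited children, not the raw subtrees
#             return values[0] + values[1]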
# Decorators
def _apply_v_args(obj, visit_wrapper):
try:
_apply = obj._apply_v_args
except AttributeError:
return _VArgsWrapper(obj, visit_wrapper)
else:
return _apply(visit_wrapper)
class _VArgsWrapper:
"""
A wrapper around a Callable. It delegates `__call__` to the Callable.
    If the Callable has a ``__get__``, that is also delegated, and the resulting function is wrapped.
    Otherwise, we use the original function, mirroring the behaviour of a function without a ``__get__``.
    We also expose the ``visit_wrapper`` attribute, to be used by Transformers.
"""
base_func: Callable
def __init__(self, func: Callable, visit_wrapper: Callable[[Callable, str, list, Any], Any]):
if isinstance(func, _VArgsWrapper):
func = func.base_func
self.base_func = func
self.visit_wrapper = visit_wrapper
update_wrapper(self, func)
def __call__(self, *args, **kwargs):
return self.base_func(*args, **kwargs)
def __get__(self, instance, owner=None):
try:
# Use the __get__ attribute of the type instead of the instance
# to fully mirror the behavior of getattr
g = type(self.base_func).__get__
except AttributeError:
return self
else:
return _VArgsWrapper(g(self.base_func, instance, owner), self.visit_wrapper)
def __set_name__(self, owner, name):
try:
f = type(self.base_func).__set_name__
except AttributeError:
return
else:
f(self.base_func, owner, name)
def _vargs_inline(f, _data, children, _meta):
return f(*children)
def _vargs_meta_inline(f, _data, children, meta):
return f(meta, *children)
def _vargs_meta(f, _data, children, meta):
return f(meta, children)
def _vargs_tree(f, data, children, meta):
return f(Tree(data, children, meta))
def v_args(inline: bool = False, meta: bool = False, tree: bool = False, wrapper: Optional[Callable] = None) -> Callable[[_DECORATED], _DECORATED]:
"""A convenience decorator factory for modifying the behavior of user-supplied visitor methods.
By default, callback methods of transformers/visitors accept one argument - a list of the node's children.
``v_args`` can modify this behavior. When used on a transformer/visitor class definition,
it applies to all the callback methods inside it.
``v_args`` can be applied to a single method, or to an entire class. When applied to both,
the options given to the method take precedence.
Parameters:
inline (bool, optional): Children are provided as ``*args`` instead of a list argument (not recommended for very long lists).
meta (bool, optional): Provides two arguments: ``meta`` and ``children`` (instead of just the latter)
tree (bool, optional): Provides the entire tree as the argument, instead of the children.
wrapper (function, optional): Provide a function to decorate all methods.
Example:
::
@v_args(inline=True)
class SolveArith(Transformer):
def add(self, left, right):
return left + right
@v_args(meta=True)
def mul(self, meta, children):
logger.info(f'mul at line {meta.line}')
left, right = children
return left * right
class ReverseNotation(Transformer_InPlace):
@v_args(tree=True)
def tree_node(self, tree):
tree.children = tree.children[::-1]
"""
if tree and (meta or inline):
raise ValueError("Visitor functions cannot combine 'tree' with 'meta' or 'inline'.")
func = None
if meta:
if inline:
func = _vargs_meta_inline
else:
func = _vargs_meta
elif inline:
func = _vargs_inline
elif tree:
func = _vargs_tree
if wrapper is not None:
if func is not None:
raise ValueError("Cannot use 'wrapper' along with 'tree', 'meta' or 'inline'.")
func = wrapper
def _visitor_args_dec(obj):
return _apply_v_args(obj, func)
return _visitor_args_dec
###}
# --- Visitor Utilities ---
class CollapseAmbiguities(Transformer):
"""
    Transforms a tree that contains any number of _ambig nodes into a list of trees,
    each one representing a single unambiguous alternative.
    The length of the resulting list is the product of the number of options in each _ambig node.
Warning: This may quickly explode for highly ambiguous trees.
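
    Example (a minimal sketch; ``ambiguous_tree`` is assumed to come from a parser
    created with ``ambiguity='explicit'``):
        ::

            for option in CollapseAmbiguities().transform(ambiguous_tree):
                print(option.pretty())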
"""
def _ambig(self, options):
return sum(options, [])
def __default__(self, data, children_lists, meta):
return [Tree(data, children, meta) for children in combine_alternatives(children_lists)]
def __default_token__(self, t):
return [t]