This commit is contained in:
lz_db
2025-11-16 12:31:03 +08:00
commit 0fab423a18
1451 changed files with 743213 additions and 0 deletions


@@ -0,0 +1,340 @@
"""This module implements a CYK parser."""
# Author: https://github.com/ehudt (2018)
#
# Adapted by Erez
from collections import defaultdict
import itertools
from ..exceptions import ParseError
from ..lexer import Token
from ..tree import Tree
from ..grammar import Terminal as T, NonTerminal as NT, Symbol
def match(t, s):
assert isinstance(t, T)
return t.name == s.type
class Rule:
"""Context-free grammar rule."""
def __init__(self, lhs, rhs, weight, alias):
super(Rule, self).__init__()
assert isinstance(lhs, NT), lhs
assert all(isinstance(x, NT) or isinstance(x, T) for x in rhs), rhs
self.lhs = lhs
self.rhs = rhs
self.weight = weight
self.alias = alias
def __str__(self):
return '%s -> %s' % (str(self.lhs), ' '.join(str(x) for x in self.rhs))
def __repr__(self):
return str(self)
def __hash__(self):
return hash((self.lhs, tuple(self.rhs)))
def __eq__(self, other):
return self.lhs == other.lhs and self.rhs == other.rhs
def __ne__(self, other):
return not (self == other)
class Grammar:
"""Context-free grammar."""
def __init__(self, rules):
self.rules = frozenset(rules)
def __eq__(self, other):
return self.rules == other.rules
def __str__(self):
return '\n' + '\n'.join(sorted(repr(x) for x in self.rules)) + '\n'
def __repr__(self):
return str(self)
# Parse tree data structures
class RuleNode:
"""A node in the parse tree, which also contains the full rhs rule."""
def __init__(self, rule, children, weight=0):
self.rule = rule
self.children = children
self.weight = weight
def __repr__(self):
return 'RuleNode(%s, [%s])' % (repr(self.rule.lhs), ', '.join(str(x) for x in self.children))
class Parser:
"""Parser wrapper."""
def __init__(self, rules):
super(Parser, self).__init__()
self.orig_rules = {rule: rule for rule in rules}
rules = [self._to_rule(rule) for rule in rules]
self.grammar = to_cnf(Grammar(rules))
def _to_rule(self, lark_rule):
"""Converts a lark rule, (lhs, rhs, callback, options), to a Rule."""
assert isinstance(lark_rule.origin, NT)
assert all(isinstance(x, Symbol) for x in lark_rule.expansion)
return Rule(
lark_rule.origin, lark_rule.expansion,
weight=lark_rule.options.priority if lark_rule.options.priority else 0,
alias=lark_rule)
def parse(self, tokenized, start): # pylint: disable=invalid-name
"""Parses input, which is a list of tokens."""
assert start
start = NT(start)
table, trees = _parse(tokenized, self.grammar)
# Check if the parse succeeded.
if all(r.lhs != start for r in table[(0, len(tokenized) - 1)]):
raise ParseError('Parsing failed.')
parse = trees[(0, len(tokenized) - 1)][start]
return self._to_tree(revert_cnf(parse))
def _to_tree(self, rule_node):
"""Converts a RuleNode parse tree to a lark Tree."""
orig_rule = self.orig_rules[rule_node.rule.alias]
children = []
for child in rule_node.children:
if isinstance(child, RuleNode):
children.append(self._to_tree(child))
else:
assert isinstance(child.name, Token)
children.append(child.name)
t = Tree(orig_rule.origin, children)
        t.rule = orig_rule
return t
def print_parse(node, indent=0):
    if isinstance(node, RuleNode):
        print(' ' * (indent * 2) + str(node.rule.lhs))
        for child in node.children:
            print_parse(child, indent + 1)
    else:
        # Leaves are Terminal instances wrapping the matched token (see _parse).
        print(' ' * (indent * 2) + str(node.name))
def _parse(s, g):
"""Parses sentence 's' using CNF grammar 'g'."""
# The CYK table. Indexed with a 2-tuple: (start pos, end pos)
table = defaultdict(set)
# Top-level structure is similar to the CYK table. Each cell is a dict from
# rule name to the best (lightest) tree for that rule.
trees = defaultdict(dict)
# Populate base case with existing terminal production rules
for i, w in enumerate(s):
for terminal, rules in g.terminal_rules.items():
if match(terminal, w):
for rule in rules:
table[(i, i)].add(rule)
if (rule.lhs not in trees[(i, i)] or
rule.weight < trees[(i, i)][rule.lhs].weight):
trees[(i, i)][rule.lhs] = RuleNode(rule, [T(w)], weight=rule.weight)
# Iterate over lengths of sub-sentences
for l in range(2, len(s) + 1):
# Iterate over sub-sentences with the given length
for i in range(len(s) - l + 1):
# Choose partition of the sub-sentence in [1, l)
for p in range(i + 1, i + l):
span1 = (i, p - 1)
span2 = (p, i + l - 1)
for r1, r2 in itertools.product(table[span1], table[span2]):
for rule in g.nonterminal_rules.get((r1.lhs, r2.lhs), []):
table[(i, i + l - 1)].add(rule)
r1_tree = trees[span1][r1.lhs]
r2_tree = trees[span2][r2.lhs]
rule_total_weight = rule.weight + r1_tree.weight + r2_tree.weight
if (rule.lhs not in trees[(i, i + l - 1)]
or rule_total_weight < trees[(i, i + l - 1)][rule.lhs].weight):
trees[(i, i + l - 1)][rule.lhs] = RuleNode(rule, [r1_tree, r2_tree], weight=rule_total_weight)
return table, trees
# This section implements a context-free grammar converter to Chomsky
# normal form. It also implements a conversion of parse trees from CNF
# back to the original grammar.
# Overview:
# Applies the following operations in this order:
# * TERM: Eliminates non-solitary terminals from all rules
# * BIN: Eliminates rules with more than 2 symbols on their right-hand side.
# * UNIT: Eliminates non-terminal unit rules
#
# The following grammar characteristics aren't supported:
# * Start symbol appearing on the RHS
# * Empty rules (epsilon rules)
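# Illustrative example (not from the original file): given a rule
#     x : a B c        (lowercase = non-terminal, uppercase = terminal)
# TERM first replaces the embedded terminal B with a fresh '__T_'-prefixed
# non-terminal plus a new rule producing B; BIN then binarizes the 3-symbol
# body through a fresh '__SP_'-prefixed non-terminal; and UNIT inlines any
# remaining unit rules like 'x : y' as UnitSkipRules, so that revert_cnf and
# unroll_unit_skiprule (below) can later undo all three transformations.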
class CnfWrapper:
"""CNF wrapper for grammar.
Validates that the input grammar is CNF and provides helper data structures.
"""
def __init__(self, grammar):
super(CnfWrapper, self).__init__()
self.grammar = grammar
self.rules = grammar.rules
self.terminal_rules = defaultdict(list)
self.nonterminal_rules = defaultdict(list)
for r in self.rules:
# Validate that the grammar is CNF and populate auxiliary data structures.
assert isinstance(r.lhs, NT), r
if len(r.rhs) not in [1, 2]:
raise ParseError("CYK doesn't support empty rules")
if len(r.rhs) == 1 and isinstance(r.rhs[0], T):
self.terminal_rules[r.rhs[0]].append(r)
elif len(r.rhs) == 2 and all(isinstance(x, NT) for x in r.rhs):
self.nonterminal_rules[tuple(r.rhs)].append(r)
else:
assert False, r
def __eq__(self, other):
return self.grammar == other.grammar
def __repr__(self):
return repr(self.grammar)
class UnitSkipRule(Rule):
"""A rule that records NTs that were skipped during transformation."""
def __init__(self, lhs, rhs, skipped_rules, weight, alias):
super(UnitSkipRule, self).__init__(lhs, rhs, weight, alias)
self.skipped_rules = skipped_rules
def __eq__(self, other):
return isinstance(other, type(self)) and self.skipped_rules == other.skipped_rules
__hash__ = Rule.__hash__
def build_unit_skiprule(unit_rule, target_rule):
skipped_rules = []
if isinstance(unit_rule, UnitSkipRule):
skipped_rules += unit_rule.skipped_rules
skipped_rules.append(target_rule)
if isinstance(target_rule, UnitSkipRule):
skipped_rules += target_rule.skipped_rules
return UnitSkipRule(unit_rule.lhs, target_rule.rhs, skipped_rules,
weight=unit_rule.weight + target_rule.weight, alias=unit_rule.alias)
def get_any_nt_unit_rule(g):
"""Returns a non-terminal unit rule from 'g', or None if there is none."""
for rule in g.rules:
if len(rule.rhs) == 1 and isinstance(rule.rhs[0], NT):
return rule
return None
def _remove_unit_rule(g, rule):
"""Removes 'rule' from 'g' without changing the language produced by 'g'."""
new_rules = [x for x in g.rules if x != rule]
refs = [x for x in g.rules if x.lhs == rule.rhs[0]]
new_rules += [build_unit_skiprule(rule, ref) for ref in refs]
return Grammar(new_rules)
def _split(rule):
"""Splits a rule whose len(rhs) > 2 into shorter rules."""
rule_str = str(rule.lhs) + '__' + '_'.join(str(x) for x in rule.rhs)
rule_name = '__SP_%s' % (rule_str) + '_%d'
yield Rule(rule.lhs, [rule.rhs[0], NT(rule_name % 1)], weight=rule.weight, alias=rule.alias)
for i in range(1, len(rule.rhs) - 2):
yield Rule(NT(rule_name % i), [rule.rhs[i], NT(rule_name % (i + 1))], weight=0, alias='Split')
yield Rule(NT(rule_name % (len(rule.rhs) - 2)), rule.rhs[-2:], weight=0, alias='Split')
def _term(g):
"""Applies the TERM rule on 'g' (see top comment)."""
all_t = {x for rule in g.rules for x in rule.rhs if isinstance(x, T)}
t_rules = {t: Rule(NT('__T_%s' % str(t)), [t], weight=0, alias='Term') for t in all_t}
new_rules = []
for rule in g.rules:
if len(rule.rhs) > 1 and any(isinstance(x, T) for x in rule.rhs):
new_rhs = [t_rules[x].lhs if isinstance(x, T) else x for x in rule.rhs]
new_rules.append(Rule(rule.lhs, new_rhs, weight=rule.weight, alias=rule.alias))
new_rules.extend(v for k, v in t_rules.items() if k in rule.rhs)
else:
new_rules.append(rule)
return Grammar(new_rules)
def _bin(g):
"""Applies the BIN rule to 'g' (see top comment)."""
new_rules = []
for rule in g.rules:
if len(rule.rhs) > 2:
new_rules += _split(rule)
else:
new_rules.append(rule)
return Grammar(new_rules)
def _unit(g):
"""Applies the UNIT rule to 'g' (see top comment)."""
nt_unit_rule = get_any_nt_unit_rule(g)
while nt_unit_rule:
g = _remove_unit_rule(g, nt_unit_rule)
nt_unit_rule = get_any_nt_unit_rule(g)
return g
def to_cnf(g):
"""Creates a CNF grammar from a general context-free grammar 'g'."""
g = _unit(_bin(_term(g)))
return CnfWrapper(g)
def unroll_unit_skiprule(lhs, orig_rhs, skipped_rules, children, weight, alias):
if not skipped_rules:
return RuleNode(Rule(lhs, orig_rhs, weight=weight, alias=alias), children, weight=weight)
else:
weight = weight - skipped_rules[0].weight
return RuleNode(
Rule(lhs, [skipped_rules[0].lhs], weight=weight, alias=alias), [
unroll_unit_skiprule(skipped_rules[0].lhs, orig_rhs,
skipped_rules[1:], children,
skipped_rules[0].weight, skipped_rules[0].alias)
], weight=weight)
def revert_cnf(node):
"""Reverts a parse tree (RuleNode) to its original non-CNF form (Node)."""
if isinstance(node, T):
return node
# Reverts TERM rule.
if node.rule.lhs.name.startswith('__T_'):
return node.children[0]
else:
children = []
for child in map(revert_cnf, node.children):
# Reverts BIN rule.
if isinstance(child, RuleNode) and child.rule.lhs.name.startswith('__SP_'):
children += child.children
else:
children.append(child)
# Reverts UNIT rule.
if isinstance(node.rule, UnitSkipRule):
return unroll_unit_skiprule(node.rule.lhs, node.rule.rhs,
node.rule.skipped_rules, children,
node.rule.weight, node.rule.alias)
else:
return RuleNode(node.rule, children)
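# Illustrative usage sketch: a minimal end-to-end run of the machinery above.
# This block is not part of the library proper; it assumes tokens only need a
# `.type` attribute (which is all that `match` inspects) and uses the toy
# grammar  s : s PLUS a | a ;  a : NUM.
if __name__ == '__main__':
    from collections import namedtuple
    Tok = namedtuple('Tok', 'type')
    demo_rules = [
        Rule(NT('s'), [NT('s'), T('PLUS'), NT('a')], weight=0, alias='add'),
        Rule(NT('s'), [NT('a')], weight=0, alias='promote'),
        Rule(NT('a'), [T('NUM')], weight=0, alias='num'),
    ]
    cnf = to_cnf(Grammar(demo_rules))
    table, trees = _parse([Tok('NUM'), Tok('PLUS'), Tok('NUM')], cnf)
    best = trees[(0, 2)][NT('s')]    # lightest tree spanning the whole input
    print_parse(revert_cnf(best))    # prints the recovered non-CNF parse tree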


@@ -0,0 +1,314 @@
"""This module implements an Earley parser.
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
https://www.sciencedirect.com/science/article/pii/S1571066108001497
That is probably the best reference for understanding the algorithm here.
The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html
"""
from typing import TYPE_CHECKING, Callable, Optional, List, Any
from collections import deque
from ..lexer import Token
from ..tree import Tree
from ..exceptions import UnexpectedEOF, UnexpectedToken
from ..utils import logger, OrderedSet, dedup_list
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Item
from .earley_forest import ForestSumVisitor, SymbolNode, StableSymbolNode, TokenNode, ForestToParseTree
if TYPE_CHECKING:
from ..common import LexerConf, ParserConf
class Parser:
lexer_conf: 'LexerConf'
parser_conf: 'ParserConf'
debug: bool
def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
resolve_ambiguity: bool=True, debug: bool=False,
tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True):
analysis = GrammarAnalyzer(parser_conf)
self.lexer_conf = lexer_conf
self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity
self.debug = debug
self.Tree = tree_class
self.Set = OrderedSet if ordered_sets else set
self.SymbolNode = StableSymbolNode if ordered_sets else SymbolNode
self.FIRST = analysis.FIRST
self.NULLABLE = analysis.NULLABLE
self.callbacks = parser_conf.callbacks
# TODO add typing info
self.predictions = {} # type: ignore[var-annotated]
## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
# the slow 'isupper' in is_terminal.
self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }
self.forest_sum_visitor = None
for rule in parser_conf.rules:
if rule.origin not in self.predictions:
self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]
## Detect if any rules/terminals have priorities set. If the user specified priority = None, then
# the priorities will be stripped from all rules/terminals before they reach us, allowing us to
# skip the extra tree walk. We'll also skip this if the user just didn't specify priorities
# on any rules/terminals.
if self.forest_sum_visitor is None and rule.options.priority is not None:
self.forest_sum_visitor = ForestSumVisitor
# Check terminals for priorities
# Ignore terminal priorities if the basic lexer is used
if self.lexer_conf.lexer_type != 'basic' and self.forest_sum_visitor is None:
for term in self.lexer_conf.terminals:
if term.priority:
self.forest_sum_visitor = ForestSumVisitor
break
self.term_matcher = term_matcher
def predict_and_complete(self, i, to_scan, columns, transitives):
"""The core Earley Predictor and Completer.
        At each stage of the input, we handle any completed items (things
        that matched on the last cycle) and use those to predict what should
        come next in the input stream. The completions and any predicted
        non-terminals are recursively processed until we reach a set of items
        expecting terminals, which can then be added to the scan list for the
        next scanner cycle."""
        # Held Completions (H in E. Scott's paper).
node_cache = {}
held_completions = {}
column = columns[i]
# R (items) = Ei (column.items)
items = deque(column)
while items:
item = items.pop() # remove an element, A say, from R
### The Earley completer
if item.is_complete: ### (item.s == string)
if item.node is None:
label = (item.s, item.start, i)
item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
item.node.add_family(item.s, item.rule, item.start, None, None)
# create_leo_transitives(item.rule.origin, item.start)
###R Joop Leo right recursion Completer
if item.rule.origin in transitives[item.start]:
transitive = transitives[item.start][item.s]
if transitive.previous in transitives[transitive.column]:
root_transitive = transitives[transitive.column][transitive.previous]
else:
root_transitive = transitive
new_item = Item(transitive.rule, transitive.ptr, transitive.start)
label = (root_transitive.s, root_transitive.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
new_item.node.add_path(root_transitive, item.node)
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
to_scan.add(new_item)
elif new_item not in column:
# Add (B :: aC.B, h, y) to Ei and R
column.add(new_item)
items.append(new_item)
###R Regular Earley completer
else:
# Empty has 0 length. If we complete an empty symbol in a particular
# parse step, we need to be able to use that same empty symbol to complete
# any predictions that result, that themselves require empty. Avoids
# infinite recursion on empty symbols.
# held_completions is 'H' in E.Scott's paper.
is_empty_item = item.start == i
if is_empty_item:
held_completions[item.rule.origin] = item.node
originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
for originator in originators:
new_item = originator.advance()
label = (new_item.s, originator.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
to_scan.add(new_item)
elif new_item not in column:
# Add (B :: aC.B, h, y) to Ei and R
column.add(new_item)
items.append(new_item)
### The Earley predictor
elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
new_items = []
for rule in self.predictions[item.expect]:
new_item = Item(rule, 0, i)
new_items.append(new_item)
# Process any held completions (H).
if item.expect in held_completions:
new_item = item.advance()
label = (new_item.s, item.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
new_items.append(new_item)
for new_item in new_items:
if new_item.expect in self.TERMINALS:
to_scan.add(new_item)
elif new_item not in column:
column.add(new_item)
items.append(new_item)
def _parse(self, lexer, columns, to_scan, start_symbol=None):
def is_quasi_complete(item):
if item.is_complete:
return True
quasi = item.advance()
while not quasi.is_complete:
if quasi.expect not in self.NULLABLE:
return False
if quasi.rule.origin == start_symbol and quasi.expect == start_symbol:
return False
quasi = quasi.advance()
return True
# def create_leo_transitives(origin, start):
# ... # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420
def scan(i, token, to_scan):
"""The core Earley Scanner.
This is a custom implementation of the scanner that uses the
Lark lexer to match tokens. The scan list is built by the
Earley predictor, based on the previously completed tokens.
This ensures that at each phase of the parse we have a custom
lexer context, allowing for more complex ambiguities."""
next_to_scan = self.Set()
next_set = self.Set()
columns.append(next_set)
transitives.append({})
node_cache = {}
for item in self.Set(to_scan):
if match(item.expect, token):
new_item = item.advance()
label = (new_item.s, new_item.start, i)
# 'terminals' may not contain token.type when using %declare
# Additionally, token is not always a Token
# For example, it can be a Tree when using TreeMatcher
term = terminals.get(token.type) if isinstance(token, Token) else None
# Set the priority of the token node to 0 so that the
# terminal priorities do not affect the Tree chosen by
# ForestSumVisitor after the basic lexer has already
# "used up" the terminal priorities
token_node = TokenNode(token, term, priority=0)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)
if new_item.expect in self.TERMINALS:
# add (B ::= Aai+1.B, h, y) to Q'
next_to_scan.add(new_item)
else:
# add (B ::= Aa+1.B, h, y) to Ei+1
next_set.add(new_item)
if not next_set and not next_to_scan:
expect = {i.expect.name for i in to_scan}
raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))
return next_to_scan
# Define parser functions
match = self.term_matcher
terminals = self.lexer_conf.terminals_by_name
# Cache for nodes & tokens created in a particular parse step.
transitives = [{}]
## The main Earley loop.
# Run the Prediction/Completion cycle for any Items in the current Earley set.
# Completions will be added to the SPPF tree, and predictions will be recursively
# processed down to terminals/empty nodes to be added to the scanner for the next
# step.
expects = {i.expect for i in to_scan}
i = 0
for token in lexer.lex(expects):
self.predict_and_complete(i, to_scan, columns, transitives)
to_scan = scan(i, token, to_scan)
i += 1
expects.clear()
expects |= {i.expect for i in to_scan}
self.predict_and_complete(i, to_scan, columns, transitives)
## Column is now the final column in the parse.
assert i == len(columns)-1
return to_scan
def parse(self, lexer, start):
assert start, start
start_symbol = NonTerminal(start)
columns = [self.Set()]
to_scan = self.Set() # The scan buffer. 'Q' in E.Scott's paper.
## Predict for the start_symbol.
# Add predicted items to the first Earley set (for the predictor) if they
# result in a non-terminal, or the scanner if they result in a terminal.
for rule in self.predictions[start_symbol]:
item = Item(rule, 0, 0)
if item.expect in self.TERMINALS:
to_scan.add(item)
else:
columns[0].add(item)
to_scan = self._parse(lexer, columns, to_scan, start_symbol)
# If the parse was successful, the start
# symbol should have been completed in the last step of the Earley cycle, and will be in
# this column. Find the item for the start_symbol, which is the root of the SPPF tree.
solutions = dedup_list(n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0)
if not solutions:
expected_terminals = [t.expect.name for t in to_scan]
raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))
if self.debug:
from .earley_forest import ForestToPyDotVisitor
try:
debug_walker = ForestToPyDotVisitor()
except ImportError:
logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
else:
for i, s in enumerate(solutions):
debug_walker.visit(s, f"sppf{i}.png")
if self.Tree is not None:
# Perform our SPPF -> AST conversion
transformer = ForestToParseTree(self.Tree, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity)
solutions = [transformer.transform(s) for s in solutions]
if len(solutions) > 1:
t: Tree = self.Tree('_ambig', solutions)
t.expand_kids_by_data('_ambig') # solutions may themselves be _ambig nodes
return t
return solutions[0]
# return the root of the SPPF
# TODO return a list of solutions, or join them together somehow
return solutions[0]
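# Illustrative usage sketch: this Parser is normally constructed by lark's
# public API rather than directly. A minimal, hypothetical way to exercise
# the Earley machinery above, assuming the `lark` package is importable:
if __name__ == '__main__':
    from lark import Lark
    demo = Lark("""
        start: ab
        ab: "a" "b" | a "b"
        a: "a"
    """, parser='earley', ambiguity='explicit')
    # Both alternatives of `ab` derive "ab", so the result contains an
    # `_ambig` node holding one subtree per derivation.
    print(demo.parse("ab").pretty())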


@@ -0,0 +1,42 @@
"""This module implements useful building blocks for the Earley parser
"""
class Item:
"An Earley Item, the atom of the algorithm."
__slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash')
def __init__(self, rule, ptr, start):
self.is_complete = len(rule.expansion) == ptr
self.rule = rule # rule
self.ptr = ptr # ptr
self.start = start # j
self.node = None # w
if self.is_complete:
self.s = rule.origin
self.expect = None
self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
else:
self.s = (rule, ptr)
self.expect = rule.expansion[ptr]
self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
self._hash = hash((self.s, self.start, self.rule))
def advance(self):
return Item(self.rule, self.ptr + 1, self.start)
def __eq__(self, other):
return self is other or (self.s == other.s and self.start == other.start and self.rule == other.rule)
def __hash__(self):
return self._hash
def __repr__(self):
before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] )
after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] )
symbol = "{} ::= {}* {}".format(self.rule.origin.name, ' '.join(before), ' '.join(after))
return '%s (%d)' % (symbol, self.start)
# class TransitiveItem(Item):
# ... # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420
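# Illustrative sketch: Item's dotted-rule behaviour, using a minimal stand-in
# for lark's Rule (only `.origin` and `.expansion` of named symbols are
# consulted here; the rule itself is hypothetical).
if __name__ == '__main__':
    from collections import namedtuple
    _Sym = namedtuple('_Sym', 'name')
    _Rule = namedtuple('_Rule', 'origin expansion')
    rule = _Rule(_Sym('sum'), (_Sym('sum'), _Sym('PLUS'), _Sym('product')))
    item = Item(rule, 0, 0)
    while not item.is_complete:
        print(item)              # e.g. "sum ::=  * sum PLUS product (0)"
        item = item.advance()
    print(item.is_complete)      # True once the dot is past the last symbol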

View File

@@ -0,0 +1,801 @@
""""This module implements an SPPF implementation
This is used as the primary output mechanism for the Earley parser
in order to store complex ambiguities.
Full reference and more details is here:
https://web.archive.org/web/20190616123959/http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
"""
from typing import Type, AbstractSet
from random import randint
from collections import deque
from operator import attrgetter
from importlib import import_module
from functools import partial
from ..parse_tree_builder import AmbiguousIntermediateExpander
from ..visitors import Discard
from ..utils import logger, OrderedSet
from ..tree import Tree
class ForestNode:
pass
class SymbolNode(ForestNode):
"""
A Symbol Node represents a symbol (or Intermediate LR0).
Symbol nodes are keyed by the symbol (s). For intermediate nodes
s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol
nodes, s will be a string representing the non-terminal origin (i.e.
the left hand side of the rule).
The children of a Symbol or Intermediate Node will always be Packed Nodes;
with each Packed Node child representing a single derivation of a production.
Hence a Symbol Node with a single child is unambiguous.
Parameters:
s: A Symbol, or a tuple of (rule, ptr) for an intermediate node.
start: For dynamic lexers, the index of the start of the substring matched by this symbol (inclusive).
end: For dynamic lexers, the index of the end of the substring matched by this symbol (exclusive).
Properties:
is_intermediate: True if this node is an intermediate node.
priority: The priority of the node's symbol.
"""
Set: Type[AbstractSet] = set # Overridden by StableSymbolNode
__slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate')
def __init__(self, s, start, end):
self.s = s
self.start = start
self.end = end
self._children = self.Set()
self.paths = self.Set()
self.paths_loaded = False
### We use inf here as it can be safely negated without resorting to conditionals,
# unlike None or float('NaN'), and sorts appropriately.
self.priority = float('-inf')
self.is_intermediate = isinstance(s, tuple)
def add_family(self, lr0, rule, start, left, right):
self._children.add(PackedNode(self, lr0, rule, start, left, right))
def add_path(self, transitive, node):
self.paths.add((transitive, node))
def load_paths(self):
for transitive, node in self.paths:
if transitive.next_titem is not None:
vn = type(self)(transitive.next_titem.s, transitive.next_titem.start, self.end)
vn.add_path(transitive.next_titem, node)
self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, vn)
else:
self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, node)
self.paths_loaded = True
@property
def is_ambiguous(self):
"""Returns True if this node is ambiguous."""
return len(self.children) > 1
@property
def children(self):
"""Returns a list of this node's children sorted from greatest to
least priority."""
if not self.paths_loaded:
self.load_paths()
return sorted(self._children, key=attrgetter('sort_key'))
def __iter__(self):
return iter(self._children)
def __repr__(self):
if self.is_intermediate:
rule = self.s[0]
ptr = self.s[1]
before = ( expansion.name for expansion in rule.expansion[:ptr] )
after = ( expansion.name for expansion in rule.expansion[ptr:] )
symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
else:
symbol = self.s.name
return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority)
class StableSymbolNode(SymbolNode):
"A version of SymbolNode that uses OrderedSet for output stability"
Set = OrderedSet
class PackedNode(ForestNode):
"""
A Packed Node represents a single derivation in a symbol node.
Parameters:
rule: The rule associated with this node.
parent: The parent of this node.
left: The left child of this node. ``None`` if one does not exist.
right: The right child of this node. ``None`` if one does not exist.
priority: The priority of this node.
"""
__slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash')
def __init__(self, parent, s, rule, start, left, right):
self.parent = parent
self.s = s
self.start = start
self.rule = rule
self.left = left
self.right = right
self.priority = float('-inf')
self._hash = hash((self.left, self.right))
@property
def is_empty(self):
return self.left is None and self.right is None
@property
def sort_key(self):
"""
Used to sort PackedNode children of SymbolNodes.
A SymbolNode has multiple PackedNodes if it matched
ambiguously. Hence, we use the sort order to identify
the order in which ambiguous children should be considered.
"""
return self.is_empty, -self.priority, self.rule.order
@property
def children(self):
"""Returns a list of this node's children."""
return [x for x in [self.left, self.right] if x is not None]
def __iter__(self):
yield self.left
yield self.right
def __eq__(self, other):
if not isinstance(other, PackedNode):
return False
return self is other or (self.left == other.left and self.right == other.right)
def __hash__(self):
return self._hash
def __repr__(self):
if isinstance(self.s, tuple):
rule = self.s[0]
ptr = self.s[1]
before = ( expansion.name for expansion in rule.expansion[:ptr] )
after = ( expansion.name for expansion in rule.expansion[ptr:] )
symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
else:
symbol = self.s.name
return "({}, {}, {}, {})".format(symbol, self.start, self.priority, self.rule.order)
class TokenNode(ForestNode):
"""
A Token Node represents a matched terminal and is always a leaf node.
Parameters:
token: The Token associated with this node.
term: The TerminalDef matched by the token.
priority: The priority of this node.
"""
__slots__ = ('token', 'term', 'priority', '_hash')
def __init__(self, token, term, priority=None):
self.token = token
self.term = term
if priority is not None:
self.priority = priority
else:
self.priority = term.priority if term is not None else 0
self._hash = hash(token)
def __eq__(self, other):
if not isinstance(other, TokenNode):
return False
return self is other or (self.token == other.token)
def __hash__(self):
return self._hash
def __repr__(self):
return repr(self.token)
class ForestVisitor:
"""
An abstract base class for building forest visitors.
This class performs a controllable depth-first walk of an SPPF.
The visitor will not enter cycles and will backtrack if one is encountered.
Subclasses are notified of cycles through the ``on_cycle`` method.
Behavior for visit events is defined by overriding the
``visit*node*`` functions.
    The walk is controlled by the return values of the ``visit*node_in``
    methods: returning a node or an iterable of nodes schedules them to be
    visited, and the visitor begins to backtrack if nothing is returned.
Parameters:
single_visit: If ``True``, non-Token nodes will only be visited once.
"""
def __init__(self, single_visit=False):
self.single_visit = single_visit
def visit_token_node(self, node):
"""Called when a ``Token`` is visited. ``Token`` nodes are always leaves."""
pass
def visit_symbol_node_in(self, node):
"""Called when a symbol node is visited. Nodes that are returned
will be scheduled to be visited. If ``visit_intermediate_node_in``
is not implemented, this function will be called for intermediate
nodes as well."""
pass
def visit_symbol_node_out(self, node):
"""Called after all nodes returned from a corresponding ``visit_symbol_node_in``
call have been visited. If ``visit_intermediate_node_out``
is not implemented, this function will be called for intermediate
nodes as well."""
pass
def visit_packed_node_in(self, node):
"""Called when a packed node is visited. Nodes that are returned
will be scheduled to be visited. """
pass
def visit_packed_node_out(self, node):
"""Called after all nodes returned from a corresponding ``visit_packed_node_in``
call have been visited."""
pass
def on_cycle(self, node, path):
"""Called when a cycle is encountered.
Parameters:
node: The node that causes a cycle.
path: The list of nodes being visited: nodes that have been
entered but not exited. The first element is the root in a forest
visit, and the last element is the node visited most recently.
``path`` should be treated as read-only.
"""
pass
def get_cycle_in_path(self, node, path):
"""A utility function for use in ``on_cycle`` to obtain a slice of
``path`` that only contains the nodes that make up the cycle."""
index = len(path) - 1
while id(path[index]) != id(node):
index -= 1
return path[index:]
def visit(self, root):
# Visiting is a list of IDs of all symbol/intermediate nodes currently in
# the stack. It serves two purposes: to detect when we 'recurse' in and out
# of a symbol/intermediate so that we can process both up and down. Also,
# since the SPPF can have cycles it allows us to detect if we're trying
# to recurse into a node that's already on the stack (infinite recursion).
visiting = set()
# set of all nodes that have been visited
visited = set()
# a list of nodes that are currently being visited
# used for the `on_cycle` callback
path = []
# We do not use recursion here to walk the Forest due to the limited
# stack size in python. Therefore input_stack is essentially our stack.
input_stack = deque([root])
# It is much faster to cache these as locals since they are called
# many times in large parses.
vpno = getattr(self, 'visit_packed_node_out')
vpni = getattr(self, 'visit_packed_node_in')
vsno = getattr(self, 'visit_symbol_node_out')
vsni = getattr(self, 'visit_symbol_node_in')
vino = getattr(self, 'visit_intermediate_node_out', vsno)
vini = getattr(self, 'visit_intermediate_node_in', vsni)
vtn = getattr(self, 'visit_token_node')
oc = getattr(self, 'on_cycle')
while input_stack:
current = next(reversed(input_stack))
try:
next_node = next(current)
except StopIteration:
input_stack.pop()
continue
except TypeError:
### If the current object is not an iterator, pass through to Token/SymbolNode
pass
else:
if next_node is None:
continue
if id(next_node) in visiting:
oc(next_node, path)
continue
input_stack.append(next_node)
continue
if isinstance(current, TokenNode):
vtn(current.token)
input_stack.pop()
continue
current_id = id(current)
if current_id in visiting:
if isinstance(current, PackedNode):
vpno(current)
elif current.is_intermediate:
vino(current)
else:
vsno(current)
input_stack.pop()
path.pop()
visiting.remove(current_id)
visited.add(current_id)
elif self.single_visit and current_id in visited:
input_stack.pop()
else:
visiting.add(current_id)
path.append(current)
if isinstance(current, PackedNode):
next_node = vpni(current)
elif current.is_intermediate:
next_node = vini(current)
else:
next_node = vsni(current)
if next_node is None:
continue
if not isinstance(next_node, ForestNode):
next_node = iter(next_node)
elif id(next_node) in visiting:
oc(next_node, path)
continue
input_stack.append(next_node)
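# Illustrative sketch: a minimal ForestVisitor subclass that counts ambiguous
# symbol/intermediate nodes, showing the scheduling protocol described in the
# class docstring above. The final usage line is hypothetical.
class _AmbiguityCounter(ForestVisitor):
    def __init__(self):
        super(_AmbiguityCounter, self).__init__(single_visit=True)
        self.count = 0
    def visit_symbol_node_in(self, node):
        if node.is_ambiguous:
            self.count += 1
        return node.children    # returning children schedules their visit
    def visit_packed_node_in(self, node):
        return node.children    # descend into the left/right derivation
# usage: counter = _AmbiguityCounter(); counter.visit(sppf_root); counter.count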
class ForestTransformer(ForestVisitor):
"""The base class for a bottom-up forest transformation. Most users will
want to use ``TreeForestTransformer`` instead as it has a friendlier
interface and covers most use cases.
Transformations are applied via inheritance and overriding of the
``transform*node`` methods.
``transform_token_node`` receives a ``Token`` as an argument.
All other methods receive the node that is being transformed and
a list of the results of the transformations of that node's children.
    The return values of these methods are the resulting transformations.
If ``Discard`` is raised in a node's transformation, no data from that node
will be passed to its parent's transformation.
"""
def __init__(self):
super(ForestTransformer, self).__init__()
# results of transformations
self.data = dict()
# used to track parent nodes
self.node_stack = deque()
def transform(self, root):
"""Perform a transformation on an SPPF."""
self.node_stack.append('result')
self.data['result'] = []
self.visit(root)
assert len(self.data['result']) <= 1
if self.data['result']:
return self.data['result'][0]
def transform_symbol_node(self, node, data):
"""Transform a symbol node."""
return node
def transform_intermediate_node(self, node, data):
"""Transform an intermediate node."""
return node
def transform_packed_node(self, node, data):
"""Transform a packed node."""
return node
def transform_token_node(self, node):
"""Transform a ``Token``."""
return node
def visit_symbol_node_in(self, node):
self.node_stack.append(id(node))
self.data[id(node)] = []
return node.children
def visit_packed_node_in(self, node):
self.node_stack.append(id(node))
self.data[id(node)] = []
return node.children
def visit_token_node(self, node):
transformed = self.transform_token_node(node)
if transformed is not Discard:
self.data[self.node_stack[-1]].append(transformed)
def _visit_node_out_helper(self, node, method):
self.node_stack.pop()
transformed = method(node, self.data[id(node)])
if transformed is not Discard:
self.data[self.node_stack[-1]].append(transformed)
del self.data[id(node)]
def visit_symbol_node_out(self, node):
self._visit_node_out_helper(node, self.transform_symbol_node)
def visit_intermediate_node_out(self, node):
self._visit_node_out_helper(node, self.transform_intermediate_node)
def visit_packed_node_out(self, node):
self._visit_node_out_helper(node, self.transform_packed_node)
class ForestSumVisitor(ForestVisitor):
"""
A visitor for prioritizing ambiguous parts of the Forest.
This visitor is used when support for explicit priorities on
rules is requested (whether normal, or invert). It walks the
forest (or subsets thereof) and cascades properties upwards
from the leaves.
It would be ideal to do this during parsing, however this would
require processing each Earley item multiple times. That's
a big performance drawback; so running a forest walk is the
lesser of two evils: there can be significantly more Earley
items created during parsing than there are SPPF nodes in the
final tree.
"""
def __init__(self):
super(ForestSumVisitor, self).__init__(single_visit=True)
def visit_packed_node_in(self, node):
yield node.left
yield node.right
def visit_symbol_node_in(self, node):
return iter(node.children)
def visit_packed_node_out(self, node):
priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options.priority else 0
priority += getattr(node.right, 'priority', 0)
priority += getattr(node.left, 'priority', 0)
node.priority = priority
def visit_symbol_node_out(self, node):
node.priority = max(child.priority for child in node.children)
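# Illustrative note: for a packed node with rule priority 2 whose left and
# right subtrees carry priorities 1 and 3, visit_packed_node_out above assigns
# 2 + 1 + 3 = 6, and the enclosing symbol node then takes the max over all of
# its packed children, cascading priorities up to the root.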
class PackedData:
    """Used in transformations of packed nodes to distinguish the data
    that comes from the left child and the right child.
    """
class _NoData():
pass
NO_DATA = _NoData()
def __init__(self, node, data):
self.left = self.NO_DATA
self.right = self.NO_DATA
if data:
if node.left is not None:
self.left = data[0]
if len(data) > 1:
self.right = data[1]
else:
self.right = data[0]
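    # E.g. when a packed node's left child is None, a single-element `data`
    # list is attributed to the right child; when the left child exists,
    # data[0] is the left result and data[1] (if present) the right one.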
class ForestToParseTree(ForestTransformer):
"""Used by the earley parser when ambiguity equals 'resolve' or
'explicit'. Transforms an SPPF into an (ambiguous) parse tree.
Parameters:
tree_class: The tree class to use for construction
callbacks: A dictionary of rules to functions that output a tree
prioritizer: A ``ForestVisitor`` that manipulates the priorities of ForestNodes
resolve_ambiguity: If True, ambiguities will be resolved based on
priorities. Otherwise, `_ambig` nodes will be in the resulting tree.
use_cache: If True, the results of packed node transformations will be cached.
"""
def __init__(self, tree_class=Tree, callbacks=dict(), prioritizer=ForestSumVisitor(), resolve_ambiguity=True, use_cache=True):
super(ForestToParseTree, self).__init__()
self.tree_class = tree_class
self.callbacks = callbacks
self.prioritizer = prioritizer
self.resolve_ambiguity = resolve_ambiguity
self._use_cache = use_cache
self._cache = {}
self._on_cycle_retreat = False
self._cycle_node = None
self._successful_visits = set()
def visit(self, root):
if self.prioritizer:
self.prioritizer.visit(root)
super(ForestToParseTree, self).visit(root)
self._cache = {}
def on_cycle(self, node, path):
logger.debug("Cycle encountered in the SPPF at node: %s. "
"As infinite ambiguities cannot be represented in a tree, "
"this family of derivations will be discarded.", node)
self._cycle_node = node
self._on_cycle_retreat = True
def _check_cycle(self, node):
if self._on_cycle_retreat:
if id(node) == id(self._cycle_node) or id(node) in self._successful_visits:
self._cycle_node = None
self._on_cycle_retreat = False
else:
return Discard
def _collapse_ambig(self, children):
new_children = []
for child in children:
if hasattr(child, 'data') and child.data == '_ambig':
new_children += child.children
else:
new_children.append(child)
return new_children
def _call_rule_func(self, node, data):
# called when transforming children of symbol nodes
# data is a list of trees or tokens that correspond to the
# symbol's rule expansion
return self.callbacks[node.rule](data)
def _call_ambig_func(self, node, data):
# called when transforming a symbol node
# data is a list of trees where each tree's data is
# equal to the name of the symbol or one of its aliases.
if len(data) > 1:
return self.tree_class('_ambig', data)
elif data:
return data[0]
return Discard
def transform_symbol_node(self, node, data):
if id(node) not in self._successful_visits:
return Discard
r = self._check_cycle(node)
if r is Discard:
return r
self._successful_visits.remove(id(node))
data = self._collapse_ambig(data)
return self._call_ambig_func(node, data)
def transform_intermediate_node(self, node, data):
if id(node) not in self._successful_visits:
return Discard
r = self._check_cycle(node)
if r is Discard:
return r
self._successful_visits.remove(id(node))
if len(data) > 1:
children = [self.tree_class('_inter', c) for c in data]
return self.tree_class('_iambig', children)
return data[0]
def transform_packed_node(self, node, data):
r = self._check_cycle(node)
if r is Discard:
return r
if self.resolve_ambiguity and id(node.parent) in self._successful_visits:
return Discard
if self._use_cache and id(node) in self._cache:
return self._cache[id(node)]
children = []
assert len(data) <= 2
data = PackedData(node, data)
if data.left is not PackedData.NO_DATA:
if node.left.is_intermediate and isinstance(data.left, list):
children += data.left
else:
children.append(data.left)
if data.right is not PackedData.NO_DATA:
children.append(data.right)
if node.parent.is_intermediate:
return self._cache.setdefault(id(node), children)
return self._cache.setdefault(id(node), self._call_rule_func(node, children))
def visit_symbol_node_in(self, node):
super(ForestToParseTree, self).visit_symbol_node_in(node)
if self._on_cycle_retreat:
return
return node.children
def visit_packed_node_in(self, node):
self._on_cycle_retreat = False
to_visit = super(ForestToParseTree, self).visit_packed_node_in(node)
if not self.resolve_ambiguity or id(node.parent) not in self._successful_visits:
if not self._use_cache or id(node) not in self._cache:
return to_visit
def visit_packed_node_out(self, node):
super(ForestToParseTree, self).visit_packed_node_out(node)
if not self._on_cycle_retreat:
self._successful_visits.add(id(node.parent))
def handles_ambiguity(func):
"""Decorator for methods of subclasses of ``TreeForestTransformer``.
Denotes that the method should receive a list of transformed derivations."""
func.handles_ambiguity = True
return func
class TreeForestTransformer(ForestToParseTree):
"""A ``ForestTransformer`` with a tree ``Transformer``-like interface.
By default, it will construct a tree.
Methods provided via inheritance are called based on the rule/symbol
names of nodes in the forest.
Methods that act on rules will receive a list of the results of the
transformations of the rule's children. By default, trees and tokens.
Methods that act on tokens will receive a token.
Alternatively, methods that act on rules may be annotated with
``handles_ambiguity``. In this case, the function will receive a list
of all the transformations of all the derivations of the rule.
By default, a list of trees where each tree.data is equal to the
rule name or one of its aliases.
Non-tree transformations are made possible by override of
``__default__``, ``__default_token__``, and ``__default_ambig__``.
Note:
Tree shaping features such as inlined rules and token filtering are
not built into the transformation. Positions are also not propagated.
Parameters:
tree_class: The tree class to use for construction
prioritizer: A ``ForestVisitor`` that manipulates the priorities of nodes in the SPPF.
resolve_ambiguity: If True, ambiguities will be resolved based on priorities.
use_cache (bool): If True, caches the results of some transformations,
potentially improving performance when ``resolve_ambiguity==False``.
            Only use if you know what you are doing: i.e. all transformation
            functions are pure and referentially transparent.
"""
def __init__(self, tree_class=Tree, prioritizer=ForestSumVisitor(), resolve_ambiguity=True, use_cache=False):
super(TreeForestTransformer, self).__init__(tree_class, dict(), prioritizer, resolve_ambiguity, use_cache)
def __default__(self, name, data):
"""Default operation on tree (for override).
Returns a tree with name with data as children.
"""
return self.tree_class(name, data)
    def __default_ambig__(self, name, data):
        """Default operation on ambiguous rule (for override).
        Wraps data in an '_ambig' node if it contains more than
        one element.
        """
if len(data) > 1:
return self.tree_class('_ambig', data)
elif data:
return data[0]
return Discard
def __default_token__(self, node):
"""Default operation on ``Token`` (for override).
Returns ``node``.
"""
return node
def transform_token_node(self, node):
return getattr(self, node.type, self.__default_token__)(node)
def _call_rule_func(self, node, data):
name = node.rule.alias or node.rule.options.template_source or node.rule.origin.name
user_func = getattr(self, name, self.__default__)
if user_func == self.__default__ or hasattr(user_func, 'handles_ambiguity'):
user_func = partial(self.__default__, name)
if not self.resolve_ambiguity:
wrapper = partial(AmbiguousIntermediateExpander, self.tree_class)
user_func = wrapper(user_func)
return user_func(data)
def _call_ambig_func(self, node, data):
name = node.s.name
user_func = getattr(self, name, self.__default_ambig__)
if user_func == self.__default_ambig__ or not hasattr(user_func, 'handles_ambiguity'):
user_func = partial(self.__default_ambig__, name)
return user_func(data)
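# Illustrative sketch: a TreeForestTransformer subclass in the style the class
# docstring describes. The rule name `sentence` is hypothetical; the final
# usage line assumes an SPPF root from an earley parse.
class _DemoTransformer(TreeForestTransformer):
    @handles_ambiguity
    def sentence(self, derivations):
        # receives one transformed tree per derivation of rule `sentence`
        return derivations[0]
    def __default_token__(self, token):
        return token.upper()    # uppercase every matched terminal
# usage: _DemoTransformer(resolve_ambiguity=False).transform(sppf_root)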
class ForestToPyDotVisitor(ForestVisitor):
"""
A Forest visitor which writes the SPPF to a PNG.
The SPPF can get really large, really quickly because
of the amount of meta-data it stores, so this is probably
only useful for trivial trees and learning how the SPPF
is structured.
"""
def __init__(self, rankdir="TB"):
super(ForestToPyDotVisitor, self).__init__(single_visit=True)
self.pydot = import_module('pydot')
self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir)
def visit(self, root, filename):
super(ForestToPyDotVisitor, self).visit(root)
try:
self.graph.write_png(filename)
except FileNotFoundError as e:
logger.error("Could not write png: ", e)
def visit_token_node(self, node):
graph_node_id = str(id(node))
graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"'))
graph_node_color = 0x808080
graph_node_style = "\"filled,rounded\""
graph_node_shape = "diamond"
graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
self.graph.add_node(graph_node)
def visit_packed_node_in(self, node):
graph_node_id = str(id(node))
graph_node_label = repr(node)
graph_node_color = 0x808080
graph_node_style = "filled"
graph_node_shape = "diamond"
graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
self.graph.add_node(graph_node)
yield node.left
yield node.right
def visit_packed_node_out(self, node):
graph_node_id = str(id(node))
graph_node = self.graph.get_node(graph_node_id)[0]
for child in [node.left, node.right]:
if child is not None:
child_graph_node_id = str(id(child.token if isinstance(child, TokenNode) else child))
child_graph_node = self.graph.get_node(child_graph_node_id)[0]
self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))
else:
#### Try and be above the Python object ID range; probably impl. specific, but maybe this is okay.
child_graph_node_id = str(randint(100000000000000000000000000000,123456789012345678901234567890))
child_graph_node_style = "invis"
child_graph_node = self.pydot.Node(child_graph_node_id, style=child_graph_node_style, label="None")
child_edge_style = "invis"
self.graph.add_node(child_graph_node)
self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node, style=child_edge_style))
def visit_symbol_node_in(self, node):
graph_node_id = str(id(node))
graph_node_label = repr(node)
graph_node_color = 0x808080
graph_node_style = "\"filled\""
if node.is_intermediate:
graph_node_shape = "ellipse"
else:
graph_node_shape = "rectangle"
graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
self.graph.add_node(graph_node)
return iter(node.children)
def visit_symbol_node_out(self, node):
graph_node_id = str(id(node))
graph_node = self.graph.get_node(graph_node_id)[0]
for child in node.children:
child_graph_node_id = str(id(child))
child_graph_node = self.graph.get_node(child_graph_node_id)[0]
self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))


@@ -0,0 +1,203 @@
"Provides for superficial grammar analysis."
from collections import Counter, defaultdict
from typing import List, Dict, Iterator, FrozenSet, Set
from ..utils import bfs, fzset, classify
from ..exceptions import GrammarError
from ..grammar import Rule, Terminal, NonTerminal, Symbol
from ..common import ParserConf
class RulePtr:
__slots__ = ('rule', 'index')
rule: Rule
index: int
def __init__(self, rule: Rule, index: int):
assert isinstance(rule, Rule)
assert index <= len(rule.expansion)
self.rule = rule
self.index = index
def __repr__(self):
before = [x.name for x in self.rule.expansion[:self.index]]
after = [x.name for x in self.rule.expansion[self.index:]]
return '<%s : %s * %s>' % (self.rule.origin.name, ' '.join(before), ' '.join(after))
@property
def next(self) -> Symbol:
return self.rule.expansion[self.index]
def advance(self, sym: Symbol) -> 'RulePtr':
assert self.next == sym
return RulePtr(self.rule, self.index+1)
@property
def is_satisfied(self) -> bool:
return self.index == len(self.rule.expansion)
def __eq__(self, other) -> bool:
if not isinstance(other, RulePtr):
return NotImplemented
return self.rule == other.rule and self.index == other.index
def __hash__(self) -> int:
return hash((self.rule, self.index))
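# Illustrative sketch: RulePtr as a dotted rule, built with this module's own
# Rule/Terminal/NonTerminal classes (assuming Rule's default constructor
# arguments for order/alias/options).
if __name__ == '__main__':
    _r = Rule(NonTerminal('x'), [Terminal('A'), Terminal('B')])
    _p = RulePtr(_r, 0)                 # repr: <x :  * A B>
    _p = _p.advance(Terminal('A'))      # repr: <x : A * B>
    assert not _p.is_satisfied and _p.next == Terminal('B')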
State = FrozenSet[RulePtr]
# state generation ensures no duplicate LR0ItemSets
class LR0ItemSet:
__slots__ = ('kernel', 'closure', 'transitions', 'lookaheads')
kernel: State
closure: State
transitions: Dict[Symbol, 'LR0ItemSet']
lookaheads: Dict[Symbol, Set[Rule]]
def __init__(self, kernel, closure):
self.kernel = fzset(kernel)
self.closure = fzset(closure)
self.transitions = {}
self.lookaheads = defaultdict(set)
def __repr__(self):
return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure]))
def update_set(set1, set2):
if not set2 or set1 > set2:
return False
copy = set(set1)
set1 |= set2
return set1 != copy
def calculate_sets(rules):
"""Calculate FOLLOW sets.
Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}
# foreach grammar rule X ::= Y(1) ... Y(k)
# if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
# NULLABLE = NULLABLE union {X}
# for i = 1 to k
# if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
# FIRST(X) = FIRST(X) union FIRST(Y(i))
# for j = i+1 to k
# if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
# if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
# until none of NULLABLE,FIRST,FOLLOW changed in last iteration
NULLABLE = set()
FIRST = {}
FOLLOW = {}
for sym in symbols:
FIRST[sym]={sym} if sym.is_term else set()
FOLLOW[sym]=set()
# Calculate NULLABLE and FIRST
changed = True
while changed:
changed = False
for rule in rules:
if set(rule.expansion) <= NULLABLE:
if update_set(NULLABLE, {rule.origin}):
changed = True
for i, sym in enumerate(rule.expansion):
if set(rule.expansion[:i]) <= NULLABLE:
if update_set(FIRST[rule.origin], FIRST[sym]):
changed = True
else:
break
# Calculate FOLLOW
changed = True
while changed:
changed = False
for rule in rules:
for i, sym in enumerate(rule.expansion):
if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE:
if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
changed = True
for j in range(i+1, len(rule.expansion)):
if set(rule.expansion[i+1:j]) <= NULLABLE:
if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
changed = True
return FIRST, FOLLOW, NULLABLE
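# Illustrative sketch: calculate_sets on a tiny grammar, using minimal
# stand-ins for lark's Rule/Symbol (only `.origin`, `.expansion` and
# `.is_term` are consulted by the function above).
if __name__ == '__main__':
    from collections import namedtuple
    _S = namedtuple('_S', 'name is_term')
    _R = namedtuple('_R', 'origin expansion')
    A, B = _S('A', False), _S('B', False)
    X, Y = _S('X', True), _S('Y', True)
    FIRST, FOLLOW, NULLABLE = calculate_sets([
        _R(A, (X, B)),   # A -> X B
        _R(B, (Y,)),     # B -> Y
        _R(B, ()),       # B -> (empty)
    ])
    assert NULLABLE == {B} and FIRST[A] == {X} and FOLLOW[X] == {Y}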
class GrammarAnalyzer:
def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
self.debug = debug
self.strict = strict
root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
for start in parser_conf.start}
rules = parser_conf.rules + list(root_rules.values())
self.rules_by_origin: Dict[NonTerminal, List[Rule]] = classify(rules, lambda r: r.origin)
if len(rules) != len(set(rules)):
duplicates = [item for item, count in Counter(rules).items() if count > 1]
raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates))
for r in rules:
for sym in r.expansion:
if not (sym.is_term or sym in self.rules_by_origin):
raise GrammarError("Using an undefined rule: %s" % sym)
self.start_states = {start: self.expand_rule(root_rule.origin)
for start, root_rule in root_rules.items()}
self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))})
for start, root_rule in root_rules.items()}
lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)])
for start in parser_conf.start}
lr0_rules = parser_conf.rules + list(lr0_root_rules.values())
assert(len(lr0_rules) == len(set(lr0_rules)))
self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin)
# cache RulePtr(r, 0) in r (no duplicate RulePtr objects)
self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin))
for start, root_rule in lr0_root_rules.items()}
self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
def expand_rule(self, source_rule: NonTerminal, rules_by_origin=None) -> State:
"Returns all init_ptrs accessible by rule (recursive)"
if rules_by_origin is None:
rules_by_origin = self.rules_by_origin
init_ptrs = set()
def _expand_rule(rule: NonTerminal) -> Iterator[NonTerminal]:
assert not rule.is_term, rule
for r in rules_by_origin[rule]:
init_ptr = RulePtr(r, 0)
init_ptrs.add(init_ptr)
if r.expansion: # if not empty rule
new_r = init_ptr.next
if not new_r.is_term:
assert isinstance(new_r, NonTerminal)
yield new_r
for _ in bfs([source_rule], _expand_rule):
pass
return fzset(init_ptrs)


@@ -0,0 +1,332 @@
"""This module builds a LALR(1) transition-table for lalr_parser.py
For now, shift/reduce conflicts are automatically resolved as shifts.
"""
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
from typing import Dict, Set, Iterator, Tuple, List, TypeVar, Generic
from collections import defaultdict
from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger
from ..exceptions import GrammarError
from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet, RulePtr, State
from ..grammar import Rule, Symbol
from ..common import ParserConf
###{standalone
class Action:
def __init__(self, name):
self.name = name
def __str__(self):
return self.name
def __repr__(self):
return str(self)
Shift = Action('Shift')
Reduce = Action('Reduce')
StateT = TypeVar("StateT")
class ParseTableBase(Generic[StateT]):
states: Dict[StateT, Dict[str, Tuple]]
start_states: Dict[str, StateT]
end_states: Dict[str, StateT]
def __init__(self, states, start_states, end_states):
self.states = states
self.start_states = start_states
self.end_states = end_states
def serialize(self, memo):
tokens = Enumerator()
states = {
state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg))
for token, (action, arg) in actions.items()}
for state, actions in self.states.items()
}
return {
'tokens': tokens.reversed(),
'states': states,
'start_states': self.start_states,
'end_states': self.end_states,
}
@classmethod
def deserialize(cls, data, memo):
tokens = data['tokens']
states = {
state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg))
for token, (action, arg) in actions.items()}
for state, actions in data['states'].items()
}
return cls(states, data['start_states'], data['end_states'])
class ParseTable(ParseTableBase['State']):
"""Parse-table whose key is State, i.e. set[RulePtr]
Slower than IntParseTable, but useful for debugging
"""
pass
class IntParseTable(ParseTableBase[int]):
"""Parse-table whose key is int. Best for performance."""
@classmethod
def from_ParseTable(cls, parse_table: ParseTable):
enum = list(parse_table.states)
state_to_idx: Dict['State', int] = {s:i for i,s in enumerate(enum)}
int_states = {}
for s, la in parse_table.states.items():
la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v
for k,v in la.items()}
int_states[ state_to_idx[s] ] = la
start_states = {start:state_to_idx[s] for start, s in parse_table.start_states.items()}
end_states = {start:state_to_idx[s] for start, s in parse_table.end_states.items()}
return cls(int_states, start_states, end_states)
###}
# digraph and traverse, see The Theory and Practice of Compiler Writing
# computes F(x) = G(x) union (union { G(y) | x R y })
# X: nodes
# R: relation (function mapping node -> list of nodes that satisfy the relation)
# G: set valued function
def digraph(X, R, G):
F = {}
S = []
N = dict.fromkeys(X, 0)
for x in X:
# this is always true for the first iteration, but N[x] may be updated in traverse below
if N[x] == 0:
traverse(x, S, N, X, R, G, F)
return F
# x: single node
# S: stack
# N: weights
# X: nodes
# R: relation (see above)
# G: set valued function
# F: set valued function we are computing (map of input -> output)
def traverse(x, S, N, X, R, G, F):
S.append(x)
d = len(S)
N[x] = d
F[x] = G[x]
for y in R[x]:
if N[y] == 0:
traverse(y, S, N, X, R, G, F)
n_x = N[x]
assert(n_x > 0)
n_y = N[y]
assert(n_y != 0)
if (n_y > 0) and (n_y < n_x):
N[x] = n_y
F[x].update(F[y])
if N[x] == d:
f_x = F[x]
while True:
z = S.pop()
N[z] = -1
F[z] = f_x
if z == x:
break
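# Illustrative sanity check (hypothetical helper, not part of this module):
# 'a' reads 'b' and 'b' reads 'c', so each F[x] accumulates G over every
# node reachable from x. Note that F[x] aliases G[x] internally, so fresh
# sets must be passed in.
def _digraph_example():
    X = ['a', 'b', 'c']
    R = {'a': ['b'], 'b': ['c'], 'c': []}
    G = {'a': {1}, 'b': {2}, 'c': {3}}
    F = digraph(X, R, G)
    assert F['a'] == {1, 2, 3}
    assert F['b'] == {2, 3}
    assert F['c'] == {3}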
class LALR_Analyzer(GrammarAnalyzer):
lr0_itemsets: Set[LR0ItemSet]
nonterminal_transitions: List[Tuple[LR0ItemSet, Symbol]]
lookback: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Rule]]]
includes: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Symbol]]]
reads: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Symbol]]]
directly_reads: Dict[Tuple[LR0ItemSet, Symbol], Set[Symbol]]
def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
GrammarAnalyzer.__init__(self, parser_conf, debug, strict)
self.nonterminal_transitions = []
self.directly_reads = defaultdict(set)
self.reads = defaultdict(set)
self.includes = defaultdict(set)
self.lookback = defaultdict(set)
def compute_lr0_states(self) -> None:
self.lr0_itemsets = set()
# map of kernels to LR0ItemSets
cache: Dict['State', LR0ItemSet] = {}
def step(state: LR0ItemSet) -> Iterator[LR0ItemSet]:
_, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied)
d = classify(unsat, lambda rp: rp.next)
for sym, rps in d.items():
kernel = fzset({rp.advance(sym) for rp in rps})
new_state = cache.get(kernel, None)
if new_state is None:
closure = set(kernel)
for rp in kernel:
if not rp.is_satisfied and not rp.next.is_term:
closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin)
new_state = LR0ItemSet(kernel, closure)
cache[kernel] = new_state
state.transitions[sym] = new_state
yield new_state
self.lr0_itemsets.add(state)
for _ in bfs(self.lr0_start_states.values(), step):
pass
def compute_reads_relations(self):
# handle start state
for root in self.lr0_start_states.values():
assert(len(root.kernel) == 1)
for rp in root.kernel:
assert(rp.index == 0)
self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ])
for state in self.lr0_itemsets:
seen = set()
for rp in state.closure:
if rp.is_satisfied:
continue
s = rp.next
                # if s is not a nonterminal
if s not in self.lr0_rules_by_origin:
continue
if s in seen:
continue
seen.add(s)
nt = (state, s)
self.nonterminal_transitions.append(nt)
dr = self.directly_reads[nt]
r = self.reads[nt]
next_state = state.transitions[s]
for rp2 in next_state.closure:
if rp2.is_satisfied:
continue
s2 = rp2.next
# if s2 is a terminal
if s2 not in self.lr0_rules_by_origin:
dr.add(s2)
if s2 in self.NULLABLE:
r.add((next_state, s2))
def compute_includes_lookback(self):
for nt in self.nonterminal_transitions:
state, nonterminal = nt
includes = []
lookback = self.lookback[nt]
for rp in state.closure:
if rp.rule.origin != nonterminal:
continue
                # walk the transitions along the remainder of rp.rule
state2 = state
for i in range(rp.index, len(rp.rule.expansion)):
s = rp.rule.expansion[i]
nt2 = (state2, s)
state2 = state2.transitions[s]
if nt2 not in self.reads:
continue
for j in range(i + 1, len(rp.rule.expansion)):
if rp.rule.expansion[j] not in self.NULLABLE:
break
else:
includes.append(nt2)
# state2 is at the final state for rp.rule
if rp.index == 0:
for rp2 in state2.closure:
if (rp2.rule == rp.rule) and rp2.is_satisfied:
lookback.add((state2, rp2.rule))
for nt2 in includes:
self.includes[nt2].add(nt)
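    # Background, for reference (DeRemer & Pennello, "Efficient Computation
    # of LALR(1) Look-Ahead Sets", 1982). For nonterminal transitions (p, A):
    #   (p, A) reads (r, C)         iff r = GOTO(p, A) and C is nullable
    #   (p, A) includes (p', B)     iff B -> beta A gamma, gamma nullable,
    #                               and p' --beta--> p
    #   (q, A -> w) lookback (p, A) iff p --w--> q
    # Follow sets are the digraph closure of Read over 'includes', and each
    # rule's lookaheads are the union of Follow over 'lookback'.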
def compute_lookaheads(self):
read_sets = digraph(self.nonterminal_transitions, self.reads, self.directly_reads)
follow_sets = digraph(self.nonterminal_transitions, self.includes, read_sets)
for nt, lookbacks in self.lookback.items():
for state, rule in lookbacks:
for s in follow_sets[nt]:
state.lookaheads[s].add(rule)
def compute_lalr1_states(self) -> None:
m: Dict[LR0ItemSet, Dict[str, Tuple]] = {}
reduce_reduce = []
for itemset in self.lr0_itemsets:
actions: Dict[Symbol, Tuple] = {la: (Shift, next_state.closure)
for la, next_state in itemset.transitions.items()}
for la, rules in itemset.lookaheads.items():
if len(rules) > 1:
# Try to resolve conflict based on priority
p = [(r.options.priority or 0, r) for r in rules]
p.sort(key=lambda r: r[0], reverse=True)
best, second_best = p[:2]
if best[0] > second_best[0]:
rules = {best[1]}
else:
reduce_reduce.append((itemset, la, rules))
continue
                rule, = rules
if la in actions:
if self.strict:
raise GrammarError(f"Shift/Reduce conflict for terminal {la.name}. [strict-mode]\n ")
elif self.debug:
logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
logger.warning(' * %s', rule)
else:
logger.debug('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
logger.debug(' * %s', rule)
else:
actions[la] = (Reduce, rule)
m[itemset] = { k.name: v for k, v in actions.items() }
if reduce_reduce:
msgs = []
for itemset, la, rules in reduce_reduce:
msg = 'Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t- ' + str(r) for r in rules ]))
if self.debug:
msg += '\n collision occurred in state: {%s\n }' % ''.join(['\n\t' + str(x) for x in itemset.closure])
msgs.append(msg)
raise GrammarError('\n\n'.join(msgs))
states = { k.closure: v for k, v in m.items() }
# compute end states
end_states: Dict[str, 'State'] = {}
for state in states:
for rp in state:
for start in self.lr0_start_states:
if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied:
assert start not in end_states
end_states[start] = state
start_states = { start: state.closure for start, state in self.lr0_start_states.items() }
_parse_table = ParseTable(states, start_states, end_states)
if self.debug:
self.parse_table = _parse_table
else:
self.parse_table = IntParseTable.from_ParseTable(_parse_table)
def compute_lalr(self):
self.compute_lr0_states()
self.compute_reads_relations()
self.compute_includes_lookback()
self.compute_lookaheads()
self.compute_lalr1_states()

View File

@@ -0,0 +1,158 @@
# This module provides a LALR interactive parser, which is used for debugging and error handling
from typing import Iterator, List
from copy import copy
import warnings
from ..exceptions import UnexpectedToken
from ..lexer import Token, LexerThread
from .lalr_parser_state import ParserState
###{standalone
class InteractiveParser:
"""InteractiveParser gives you advanced control over parsing and error handling when parsing with LALR.
For a simpler interface, see the ``on_error`` argument to ``Lark.parse()``.
"""
def __init__(self, parser, parser_state: ParserState, lexer_thread: LexerThread):
self.parser = parser
self.parser_state = parser_state
self.lexer_thread = lexer_thread
self.result = None
@property
def lexer_state(self) -> LexerThread:
warnings.warn("lexer_state will be removed in subsequent releases. Use lexer_thread instead.", DeprecationWarning)
return self.lexer_thread
def feed_token(self, token: Token):
"""Feed the parser with a token, and advance it to the next state, as if it received it from the lexer.
Note that ``token`` has to be an instance of ``Token``.
"""
return self.parser_state.feed_token(token, token.type == '$END')
def iter_parse(self) -> Iterator[Token]:
"""Step through the different stages of the parse, by reading tokens from the lexer
and feeding them to the parser, one per iteration.
Returns an iterator of the tokens it encounters.
When the parse is over, the resulting tree can be found in ``InteractiveParser.result``.
"""
for token in self.lexer_thread.lex(self.parser_state):
yield token
self.result = self.feed_token(token)
def exhaust_lexer(self) -> List[Token]:
"""Try to feed the rest of the lexer state into the interactive parser.
Note that this modifies the instance in place and does not feed an '$END' Token
"""
return list(self.iter_parse())
def feed_eof(self, last_token=None):
"""Feed a '$END' Token. Borrows from 'last_token' if given."""
eof = Token.new_borrow_pos('$END', '', last_token) if last_token is not None else self.lexer_thread._Token('$END', '', 0, 1, 1)
return self.feed_token(eof)
def __copy__(self):
"""Create a new interactive parser with a separate state.
Calls to feed_token() won't affect the old instance, and vice-versa.
"""
return self.copy()
def copy(self, deepcopy_values=True):
return type(self)(
self.parser,
self.parser_state.copy(deepcopy_values=deepcopy_values),
copy(self.lexer_thread),
)
def __eq__(self, other):
if not isinstance(other, InteractiveParser):
return False
return self.parser_state == other.parser_state and self.lexer_thread == other.lexer_thread
def as_immutable(self):
"""Convert to an ``ImmutableInteractiveParser``."""
p = copy(self)
return ImmutableInteractiveParser(p.parser, p.parser_state, p.lexer_thread)
def pretty(self):
"""Print the output of ``choices()`` in a way that's easier to read."""
out = ["Parser choices:"]
for k, v in self.choices().items():
out.append('\t- %s -> %r' % (k, v))
out.append('stack size: %s' % len(self.parser_state.state_stack))
return '\n'.join(out)
def choices(self):
"""Returns a dictionary of token types, matched to their action in the parser.
Only returns token types that are accepted by the current state.
Updated by ``feed_token()``.
"""
return self.parser_state.parse_conf.parse_table.states[self.parser_state.position]
def accepts(self):
"""Returns the set of possible tokens that will advance the parser into a new valid state."""
accepts = set()
conf_no_callbacks = copy(self.parser_state.parse_conf)
# We don't want to call callbacks here since those might have arbitrary side effects
# and are unnecessarily slow.
conf_no_callbacks.callbacks = {}
for t in self.choices():
if t.isupper(): # is terminal?
new_cursor = self.copy(deepcopy_values=False)
new_cursor.parser_state.parse_conf = conf_no_callbacks
try:
new_cursor.feed_token(self.lexer_thread._Token(t, ''))
except UnexpectedToken:
pass
else:
accepts.add(t)
return accepts
def resume_parse(self):
"""Resume automated parsing from the current state.
"""
return self.parser.parse_from_state(self.parser_state, last_token=self.lexer_thread.state.last_token)
class ImmutableInteractiveParser(InteractiveParser):
"""Same as ``InteractiveParser``, but operations create a new instance instead
of changing it in-place.
"""
result = None
def __hash__(self):
return hash((self.parser_state, self.lexer_thread))
def feed_token(self, token):
c = copy(self)
c.result = InteractiveParser.feed_token(c, token)
return c
def exhaust_lexer(self):
"""Try to feed the rest of the lexer state into the parser.
Note that this returns a new ImmutableInteractiveParser and does not feed an '$END' Token"""
cursor = self.as_mutable()
cursor.exhaust_lexer()
return cursor.as_immutable()
def as_mutable(self):
"""Convert to an ``InteractiveParser``."""
p = copy(self)
return InteractiveParser(p.parser, p.parser_state, p.lexer_thread)
###}
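# Hypothetical usage sketch (grammar, input and terminal names are
# illustrative, not part of this module):
#
#     from lark import Lark
#     parser = Lark('start: "a"+', parser='lalr')
#     ip = parser.parse_interactive('aaa')
#     ip.exhaust_lexer()       # feed every token, but not '$END'
#     print(ip.accepts())      # terminals valid here, e.g. {'A', '$END'}
#     tree = ip.feed_eof()     # finish the parse and get the result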

View File

@@ -0,0 +1,122 @@
"""This module implements a LALR(1) Parser
"""
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
from typing import Dict, Any, Optional
from ..lexer import Token, LexerThread
from ..utils import Serialize
from ..common import ParserConf, ParserCallbacks
from .lalr_analysis import LALR_Analyzer, IntParseTable, ParseTableBase
from .lalr_interactive_parser import InteractiveParser
from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
from .lalr_parser_state import ParserState, ParseConf
###{standalone
class LALR_Parser(Serialize):
def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
analysis = LALR_Analyzer(parser_conf, debug=debug, strict=strict)
analysis.compute_lalr()
callbacks = parser_conf.callbacks
self._parse_table = analysis.parse_table
self.parser_conf = parser_conf
self.parser = _Parser(analysis.parse_table, callbacks, debug)
@classmethod
def deserialize(cls, data, memo, callbacks, debug=False):
inst = cls.__new__(cls)
inst._parse_table = IntParseTable.deserialize(data, memo)
inst.parser = _Parser(inst._parse_table, callbacks, debug)
return inst
def serialize(self, memo: Any = None) -> Dict[str, Any]:
return self._parse_table.serialize(memo)
def parse_interactive(self, lexer: LexerThread, start: str):
return self.parser.parse(lexer, start, start_interactive=True)
def parse(self, lexer, start, on_error=None):
try:
return self.parser.parse(lexer, start)
except UnexpectedInput as e:
if on_error is None:
raise
while True:
if isinstance(e, UnexpectedCharacters):
s = e.interactive_parser.lexer_thread.state
p = s.line_ctr.char_pos
if not on_error(e):
raise e
if isinstance(e, UnexpectedCharacters):
                    # If the user didn't change the character position, then we
                    # skip one character ourselves, to avoid an infinite loop.
if p == s.line_ctr.char_pos:
s.line_ctr.feed(s.text[p:p+1])
try:
return e.interactive_parser.resume_parse()
except UnexpectedToken as e2:
if (isinstance(e, UnexpectedToken)
and e.token.type == e2.token.type == '$END'
and e.interactive_parser == e2.interactive_parser):
# Prevent infinite loop
raise e2
e = e2
except UnexpectedCharacters as e2:
e = e2
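# Hypothetical sketch of the on_error hook consumed by LALR_Parser.parse
# above (grammar, text and handler are illustrative):
#
#     def keep_going(e):      # e is an UnexpectedInput subclass
#         print('error at position', e.pos_in_stream)
#         return True         # True = skip & resume, falsy = re-raise
#
#     tree = Lark(grammar, parser='lalr').parse(text, on_error=keep_going)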
class _Parser:
parse_table: ParseTableBase
callbacks: ParserCallbacks
debug: bool
def __init__(self, parse_table: ParseTableBase, callbacks: ParserCallbacks, debug: bool=False):
self.parse_table = parse_table
self.callbacks = callbacks
self.debug = debug
def parse(self, lexer: LexerThread, start: str, value_stack=None, state_stack=None, start_interactive=False):
parse_conf = ParseConf(self.parse_table, self.callbacks, start)
parser_state = ParserState(parse_conf, lexer, state_stack, value_stack)
if start_interactive:
return InteractiveParser(self, parser_state, parser_state.lexer)
return self.parse_from_state(parser_state)
def parse_from_state(self, state: ParserState, last_token: Optional[Token]=None):
"""Run the main LALR parser loop
Parameters:
state - the initial state. Changed in-place.
last_token - Used only for line information in case of an empty lexer.
"""
try:
token = last_token
for token in state.lexer.lex(state):
assert token is not None
state.feed_token(token)
end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
return state.feed_token(end_token, True)
except UnexpectedInput as e:
try:
e.interactive_parser = InteractiveParser(self, state, state.lexer)
except NameError:
pass
raise e
except Exception as e:
if self.debug:
print("")
print("STATE STACK DUMP")
print("----------------")
for i, s in enumerate(state.state_stack):
                    print('%d)' % i, s)
print("")
raise
###}

View File

@@ -0,0 +1,110 @@
from copy import deepcopy, copy
from typing import Dict, Any, Generic, List
from ..lexer import Token, LexerThread
from ..common import ParserCallbacks
from .lalr_analysis import Shift, ParseTableBase, StateT
from ..exceptions import UnexpectedToken
###{standalone
class ParseConf(Generic[StateT]):
__slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'
parse_table: ParseTableBase[StateT]
callbacks: ParserCallbacks
start: str
start_state: StateT
end_state: StateT
states: Dict[StateT, Dict[str, tuple]]
def __init__(self, parse_table: ParseTableBase[StateT], callbacks: ParserCallbacks, start: str):
self.parse_table = parse_table
self.start_state = self.parse_table.start_states[start]
self.end_state = self.parse_table.end_states[start]
self.states = self.parse_table.states
self.callbacks = callbacks
self.start = start
class ParserState(Generic[StateT]):
__slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'
parse_conf: ParseConf[StateT]
lexer: LexerThread
state_stack: List[StateT]
value_stack: list
def __init__(self, parse_conf: ParseConf[StateT], lexer: LexerThread, state_stack=None, value_stack=None):
self.parse_conf = parse_conf
self.lexer = lexer
self.state_stack = state_stack or [self.parse_conf.start_state]
self.value_stack = value_stack or []
@property
def position(self) -> StateT:
return self.state_stack[-1]
# Necessary for match_examples() to work
def __eq__(self, other) -> bool:
if not isinstance(other, ParserState):
return NotImplemented
return len(self.state_stack) == len(other.state_stack) and self.position == other.position
def __copy__(self):
return self.copy()
def copy(self, deepcopy_values=True) -> 'ParserState[StateT]':
return type(self)(
self.parse_conf,
self.lexer, # XXX copy
copy(self.state_stack),
deepcopy(self.value_stack) if deepcopy_values else copy(self.value_stack),
)
def feed_token(self, token: Token, is_end=False) -> Any:
state_stack = self.state_stack
value_stack = self.value_stack
states = self.parse_conf.states
end_state = self.parse_conf.end_state
callbacks = self.parse_conf.callbacks
while True:
state = state_stack[-1]
try:
action, arg = states[state][token.type]
except KeyError:
expected = {s for s in states[state].keys() if s.isupper()}
raise UnexpectedToken(token, expected, state=self, interactive_parser=None)
assert arg != end_state
if action is Shift:
# shift once and return
assert not is_end
state_stack.append(arg)
value_stack.append(token if token.type not in callbacks else callbacks[token.type](token))
return
else:
# reduce+shift as many times as necessary
rule = arg
size = len(rule.expansion)
if size:
s = value_stack[-size:]
del state_stack[-size:]
del value_stack[-size:]
else:
s = []
value = callbacks[rule](s) if callbacks else s
_action, new_state = states[state_stack[-1]][rule.origin.name]
assert _action is Shift
state_stack.append(new_state)
value_stack.append(value)
if is_end and state_stack[-1] == end_state:
return value_stack[-1]
###}
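# Worked example, for reference (hypothetical rule, not from the original
# source): reducing by 'expr -> expr PLUS term' (size 3) pops three entries
# from both stacks, computes value = callbacks[rule]([expr_val, plus_tok,
# term_val]), then performs the goto: states[state_stack[-1]]['expr'] must
# be a Shift action, whose target state is pushed along with the value.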

View File

@@ -0,0 +1,165 @@
"""This module implements an Earley parser with a dynamic lexer
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
https://www.sciencedirect.com/science/article/pii/S1571066108001497
That is probably the best reference for understanding the algorithm here.
The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
is better documented here:
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
Instead of running a lexer beforehand, or using a costly char-by-char method, this parser
uses regular expressions by necessity, achieving high performance while maintaining all of
Earley's power in parsing any CFG.
"""
from typing import TYPE_CHECKING, Callable, Optional, List, Any
from collections import defaultdict
from ..tree import Tree
from ..exceptions import UnexpectedCharacters
from ..lexer import Token
from ..grammar import Terminal
from .earley import Parser as BaseParser
from .earley_forest import TokenNode
if TYPE_CHECKING:
from ..common import LexerConf, ParserConf
class Parser(BaseParser):
def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
resolve_ambiguity: bool=True, complete_lex: bool=False, debug: bool=False,
tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True):
BaseParser.__init__(self, lexer_conf, parser_conf, term_matcher, resolve_ambiguity,
debug, tree_class, ordered_sets)
self.ignore = [Terminal(t) for t in lexer_conf.ignore]
self.complete_lex = complete_lex
def _parse(self, stream, columns, to_scan, start_symbol=None):
def scan(i, to_scan):
"""The core Earley Scanner.
This is a custom implementation of the scanner that uses the
Lark lexer to match tokens. The scan list is built by the
Earley predictor, based on the previously completed tokens.
This ensures that at each phase of the parse we have a custom
lexer context, allowing for more complex ambiguities."""
node_cache = {}
# 1) Loop the expectations and ask the lexer to match.
# Since regexp is forward looking on the input stream, and we only
# want to process tokens when we hit the point in the stream at which
# they complete, we push all tokens into a buffer (delayed_matches), to
# be held possibly for a later parse step when we reach the point in the
# input stream at which they complete.
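            # Illustrative example (hypothetical terminal, not from the
            # original source): with stream 'ab' and an expected terminal
            # AB = /ab/, the match at i=0 ends at m.end() == 2, so
            # (item, 0, Token('AB', 'ab')) is buffered in delayed_matches[2]
            # and only processed once the main loop reaches i+1 == 2.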
for item in self.Set(to_scan):
m = match(item.expect, stream, i)
if m:
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
delayed_matches[m.end()].append( (item, i, t) )
if self.complete_lex:
s = m.group(0)
for j in range(1, len(s)):
m = match(item.expect, s[:-j])
if m:
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
delayed_matches[i+m.end()].append( (item, i, t) )
# XXX The following 3 lines were commented out for causing a bug. See issue #768
# # Remove any items that successfully matched in this pass from the to_scan buffer.
# # This ensures we don't carry over tokens that already matched, if we're ignoring below.
# to_scan.remove(item)
# 3) Process any ignores. This is typically used for e.g. whitespace.
# We carry over any unmatched items from the to_scan buffer to be matched again after
# the ignore. This should allow us to use ignored symbols in non-terminals to implement
# e.g. mandatory spacing.
for x in self.ignore:
m = match(x, stream, i)
if m:
# Carry over any items still in the scan buffer, to past the end of the ignored items.
delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ])
                    # If we're ignoring up to the end of the file, carry over the start symbol if it already completed.
delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol])
next_to_scan = self.Set()
next_set = self.Set()
columns.append(next_set)
transitives.append({})
## 4) Process Tokens from delayed_matches.
# This is the core of the Earley scanner. Create an SPPF node for each Token,
# and create the symbol node in the SPPF tree. Advance the item that completed,
# and add the resulting new item to either the Earley set (for processing by the
# completer/predictor) or the to_scan buffer for the next parse step.
for item, start, token in delayed_matches[i+1]:
if token is not None:
token.end_line = text_line
token.end_column = text_column + 1
token.end_pos = i + 1
new_item = item.advance()
label = (new_item.s, new_item.start, i + 1)
token_node = TokenNode(token, terminals[token.type])
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)
else:
new_item = item
if new_item.expect in self.TERMINALS:
                    # add (B ::= α a_{i+1} · β, h, y) to Q'
next_to_scan.add(new_item)
else:
                    # add (B ::= α a_{i+1} · β, h, y) to E_{i+1}
next_set.add(new_item)
del delayed_matches[i+1] # No longer needed, so unburden memory
if not next_set and not delayed_matches and not next_to_scan:
                considered_rules = sorted(to_scan, key=lambda item: item.rule.origin.name)
raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan},
set(to_scan), state=frozenset(i.s for i in to_scan),
considered_rules=considered_rules
)
return next_to_scan
delayed_matches = defaultdict(list)
match = self.term_matcher
terminals = self.lexer_conf.terminals_by_name
        # Transitive-relation cache for Leo's right-recursion optimization, one dict per Earley set.
transitives = [{}]
text_line = 1
text_column = 1
## The main Earley loop.
# Run the Prediction/Completion cycle for any Items in the current Earley set.
# Completions will be added to the SPPF tree, and predictions will be recursively
# processed down to terminals/empty nodes to be added to the scanner for the next
# step.
i = 0
for token in stream:
self.predict_and_complete(i, to_scan, columns, transitives)
to_scan = scan(i, to_scan)
if token == '\n':
text_line += 1
text_column = 1
else:
text_column += 1
i += 1
self.predict_and_complete(i, to_scan, columns, transitives)
## Column is now the final column in the parse.
assert i == len(columns)-1
return to_scan
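# Hypothetical usage sketch: this parser is normally reached through the
# Lark front-end rather than instantiated directly, e.g.:
#
#     from lark import Lark
#     parser = Lark(grammar, parser='earley', lexer='dynamic')
#     tree = parser.parse(text)
#
# lexer='dynamic_complete' corresponds to complete_lex=True, which also
# considers every prefix of each regexp match (slower, but more thorough).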