ccxt/static_dependencies/lark/parsers/earley.py
@@ -0,0 +1,314 @@
"""This module implements an Earley parser.

The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
    https://www.sciencedirect.com/science/article/pii/S1571066108001497

That is probably the best reference for understanding the algorithm here.

The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html
"""

from typing import TYPE_CHECKING, Callable, Optional, List, Any
from collections import deque

from ..lexer import Token
from ..tree import Tree
from ..exceptions import UnexpectedEOF, UnexpectedToken
from ..utils import logger, OrderedSet, dedup_list
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Item
from .earley_forest import ForestSumVisitor, SymbolNode, StableSymbolNode, TokenNode, ForestToParseTree

if TYPE_CHECKING:
    from ..common import LexerConf, ParserConf


class Parser:
    lexer_conf: 'LexerConf'
    parser_conf: 'ParserConf'
    debug: bool

    def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
                 resolve_ambiguity: bool=True, debug: bool=False,
                 tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True):
        analysis = GrammarAnalyzer(parser_conf)
        self.lexer_conf = lexer_conf
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.debug = debug
        self.Tree = tree_class
        self.Set = OrderedSet if ordered_sets else set
        self.SymbolNode = StableSymbolNode if ordered_sets else SymbolNode

        self.FIRST = analysis.FIRST
        self.NULLABLE = analysis.NULLABLE
        self.callbacks = parser_conf.callbacks
        # TODO add typing info
        self.predictions = {}   # type: ignore[var-annotated]

        ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
        #  the slow 'isupper' in is_terminal.
        self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
        self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }

        self.forest_sum_visitor = None
        for rule in parser_conf.rules:
            if rule.origin not in self.predictions:
                self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

            ## Detect if any rules/terminals have priorities set. If the user specified priority = None, then
            #  the priorities will be stripped from all rules/terminals before they reach us, allowing us to
            #  skip the extra tree walk. We'll also skip this if the user just didn't specify priorities
            #  on any rules/terminals.
            if self.forest_sum_visitor is None and rule.options.priority is not None:
                self.forest_sum_visitor = ForestSumVisitor

        # Check terminals for priorities
        # Ignore terminal priorities if the basic lexer is used
        if self.lexer_conf.lexer_type != 'basic' and self.forest_sum_visitor is None:
            for term in self.lexer_conf.terminals:
                if term.priority:
                    self.forest_sum_visitor = ForestSumVisitor
                    break

        self.term_matcher = term_matcher

    def predict_and_complete(self, i, to_scan, columns, transitives):
        """The core Earley Predictor and Completer.

        At each stage of the input, we handle any completed items (things
        that matched on the last cycle) and use those to predict what should
        come next in the input stream. The completions and any predicted
        non-terminals are recursively processed until we reach a set of items
        expecting terminals, which can be added to the scan list for the next
        scanner cycle."""
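        # Illustrative hand-trace (an assumed toy grammar, not taken from this parser):
        # given  sum: sum "+" NUMBER | NUMBER,  at column i
        #   predict:  an item expecting 'sum' adds (sum -> . sum "+" NUMBER, i) and
        #             (sum -> . NUMBER, i); items whose next symbol is a terminal
        #             (NUMBER, "+") go straight to `to_scan` ('Q' in the paper).
        #   complete: when (sum -> NUMBER ., h) finishes, every item in column h that
        #             expects 'sum' is advanced, and the derivation is recorded as a
        #             family on the matching SPPF SymbolNode.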
        # Held Completions (H in E. Scott's paper).
        node_cache = {}
        held_completions = {}

        column = columns[i]
        # R (items) = Ei (column.items)
        items = deque(column)
        while items:
            item = items.pop()    # remove an element, A say, from R

            ### The Earley completer
            if item.is_complete:   ### (item.s == string)
                if item.node is None:
                    label = (item.s, item.start, i)
                    item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    item.node.add_family(item.s, item.rule, item.start, None, None)

                # create_leo_transitives(item.rule.origin, item.start)

                ###R Joop Leo right recursion Completer
                if item.rule.origin in transitives[item.start]:
                    transitive = transitives[item.start][item.s]
                    if transitive.previous in transitives[transitive.column]:
                        root_transitive = transitives[transitive.column][transitive.previous]
                    else:
                        root_transitive = transitive

                    new_item = Item(transitive.rule, transitive.ptr, transitive.start)
                    label = (root_transitive.s, root_transitive.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_path(root_transitive, item.node)
                    if new_item.expect in self.TERMINALS:
                        # Add (B :: aC.B, h, y) to Q
                        to_scan.add(new_item)
                    elif new_item not in column:
                        # Add (B :: aC.B, h, y) to Ei and R
                        column.add(new_item)
                        items.append(new_item)
                ###R Regular Earley completer
                else:
                    # Empty has 0 length. If we complete an empty symbol in a particular
                    # parse step, we need to be able to use that same empty symbol to complete
                    # any predictions that result, that themselves require empty. Avoids
                    # infinite recursion on empty symbols.
                    # held_completions is 'H' in E. Scott's paper.
                    is_empty_item = item.start == i
                    if is_empty_item:
                        held_completions[item.rule.origin] = item.node

                    originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
                    for originator in originators:
                        new_item = originator.advance()
                        label = (new_item.s, originator.start, i)
                        new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                        new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
                        if new_item.expect in self.TERMINALS:
                            # Add (B :: aC.B, h, y) to Q
                            to_scan.add(new_item)
                        elif new_item not in column:
                            # Add (B :: aC.B, h, y) to Ei and R
                            column.add(new_item)
                            items.append(new_item)

            ### The Earley predictor
            elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                new_items = []
                for rule in self.predictions[item.expect]:
                    new_item = Item(rule, 0, i)
                    new_items.append(new_item)

                # Process any held completions (H).
                if item.expect in held_completions:
                    new_item = item.advance()
                    label = (new_item.s, item.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                    new_items.append(new_item)

                for new_item in new_items:
                    if new_item.expect in self.TERMINALS:
                        to_scan.add(new_item)
                    elif new_item not in column:
                        column.add(new_item)
                        items.append(new_item)

    def _parse(self, lexer, columns, to_scan, start_symbol=None):

        def is_quasi_complete(item):
            if item.is_complete:
                return True

            quasi = item.advance()
            while not quasi.is_complete:
                if quasi.expect not in self.NULLABLE:
                    return False
                if quasi.rule.origin == start_symbol and quasi.expect == start_symbol:
                    return False
                quasi = quasi.advance()
            return True

        # def create_leo_transitives(origin, start):
        #    ...   # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420

        def scan(i, token, to_scan):
            """The core Earley Scanner.

            This is a custom implementation of the scanner that uses the
            Lark lexer to match tokens. The scan list is built by the
            Earley predictor, based on the previously completed tokens.
            This ensures that at each phase of the parse we have a custom
            lexer context, allowing for more complex ambiguities."""
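            # Illustrative note: the "custom lexer context" is the `expects` set computed
            # in the main loop below ({item.expect for item in to_scan}) and passed to
            # lexer.lex(expects), so at each step the lexer only has to match terminals
            # that some pending Earley item is waiting for (a hypothetical {NUMBER, PLUS}
            # rather than the whole terminal alphabet).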
            next_to_scan = self.Set()
            next_set = self.Set()
            columns.append(next_set)
            transitives.append({})
            node_cache = {}

            for item in self.Set(to_scan):
                if match(item.expect, token):
                    new_item = item.advance()
                    label = (new_item.s, new_item.start, i)
                    # 'terminals' may not contain token.type when using %declare
                    # Additionally, token is not always a Token
                    # For example, it can be a Tree when using TreeMatcher
                    term = terminals.get(token.type) if isinstance(token, Token) else None
                    # Set the priority of the token node to 0 so that the
                    # terminal priorities do not affect the Tree chosen by
                    # ForestSumVisitor after the basic lexer has already
                    # "used up" the terminal priorities
                    token_node = TokenNode(token, term, priority=0)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)

                    if new_item.expect in self.TERMINALS:
                        # add (B ::= Aai+1.B, h, y) to Q'
                        next_to_scan.add(new_item)
                    else:
                        # add (B ::= Aa+1.B, h, y) to Ei+1
                        next_set.add(new_item)

            if not next_set and not next_to_scan:
                expect = {i.expect.name for i in to_scan}
                raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))

            return next_to_scan


        # Define parser functions
        match = self.term_matcher

        terminals = self.lexer_conf.terminals_by_name

        # Cache for nodes & tokens created in a particular parse step.
        transitives = [{}]

        ## The main Earley loop.
        # Run the Prediction/Completion cycle for any Items in the current Earley set.
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.
        expects = {i.expect for i in to_scan}
        i = 0
        for token in lexer.lex(expects):
            self.predict_and_complete(i, to_scan, columns, transitives)

            to_scan = scan(i, token, to_scan)
            i += 1

            expects.clear()
            expects |= {i.expect for i in to_scan}

        self.predict_and_complete(i, to_scan, columns, transitives)

        ## Column is now the final column in the parse.
        assert i == len(columns)-1
        return to_scan

    def parse(self, lexer, start):
        assert start, start
        start_symbol = NonTerminal(start)

        columns = [self.Set()]
        to_scan = self.Set()     # The scan buffer. 'Q' in E. Scott's paper.

        ## Predict for the start_symbol.
        # Add predicted items to the first Earley set (for the predictor) if they
        # result in a non-terminal, or the scanner if they result in a terminal.
        for rule in self.predictions[start_symbol]:
            item = Item(rule, 0, 0)
            if item.expect in self.TERMINALS:
                to_scan.add(item)
            else:
                columns[0].add(item)

        to_scan = self._parse(lexer, columns, to_scan, start_symbol)

        # If the parse was successful, the start
        # symbol should have been completed in the last step of the Earley cycle, and will be in
        # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
        solutions = dedup_list(n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0)
        if not solutions:
            expected_terminals = [t.expect.name for t in to_scan]
            raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))

        if self.debug:
            from .earley_forest import ForestToPyDotVisitor
            try:
                debug_walker = ForestToPyDotVisitor()
            except ImportError:
                logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
            else:
                for i, s in enumerate(solutions):
                    debug_walker.visit(s, f"sppf{i}.png")

        if self.Tree is not None:
            # Perform our SPPF -> AST conversion
            transformer = ForestToParseTree(self.Tree, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity)
            solutions = [transformer.transform(s) for s in solutions]

            if len(solutions) > 1:
                t: Tree = self.Tree('_ambig', solutions)
                t.expand_kids_by_data('_ambig')     # solutions may themselves be _ambig nodes
                return t
            return solutions[0]

        # return the root of the SPPF
        # TODO return a list of solutions, or join them together somehow
        return solutions[0]
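
# Illustrative sketch (hedged; the grammar is a made-up example, driven through the public
# Lark API rather than this class directly): requesting ambiguity='explicit' keeps all
# derivations, which is what the '_ambig' handling above exposes on the way out:
#
#     from lark import Lark
#
#     parser = Lark(r'''
#         start: expr
#         expr: expr "+" expr | NUMBER
#         %import common.NUMBER
#     ''', parser='earley', ambiguity='explicit')
#     tree = parser.parse("1+2+3")
#     # the result should contain an '_ambig' node holding both groupings,
#     # (1+2)+3 and 1+(2+3)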