This commit is contained in:
lz_db
2025-11-16 12:31:03 +08:00
commit 0fab423a18
1451 changed files with 743213 additions and 0 deletions

View File

@@ -0,0 +1,70 @@
import sys
from argparse import ArgumentParser, FileType
from textwrap import indent
from logging import DEBUG, INFO, WARN, ERROR
from typing import Optional
import warnings
from lark import Lark, logger
try:
    from interegular import logger as interegular_logger
    has_interegular = True
except ImportError:
    has_interegular = False
lalr_argparser = ArgumentParser(add_help=False, epilog='Look at the Lark documentation for more info on the options')
flags = [
    ('d', 'debug'),
    'keep_all_tokens',
    'regex',
    'propagate_positions',
    'maybe_placeholders',
    'use_bytes'
]
options = ['start', 'lexer']
lalr_argparser.add_argument('-v', '--verbose', action='count', default=0, help="Increase Logger output level, up to three times")
lalr_argparser.add_argument('-s', '--start', action='append', default=[])
lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('basic', 'contextual'))
lalr_argparser.add_argument('-o', '--out', type=FileType('w', encoding='utf-8'), default=sys.stdout, help='the output file (default=stdout)')
lalr_argparser.add_argument('grammar_file', type=FileType('r', encoding='utf-8'), help='A valid .lark file')
for flag in flags:
    if isinstance(flag, tuple):
        options.append(flag[1])
        lalr_argparser.add_argument('-' + flag[0], '--' + flag[1], action='store_true')
    elif isinstance(flag, str):
        options.append(flag)
        lalr_argparser.add_argument('--' + flag, action='store_true')
    else:
        raise NotImplementedError("flags must only contain strings or tuples of strings")

def build_lalr(namespace):
    logger.setLevel((ERROR, WARN, INFO, DEBUG)[min(namespace.verbose, 3)])
    if has_interegular:
        interegular_logger.setLevel(logger.getEffectiveLevel())
    if len(namespace.start) == 0:
        namespace.start.append('start')
    kwargs = {n: getattr(namespace, n) for n in options}
    return Lark(namespace.grammar_file, parser='lalr', **kwargs), namespace.out

def showwarning_as_comment(message, category, filename, lineno, file=None, line=None):
    # Based on warnings._showwarnmsg_impl
    text = warnings.formatwarning(message, category, filename, lineno, line)
    text = indent(text, '# ')
    if file is None:
        file = sys.stderr
        if file is None:
            # sys.stderr is None when stderr is unavailable - the warning gets lost
            return
    try:
        file.write(text)
    except OSError:
        # the file (probably stderr) is invalid - the warning gets lost
        pass


def make_warnings_comments():
    warnings.showwarning = showwarning_as_comment
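
A minimal sketch of how a downstream command-line tool can reuse these helpers, following the same pattern as the serialize and standalone tools below ('mytool' and the example grammar path are placeholder names, not from this commit):

from argparse import ArgumentParser
from lark.tools import lalr_argparser, build_lalr

# Inherits -s/--start, -l/--lexer, -o/--out, grammar_file and the boolean flags.
parser = ArgumentParser(prog='mytool', parents=[lalr_argparser])
ns = parser.parse_args()            # e.g. invoked as: mytool -s start my_grammar.lark
lark_inst, out = build_lalr(ns)     # returns (Lark instance, output file object)
out.write('grammar loaded; start rules: %r\n' % ns.start)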

View File

@@ -0,0 +1,202 @@
"Converts Nearley grammars to Lark"
import os.path
import sys
import codecs
import argparse
from lark import Lark, Transformer, v_args
nearley_grammar = r"""
start: (ruledef|directive)+
directive: "@" NAME (STRING|NAME)
| "@" JS -> js_code
ruledef: NAME "->" expansions
| NAME REGEXP "->" expansions -> macro
expansions: expansion ("|" expansion)*
expansion: expr+ js
?expr: item (":" /[+*?]/)?
?item: rule|string|regexp|null
| "(" expansions ")"
rule: NAME
string: STRING
regexp: REGEXP
null: "null"
JS: /{%.*?%}/s
js: JS?
NAME: /[a-zA-Z_$]\w*/
COMMENT: /#[^\n]*/
REGEXP: /\[.*?\]/
STRING: _STRING "i"?
%import common.ESCAPED_STRING -> _STRING
%import common.WS
%ignore WS
%ignore COMMENT
"""
nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='basic')
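
# Illustrative sketch (not part of the original file): a tiny, hypothetical
# Nearley fragment of the kind the meta-parser above accepts.
_SAMPLE_NEARLEY = r'''
main -> _ "hello" _ {% function(d) { return d[1]; } %}
_ -> null | [\s]
'''
_sample_tree = nearley_grammar_parser.parse(_SAMPLE_NEARLEY)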

def _get_rulename(name):
    name = {'_': '_ws_maybe', '__': '_ws'}.get(name, name)
    return 'n_' + name.replace('$', '__DOLLAR__').lower()

@v_args(inline=True)
class NearleyToLark(Transformer):
    def __init__(self):
        self._count = 0
        self.extra_rules = {}
        self.extra_rules_rev = {}
        self.alias_js_code = {}

    def _new_function(self, code):
        name = 'alias_%d' % self._count
        self._count += 1
        self.alias_js_code[name] = code
        return name

    def _extra_rule(self, rule):
        if rule in self.extra_rules_rev:
            return self.extra_rules_rev[rule]

        name = 'xrule_%d' % len(self.extra_rules)
        assert name not in self.extra_rules
        self.extra_rules[name] = rule
        self.extra_rules_rev[rule] = name
        return name

    def rule(self, name):
        return _get_rulename(name)

    def ruledef(self, name, exps):
        return '!%s: %s' % (_get_rulename(name), exps)

    def expr(self, item, op):
        rule = '(%s)%s' % (item, op)
        return self._extra_rule(rule)

    def regexp(self, r):
        return '/%s/' % r

    def null(self):
        return ''

    def string(self, s):
        return self._extra_rule(s)

    def expansion(self, *x):
        x, js = x[:-1], x[-1]
        if js.children:
            js_code, = js.children
            js_code = js_code[2:-2]
            alias = '-> ' + self._new_function(js_code)
        else:
            alias = ''
        return ' '.join(x) + alias

    def expansions(self, *x):
        return '%s' % ('\n |'.join(x))

    def start(self, *rules):
        return '\n'.join(filter(None, rules))

def _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, includes):
    rule_defs = []

    tree = nearley_grammar_parser.parse(g)
    for statement in tree.children:
        if statement.data == 'directive':
            directive, arg = statement.children
            if directive in ('builtin', 'include'):
                folder = builtin_path if directive == 'builtin' else folder_path
                path = os.path.join(folder, arg[1:-1])
                if path not in includes:
                    includes.add(path)
                    with codecs.open(path, encoding='utf8') as f:
                        text = f.read()
                    rule_defs += _nearley_to_lark(text, builtin_path, n2l, js_code, os.path.abspath(os.path.dirname(path)), includes)
            else:
                assert False, directive
        elif statement.data == 'js_code':
            code, = statement.children
            code = code[2:-2]
            js_code.append(code)
        elif statement.data == 'macro':
            pass  # TODO Add support for macros!
        elif statement.data == 'ruledef':
            rule_defs.append(n2l.transform(statement))
        else:
            raise Exception("Unknown statement: %s" % statement)

    return rule_defs

def create_code_for_nearley_grammar(g, start, builtin_path, folder_path, es6=False):
    import js2py

    emit_code = []
    def emit(x=None):
        if x:
            emit_code.append(x)
        emit_code.append('\n')

    js_code = ['function id(x) {return x[0];}']
    n2l = NearleyToLark()
    rule_defs = _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, set())
    lark_g = '\n'.join(rule_defs)
    lark_g += '\n' + '\n'.join('!%s: %s' % item for item in n2l.extra_rules.items())

    emit('from lark import Lark, Transformer')
    emit()
    emit('grammar = ' + repr(lark_g))
    emit()

    for alias, code in n2l.alias_js_code.items():
        js_code.append('%s = (%s);' % (alias, code))

    if es6:
        emit(js2py.translate_js6('\n'.join(js_code)))
    else:
        emit(js2py.translate_js('\n'.join(js_code)))

    emit('class TransformNearley(Transformer):')
    for alias in n2l.alias_js_code:
        emit("    %s = var.get('%s').to_python()" % (alias, alias))
    emit("    __default__ = lambda self, n, c, m: c if c else None")
    emit()

    emit('parser = Lark(grammar, start="n_%s", maybe_placeholders=False)' % start)
    emit('def parse(text):')
    emit('    return TransformNearley().transform(parser.parse(text))')

    return ''.join(emit_code)

def main(fn, start, nearley_lib, es6=False):
    with codecs.open(fn, encoding='utf8') as f:
        grammar = f.read()
    return create_code_for_nearley_grammar(grammar, start, os.path.join(nearley_lib, 'builtin'), os.path.abspath(os.path.dirname(fn)), es6=es6)


def get_arg_parser():
    parser = argparse.ArgumentParser(description='Reads a Nearley grammar (with js functions), and outputs an equivalent lark parser.')
    parser.add_argument('nearley_grammar', help='Path to the file containing the nearley grammar')
    parser.add_argument('start_rule', help='Rule within the nearley grammar to make the base rule')
    parser.add_argument('nearley_lib', help='Path to root directory of nearley codebase (used for including builtins)')
    parser.add_argument('--es6', help='Enable experimental ES6 support', action='store_true')
    return parser

if __name__ == '__main__':
    parser = get_arg_parser()
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    print(main(fn=args.nearley_grammar, start=args.start_rule, nearley_lib=args.nearley_lib, es6=args.es6))
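
A hedged usage sketch for this converter (the file paths and start rule are placeholders; js2py must be installed, since the embedded JS postprocessors are translated to Python):

# Shell form:  python -m lark.tools.nearley csv.ne main /path/to/nearley > csv_parser.py
# Programmatic form:
from lark.tools.nearley import main

code = main(fn='csv.ne', start='main', nearley_lib='/path/to/nearley')
with open('csv_parser.py', 'w') as f:
    f.write(code)
# The generated module exposes parse(text), which runs Lark and then TransformNearley.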

View File

@@ -0,0 +1,32 @@
import sys
import json
from lark.grammar import Rule
from lark.lexer import TerminalDef
from lark.tools import lalr_argparser, build_lalr
import argparse
argparser = argparse.ArgumentParser(prog='python -m lark.tools.serialize', parents=[lalr_argparser],
                                    description="Lark Serialization Tool - Stores Lark's internal state & LALR analysis as a JSON file",
                                    epilog='Look at the Lark documentation for more info on the options')

def serialize(lark_inst, outfile):
    data, memo = lark_inst.memo_serialize([TerminalDef, Rule])
    outfile.write('{\n')
    outfile.write(' "data": %s,\n' % json.dumps(data))
    outfile.write(' "memo": %s\n' % json.dumps(memo))
    outfile.write('}\n')

def main():
    if len(sys.argv) == 1:
        argparser.print_help(sys.stderr)
        sys.exit(1)
    ns = argparser.parse_args()
    serialize(*build_lalr(ns))


if __name__ == '__main__':
    main()
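
A sketch of reloading the JSON this tool writes; it leans on Lark._load_from_dict, the same internal hook the standalone generator below emits into Lark_StandAlone, so treat it as an assumption rather than a stable public API ('parser.json' is a placeholder path):

import json
from lark import Lark

with open('parser.json') as f:
    d = json.load(f)
parser = Lark._load_from_dict(d['data'], d['memo'])  # internal API; see Lark_StandAlone below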

View File

@@ -0,0 +1,196 @@
###{standalone
#
#
# Lark Stand-alone Generator Tool
# ----------------------------------
# Generates a stand-alone LALR(1) parser
#
# Git: https://github.com/erezsh/lark
# Author: Erez Shinan (erezshin@gmail.com)
#
#
# >>> LICENSE
#
# This tool and its generated code use a separate license from Lark,
# and are subject to the terms of the Mozilla Public License, v. 2.0.
# If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
#
# If you wish to purchase a commercial license for this tool and its
# generated code, you may contact me via email or otherwise.
#
# If MPL2 is incompatible with your free or open-source project,
# contact me and we'll work it out.
#
#
from copy import deepcopy
from abc import ABC, abstractmethod
from types import ModuleType
from typing import (
    TypeVar, Generic, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any,
    Union, Iterable, IO, TYPE_CHECKING, overload, Sequence,
    Pattern as REPattern, ClassVar, Set, Mapping
)
###}
import sys
import token, tokenize
import os
from os import path
from collections import defaultdict
from functools import partial
from argparse import ArgumentParser
import lark
from lark.tools import lalr_argparser, build_lalr, make_warnings_comments
from lark.grammar import Rule
from lark.lexer import TerminalDef
_dir = path.dirname(__file__)
_larkdir = path.join(_dir, path.pardir)
EXTRACT_STANDALONE_FILES = [
    'tools/standalone.py',
    'exceptions.py',
    'utils.py',
    'tree.py',
    'visitors.py',
    'grammar.py',
    'lexer.py',
    'common.py',
    'parse_tree_builder.py',
    'parsers/lalr_analysis.py',
    'parsers/lalr_parser_state.py',
    'parsers/lalr_parser.py',
    'parsers/lalr_interactive_parser.py',
    'parser_frontends.py',
    'lark.py',
    'indenter.py',
]

def extract_sections(lines):
    section = None
    text = []
    sections = defaultdict(list)
    for line in lines:
        if line.startswith('###'):
            if line[3] == '{':
                section = line[4:].strip()
            elif line[3] == '}':
                sections[section] += text
                section = None
                text = []
            else:
                raise ValueError(line)
        elif section:
            text.append(line)

    return {name: ''.join(text) for name, text in sections.items()}
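
# Illustrative sketch (not part of the original file): extract_sections collects
# the text between '###{name' and '###}' markers, like the '###{standalone' block
# at the top of this file.
_demo_lines = ['###{demo\n', 'x = 1\n', '###}\n', 'ignored = True\n']
assert extract_sections(_demo_lines) == {'demo': 'x = 1\n'}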

def strip_docstrings(line_gen):
    """Strip comments and docstrings from a file.
    Based on code from: https://stackoverflow.com/questions/1769332/script-to-remove-python-comments-docstrings
    """
    res = []

    prev_toktype = token.INDENT
    last_lineno = -1
    last_col = 0

    tokgen = tokenize.generate_tokens(line_gen)
    for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
        if slineno > last_lineno:
            last_col = 0
        if scol > last_col:
            res.append(" " * (scol - last_col))
        if toktype == token.STRING and prev_toktype == token.INDENT:
            # Docstring
            res.append("#--")
        elif toktype == tokenize.COMMENT:
            # Comment
            res.append("##\n")
        else:
            res.append(ttext)
        prev_toktype = toktype
        last_col = ecol
        last_lineno = elineno

    return ''.join(res)
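
# Illustrative sketch (not part of the original file): strip_docstrings expects a
# readline-style callable, which is why gen_standalone below wraps each file's code
# in partial(next, iter(code.splitlines(True))).
_demo_src = 'def f():\n    "docstring"\n    return 1  # comment\n'
_stripped = strip_docstrings(partial(next, iter(_demo_src.splitlines(True))))
# In _stripped, the docstring becomes '#--' and the comment becomes '##'.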

def gen_standalone(lark_inst, output=None, out=sys.stdout, compress=False):
    if output is None:
        output = partial(print, file=out)

    import pickle, zlib, base64
    def compressed_output(obj):
        s = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
        c = zlib.compress(s)
        output(repr(base64.b64encode(c)))

    def output_decompress(name):
        output('%(name)s = pickle.loads(zlib.decompress(base64.b64decode(%(name)s)))' % locals())

    output('# The file was automatically generated by Lark v%s' % lark.__version__)
    output('__version__ = "%s"' % lark.__version__)
    output()

    for i, pyfile in enumerate(EXTRACT_STANDALONE_FILES):
        with open(os.path.join(_larkdir, pyfile)) as f:
            code = extract_sections(f)['standalone']
        if i:  # if not this file
            code = strip_docstrings(partial(next, iter(code.splitlines(True))))
        output(code)

    data, m = lark_inst.memo_serialize([TerminalDef, Rule])
    output('import pickle, zlib, base64')
    if compress:
        output('DATA = (')
        compressed_output(data)
        output(')')
        output_decompress('DATA')
        output('MEMO = (')
        compressed_output(m)
        output(')')
        output_decompress('MEMO')
    else:
        output('DATA = (')
        output(data)
        output(')')
        output('MEMO = (')
        output(m)
        output(')')

    output('Shift = 0')
    output('Reduce = 1')
    output("def Lark_StandAlone(**kwargs):")
    output(" return Lark._load_from_dict(DATA, MEMO, **kwargs)")

def main():
    make_warnings_comments()
    parser = ArgumentParser(prog='python -m lark.tools.standalone', description="Lark Stand-alone Generator Tool",
                            parents=[lalr_argparser], epilog='Look at the Lark documentation for more info on the options')
    parser.add_argument('-c', '--compress', action='store_true', default=False, help="Enable compression")
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    ns = parser.parse_args()
    lark_inst, out = build_lalr(ns)
    gen_standalone(lark_inst, out=out, compress=ns.compress)
    ns.out.close()
    ns.grammar_file.close()


if __name__ == '__main__':
    main()
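
A hedged end-to-end sketch of using the generator (file and rule names are placeholders, not from this commit):

# Shell form:  python -m lark.tools.standalone my_grammar.lark > my_parser.py
# my_parser.py then ships with your project, with no runtime dependency on lark.
from my_parser import Lark_StandAlone

parser = Lark_StandAlone()
tree = parser.parse('text that matches my_grammar.lark')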