From 3eadd8423e41b70e89854bd63e0e9875e259978c Mon Sep 17 00:00:00 2001
From: Oleg Broytman <phd@phdru.name>
Date: Sat, 9 Jul 2016 21:38:17 +0300
Subject: [PATCH] Use grako instead of PLY to compile EBNF to Python

First try. Most tests work.
---
 parser/Makefile       |  13 ++
 parser/build_ast.py   |  31 +++++
 parser/grammar        |  62 ---------
 parser/grammar.ebnf   |  36 +++++
 parser/parser.py      | 307 +++++++++++++++++++++++++-----------------
 parser/test_lexer.py  |  93 -------------
 parser/test_parser.py |  16 ++-
 7 files changed, 276 insertions(+), 282 deletions(-)
 create mode 100644 parser/Makefile
 create mode 100644 parser/build_ast.py
 delete mode 100644 parser/grammar
 create mode 100644 parser/grammar.ebnf
 mode change 100644 => 100755 parser/parser.py
 delete mode 100755 parser/test_lexer.py

diff --git a/parser/Makefile b/parser/Makefile
new file mode 100644
index 0000000..a811f0c
--- /dev/null
+++ b/parser/Makefile
@@ -0,0 +1,13 @@
+# Makefile.
+#
+# __author__ = "Oleg Broytman <phd@phdru.name>"
+# __copyright__ = "Copyright (C) 2016 PhiloSoft Design"
+
+parser.py: grammar.ebnf
+	grako -o $@ $<
+	chmod +x $@
+
+
+.PHONY: test
+test: parser.py
+	./test_parser.py
diff --git a/parser/build_ast.py b/parser/build_ast.py
new file mode 100644
index 0000000..de87f84
--- /dev/null
+++ b/parser/build_ast.py
@@ -0,0 +1,31 @@
+
+from parser import TagsSemantics as _TagsSemantics
+
+class TagsSemantics(_TagsSemantics):
+    def expression(self, ast):
+        if isinstance(ast, str):  # name
+            return ast
+        if isinstance(ast, unicode):  # name
+            return str(ast)
+        if isinstance(ast, list):
+            ast = tuple(ast)
+        elif not isinstance(ast, tuple):
+            raise TypeError("Expected a list, got %r %s" % (type(ast), ast))
+        if len(ast) == 3:
+            return (ast[1], ast[0], ast[2])
+        return ast
+
+    def expression_parens(self, ast):
+        return ('PARENS', ast[1])
+
+    def name(self, ast):
+        return ('NAME', str(ast))
+
+    def and_op(self, ast):
+        return 'AND'
+
+    def or_op(self, ast):
+        return 'OR'
+
+    def not_op(self, ast):
+        return 'NOT'
diff --git a/parser/grammar b/parser/grammar
deleted file mode 100644
index 14753c8..0000000
--- a/parser/grammar
+++ /dev/null
@@ -1,62 +0,0 @@
-# Grammar rules for tag searching; BNF.
-
-# The grammar defines expressions in the following forms:
-# TAG - search blog posts that contain the tag;
-# !TAG - search blog posts that don't contain the tag;
-# TAG & TAG - search blog posts that contain both tags;
-# TAG | TAG - search blog posts that contain any of the tags;
-# Parentheses are allowed to group expressions; for example:
-# TAG & (TAG | TAG)
-# !(TAG | TAG)
-# Allowed operators: conjunction - & && AND and
-#                    disjunction - | || OR or
-#                    negation - ! NOT not
-# This is a simple version of the grammar and it allows
-# rather stupid expressions, like !!TAG or ((TAG)); in the future
-# it will be fixed by making the grammar more complex and stricter.
-
-NAME : '[a-z][a-z0-9_]+'
-
-AND_OP : '&'
-
-OR_OP : '|'
-
-NOT_OP : '!'
-
-SP1 : '[ \t]+'
-
-expression : NAME
-             | expression SP0 AND_OP AND_OP SP0 expression
-             | expression SP0 AND_OP SP0 expression
-             | l_expression and_word r_expression
-             | expression SP0 OR_OP OR_OP SP0 expression
-             | expression SP0 OR_OP SP0 expression
-             | l_expression or_word r_expression
-             | NOT_OP SP0 expression
-             | not_word r_expression
-             | expression_parens
-
-l_expression : expression_parens
-               | expression_sp
-
-r_expression : expression_parens
-               | sp_expression
-
-expression_parens : '(' SP0 expression SP0 ')'
-
-sp_expression : SP1 expression
-
-expression_sp : expression SP1
-
-and_word : 'A' 'N' 'D'
-           | 'a' 'n' 'd'
-
-or_word : 'O' 'R'
-          | 'o' 'r'
-
-not_word : 'N' 'O' 'T'
-           | 'n' 'o' 't'
-
-SP0 : SP1 | empty
-
-empty :
diff --git a/parser/grammar.ebnf b/parser/grammar.ebnf
new file mode 100644
index 0000000..b657738
--- /dev/null
+++ b/parser/grammar.ebnf
@@ -0,0 +1,36 @@
+# Grammar rules for tag searching; EBNF.
+
+# The grammar defines expressions in the following forms:
+# TAG - search blog posts that contain the tag;
+# !TAG - search blog posts that don't contain the tag;
+# TAG & TAG - search blog posts that contain both tags;
+# TAG | TAG - search blog posts that contain any of the tags;
+# Parentheses are allowed to group expressions; for example:
+# TAG & (TAG | TAG)
+# !(TAG | TAG)
+# Allowed operators: conjunction - & && AND and
+#                    disjunction - | || OR or
+#                    negation - ! NOT not
+# This is a simple version of the grammar and it allows
+# rather stupid expressions, like !!TAG or ((TAG))
+
+@@grammar :: Tags
+
+start = expression $ ;
+
+expression = (
+    | expression and_op expression
+    | expression or_op expression
+    | not_op expression
+    | expression_parens
+    | name ) ;
+
+expression_parens = '(' expression ')' ;
+
+name = /[a-z][a-z0-9_]+/ ;
+
+and_op = '&' | '&&' | 'AND' | 'and' ;
+
+or_op = '|' | '||' | 'OR' | 'or' ;
+
+not_op = '!' | 'NOT' | 'not' ;
diff --git a/parser/parser.py b/parser/parser.py
old mode 100644
new mode 100755
index e214698..460a552
--- a/parser/parser.py
+++ b/parser/parser.py
@@ -1,121 +1,186 @@
-# Parse query
-
-from ply import lex, yacc
-
-literals = '()'
-
-tokens = ('OP_WORD', 'NAME', 'AND_OP', 'OR_OP', 'NOT_OP', 'SP1')
-
-t_OP_WORD = '(AND|and|OR|or|NOT|not)'
-
-t_NAME = '([a-z][a-z0-9_]*)'
-
-t_AND_OP = '&'
-
-t_OR_OP = r'\|'
-
-t_NOT_OP = '!'
-
-t_SP1 = '[ \t]+'
-
-def t_error(t):
-    """Avoid warnings on stderr"""
-
-lexer = lex.lex()
-
-def p_expression_name(p):
-    """expression : NAME"""
-    p[0] = ('NAME', p[1])
-
-def p_expression_and_and(p):
-    """expression : expression SP0 AND_OP AND_OP SP0 expression"""
-    p[0] = ('AND', p[1], p[6])
-
-def p_expression_and(p):
-    """expression : expression SP0 AND_OP SP0 expression"""
-    p[0] = ('AND', p[1], p[5])
-
-def p_expression_op_word(p):
-    """expression : l_expression op_word r_expression"""
-    if p[2] in ('AND', 'and'):
-        p[0] = ('AND', p[1], p[3])
-    elif p[2] in ('OR', 'or'):
-        p[0] = ('OR', p[1], p[3])
-    else:
-        raise ValueError(p)
-
-def p_expression_or_or(p):
-    """expression : expression SP0 OR_OP OR_OP SP0 expression"""
-    p[0] = ('OR', p[1], p[6])
-
-def p_expression_or(p):
-    """expression : expression SP0 OR_OP SP0 expression"""
-    p[0] = ('OR', p[1], p[5])
-
-def p_expression_not(p):
-    """expression : NOT_OP SP0 expression"""
-    p[0] = ('NOT', p[3])
-
-def p_expression_not_word(p):
-    """expression : op_word r_expression"""
-    if p[1] in ('NOT', 'not'):
-        p[0] = ('NOT', p[2])
-    else:
-        raise ValueError(p)
-
-def p_expression_in_parens(p):
-    """expression : expression_parens"""
-    p[0] = p[1]
-
-def p_l_expression(p):
-    """l_expression : expression_parens
-                    | expression SP1
-    """
-    if len(p) == 2:
-        p[0] = p[1]
-    elif len(p) == 3:
-        p[0] = p[1]
-    else:
-        raise ValueError(p)
-
-def p_r_expression(p):
-    """r_expression : expression_parens
-                    | SP1 expression
-    """
-    if len(p) == 2:
-        p[0] = p[1]
-    elif len(p) == 3:
-        p[0] = p[2]
-    else:
-        raise ValueError(p)
-
-def p_expression_parens(p):
-    """expression_parens : '(' SP0 expression SP0 ')'"""
-    p[0] = ('PARENS', p[3])
-
-def p_op_word(p):
-    """op_word : OP_WORD"""
-    if p[1] in ('AND', 'and', 'OR', 'or', 'NOT', 'not'):
-        p[0] = p[1]
-    else:
-        raise SyntaxError
-
-def p_SP0(p):
-    """SP0 : SP1
-           | empty
-    """
-
-def p_empty(p):
-    """empty :"""
-
-def p_error(p):
-    """Avoid warnings on stderr"""
-    yacc.restart()
-
-precedence = (
-    ('left', 'OR_OP'),
-    ('left', 'AND_OP'),
-    ('right', 'NOT_OP'),
-)
-
-parser = yacc.yacc()
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# CAVEAT UTILITOR
+#
+# This file was automatically generated by Grako.
+#
+#    https://pypi.python.org/pypi/grako/
+#
+# Any changes you make to it will be overwritten the next time
+# the file is generated.
+
+
+from __future__ import print_function, division, absolute_import, unicode_literals
+
+from grako.parsing import graken, Parser
+from grako.util import re, RE_FLAGS, generic_main  # noqa
+
+
+__version__ = (2016, 7, 9, 18, 10, 4, 5)
+
+__all__ = [
+    'TagsParser',
+    'TagsSemantics',
+    'main'
+]
+
+KEYWORDS = set([])
+
+
+class TagsParser(Parser):
+    def __init__(self,
+                 whitespace=None,
+                 nameguard=None,
+                 comments_re=None,
+                 eol_comments_re=None,
+                 ignorecase=None,
+                 left_recursion=True,
+                 keywords=KEYWORDS,
+                 namechars='',
+                 **kwargs):
+        super(TagsParser, self).__init__(
+            whitespace=whitespace,
+            nameguard=nameguard,
+            comments_re=comments_re,
+            eol_comments_re=eol_comments_re,
+            ignorecase=ignorecase,
+            left_recursion=left_recursion,
+            keywords=keywords,
+            namechars=namechars,
+            **kwargs
+        )
+
+    @graken()
+    def _start_(self):
+        self._expression_()
+        self._check_eof()
+
+    @graken()
+    def _expression_(self):
+        with self._group():
+            with self._choice():
+                with self._option():
+                    self._expression_()
+                    self._and_op_()
+                    self._expression_()
+                with self._option():
+                    self._expression_()
+                    self._or_op_()
+                    self._expression_()
+                with self._option():
+                    self._not_op_()
+                    self._expression_()
+                with self._option():
+                    self._expression_parens_()
+                with self._option():
+                    self._name_()
+                self._error('no available options')
+
+    @graken()
+    def _expression_parens_(self):
+        self._token('(')
+        self._expression_()
+        self._token(')')
+
+    @graken()
+    def _name_(self):
+        self._pattern(r'[a-z][a-z0-9_]+')
+
+    @graken()
+    def _and_op_(self):
+        with self._choice():
+            with self._option():
+                self._token('&')
+            with self._option():
+                self._token('&&')
+            with self._option():
+                self._token('AND')
+            with self._option():
+                self._token('and')
+            self._error('expecting one of: & && AND and')
+
+    @graken()
+    def _or_op_(self):
+        with self._choice():
+            with self._option():
+                self._token('|')
+            with self._option():
+                self._token('||')
+            with self._option():
+                self._token('OR')
+            with self._option():
+                self._token('or')
+            self._error('expecting one of: OR or | ||')
+
+    @graken()
+    def _not_op_(self):
+        with self._choice():
+            with self._option():
+                self._token('!')
+            with self._option():
+                self._token('NOT')
+            with self._option():
+                self._token('not')
+            self._error('expecting one of: ! NOT not')
+
+
+class TagsSemantics(object):
+    def start(self, ast):
+        return ast
+
+    def expression(self, ast):
+        return ast
+
+    def expression_parens(self, ast):
+        return ast
+
+    def name(self, ast):
+        return ast
+
+    def and_op(self, ast):
+        return ast
+
+    def or_op(self, ast):
+        return ast
+
+    def not_op(self, ast):
+        return ast
+
+
+def main(
+        filename,
+        startrule,
+        trace=False,
+        whitespace=None,
+        nameguard=None,
+        comments_re=None,
+        eol_comments_re=None,
+        ignorecase=None,
+        left_recursion=True,
+        **kwargs):
+
+    with open(filename) as f:
+        text = f.read()
+    whitespace = whitespace or None
+    parser = TagsParser(parseinfo=False)
+    ast = parser.parse(
+        text,
+        startrule,
+        filename=filename,
+        trace=trace,
+        whitespace=whitespace,
+        nameguard=nameguard,
+        ignorecase=ignorecase,
+        **kwargs)
+    return ast
+
+if __name__ == '__main__':
+    import json
+    ast = generic_main(main, TagsParser, name='Tags')
+    print('AST:')
+    print(ast)
+    print()
+    print('JSON:')
+    print(json.dumps(ast, indent=2))
+    print()
diff --git a/parser/test_lexer.py b/parser/test_lexer.py
deleted file mode 100755
index ba46ee6..0000000
--- a/parser/test_lexer.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#! /usr/bin/env python
-
-import unittest
-
-class TestLexer(unittest.TestCase):
-    def test_01_import(self):
-        global lexer
-        from parser import lexer
-
-    def test_02_tag(self):
-        global LexToken
-        from ply.lex import LexToken
-        lexer.input('xxx')
-        tokens = list(lexer)
-        self.assertEqual(len(tokens), 1)
-        lextoken = tokens[0]
-        self.assertEqual(lextoken.type, 'NAME')
-        self.assertEqual(lextoken.value, 'xxx')
-        self.assertEqual(lextoken.lineno, 1)
-        self.assertEqual(lextoken.lexpos, 0)
-
-    def test_03_bad_tag(self):
-        from ply.lex import LexError
-        lexer.input('XXX')
-        self.assertRaises(LexError, list, lexer)
-
-    def test_04_expression(self):
-        lexer.input('!(xxx&yyy)')
-        tokens = list(lexer)
-        self.assertEqual(len(tokens), 6)
-        lextoken = tokens[0]
-        self.assertEqual(lextoken.type, 'NOT_OP')
-        self.assertEqual(lextoken.value, '!')
-        self.assertEqual(lextoken.lineno, 1)
-        self.assertEqual(lextoken.lexpos, 0)
-        lextoken = tokens[1]
-        self.assertEqual(lextoken.type, '(')
-        self.assertEqual(lextoken.value, '(')
-        self.assertEqual(lextoken.lineno, 1)
-        self.assertEqual(lextoken.lexpos, 1)
-        lextoken = tokens[2]
-        self.assertEqual(lextoken.type, 'NAME')
-        self.assertEqual(lextoken.value, 'xxx')
-        self.assertEqual(lextoken.lineno, 1)
-        self.assertEqual(lextoken.lexpos, 2)
-        lextoken = tokens[3]
-        self.assertEqual(lextoken.type, 'AND_OP')
-        self.assertEqual(lextoken.value, '&')
-        self.assertEqual(lextoken.lineno, 1)
-        self.assertEqual(lextoken.lexpos, 5)
-        lextoken = tokens[4]
-        self.assertEqual(lextoken.type, 'NAME')
-        self.assertEqual(lextoken.value, 'yyy')
-        self.assertEqual(lextoken.lineno, 1)
-        self.assertEqual(lextoken.lexpos, 6)
-        lextoken = tokens[5]
-        self.assertEqual(lextoken.type, ')')
-        self.assertEqual(lextoken.value, ')')
-        self.assertEqual(lextoken.lineno, 1)
-        self.assertEqual(lextoken.lexpos, 9)
-
-    def test_05_expression_2(self):
-        lexer.input('xxx and yyy')
-        tokens = list(lexer)
-        self.assertEqual(len(tokens), 5)
-        lextoken = tokens[0]
-        self.assertEqual(lextoken.type, 'NAME')
-        self.assertEqual(lextoken.value, 'xxx')
-        self.assertEqual(lextoken.lineno, 1)
-        self.assertEqual(lextoken.lexpos, 0)
-        lextoken = tokens[1]
-        self.assertEqual(lextoken.type, 'SP1')
-        self.assertEqual(lextoken.value, ' ')
-        self.assertEqual(lextoken.lineno, 1)
-        self.assertEqual(lextoken.lexpos, 3)
-        lextoken = tokens[2]
-        self.assertEqual(lextoken.type, 'OP_WORD')
-        self.assertEqual(lextoken.value, 'and')
-        self.assertEqual(lextoken.lineno, 1)
-        self.assertEqual(lextoken.lexpos, 4)
-        lextoken = tokens[3]
-        self.assertEqual(lextoken.type, 'SP1')
-        self.assertEqual(lextoken.value, ' ')
-        self.assertEqual(lextoken.lineno, 1)
-        self.assertEqual(lextoken.lexpos, 7)
-        lextoken = tokens[4]
-        self.assertEqual(lextoken.type, 'NAME')
-        self.assertEqual(lextoken.value, 'yyy')
-        self.assertEqual(lextoken.lineno, 1)
-        self.assertEqual(lextoken.lexpos, 9)
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/parser/test_parser.py b/parser/test_parser.py
index 4f2e176..046c31c 100755
--- a/parser/test_parser.py
+++ b/parser/test_parser.py
@@ -1,21 +1,25 @@
 #! /usr/bin/env python
+
 import unittest
 
+from grako.exceptions import FailedParse
+
 class TestParser(unittest.TestCase):
     def test_01_import(self):
-        global parser
-        from parser import parser
+        global parser, TagsSemantics
+        from parser import TagsParser
+        from build_ast import TagsSemantics
+        parser = TagsParser(parseinfo=False)
 
     def _parse(self, input):
-        return parser.parse(input)
+        return parser.parse(input, semantics=TagsSemantics())
 
     def test_02_tag(self):
         self.assertEqual(self._parse('xxx'), ('NAME', 'xxx'))
 
     def test_03_bad_tag(self):
-        from ply.lex import LexError
-        self.assertRaises(LexError, self._parse, 'XXX')
+        self.assertRaises(FailedParse, self._parse, 'XXX')
 
     def test_04_expression(self):
         self.assertEqual(self._parse('!(xxx&yyy)'),
@@ -50,7 +54,7 @@ class TestParser(unittest.TestCase):
         )
 
     def test_05_bad_expression(self):
-        self.assertIs(self._parse('!(xxx&yyy'), None)
+        self.assertRaises(FailedParse, self._parse, '!(xxx&yyy')
 
 if __name__ == "__main__":
     unittest.main()
-- 
2.39.2
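
For reference, a minimal usage sketch of the regenerated parser together with
the semantics subclass from build_ast.py. It mirrors test_parser.py; the
expected result for '!(xxx&yyy)' is inferred from the tuple-building rules in
build_ast.py, and grako must be installed:

    #!/usr/bin/env python

    from grako.exceptions import FailedParse

    from parser import TagsParser        # generated by `make parser.py`
    from build_ast import TagsSemantics  # rewrites grako's AST into tuples

    parser = TagsParser(parseinfo=False)
    semantics = TagsSemantics()

    # A single tag becomes a ('NAME', tag) tuple, as test_02_tag expects.
    print(parser.parse('xxx', semantics=semantics))
    # -> ('NAME', 'xxx')

    # Operators become prefix tuples; per build_ast.py this should print
    # ('NOT', ('PARENS', ('AND', ('NAME', 'xxx'), ('NAME', 'yyy')))).
    print(parser.parse('!(xxx&yyy)', semantics=semantics))

    # Unlike the PLY parser, which returned None on bad input, grako
    # signals errors by raising FailedParse (see test_05_bad_expression).
    try:
        parser.parse('XXX', semantics=semantics)
    except FailedParse:
        print('parse error')

The semantics subclass reproduces the tuple format the old PLY parser built,
which is why the existing tests needed only their setup and error-handling
expectations changed.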