What kind of parser is it? One that performs processing like the following Python execution example.
>>> p_syn(p_lex(input()))
((10 + 20)*(30 - 40))
[['10', '+', '20'], '*', ['30', '-', '40']]
>>> p_syn(p_lex(input()))
{ "color": [ "red", "green" ] }
[['color'], ':', [['red'], ',', ['green']]]
I've been writing simple S-expression parsers in various programming languages for various reasons, and it occurred to me that, with a little modification, they could also serve as simple JSON parsers. With that in mind, I decided to put together an example that can easily be turned into a JSON parser, an S-expression parser, or a parser for your own parenthesis-based data structure notation with small modifications and a little additional processing. It consists only of [lexical analysis](https://ja.wikipedia.org/wiki/%E5%AD%97%E5%8F%A5%E8%A7%A3%E6%9E%90) restricted to parenthesized strings plus [abstract syntax tree](https://ja.wikipedia.org/wiki/%E6%8A%BD%E8%B1%A1%E6%A7%8B%E6%96%87%E6%9C%A8) generation, so it should also be possible to reuse just the lexical analysis part or just the abstract syntax tree generation part.
`p_lex` function (lexical analyzer): from the input string, generates an array of strings, treating each of `()[]{}",` as a lexical token and whitespace as a delimiter.
[Example] `{(10 + 20) * (30 - 40)}`
⇒ `['{', '(', '10', '+', '20', ')', '*', '(', '30', '-', '40', ')', '}']`
If you want to use this as the basis of a so-called calculator program, you may also want to recognize operators such as `+` `-` `*` `/` as lexical tokens, as sketched below.
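For instance, a minimal sketch of such an extension (the function name `p_lex_calc` and the exact operator set are my own assumptions, not part of the original code):

```python
def p_lex_calc(p):
    # Same idea as p_lex, but operators are also spaced out into tokens.
    # Note: this splits a negative literal like '-40' into '-', '40'.
    for s in '()[]{}",' + '+-*/':
        p = p.replace(s, ' ' + s + ' ')
    return p.split()
```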
`p_syn` function (abstract syntax tree generator): from the above string array, generates nested lists grouped by `()[]{}"`.
[Example] `['{', '(', '10', '+', '20', ')', '*', '(', '30', '-', '40', ')', '}']`
⇒ `[['10', '+', '20'], '*', ['30', '-', '40']]`
Delimiters such as `,` `:` `.` remain as list elements, so depending on what you reuse the result for, additional processing may be required, such as deleting each `,`, or turning `A : B` and `A . B` into lists of keys and values. Also, if you want to vary the processing by the type of parentheses, some adjustments will be needed. One possible post-processing pass is sketched below.
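As one illustration, here is a minimal post-processing sketch (the helper names `drop_commas` and `pair_keys` are my own, and it assumes well-formed `key : value` runs; it does not distinguish quote groups from bracket groups):

```python
def drop_commas(r):
    # Remove ',' separators recursively.
    return [drop_commas(x) if isinstance(x, list) else x
            for x in r if x != ',']

def pair_keys(r):
    # Fold 'k : v k : v ...' runs into (key, value) tuples.
    if not isinstance(r, list):
        return r
    if ':' in r:
        return [(pair_keys(r[i]), pair_keys(r[i + 2]))
                for i in range(0, len(r), 3)]
    return [pair_keys(x) for x in r]
```

>>> pair_keys(drop_commas(p_syn(p_lex('{"a": 10, "b": 20}'))))
[(['a'], '10'), (['b'], '20')]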
A good deal is borrowed from `tokenize` and `read_from_tokens` in Peter Norvig's "lis.py". The code is written without `pop` and `append` for ease of comparison with the implementation examples in the other languages.
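For reference, a sketch of what `p_syn` would look like with `pop` and `append` allowed, closer in shape to `read_from_tokens` (the name `p_syn_pop` is my own):

```python
def p_syn_pop(p):
    t = p.pop(0)
    if t in '({["':
        r = []
        # Collect elements until the matching closing token.
        while p[0] != ')}]"'['({["'.find(t)]:
            r.append(p_syn_pop(p))
        p.pop(0)  # discard the closing token
        return r
    return t
```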
p_lex.py
def p_lex(p):
    # Surround each token character with spaces, then split on whitespace
    for s in '()[]{}",':
        p = p.replace(s, ' ' + s + ' ')
    return p.split()
p_syn.py
def p_syn(p):
    t = p[0]
    del p[0]
    if t in '({["':
        r = []
        # Collect elements until the matching closing token appears
        while p[0] != ')}]"'['({["'.find(t)]:
            r += [p_syn(p)]
        del p[0]
        return r
    else:
        return t
>>> p_syn(p_lex(input()))
{"a": [100, f], "b": {"a": [t, "c"], "b": "d"}}
[['a'], ':', ['100', ',', 'f'], ',', ['b'], ':', [['a'], ':', ['t', ',', ['c']], ',', ['b'], ':', ['d']]]
The C version was created using the Python version as a model. The lexical analysis part uses `strtok`. In the abstract syntax tree generation part, the list structure is built out of binary trees, purely as a matter of personal taste. For that reason, `p_syn` scans the string array from the back.
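To illustrate the idea before the C code, here is a minimal Python sketch of the same backward scan (the name `p_syn_rev` is my own; unlike the listings above it uses `pop` for brevity). Consuming tokens from the end lets each group be built by prepending, which maps naturally onto cons-style pairs:

```python
def p_syn_rev(p):
    t = p.pop()
    if t in ')]}"':
        r = None  # empty list, like NULL in the C version
        while p[-1] != '({["'[')]}"'.find(t)]:
            r = (p_syn_rev(p), r)  # prepend a new pair
        p.pop()  # discard the opening token
        return r
    return t
```

>>> p_syn_rev(p_lex('(10 (20 30))'))
('10', (('20', ('30', None)), None))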
p_lex.h
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef P_LEX_H_
#define P_LEX_H_
#define SSTR_MAX 256
int p_lex(const char *p, char* pl[]);
#endif
p_lex.c
#include "p_lex.h"
int p_lex(const char *p, char* pl[])
{
char pf[SSTR_MAX * 3];
int i, j = 0;
for (i = 0; i < strlen(p); i++) {
switch (p[i]) {
case '(':
case ')':
case '[':
case ']':
case '{':
case '}':
case '"':
case ',':
pf[j++] = ' '; pf[j++] = p[i]; pf[j++] = ' ';
break;
case '\n': j++; break;
default: pf[j++] = p[i];
}
}
pf[j] = '\0';
char *t;
int len = 0;
for (t = strtok(pf, " "); t != NULL; t = strtok(NULL, " "))
pl[len++] = t;
pl[len] = NULL;
return (len);
}
p_syn.h
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#ifndef P_SYN_H_
#define P_SYN_H_

/* uintptr_t, so a node's value can hold a pointer without
   truncation on 64-bit platforms */
typedef uintptr_t value_t;
enum NODE_TAG { NODE_STRG, NODE_BTRE };

/* A node is either a string (NODE_STRG) or a binary-tree pair (NODE_BTRE) */
typedef struct _node_t_ {
    value_t value;
    enum NODE_TAG tag;
} _node_t, *node_t;
node_t node(value_t value, enum NODE_TAG tag);

typedef struct _btree_t_ {
    node_t x;  /* head element */
    node_t y;  /* rest of the list */
} _btree_t, *btree_t;
node_t btree(node_t x, node_t y);

#define str_to_node(p) (node((value_t)(p), NODE_STRG))
#define node_to_str(p) ((char *)(p->value))
#define bt_x(p) (((btree_t)(p->value))->x)
#define bt_y(p) (((btree_t)(p->value))->y)
#define n_str(p) (p->tag == NODE_STRG)
#define n_btr(p) (p->tag == NODE_BTRE)

node_t p_syn(char *p[], int *pos);
#endif
p_syn.c
#include "p_syn.h"
node_t node(value_t value, enum NODE_TAG tag)
{
node_t n = (node_t)malloc(sizeof(_node_t));
n->value = value; n->tag = tag;
return (n);
}
node_t btree(node_t x, node_t y)
{
btree_t c = (btree_t)malloc(sizeof(_btree_t));
c->x = x; c->y = y;
node_t n = node((value_t)c, NODE_BTRE);
return (n);
}
char p_syn_rp(char lp)
{
switch (lp) {
case ')': return '(';
case ']': return '[';
case '}': return '{';
case '"': return '"';
default: return '\0';
}
}
node_t p_syn(char *p[], int *pos)
{
char *t = p[*pos];
*pos = *pos - 1;
switch (t[0]) {
case ')':
case ']':
case '}':
case '"': {
node_t r = NULL;
while (p[*pos][0] != p_syn_rp(t[0])) {
r = btree(p_syn(p, pos), r);
}
*pos = *pos - 1;
return (r);
}
default:
return (str_to_node(t));
}
}
p_sample.c
#include <stdio.h>
#include <stdlib.h>
#include "p_lex.h"
#include "p_syn.h"

/* Print the binary-tree structure in Python-list style */
void _p_print(node_t p)
{
    if (p == NULL) {
        printf("]");
    } else if (n_str(p)) {
        printf("'%s'", node_to_str(p));
    } else {
        if (n_btr(bt_x(p))) printf("[");
        _p_print(bt_x(p));
        if (bt_y(p) != NULL) printf(", ");
        _p_print(bt_y(p));
    }
}

void p_print(node_t p)
{
    if (n_btr(p)) { printf("["); _p_print(p); }
    else printf("'%s'", node_to_str(p));
}

int main(void)
{
    char p[SSTR_MAX];
    int i, c = getchar();  /* int, so that EOF can be detected */
    for (i = 0; c != '\n' && c != EOF; i++, c = getchar()) p[i] = c;
    p[i] = '\0';
    char *pl[SSTR_MAX];
    int len = p_lex(p, pl) - 1;  /* index of the last token */
    node_t r = p_syn(pl, &len);
    p_print(r); printf("\n");
    return (0);
}
$ ls
p_lex.c p_lex.h p_sample.c p_syn.c p_syn.h
$ cc -Wall -o p_sample *.c
$ ./p_sample
{"a": [100, f], "b": {"a": [t, "c"], "b": "d"}}
[['a'], ':', ['100', ',', 'f'], ',', ['b'], ':', [['a'], ':', ['t', ',', ['c']], ',', ['b'], ':', ['d']]]
The Ruby version is almost the same as the Python version.
p_lex.rb
def p_lex(p)
  for s in ['(',')','[',']','{','}','"',','] do
    p = p.gsub(s, ' ' + s + ' ')
  end
  return p.split
end
p_syn.rb
def p_syn_rp(lp)
  case lp
  when '(' then ')'
  when '[' then ']'
  when '{' then '}'
  when '"' then '"'
  end
end

def p_syn(p)
  t = p.shift
  if ['(','[','{','"'].include?(t) then
    r = []
    while p[0] != p_syn_rp(t) do
      r += [p_syn(p)]
    end
    p.shift
    return r
  else
    return t
  end
end
>> p_syn(p_lex(gets.chomp))
{"a": [100, f], "b": {"a": [t, "c"], "b": "d"}}
=> [["a"], ":", ["100", ",", "f"], ",", ["b"], ":", [["a"], ":", ["t", ",", ["c"]], ",", ["b"], ":", ["d"]]]
The JavaScript version is also almost the same as the Python version.
p_lex.js
function p_lex(p) {
    p = p.replace(/(\(|\)|\[|\]|\{|\}|\"|\,)/g, ' $1 ');
    return p.split(/\s+/).filter(x => x != '');
}
p_syn.js
const p_syn_rp = {'(':')','{':'}','[':']','"':'"'};
function p_syn(p) {
    var t = p.shift();
    if (['(','{','[','"'].includes(t)) {
        var r = [];
        while (p[0] != p_syn_rp[t]) {
            r = r.concat([p_syn(p)]);
        }
        p.shift();
        return r;
    } else {
        return t;
    }
}
> p_syn(p_lex('{"a": [100, f], "b": {"a": [t, "c"], "b": "d"}}'))
[ [ 'a' ],
':',
[ '100', ',', 'f' ],
',',
[ 'b' ],
':',
[ [ 'a' ], ':', [ 't', ',', [Array] ], ',', [ 'b' ], ':', [ 'd' ] ] ]
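Note that the Node.js REPL abbreviates arrays nested more than two levels deep as `[Array]`; printing the result with `JSON.stringify(...)` would show the full structure.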