What kind of parser is that it performs processing like the following Python execution example.

>>> p_syn(p_lex(input()))
((10 + 20)*(30 - 40))
[['10', '+', '20'], '*', ['30', '-', '40']]

>>> p_syn(p_lex(input()))
{ "color": [ "red", "green" ] }
[['color'], ':', [['red'], ',', ['green']]]

I'm in the process of creating a simple S-expression parser in various programming languages with various reasons, but with a little modification, a simple JSON parser Isn't it also possible? While noticing that, I decided to put together a description example that can be easily implemented in a JSON parser, an S-expression parser, or a parenthesis-based original data structure representation parser with a little modification and additional processing. [Lexical analysis] limited to parentheses strings (https://ja.wikipedia.org/wiki/%E5%AD%97%E5%8F%A5%E8%A7%A3%E6%9E%90) + [Abstract] Syntax Tree](https://ja.wikipedia.org/wiki/%E6%8A%BD%E8%B1%A1%E6%A7%8B%E6%96%87%E6%9C%A8) Generation So, it may be possible to divert only the lexical analysis part or only the abstract syntax tree generation part.

specification

`p_lex` function (lexical analyzer)

From the input character string, a character string array is generated using () [] {}" and as lexical terms and blank as a delimiter.

[Example] {(10 + 20) * (30 -40)} ⇒['{', '(', '10', '+', '20', ')', '*', '(', '30', '-', '40', ')', '}']

If you want to use it as an example of a so-called calculator program, you may want to additionally recognize operators such as + -`` * `` / as lexical terms.

`p_syn` function (abstract syntax tree generator)

From the above string array, generate a list of () [] {}" groups.

[Example] ['{','(', '10','+', '20',')','*','(', '30','-', '40',' )','}'] ⇒[['10', '+', '20'], '*', ['30', '-', '40']]

, : . Etc. remain as list elements, so depending on the purpose of diversion,, will be deleted, and ʻA: B and ʻA. B will be a list of keys and values. , Etc. may be required. Also, if you want to change the processing depending on the type of parentheses, you will need to make some adjustments.

Other

No error checking
No modularization
No garbage collection

Python implementation example

A good deal is taken from tokenize and read_from_tokens in Peter Norvig's "lis.py". The description has been changed so that pop and ʻappend` are not used for ease of comparison with implementation examples in other languages.

`p_lex.py`


def p_lex(p):
    for s in '()[]{}",':
        p = p.replace(s, ' ' + s + ' ')
    return p.split()

`p_syn.py`


def p_syn(p):
    t = p[0]
    del p[0]
    if t in '({["':
        r = []
        while p[0] != ')}]"'['({["'.find(t)]:
            r += [p_syn(p)]
        del p[0]
        return (r)
    else:
        return t

>>> p_syn(p_lex(input()))
{"a": [100, f], "b": {"a": [t, "c"], "b": "d"}}
[['a'], ':', ['100', ',', 'f'], ',', ['b'], ':', [['a'], ':', ['t', ',', ['c']], ',', ['b'], ':', ['d']]]

C language implementation example

Created using the Python version as a model. The lexical analysis section uses strtok. In the abstract syntax tree generation part, a list structure is generated by a binary tree according to our hobby (?). For that reason, p_syn scans the string array from behind.

`p_lex.h`


#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef P_LEX_H_
#define P_LEX_H_

#define SSTR_MAX 256

int p_lex(const char *p, char* pl[]);

#endif

`p_lex.c`


#include "p_lex.h"

int p_lex(const char *p, char* pl[])
{
  char pf[SSTR_MAX * 3];
  int i, j = 0;
  for (i = 0; i < strlen(p); i++) {
    switch (p[i]) {
      case '(':
      case ')':
      case '[':
      case ']':
      case '{':
      case '}':
      case '"':
      case ',':
        pf[j++] = ' '; pf[j++] = p[i]; pf[j++] = ' ';
        break;
      case '\n': j++; break;
      default: pf[j++] = p[i];
    }
  }
  pf[j] = '\0';

  char *t;
  int len = 0;
  for (t = strtok(pf, " "); t != NULL; t = strtok(NULL, " "))
    pl[len++] = t;
  pl[len] = NULL;

  return (len);
}

`p_syn.h`


#include <stdio.h>
#include <stdlib.h>

#ifndef P_SYN_H_
#define P_SYN_H_

typedef unsigned int value_t;
enum NODE_TAG { NODE_STRG, NODE_BTRE };

typedef struct _node_t_ {
  value_t value;
  enum NODE_TAG tag;
} _node_t, *node_t;

node_t node(value_t value, enum NODE_TAG tag);

typedef struct _btree_t_ {
  node_t x;
  node_t y;
} _btree_t, *btree_t;

node_t btree(node_t x, node_t y);

#define str_to_node(p)  (node((value_t)(p), NODE_STRG))
#define node_to_str(p)  ((char *)(p->value))

#define bt_x(p)  (((btree_t)(p->value))->x)
#define bt_y(p)  (((btree_t)(p->value))->y)
#define n_str(p) (p->tag == NODE_STRG)
#define n_btr(p) (p->tag == NODE_BTRE)

node_t p_syn(char *p[], int *pos);

#endif

`p_syn.c`


#include "p_syn.h"

node_t node(value_t value, enum NODE_TAG tag)
{
  node_t n = (node_t)malloc(sizeof(_node_t));
  n->value = value; n->tag = tag;
  return (n);
}

node_t btree(node_t x, node_t y)
{
  btree_t c = (btree_t)malloc(sizeof(_btree_t));
  c->x = x; c->y = y;
  node_t n = node((value_t)c, NODE_BTRE);
  return (n);
}

char p_syn_rp(char lp)
{
  switch (lp) {
    case ')': return '(';
    case ']': return '[';
    case '}': return '{';
    case '"': return '"';
    default: return '\0';
  }
}

node_t p_syn(char *p[], int *pos)
{
  char *t = p[*pos];
  *pos = *pos - 1;

  switch (t[0]) {
    case ')':
    case ']':
    case '}':
    case '"': {
      node_t r = NULL;
      while (p[*pos][0] != p_syn_rp(t[0])) {
        r = btree(p_syn(p, pos), r);
      }
      *pos = *pos - 1;
      return (r);
    }
    default:
      return (str_to_node(t));
  }
}

`p_sample.c`


#include <stdio.h>
#include <stdlib.h>

#include "p_lex.h"
#include "p_syn.h"

void _p_print(node_t p)
{
  if (p == NULL) {
    printf("]");
  } else if (n_str(p)) {
    printf("'%s'", node_to_str(p));
  } else {
    if (n_btr(bt_x(p))) printf("[");
    _p_print(bt_x(p));
    if (bt_y(p) != NULL) printf(", ");
    _p_print(bt_y(p));
  }
}
void p_print(node_t p)
{
  if (n_btr(p)) { printf("["); _p_print(p); }
  else printf("'%s'", node_to_str(p));
}

int main(void)
{
  char p[SSTR_MAX], c = getchar();
  int i;
  for (i = 0; c != '\n'; i++, c = getchar()) p[i] = c;
  p[i] = '\0';

  char *pl[SSTR_MAX];
  int len = p_lex(p, pl) - 1;
  node_t r = p_syn(pl, &len);

  p_print(r); printf("\n");

  return (0);
}

$ ls
p_lex.c  p_lex.h  p_sample.c  p_syn.c  p_syn.h
$ cc -Wall -o p_sample *.c
$ ./p_sample
{"a": [100, f], "b": {"a": [t, "c"], "b": "d"}}
[['a'], ':', ['100', ',', 'f'], ',', ['b'], ':', [['a'], ':', ['t', ',', ['c']], ',', ['b'], ':', ['d']]]

Ruby implementation example

Almost the same as the Python version.

`p_lex.rb`


def p_lex(p)
  for s in ['(',')','[',']','{','}','"',','] do
    p = p.gsub(s, ' ' + s + ' ')
  end
  return p.split
end

`p_syn.rb`


def p_syn_rp(lp)
  case lp
    when '(' then ')'
    when '[' then ']'
    when '{' then '}'
    when '"' then '"'
  end
end

def p_syn(p)
  t = p.shift
  if ['(','[','{','"'].include?(t) then
    r = []
    while p[0] != p_syn_rp(t) do
      r += [p_syn(p)]
    end
    p.shift
    return r
  else
    return t
  end
end

>> p_syn(p_lex(gets.chomp))
{"a": [100, f], "b": {"a": [t, "c"], "b": "d"}}
=> [["a"], ":", ["100", ",", "f"], ",", ["b"], ":", [["a"], ":", ["t", ",", ["c"]], ",", ["b"], ":", ["d"]]]

JavaScript implementation example

Almost the same as the Python version.

`p_lex.js`


function p_lex(p) {
  p = p.replace(/(\(|\)|\[|\]|\{|\}|\"|\,)/g, ' $1 ');
  return p.split(/\s+/).filter(x => x != '');
}

`p_syn.js`


p_syn_rp = {'(':')','{':'}','[':']','"':'"'};
function p_syn(p) {
  var t = p.shift();
  if (['(','{','[','"'].includes(t)) {
    var r = [];
    while (p[0] != p_syn_rp[t]) {
      r = r.concat([p_syn(p)]);
    }
    p.shift();
    return r;
  } else {
    return t;
  }
}

> p_syn(p_lex('{"a": [100, f], "b": {"a": [t, "c"], "b": "d"}}'))
[ [ 'a' ],
  ':',
  [ '100', ',', 'f' ],
  ',',
  [ 'b' ],
  ':',
  [ [ 'a' ], ':', [ 't', ',', [Array] ], ',', [ 'b' ], ':', [ 'd' ] ] ]

Remarks

Supplementary information about the article

I remember that there was a term that refers to "bracket strings", but I just can't remember ... Also, I can't remember the term that refers to the processing algorithm of s_syn ... "[Shunting-yard algorithm](https://ja.wikipedia.org/wiki/%E6%93%8D%E8%BB%8A%E5%A0%B4%E3%82%A2%E3%83%AB% E3% 82% B4% E3% 83% AA% E3% 82% BA% E3% 83% A0) ”is different.

Change log

2020-09-03: Added implementation examples of Ruby and JavaScript
2020-09-03: First edition released (Python, C language implementation example)

[JAVA] "Parentheses character string" Simple parser implementation example summary

specification

p_lex function (lexical analyzer)

p_syn function (abstract syntax tree generator)

Other

Python implementation example

p_lex.py

p_syn.py

C language implementation example

p_lex.h

p_lex.c

p_syn.h

p_syn.c

p_sample.c