[JAVA] "Parentheses character string" Simple parser implementation example summary

What kind of parser is that it performs processing like the following Python execution example.

>>> p_syn(p_lex(input()))
((10 + 20)*(30 - 40))
[['10', '+', '20'], '*', ['30', '-', '40']]

>>> p_syn(p_lex(input()))
{ "color": [ "red", "green" ] }
[['color'], ':', [['red'], ',', ['green']]]

I'm in the process of creating a simple S-expression parser in various programming languages with various reasons, but with a little modification, a simple JSON parser Isn't it also possible? While noticing that, I decided to put together a description example that can be easily implemented in a JSON parser, an S-expression parser, or a parenthesis-based original data structure representation parser with a little modification and additional processing. [Lexical analysis] limited to parentheses strings (https://ja.wikipedia.org/wiki/%E5%AD%97%E5%8F%A5%E8%A7%A3%E6%9E%90) + [Abstract] Syntax Tree](https://ja.wikipedia.org/wiki/%E6%8A%BD%E8%B1%A1%E6%A7%8B%E6%96%87%E6%9C%A8) Generation So, it may be possible to divert only the lexical analysis part or only the abstract syntax tree generation part.

specification

p_lex function (lexical analyzer)

From the input character string, a character string array is generated using () [] {}" and as lexical terms and blank as a delimiter.

[Example] {(10 + 20) * (30 -40)}['{', '(', '10', '+', '20', ')', '*', '(', '30', '-', '40', ')', '}']

If you want to use it as an example of a so-called calculator program, you may want to additionally recognize operators such as + -`` * `` / as lexical terms.

p_syn function (abstract syntax tree generator)

From the above string array, generate a list of () [] {}" groups.

[Example] ['{','(', '10','+', '20',')','*','(', '30','-', '40',' )','}'] [['10', '+', '20'], '*', ['30', '-', '40']]

, : . Etc. remain as list elements, so depending on the purpose of diversion,, will be deleted, and ʻA: B and ʻA. B will be a list of keys and values. , Etc. may be required. Also, if you want to change the processing depending on the type of parentheses, you will need to make some adjustments.

Other

Python implementation example

A good deal is taken from tokenize and read_from_tokens in Peter Norvig's "lis.py". The description has been changed so that pop and ʻappend` are not used for ease of comparison with implementation examples in other languages.

p_lex.py


def p_lex(p):
    for s in '()[]{}",':
        p = p.replace(s, ' ' + s + ' ')
    return p.split()

p_syn.py


def p_syn(p):
    t = p[0]
    del p[0]
    if t in '({["':
        r = []
        while p[0] != ')}]"'['({["'.find(t)]:
            r += [p_syn(p)]
        del p[0]
        return (r)
    else:
        return t
>>> p_syn(p_lex(input()))
{"a": [100, f], "b": {"a": [t, "c"], "b": "d"}}
[['a'], ':', ['100', ',', 'f'], ',', ['b'], ':', [['a'], ':', ['t', ',', ['c']], ',', ['b'], ':', ['d']]]

C language implementation example

Created using the Python version as a model. The lexical analysis section uses strtok. In the abstract syntax tree generation part, a list structure is generated by a binary tree according to our hobby (?). For that reason, p_syn scans the string array from behind.

p_lex.h


#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef P_LEX_H_
#define P_LEX_H_

#define SSTR_MAX 256

int p_lex(const char *p, char* pl[]);

#endif

p_lex.c


#include "p_lex.h"

int p_lex(const char *p, char* pl[])
{
  char pf[SSTR_MAX * 3];
  int i, j = 0;
  for (i = 0; i < strlen(p); i++) {
    switch (p[i]) {
      case '(':
      case ')':
      case '[':
      case ']':
      case '{':
      case '}':
      case '"':
      case ',':
        pf[j++] = ' '; pf[j++] = p[i]; pf[j++] = ' ';
        break;
      case '\n': j++; break;
      default: pf[j++] = p[i];
    }
  }
  pf[j] = '\0';

  char *t;
  int len = 0;
  for (t = strtok(pf, " "); t != NULL; t = strtok(NULL, " "))
    pl[len++] = t;
  pl[len] = NULL;

  return (len);
}

p_syn.h


#include <stdio.h>
#include <stdlib.h>

#ifndef P_SYN_H_
#define P_SYN_H_

typedef unsigned int value_t;
enum NODE_TAG { NODE_STRG, NODE_BTRE };

typedef struct _node_t_ {
  value_t value;
  enum NODE_TAG tag;
} _node_t, *node_t;

node_t node(value_t value, enum NODE_TAG tag);

typedef struct _btree_t_ {
  node_t x;
  node_t y;
} _btree_t, *btree_t;

node_t btree(node_t x, node_t y);

#define str_to_node(p)  (node((value_t)(p), NODE_STRG))
#define node_to_str(p)  ((char *)(p->value))

#define bt_x(p)  (((btree_t)(p->value))->x)
#define bt_y(p)  (((btree_t)(p->value))->y)
#define n_str(p) (p->tag == NODE_STRG)
#define n_btr(p) (p->tag == NODE_BTRE)

node_t p_syn(char *p[], int *pos);

#endif

p_syn.c


#include "p_syn.h"

node_t node(value_t value, enum NODE_TAG tag)
{
  node_t n = (node_t)malloc(sizeof(_node_t));
  n->value = value; n->tag = tag;
  return (n);
}

node_t btree(node_t x, node_t y)
{
  btree_t c = (btree_t)malloc(sizeof(_btree_t));
  c->x = x; c->y = y;
  node_t n = node((value_t)c, NODE_BTRE);
  return (n);
}

char p_syn_rp(char lp)
{
  switch (lp) {
    case ')': return '(';
    case ']': return '[';
    case '}': return '{';
    case '"': return '"';
    default: return '\0';
  }
}

node_t p_syn(char *p[], int *pos)
{
  char *t = p[*pos];
  *pos = *pos - 1;

  switch (t[0]) {
    case ')':
    case ']':
    case '}':
    case '"': {
      node_t r = NULL;
      while (p[*pos][0] != p_syn_rp(t[0])) {
        r = btree(p_syn(p, pos), r);
      }
      *pos = *pos - 1;
      return (r);
    }
    default:
      return (str_to_node(t));
  }
}

p_sample.c


#include <stdio.h>
#include <stdlib.h>

#include "p_lex.h"
#include "p_syn.h"

void _p_print(node_t p)
{
  if (p == NULL) {
    printf("]");
  } else if (n_str(p)) {
    printf("'%s'", node_to_str(p));
  } else {
    if (n_btr(bt_x(p))) printf("[");
    _p_print(bt_x(p));
    if (bt_y(p) != NULL) printf(", ");
    _p_print(bt_y(p));
  }
}
void p_print(node_t p)
{
  if (n_btr(p)) { printf("["); _p_print(p); }
  else printf("'%s'", node_to_str(p));
}

int main(void)
{
  char p[SSTR_MAX], c = getchar();
  int i;
  for (i = 0; c != '\n'; i++, c = getchar()) p[i] = c;
  p[i] = '\0';

  char *pl[SSTR_MAX];
  int len = p_lex(p, pl) - 1;
  node_t r = p_syn(pl, &len);

  p_print(r); printf("\n");

  return (0);
}
$ ls
p_lex.c  p_lex.h  p_sample.c  p_syn.c  p_syn.h
$ cc -Wall -o p_sample *.c
$ ./p_sample
{"a": [100, f], "b": {"a": [t, "c"], "b": "d"}}
[['a'], ':', ['100', ',', 'f'], ',', ['b'], ':', [['a'], ':', ['t', ',', ['c']], ',', ['b'], ':', ['d']]]

Ruby implementation example

Almost the same as the Python version.

p_lex.rb


def p_lex(p)
  for s in ['(',')','[',']','{','}','"',','] do
    p = p.gsub(s, ' ' + s + ' ')
  end
  return p.split
end

p_syn.rb


def p_syn_rp(lp)
  case lp
    when '(' then ')'
    when '[' then ']'
    when '{' then '}'
    when '"' then '"'
  end
end

def p_syn(p)
  t = p.shift
  if ['(','[','{','"'].include?(t) then
    r = []
    while p[0] != p_syn_rp(t) do
      r += [p_syn(p)]
    end
    p.shift
    return r
  else
    return t
  end
end
>> p_syn(p_lex(gets.chomp))
{"a": [100, f], "b": {"a": [t, "c"], "b": "d"}}
=> [["a"], ":", ["100", ",", "f"], ",", ["b"], ":", [["a"], ":", ["t", ",", ["c"]], ",", ["b"], ":", ["d"]]]

JavaScript implementation example

Almost the same as the Python version.

p_lex.js


function p_lex(p) {
  p = p.replace(/(\(|\)|\[|\]|\{|\}|\"|\,)/g, ' $1 ');
  return p.split(/\s+/).filter(x => x != '');
}

p_syn.js


p_syn_rp = {'(':')','{':'}','[':']','"':'"'};
function p_syn(p) {
  var t = p.shift();
  if (['(','{','[','"'].includes(t)) {
    var r = [];
    while (p[0] != p_syn_rp[t]) {
      r = r.concat([p_syn(p)]);
    }
    p.shift();
    return r;
  } else {
    return t;
  }
}
> p_syn(p_lex('{"a": [100, f], "b": {"a": [t, "c"], "b": "d"}}'))
[ [ 'a' ],
  ':',
  [ '100', ',', 'f' ],
  ',',
  [ 'b' ],
  ':',
  [ [ 'a' ], ':', [ 't', ',', [Array] ], ',', [ 'b' ], ':', [ 'd' ] ] ]

Remarks

Supplementary information about the article

Change log

Recommended Posts

"Parentheses character string" Simple parser implementation example summary
String summary 1
Simple implementation example of one kind of data augmentation
Simple string animation
Character range / character string range
Implementation example of simple LISP processing system (Python version)
Simple character device driver creation memo (Read / Write implementation)