Apologies in advance: for the time being I am leaving the text below in a rough state that does not yet read well as an article ...
To parse strings, we create two classes, Utf32Iterator and IteratorInput. Surrogate pairs are taken into account as well.
StringParser.java
public class StringParser {
//Iterator that walks a string as Unicode code points, so a surrogate pair is returned as one element.
//After the last code point it yields exactly one extra null element that marks EOF;
//only after that null has been handed out does hasNext() become false.
public static class Utf32Iterator implements Iterator<Integer> {
    private final String source;
    //UTF-16 index of the next code point to read; source.length() + 1 once the EOF marker has been returned.
    private int position;

    public Utf32Iterator(String source_) {
        source = source_;
        position = 0;
    }

    //True while a code point or the trailing EOF marker is still pending. Has no side effects.
    @Override public boolean hasNext() {
        return position <= source.length(); // position == source.length() is the EOF slot
    }

    //Returns the next code point and advances past it, or null exactly once at EOF.
    //Every call advances, whether or not hasNext() was called in between.
    //Throws NoSuchElementException once the EOF marker has already been consumed.
    @Override public Integer next() {
        if (position > source.length()) {
            throw new java.util.NoSuchElementException("read past the EOF marker");
        }
        if (position == source.length()) { //Exactly at the end of the string: hand out the EOF marker.
            position++;
            return null;
        }
        int codePoint = source.codePointAt(position);
        position += Character.charCount(codePoint); //1 char for BMP, 2 for a surrogate pair
        return codePoint;
    }
}
//Input implementation that hands the elements of an Iterator to a Parser.
//Each instance stands for one position: it pulls its element from the shared iterator when it is
//constructed, and creates the instance for the following position on demand.
public static class IteratorInput<T> implements Input<T> {
    private final Iterator<T> iterator;
    private final int position;
    private final T current;
    //Cached successor: next() may be requested several times for the same position because of "or".
    private IteratorInput<T> next = null;

    //Starts reading at position 0.
    public IteratorInput(Iterator<T> iterator_) {
        this(iterator_, 0);
    }

    //Reads one element from the iterator (null if it has none left) and labels it with position_.
    public IteratorInput(Iterator<T> iterator_, int position_) {
        iterator = iterator_;
        position = position_;
        if (iterator.hasNext()) {
            current = iterator.next();
        } else {
            current = null;
        }
    }

    @Override public T current() {
        return current;
    }

    @Override public String positionDescription() {
        return String.valueOf(position);
    }

    //Returns the input for the following position, creating it on first request.
    //Throws EndOfInputException when the iterator has nothing more to give.
    @Override public Input<T> next() throws EndOfInputException {
        if (next == null) {
            if (!iterator.hasNext()) {
                throw new EndOfInputException();
            }
            next = new IteratorInput<T>(iterator, position + 1);
        }
        return next;
    }
}
//Turn a parser whose result is a List of Unicode code points into a parser whose result is a String.
//The code points are appended to a StringBuilder via reduce, and apply converts the builder to a String.
public static Parser<Integer, String> concat(Parser<Integer, List<Integer>> parser) {
    return apply(
        reduce(
            parser,
            () -> new StringBuilder(),
            (builder, codePoint) -> builder.appendCodePoint(codePoint)),
        builder -> builder.toString());
}
//Turn a parser whose result is a List of Strings into a parser whose result is one String.
//The pieces are appended to a StringBuilder via reduce, and apply converts the builder to a String.
public static Parser<Integer, String> concatStr(Parser<Integer, List<String>> parser) {
    return apply(
        reduce(
            parser,
            () -> new StringBuilder(),
            (builder, piece) -> builder.append(piece)),
        builder -> builder.toString());
}
//Parser that accepts a single code point, provided that code point occurs somewhere in str
public static Parser<Integer, Integer> consistsOf(String str) {
    return satisfy(codePoint -> str.indexOf(codePoint) != -1);
}
//A parser that consumes exactly the string str and yields it as its result.
//One single-code-point matcher is built per Unicode code point of str. Iterating by code point
//(not by UTF-16 char) matters because the parser input is a stream of code points: a supplementary
//character arrives as one element, so matching its two surrogate halves separately could never succeed.
public static Parser<Integer, String> word(String str) {
    List<Parser<Integer, Integer>> result = new ArrayList<>();
    for (int expected : str.codePoints().toArray()) {
        result.add(satisfy(actual -> actual == expected));
    }
    return concat(lst(result));
}
//Build a String from an array of Unicode code points (the whole array is used)
public static String codePointToString(int[] codePoint) {
    final int count = codePoint.length;
    return new String(codePoint, 0, count);
}
//Build a String that holds a single Unicode code point
public static String codePointToString(int codePoint) {
    int[] single = {codePoint};
    return codePointToString(single);
}
//A parser that consumes input until the same string as str appears.
//It is defined recursively through a ParserMemoizer: either word(str) matches right here, or any
//single code point is taken and the same parser is tried again on what follows. The result is
//every code point taken along the way followed by the result of word(str).
public static Parser<Integer, String> until(String str) {
    ParserMemoizer<Integer, String> result = new ParserMemoizer<>();
    result.defun(() -> {
        Parser<Integer, String> terminator = word(str);
        Parser<Integer, Integer> anyCodePoint = satisfy(codePoint -> true); //accepts any single element
        return or(
            terminator,
            apply(
                seq(anyCodePoint, result),
                pair -> codePointToString(pair.car) + pair.cdr.car));
    });
    return result;
}
}
Recommended Posts