// tokenize.cpp

#include <stdio.h>
#include <iostream>
#include <vector>
#include <map>
#include "util.h"

// Strings that are always emitted as stand-alone tokens, even when they occur
// at the end of a longer run of symbol characters.
const std::string atoms[] = {
    "#", "//",
    "(", ")",
    "[", "]",
    "{", "}"
};
// Number of entries in `atoms`, derived from the array so the two stay in sync
const int numAtoms = sizeof(atoms) / sizeof(atoms[0]);

// Classifies one character for the tokenizer:
//   ALPHANUM - ASCII letters, digits and the characters ~ _ $ @
//   SPACE    - tab, space, newline, carriage return
//   BRACK    - ( ) [ ] { }
//   DQUOTE   - double quote
//   SQUOTE   - single quote
//   SYMB     - anything else
int chartype(char c) {
    const bool isDigit = ('0' <= c && c <= '9');
    const bool isLetter = ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
    if (isDigit || isLetter) return ALPHANUM;

    switch (c) {
        case '~': case '_': case '$': case '@':
            return ALPHANUM;
        case '\t': case ' ': case '\n': case '\r':
            return SPACE;
        case '(': case ')': case '[': case ']': case '{': case '}':
            return BRACK;
        case '"':
            return DQUOTE;
        case '\'':
            return SQUOTE;
        default:
            return SYMB;
    }
}

// "y = f(45,124)/3" -> [ "y", "f", "(", "45", ",", "124", ")", "/", "3"]
std::vector<Node> tokenize(std::string inp, Metadata metadata, bool lispMode) {
    int curtype = SPACE;
	unsigned pos = 0;
    int lastNewline = 0;
    metadata.ch = 0;
    std::string cur;
    std::vector<Node> out;

    inp += " ";
    while (pos < inp.length()) {
        int headtype = chartype(inp[pos]);
        if (lispMode) {
            if (inp[pos] == '\'') headtype = ALPHANUM;
        }
        // Are we inside a quote?
        if (curtype == SQUOTE || curtype == DQUOTE) {
            // Close quote
            if (headtype == curtype) {
                cur += inp[pos];
                out.push_back(token(cur, metadata));
                cur = "";
                metadata.ch = pos - lastNewline;
                curtype = SPACE;
                pos += 1;
            }
            // Backslash escape
            else if (inp.length() >= pos + 2 && inp[pos] == '\\') {
                cur += inp[pos];
                cur += inp[pos + 1];
                pos += 2;
            }
            else {
                cur += inp[pos];
                pos += 1;
            }
        }
        else {
            // Handle atoms ( '//', '#',  brackets )
            for (int i = 0; i < numAtoms; i++) {
                int split = cur.length() - atoms[i].length();
                if (split >= 0 && cur.substr(split) == atoms[i]) {
                    if (split > 0) {
                        out.push_back(token(cur.substr(0, split), metadata));
                    }
                    metadata.ch += split;
                    out.push_back(token(cur.substr(split), metadata));
                    metadata.ch = pos - lastNewline;
                    cur = "";
                    curtype = SPACE;
                }
            }
            // Special case the minus sign
            if (cur.length() > 1 && (cur.substr(cur.length() - 1) == "-"
                                  || cur.substr(cur.length() - 1) == "!")) {
                out.push_back(token(cur.substr(0, cur.length() - 1), metadata));
                out.push_back(token(cur.substr(cur.length() - 1), metadata));
                cur = "";
            }
            // Boundary between different char types
            if (headtype != curtype) {
                if (curtype != SPACE && cur != "") {
                    out.push_back(token(cur, metadata));
                }
                metadata.ch = pos - lastNewline;
                cur = "";
            }
            cur += inp[pos];
            curtype = headtype;
            pos += 1;
        }
        if (inp[pos] == '\n') {
            lastNewline = pos;
            metadata.ch = 0;
            metadata.ln += 1;
        }
    }
    return out;
}