This repository has been archived by the owner on Dec 31, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 107
/
Copy pathtokenize.cpp
104 lines (98 loc) · 3.52 KB
/
tokenize.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#include <stdio.h>
#include <iostream>
#include <vector>
#include <map>
#include "util.h"
// Atom strings: each one is emitted as a token of its own, even when it
// occurs at the end of a longer run of symbol characters
const std::string atoms[] = {
    "#", "//",
    "(", ")", "[", "]", "{", "}"
};
// Number of entries in atoms, derived from the array so the two stay in sync
const int numAtoms = sizeof(atoms) / sizeof(atoms[0]);
// Classifies a single character as one of: ALPHANUM (digits, ASCII letters
// and the characters ~ _ $ @), SPACE (tab, space, newline, carriage return),
// BRACK (round, square or curly bracket), DQUOTE, SQUOTE, or SYMB for
// anything else
int chartype(char c) {
    const bool isDigit = (c >= '0' && c <= '9');
    const bool isLower = (c >= 'a' && c <= 'z');
    const bool isUpper = (c >= 'A' && c <= 'Z');
    if (isDigit || isLower || isUpper) {
        return ALPHANUM;
    }
    switch (c) {
        // Punctuation that is allowed inside identifiers
        case '~': case '_': case '$': case '@':
            return ALPHANUM;
        case '\t': case ' ': case '\n': case '\r':
            return SPACE;
        case '(': case ')': case '[': case ']': case '{': case '}':
            return BRACK;
        case '"':
            return DQUOTE;
        case '\'':
            return SQUOTE;
        default:
            return SYMB;
    }
}
// Splits source text into a flat list of token nodes, e.g.
// "y = f(45,124)/3" -> [ "y", "=", "f", "(", "45", ",", "124", ")", "/", "3"]
//
// inp      - text to tokenize (taken by value; a sentinel space is appended)
// metadata - position template copied into every token. metadata.ln is the
//            line number of the first line of inp; each token gets the line
//            and column (metadata.ch, 0-based within its line) at which it
//            starts.
// lispMode - when true, a single quote is treated as an ordinary
//            alphanumeric character instead of opening a quoted string
//
// Returns the tokens in source order. Whitespace is never emitted. A quoted
// string is emitted as one token including its quote characters and any
// backslash escapes, unmodified.
// NOTE: if a quote is never closed, the partial string is discarded when the
// end of the input is reached.
std::vector<Node> tokenize(std::string inp, Metadata metadata, bool lispMode) {
    int curtype = SPACE;
    unsigned pos = 0;
    // Index of the first character of the line that contains pos
    unsigned lineStart = 0;
    // Newlines consumed since metadata.ln was last brought up to date. They
    // are applied only when a new token starts, so that a token which is
    // still pending (or a string spanning several lines) keeps the line on
    // which it began.
    int pendingNewlines = 0;
    metadata.ch = 0;
    std::string cur;
    std::vector<Node> out;
    // Sentinel: guarantees the final token meets a boundary and is flushed
    inp += " ";
    while (pos < inp.length()) {
        int headtype = chartype(inp[pos]);
        if (lispMode) {
            if (inp[pos] == '\'') headtype = ALPHANUM;
        }
        // Are we inside a quote?
        if (curtype == SQUOTE || curtype == DQUOTE) {
            // Close quote
            if (headtype == curtype) {
                cur += inp[pos];
                out.push_back(token(cur, metadata));
                cur = "";
                curtype = SPACE;
                pos += 1;
                // Whatever follows starts after the closing quote
                metadata.ln += pendingNewlines;
                pendingNewlines = 0;
                metadata.ch = pos - lineStart;
            }
            // Backslash escape: copy the backslash and the escaped character
            // verbatim so an escaped quote does not close the string
            else if (inp.length() >= pos + 2 && inp[pos] == '\\') {
                cur += inp[pos];
                cur += inp[pos + 1];
                if (inp[pos + 1] == '\n') {
                    lineStart = pos + 2;
                    pendingNewlines += 1;
                }
                pos += 2;
            }
            else {
                cur += inp[pos];
                if (inp[pos] == '\n') {
                    lineStart = pos + 1;
                    pendingNewlines += 1;
                }
                pos += 1;
            }
        }
        else {
            // Handle atoms ( '//', '#', brackets ): if the text collected so
            // far ends with an atom, emit whatever precedes it and then the
            // atom itself
            for (int i = 0; i < numAtoms; i++) {
                int split = static_cast<int>(cur.length())
                            - static_cast<int>(atoms[i].length());
                if (split >= 0 && cur.substr(split) == atoms[i]) {
                    if (split > 0) {
                        out.push_back(token(cur.substr(0, split), metadata));
                    }
                    metadata.ch += split;
                    out.push_back(token(cur.substr(split), metadata));
                    // cur ended at pos - 1 and cannot contain a newline, so
                    // only the column needs updating here
                    metadata.ch = pos - lineStart;
                    cur = "";
                    curtype = SPACE;
                }
            }
            // Special case the minus sign (and '!'): a trailing '-' or '!'
            // is split off a longer symbol run into its own token
            if (cur.length() > 1 && (cur.substr(cur.length() - 1) == "-"
                                     || cur.substr(cur.length() - 1) == "!")) {
                const std::string::size_type last = cur.length() - 1;
                out.push_back(token(cur.substr(0, last), metadata));
                // The split-off character sits 'last' columns after the start
                metadata.ch += static_cast<int>(last);
                out.push_back(token(cur.substr(last), metadata));
                // Any continuation of the symbol run starts at pos
                metadata.ch = pos - lineStart;
                cur = "";
            }
            // Boundary between different char types
            if (headtype != curtype) {
                if (curtype != SPACE && cur != "") {
                    out.push_back(token(cur, metadata));
                }
                // A new token (or whitespace run) starts at pos
                metadata.ln += pendingNewlines;
                pendingNewlines = 0;
                metadata.ch = pos - lineStart;
                cur = "";
            }
            cur += inp[pos];
            curtype = headtype;
            // Account for the newline only now that it has been consumed, so
            // the token flushed just above kept its own line and column
            if (inp[pos] == '\n') {
                lineStart = pos + 1;
                pendingNewlines += 1;
            }
            pos += 1;
        }
    }
    return out;
}