-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindexCorp.py
163 lines (140 loc) · 4.73 KB
/
indexCorp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# Input corpus; when this module is run as a script it is read line by line,
# each line being parsed as one JSON document.
RAW_CORPUS = 'data/dcard.jsonl'
# Output: one JSON line per text holding the un-indexed 'text' value
# (the script's own comment says it is used "for finding kwic").
LITE_CORPUS = 'data/dcard_lite.jsonl'
# Output: SQLite database file; an existing file is deleted and rebuilt
# every time the script runs.
INDEXED_CORPUS = 'data/dcard.sqlite'
import copy
def revDict(dict_):
    """Return a new dict that maps every value of ``dict_`` back to its key.

    When several keys share the same value, the key that comes last in
    iteration order is the one kept.
    """
    return {value: key for key, value in dict_.items()}
class IndexedCorp():
    """Corpus in which every (word, tag) pair is replaced by integer ids.

    Attributes
    ----------
    corpus : list
        Same layout as the input, but each ``(word, tag)`` pair is replaced
        by a ``(word_id, tag_id)`` tuple.
    corpus_lite : list
        The un-indexed texts (the ``'text'`` values for a dict corpus, the
        texts themselves for a list corpus).
    tokens, tags : set
        Distinct words / tags found in the corpus.
    wd, td : dict
        word -> id and tag -> id.
    rev_wd, rev_td : dict
        id -> word and id -> tag.
    """

    def __init__(self, corp):
        """Create an indexed corpus object.

        Parameters
        ----------
        corp : list
            Either a list of texts, or a list of dicts that hold a text
            under the ``'text'`` key.  A text is a list of sentences and a
            sentence is a list of ``(word, tag)`` pairs.  The layout of the
            first entry decides how the whole corpus is treated.  An empty
            corpus yields an empty index.

        Raises
        ------
        Exception
            If a dict entry has no ``'text'`` key, or if the first entry is
            neither a list nor a dict.
        """
        # Ids are handed out in first-seen order (dicts keep insertion
        # order), so the same corpus always gets the same ids.  Enumerating
        # a set would make string ids depend on per-process hash
        # randomisation.
        wd = {}
        td = {}
        for text in corp:
            if isinstance(text, dict):
                try:
                    text = text['text']
                except KeyError as err:
                    raise Exception("corpus structure not expected.") from err
            for sent in text:
                for word, tag in sent:
                    wd.setdefault(word, len(wd))
                    td.setdefault(tag, len(td))

        def index_text(text):
            # Replace every (word, tag) pair of one text by its id pair.
            return [[(wd[w], td[t]) for w, t in sent] for sent in text]

        # Initialize
        if not corp:
            self.corpus = []
            self.corpus_lite = []
        elif isinstance(corp[0], list):
            self.corpus = [index_text(text) for text in corp]
            self.corpus_lite = list(corp)
        elif isinstance(corp[0], dict):
            # Only the non-'text' fields are deep-copied; the text itself is
            # rebuilt from ids, so copying it first would be wasted work.
            self.corpus = [
                {
                    key: (index_text(value) if key == 'text'
                          else copy.deepcopy(value))
                    for key, value in entry.items()
                }
                for entry in corp
            ]
            self.corpus_lite = [entry['text'] for entry in corp]
        else:
            raise Exception("corpus structure not expected.")

        self.tokens = set(wd)
        self.tags = set(td)
        self.wd = wd
        self.td = td
        self.rev_wd = {idx: word for word, idx in wd.items()}
        self.rev_td = {idx: tag for tag, idx in td.items()}
if __name__ == "__main__":
    # Script entry point: read RAW_CORPUS, index it with IndexedCorp, write
    # the un-indexed texts to LITE_CORPUS and build the SQLite database
    # INDEXED_CORPUS with the tables token, pos and oneGram.
    import json
    import os
    import sqlite3
    from contextlib import closing

    # Clean up: the database is always rebuilt from scratch, otherwise the
    # CREATE TABLE statements further down would fail on an existing file.
    if os.path.isfile(INDEXED_CORPUS):
        os.unlink(INDEXED_CORPUS)

    # Load original corpus (one JSON document per line; blank lines are
    # skipped).  The encoding is explicit so the result does not depend on
    # the platform default.
    with open(RAW_CORPUS, encoding="utf-8") as f:
        corp = [json.loads(line) for line in f if line.strip()]

    # Index Corpus
    corp = IndexedCorp(corp)

    # Save lite corpus (for finding kwic).  ensure_ascii=False emits raw
    # non-ASCII characters, hence the explicit UTF-8 encoding.
    with open(LITE_CORPUS, "w", encoding="utf-8") as f:
        for text in corp.corpus_lite:
            json.dump(text, f, ensure_ascii=False)
            f.write('\n')

    # Initiate sqlite DB.  closing() guarantees the connection is closed
    # even when one of the statements below raises.
    with closing(sqlite3.connect(INDEXED_CORPUS)) as conn:
        c = conn.cursor()

        # Create Table: token
        c.execute("""
            CREATE TABLE token(
                token_id INTEGER PRIMARY KEY,
                token varchar(128) NOT NULL
            )
        """)

        # Create Table: pos
        c.execute("""
            CREATE TABLE pos(
                pos_id INTEGER PRIMARY KEY,
                pos varchar(32) NOT NULL
            )
        """)

        # Add data to Table: token
        c.executemany(
            "INSERT INTO token (token_id, token) VALUES (?,?)",
            [(token_id, token) for token, token_id in corp.wd.items()],
        )

        # Add data to Table: pos
        c.executemany(
            "INSERT INTO pos (pos_id, pos) VALUES (?,?)",
            [(pos_id, pos) for pos, pos_id in corp.td.items()],
        )

        # Index token
        c.execute("""
            CREATE UNIQUE INDEX idx_token
            ON token (token, token_id); """)

        # Index pos
        c.execute("""
            CREATE UNIQUE INDEX idx_pos
            ON pos (pos, pos_id); """)
        conn.commit()

        # Create Table: oneGram (one row per token occurrence)
        c.execute("""
            CREATE TABLE oneGram(
                text_id INTEGER NOT NULL,
                sent_id INTEGER NOT NULL,
                position INTEGER NOT NULL,
                gender INTEGER NOT NULL,
                token_id INTEGER NOT NULL,
                pos_id INTEGER NOT NULL,
                FOREIGN KEY (token_id) REFERENCES token(token_id),
                FOREIGN KEY (pos_id) REFERENCES pos(pos_id)
            )""")

        # Add data to Table: oneGram.  The rows are streamed through a
        # generator so the full occurrence list is never held in memory.
        one_gram_rows = (
            (text_id, sent_id, position, text['gender'], token_id, pos_id)
            for text_id, text in enumerate(corp.corpus)
            for sent_id, sent in enumerate(text['text'])
            for position, (token_id, pos_id) in enumerate(sent)
        )
        c.executemany(
            '''INSERT INTO oneGram (text_id, sent_id, position, gender, token_id, pos_id)
            VALUES (?,?,?,?,?,?)''',
            one_gram_rows,
        )
        conn.commit()

        # Index oneGram: two indexes over the same columns, differing only
        # in whether token_id or pos_id comes right after gender.
        c.execute("""
            CREATE INDEX idx_gender_token_pos
            ON oneGram (gender, token_id, pos_id, text_id, sent_id, position);
        """)
        c.execute("""
            CREATE INDEX idx_gender_pos_token
            ON oneGram (gender, pos_id, token_id, text_id, sent_id, position);
        """)
        conn.commit()