This repository has been archived by the owner on Mar 4, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 73
/
Copy pathfeaturesDump.py
87 lines (69 loc) · 1.97 KB
/
featuresDump.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
##################################################################
# Use the features.py module to dump out features
# Reads a CSV of sentences and bulk-dumps their features to an output CSV (default: ./analysis/featuresDump.csv)
##################################################################
#Input CSV fmt: a header row, then rows whose 1st field is the sentence text to process and 2nd field is its class; the sentence ID is generated from an MD5 hash of the text
import csv
import sys
import hashlib
import features # features.py is bespoke util to extract NLTK POS features from sentences
# Input and output paths come from the first two command-line arguments and
# fall back to the defaults under ./analysis when an argument is omitted.
FNAME = sys.argv[1] if len(sys.argv) > 1 else './analysis/sentences.csv'
print("reading input from ", FNAME)

FOUT = sys.argv[2] if len(sys.argv) > 2 else './analysis/featuresDump.csv'
print("Writing output to ", FOUT)

# Columns written to the output file, in this order. Every name is looked up
# in the dict returned by features.features_dict() for each sentence.
keys = [
    "id",
    "wordCount",
    "stemmedCount",
    "stemmedEndNN",
    "CD",
    "NN",
    "NNP",
    "NNPS",
    "NNS",
    "PRP",
    "VBG",
    "VBZ",
    "startTuple0",
    "endTuple0",
    "endTuple1",
    "endTuple2",
    "verbBeforeNoun",
    "qMark",
    "qVerbCombo",
    "qTripleScore",
    "sTripleScore",
    "class",
]


def _format_row(values):
    """Return *values* as one output line, without the trailing newline.

    Items are converted with str() and separated by ", ". The line starts
    with a single space: earlier versions of this script produced that
    leading space, so it is kept to leave the output format unchanged for
    anything that already parses these files.
    """
    return " " + ", ".join(str(value) for value in values)


# The context manager closes both files even if feature extraction raises
# part-way through the input.
with open(FNAME, 'rt') as fin, open(FOUT, 'wt', newline='') as fout:
    reader = csv.reader(fin)

    # Assume the first row is a header and discard it. The default argument
    # keeps an empty input file from raising StopIteration.
    next(reader, None)

    # The header depends only on `keys`, so it is built and written once,
    # before any data row. An input with no data rows therefore produces a
    # header-only output file.
    header = _format_row(keys)
    print(header)
    fout.write(header + '\n')

    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; skip it.
            continue

        sentence = row[0]
        label = row[1]  # class label

        # The ID is the first 16 hex digits of the MD5 of the sentence text,
        # so identical sentences get identical IDs.
        sentence_id = hashlib.md5(sentence.encode('utf-8')).hexdigest()[:16]

        f = features.features_dict(sentence_id, sentence, label)

        # Echo each line to stdout as well as writing it to the output file.
        output = _format_row(f[key] for key in keys)
        print(output)
        fout.write(output + '\n')