-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathDataset.py
59 lines (44 loc) · 1.61 KB
/
Dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import random
import string
def store_positive_comments_from_file(path="./dataset/rt-polarity.pos"):
    """Read the positive comments file and return its lines as a list.

    Args:
        path: Location of the file to read. Defaults to the positive
            polarity file under ``./dataset``.

    Returns:
        A list with one string per line of the file, in file order. Line
        terminators are kept on each entry.
    """
    # The context manager guarantees the handle is closed even if reading
    # raises; the file is opened in text mode with the platform default
    # encoding.
    with open(path, "r") as file:
        return list(file)
def store_negative_comments_from_file(path="./dataset/rt-polarity.neg"):
    """Read the negative comments file and return its lines as a list.

    Args:
        path: Location of the file to read. Defaults to the negative
            polarity file under ``./dataset``.

    Returns:
        A list with one string per line of the file, in file order. Line
        terminators are kept on each entry.
    """
    # The context manager guarantees the handle is closed even if reading
    # raises; the file is opened in text mode with the platform default
    # encoding.
    with open(path, "r") as file:
        return list(file)
def get_positive_train_test_set(test_ratio=0.1):
    """Split the positive comments into a training set and a random test set.

    Args:
        test_ratio: Fraction of the comments to move into the test set,
            between 0 and 1 inclusive. Defaults to 0.1.

    Returns:
        A ``(train_set, test_set)`` tuple of lists. ``test_set`` holds the
        randomly drawn comments in draw order; ``train_set`` holds the
        remaining comments in their original file order.

    Raises:
        ValueError: If ``test_ratio`` is outside the range [0, 1].
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError("test_ratio must be between 0 and 1")

    train_set = store_positive_comments_from_file()
    test_set_len = int(len(train_set) * test_ratio)

    # Each draw removes one comment at a uniformly random index, so the two
    # sets never overlap. One randint call per draw keeps the split
    # reproducible when the caller seeds the random module.
    test_set = [
        train_set.pop(random.randint(0, len(train_set) - 1))
        for _ in range(test_set_len)
    ]
    return train_set, test_set
def get_negative_train_test_set(test_ratio=0.1):
    """Split the negative comments into a training set and a random test set.

    Args:
        test_ratio: Fraction of the comments to move into the test set,
            between 0 and 1 inclusive. Defaults to 0.1.

    Returns:
        A ``(train_set, test_set)`` tuple of lists. ``test_set`` holds the
        randomly drawn comments in draw order; ``train_set`` holds the
        remaining comments in their original file order.

    Raises:
        ValueError: If ``test_ratio`` is outside the range [0, 1].
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError("test_ratio must be between 0 and 1")

    train_set = store_negative_comments_from_file()
    test_set_len = int(len(train_set) * test_ratio)

    # Each draw removes one comment at a uniformly random index, so the two
    # sets never overlap. One randint call per draw keeps the split
    # reproducible when the caller seeds the random module.
    test_set = [
        train_set.pop(random.randint(0, len(train_set) - 1))
        for _ in range(test_set_len)
    ]
    return train_set, test_set
def pre_process(set):
    """Replace punctuation with spaces and lowercase every sentence in place.

    Each character found in ``string.punctuation`` is replaced by a single
    space (so the sentence length is unchanged), then the sentence is
    lowercased.

    Args:
        set: Mutable sequence of strings. It is modified in place; the name
            shadows the builtin but is kept for caller compatibility.

    Returns:
        None. The result is written back into ``set``.
    """
    # Build the punctuation -> space table once; translate() then handles a
    # whole sentence in one pass instead of one replace() scan per character.
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    for index, sentence in enumerate(set):
        set[index] = sentence.translate(punctuation_to_space).lower()