-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathdata_loader.py
29 lines (24 loc) · 892 Bytes
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import json
def _load_data(filename):
    """Load newline-delimited JSON records from a file.

    Each non-blank line of the file is parsed as one JSON document.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file
        order. An empty (or whitespace-only) file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    reviews = []
    # JSON text is UTF-8 by convention; do not depend on the platform's
    # default encoding, which differs between systems.
    with open(filename, encoding='utf-8') as f:
        # Stream line by line so the raw file contents are never held in
        # memory as one string alongside the parsed records.
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (including a trailing one) carry no record.
                continue
            reviews.append(json.loads(line))
    return reviews
def _get_texts(reviews):
    """Collect the ``'text'`` entry of every review, preserving order.

    Args:
        reviews: Iterable of mappings, each providing a ``'text'`` key.

    Returns:
        A new list holding each review's ``'text'`` value.
    """
    collected = []
    for entry in reviews:
        collected.append(entry['text'])
    return collected
def get_data_and_labels(filepath, limit=100000):
    """Load reviews and return texts with binary labels, capped per class.

    Reviews are read via ``_load_data`` and labelled by their ``'stars'``
    value: 3 or fewer maps to label 0, anything higher maps to label 1.
    Reviews are then taken in file order, keeping at most ``limit`` of each
    label. If the file holds fewer than ``limit`` reviews of one label, the
    two classes can still end up with different sizes.

    Args:
        filepath: Path of the newline-delimited JSON review file.
        limit: Maximum number of reviews to keep for each label. Raise or
            lower this to grow or shrink the dataset.

    Returns:
        A 3-tuple ``(texts, labels, None)``: the kept review texts, their
        0/1 labels in matching order, and a third element that is always
        ``None``.
    """
    reviews = _load_data(filepath)
    texts = _get_texts(reviews)
    binstars = [0 if review['stars'] <= 3 else 1 for review in reviews]

    balanced_texts = []
    balanced_labels = []
    neg_pos_counts = [0, 0]  # kept so far: [label 0, label 1]
    for text, polarity in zip(texts, binstars):
        if neg_pos_counts[polarity] >= limit:
            continue
        balanced_texts.append(text)
        balanced_labels.append(polarity)
        neg_pos_counts[polarity] += 1
        # Once both classes are full nothing further can be added, so
        # there is no point scanning the remaining reviews.
        if min(neg_pos_counts) >= limit:
            break
    return balanced_texts, balanced_labels, None