-
Notifications
You must be signed in to change notification settings - Fork 18
/
reorg_dog_data.py
63 lines (53 loc) · 2.48 KB
/
reorg_dog_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import math
import os
import shutil
from collections import Counter
data_dir = './data'
label_file = 'labels.csv'
train_dir = 'train'
test_dir = 'test'
input_dir = 'train_valid_test'
batch_size = 128
valid_ratio = 0.1
def reorg_dog_data(data_dir, label_file, train_dir, test_dir, input_dir,
valid_ratio):
# 读取训练数据标签。
with open(os.path.join(data_dir, label_file), 'r') as f:
# 跳过文件头行(栏名称)。
lines = f.readlines()[1:]
tokens = [l.rstrip().split(',') for l in lines]
idx_label = dict(((idx, label) for idx, label in tokens))
labels = set(idx_label.values())
num_train = len(os.listdir(os.path.join(data_dir, train_dir)))
# 训练集中数量最少一类的狗的数量。
min_num_train_per_label = (
Counter(idx_label.values()).most_common()[:-2:-1][0][1])
# 验证集中每类狗的数量。
num_valid_per_label = math.floor(min_num_train_per_label * valid_ratio)
label_count = dict()
def mkdir_if_not_exist(path):
if not os.path.exists(os.path.join(*path)):
os.makedirs(os.path.join(*path))
# 整理训练和验证集。
for train_file in os.listdir(os.path.join(data_dir, train_dir)):
idx = train_file.split('.')[0]
label = idx_label[idx]
mkdir_if_not_exist([data_dir, input_dir, 'train_valid', label])
shutil.copy(os.path.join(data_dir, train_dir, train_file),
os.path.join(data_dir, input_dir, 'train_valid', label))
if label not in label_count or label_count[label] < num_valid_per_label:
mkdir_if_not_exist([data_dir, input_dir, 'valid', label])
shutil.copy(os.path.join(data_dir, train_dir, train_file),
os.path.join(data_dir, input_dir, 'valid', label))
label_count[label] = label_count.get(label, 0) + 1
else:
mkdir_if_not_exist([data_dir, input_dir, 'train', label])
shutil.copy(os.path.join(data_dir, train_dir, train_file),
os.path.join(data_dir, input_dir, 'train', label))
# 整理测试集。
mkdir_if_not_exist([data_dir, input_dir, 'test', 'unknown'])
for test_file in os.listdir(os.path.join(data_dir, test_dir)):
shutil.copy(os.path.join(data_dir, test_dir, test_file),
os.path.join(data_dir, input_dir, 'test', 'unknown'))
reorg_dog_data(data_dir, label_file, train_dir, test_dir, input_dir,
valid_ratio)