# tests.py
import pandas as pd
from sklearn.cluster import KMeans
from preprocessing import features as fts
import os
from itertools import groupby
# Reference websites:
# https://datalastic.com/blog/ais-navigational-status/
# https://coast.noaa.gov/data/marinecadastre/ais/VesselTypeCodes2018.pdf
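# For reference, the codes used below (from the sources above): AIS navigational
# status 7 means "engaged in fishing" (used for the control label), and AIS
# vessel type 30 is a fishing vessel (used to filter the dataset).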
def posprocessing(x, min_points=1, verbose=False, labels2=True):
    """
    Applies a post-processing step to the clustering labels.

    Runs of at most ``min_points`` consecutive observations with the same
    label are relabeled with the label of the previous run, smoothing out
    short spurious segments. Optionally, the labels are first binarized so
    that lines become cluster 0 and curves become cluster 1.

    :param x: sequence of labels (numpy array)
    :param min_points: maximum run length that gets relabeled
    :param verbose: if True, prints progress for each run of labels
    :param labels2: if True, converts lines to cluster 0 and curves to cluster 1
    :return: the new list of labels
    """
    lbl = x.tolist()
    if labels2:
        # the majority cluster becomes 0, every other cluster becomes 1
        cl = max(set(lbl), key=lbl.count)
        x[x != cl] = -1
        x[x == cl] = 0
        lbl = list(abs(x))
    new_list = []
    c = 0
    for k, g in groupby(lbl):
        s = list(g)
        if verbose:
            print(f'{c} of {len(lbl)}')
        if len(s) <= min_points:
            # short run: repeat the previous label once per observation
            # (or 0 when the sequence starts with a short run)
            if len(new_list) == 0:
                new_list = new_list + [0] * len(s)
            else:
                new_list = new_list + [new_list[-1]] * len(s)
        else:
            new_list = new_list + s
        c = c + 1
    return new_list
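# A minimal usage sketch (hypothetical label sequence, not from the data below):
# an isolated observation from another cluster is absorbed into the surrounding
# run, and the majority cluster is mapped to 0 with everything else mapped to 1.
# import numpy as np
# posprocessing(np.array([0, 0, 3, 0, 0, 1, 1]))
# -> [0, 0, 0, 0, 0, 1, 1]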
from preprocessing.clean_trajectories import Trajectories
from datetime import datetime

print('Starting all experiments...')
n_samples = None
# Alternative setup: several vessel types over April 2020 only
# vessel_type = [30, 50, 60, 37]
# start_day = datetime(2020, 4, 1)
# end_day = datetime(2020, 4, 30)
# region_limits = [46, 51, -130, -122.5]
# Active setup: fishing vessels (type 30), April through September 2020
vessel_type = [30]
start_day = datetime(2020, 4, 1)
end_day = datetime(2020, 9, 30)
region_limits = [46, 51, -130, -122.5]  # [lat_min, lat_max, lon_min, lon_max]
### Creating dataset
# dataset = Trajectories(n_samples=n_samples, vessel_type=vessel_type, time_period=(start_day, end_day), region=region_limits)
print('Reading Dataset')
# data_file = f'./data/preprocessed/DCAIS_[30_ 1001_ 1002]_None-mmsi_region_[46_ 51_ -130_ -122.5]_01-04_to_30-06_trips.csv'
# data_file = f'./data/preprocessed/DCAIS_[30, 50, 60, 37]_None-mmsi_region_[46, 51, -130, -122.5]_01-04_to_30-04_trips.csv'
data_file = f'./data/preprocessed/DCAIS_[30]_None-mmsi_region_[46, 51, -130, -122.5]_01-04_to_30-06_trips.csv'
dataset = pd.read_csv(data_file, parse_dates=['time'], low_memory=False)
dataset['time'] = dataset['time'].astype('datetime64[ns]')
dataset = dataset.sort_values(by=['trajectory', "time"])
# Binary control label: AIS navigational status 7 ("engaged in fishing") vs. all other statuses
dataset['control'] = (dataset['status'] == 7).astype(int)
# seconds = 10*60  # window seconds
# features_path = f'./data/features_window_{seconds}.csv'
# if not os.path.exists(features_path):
#     # features_all = fts.get_all_features(dataset, n_dirs=n_dirs, win=window, eps=seconds)
#     features = fts.get_features_time(dataset, eps=seconds)
#     features.to_csv(features_path, index=False)
# else:
#     features = pd.read_csv(features_path)
seconds = 10*60  # time window in seconds
win = 10  # moving-window length (number of observations)
features_path = f'./data/features_all_window_{win}_time_{seconds}.csv'
if not os.path.exists(features_path):
    # features_all = fts.get_all_features(dataset, n_dirs=n_dirs, win=window, eps=seconds)
    features = fts.get_features(dataset, win=win)
    features.to_csv(features_path, index=False)
else:
    features = pd.read_csv(features_path)
# select features to run the clustering
# feature columns: 'sog', 'cog', 'rose',
# 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ([str(i) for i in range(n_dirs)])
# 'roc', 'acceleration', 'ma_t_sog', 'ms_t_sog', 'msum_t_sog', 'ma_t_cog', 'ms_t_cog', 'msum_t_cog',
# 'ma_t_acceleration', 'ms_t_acceleration', 'msum_t_acceleration', 'ma_t_roc', 'ms_t_roc', 'msum_t_roc',
# 'ma_sog', 'ms_sog', 'msum_sog', 'ma_cog', 'ms_cog', 'msum_cog',
# 'ma_acceleration', 'ms_acceleration', 'msum_acceleration', 'ma_roc', 'ms_roc', 'msum_roc'
# [str(i) for i in range(n_dirs)]
#%%
data_cl = features[['msum_t_acceleration', 'msum_t_roc']].copy()  # .copy() so the labels column can be added safely
print('Clustering')
# Clustering
nc = 10  # number of clusters
model = KMeans(n_clusters=nc).fit(data_cl)
labels = model.labels_
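# A possible variant (assumption, not part of the original pipeline): KMeans is
# distance-based, so if the two features live on very different scales it may
# be worth standardizing them before clustering, e.g.:
# from sklearn.preprocessing import StandardScaler
# labels = KMeans(n_clusters=nc).fit_predict(StandardScaler().fit_transform(data_cl))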
print('Post-processing')
# labels2 = posprocessing(labels)
data_cl['labels'] = labels
dataset['labels'] = labels
print('Computing metrics')
from sklearn.metrics import silhouette_score, normalized_mutual_info_score
comp_labels = normalized_mutual_info_score(dataset['control'], labels)
sl_data = silhouette_score(features[['msum_acceleration', 'msum_roc']], labels)
sl_control = silhouette_score(features[['msum_acceleration', 'msum_roc']], dataset['control'])
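# A possible extension (sketch, not part of the original run): sweep the number
# of clusters and compare silhouette scores before fixing nc = 10, e.g.:
# for k in range(2, 11):
#     km = KMeans(n_clusters=k).fit(features[['msum_t_acceleration', 'msum_t_roc']])
#     print(k, silhouette_score(features[['msum_t_acceleration', 'msum_t_roc']], km.labels_))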
# Saving
# index of a few trajectories to quickly evaluate
test = dataset[dataset['trajectory'].isin([213, 117, 145, 26, 11])]
test.to_csv(f'fishing_5_{nc}_{win}.csv', index=False)
dataset.to_csv(f'fishing_{nc}_{win}.csv', index=False)
# from sklearn.metrics import normalized_mutual_info_score, davies_bouldin_score, calinski_harabasz_score
# comp_labels = normalized_mutual_info_score(sample_fishing['control'], sample_fishing['labels_pos'])
# print(f'MI = {comp_labels}')
# sl_data = silhouette_score(test_cl[['ma_sog', 'msum_roc']], test['labels'])
# print(f'S data = {sl_data}')
# sl_control = silhouette_score(test_cl[['ma_sog', 'msum_roc']], test['control'])
# print(f'S control = {sl_control}')
# CH = calinski_harabasz_score(features[['ma_sog', 'msum_roc']], dataset['labels'])
# print(f'CH data = {CH}')
# CH = calinski_harabasz_score(features[['ma_sog', 'msum_roc']], dataset['control'])
# print(f'CH control = {CH}')
# CH = calinski_harabasz_score(features[['ma_sog', 'msum_roc']], dataset['labels_pos'])
# print(f'CH pos data = {CH}')
# prev = pd.read_csv(f'{folder}/fishing_7_{win}.csv')
# for nc in range(3, nc_size):
#     file_name = f'{folder}/fishing_{nc}_{win}.csv'
#     dataset = pd.read_csv(file_name)
#     MI = normalized_mutual_info_score(prev['labels_pos'], dataset['labels_pos'])
#     print(f'MI 14 and {nc} = {MI}')
# We store the clusters
# clus0 = data_cl.loc[data_cl['labels'] == 0]
# clus1 = data_cl.loc[data_cl['labels'] == 1]
# cluster_list = [clus0.values, clus1.values]
# print(base.dunn(cluster_list))
# plt.title(f'{nc}')
# plt.scatter(features['ma_sog'], features['msum_roc'], c=dataset['labels'])
# plt.show()