-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrainClassifier.m
144 lines (113 loc) · 4.47 KB
/
trainClassifier.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
function trainClassifier(subchallenge,dataset,modality,can_override)
% trainClassifier  Run classifier using a precomputed HCTSA MAT-file.
%
%   trainClassifier(subchallenge,dataset,modality,can_override)
%
% Inputs (all optional; omit or pass [] to use the default):
%   subchallenge  - 'on_off' (default), 'dyskinesia' or 'tremor'.
%   dataset       - 'CIS-PD' (default) or 'REAL-PD'.
%   modality      - '' (default), '-smartphone_accelerometer',
%                   '-smartwatch_accelerometer' or '-smartwatch_gyroscope'.
%   can_override  - true (default): ask on the command line before reusing
%                   the name of an existing classifier file;
%                   false: print a notice and exit if the file exists.
%
% Files:
%   reads   ./data/hctsa/<dataset>/<dataset><modality>_HCTSA.mat
%   copies  it to ./data/hctsa/<dataset>/<dataset><modality>-<subchallenge>_HCTSA.mat
%   passes  ./classifiers/<dataset><modality>-<subchallenge>_classifier.mat
%           to TS_classify / TS_TopFeatures as 'classifierFilename'.
%
% Requires the HCTSA toolbox on the path (checked via TS_compute).

%% Configure
subchallenges = {'on_off','dyskinesia','tremor'};
datasets = {'CIS-PD','REAL-PD'};
modalities = {'',...
              '-smartphone_accelerometer',...
              '-smartwatch_accelerometer',...
              '-smartwatch_gyroscope'};

if nargin < 1 || isempty(subchallenge)
    subchallenge = 'on_off';
end
if nargin < 2 || isempty(dataset)
    dataset = 'CIS-PD';
end
if nargin < 3 || isempty(modality)
    modality = '';
end
if nargin < 4 || isempty(can_override)
    can_override = true;
end

% Exact matching: a substring test (contains) would accept e.g. 'on' or
% 'smartwatch' and then build paths to files that do not exist.
assert(any(strcmp(subchallenge,subchallenges)),...
    'trainClassifier:invalidSubchallenge',...
    'Unknown subchallenge ''%s''.',subchallenge);
assert(any(strcmp(dataset,datasets)),...
    'trainClassifier:invalidDataset',...
    'Unknown dataset ''%s''.',dataset);
assert(any(strcmp(modality,modalities)),...
    'trainClassifier:invalidModality',...
    'Unknown modality ''%s''.',modality);

% Check for the toolbox before touching the file system, so a missing
% HCTSA installation does not leave a stray copied MAT-file behind.
if ~exist('TS_compute','file')
    fprintf('Run startup.m from HCTSA directory.\n');
    return;
end

%% Setup
classifier_prefix = ['./classifiers/' dataset modality '-' subchallenge '_'];
data_subdir = ['./data/hctsa/' dataset '/'];
input_prefix = [data_subdir dataset modality '_'];
prefix = [data_subdir dataset modality '-' subchallenge '_'];
classifier_filename = [classifier_prefix 'classifier.mat'];

if exist(classifier_filename,'file')
    fprintf('File %s already exists',classifier_filename);
    if ~can_override
        fprintf('. Parameter set to no override allowed. Exiting.\n');
        return;
    end
    reply = input(' -- override? [y/n] ','s');
    % Compare as whole strings: `reply ~= 'y'` is an element-wise test whose
    % result is empty (i.e. false) for an empty reply, which used to be
    % treated as consent to override.
    if ~any(strcmpi(strtrim(reply),{'y','yes'}))
        return;
    end
end

source_matfile = [input_prefix 'HCTSA.mat'];
matfile = [prefix 'HCTSA.mat'];
success = copyfile(source_matfile, matfile);
if ~success
    error('Copying %s to %s failed. Check directory structure.\n',...
        source_matfile, matfile);
end

use_na = false; % whether to add the '<subchallenge>:NA' label as a group

%% Label all time series by whatever the option set is
x = load(matfile);
keywords = x.TimeSeries.Keywords;

% Candidate labels '<subchallenge>:0' ... '<subchallenge>:4'; keep only those
% that occur in at least one time series' keywords.
scores = 0:4;
groups = cell(numel(scores),1);
foundGps = false(numel(scores),1);
for i = 1:numel(scores)
    groups{i} = sprintf('%s:%i',subchallenge,scores(i));
    foundGps(i) = any(contains(keywords,groups{i}));
end
groups = groups(foundGps);
if use_na
    % Appended after filtering so the presence mask and the group list
    % always have matching lengths.
    groups{end+1} = [subchallenge ':NA'];
end
if isempty(groups)
    error('trainClassifier:noLabels',...
        'No ''%s:<0-4>'' keywords found in %s; nothing to classify.',...
        subchallenge,matfile);
end

%% For classification
TS_LabelGroups(matfile,groups,true,true);

% Set how to normalize the data:
whatNormalization = 'zscore'; % 'zscore', 'scaledRobustSigmoid'

% Prefer the '_filtered' variant of the MAT-file if one exists on disk
filtered_matfile = [matfile(1:end-4) '_filtered.mat'];
if exist(filtered_matfile,'file')
    matfile = filtered_matfile;
end

% Normalize the data, filtering out features with any special values:
TS_normalize(whatNormalization,[0.1,1],matfile,true);

% Switch to the '_N' (normalized) file if it exists
normalized_matfile = [matfile(1:end-4) '_N.mat'];
if exist(normalized_matfile,'file')
    matfile = normalized_matfile;
end

% Load normalized data in a structure:
normalizedData = load(matfile);

%-------------------------------------------------------------------------------
%% How accurately can we classify the states:
whatClassifier = 'svm_linear';
TS_classify(normalizedData,whatClassifier,'numPCs',0,'numNulls',0,...
    'classifierFilename',classifier_filename,'numFolds',2);

% %-------------------------------------------------------------------------------
% %% Generate a low-dimensional feature-based representation of the dataset:
%
% numAnnotate = 6; % number of time series to annotate to the plot
% whatAlgorithm = 'pca';
% userSelects = false; % whether the user can click on time series to manually annotate
% timeSeriesLength = 300; % length of time-series segments to annotate
%
% annotateParams = struct('n',numAnnotate,'textAnnotation','none',...
% 'userInput',userSelects,'maxL',timeSeriesLength);
%
% TS_PlotLowDim(normalizedData,whatAlgorithm,true,'',annotateParams);

%-------------------------------------------------------------------------------
%% What individual features best discriminate the phenotypes
% Uses a linear classification accuracy between classes
% Produces 1) a pairwise correlation plot between the top features
%          2) class distributions of the top features, with their stats
%          3) a histogram of the accuracy of all features
numFeatures = 40; % number of features to include in the pairwise correlation plot
numFeaturesDistr = 32; % number of features to show class distributions for
whatStatistic = 'fast_linear'; % classification statistic
TS_TopFeatures(normalizedData,whatStatistic,'numFeatures',numFeatures,...
    'numFeaturesDistr',numFeaturesDistr,...
    'whatPlots',{'histogram','distributions','cluster'},...
    'classifierFilename',classifier_filename);