-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathData_Processing_Pipelines.m
294 lines (247 loc) · 12.9 KB
/
Data_Processing_Pipelines.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
%% Hands-on Tutorial for Data Processing Pipelines
%
% American Society for Neurorehabilitation workshop on Reliability and
% Reproducibility in Neurorehabilitation Research.
%
% by James Finley, Ph.D. 10/16/19
%
% Although this tutorial focuses on data analysis, many problems with rigor
% and reproducibility arise before any data are collected (e.g. study
% design and sampling). There are several helpful resources on design of
% studies in the movement sciences. See _Foundations_ _of_ _Clinical_
% _Research_ by Portney and Watkins for a thorough overview of research
% methods for clinical applications.
%
%% Example #1: Using simulated data to develop and validate an analysis pipeline
%
% * *Take-home message #1*: Generating simulated data can help you build
% intuition about factors that may influence the variance in your
% measurements and reduce your statistical power. Thinking about these
% factors _before_ you collect your data can help you improve the rigor of
% your study design.
%
% *Step 1: Conduct a virtual experiment and save the data*
%
% Here, we will simulate data from a study of split-belt treadmill
% adaptation. Changes in performence during motor adaptation tasks are
% often modeled using a double-exponential function that can capture early,
% rapid changes in performance, as well as more gradual changes that impact
% performance during the latter portions of adaptation.
%
% We will use this knowledge to generate artifial data from a
% double-exponential function that has the following form.
%
% $$SLA = A_f*e^{-B_f*n} + B_f*e^{-B_s*n}$$
%
% We simulate intersubject variability by randomly selecing coefficients
% (A_f and A_s) and rate terms (B_s and B_f) from normal distributions
% centered at values taken from the literature. We will also simulate noisy
% measurements and stride-to-stride variability by adding noise to the
% simulated step length asymmetry values. Lastly, since typical adaptation
% studies use a fixed _duration_ of adaptation, each participant may take a
% different number of total strides (largely because of differences in leg
% length). We simulate this source of variability by having each
% participant take between 800 and 1100 total strides.
% Set the random number generator to the default state so that the results
% are reproducible.
rng('default')
% Define the number of virtual participants
N_Participants = 16;
% Make a new folder in the current directory to store the data
mkdir ASNR_Matlab_Data
Home = cd; % Remember the current directory so that we can return to it
% Allocate space to save the true parameter values.
True_Parameters = zeros(16,4);
% Create a figure to show all of the simulated data
figure
for i = 1:N_Participants
cd('ASNR_Matlab_Data')
% Create subdirectory for each participant. Note that the
% sub-directories have a consistent naming convention with a two-digit
% subject ID. If you were to simulate more than 99 participants, you
% would want to modify the form of the participant ID numbers to
% accomodate the largest number of participants you expect.
if i < 10
mkdir(strcat(['20191016_S0' num2str(i)]))
cd(strcat(['20191016_S0' num2str(i)]))
else
mkdir(strcat(['20191016_S' num2str(i)]))
cd(strcat(['20191016_S' num2str(i)]))
end
% Specify the number of strides for each participant as a random value
% between 800 and 1100.
Total_Strides = randi([800 1100],1,1);
% Create a "time" vector with units of strides
Stride_Num = 1:Total_Strides;
% Specify the fast and slow coefficients. The sum of A_Fast and A_Slow
% are approximately equal to the initial step length asymmetry.
%
% Note that each coefficient has a nominal value of -0.05 and we
% introduce between-subjects variability by taking a random value
% between -0.5 and 0.5 and multiplying that number by 0.1. In other
% words, these coefficients range from -0.1 to 0.
A_Fast = -0.05+0.1*(rand(1,1)-0.5);
A_Slow = -0.05+0.1*(rand(1,1)-0.5);
% B_Fast and B_Slow are the time constants of the fast and slow
% components of adaptation. The fast time constant should be larger
% than the slow time constant. Again, we add some random deviation to
% these parameters to introduce between-subjects variance. The nominal
% values for these parameters were taken from Mawase et al., 2013,
% Journal of Neurophysiology.
B_Fast = 0.025+0.01*(rand(1,1)-0.5);
B_Slow = 0.0011+0.0004*(randn(1,1)-0.5);
% Store the true values of the parameters for each participant. We will
% use this as our ground truth at the end of the demo to determine if
% we can accurately estimate these parameters from the data.
True_Parameters(i,:) = [A_Slow B_Slow A_Fast B_Fast];
% Assume that each participant has normally distributed measurement
% noise, and that the amplitude of this noise varies uniformly across
% participants. The values used for noise amplitude were hand picked to
% generate data sets that look fairly realistic.
Noise_Amplitude = 0.02*rand(1,1)+0.005;
Measurement_Noise = Noise_Amplitude*randn(1,numel(Stride_Num));
% Generate step length asymmetries (SLA) for each participant by adding
% measurement noise to the step length asymmetries that would be
% generated by the double exponential model.
SLA = A_Fast*exp(-B_Fast*Stride_Num) + ...
A_Slow*exp(-B_Slow*Stride_Num) +...
Measurement_Noise;
% Plot the raw step length asymmetry data for all participants
ax(i) = subplot(4,4,i); plot(SLA,'o');
xlabel('Stride Number'), ylabel('Step Len. Asym')
% Save the data in a csv file
if i < 10
csvwrite(strcat(['20191016_S0' num2str(i) '_SLA.csv']),SLA)
else
csvwrite(strcat(['20191016_S' num2str(i) '_SLA.csv']),SLA)
end
cd(Home)
end
linkaxes(ax)
%% Load the data for all participants into a single data structure and perform visual quality control
%
% By storing the data for all participants in a single, well-organized
% data structure, you can easily share your data with other Matlab users in
% a self-explanatory format.
cd('ASNR_Matlab_Data')
Directory = dir;
% Create a new figure window to perform high-level quality control
figure
for i = 1:N_Participants
% Use the csvread function to import the step length asymmetry data
% from the .csv files into Matlab
%
% Note that we are storing the data within a structure. See the Matlab
% help for structures if you are not familiar with this data type.
Data(i).SLA = csvread(strcat([Directory(i+2).name '\' ...
Directory(i+2).name '_SLA.csv']));
% Visualize the data and determine if the data for each participant
% seems reasonable. This manual sanity could be made more quantitative,
% but our purpose is to demostrate how you can automate _some_
% quality control processes.
plot(Data(i).SLA,'o'), xlabel('Stride Number'), ylabel('SLA')
Data(i).Good_Data = questdlg('Does This Data Seem Reasonable?');
% Add some artificial demographic information to illustrate the
% versatility of data structures.
Data(i).Age = randi([20 48],1,1);
Data(i).Weight = randi([50 90],1,1);
end
% Save the data structure
save Data_All Data True_Parameters
%% Compute summary measures and store results in a table
% Load the group data
load Data_All
% Here, we use nonlinear regression to estimate the coefficients and rate
% parameters for each participant. We will then compare these estimates to
% the true values to validate our analysis procedure.
% Define the model's structure. See the help for 'fitnlm' for a more
% descriptive example of how model structure is defined for the 'fitnlm'
% function.
Double_Exp_Model = @(Coeff,x)Coeff(1)*exp(-Coeff(2)*x(:,1))+Coeff(3)*exp(-Coeff(4)*x(:,1));
% Specify initial guess for model parameters. These initial guess are
% purposefully set to be equal to the nominal values defined above.
% However, you should vary these values to see if the fitting procedure is
% sensitive to the initial guess.
Coeff_Init = [-0.05 0.025 -0.05 0.011]';
% We will store the parameter estimates in a table. Tables are relatively
% recent addition to Matlab and they allow you to store different data
% types in a spreadsheet format as you might do in Excel. Here, we are
% simply creating a table with the correct column names.
Coefficients_All = array2table(zeros(1,4),'VariableNames',{'A_Slow' 'B_Slow' 'A_Fast' 'B_Fast'});
% In this block of code, we estimate the dual-rate model parameters using
% the _fitnlm_ function. Note that we include a conditional statement so
% that we only fit data that passed our earlier screening.
for i = 1:numel(Directory)-2
if strcmp(Data(i).Good_Data,'Yes')
% Use the 'fitnlm' function to estimate the model parameters
Data(i).Model = fitnlm((1:numel(Data(i).SLA))',Data(i).SLA',...
Double_Exp_Model,Coeff_Init);
% Make sure that the parameters are stored in the proper order
if Data(i).Model.Coefficients.Estimate(2) > Data(i).Model.Coefficients.Estimate(4)
Coefficients_All(i,:) = array2table(Data(i).Model.Coefficients.Estimate([3 4 1 2])');
else
Coefficients_All(i,:) = array2table(Data(i).Model.Coefficients.Estimate');
end
else
% Store NaN values for data that did not pass our screening
Coefficients_All(i,:) = array2table(NaN(1,4));
end
end
save Group_Data Coefficients_All
%% Visualize summary data and save as a publication-quality figure
% We will create a single figure with four subplots. The left column will
% show the distribution of estimated and actual parameters for the slow
% process and the right column will show the distribution of estimated and
% actual parameters for the fast process.
%
% Note that we explicitly define many of the parameters of the plot
% including the axis limits and labels, line widths, and font sizes. We
% even add panel labels as you would do for a publication.
%
% Finally, we set the figure's size to 15 x 12 cm which is approximately two
% columns in a standard scientific publication.
figure
Not_NaN = ~isnan(Coefficients_All.A_Slow);
A_Axes(1) = subplot(2,2,1); plot([Coefficients_All.A_Slow(Not_NaN)...
True_Parameters(Not_NaN,1)]','-o','LineWidth',1,'Color',...
[0.1 0.1 0.1],'MarkerFaceColor',[0.5 0.5 0.5])
set(gca,'XLim', [0.5 2.5], 'XTick',[1 2],'XTickLabel',...
{'Estimated' 'True'},'FontSize',11,'LineWidth',1)
ylabel('Slow Coefficient')
XLimits = xlim; YLimits = ylim;
text(XLimits(1)-0.3*(XLimits(2)-XLimits(1)),YLimits(2)+0.1*(YLimits(2)-YLimits(1)),'A','FontSize',14)
A_Axes(2) = subplot(2,2,2); plot([Coefficients_All.A_Fast(Not_NaN)...
True_Parameters(Not_NaN,3)]','-o','LineWidth',1,'Color',...
[0.1 0.1 0.1],'MarkerFaceColor',[0.5 0.5 0.5])
set(gca,'XLim', [0.5 2.5], 'XTick',[1 2],'XTickLabel',...
{'Estimated' 'True'},'FontSize',11,'LineWidth',1)
ylabel('Fast Coefficient')
XLimits = xlim; YLimits = ylim;
text(XLimits(1)-0.3*(XLimits(2)-XLimits(1)),YLimits(2)+0.1*(YLimits(2)-YLimits(1)),'B','FontSize',14)
linkaxes(A_Axes)
B_Axes(1) = subplot(2,2,3); plot([Coefficients_All.B_Slow(Not_NaN)...
True_Parameters(Not_NaN,2)]','-o','LineWidth',1,'Color',...
[0.1 0.1 0.1],'MarkerFaceColor',[0.5 0.5 0.5])
set(gca,'XLim', [0.5 2.5], 'XTick',[1 2],'XTickLabel',...
{'Estimated' 'True'},'FontSize',11,'LineWidth',1)
ylabel('Slow Rate Constant')
XLimits = xlim; YLimits = ylim;
text(XLimits(1)-0.3*(XLimits(2)-XLimits(1)),YLimits(2)+0.1*(YLimits(2)-YLimits(1)),'C','FontSize',14)
B_Axes(2) = subplot(2,2,4); plot([Coefficients_All.B_Fast(Not_NaN)...
True_Parameters(Not_NaN,4)]','-o','LineWidth',1,'Color',...
[0.1 0.1 0.1],'MarkerFaceColor',[0.5 0.5 0.5])
set(gca,'XLim', [0.5 2.5], 'XTick',[1 2],'XTickLabel',...
{'Estimated' 'True'},'FontSize',11,'LineWidth',1)
ylabel('Fast Rate Constant')
XLimits = xlim; YLimits = ylim;
text(XLimits(1)-0.3*(XLimits(2)-XLimits(1)),YLimits(2)+0.1*(YLimits(2)-YLimits(1)),'D','FontSize',14)
linkaxes(B_Axes)
% Perform a set of paired t-tests to determine if there is evidence to
% reject the null hypothesis that the mean of the differences between the
% estimated and actual model parameters is different from zero.
[h_A_Slow, p_A_Slow] = ttest(Coefficients_All.A_Slow(Not_NaN), True_Parameters(Not_NaN,1));
[h_A_Fast, p_A_Fast] = ttest(Coefficients_All.A_Fast(Not_NaN), True_Parameters(Not_NaN,3));
[h_B_Slow, p_B_Slow] = ttest(Coefficients_All.B_Slow(Not_NaN), True_Parameters(Not_NaN,2));
[h_B_Fast, p_B_Fast] = ttest(Coefficients_All.B_Fast(Not_NaN), True_Parameters(Not_NaN,4));
set(gcf,'Units','Centimeters','Position',[11 14 15 12])
saveas(gcf,'Coefficient Estimates - Group','pdf')