-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathregression_cubic_spline.py
executable file
·126 lines (121 loc) · 6.56 KB
/
regression_cubic_spline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import numpy as np
import DeltaT as DT
def regression_cubic_spline(y, DeltaT, a_in, knots):
"""
Fit a regression cubic spline on DeltaT ~ y with cubic coefficients a_in in the first interval.
Input:
y: 1D numpy array for years.
DeltaT: 1D numpy array for DeltaT. y and DeltaT must have the same length.
a_in: 1D numpy array of size 4 for the coefficients of cubic polynomials in the first interval
of y in [knots[0], knots[1]].
knots: 1D numpy array storing the knots for the regression spline. Note that all values of y must be between knots[0] and knots[-1].
Return:
Tuple: spline, residuals, c2
See explanation in the doc string of extend_Morrison_etal_cubic_spline().
"""
def h(t, m):
"""
Function h(t, m) = 0 if t < m and h(t,m) = (t-m)^3 if t >= m
"""
return np.where(t < m, 0, (t-m)**3)
def hd(t,m, n):
"""
Calculate the n-th derivative of h.
"""
if n==1:
return np.where(t < m, 0, 3*(t-m)**2)
elif n==2:
return np.where(t < m, 0, 6*(t-m))
elif n==3:
return np.where(t < m, 0, 6)
else:
return 0
dy = knots[1] - knots[0]
t = (y - knots[0])/dy # the rescaled time variable
# Follow the method in Section 7.4 of *An Introduction to Statistical Learning*
# by James, Witten, Hastie and Tibshirani (https://www.statlearning.com/) to fit
# the data with the equation
# DeltaT = a_in[0] + a_in[1]*t + a_in[2]*t^2 + a_in[3]*t^3 + sum_{m=1}^n c_m*h(t, m)
# h(t, m) = 0 if t < m and (t-m)^3 if t >= m and values of m are determined by
# m = (knots[1:] - knots[0])/dy.
# It's clear that the fitted DeltaT and its first two derivatives will be continuous, but
# its third derivative will be discontinuous at the knots.
# Define z = DeltaT - a_in[0] + a_in[1]*t + a_in[2]*t^2 + a_in[3]*t^3. Then
# z = sum_{m=1}^n c_m*h(t, m)
# Coefficients c_m can be computed by linear regression with 0 intercept.
# Note that any values of c_m will not change the fitted z in the first interval, so we can
# remove data from the first interval.
t = t[y > knots[1]]
z = DeltaT[y > knots[1]] - a_in[0] - a_in[1]*t - a_in[2]*t*t - a_in[3]*t**3
nc = len(knots)-2
# Set up the features
X = np.zeros((len(t), nc))
m = (knots[1:(nc+1)] - knots[0])/dy
for i in range(nc):
X[:,i] = h(t, m[i])
Xmax = np.max(X, axis=0)
reg_coef = np.linalg.lstsq(X/Xmax, z, rcond=None)[0]/Xmax
# Now construct the coefficients of the cubic polynomials in the intervals
# between knots from the regression coefficients.
# If f(x) = c0 + c1*(x-x0) + c2*(x-x0)^2 + c3*(x-x0)^3, then
# c0 = f(x0), c1 = f'(x0), c2 = f''(x0)/2, c3 = f'''(x0)/6, where ' = d/dx.
a = np.zeros((4,nc+1))
a[:,0] = a_in
for i in range(nc):
fac = (knots[i+2] - knots[i+1])/dy # take into account of variable rescaling
e = 1e-12 # a tiny number compared to m[i]
a[0,i+1] = a_in[0] + a_in[1]*m[i] + a_in[2]*m[i]**2 + a_in[3]*m[i]**3 + sum(reg_coef*h(m[i]+e, m))
a[1,i+1] = fac*(a_in[1] + 2*a_in[2]*m[i] + 3*a_in[3]*m[i]**2 + sum(reg_coef*hd(m[i]+e, m, 1)))
a[2,i+1] = fac*fac*(a_in[2] + 3*a_in[3]*m[i] + 0.5*sum(reg_coef*hd(m[i]+e, m, 2)))
a[3,i+1] = fac*fac*fac*(a_in[3] + sum(reg_coef*hd(m[i]+e, m, 3))/6)
y0 = knots[0:(nc+1)]
y1 = knots[1:(nc+2)]
a = np.round(a, 3) # round a to 3 decimal places
# Calculate the residuals of the fit
res = DeltaT - DT.spline(y, y0, y1, a[0,:], a[1,:], a[2,:], a[3,:])
# Calculate the root mean square error of the cubic spline fit in the intervals between knots
eps = np.zeros(nc+1)
for i in range(nc+1):
r = res[(y >= knots[i]) & (y < knots[i+1])]
eps[i] = np.sqrt(np.mean(r*r))
# Delta T at the last knot
DeltaT_last = sum(a[:,-1])
# integration constant c2 for extrapolation using the integrated lod function for y > knots[-1]
c2 = DeltaT_last - DT.integrated_lod(knots[-1], 0)
spline = {'y0':y0, 'y1':y1, 'a0':a[0,:], 'a1':a[1,:], 'a2':a[2,:], 'a3':a[3,:], 'epsilon':eps}
residuals = {'y':y, 'res':res}
return spline, residuals, c2
def extend_Morrison_etal_cubic_spline(knots_in):
"""
Fit a new cubic spline polynomials in the intervals specified by the array knots_in using the IERS data in 'DeltaT_IERS.csv'.
The cubic spline fit before knots_in[0] are not altered, but the cubic cofficients between y0[i] and knots_in[0] may be altered because of possible variable rescaling. Here y0[i] is the largest year in y0 less than knots_in[0] in the Morrison et al's cubic spline knots.
Return:
Tuple: spline, residuals, c2.
spline is a dictionary with keys 'y0', 'y1', 'a0', 'a1', 'a2', 'a3', and 'epsilon'.
Here y0 = knots[0:(nc+1)], y1 = knots[1:(nc+2)], where nc = len(knots)-2;
knots = np.append(y0[i], knots_in);
a0, a1, a2, a3 are the coefficients of cubic spline polynomials so that
fitted DeltaT = a0 + a1*t + a2*t^2 + a3*t^3
with t = (y - y0)/(y1 - y0).
Epsilon is the root mean square error of the fitting formula in the interval [y0, y1].
The fitting coefficients are for the intervals [knots[i], knots[i+1]] with i=0, 1, 2,... nc.
residuals is a dictionary with keys 'y' and 'res'. res is a 1D array storing the residuals of the fit at y.
c2 is the integration constant for extrapolation using the integrated lod function for y > knots[-1].
"""
# Search for i so that y0[i] is the largest year in the Morrison et al's cubic spline knots less than knots_in[0]
i = np.searchsorted(DT.y0, knots_in[0], 'left')-1
# Add y0[i] to knots_in
knots = np.append(DT.y0[i], np.array(knots_in, dtype=np.float64))
# The time variable in the interval [y0[i], knots_in[0]] is t = (y - y0[i])/(knots_in[0]- y0[i]),
# which is in general not the same as the original t = (y - y0[i])/(y1[i] - y0[i]). So the original
# cubic coefficients should be modified by a1[i] -> fac*a1[i], a2[i] -> fac^2*a2[i], a3[i] -> fac^3*a3[i],
# where fac = (knots_in[0]- y0[i])/(y1[i] - y0[i]).
fac = (knots_in[0] - DT.y0[i])/(DT.y1[i] - DT.y0[i])
a_init = np.array([DT.a0[i], DT.a1[i]*fac, DT.a2[i]*fac**2, DT.a3[i]*fac**3])
data = np.loadtxt('DeltaT_IERS.csv', delimiter=',', skiprows=1)
y = (data[:,3] - 51544)/365.2425 + 2000
yrange= (y >= knots[0]) & (y < knots[-1])
# filter y and DeltaT to the range where y is in [knots[0], knots[-1])
y = y[yrange]
DeltaT = data[yrange, 6]
return regression_cubic_spline(y, DeltaT, a_init, knots)