What Is Functional Data Analysis (FDA)?

First, let’s install the package .

Polygonal (piecewise linear) basis functions

library(fda) 
polygonal = create.polygonal.basis(argvals=seq(0,1,0.5))
print(polygonal)
## $call
## basisfd(type = type, rangeval = rangeval, nbasis = nbasis, params = argvals, 
##     dropind = dropind, quadvals = quadvals, values = values)
## 
## $type
## [1] "polyg"
## 
## $rangeval
## [1] 0 1
## 
## $nbasis
## [1] 3
## 
## $params
## [1] 0.0 0.5 1.0
## 
## $dropind
## NULL
## 
## $quadvals
## NULL
## 
## $values
## list()
## 
## $basisvalues
## list()
## 
## $names
## [1] "polygon1" "polygon2" "polygon3"
## 
## attr(,"class")
## [1] "basisfd"
plot(polygonal)

Polynomial/regression splines

-Polynomial splines are functions joined together smoothly at knots or breaks. The two outside knots define the range of the time intervals.

-Between two knots, a polynomial spline is a polynomial of fixed degree \(K_2\). But at an interior knot, two adjacent polynomials are required to match in the values of a fixed number of their derivatives, usually chosen to be \(K_2\)-1.

Berkeley Growth Example

#Load the Berkeley growth data set and extract height data
data(growth)
help('growth')


#The function "search()" allows us to check the packages we have loaded.
search()
##  [1] ".GlobalEnv"        "package:fda"       "package:Matrix"   
##  [4] "package:splines"   "package:stats"     "package:graphics" 
##  [7] "package:grDevices" "package:utils"     "package:datasets" 
## [10] "package:methods"   "Autoloads"         "package:base"
#:: specifies that we want to use the matplot function from the fda package
with(growth, fda::matplot(age, hgtf[, 1:10], type="b"))

#Number of subjects you want to show in plots. 
#Note that the fda library requires that each
#participant's data occupies a separate column.
#So all participants in the data set have the
#same number of measurement occasions (number
#of rows).
n = dim(growth$hgtf)[2]

#Ages
age =  c(seq(1, 2, 0.25), seq(3, 8, 1), seq(8.5, 18, 0.5))

#  Range of observations
rng =  c(1,18)

#Height of female participants
heightmat = growth$hgtf

#Set up polygonal basis functions with a single interior knot at age 14
polygonal <- create.polygonal.basis(argvals=c(0,14,18))

#Create a functional parameter object, i.e., an "object" 
#that carries all the functional parameters
growfdPar =  fdPar(polygonal)

#Creating a smoothed curve for the first participant using the 
#FDA parameters set up in growfdPar.
#In this system of polygonal functions, the first segment has 
#two coefficients to be estimated (intercept and linear slope); 
#in the second segment, the intercept has to match the ending level 
#of the first segment so only 1 more coefficient has to be estimated. 
#So in total, we have df = 3 in this example.
heightfd0 = smooth.basis(age,growth$hgtf[,1],growfdPar)
heightfd0$df # 3
## [1] 3
#You can also smooth all participants' data all at once - 
#in other words, the same smoothing parameter is applied to all 
#participants.
heightfd = smooth.basis(age,heightmat,growfdPar)

plot(heightfd,ylab="Height",xlab="Age")

## [1] "done"
#Plot one by one - not run
#par(cex.lab=1.5,cex.axis=1.2,mgp=c(2.75,.75,0))
#plotfit.fdSmooth(heightmat,age,heightfd,ylab="Height",xlab="Age")

#Use measurement ages as the knots
polygonal <- create.polygonal.basis(age)
growfdPar =  fdPar(polygonal) 
heightfd = smooth.basis(age,heightmat,growfdPar)

#Setting graphical parameters
par(cex.lab=1.5,cex.axis=1.2,mgp=c(2.75,.75,0))
#Plot the first participant's growth curve
plot(age, growth$hgtf[,1], type="p")

#Helpful output to extract
heightfd.fd = heightfd$fd # functional data object
height.df = heightfd$df #degrees of freedom
#generalized cross-validation index - lower indicates better fit
height.gcv = heightfd$gcv 
yhat = eval.fd(age,heightfd.fd,0) #Get smoothed level curves
Dxhat  = eval.fd(age, heightfd.fd, 1) #Get first derivatives
D2xhat = eval.fd(evalarg=age, fdobj=heightfd.fd, 2) #Get second derivatives

Generate Data Using a Dampled Oscillator Model

#Library to obtain numerical solutions for ODEs for data generation
library('deSolve')
source('SelectFunctions.r')
# Simulate a sample of oscillators.
totalPersons <- 30 #Total number of participants
totalSamples <- 60 #Total number of time points
lastT <- 30
deltaT <- lastT / totalSamples
theTimes  <- seq(0, lastT, length=totalSamples)  # the measured time points
parms <- c(-0.5, -0.05) #eta, zeta
dataMatrix <- matrix(NA, nrow=totalSamples, ncol=totalPersons)
trueX <- matrix(NA, nrow=totalSamples, ncol=totalPersons)
truedX <- matrix(NA, nrow=totalSamples, ncol=totalPersons)


#Structuring the data in similar format as the growth data. 
#Thus, each person has one column of data.
for (id in 1:totalPersons) {
    parmsi  <- c(parms[1], parms[2] + rnorm(1,mean=0,sd=.02))     
    xstart <- c(x=runif(1,min=-9,max=9), y=rnorm(1,mean=0,sd=2))
    out1 <- as.data.frame(lsoda(xstart, theTimes, Osc, parmsi))
    dataMatrix[,id] <- out1$x + rnorm(length(out1$x),0,.1)
    trueX[,id] = out1$x
    truedX[,id] = out1$y
}

Read in Simulated Data and Compute Smoothed Derivative Estimates

Up till this point, we have generated data one particular ordinary differential equation (ODE) model - the damped oscillator model with individual differences in the zeta parameter only, with additive measurement noise. Now we will calculate derivative estimates and use them in a linear mixed effects model.

library('fda')
source('SelectFunctions.r')
totalPersons <- 30 #Total number of participants
totalSamples <- 60 #Total number of time points
lastT <- 30
deltaT <- lastT / totalSamples
theTimes  <- seq(0, lastT, length=totalSamples)  # the measured time points
rng =  range(theTimes)
#  We use b-spline basis functions of order 6
# 2 order higher than the highest derivative
# order used (4 in the penalty, see below)

#  Knots are positioned at the observation times.
norder =  6
nt   =  length(theTimes)
nbasis =  nt + norder - 2
oscbasis = create.bspline.basis(rng, nbasis,norder, theTimes)

#Penalizes 4th derivative as we are hoping to obtain smoothed
# second derivative estimates
roughPenaltyMax    =  4          
lambda = .1  # a positive smoothing parameter: larger -> more smoothing
OscfdPar =  fdPar(oscbasis, roughPenaltyMax, lambda)

loglamout = plotGCVRMSE.fd(-6, -1, 0.25, theTimes, dataMatrix, 
                           OscfdPar, FALSE)

#Based on the plot I decided to set lambda to 10^(-2) 
OscfdPar =  fdPar(oscbasis, roughPenaltyMax, 10^(-2))

#Performing smoothing
oscfd = smooth.basis(theTimes,dataMatrix,OscfdPar)
#Other helpful output to extract
oscfd.fd = oscfd$fd
oscfd.df = oscfd$df
oscfd.gcv = oscfd$gcv

yhat = eval.fd(theTimes,oscfd.fd)
dyhat = eval.fd(theTimes,oscfd.fd,1)
d2yhat = eval.fd(theTimes,oscfd.fd,2)

# Plot the smoothed curve
plot(theTimes, dataMatrix[,1], type="p")
lines(theTimes,yhat[,1])

#Can run the following code to inspect each individual's smoothed plot
#plotfit.fd(dataMatrix, theTimes, oscfd.fd)

# Gather all data
dataMatrix2 = matrix(NA,totalPersons*nt,4)

for (i in 1:totalPersons){
dataMatrix2[(1+(i-1)*nt):(i*nt),1] = i
dataMatrix2[(1+(i-1)*nt):(i*nt),2] = yhat[,i]   
dataMatrix2[(1+(i-1)*nt):(i*nt),3] = dyhat[,i]
dataMatrix2[(1+(i-1)*nt):(i*nt),4] = d2yhat[,i]     
}
dimnames(dataMatrix2) = list(NULL,c("ID","y","dy","d2y"))
dataMatrix2 = data.frame(dataMatrix2)
write.table(dataMatrix2,file="dataMatrix2.dat",col.names=TRUE,
            row.names=FALSE)

Run Linear Mixed Effects Model

library('nlme') #Library for mixed effects modeling
dataMatrix2 = read.table("dataMatrix2.dat",header=TRUE)
treg <- lme(d2y ~ y + dy - 1, data=dataMatrix2, 
            random=list( ~ dy - 1 | ID))

summary(treg)
## Linear mixed-effects model fit by REML
##  Data: dataMatrix2 
##         AIC       BIC   logLik
##   -880.5924 -858.6147 444.2962
## 
## Random effects:
##  Formula: ~dy - 1 | ID
##                 dy  Residual
## StdDev: 0.02552231 0.1852451
## 
## Fixed effects: d2y ~ y + dy - 1 
##        Value   Std.Error   DF   t-value p-value
## y  -0.509032 0.001259629 1769 -404.1128       0
## dy -0.049282 0.005238766 1769   -9.4072       0
##  Correlation: 
##    y    
## dy 0.018
## 
## Standardized Within-Group Residuals:
##          Min           Q1          Med           Q3          Max 
## -10.83429023  -0.30655579   0.02347903   0.30865620   9.85893674 
## 
## Number of Observations: 1800
## Number of Groups: 30
treg$coefficients
## $fixed
##           y          dy 
## -0.50903205 -0.04928205 
## 
## $random
## $random$ID
##               dy
## 1   0.0182534835
## 2  -0.0234404328
## 3  -0.0125575187
## 4  -0.0358077937
## 5  -0.0037703190
## 6   0.0343251458
## 7  -0.0151891388
## 8   0.0289567062
## 9  -0.0251186307
## 10 -0.0027598633
## 11  0.0107718630
## 12 -0.0262424377
## 13  0.0025286239
## 14  0.0139650991
## 15 -0.0096131391
## 16  0.0200478425
## 17  0.0331259162
## 18 -0.0054708762
## 19 -0.0281037542
## 20  0.0101445041
## 21 -0.0147878132
## 22 -0.0012300569
## 23  0.0066650613
## 24  0.0447071272
## 25 -0.0149232998
## 26  0.0397964551
## 27  0.0107856952
## 28 -0.0131654697
## 29 -0.0427150612
## 30  0.0008220819
int = intervals(treg) #95% confidence intervals for all parameters

#Other Helpful commands
int$fixed #Intervals for fixed effects parameters
##          lower        est.       upper
## y  -0.51150257 -0.50903205 -0.50656153
## dy -0.05955687 -0.04928205 -0.03900723
## attr(,"label")
## [1] "Fixed effects:"
(int$reStruct)$ID #Intervals for random effects parameters
##             lower       est.      upper
## sd(dy) 0.01857461 0.02552231 0.03506876
int$sigma #Intervals for residual SD for process 1
##     lower      est.     upper 
## 0.1792429 0.1852451 0.1914483 
## attr(,"label")
## [1] "Within-group standard error:"