#
# STA112 : Données manquantes 2015
# Site de référence 
# http://www.multiple-imputation.com
# Livre 
# http://www.stefvanbuuren.nl/mi/FIMD.html

library(mice)
## Loading required package: Rcpp
## Loading required package: lattice
## mice 2.22 2014-06-10
# par defaut na.omit 
# comparer les estimation avec fit
lm(Ozone ~ Wind, data=airquality)
## 
## Call:
## lm(formula = Ozone ~ Wind, data = airquality)
## 
## Coefficients:
## (Intercept)         Wind  
##      96.873       -5.551
fit <- lm(Ozone ~ Wind, data = airquality, na.action = na.omit)
coef(fit)
## (Intercept)        Wind 
##   96.872895   -5.550923
#
### impute the mean
imp <- mice(airquality, method="mean", m=1, maxit=1)
## 
##  iter imp variable
##   1   1  Ozone  Solar.R
### Figure 
#pdf("mean_Impute.pdf")
lwd <- 1.5
par(mfrow=c(1,2))
breaks <- seq(-20, 200, 10)
nudge <- 1
x <- matrix(c(breaks-nudge, breaks+nudge), ncol=2)
obs <- airquality[,"Ozone"]
mis  <- imp$imp$Ozone[,1]
fobs <- c(hist(obs, breaks, plot=FALSE)$counts, 0)
fmis <- c(hist(mis, breaks, plot=FALSE)$counts, 0)
y <- matrix(c(fobs, fmis), ncol=2)
matplot(x, y, type="s",
        col=c(mdc(4),mdc(5)), lwd=2, lty=1,
        xlim = c(0, 170), ylim = c(0,40), yaxs = "i",
        xlab="Ozone (ppb)",
        ylab="Frequency")
box()

tp <- xyplot(imp, Ozone~Solar.R, na.groups=ici(imp),
             ylab="Ozone (ppb)", xlab="Solar Radiation (lang)",
             cex = 0.75, lex=lwd,
             ylim = c(-20, 180), xlim = c(0,350))
print(tp, newpage = FALSE, position = c(0.48,0.08,1,0.92))
#dev.off()


#pdf("locf.pdf")
par(mfrow=c(1,1))

Oz <- airquality$Ozone
locf <- function(x) {
  a <- x[1]
  for (i in 2:length(x)) {
    if (is.na(x[i])) x[i] <- a
    else a <- x[i]
  }
  return(x)
}
Ozi <- locf(Oz)
colvec <- ifelse(is.na(Oz),mdc(2),mdc(1))

### Figure 

plot(Ozi[1:80],col=colvec,type="l",xlab="Day number",ylab="Ozone (ppb)")
points(Ozi[1:80],col=colvec,pch=20,cex=1)

#dev.off()


###  Example of multiple imputation

imp <- mice(airquality, seed=1, print=FALSE)
fit <- with(imp, lm(Ozone ~ Wind + Temp + Solar.R))
tab <- round(summary(pool(fit)),3)
tab[,c(1:3,5)]
##                 est     se      t Pr(>|t|)
## (Intercept) -64.331 21.535 -2.987    0.004
## Wind         -3.053  0.658 -4.641    0.000
## Temp          1.612  0.231  6.967    0.000
## Solar.R       0.061  0.022  2.731    0.008
fit <- lm(Ozone~Wind+Temp+Solar.R,data=airquality,na.action=na.omit)
round(coef(summary(fit)),3)
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -64.342     23.055  -2.791    0.006
## Wind          -3.334      0.654  -5.094    0.000
## Temp           1.652      0.254   6.516    0.000
## Solar.R        0.060      0.023   2.580    0.011
### 
#pdf("mi_ozone.pdf")
par(mfrow = c(1,2))
mis  <- imp$imp$Ozone[,1]
fmis <- c(hist(mis, breaks, plot=FALSE)$counts, 0)
y <- matrix(c(fobs, fmis), ncol=2)
matplot(x, y, type="s",
        col=c(mdc(4),mdc(5)), lwd=2, lty=1,
        xlim = c(0, 170), ylim = c(0,40), yaxs = "i",
        xlab="Ozone (ppb)",
        ylab="Frequency")
box()

tp <- xyplot(imp, Ozone~Solar.R, subset = .imp==1,
             ylab="Ozone (ppb)", xlab="Solar Radiation (lang)",
             cex = 0.75, lex=lwd,
             ylim = c(-20, 180), xlim = c(0,350))
print(tp, newpage = FALSE, position = c(0.48,0.08,1,0.92))
#dev.off()

#
library(Hmisc)
## Loading required package: grid
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
data(nhanes)
library(mice)
# incomple cases 
ic(nhanes)
##    age  bmi hyp chl
## 1    1   NA  NA  NA
## 3    1   NA   1 187
## 4    3   NA  NA  NA
## 6    3   NA  NA 184
## 10   2   NA  NA  NA
## 11   1   NA  NA  NA
## 12   2   NA  NA  NA
## 15   1 29.6   1  NA
## 16   1   NA  NA  NA
## 20   3 25.5   2  NA
## 21   1   NA  NA  NA
## 24   3 24.9   1  NA
na.pattern(nhanes)
## pattern
## 0000 0001 0100 0110 0111 
##   13    3    1    1    7
#
# create 5 imputed data sets
#
imp <- mice(nhanes)     
## 
##  iter imp variable
##   1   1  bmi  hyp  chl
##   1   2  bmi  hyp  chl
##   1   3  bmi  hyp  chl
##   1   4  bmi  hyp  chl
##   1   5  bmi  hyp  chl
##   2   1  bmi  hyp  chl
##   2   2  bmi  hyp  chl
##   2   3  bmi  hyp  chl
##   2   4  bmi  hyp  chl
##   2   5  bmi  hyp  chl
##   3   1  bmi  hyp  chl
##   3   2  bmi  hyp  chl
##   3   3  bmi  hyp  chl
##   3   4  bmi  hyp  chl
##   3   5  bmi  hyp  chl
##   4   1  bmi  hyp  chl
##   4   2  bmi  hyp  chl
##   4   3  bmi  hyp  chl
##   4   4  bmi  hyp  chl
##   4   5  bmi  hyp  chl
##   5   1  bmi  hyp  chl
##   5   2  bmi  hyp  chl
##   5   3  bmi  hyp  chl
##   5   4  bmi  hyp  chl
##   5   5  bmi  hyp  chl
complete(imp)   
##    age  bmi hyp chl
## 1    1 22.0   1 187
## 2    2 22.7   1 187
## 3    1 30.1   1 187
## 4    3 22.5   2 186
## 5    1 20.4   1 113
## 6    3 21.7   1 184
## 7    1 22.5   1 118
## 8    1 30.1   1 187
## 9    2 22.0   1 238
## 10   2 27.2   1 186
## 11   1 27.2   1 187
## 12   2 26.3   1 186
## 13   3 21.7   1 206
## 14   2 28.7   2 204
## 15   1 29.6   1 187
## 16   1 28.7   1 131
## 17   3 27.2   2 284
## 18   2 26.3   2 199
## 19   1 35.3   1 218
## 20   3 25.5   2 204
## 21   1 27.2   1 187
## 22   1 33.2   1 229
## 23   1 27.5   1 131
## 24   3 24.9   1 284
## 25   2 27.4   1 186
print(imp)
## Multiply imputed data set
## Call:
## mice(data = nhanes)
## Number of multiple imputations:  5
## Missing cells per column:
## age bmi hyp chl 
##   0   9   8  10 
## Imputation methods:
##   age   bmi   hyp   chl 
##    "" "pmm" "pmm" "pmm" 
## VisitSequence:
## bmi hyp chl 
##   2   3   4 
## PredictorMatrix:
##     age bmi hyp chl
## age   0   0   0   0
## bmi   1   0   1   1
## hyp   1   1   0   1
## chl   1   1   1   0
## Random generator seed value:  NA