Updated 23.02.2023
Course homepage:
https://homepages.tuni.fi/petri.nokelainen/mmx9120/
R session 1 (this file):
Rmd Statistics_R_01.Rmd
HTML Statistics_R_01.html
R session 2:
Rmd Statistics_R_02.Rmd
HTML Statistics_R_02.html
R session 3:
Rmd Statistics_R_03.Rmd
HTML Statistics_R_03.html
# get the current working directory (the location from where you opened this file)
getwd() # e.g. [1] "/Users/nokelaip"
## [1] "/Users/nokelaip/CharveLEAD Dropbox/Petri Nokelainen/home1_cloud/home1/koulutus/2023/TTU/Session2/R_Session"
# put that location into variable "wd"
wd <- getwd()
# set the working directory to the value of variable "wd"
# NOTE(review): since "wd" was just taken from getwd(), this call is a no-op;
# it only demonstrates setwd(). In your own scripts, paste the desired path here.
setwd(wd) # copy paste "wd" value
# you may also use RStudio menu: Session - Set Working Directory - To Source File location
# read in the data, this time we will use a shorter name
# (tab-separated text file with a header row; empty cells become NA)
dfttu_wide_cross <- read.table("dfttu_wide_cross.txt", header=TRUE, sep= "\t")
# view the first df
# quick sanity check: print the first 3 rows
head(dfttu_wide_cross,3)
## id group gender autonomy competence deep organised score
## 1 1 0 1 3.75 4.25 4 3.0 16.666667
## 2 2 1 1 NA NA NA NA 5.555556
## 3 3 1 1 3.25 3.75 4 3.5 41.666667
# and the last 4 rows (checks that the whole file was read in)
tail(dfttu_wide_cross,4)
## id group gender autonomy competence deep organised score
## 291 291 1 1 3.25 3.50 4.0 4.50 83.33333
## 292 292 0 0 NA NA NA NA 19.44444
## 293 293 1 1 4.25 5.00 4.5 4.50 100.00000
## 294 294 1 1 3.50 3.75 4.0 3.25 94.44444
# per-column five-number summary + mean; the NA's rows reveal missing data counts
summary(dfttu_wide_cross)
## id group gender autonomy
## Min. : 1.00 Min. :0.0000 Min. :0.0000 Min. :1.500
## 1st Qu.: 74.25 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:3.250
## Median :147.50 Median :0.0000 Median :1.0000 Median :3.750
## Mean :147.50 Mean :0.4762 Mean :0.7007 Mean :3.687
## 3rd Qu.:220.75 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:4.000
## Max. :294.00 Max. :1.0000 Max. :1.0000 Max. :5.000
## NA's :59
## competence deep organised score
## Min. :1.250 Min. :2.000 Min. :1.000 Min. : 0.00
## 1st Qu.:3.500 1st Qu.:3.750 1st Qu.:3.250 1st Qu.: 22.22
## Median :4.000 Median :4.000 Median :3.750 Median : 44.44
## Mean :3.834 Mean :4.021 Mean :3.695 Mean : 45.63
## 3rd Qu.:4.250 3rd Qu.:4.250 3rd Qu.:4.250 3rd Qu.: 69.44
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :100.00
## NA's :59 NA's :59 NA's :59
# list the variable (column) names
colnames(dfttu_wide_cross)
## [1] "id" "group" "gender" "autonomy" "competence"
## [6] "deep" "organised" "score"
# open the data frame in RStudio's spreadsheet-style viewer (interactive sessions only)
View(dfttu_wide_cross)
# install.packages("dplyr")
library(dplyr)
# install.packages("stats") # 'stats' ships with base R; kept for completeness
library(stats)
# we will create a simple version of the "dfttu_wide_cross" df and practice some often needed actions
# select some columns into a new df (in this order)
df <- dplyr::select(dfttu_wide_cross, group, gender, autonomy, score, id)
# sort rows
df <- df[order(df$id, df$group), ] # sort by id and group
# change the order of the columns
df <- df[, c(5, 1:4)] # the last "id" becomes the first
# change column names
# FIX: setnames() belongs to the data.table package, which is not loaded here,
# so the original call errored. dplyr::rename(new_name = old_name) does the
# same job on a plain data frame.
df <- dplyr::rename(df, participant = id)
# create index variable (column)
df$index <- seq.int(nrow(df))
# remove "index" column
df <- subset(df, select = -c(index)) # use <- for assignment, not =
# replace all "NA" values with "999" (typical missing value for SPSS)
df[is.na(df)] <- 999
# undo that operation (turn the 999 codes back into NA)
df[df == 999] <- NA
# select random sample of 150 rows
set.seed(2021) # seed makes the "random" sample reproducible
df_n150 <- df[sample(nrow(df), 150), ]
# write new df's
write.table(df, file = "dfttu_wide_cross_test.txt", row.names = FALSE, sep = "\t")
write.table(df_n150, file = "dfttu_wide_cross_test_n150.txt", row.names = FALSE, sep = "\t")
# remove them from the RStudio environment (=memory)
remove(df, df_n150)
# we will return to the original full df "dfttu_wide_cross"
# # remove participants with NA in "score"
# # NOTE(review): subset() drops rows where the condition is NA, so these also
# # remove the NA rows implicitly
# dfttu_wide_cross_test1 <- subset(dfttu_wide_cross, score > -1) # need to use "-1" as the score has values from 0 to 100
# dfttu_wide_cross_test2 <- subset(dfttu_wide_cross, score > -1 & autonomy > 0) # no NA's in score AND autonomy
# dfttu_wide_cross_test3 <- subset(dfttu_wide_cross, score > -1 | autonomy > 0) # no NA's in score OR autonomy
# create new factor (character) variable "score3class" based on "score":
# lowest value to 33.3 -> 1 = low
# 33.4 - 66.6 -> 2 = medium
# 66.7 to highest value -> 3 = high
# cut() bins are right-closed by default: (-1, 33.3], (33.3, 66.6], (66.6, Inf]
dfttu_wide_cross$score3class_chr <- cut(dfttu_wide_cross$score, breaks = c(-1, 33.3, 66.6, Inf), labels = c("Low score", "Medium score", "High score"))
# create new numeric variable "score3class" based on "score":
dfttu_wide_cross$score3class <- cut(dfttu_wide_cross$score, breaks = c(-1, 33.3, 66.6, Inf), labels = FALSE) # produces values 1,2,3
# but we would like to store values 0,1,2 instead:
dfttu_wide_cross$score3class <- dfttu_wide_cross$score3class -1
# create new factor "autonomy_chr" based on "autonomy":
# 1 totally disagree
# 2 disagree
# 3 neutral
# 4 agree
# 5 totally agree
# NOTE(review): autonomy is a scale mean, so fractional values occur; e.g. 1.5
# falls into (1, 2] and is labelled "Disagree"
dfttu_wide_cross$autonomy_chr <- cut(dfttu_wide_cross$autonomy, breaks = c(-1, 1, 2, 3, 4, Inf), labels = c("Totally disagree", "Disagree", "Neutral", "Agree", "Totally agree"))
# convert numeric "gender" (0,1) to character "gender_chr" (Female,Male)
dfttu_wide_cross$gender_chr <- ifelse(dfttu_wide_cross$gender < 1, "Female", "Male")
# convert numeric "group" (0,1) to character "group_chr" (Control, Treatment)
dfttu_wide_cross$group_chr <- ifelse(dfttu_wide_cross$group < 1, "Control", "Treatment")
# save changes into file
write.table(dfttu_wide_cross, file = "dfttu_wide_cross_final.txt", row.names = FALSE, sep= "\t")
Generally, we have three options if missing data exists:
1. FULL DATA (do nothing, leave NA’s)
2. IMPUTED DATA (impute NA’s)
2.1 simple mean/median imputation
2.2 multiple imputation
3. COMPLETE DATA (delete all NA’s)
# -------------------------------
# 1. FULL DATA, incl. missing values
# install.packages("JointAI")
library(JointAI)
# plot all variables
# histograms of the selected columns; 'breaks' sets the number of bins
plot_all(dfttu_wide_cross[c("autonomy","competence","deep","organised","score","score3class")], fill = 'lightblue', border = 'black',
# ncol = 4, # how many columns?
breaks = 30)
# display missing data by pattern
JointAI::md_pattern(dfttu_wide_cross)
# FIX: spell out TRUE instead of the shorthand T (T is an ordinary variable
# that can be reassigned; TRUE cannot)
mice::md.pattern(dfttu_wide_cross, rotate.names = TRUE)
## id group gender score score3class_chr score3class gender_chr group_chr
## 235 1 1 1 1 1 1 1 1
## 59 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## autonomy competence deep organised autonomy_chr
## 235 1 1 1 1 1 0
## 59 0 0 0 0 0 5
## 59 59 59 59 59 295
It seems that the participants have either a full set of responses or a completely missing survey profile (autonomy, competence, …).
Next we do imputation with mean values (not usually recommended).
# -------------
# 2. IMPUTATION
# 2.1 Simple mean (or median) imputation
# create a copy of df
df.impMean <- dfttu_wide_cross
# replace one variable (autonomy) NA's with mean (average)
df.impMean$autonomy[is.na(df.impMean$autonomy)] <- mean(df.impMean$autonomy, na.rm=TRUE) # mean replace NA's in "autonomy"
# replace all variables with NA's with mean (average)
# NOTE: mean() of the non-numeric column (autonomy_chr) is NA and emits a
# warning, so that column's NA's simply stay NA here
for(i in seq_len(ncol(df.impMean))) { # seq_len() is safer than 1:ncol() (no 1:0 trap)
df.impMean[ , i][is.na(df.impMean[ , i])] <- mean(df.impMean[ , i], na.rm=TRUE)
}
# replace only numeric variables with NA's with mean (average)
df.impMean[,sapply(df.impMean, is.numeric)] <- lapply(df.impMean[,sapply(df.impMean, is.numeric)],
function(x){
ifelse(is.na(x), mean(x, na.rm = TRUE), x) # last expression is returned; no assignment needed
}
)
# re-calculate missing "autonomy_chr" values from the now-imputed "autonomy"
df.impMean$autonomy_chr <- cut(df.impMean$autonomy, breaks = c(-1, 1, 2, 3, 4, Inf), labels = c("Totally disagree", "Disagree", "Neutral", "Agree", "Totally agree"))
# save mean imputed df
write.table(df.impMean, file = "dfttu_wide_cross_impMean.txt", row.names = FALSE, sep= "\t")
# read mean imputed df
dfttu_wide_cross.impMean <- read.table("dfttu_wide_cross_impMean.txt", header=TRUE, sep= "\t")
Now we have a mean imputed data without NA’s. Next step is to do multiple imputation (MI), where several imputed data frames (in this case 5) are created.
# -------------
# 2. IMPUTATION
# 2.2 Multiple imputation
# install.packages("mice")
library(mice)
# install.packages("VIM")
library(VIM)
# plot NA's once again with mice
# md.pattern() prints the missing-data pattern table and draws it as a plot
md.pattern(dfttu_wide_cross, rotate.names = TRUE)
## id group gender score score3class_chr score3class gender_chr group_chr
## 235 1 1 1 1 1 1 1 1
## 59 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## autonomy competence deep organised autonomy_chr
## 235 1 1 1 1 1 0
## 59 0 0 0 0 0 5
## 59 59 59 59 59 295
# and with VIM # https://datascienceplus.com/imputing-missing-data-with-r-mice-package/
# aggr() draws a histogram of the missing proportion per variable plus the
# combination pattern; sortVars=TRUE also prints variables sorted by missingness
dfttuNA_plot <- aggr(dfttu_wide_cross, col=c('lightblue','lightgrey'), numbers=TRUE, sortVars=TRUE, labels=names(dfttu_wide_cross), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## autonomy 0.2006803
## competence 0.2006803
## deep 0.2006803
## organised 0.2006803
## autonomy_chr 0.2006803
## id 0.0000000
## group 0.0000000
## gender 0.0000000
## score 0.0000000
## score3class_chr 0.0000000
## score3class 0.0000000
## gender_chr 0.0000000
## group_chr 0.0000000
# remove "_chr" variables (as they are duplicates for their numerical versions)
# NOTE(review): this select also drops "id", "score3class" and "score3class_chr",
# not only the "_chr" duplicates — keeping only the modelling variables
dfttu_wide_cross_MI <- dplyr::select(dfttu_wide_cross, c("group", "gender", "autonomy", "competence", "deep", "organised", "score"))
# perform "predictive mean matching" (PMM) imputation
df.impPMM <- mice(dfttu_wide_cross_MI,
m=5, # create 5 imputed df's
maxit=60, # n of iterations
meth='pmm', # other methods: > methods(mice)
print=FALSE,
seed=2021) # replicable imputation
# for which vars PMM was applied for?
# (variables with no missing values get an empty "" method)
df.impPMM$meth
## group gender autonomy competence deep organised score
## "" "" "pmm" "pmm" "pmm" "pmm" ""
# group gender autonomy competence deep organised score
# "" "" "pmm" "pmm" "pmm" "pmm" ""
# we see that missing data were in autonomy, competence, deep and organised vars
# # view summary of imputed data
# summary(df.impPMM)
# view imputed values (5 for each variable)
# $imp holds, per variable, one column per imputation (rows = originally missing cases)
df.impPMM$imp
## $group
## [1] 1 2 3 4 5
## <0 rows> (or 0-length row.names)
##
## $gender
## [1] 1 2 3 4 5
## <0 rows> (or 0-length row.names)
##
## $autonomy
## 1 2 3 4 5
## 2 2.00 3.50 3.00 2.75 3.50
## 5 3.00 3.00 4.00 4.75 3.50
## 8 4.25 3.00 3.25 4.00 2.00
## 9 4.00 3.75 2.25 3.75 4.00
## 13 4.75 3.75 4.50 2.00 4.00
## 22 4.75 4.50 4.00 3.00 3.50
## 27 3.75 2.00 2.25 4.25 4.25
## 30 4.00 3.25 3.75 4.75 3.25
## 33 4.75 3.50 3.75 3.75 4.00
## 39 4.00 3.25 3.50 4.75 3.25
## 42 5.00 3.00 2.00 5.00 3.75
## 43 3.00 4.25 3.50 4.25 3.50
## 45 2.75 4.00 3.75 4.50 3.50
## 46 3.75 3.50 3.25 4.50 4.00
## 47 3.25 3.25 4.00 3.50 3.50
## 49 3.50 2.00 2.75 2.25 5.00
## 57 4.00 4.75 4.00 3.50 5.00
## 62 3.00 3.25 4.75 3.00 3.50
## 69 4.00 3.00 2.50 4.75 3.00
## 80 4.25 2.75 3.50 3.75 3.50
## 95 4.75 4.25 3.50 3.50 2.50
## 103 3.00 3.50 4.25 4.00 3.75
## 109 3.75 3.75 4.00 4.00 4.50
## 115 4.25 4.00 4.00 4.00 3.75
## 116 4.50 3.00 4.00 3.75 4.25
## 118 4.00 2.00 4.00 2.25 4.75
## 122 4.25 3.50 3.50 3.50 3.75
## 135 3.50 3.75 4.00 4.75 4.75
## 141 4.25 3.75 3.50 4.75 3.25
## 144 4.25 4.00 3.75 5.00 4.00
## 145 5.00 4.25 4.00 3.00 4.25
## 162 3.25 4.25 3.00 3.50 2.50
## 164 3.50 4.00 3.25 3.75 2.25
## 167 3.50 2.00 4.50 3.25 3.75
## 178 4.50 3.50 4.00 3.75 4.25
## 182 3.75 4.50 2.00 4.00 4.75
## 185 3.00 4.00 3.75 3.75 2.25
## 191 4.00 5.00 4.25 4.25 2.00
## 196 3.75 3.50 3.50 4.00 3.00
## 206 3.00 3.75 3.75 3.75 4.25
## 208 3.50 3.75 3.00 3.75 4.00
## 209 3.25 3.00 3.00 3.50 4.50
## 210 4.00 2.25 3.00 4.75 3.00
## 221 4.50 5.00 3.75 4.00 4.75
## 222 3.50 4.00 3.75 3.00 3.75
## 230 3.50 2.75 2.25 4.50 4.75
## 244 4.00 4.50 4.25 3.75 4.00
## 247 4.25 4.75 4.00 3.50 5.00
## 248 3.75 3.75 3.50 4.00 5.00
## 250 2.75 4.75 5.00 4.00 4.50
## 251 4.25 5.00 3.50 4.00 3.75
## 254 3.00 4.50 4.75 3.75 4.25
## 255 2.75 3.75 3.50 2.25 4.00
## 256 3.75 4.00 4.25 3.25 3.00
## 267 3.00 4.00 4.75 4.75 4.25
## 273 3.25 2.00 3.00 3.00 3.75
## 282 5.00 4.00 4.25 3.75 3.75
## 289 5.00 3.50 4.00 4.00 3.00
## 292 2.25 3.00 3.00 3.75 2.75
##
## $competence
## 1 2 3 4 5
## 2 3.75 4.00 3.25 2.75 4.75
## 5 4.00 2.25 4.50 4.00 3.25
## 8 3.75 3.75 2.75 4.00 4.00
## 9 3.75 4.00 2.50 3.75 4.00
## 13 5.00 3.50 4.00 2.50 4.50
## 22 5.00 3.50 4.00 3.75 4.25
## 27 3.75 1.25 3.00 4.00 4.00
## 30 3.75 4.00 3.75 4.25 3.75
## 33 5.00 2.75 3.75 2.75 4.00
## 39 4.25 1.50 3.25 4.25 2.75
## 42 5.00 3.75 1.25 3.75 4.00
## 43 2.75 5.00 3.75 4.50 4.75
## 45 3.25 4.50 4.25 3.50 3.75
## 46 4.75 3.00 3.50 5.00 3.75
## 47 3.25 4.00 3.50 4.25 4.50
## 49 3.75 2.50 3.00 3.00 4.50
## 57 4.00 5.00 4.00 3.25 4.50
## 62 3.50 3.75 5.00 3.50 3.75
## 69 4.25 2.50 4.00 4.75 3.25
## 80 4.25 2.50 4.00 4.50 3.50
## 95 4.75 4.50 4.50 3.00 2.75
## 103 3.25 4.50 4.25 4.75 4.00
## 109 3.75 3.50 4.00 4.25 4.50
## 115 4.00 4.75 3.50 4.00 4.00
## 116 4.25 2.25 4.00 4.50 5.00
## 118 4.00 3.75 3.25 2.75 5.00
## 122 3.50 4.00 3.75 3.25 3.75
## 135 4.00 3.25 3.75 4.75 4.75
## 141 5.00 4.00 4.75 5.00 3.25
## 144 4.75 4.75 4.25 4.75 4.25
## 145 4.50 4.75 4.25 3.50 3.50
## 162 3.75 4.25 3.75 3.75 2.75
## 164 3.50 3.75 3.75 3.25 3.75
## 167 3.75 4.00 4.75 3.75 4.25
## 178 4.00 2.75 4.00 3.50 4.50
## 182 3.00 4.50 1.25 4.00 4.00
## 185 3.25 4.25 3.50 4.25 2.75
## 191 4.00 4.25 4.00 3.00 2.75
## 196 4.50 3.25 4.00 4.00 3.25
## 206 3.25 4.00 4.00 4.00 4.00
## 208 4.25 3.25 3.00 4.00 4.00
## 209 3.25 2.75 3.00 4.00 4.00
## 210 4.50 3.75 3.50 4.75 1.50
## 221 5.00 4.75 4.00 4.00 4.50
## 222 3.50 3.75 4.00 2.50 3.75
## 230 2.75 2.50 2.75 4.75 4.50
## 244 3.00 4.50 4.00 3.75 3.50
## 247 3.75 4.25 4.75 3.50 4.75
## 248 4.25 4.25 4.00 4.50 3.50
## 250 2.75 4.50 5.00 4.50 4.00
## 251 4.75 5.00 4.00 4.25 4.25
## 254 3.00 4.00 3.50 3.75 4.00
## 255 2.75 4.00 3.50 3.75 3.00
## 256 4.50 4.50 4.00 3.50 2.50
## 267 3.75 4.00 5.00 3.75 4.00
## 273 4.00 4.00 4.00 3.00 4.25
## 282 5.00 3.75 4.75 3.75 3.50
## 289 4.75 3.25 4.25 4.75 3.75
## 292 3.00 4.00 2.75 4.00 3.50
##
## $deep
## 1 2 3 4 5
## 2 3.75 4.00 3.50 3.75 4.00
## 5 4.25 3.00 4.50 3.75 3.75
## 8 4.00 3.75 4.00 4.50 3.75
## 9 4.00 3.75 3.50 3.75 4.00
## 13 4.00 3.50 4.25 3.50 4.00
## 22 4.50 4.25 4.50 4.00 3.50
## 27 4.00 4.00 4.00 4.50 4.00
## 30 4.50 4.00 4.25 3.50 3.75
## 33 4.00 4.25 3.75 3.50 4.00
## 39 4.25 3.00 3.50 4.00 3.00
## 42 4.00 4.25 4.00 4.00 4.50
## 43 4.50 5.00 3.50 4.75 4.50
## 45 4.00 4.50 4.50 4.75 3.50
## 46 4.75 3.50 3.00 4.25 4.00
## 47 3.50 3.50 4.00 4.50 4.00
## 49 3.75 3.75 3.50 3.50 4.00
## 57 4.00 5.00 4.00 3.25 4.00
## 62 3.00 4.00 4.00 3.25 4.00
## 69 3.75 3.50 4.25 4.50 3.00
## 80 4.75 3.75 3.75 5.00 3.50
## 95 5.00 4.75 4.50 3.50 3.50
## 103 3.00 4.00 4.50 4.25 3.75
## 109 4.25 3.75 3.50 4.25 4.00
## 115 4.00 3.00 3.50 5.00 4.25
## 116 4.00 4.25 4.50 4.00 3.75
## 118 4.25 4.00 3.75 3.50 4.50
## 122 4.00 4.00 4.25 4.00 4.25
## 135 4.00 4.00 3.50 4.00 4.00
## 141 4.75 4.00 3.75 4.00 3.75
## 144 4.75 4.50 4.25 4.25 4.00
## 145 5.00 4.50 4.00 3.00 4.00
## 162 4.00 4.00 4.00 3.50 3.50
## 164 4.00 3.00 4.00 4.00 3.00
## 167 4.50 4.50 4.00 4.00 4.00
## 178 3.75 3.00 3.75 3.00 4.25
## 182 3.50 3.75 3.50 2.00 3.50
## 185 3.00 4.00 4.00 4.25 4.00
## 191 2.75 4.00 4.00 3.75 3.25
## 196 4.50 2.75 4.25 3.50 3.00
## 206 4.25 2.25 4.00 4.25 3.75
## 208 4.00 4.25 3.75 4.75 4.00
## 209 3.75 3.50 4.00 5.00 4.75
## 210 3.50 4.50 4.00 5.00 3.50
## 221 4.50 4.00 3.50 4.50 4.00
## 222 4.00 3.75 3.00 4.00 3.75
## 230 4.25 2.75 3.50 5.00 3.75
## 244 3.25 4.00 4.75 4.00 4.50
## 247 4.00 3.50 4.25 2.25 4.50
## 248 4.00 4.00 4.00 4.00 3.50
## 250 3.75 5.00 4.50 4.25 4.00
## 251 4.50 3.50 4.50 4.50 3.75
## 254 4.00 4.00 3.50 4.00 4.00
## 255 4.00 4.25 5.00 3.50 3.50
## 256 4.25 3.75 3.50 3.75 3.25
## 267 3.50 4.50 4.50 3.50 4.25
## 273 4.00 4.00 4.50 4.00 4.25
## 282 4.50 3.50 4.75 3.00 3.50
## 289 4.50 2.75 4.75 4.00 3.00
## 292 4.00 4.00 2.75 4.00 3.50
##
## $organised
## 1 2 3 4 5
## 2 2.50 3.50 3.25 3.00 3.50
## 5 4.50 3.00 4.00 2.50 3.00
## 8 3.25 3.75 4.25 4.00 3.25
## 9 3.25 4.25 3.25 2.50 3.25
## 13 4.75 2.50 3.50 3.00 3.25
## 22 3.25 3.75 4.00 4.00 2.50
## 27 3.50 3.25 4.00 3.25 3.25
## 30 4.00 4.25 4.00 3.25 4.00
## 33 4.00 4.00 3.25 2.50 4.50
## 39 4.00 3.00 3.50 4.50 3.00
## 42 4.00 3.25 3.25 3.25 3.00
## 43 3.25 4.00 3.25 3.75 2.75
## 45 3.25 4.00 5.00 3.75 3.25
## 46 5.00 2.50 3.50 4.50 3.25
## 47 4.00 3.50 4.00 3.75 4.25
## 49 3.50 3.00 4.25 2.50 4.00
## 57 4.25 5.00 4.00 2.25 4.75
## 62 4.00 3.75 4.00 3.75 4.75
## 69 3.75 3.25 3.50 3.50 3.00
## 80 4.75 2.50 4.25 4.50 3.00
## 95 4.75 4.50 3.75 4.25 2.50
## 103 3.00 4.00 4.00 4.25 4.00
## 109 4.50 3.25 3.50 4.00 4.50
## 115 4.00 4.00 2.50 3.50 4.25
## 116 3.75 3.00 3.75 4.25 4.25
## 118 4.50 3.50 3.25 4.25 4.50
## 122 4.25 3.25 4.25 4.50 4.00
## 135 1.25 3.50 3.75 4.00 4.50
## 141 4.50 4.00 4.50 3.75 3.50
## 144 5.00 4.25 2.75 4.00 3.50
## 145 5.00 3.75 3.25 3.75 1.25
## 162 3.75 3.00 3.25 3.50 4.00
## 164 4.00 3.25 4.25 3.75 3.00
## 167 4.50 5.00 3.75 3.50 4.00
## 178 3.50 3.25 4.00 3.25 3.50
## 182 3.75 3.00 4.00 2.25 2.50
## 185 3.25 4.00 3.50 4.00 3.00
## 191 2.25 4.00 3.25 2.50 4.00
## 196 4.50 3.00 4.00 3.25 3.00
## 206 4.25 2.75 4.75 3.25 3.50
## 208 4.25 4.25 3.00 3.25 4.25
## 209 4.00 2.50 3.25 4.25 4.75
## 210 4.50 3.25 4.50 3.75 4.00
## 221 3.75 4.50 2.25 3.50 2.50
## 222 3.75 3.75 3.25 2.75 2.50
## 230 2.25 3.00 2.50 4.00 2.50
## 244 3.00 4.50 4.75 4.25 4.25
## 247 2.25 3.75 4.50 2.25 4.50
## 248 4.00 3.75 3.75 3.25 2.50
## 250 3.50 3.75 4.50 3.50 2.50
## 251 4.25 4.75 3.50 3.25 3.50
## 254 3.50 4.00 3.00 1.25 4.75
## 255 4.25 4.00 4.50 3.50 3.00
## 256 4.75 2.50 2.50 2.50 3.00
## 267 3.25 3.75 4.25 3.25 4.00
## 273 4.00 4.25 4.50 4.50 3.75
## 282 3.50 3.25 4.50 2.50 3.00
## 289 3.50 3.00 3.00 3.25 2.50
## 292 3.50 3.75 3.25 4.75 2.50
##
## $score
## [1] 1 2 3 4 5
## <0 rows> (or 0-length row.names)
# save the first imputation (out of 5) into complete df
# complete() fills the NA's with the chosen imputation's values
df.impPMM.complete1 <- complete(df.impPMM, 1) # replace "1" with "2" to save the second imputed df etc.
# save PMM imp 1 into df
write.table (df.impPMM.complete1, file = "dfttu_wide_cross_impPMM_complete1.txt", row.names = FALSE, sep= "\t")
# read the df
dfttu_wide_cross.impPMM1 <- read.table("dfttu_wide_cross_impPMM_complete1.txt", header=TRUE, sep= "\t")
# save the whole PMM imputed "package" (5 imputations) as one mids object
# install.packages("miceadds")
library(miceadds)
# FIX: spell out FALSE instead of the reassignable shorthand F;
# mids2spss = FALSE skips exporting SPSS data files
write.mice.imputation(mi.res=df.impPMM, name="dfttu_wide_cross_impPMM", mids2spss=FALSE)
## 2023-02-23 13:33:53
##
## /Users/nokelaip/CharveLEAD Dropbox/Petri Nokelainen/home1_cloud/home1/koulutus/2023/TTU/Session2/R_Session/dfttu_wide_cross_impPMM
##
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## group gender autonomy competence deep organised score
## "" "" "pmm" "pmm" "pmm" "pmm" ""
## PredictorMatrix:
## group gender autonomy competence deep organised score
## group 0 1 1 1 1 1 1
## gender 1 0 1 1 1 1 1
## autonomy 1 1 0 1 1 1 1
## competence 1 1 1 0 1 1 1
## deep 1 1 1 1 0 1 1
## organised 1 1 1 1 1 0 1
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## group gender autonomy competence deep organised score
## "" "" "pmm" "pmm" "pmm" "pmm" ""
## PredictorMatrix:
## group gender autonomy competence deep organised score
## group 0 1 1 1 1 1 1
## gender 1 0 1 1 1 1 1
## autonomy 1 1 0 1 1 1 1
## competence 1 1 1 0 1 1 1
## deep 1 1 1 1 0 1 1
## organised 1 1 1 1 1 0 1
##
##
## variable MissProp Rhat.M.imp Rhat.Var.imp
## 1 group 0.00000 NA NA
## 2 gender 0.00000 NA NA
## 3 autonomy 20.06803 1.0120040 1.009872
## 4 competence 20.06803 1.0094990 1.060325
## 5 deep 20.06803 0.9950371 1.009788
## 6 organised 20.06803 0.9989290 1.003341
## 7 score 0.00000 NA NA
## variable MissProp Rhat.M.imp Rhat.Var.imp
## 1 group 0.000 NA NA
## 2 gender 0.000 NA NA
## 3 autonomy 20.068 1.012 1.010
## 4 competence 20.068 1.009 1.060
## 5 deep 20.068 0.995 1.010
## 6 organised 20.068 0.999 1.003
## 7 score 0.000 NA NA
##
## To cite R in publications use:
##
## R Core Team (2022). R: A language and environment for statistical
## computing. R Foundation for Statistical Computing, Vienna, Austria.
## URL https://www.R-project.org/.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {R: A Language and Environment for Statistical Computing},
## author = {{R Core Team}},
## organization = {R Foundation for Statistical Computing},
## address = {Vienna, Austria},
## year = {2022},
## url = {https://www.R-project.org/},
## }
##
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.
##
##
## To cite mice in publications use:
##
## Stef van Buuren, Karin Groothuis-Oudshoorn (2011). mice: Multivariate
## Imputation by Chained Equations in R. Journal of Statistical
## Software, 45(3), 1-67. DOI 10.18637/jss.v045.i03.
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## title = {{mice}: Multivariate Imputation by Chained Equations in R},
## author = {Stef {van Buuren} and Karin Groothuis-Oudshoorn},
## journal = {Journal of Statistical Software},
## year = {2011},
## volume = {45},
## number = {3},
## pages = {1-67},
## doi = {10.18637/jss.v045.i03},
## }
##
## sysname
## "Darwin"
## release
## "22.3.0"
## version
## "Darwin Kernel Version 22.3.0: Thu Jan 5 20:48:54 PST 2023; root:xnu-8792.81.2~2/RELEASE_ARM64_T6000"
## nodename
## "wks-98846-mac.ad.tuni.fi"
## machine
## "arm64"
## login
## "root"
## user
## "nokelaip"
## effective_user
## "nokelaip"
## R version 4.2.2 (2022-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] miceadds_3.16-18 VIM_6.2.2 colorspace_2.1-0
## [4] tsibble_1.1.3 tinytex_0.39 forcats_1.0.0
## [7] stringr_1.5.0 dplyr_1.1.0 purrr_1.0.1
## [10] readr_2.1.2 tibble_3.1.8 tidyverse_1.3.2
## [13] tidyr_1.3.0 texreg_1.38.6 tableone_0.13.2
## [16] supportR_0.1.2 stargazer_5.2.3 sjstats_0.18.2
## [19] sjPlot_2.8.12 simr_1.0.6 sandwich_3.0-2
## [22] RColorBrewer_1.1-3 psych_2.2.9 plyr_1.8.8
## [25] pan_1.6 optmatch_0.10.6 nullabor_0.3.9
## [28] nlme_3.1-162 naniar_1.0.0 multilevelTools_0.1.1
## [31] mlmRev_1.0-8 mice_3.15.0 MatchIt_4.5.0
## [34] lmtest_0.9-40 zoo_1.8-11 lmerTest_3.1-3
## [37] lme4_1.1-31 Matrix_1.5-1 lattice_0.20-45
## [40] JWileymisc_1.4.0 JointAI_1.0.4 intervals_0.15.2
## [43] HLMdiag_0.5.0 gridExtra_2.3 gmodels_2.18.1.1
## [46] ggraph_2.1.0 ggpubr_0.6.0 ggplot2_3.4.1
## [49] gapminder_0.3.0 finalfit_1.0.6 extraoperators_0.1.1
## [52] emmeans_1.8.4-1 doBy_4.6.16 devtools_2.4.5
## [55] usethis_2.1.6 data.table_1.14.8 compareGroups_4.6.0
## [58] apaTables_2.0.8
##
## loaded via a namespace (and not attached):
## [1] vcd_1.4-11 Hmisc_4.7-0 svglite_2.1.0
## [4] class_7.3-20 ps_1.7.2 laeken_0.5.2
## [7] crayon_1.5.2 MASS_7.3-58.1 backports_1.4.1
## [10] reprex_2.0.1 rlang_1.0.6 readxl_1.4.0
## [13] performance_0.10.2 SparseM_1.81 nloptr_2.0.3
## [16] callr_3.7.3 flextable_0.7.1 glue_1.6.2
## [19] pbkrtest_0.5.2 parallel_4.2.2 processx_3.8.0
## [22] VGAM_1.1-7 haven_2.5.0 tidyselect_1.2.0
## [25] sjmisc_2.8.9 chron_2.3-57 xtable_1.8-4
## [28] MatrixModels_0.5-1 magrittr_2.0.3 evaluate_0.20
## [31] gdtools_0.2.4 cli_3.6.0 rstudioapi_0.13
## [34] miniUI_0.1.1.1 sp_1.5-0 rjags_4-13
## [37] bslib_0.3.1 rpart_4.1.19 mathjaxr_1.6-0
## [40] sjlabelled_1.2.0 shiny_1.7.1 xfun_0.31
## [43] pkgbuild_1.3.1 cluster_2.1.4 tidygraph_1.2.1
## [46] quantreg_5.94 ggrepel_0.9.3 listenv_0.8.0
## [49] png_0.1-7 future_1.26.1 withr_2.5.0
## [52] ggforce_0.3.3 ranger_0.14.1 cellranger_1.1.0
## [55] e1071_1.7-11 survey_4.1-1 coda_0.19-4
## [58] pillar_1.8.1 cachem_1.0.6 multcomp_1.4-20
## [61] fs_1.6.1 flexmix_2.3-18 kernlab_0.9-31
## [64] vctrs_0.5.2 pbivnorm_0.6.0 ellipsis_0.3.2
## [67] generics_0.1.3 tools_4.2.2 foreign_0.8-83
## [70] munsell_0.5.0 tweenr_1.0.2 proxy_0.4-27
## [73] anytime_0.3.9 fastmap_1.1.0 compiler_4.2.2
## [76] pkgload_1.3.2 abind_1.4-5 httpuv_1.6.5
## [79] sessioninfo_1.2.2 rms_6.3-0 utf8_1.2.3
## [82] later_1.3.0 jsonlite_1.8.4 scales_1.2.1
## [85] carData_3.0-5 estimability_1.4.1 promises_1.2.0.1
## [88] car_3.1-1 latticeExtra_0.6-29 checkmate_2.1.0
## [91] fstcore_0.9.12 rmarkdown_2.14 cowplot_1.1.1
## [94] moments_0.14.1 webshot_0.5.3 igraph_1.3.1
## [97] survival_3.4-0 numDeriv_2016.8-1.1 yaml_2.3.5
## [100] plotrix_3.8-2 systemfonts_1.0.4 prabclus_2.3-2
## [103] htmltools_0.5.2 memoise_2.0.1 lavaan_0.6-14
## [106] profvis_0.3.7 modeltools_0.2-23 graphlayouts_0.8.0
## [109] quadprog_1.5-8 viridisLite_0.4.1 digest_0.6.31
## [112] assertthat_0.2.1 mime_0.12 bayestestR_0.13.0
## [115] remotes_2.4.2 urlchecker_1.0.1 splines_4.2.2
## [118] Formula_1.2-4 fpc_2.2-9 googledrive_2.0.0
## [121] broom_1.0.3 hms_1.1.2 modelr_0.1.8
## [124] microbenchmark_1.4.9 base64enc_0.1-3 mnormt_2.1.1
## [127] nnet_7.3-18 sass_0.4.1 binom_1.1-1.1
## [130] Rcpp_1.0.10 mclust_5.4.10 mvtnorm_1.1-3
## [133] multcompView_0.1-8 fansi_1.0.4 tzdb_0.3.0
## [136] truncnorm_1.0-8 parallelly_1.32.0 R6_2.5.1
## [139] lifecycle_1.0.3 polspline_1.1.20 zip_2.2.0
## [142] writexl_1.4.0 datawizard_0.6.5 ggsignif_0.6.3
## [145] googlesheets4_1.0.0 minqa_1.2.5 gdata_2.18.0.1
## [148] jquerylib_0.1.4 snakecase_0.11.0 robustbase_0.95-0
## [151] TH.data_1.1-1 iterators_1.0.14 htmlwidgets_1.5.4
## [154] officer_0.4.2 polyclip_1.10-0 rvest_1.0.2
## [157] mgcv_1.8-41 globals_0.15.0 insight_0.19.0
## [160] htmlTable_2.4.0 codetools_0.2-18 lubridate_1.8.0
## [163] gtools_3.9.4 prettyunits_1.1.1 dbplyr_2.2.0
## [166] gtable_0.3.1 DBI_1.1.2 stats4_4.2.2
## [169] visdat_0.5.3 highr_0.9 httr_1.4.3
## [172] stringi_1.7.12 reshape2_1.4.4 farver_2.1.1
## [175] uuid_1.1-0 diptest_0.76-0 viridis_0.6.2
## [178] ggthemes_4.2.4 xml2_1.3.3 boot_1.3-28
## [181] fst_0.9.8 diagonals_6.0.0 kableExtra_1.3.4
## [184] ggeffects_1.1.2 DEoptimR_1.0-11 jpeg_0.1-9
## [187] Deriv_4.1.3 janitor_2.1.0 pkgconfig_2.0.3
## [190] gargle_1.2.0 rstatix_0.7.2 mitools_2.4
## [193] HardyWeinberg_1.7.5 RLRsim_3.1-8 Rsolnp_1.16
## [196] knitr_1.39
##
## 1
## 2
## 3
## 4
## 5
# load the saved imputation object back (creates "mi.res" in the environment)
load("dfttu_wide_cross_impPMM/dfttu_wide_cross_impPMM.Rdata")
# class(mi.res) # mids
# check if the imputation has gone well ... this is done with the whole "set" of 5 imputed df's
# plot observed (blue) against imputed (red)
xyplot(df.impPMM, competence ~ autonomy + deep + organised,
color=mdc(1:2), # blue = observed, red = imputed
pch=20, # dot shape
cex=1 # dot size
)
# density plotting (blue = original)
# imputed distributions should roughly follow the observed one
densityplot(df.impPMM) # successful imputation
# densityplot(get("mi.res")) # similar plot with the saved mi object
# --------------------------------------
# Estimate model with different datasets
# run linear regression with competence (dependent variable) and predictor variables (autonomy, score)
# first, data with NA's
# lm() applies listwise deletion, so the 59 incomplete cases are dropped
Competence_assumptions <- lm(competence ~ autonomy + score + gender + group, dfttu_wide_cross)
summary(Competence_assumptions)
##
## Call:
## lm(formula = competence ~ autonomy + score + gender + group,
## data = dfttu_wide_cross)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.82918 -0.27635 0.03015 0.27235 1.28052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.073315 0.177185 6.058 5.59e-09 ***
## autonomy 0.680008 0.045479 14.952 < 2e-16 ***
## score 0.002613 0.001102 2.371 0.0186 *
## gender 0.099692 0.070767 1.409 0.1603
## group 0.116768 0.062926 1.856 0.0648 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4818 on 230 degrees of freedom
## (59 observations deleted due to missingness)
## Multiple R-squared: 0.5272, Adjusted R-squared: 0.5189
## F-statistic: 64.1 on 4 and 230 DF, p-value: < 2.2e-16
# second, complete data
# create a "complete" version of df (remove all rows, that is, participants who have NA's)
dfttu_wide_cross_complete <- na.omit(dfttu_wide_cross)
# identical estimates to the full-data model are expected, since lm() already
# deleted the same incomplete rows there
Competence_assumptions_complete <- lm(competence ~ autonomy + score + gender + group, dfttu_wide_cross_complete)
summary(Competence_assumptions_complete)
##
## Call:
## lm(formula = competence ~ autonomy + score + gender + group,
## data = dfttu_wide_cross_complete)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.82918 -0.27635 0.03015 0.27235 1.28052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.073315 0.177185 6.058 5.59e-09 ***
## autonomy 0.680008 0.045479 14.952 < 2e-16 ***
## score 0.002613 0.001102 2.371 0.0186 *
## gender 0.099692 0.070767 1.409 0.1603
## group 0.116768 0.062926 1.856 0.0648 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4818 on 230 degrees of freedom
## Multiple R-squared: 0.5272, Adjusted R-squared: 0.5189
## F-statistic: 64.1 on 4 and 230 DF, p-value: < 2.2e-16
# third, simple mean imputed data
# all 294 rows are used here because the NA's were replaced by column means
Competence_assumptions_impMean <- lm(competence ~ autonomy + score + gender + group, dfttu_wide_cross.impMean)
summary(Competence_assumptions_impMean)
##
## Call:
## lm(formula = competence ~ autonomy + score + gender + group,
## data = dfttu_wide_cross.impMean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.83887 -0.18039 0.02446 0.19092 1.26716
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1198793 0.1557573 7.190 5.57e-12 ***
## autonomy 0.6843189 0.0405812 16.863 < 2e-16 ***
## score 0.0020216 0.0008705 2.322 0.0209 *
## gender 0.0761898 0.0563849 1.351 0.1777
## group 0.0951308 0.0504660 1.885 0.0604 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4318 on 289 degrees of freedom
## Multiple R-squared: 0.5227, Adjusted R-squared: 0.5161
## F-statistic: 79.14 on 4 and 289 DF, p-value: < 2.2e-16
# fourth, first PMM (predictive mean matching) imputed df
# uses only imputation 1 of 5; the pooled analysis over all 5 follows below
Competence_assumptions_impPMM.complete1 <- lm(competence ~ autonomy + score + gender + group, df.impPMM.complete1)
summary(Competence_assumptions_impPMM.complete1)
##
## Call:
## lm(formula = competence ~ autonomy + score + gender + group,
## data = df.impPMM.complete1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.83887 -0.29766 0.01884 0.29596 1.26006
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.112089 0.156964 7.085 1.06e-11 ***
## autonomy 0.686861 0.040627 16.907 < 2e-16 ***
## score 0.001854 0.000963 1.926 0.0551 .
## gender 0.083779 0.063072 1.328 0.1851
## group 0.112495 0.055729 2.019 0.0445 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.477 on 289 degrees of freedom
## Multiple R-squared: 0.5289, Adjusted R-squared: 0.5224
## F-statistic: 81.12 on 4 and 289 DF, p-value: < 2.2e-16
# ---------------
# Compare results
# let's get a whole picture of the results
# install.packages("sjPlot")
library(sjPlot)
# side-by-side HTML regression table of the three fitted models
sjPlot::tab_model(Competence_assumptions,
Competence_assumptions_complete,
Competence_assumptions_impMean,
# Competence_assumptions_impPMM.complete1,
CSS = list(
css.thead = 'border-top: 1px solid;',
css.lasttablerow = 'border-top: 1px solid;'),
digits = 3,
show.ci = 0.95, # 95% confidence intervals ...
collapse.ci = TRUE, # ... printed inside the estimate column
show.se = TRUE,
show.stat = FALSE,
#show.df = TRUE,
show.re.var = TRUE,
show.r2 = FALSE,
show.icc = TRUE,
show.fstat = TRUE,
show.aic = TRUE,
show.dev = TRUE,
show.loglik = FALSE,
string.pred = "Predictors",
string.est = "Est. (C.I.)",
string.ci = "C.I.",
string.se = "S.E.",
p.style = "numeric",
p.threshold = c(0.05, 0.01, 0.001),
emph.p = TRUE,
dv.labels = c("Model 1a<sub><i>Full</i></sub>",
"Model 1b<sub><i>Complete</i></sub>",
"Model 1c<sub><i>Mean imp</i></sub>" #,
#"Model 1d<sub><i>PMM imp comp1</i></sub>"
)
)
| Model 1aFull | Model 1bComplete | Model 1cMean imp | |||||||
|---|---|---|---|---|---|---|---|---|---|
| Predictors | Est. (C.I.) | S.E. | p | Est. (C.I.) | S.E. | p | Est. (C.I.) | S.E. | p |
| (Intercept) |
1.073 (0.724 – 1.422) |
0.177 | <0.001 |
1.073 (0.724 – 1.422) |
0.177 | <0.001 |
1.120 (0.813 – 1.426) |
0.156 | <0.001 |
| autonomy |
0.680 (0.590 – 0.770) |
0.045 | <0.001 |
0.680 (0.590 – 0.770) |
0.045 | <0.001 |
0.684 (0.604 – 0.764) |
0.041 | <0.001 |
| score |
0.003 (0.000 – 0.005) |
0.001 | 0.019 |
0.003 (0.000 – 0.005) |
0.001 | 0.019 |
0.002 (0.000 – 0.004) |
0.001 | 0.021 |
| gender |
0.100 (-0.040 – 0.239) |
0.071 | 0.160 |
0.100 (-0.040 – 0.239) |
0.071 | 0.160 |
0.076 (-0.035 – 0.187) |
0.056 | 0.178 |
| group |
0.117 (-0.007 – 0.241) |
0.063 | 0.065 |
0.117 (-0.007 – 0.241) |
0.063 | 0.065 |
0.095 (-0.004 – 0.194) |
0.050 | 0.060 |
| Observations | 235 | 235 | 294 | ||||||
| Deviance | 53.386 | 53.386 | 53.884 | ||||||
| AIC | 330.621 | 330.621 | 347.491 | ||||||
# we notice that full, complete and mean imputed data produce similar results
# how about the full (five times) PMM imputed data?
# fit the same regression within each of the m = 5 imputed data sets;
# with() on a mids object returns a mira object holding all five fits
Competence_assumptions_impPMM <- with(df.impPMM, lm(competence ~ autonomy + score + gender + group))
# pool() combines the five fits with Rubin's rules into one set of estimates
pool(Competence_assumptions_impPMM)
## Class: mipo m = 5
## term m estimate ubar b t dfcom
## 1 (Intercept) 5 1.11695950 2.555759e-02 7.025830e-03 3.398859e-02 289
## 2 autonomy 5 0.67386493 1.704194e-03 4.679347e-04 2.265715e-03 289
## 3 score 5 0.00245425 9.814699e-07 3.998122e-07 1.461245e-06 289
## 4 gender 5 0.09350242 4.163013e-03 4.144681e-04 4.660374e-03 289
## 5 group 5 0.10685405 3.299437e-03 2.018758e-04 3.541688e-03 289
## df riv lambda fmi
## 1 49.95987 0.32988222 0.24805371 0.2764505
## 2 50.03127 0.32949408 0.24783419 0.2762011
## 3 31.11599 0.48883275 0.32833289 0.3677084
## 4 148.19901 0.11947158 0.10672141 0.1185373
## 5 203.68616 0.07342192 0.06839987 0.0774145
# these should be low:
# fraction of information missing due to nonresponse (fmi)
# the relative increase in variance due to nonresponse (lambda)
# summary() of the pooled object gives the combined coefficients,
# standard errors, degrees of freedom and p-values across the 5 imputations
summary(pool(Competence_assumptions_impPMM))
## term estimate std.error statistic df p.value
## 1 (Intercept) 1.11695950 0.184359947 6.058580 49.95987 1.779730e-07
## 2 autonomy 0.67386493 0.047599529 14.156966 50.03127 3.967468e-19
## 3 score 0.00245425 0.001208819 2.030286 31.11599 5.094972e-02
## 4 gender 0.09350242 0.068266934 1.369659 148.19901 1.728651e-01
## 5 group 0.10685405 0.059512084 1.795502 203.68616 7.405610e-02
# term estimate std.error statistic df p.value
# 1 (Intercept) 1.104158848 0.204627650 5.395942 23.63079 1.608837e-05
# 2 autonomy 0.673287744 0.048663059 13.835705 43.28428 1.712608e-17 ***
# 3 score 0.002542074 0.001069936 2.375913 94.24786 1.952910e-02 *
# 4 gender 0.098300898 0.069125239 1.422070 107.19081 1.579094e-01
# 5 group 0.121213128 0.057557558 2.105946 266.44022 3.614463e-02 *
# results stay quite similar, except that "group" is now also statistically significant
The last part of dealing with missing data is to delete all rows that contain missing values.
# ----------------
# 3. COMPLETE DATA
# visualise the distributions of the complete df variables
plot_all(dfttu_wide_cross_complete[c("autonomy","competence","deep","organised","score","score3class")], fill = 'lightgreen', border = 'black', breaks = 30)
# persist the complete df as tab-separated text ...
write.table(dfttu_wide_cross_complete, file = "dfttu_wide_cross_complete.txt", row.names = FALSE, sep = "\t")
# ... and read it back in from the same file
dfttu_wide_cross_complete <- read.table(file = "dfttu_wide_cross_complete.txt", header = TRUE, sep = "\t")
#######################
# get some descriptives
#
# Counting idiom: instead of nrow(df[which(cond), ]) we sum the logical
# vector directly — sum(cond, na.rm = TRUE) gives the same count, since
# which() silently drops NA indices and na.rm = TRUE drops NA comparisons.
#
# what is the n of rows in df?
dfttu_nrow <- nrow(dfttu_wide_cross)
#
# how many rows have missing data?
dfttu_nrow_NA <- sum(!complete.cases(dfttu_wide_cross))
#
# what is the % of NA's in df?
dfttu_perc_NA <- dfttu_nrow_NA / dfttu_nrow * 100
#
# how many participants in df?
dfttu_n_id <- length(unique(dfttu_wide_cross$id)) # works also for repeated measurement df
#
# how many females? (gender == 0; NA genders are not counted)
# n
dfttu_n_fem <- sum(dfttu_wide_cross$gender == 0, na.rm = TRUE)
# %
dfttu_perc_fem <- dfttu_n_fem / dfttu_nrow * 100
#
# how many NA's in gender?
# n
dfttu_n_NA_gender <- sum(is.na(dfttu_wide_cross$gender))
# % (relative to the number of participants)
dfttu_perc_NA_gender <- dfttu_n_NA_gender / dfttu_n_id * 100
#
# how many control group members? (group == 0)
# n
dfttu_n_control <- sum(dfttu_wide_cross$group == 0, na.rm = TRUE)
# %
dfttu_perc_control <- dfttu_n_control / dfttu_nrow * 100
#
# how many intervention group members? (group == 1)
# n
dfttu_n_intervention <- sum(dfttu_wide_cross$group == 1, na.rm = TRUE)
# %
dfttu_perc_intervention <- dfttu_n_intervention / dfttu_nrow * 100
Participants
The cross-sectional data has 294 participants of which 88 are females (29.9%). Participants are organised in the control (n = 154, 52.4%) and intervention (n = 140, 47.6%) groups. Both groups have studied the same mathematics course, but the intervention group received the teaching with “flipped learning” pedagogical implementation (including self-study phases via online material and group work/discussions instead of lectures). The control group completed the course in a more traditional way (teacher-led lectures and assistant-led exercises).
Measurements
Higher education students have answered eight questions on a 5-point scale (1 = totally disagree, … , 5 = totally agree) that are related to autonomy (4 questions) and competence (4 questions) during studies. This data contains the mean values from these questions stored in two variables “autonomy” and “competence”. Autonomy measures whether one feels that studying is self-determined and meaningful. Competence relates to one’s experiences of how different tasks during studies develop expertise (instead of being routine tasks). In addition to the self-assessments, the data contains the variable “score”, which holds each participant’s mathematics test score (from 0 to 100 percent).
# run multivariate normality analysis
# we use the complete data here
# install.packages("MVN")
# install.packages("ggplot2")
library(MVN) # https://cran.r-project.org/web/packages/MVN/MVN.pdf
library(ggplot2)
# check columns of "dfttu_wide_cross_complete"
colnames(dfttu_wide_cross_complete)
## [1] "id" "group" "gender" "autonomy"
## [5] "competence" "deep" "organised" "score"
## [9] "score3class_chr" "score3class" "autonomy_chr" "gender_chr"
## [13] "group_chr"
# run diagnostics for "autonomy" and "competence" (columns 4 and 5):
# Mardia's multivariate normality test, Shapiro-Wilk per variable,
# QQ plots, and adjusted Mahalanobis-distance multivariate outlier detection
# NOTE(review): the complete() wrapper looks redundant — this df is already
# the complete-cases version; verify which package's complete() is dispatched
# NOTE(review): alpha = 0.5 seems unusually high — confirm this is intended
mvn(data = complete(dfttu_wide_cross_complete[,4:5]), subset = NULL, mvnTest = "mardia",
covariance = TRUE, tol = 1e-25, alpha = 0.5,
scale = FALSE, desc = FALSE, transform = "none", R = 1000,
univariateTest = "SW", univariatePlot = "qq", multivariatePlot = "qq",
multivariateOutlierMethod = "adj", bc = FALSE, bcType = "rounded",
showOutliers = TRUE, showNewData = FALSE)
## $multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 39.3460662839323 5.90900646336711e-08 NO
## 2 Mardia Kurtosis 4.31398866509343 1.60335170531489e-05 NO
## 3 MVN <NA> <NA> NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk autonomy 0.9602 <0.001 NO
## 2 Shapiro-Wilk competence 0.9441 <0.001 NO
##
## $multivariateOutliers
## Observation Mahalanobis Distance Outlier
## 169 169 24.065 TRUE
## 41 41 23.595 TRUE
## 168 168 20.500 TRUE
## 48 48 17.685 TRUE
## 218 218 16.522 TRUE
## 152 152 15.339 TRUE
## 69 69 13.724 TRUE
## 195 195 13.671 TRUE
## 54 54 13.244 TRUE
## 76 76 12.318 TRUE
## 226 226 12.318 TRUE
## 43 43 11.385 TRUE
## 145 145 10.926 TRUE
## 91 91 10.852 TRUE
## 65 65 10.321 TRUE
## 120 120 10.050 TRUE
## 36 36 9.506 TRUE
# save diagnostics into an object so the outlier list can be extracted
dfttu_mv <- mvn(data = complete(dfttu_wide_cross_complete[,4:5]), subset = NULL, mvnTest = "mardia",
covariance = TRUE, tol = 1e-25, alpha = 0.5,
scale = FALSE, desc = FALSE, transform = "none", R = 1000,
univariateTest = "SW", univariatePlot = "qq", multivariatePlot = "qq",
multivariateOutlierMethod = "adj", bc = FALSE, bcType = "rounded",
showOutliers = TRUE, showNewData = FALSE)
# create index variable (column)
# we need this as the list of outlier participants from the MVN is based on row index values
dfttu_wide_cross_complete$index <- seq.int(nrow(dfttu_wide_cross_complete))
# create a list of outlier id's
# a good way to extract values is to use "str()"
# str(dfttu_mv)
# NOTE(review): Observation may be a factor/character of row labels — the
# %!in% comparison against the integer index relies on coercion; verify
outlier_list <- dfttu_mv$multivariateOutliers$Observation
length(outlier_list) # 17 outliers
## [1] 17
# index_outliers <- c(169,41,168,48,218,152,69,195,54,76,226,43,145,91,65,120,36)
# create new df that omits these outlier participants
# install.packages("extraoperators")
library(extraoperators) # need this for %!in%
# keep only the rows whose index is NOT in the outlier list
dfttu_wide_cross_complete_final <- subset(dfttu_wide_cross_complete, index %!in% outlier_list)
# check that 17 participants have been omitted from the df
length(dfttu_wide_cross_complete$id) # 235 participants
## [1] 235
length(dfttu_wide_cross_complete_final$id) # 218 participants
## [1] 218
## [1] 218
# re-run diagnostics
# after removing the 17 outliers, Mardia's skewness and kurtosis tests
# (multivariate normality) pass — see output below
mvn(data = complete(dfttu_wide_cross_complete_final[,4:5]), subset = NULL, mvnTest = "mardia",
covariance = TRUE, tol = 1e-25, alpha = 0.5,
scale = FALSE, desc = FALSE, transform = "none", R = 1000,
univariateTest = "SW", univariatePlot = "qq", multivariatePlot = "qq",
multivariateOutlierMethod = "adj", bc = FALSE, bcType = "rounded",
showOutliers = TRUE, showNewData = FALSE)
## $multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 4.02210144203792 0.403023008174473 YES
## 2 Mardia Kurtosis -0.26133837262797 0.793831577376506 YES
## 3 MVN <NA> <NA> YES
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk autonomy 0.9667 1e-04 NO
## 2 Shapiro-Wilk competence 0.9543 <0.001 NO
##
## $multivariateOutliers
## Observation Mahalanobis Distance Outlier
## 109 109 12.550 TRUE
## 174 174 12.550 TRUE
## 183 183 12.550 TRUE
## 198 198 12.550 TRUE
## 98 98 10.952 TRUE
## 221 221 10.211 TRUE
## 127 127 9.900 TRUE
## 19 19 9.015 TRUE
## 90 90 9.015 TRUE
## 110 110 8.754 TRUE
## 190 190 8.754 TRUE
# second round: save diagnostics for the reduced df
dfttu_mv2 <- mvn(data = complete(dfttu_wide_cross_complete_final[,4:5]), subset = NULL, mvnTest = "mardia",
covariance = TRUE, tol = 1e-25, alpha = 0.5,
scale = FALSE, desc = FALSE, transform = "none", R = 1000,
univariateTest = "SW", univariatePlot = "qq", multivariatePlot = "qq",
multivariateOutlierMethod = "adj", bc = FALSE, bcType = "rounded",
showOutliers = TRUE, showNewData = FALSE)
# re-create the row index (row positions shifted after the first removal)
dfttu_wide_cross_complete_final$index <- seq.int(nrow(dfttu_wide_cross_complete_final))
# extract the second-round outlier list
outlier_list2 <- dfttu_mv2$multivariateOutliers$Observation
length(outlier_list2) # 11 outliers
## [1] 11
# drop the second-round outliers as well
dfttu_wide_cross_complete_final2 <- subset(dfttu_wide_cross_complete_final, index %!in% outlier_list2)
# check that 11 participants have been omitted from the df
# NOTE(review): 218 -> 208 removes only 10 rows although 11 outliers were
# listed — verify whether one Observation value is absent from index
length(dfttu_wide_cross_complete_final$id) # 218 participants
## [1] 218
length(dfttu_wide_cross_complete_final2$id) # 208 participants
## [1] 208
# re-run diagnostics
# third check: Mardia's tests still indicate multivariate normality
# (see output below), so no further trimming is needed
mvn(data = complete(dfttu_wide_cross_complete_final2[,4:5]), subset = NULL, mvnTest = "mardia",
covariance = TRUE, tol = 1e-25, alpha = 0.5,
scale = FALSE, desc = FALSE, transform = "none", R = 1000,
univariateTest = "SW", univariatePlot = "qq", multivariatePlot = "qq",
multivariateOutlierMethod = "adj", bc = FALSE, bcType = "rounded",
showOutliers = TRUE, showNewData = FALSE)
## $multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 3.92084064631468 0.416824894001297 YES
## 2 Mardia Kurtosis -0.377220097864481 0.706010045724341 YES
## 3 MVN <NA> <NA> YES
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk autonomy 0.9686 1e-04 NO
## 2 Shapiro-Wilk competence 0.9572 <0.001 NO
##
## $multivariateOutliers
## Observation Mahalanobis Distance Outlier
## 109 109 12.358 TRUE
## 174 174 12.358 TRUE
## 183 183 12.358 TRUE
## 98 98 10.914 TRUE
## 221 221 10.338 TRUE
## 127 127 9.618 TRUE
## 90 90 9.498 TRUE
## 110 110 8.626 TRUE
## 190 190 8.626 TRUE
# we can stop here, the worst cases have been omitted ...
# write the final data frame out for the next phases of statistical analyses
write.table(dfttu_wide_cross_complete_final2,
            file = "dfttu_wide_cross_complete_final2.txt",
            row.names = FALSE, sep = "\t")
Analysis continues in
R session 2:
Rmd Statistics_R_02.Rmd
HTML Statistics_R_02.html