Updated 23.02.2023
Course homepage:
https://homepages.tuni.fi/petri.nokelainen/mmx9120/
R session 1 (this file):
Rmd Statistics_R_01.Rmd
HTML Statistics_R_01.html
R session 2:
Rmd Statistics_R_02.Rmd
HTML Statistics_R_02.html
R session 3:
Rmd Statistics_R_03.Rmd
HTML Statistics_R_03.html
# get the current working directory (the location from where you opened this file)
getwd() # e.g. [1] "/Users/nokelaip"
## [1] "/Users/nokelaip/CharveLEAD Dropbox/Petri Nokelainen/home1_cloud/home1/koulutus/2023/TTU/Session2/R_Session"
# put that location into variable "wd"
wd <- getwd()
# set the working directory to the value of variable "wd"
# NOTE(review): since "wd" was just taken from getwd(), this call is a no-op;
# it only demonstrates setwd(). In your own scripts, paste the desired path here.
setwd(wd) # copy paste "wd" value
# you may also use RStudio menu: Session - Set Working Directory - To Source File location
# read in the data, this time we will use a shorter name
# (tab-separated text file with a header row; empty cells become NA)
dfttu_wide_cross <- read.table("dfttu_wide_cross.txt", header=TRUE, sep= "\t")
# view the first df
# quick sanity check: print the first 3 rows
head(dfttu_wide_cross,3)
## id group gender autonomy competence deep organised score
## 1 1 0 1 3.75 4.25 4 3.0 16.666667
## 2 2 1 1 NA NA NA NA 5.555556
## 3 3 1 1 3.25 3.75 4 3.5 41.666667
# and the last 4 rows (checks that the whole file was read in)
tail(dfttu_wide_cross,4)
## id group gender autonomy competence deep organised score
## 291 291 1 1 3.25 3.50 4.0 4.50 83.33333
## 292 292 0 0 NA NA NA NA 19.44444
## 293 293 1 1 4.25 5.00 4.5 4.50 100.00000
## 294 294 1 1 3.50 3.75 4.0 3.25 94.44444
# per-column five-number summary + mean; the NA's rows reveal missing data counts
summary(dfttu_wide_cross)
## id group gender autonomy
## Min. : 1.00 Min. :0.0000 Min. :0.0000 Min. :1.500
## 1st Qu.: 74.25 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:3.250
## Median :147.50 Median :0.0000 Median :1.0000 Median :3.750
## Mean :147.50 Mean :0.4762 Mean :0.7007 Mean :3.687
## 3rd Qu.:220.75 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:4.000
## Max. :294.00 Max. :1.0000 Max. :1.0000 Max. :5.000
## NA's :59
## competence deep organised score
## Min. :1.250 Min. :2.000 Min. :1.000 Min. : 0.00
## 1st Qu.:3.500 1st Qu.:3.750 1st Qu.:3.250 1st Qu.: 22.22
## Median :4.000 Median :4.000 Median :3.750 Median : 44.44
## Mean :3.834 Mean :4.021 Mean :3.695 Mean : 45.63
## 3rd Qu.:4.250 3rd Qu.:4.250 3rd Qu.:4.250 3rd Qu.: 69.44
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :100.00
## NA's :59 NA's :59 NA's :59
# list the variable (column) names
colnames(dfttu_wide_cross)
## [1] "id" "group" "gender" "autonomy" "competence"
## [6] "deep" "organised" "score"
# open the data frame in RStudio's spreadsheet-style viewer (interactive sessions only)
View(dfttu_wide_cross)
# install.packages("dplyr")
library(dplyr)
# install.packages("stats") # 'stats' ships with base R; kept for completeness
library(stats)
# we will create a simple version of the "dfttu_wide_cross" df and practice some often needed actions
# select some columns into a new df (in this order)
df <- dplyr::select(dfttu_wide_cross, group, gender, autonomy, score, id)
# sort rows
df <- df[order(df$id, df$group), ] # sort by id and group
# change the order of the columns
df <- df[, c(5, 1:4)] # the last "id" becomes the first
# change column names
# FIX: setnames() belongs to the data.table package, which is not loaded here,
# so the original call errored. dplyr::rename(new_name = old_name) does the
# same job on a plain data frame.
df <- dplyr::rename(df, participant = id)
# create index variable (column)
df$index <- seq.int(nrow(df))
# remove "index" column
df <- subset(df, select = -c(index)) # use <- for assignment, not =
# replace all "NA" values with "999" (typical missing value for SPSS)
df[is.na(df)] <- 999
# undo that operation (turn the 999 codes back into NA)
df[df == 999] <- NA
# select random sample of 150 rows
set.seed(2021) # seed makes the "random" sample reproducible
df_n150 <- df[sample(nrow(df), 150), ]
# write new df's
write.table(df, file = "dfttu_wide_cross_test.txt", row.names = FALSE, sep = "\t")
write.table(df_n150, file = "dfttu_wide_cross_test_n150.txt", row.names = FALSE, sep = "\t")
# remove them from the RStudio environment (=memory)
remove(df, df_n150)
# we will return to the original full df "dfttu_wide_cross"
# # remove participants with NA in "score"
# # NOTE(review): subset() drops rows where the condition is NA, so these also
# # remove the NA rows implicitly
# dfttu_wide_cross_test1 <- subset(dfttu_wide_cross, score > -1) # need to use "-1" as the score has values from 0 to 100
# dfttu_wide_cross_test2 <- subset(dfttu_wide_cross, score > -1 & autonomy > 0) # no NA's in score AND autonomy
# dfttu_wide_cross_test3 <- subset(dfttu_wide_cross, score > -1 | autonomy > 0) # no NA's in score OR autonomy
# create new factor (character) variable "score3class" based on "score":
# lowest value to 33.3 -> 1 = low
# 33.4 - 66.6 -> 2 = medium
# 66.7 to highest value -> 3 = high
# cut() bins are right-closed by default: (-1, 33.3], (33.3, 66.6], (66.6, Inf]
dfttu_wide_cross$score3class_chr <- cut(dfttu_wide_cross$score, breaks = c(-1, 33.3, 66.6, Inf), labels = c("Low score", "Medium score", "High score"))
# create new numeric variable "score3class" based on "score":
dfttu_wide_cross$score3class <- cut(dfttu_wide_cross$score, breaks = c(-1, 33.3, 66.6, Inf), labels = FALSE) # produces values 1,2,3
# but we would like to store values 0,1,2 instead:
dfttu_wide_cross$score3class <- dfttu_wide_cross$score3class -1
# create new factor "autonomy_chr" based on "autonomy":
# 1 totally disagree
# 2 disagree
# 3 neutral
# 4 agree
# 5 totally agree
# NOTE(review): autonomy is a scale mean, so fractional values occur; e.g. 1.5
# falls into (1, 2] and is labelled "Disagree"
dfttu_wide_cross$autonomy_chr <- cut(dfttu_wide_cross$autonomy, breaks = c(-1, 1, 2, 3, 4, Inf), labels = c("Totally disagree", "Disagree", "Neutral", "Agree", "Totally agree"))
# convert numeric "gender" (0,1) to character "gender_chr" (Female,Male)
dfttu_wide_cross$gender_chr <- ifelse(dfttu_wide_cross$gender < 1, "Female", "Male")
# convert numeric "group" (0,1) to character "group_chr" (Control, Treatment)
dfttu_wide_cross$group_chr <- ifelse(dfttu_wide_cross$group < 1, "Control", "Treatment")
# save changes into file
write.table(dfttu_wide_cross, file = "dfttu_wide_cross_final.txt", row.names = FALSE, sep= "\t")
Generally, we have three options if missing data exists:
1. FULL DATA (do nothing, leave NA’s)
2. IMPUTED DATA (impute NA’s)
2.1 simple mean/median imputation
2.2 multiple imputation
3. COMPLETE DATA (delete all NA’s)
# -------------------------------
# 1. FULL DATA, incl. missing values
# install.packages("JointAI")
library(JointAI)
# plot all variables
# histograms of the selected columns; 'breaks' sets the number of bins
plot_all(dfttu_wide_cross[c("autonomy","competence","deep","organised","score","score3class")], fill = 'lightblue', border = 'black',
# ncol = 4, # how many columns?
breaks = 30)
# display missing data by pattern
JointAI::md_pattern(dfttu_wide_cross)
# FIX: spell out TRUE instead of the shorthand T (T is an ordinary variable
# that can be reassigned; TRUE cannot)
mice::md.pattern(dfttu_wide_cross, rotate.names = TRUE)
## id group gender score score3class_chr score3class gender_chr group_chr
## 235 1 1 1 1 1 1 1 1
## 59 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## autonomy competence deep organised autonomy_chr
## 235 1 1 1 1 1 0
## 59 0 0 0 0 0 5
## 59 59 59 59 59 295
It seems that the participants have either a full set of responses or a completely missing survey profile (autonomy, competence, …).
Next we do imputation with mean values (not usually recommended).
# -------------
# 2. IMPUTATION
# 2.1 Simple mean (or median) imputation
# create a copy of df
df.impMean <- dfttu_wide_cross
# replace one variable (autonomy) NA's with mean (average)
df.impMean$autonomy[is.na(df.impMean$autonomy)] <- mean(df.impMean$autonomy, na.rm=TRUE) # mean replace NA's in "autonomy"
# replace all variables with NA's with mean (average)
# NOTE: mean() of the non-numeric column (autonomy_chr) is NA and emits a
# warning, so that column's NA's simply stay NA here
for(i in seq_len(ncol(df.impMean))) { # seq_len() is safer than 1:ncol() (no 1:0 trap)
df.impMean[ , i][is.na(df.impMean[ , i])] <- mean(df.impMean[ , i], na.rm=TRUE)
}
# replace only numeric variables with NA's with mean (average)
df.impMean[,sapply(df.impMean, is.numeric)] <- lapply(df.impMean[,sapply(df.impMean, is.numeric)],
function(x){
ifelse(is.na(x), mean(x, na.rm = TRUE), x) # last expression is returned; no assignment needed
}
)
# re-calculate missing "autonomy_chr" values from the now-imputed "autonomy"
df.impMean$autonomy_chr <- cut(df.impMean$autonomy, breaks = c(-1, 1, 2, 3, 4, Inf), labels = c("Totally disagree", "Disagree", "Neutral", "Agree", "Totally agree"))
# save mean imputed df
write.table(df.impMean, file = "dfttu_wide_cross_impMean.txt", row.names = FALSE, sep= "\t")
# read mean imputed df
dfttu_wide_cross.impMean <- read.table("dfttu_wide_cross_impMean.txt", header=TRUE, sep= "\t")
Now we have a mean imputed data without NA’s. Next step is to do multiple imputation (MI), where several imputed data frames (in this case 5) are created.
# -------------
# 2. IMPUTATION
# 2.2 Multiple imputation
# install.packages("mice")
library(mice)
# install.packages("VIM")
library(VIM)
# plot NA's once again with mice
# md.pattern() prints the missing-data pattern table and draws it as a plot
md.pattern(dfttu_wide_cross, rotate.names = TRUE)
## id group gender score score3class_chr score3class gender_chr group_chr
## 235 1 1 1 1 1 1 1 1
## 59 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## autonomy competence deep organised autonomy_chr
## 235 1 1 1 1 1 0
## 59 0 0 0 0 0 5
## 59 59 59 59 59 295
# and with VIM # https://datascienceplus.com/imputing-missing-data-with-r-mice-package/
# aggr() draws a histogram of the missing proportion per variable plus the
# combination pattern; sortVars=TRUE also prints variables sorted by missingness
dfttuNA_plot <- aggr(dfttu_wide_cross, col=c('lightblue','lightgrey'), numbers=TRUE, sortVars=TRUE, labels=names(dfttu_wide_cross), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## autonomy 0.2006803
## competence 0.2006803
## deep 0.2006803
## organised 0.2006803
## autonomy_chr 0.2006803
## id 0.0000000
## group 0.0000000
## gender 0.0000000
## score 0.0000000
## score3class_chr 0.0000000
## score3class 0.0000000
## gender_chr 0.0000000
## group_chr 0.0000000
# remove "_chr" variables (as they are duplicates for their numerical versions)
# NOTE(review): this select also drops "id", "score3class" and "score3class_chr",
# not only the "_chr" duplicates — keeping only the modelling variables
dfttu_wide_cross_MI <- dplyr::select(dfttu_wide_cross, c("group", "gender", "autonomy", "competence", "deep", "organised", "score"))
# perform "predictive mean matching" (PMM) imputation
df.impPMM <- mice(dfttu_wide_cross_MI,
m=5, # create 5 imputed df's
maxit=60, # n of iterations
meth='pmm', # other methods: > methods(mice)
print=FALSE,
seed=2021) # replicable imputation
# for which vars PMM was applied for?
# (variables with no missing values get an empty "" method)
df.impPMM$meth
## group gender autonomy competence deep organised score
## "" "" "pmm" "pmm" "pmm" "pmm" ""
# group gender autonomy competence deep organised score
# "" "" "pmm" "pmm" "pmm" "pmm" ""
# we see that missing data were in autonomy, competence, deep and organised vars
# # view summary of imputed data
# summary(df.impPMM)
# view imputed values (5 for each variable)
# $imp holds, per variable, one column per imputation (rows = originally missing cases)
df.impPMM$imp
## $group
## [1] 1 2 3 4 5
## <0 rows> (or 0-length row.names)
##
## $gender
## [1] 1 2 3 4 5
## <0 rows> (or 0-length row.names)
##
## $autonomy
## 1 2 3 4 5
## 2 2.00 3.50 3.00 2.75 3.50
## 5 3.00 3.00 4.00 4.75 3.50
## 8 4.25 3.00 3.25 4.00 2.00
## 9 4.00 3.75 2.25 3.75 4.00
## 13 4.75 3.75 4.50 2.00 4.00
## 22 4.75 4.50 4.00 3.00 3.50
## 27 3.75 2.00 2.25 4.25 4.25
## 30 4.00 3.25 3.75 4.75 3.25
## 33 4.75 3.50 3.75 3.75 4.00
## 39 4.00 3.25 3.50 4.75 3.25
## 42 5.00 3.00 2.00 5.00 3.75
## 43 3.00 4.25 3.50 4.25 3.50
## 45 2.75 4.00 3.75 4.50 3.50
## 46 3.75 3.50 3.25 4.50 4.00
## 47 3.25 3.25 4.00 3.50 3.50
## 49 3.50 2.00 2.75 2.25 5.00
## 57 4.00 4.75 4.00 3.50 5.00
## 62 3.00 3.25 4.75 3.00 3.50
## 69 4.00 3.00 2.50 4.75 3.00
## 80 4.25 2.75 3.50 3.75 3.50
## 95 4.75 4.25 3.50 3.50 2.50
## 103 3.00 3.50 4.25 4.00 3.75
## 109 3.75 3.75 4.00 4.00 4.50
## 115 4.25 4.00 4.00 4.00 3.75
## 116 4.50 3.00 4.00 3.75 4.25
## 118 4.00 2.00 4.00 2.25 4.75
## 122 4.25 3.50 3.50 3.50 3.75
## 135 3.50 3.75 4.00 4.75 4.75
## 141 4.25 3.75 3.50 4.75 3.25
## 144 4.25 4.00 3.75 5.00 4.00
## 145 5.00 4.25 4.00 3.00 4.25
## 162 3.25 4.25 3.00 3.50 2.50
## 164 3.50 4.00 3.25 3.75 2.25
## 167 3.50 2.00 4.50 3.25 3.75
## 178 4.50 3.50 4.00 3.75 4.25
## 182 3.75 4.50 2.00 4.00 4.75
## 185 3.00 4.00 3.75 3.75 2.25
## 191 4.00 5.00 4.25 4.25 2.00
## 196 3.75 3.50 3.50 4.00 3.00
## 206 3.00 3.75 3.75 3.75 4.25
## 208 3.50 3.75 3.00 3.75 4.00
## 209 3.25 3.00 3.00 3.50 4.50
## 210 4.00 2.25 3.00 4.75 3.00
## 221 4.50 5.00 3.75 4.00 4.75
## 222 3.50 4.00 3.75 3.00 3.75
## 230 3.50 2.75 2.25 4.50 4.75
## 244 4.00 4.50 4.25 3.75 4.00
## 247 4.25 4.75 4.00 3.50 5.00
## 248 3.75 3.75 3.50 4.00 5.00
## 250 2.75 4.75 5.00 4.00 4.50
## 251 4.25 5.00 3.50 4.00 3.75
## 254 3.00 4.50 4.75 3.75 4.25
## 255 2.75 3.75 3.50 2.25 4.00
## 256 3.75 4.00 4.25 3.25 3.00
## 267 3.00 4.00 4.75 4.75 4.25
## 273 3.25 2.00 3.00 3.00 3.75
## 282 5.00 4.00 4.25 3.75 3.75
## 289 5.00 3.50 4.00 4.00 3.00
## 292 2.25 3.00 3.00 3.75 2.75
##
## $competence
## 1 2 3 4 5
## 2 3.75 4.00 3.25 2.75 4.75
## 5 4.00 2.25 4.50 4.00 3.25
## 8 3.75 3.75 2.75 4.00 4.00
## 9 3.75 4.00 2.50 3.75 4.00
## 13 5.00 3.50 4.00 2.50 4.50
## 22 5.00 3.50 4.00 3.75 4.25
## 27 3.75 1.25 3.00 4.00 4.00
## 30 3.75 4.00 3.75 4.25 3.75
## 33 5.00 2.75 3.75 2.75 4.00
## 39 4.25 1.50 3.25 4.25 2.75
## 42 5.00 3.75 1.25 3.75 4.00
## 43 2.75 5.00 3.75 4.50 4.75
## 45 3.25 4.50 4.25 3.50 3.75
## 46 4.75 3.00 3.50 5.00 3.75
## 47 3.25 4.00 3.50 4.25 4.50
## 49 3.75 2.50 3.00 3.00 4.50
## 57 4.00 5.00 4.00 3.25 4.50
## 62 3.50 3.75 5.00 3.50 3.75
## 69 4.25 2.50 4.00 4.75 3.25
## 80 4.25 2.50 4.00 4.50 3.50
## 95 4.75 4.50 4.50 3.00 2.75
## 103 3.25 4.50 4.25 4.75 4.00
## 109 3.75 3.50 4.00 4.25 4.50
## 115 4.00 4.75 3.50 4.00 4.00
## 116 4.25 2.25 4.00 4.50 5.00
## 118 4.00 3.75 3.25 2.75 5.00
## 122 3.50 4.00 3.75 3.25 3.75
## 135 4.00 3.25 3.75 4.75 4.75
## 141 5.00 4.00 4.75 5.00 3.25
## 144 4.75 4.75 4.25 4.75 4.25
## 145 4.50 4.75 4.25 3.50 3.50
## 162 3.75 4.25 3.75 3.75 2.75
## 164 3.50 3.75 3.75 3.25 3.75
## 167 3.75 4.00 4.75 3.75 4.25
## 178 4.00 2.75 4.00 3.50 4.50
## 182 3.00 4.50 1.25 4.00 4.00
## 185 3.25 4.25 3.50 4.25 2.75
## 191 4.00 4.25 4.00 3.00 2.75
## 196 4.50 3.25 4.00 4.00 3.25
## 206 3.25 4.00 4.00 4.00 4.00
## 208 4.25 3.25 3.00 4.00 4.00
## 209 3.25 2.75 3.00 4.00 4.00
## 210 4.50 3.75 3.50 4.75 1.50
## 221 5.00 4.75 4.00 4.00 4.50
## 222 3.50 3.75 4.00 2.50 3.75
## 230 2.75 2.50 2.75 4.75 4.50
## 244 3.00 4.50 4.00 3.75 3.50
## 247 3.75 4.25 4.75 3.50 4.75
## 248 4.25 4.25 4.00 4.50 3.50
## 250 2.75 4.50 5.00 4.50 4.00
## 251 4.75 5.00 4.00 4.25 4.25
## 254 3.00 4.00 3.50 3.75 4.00
## 255 2.75 4.00 3.50 3.75 3.00
## 256 4.50 4.50 4.00 3.50 2.50
## 267 3.75 4.00 5.00 3.75 4.00
## 273 4.00 4.00 4.00 3.00 4.25
## 282 5.00 3.75 4.75 3.75 3.50
## 289 4.75 3.25 4.25 4.75 3.75
## 292 3.00 4.00 2.75 4.00 3.50
##
## $deep
## 1 2 3 4 5
## 2 3.75 4.00 3.50 3.75 4.00
## 5 4.25 3.00 4.50 3.75 3.75
## 8 4.00 3.75 4.00 4.50 3.75
## 9 4.00 3.75 3.50 3.75 4.00
## 13 4.00 3.50 4.25 3.50 4.00
## 22 4.50 4.25 4.50 4.00 3.50
## 27 4.00 4.00 4.00 4.50 4.00
## 30 4.50 4.00 4.25 3.50 3.75
## 33 4.00 4.25 3.75 3.50 4.00
## 39 4.25 3.00 3.50 4.00 3.00
## 42 4.00 4.25 4.00 4.00 4.50
## 43 4.50 5.00 3.50 4.75 4.50
## 45 4.00 4.50 4.50 4.75 3.50
## 46 4.75 3.50 3.00 4.25 4.00
## 47 3.50 3.50 4.00 4.50 4.00
## 49 3.75 3.75 3.50 3.50 4.00
## 57 4.00 5.00 4.00 3.25 4.00
## 62 3.00 4.00 4.00 3.25 4.00
## 69 3.75 3.50 4.25 4.50 3.00
## 80 4.75 3.75 3.75 5.00 3.50
## 95 5.00 4.75 4.50 3.50 3.50
## 103 3.00 4.00 4.50 4.25 3.75
## 109 4.25 3.75 3.50 4.25 4.00
## 115 4.00 3.00 3.50 5.00 4.25
## 116 4.00 4.25 4.50 4.00 3.75
## 118 4.25 4.00 3.75 3.50 4.50
## 122 4.00 4.00 4.25 4.00 4.25
## 135 4.00 4.00 3.50 4.00 4.00
## 141 4.75 4.00 3.75 4.00 3.75
## 144 4.75 4.50 4.25 4.25 4.00
## 145 5.00 4.50 4.00 3.00 4.00
## 162 4.00 4.00 4.00 3.50 3.50
## 164 4.00 3.00 4.00 4.00 3.00
## 167 4.50 4.50 4.00 4.00 4.00
## 178 3.75 3.00 3.75 3.00 4.25
## 182 3.50 3.75 3.50 2.00 3.50
## 185 3.00 4.00 4.00 4.25 4.00
## 191 2.75 4.00 4.00 3.75 3.25
## 196 4.50 2.75 4.25 3.50 3.00
## 206 4.25 2.25 4.00 4.25 3.75
## 208 4.00 4.25 3.75 4.75 4.00
## 209 3.75 3.50 4.00 5.00 4.75
## 210 3.50 4.50 4.00 5.00 3.50
## 221 4.50 4.00 3.50 4.50 4.00
## 222 4.00 3.75 3.00 4.00 3.75
## 230 4.25 2.75 3.50 5.00 3.75
## 244 3.25 4.00 4.75 4.00 4.50
## 247 4.00 3.50 4.25 2.25 4.50
## 248 4.00 4.00 4.00 4.00 3.50
## 250 3.75 5.00 4.50 4.25 4.00
## 251 4.50 3.50 4.50 4.50 3.75
## 254 4.00 4.00 3.50 4.00 4.00
## 255 4.00 4.25 5.00 3.50 3.50
## 256 4.25 3.75 3.50 3.75 3.25
## 267 3.50 4.50 4.50 3.50 4.25
## 273 4.00 4.00 4.50 4.00 4.25
## 282 4.50 3.50 4.75 3.00 3.50
## 289 4.50 2.75 4.75 4.00 3.00
## 292 4.00 4.00 2.75 4.00 3.50
##
## $organised
## 1 2 3 4 5
## 2 2.50 3.50 3.25 3.00 3.50
## 5 4.50 3.00 4.00 2.50 3.00
## 8 3.25 3.75 4.25 4.00 3.25
## 9 3.25 4.25 3.25 2.50 3.25
## 13 4.75 2.50 3.50 3.00 3.25
## 22 3.25 3.75 4.00 4.00 2.50
## 27 3.50 3.25 4.00 3.25 3.25
## 30 4.00 4.25 4.00 3.25 4.00
## 33 4.00 4.00 3.25 2.50 4.50
## 39 4.00 3.00 3.50 4.50 3.00
## 42 4.00 3.25 3.25 3.25 3.00
## 43 3.25 4.00 3.25 3.75 2.75
## 45 3.25 4.00 5.00 3.75 3.25
## 46 5.00 2.50 3.50 4.50 3.25
## 47 4.00 3.50 4.00 3.75 4.25
## 49 3.50 3.00 4.25 2.50 4.00
## 57 4.25 5.00 4.00 2.25 4.75
## 62 4.00 3.75 4.00 3.75 4.75
## 69 3.75 3.25 3.50 3.50 3.00
## 80 4.75 2.50 4.25 4.50 3.00
## 95 4.75 4.50 3.75 4.25 2.50
## 103 3.00 4.00 4.00 4.25 4.00
## 109 4.50 3.25 3.50 4.00 4.50
## 115 4.00 4.00 2.50 3.50 4.25
## 116 3.75 3.00 3.75 4.25 4.25
## 118 4.50 3.50 3.25 4.25 4.50
## 122 4.25 3.25 4.25 4.50 4.00
## 135 1.25 3.50 3.75 4.00 4.50
## 141 4.50 4.00 4.50 3.75 3.50
## 144 5.00 4.25 2.75 4.00 3.50
## 145 5.00 3.75 3.25 3.75 1.25
## 162 3.75 3.00 3.25 3.50 4.00
## 164 4.00 3.25 4.25 3.75 3.00
## 167 4.50 5.00 3.75 3.50 4.00
## 178 3.50 3.25 4.00 3.25 3.50
## 182 3.75 3.00 4.00 2.25 2.50
## 185 3.25 4.00 3.50 4.00 3.00
## 191 2.25 4.00 3.25 2.50 4.00
## 196 4.50 3.00 4.00 3.25 3.00
## 206 4.25 2.75 4.75 3.25 3.50
## 208 4.25 4.25 3.00 3.25 4.25
## 209 4.00 2.50 3.25 4.25 4.75
## 210 4.50 3.25 4.50 3.75 4.00
## 221 3.75 4.50 2.25 3.50 2.50
## 222 3.75 3.75 3.25 2.75 2.50
## 230 2.25 3.00 2.50 4.00 2.50
## 244 3.00 4.50 4.75 4.25 4.25
## 247 2.25 3.75 4.50 2.25 4.50
## 248 4.00 3.75 3.75 3.25 2.50
## 250 3.50 3.75 4.50 3.50 2.50
## 251 4.25 4.75 3.50 3.25 3.50
## 254 3.50 4.00 3.00 1.25 4.75
## 255 4.25 4.00 4.50 3.50 3.00
## 256 4.75 2.50 2.50 2.50 3.00
## 267 3.25 3.75 4.25 3.25 4.00
## 273 4.00 4.25 4.50 4.50 3.75
## 282 3.50 3.25 4.50 2.50 3.00
## 289 3.50 3.00 3.00 3.25 2.50
## 292 3.50 3.75 3.25 4.75 2.50
##
## $score
## [1] 1 2 3 4 5
## <0 rows> (or 0-length row.names)
# save the first imputation (out of 5) into complete df
# complete() fills the NA's with the chosen imputation's values
df.impPMM.complete1 <- complete(df.impPMM, 1) # replace "1" with "2" to save the second imputed df etc.
# save PMM imp 1 into df
write.table (df.impPMM.complete1, file = "dfttu_wide_cross_impPMM_complete1.txt", row.names = FALSE, sep= "\t")
# read the df
dfttu_wide_cross.impPMM1 <- read.table("dfttu_wide_cross_impPMM_complete1.txt", header=TRUE, sep= "\t")
# save the whole PMM imputed "package" (5 imputations) as one mids object
# install.packages("miceadds")
library(miceadds)
# FIX: spell out FALSE instead of the reassignable shorthand F;
# mids2spss = FALSE skips exporting SPSS data files
write.mice.imputation(mi.res=df.impPMM, name="dfttu_wide_cross_impPMM", mids2spss=FALSE)
## 2023-02-23 13:33:53
##
## /Users/nokelaip/CharveLEAD Dropbox/Petri Nokelainen/home1_cloud/home1/koulutus/2023/TTU/Session2/R_Session/dfttu_wide_cross_impPMM
##
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## group gender autonomy competence deep organised score
## "" "" "pmm" "pmm" "pmm" "pmm" ""
## PredictorMatrix:
## group gender autonomy competence deep organised score
## group 0 1 1 1 1 1 1
## gender 1 0 1 1 1 1 1
## autonomy 1 1 0 1 1 1 1
## competence 1 1 1 0 1 1 1
## deep 1 1 1 1 0 1 1
## organised 1 1 1 1 1 0 1
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## group gender autonomy competence deep organised score
## "" "" "pmm" "pmm" "pmm" "pmm" ""
## PredictorMatrix:
## group gender autonomy competence deep organised score
## group 0 1 1 1 1 1 1
## gender 1 0 1 1 1 1 1
## autonomy 1 1 0 1 1 1 1
## competence 1 1 1 0 1 1 1
## deep 1 1 1 1 0 1 1
## organised 1 1 1 1 1 0 1
##
##
## variable MissProp Rhat.M.imp Rhat.Var.imp
## 1 group 0.00000 NA NA
## 2 gender 0.00000 NA NA
## 3 autonomy 20.06803 1.0120040 1.009872
## 4 competence 20.06803 1.0094990 1.060325
## 5 deep 20.06803 0.9950371 1.009788
## 6 organised 20.06803 0.9989290 1.003341
## 7 score 0.00000 NA NA
## variable MissProp Rhat.M.imp Rhat.Var.imp
## 1 group 0.000 NA NA
## 2 gender 0.000 NA NA
## 3 autonomy 20.068 1.012 1.010
## 4 competence 20.068 1.009 1.060
## 5 deep 20.068 0.995 1.010
## 6 organised 20.068 0.999 1.003
## 7 score 0.000 NA NA
##
## To cite R in publications use:
##
## R Core Team (2022). R: A language and environment for statistical
## computing. R Foundation for Statistical Computing, Vienna, Austria.
## URL https://www.R-project.org/.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {R: A Language and Environment for Statistical Computing},
## author = {{R Core Team}},
## organization = {R Foundation for Statistical Computing},
## address = {Vienna, Austria},
## year = {2022},
## url = {https://www.R-project.org/},
## }
##
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.
##
##
## To cite mice in publications use:
##
## Stef van Buuren, Karin Groothuis-Oudshoorn (2011). mice: Multivariate
## Imputation by Chained Equations in R. Journal of Statistical
## Software, 45(3), 1-67. DOI 10.18637/jss.v045.i03.
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## title = {{mice}: Multivariate Imputation by Chained Equations in R},
## author = {Stef {van Buuren} and Karin Groothuis-Oudshoorn},
## journal = {Journal of Statistical Software},
## year = {2011},
## volume = {45},
## number = {3},
## pages = {1-67},
## doi = {10.18637/jss.v045.i03},
## }
##
## sysname
## "Darwin"
## release
## "22.3.0"
## version
## "Darwin Kernel Version 22.3.0: Thu Jan 5 20:48:54 PST 2023; root:xnu-8792.81.2~2/RELEASE_ARM64_T6000"
## nodename
## "wks-98846-mac.ad.tuni.fi"
## machine
## "arm64"
## login
## "root"
## user
## "nokelaip"
## effective_user
## "nokelaip"
## R version 4.2.2 (2022-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] miceadds_3.16-18 VIM_6.2.2 colorspace_2.1-0
## [4] tsibble_1.1.3 tinytex_0.39 forcats_1.0.0
## [7] stringr_1.5.0 dplyr_1.1.0 purrr_1.0.1
## [10] readr_2.1.2 tibble_3.1.8 tidyverse_1.3.2
## [13] tidyr_1.3.0 texreg_1.38.6 tableone_0.13.2
## [16] supportR_0.1.2 stargazer_5.2.3 sjstats_0.18.2
## [19] sjPlot_2.8.12 simr_1.0.6 sandwich_3.0-2
## [22] RColorBrewer_1.1-3 psych_2.2.9 plyr_1.8.8
## [25] pan_1.6 optmatch_0.10.6 nullabor_0.3.9
## [28] nlme_3.1-162 naniar_1.0.0 multilevelTools_0.1.1
## [31] mlmRev_1.0-8 mice_3.15.0 MatchIt_4.5.0
## [34] lmtest_0.9-40 zoo_1.8-11 lmerTest_3.1-3
## [37] lme4_1.1-31 Matrix_1.5-1 lattice_0.20-45
## [40] JWileymisc_1.4.0 JointAI_1.0.4 intervals_0.15.2
## [43] HLMdiag_0.5.0 gridExtra_2.3 gmodels_2.18.1.1
## [46] ggraph_2.1.0 ggpubr_0.6.0 ggplot2_3.4.1
## [49] gapminder_0.3.0 finalfit_1.0.6 extraoperators_0.1.1
## [52] emmeans_1.8.4-1 doBy_4.6.16 devtools_2.4.5
## [55] usethis_2.1.6 data.table_1.14.8 compareGroups_4.6.0
## [58] apaTables_2.0.8
##
## loaded via a namespace (and not attached):
## [1] vcd_1.4-11 Hmisc_4.7-0 svglite_2.1.0
## [4] class_7.3-20 ps_1.7.2 laeken_0.5.2
## [7] crayon_1.5.2 MASS_7.3-58.1 backports_1.4.1
## [10] reprex_2.0.1 rlang_1.0.6 readxl_1.4.0
## [13] performance_0.10.2 SparseM_1.81 nloptr_2.0.3
## [16] callr_3.7.3 flextable_0.7.1 glue_1.6.2
## [19] pbkrtest_0.5.2 parallel_4.2.2 processx_3.8.0
## [22] VGAM_1.1-7 haven_2.5.0 tidyselect_1.2.0
## [25] sjmisc_2.8.9 chron_2.3-57 xtable_1.8-4
## [28] MatrixModels_0.5-1 magrittr_2.0.3 evaluate_0.20
## [31] gdtools_0.2.4 cli_3.6.0 rstudioapi_0.13
## [34] miniUI_0.1.1.1 sp_1.5-0 rjags_4-13
## [37] bslib_0.3.1 rpart_4.1.19 mathjaxr_1.6-0
## [40] sjlabelled_1.2.0 shiny_1.7.1 xfun_0.31
## [43] pkgbuild_1.3.1 cluster_2.1.4 tidygraph_1.2.1
## [46] quantreg_5.94 ggrepel_0.9.3 listenv_0.8.0
## [49] png_0.1-7 future_1.26.1 withr_2.5.0
## [52] ggforce_0.3.3 ranger_0.14.1 cellranger_1.1.0
## [55] e1071_1.7-11 survey_4.1-1 coda_0.19-4
## [58] pillar_1.8.1 cachem_1.0.6 multcomp_1.4-20
## [61] fs_1.6.1 flexmix_2.3-18 kernlab_0.9-31
## [64] vctrs_0.5.2 pbivnorm_0.6.0 ellipsis_0.3.2
## [67] generics_0.1.3 tools_4.2.2 foreign_0.8-83
## [70] munsell_0.5.0 tweenr_1.0.2 proxy_0.4-27
## [73] anytime_0.3.9 fastmap_1.1.0 compiler_4.2.2
## [76] pkgload_1.3.2 abind_1.4-5 httpuv_1.6.5
## [79] sessioninfo_1.2.2 rms_6.3-0 utf8_1.2.3
## [82] later_1.3.0 jsonlite_1.8.4 scales_1.2.1
## [85] carData_3.0-5 estimability_1.4.1 promises_1.2.0.1
## [88] car_3.1-1 latticeExtra_0.6-29 checkmate_2.1.0
## [91] fstcore_0.9.12 rmarkdown_2.14 cowplot_1.1.1
## [94] moments_0.14.1 webshot_0.5.3 igraph_1.3.1
## [97] survival_3.4-0 numDeriv_2016.8-1.1 yaml_2.3.5
## [100] plotrix_3.8-2 systemfonts_1.0.4 prabclus_2.3-2
## [103] htmltools_0.5.2 memoise_2.0.1 lavaan_0.6-14
## [106] profvis_0.3.7 modeltools_0.2-23 graphlayouts_0.8.0
## [109] quadprog_1.5-8 viridisLite_0.4.1 digest_0.6.31
## [112] assertthat_0.2.1 mime_0.12 bayestestR_0.13.0
## [115] remotes_2.4.2 urlchecker_1.0.1 splines_4.2.2
## [118] Formula_1.2-4 fpc_2.2-9 googledrive_2.0.0
## [121] broom_1.0.3 hms_1.1.2 modelr_0.1.8
## [124] microbenchmark_1.4.9 base64enc_0.1-3 mnormt_2.1.1
## [127] nnet_7.3-18 sass_0.4.1 binom_1.1-1.1
## [130] Rcpp_1.0.10 mclust_5.4.10 mvtnorm_1.1-3
## [133] multcompView_0.1-8 fansi_1.0.4 tzdb_0.3.0
## [136] truncnorm_1.0-8 parallelly_1.32.0 R6_2.5.1
## [139] lifecycle_1.0.3 polspline_1.1.20 zip_2.2.0
## [142] writexl_1.4.0 datawizard_0.6.5 ggsignif_0.6.3
## [145] googlesheets4_1.0.0 minqa_1.2.5 gdata_2.18.0.1
## [148] jquerylib_0.1.4 snakecase_0.11.0 robustbase_0.95-0
## [151] TH.data_1.1-1 iterators_1.0.14 htmlwidgets_1.5.4
## [154] officer_0.4.2 polyclip_1.10-0 rvest_1.0.2
## [157] mgcv_1.8-41 globals_0.15.0 insight_0.19.0
## [160] htmlTable_2.4.0 codetools_0.2-18 lubridate_1.8.0
## [163] gtools_3.9.4 prettyunits_1.1.1 dbplyr_2.2.0
## [166] gtable_0.3.1 DBI_1.1.2 stats4_4.2.2
## [169] visdat_0.5.3 highr_0.9 httr_1.4.3
## [172] stringi_1.7.12 reshape2_1.4.4 farver_2.1.1
## [175] uuid_1.1-0 diptest_0.76-0 viridis_0.6.2
## [178] ggthemes_4.2.4 xml2_1.3.3 boot_1.3-28
## [181] fst_0.9.8 diagonals_6.0.0 kableExtra_1.3.4
## [184] ggeffects_1.1.2 DEoptimR_1.0-11 jpeg_0.1-9
## [187] Deriv_4.1.3 janitor_2.1.0 pkgconfig_2.0.3
## [190] gargle_1.2.0 rstatix_0.7.2 mitools_2.4
## [193] HardyWeinberg_1.7.5 RLRsim_3.1-8 Rsolnp_1.16
## [196] knitr_1.39
##
## 1
## 2
## 3
## 4
## 5
# load the saved imputation object back (creates "mi.res" in the environment)
load("dfttu_wide_cross_impPMM/dfttu_wide_cross_impPMM.Rdata")
# class(mi.res) # mids
# check if the imputation has gone well ... this is done with the whole "set" of 5 imputed df's
# plot observed (blue) against imputed (red)
xyplot(df.impPMM, competence ~ autonomy + deep + organised,
color=mdc(1:2), # blue = observed, red = imputed
pch=20, # dot shape
cex=1 # dot size
)
# density plotting (blue = original)
# imputed distributions should roughly follow the observed one
densityplot(df.impPMM) # successful imputation
# densityplot(get("mi.res")) # similar plot with the saved mi object
# --------------------------------------
# Estimate model with different datasets
# run linear regression with competence (dependent variable) and predictor variables (autonomy, score)
# first, data with NA's
# lm() applies listwise deletion, so the 59 incomplete cases are dropped
Competence_assumptions <- lm(competence ~ autonomy + score + gender + group, dfttu_wide_cross)
summary(Competence_assumptions)
##
## Call:
## lm(formula = competence ~ autonomy + score + gender + group,
## data = dfttu_wide_cross)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.82918 -0.27635 0.03015 0.27235 1.28052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.073315 0.177185 6.058 5.59e-09 ***
## autonomy 0.680008 0.045479 14.952 < 2e-16 ***
## score 0.002613 0.001102 2.371 0.0186 *
## gender 0.099692 0.070767 1.409 0.1603
## group 0.116768 0.062926 1.856 0.0648 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4818 on 230 degrees of freedom
## (59 observations deleted due to missingness)
## Multiple R-squared: 0.5272, Adjusted R-squared: 0.5189
## F-statistic: 64.1 on 4 and 230 DF, p-value: < 2.2e-16
# second, complete data
# create a "complete" version of df (remove all rows, that is, participants who have NA's)
dfttu_wide_cross_complete <- na.omit(dfttu_wide_cross)
# identical estimates to the full-data model are expected, since lm() already
# deleted the same incomplete rows there
Competence_assumptions_complete <- lm(competence ~ autonomy + score + gender + group, dfttu_wide_cross_complete)
summary(Competence_assumptions_complete)
##
## Call:
## lm(formula = competence ~ autonomy + score + gender + group,
## data = dfttu_wide_cross_complete)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.82918 -0.27635 0.03015 0.27235 1.28052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.073315 0.177185 6.058 5.59e-09 ***
## autonomy 0.680008 0.045479 14.952 < 2e-16 ***
## score 0.002613 0.001102 2.371 0.0186 *
## gender 0.099692 0.070767 1.409 0.1603
## group 0.116768 0.062926 1.856 0.0648 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4818 on 230 degrees of freedom
## Multiple R-squared: 0.5272, Adjusted R-squared: 0.5189
## F-statistic: 64.1 on 4 and 230 DF, p-value: < 2.2e-16
# third, simple mean imputed data
# all 294 rows are used here because the NA's were replaced by column means
Competence_assumptions_impMean <- lm(competence ~ autonomy + score + gender + group, dfttu_wide_cross.impMean)
summary(Competence_assumptions_impMean)
##
## Call:
## lm(formula = competence ~ autonomy + score + gender + group,
## data = dfttu_wide_cross.impMean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.83887 -0.18039 0.02446 0.19092 1.26716
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1198793 0.1557573 7.190 5.57e-12 ***
## autonomy 0.6843189 0.0405812 16.863 < 2e-16 ***
## score 0.0020216 0.0008705 2.322 0.0209 *
## gender 0.0761898 0.0563849 1.351 0.1777
## group 0.0951308 0.0504660 1.885 0.0604 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4318 on 289 degrees of freedom
## Multiple R-squared: 0.5227, Adjusted R-squared: 0.5161
## F-statistic: 79.14 on 4 and 289 DF, p-value: < 2.2e-16
# fourth, first PMM (predictive mean matching) imputed df
# uses only imputation 1 of 5; the pooled analysis over all 5 follows below
Competence_assumptions_impPMM.complete1 <- lm(competence ~ autonomy + score + gender + group, df.impPMM.complete1)
summary(Competence_assumptions_impPMM.complete1)
##
## Call:
## lm(formula = competence ~ autonomy + score + gender + group,
## data = df.impPMM.complete1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.83887 -0.29766 0.01884 0.29596 1.26006
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.112089 0.156964 7.085 1.06e-11 ***
## autonomy 0.686861 0.040627 16.907 < 2e-16 ***
## score 0.001854 0.000963 1.926 0.0551 .
## gender 0.083779 0.063072 1.328 0.1851
## group 0.112495 0.055729 2.019 0.0445 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.477 on 289 degrees of freedom
## Multiple R-squared: 0.5289, Adjusted R-squared: 0.5224
## F-statistic: 81.12 on 4 and 289 DF, p-value: < 2.2e-16
# ---------------
# Compare results
# let's get a whole picture of the results
# install.packages("sjPlot")
library(sjPlot)
# side-by-side HTML regression table of the three fitted models
sjPlot::tab_model(Competence_assumptions,
Competence_assumptions_complete,
Competence_assumptions_impMean,
# Competence_assumptions_impPMM.complete1,
CSS = list(
css.thead = 'border-top: 1px solid;',
css.lasttablerow = 'border-top: 1px solid;'),
digits = 3,
show.ci = 0.95, # 95% confidence intervals ...
collapse.ci = TRUE, # ... printed inside the estimate column
show.se = TRUE,
show.stat = FALSE,
#show.df = TRUE,
show.re.var = TRUE,
show.r2 = FALSE,
show.icc = TRUE,
show.fstat = TRUE,
show.aic = TRUE,
show.dev = TRUE,
show.loglik = FALSE,
string.pred = "Predictors",
string.est = "Est. (C.I.)",
string.ci = "C.I.",
string.se = "S.E.",
p.style = "numeric",
p.threshold = c(0.05, 0.01, 0.001),
emph.p = TRUE,
dv.labels = c("Model 1a<sub><i>Full</i></sub>",
"Model 1b<sub><i>Complete</i></sub>",
"Model 1c<sub><i>Mean imp</i></sub>" #,
#"Model 1d<sub><i>PMM imp comp1</i></sub>"
)
)
| Model 1aFull | Model 1bComplete | Model 1cMean imp | |||||||
|---|---|---|---|---|---|---|---|---|---|
| Predictors | Est. (C.I.) | S.E. | p | Est. (C.I.) | S.E. | p | Est. (C.I.) | S.E. | p |
| (Intercept) |
1.073 (0.724 – 1.422) |
0.177 | <0.001 |
1.073 (0.724 – 1.422) |
0.177 | <0.001 |
1.120 (0.813 – 1.426) |
0.156 | <0.001 |
| autonomy |
0.680 (0.590 – 0.770) |
0.045 | <0.001 |
0.680 (0.590 – 0.770) |
0.045 | <0.001 |
0.684 (0.604 – 0.764) |
0.041 | <0.001 |
| score |
0.003 (0.000 – 0.005) |
0.001 | 0.019 |
0.003 (0.000 – 0.005) |
0.001 | 0.019 |
0.002 (0.000 – 0.004) |
0.001 | 0.021 |
| gender |
0.100 (-0.040 – 0.239) |
0.071 | 0.160 |
0.100 (-0.040 – 0.239) |
0.071 | 0.160 |
0.076 (-0.035 – 0.187) |
0.056 | 0.178 |
| group |
0.117 (-0.007 – 0.241) |
0.063 | 0.065 |
0.117 (-0.007 – 0.241) |
0.063 | 0.065 |
0.095 (-0.004 – 0.194) |
0.050 | 0.060 |
| Observations | 235 | 235 | 294 | ||||||
| Deviance | 53.386 | 53.386 | 53.884 | ||||||
| AIC | 330.621 | 330.621 | 347.491 | ||||||
# we notice that full, complete and mean imputed data produce similar results
# how about the full (five times) PMM imputed data?
# fit the same regression within each of the m = 5 imputed data sets;
# with() on a mids object returns a mira object holding all five fits
Competence_assumptions_impPMM <- with(df.impPMM, lm(competence ~ autonomy + score + gender + group))
# pool() combines the five fits with Rubin's rules into one set of estimates
pool(Competence_assumptions_impPMM)
## Class: mipo m = 5
## term m estimate ubar b t dfcom
## 1 (Intercept) 5 1.11695950 2.555759e-02 7.025830e-03 3.398859e-02 289
## 2 autonomy 5 0.67386493 1.704194e-03 4.679347e-04 2.265715e-03 289
## 3 score 5 0.00245425 9.814699e-07 3.998122e-07 1.461245e-06 289
## 4 gender 5 0.09350242 4.163013e-03 4.144681e-04 4.660374e-03 289
## 5 group 5 0.10685405 3.299437e-03 2.018758e-04 3.541688e-03 289
## df riv lambda fmi
## 1 49.95987 0.32988222 0.24805371 0.2764505
## 2 50.03127 0.32949408 0.24783419 0.2762011
## 3 31.11599 0.48883275 0.32833289 0.3677084
## 4 148.19901 0.11947158 0.10672141 0.1185373
## 5 203.68616 0.07342192 0.06839987 0.0774145
# these should be low:
# fraction of information missing due to nonresponse (fmi)
# the relative increase in variance due to nonresponse (lambda)
# summary() of the pooled object gives the combined coefficients,
# standard errors, degrees of freedom and p-values across the 5 imputations
summary(pool(Competence_assumptions_impPMM))
## term estimate std.error statistic df p.value
## 1 (Intercept) 1.11695950 0.184359947 6.058580 49.95987 1.779730e-07
## 2 autonomy 0.67386493 0.047599529 14.156966 50.03127 3.967468e-19
## 3 score 0.00245425 0.001208819 2.030286 31.11599 5.094972e-02
## 4 gender 0.09350242 0.068266934 1.369659 148.19901 1.728651e-01
## 5 group 0.10685405 0.059512084 1.795502 203.68616 7.405610e-02
# term estimate std.error statistic df p.value
# 1 (Intercept) 1.104158848 0.204627650 5.395942 23.63079 1.608837e-05
# 2 autonomy 0.673287744 0.048663059 13.835705 43.28428 1.712608e-17 ***
# 3 score 0.002542074 0.001069936 2.375913 94.24786 1.952910e-02 *
# 4 gender 0.098300898 0.069125239 1.422070 107.19081 1.579094e-01
# 5 group 0.121213128 0.057557558 2.105946 266.44022 3.614463e-02 *
# results stay quite similar, except that "group" is now also statistically significant
The last part of dealing with missing data is to delete all rows that contain missing values.
# ----------------
# 3. COMPLETE DATA
# visualise the distributions of the complete df variables
plot_all(dfttu_wide_cross_complete[c("autonomy","competence","deep","organised","score","score3class")], fill = 'lightgreen', border = 'black', breaks = 30)
# persist the complete df as tab-separated text ...
write.table(dfttu_wide_cross_complete, file = "dfttu_wide_cross_complete.txt", row.names = FALSE, sep = "\t")
# ... and read it back in from the same file
dfttu_wide_cross_complete <- read.table(file = "dfttu_wide_cross_complete.txt", header = TRUE, sep = "\t")
#######################
# get some descriptives
#
# Counting idiom: instead of nrow(df[which(cond), ]) we sum the logical
# vector directly — sum(cond, na.rm = TRUE) gives the same count, since
# which() silently drops NA indices and na.rm = TRUE drops NA comparisons.
#
# what is the n of rows in df?
dfttu_nrow <- nrow(dfttu_wide_cross)
#
# how many rows have missing data?
dfttu_nrow_NA <- sum(!complete.cases(dfttu_wide_cross))
#
# what is the % of NA's in df?
dfttu_perc_NA <- dfttu_nrow_NA / dfttu_nrow * 100
#
# how many participants in df?
dfttu_n_id <- length(unique(dfttu_wide_cross$id)) # works also for repeated measurement df
#
# how many females? (gender == 0; NA genders are not counted)
# n
dfttu_n_fem <- sum(dfttu_wide_cross$gender == 0, na.rm = TRUE)
# %
dfttu_perc_fem <- dfttu_n_fem / dfttu_nrow * 100
#
# how many NA's in gender?
# n
dfttu_n_NA_gender <- sum(is.na(dfttu_wide_cross$gender))
# % (relative to the number of participants)
dfttu_perc_NA_gender <- dfttu_n_NA_gender / dfttu_n_id * 100
#
# how many control group members? (group == 0)
# n
dfttu_n_control <- sum(dfttu_wide_cross$group == 0, na.rm = TRUE)
# %
dfttu_perc_control <- dfttu_n_control / dfttu_nrow * 100
#
# how many intervention group members? (group == 1)
# n
dfttu_n_intervention <- sum(dfttu_wide_cross$group == 1, na.rm = TRUE)
# %
dfttu_perc_intervention <- dfttu_n_intervention / dfttu_nrow * 100
Participants
The cross-sectional data has 294 participants of which 88 are females (29.9%). Participants are organised in the control (n = 154, 52.4%) and intervention (n = 140, 47.6%) groups. Both groups have studied the same mathematics course, but the intervention group received the teaching with “flipped learning” pedagogical implementation (including self-study phases via online material and group work/discussions instead of lectures). The control group completed the course in a more traditional way (teacher-led lectures and assistant-led exercises).
Measurements
Higher education students have answered eight questions on a 5-point scale (1 = totally disagree, … , 5 = totally agree) that are related to autonomy (4 questions) and competence (4 questions) during studies. This data contains the mean values from these questions stored in two variables “autonomy” and “competence”. Autonomy measures whether one feels that studying is self-determined and meaningful. Competence relates to one’s experiences of how different tasks during studies develop expertise (instead of being routine tasks). In addition to the self-assessments, the data contains the variable “score”, which holds each participant’s mathematics test score (from 0 to 100 percent).
# run multivariate normality analysis
# we use the complete data here
# install.packages("MVN")
# install.packages("ggplot2")
library(MVN) # https://cran.r-project.org/web/packages/MVN/MVN.pdf
library(ggplot2)
# check columns of "dfttu_wide_cross_complete"
colnames(dfttu_wide_cross_complete)
## [1] "id" "group" "gender" "autonomy"
## [5] "competence" "deep" "organised" "score"
## [9] "score3class_chr" "score3class" "autonomy_chr" "gender_chr"
## [13] "group_chr"
# run diagnostics for "autonomy" and "competence" (columns 4 and 5):
# Mardia's multivariate normality test, Shapiro-Wilk per variable,
# QQ plots, and adjusted Mahalanobis-distance multivariate outlier detection
# NOTE(review): the complete() wrapper looks redundant — this df is already
# the complete-cases version; verify which package's complete() is dispatched
# NOTE(review): alpha = 0.5 seems unusually high — confirm this is intended
mvn(data = complete(dfttu_wide_cross_complete[,4:5]), subset = NULL, mvnTest = "mardia",
covariance = TRUE, tol = 1e-25, alpha = 0.5,
scale = FALSE, desc = FALSE, transform = "none", R = 1000,
univariateTest = "SW", univariatePlot = "qq", multivariatePlot = "qq",
multivariateOutlierMethod = "adj", bc = FALSE, bcType = "rounded",
showOutliers = TRUE, showNewData = FALSE)
## $multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 39.3460662839323 5.90900646336711e-08 NO
## 2 Mardia Kurtosis 4.31398866509343 1.60335170531489e-05 NO
## 3 MVN <NA> <NA> NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk autonomy 0.9602 <0.001 NO
## 2 Shapiro-Wilk competence 0.9441 <0.001 NO
##
## $multivariateOutliers
## Observation Mahalanobis Distance Outlier
## 169 169 24.065 TRUE
## 41 41 23.595 TRUE
## 168 168 20.500 TRUE
## 48 48 17.685 TRUE
## 218 218 16.522 TRUE
## 152 152 15.339 TRUE
## 69 69 13.724 TRUE
## 195 195 13.671 TRUE
## 54 54 13.244 TRUE
## 76 76 12.318 TRUE
## 226 226 12.318 TRUE
## 43 43 11.385 TRUE
## 145 145 10.926 TRUE
## 91 91 10.852 TRUE
## 65 65 10.321 TRUE
## 120 120 10.050 TRUE
## 36 36 9.506 TRUE
# save diagnostics into an object so the outlier list can be extracted
dfttu_mv <- mvn(data = complete(dfttu_wide_cross_complete[,4:5]), subset = NULL, mvnTest = "mardia",
covariance = TRUE, tol = 1e-25, alpha = 0.5,
scale = FALSE, desc = FALSE, transform = "none", R = 1000,
univariateTest = "SW", univariatePlot = "qq", multivariatePlot = "qq",
multivariateOutlierMethod = "adj", bc = FALSE, bcType = "rounded",
showOutliers = TRUE, showNewData = FALSE)
# create index variable (column)
# we need this as the list of outlier participants from the MVN is based on row index values
dfttu_wide_cross_complete$index <- seq.int(nrow(dfttu_wide_cross_complete))
# create a list of outlier id's
# a good way to extract values is to use "str()"
# str(dfttu_mv)
# NOTE(review): Observation may be a factor/character of row labels — the
# %!in% comparison against the integer index relies on coercion; verify
outlier_list <- dfttu_mv$multivariateOutliers$Observation
length(outlier_list) # 17 outliers
## [1] 17
# index_outliers <- c(169,41,168,48,218,152,69,195,54,76,226,43,145,91,65,120,36)
# create new df that omits these outlier participants
# install.packages("extraoperators")
library(extraoperators) # need this for %!in%
# keep only the rows whose index is NOT in the outlier list
dfttu_wide_cross_complete_final <- subset(dfttu_wide_cross_complete, index %!in% outlier_list)
# check that 17 participants have been omitted from the df
length(dfttu_wide_cross_complete$id) # 235 participants
## [1] 235
length(dfttu_wide_cross_complete_final$id) # 218 participants
## [1] 218
## [1] 218
# re-run diagnostics
# after removing the 17 outliers, Mardia's skewness and kurtosis tests
# (multivariate normality) pass — see output below
mvn(data = complete(dfttu_wide_cross_complete_final[,4:5]), subset = NULL, mvnTest = "mardia",
covariance = TRUE, tol = 1e-25, alpha = 0.5,
scale = FALSE, desc = FALSE, transform = "none", R = 1000,
univariateTest = "SW", univariatePlot = "qq", multivariatePlot = "qq",
multivariateOutlierMethod = "adj", bc = FALSE, bcType = "rounded",
showOutliers = TRUE, showNewData = FALSE)
## $multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 4.02210144203792 0.403023008174473 YES
## 2 Mardia Kurtosis -0.26133837262797 0.793831577376506 YES
## 3 MVN <NA> <NA> YES
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk autonomy 0.9667 1e-04 NO
## 2 Shapiro-Wilk competence 0.9543 <0.001 NO
##
## $multivariateOutliers
## Observation Mahalanobis Distance Outlier
## 109 109 12.550 TRUE
## 174 174 12.550 TRUE
## 183 183 12.550 TRUE
## 198 198 12.550 TRUE
## 98 98 10.952 TRUE
## 221 221 10.211 TRUE
## 127 127 9.900 TRUE
## 19 19 9.015 TRUE
## 90 90 9.015 TRUE
## 110 110 8.754 TRUE
## 190 190 8.754 TRUE
# second round: save diagnostics for the reduced df
dfttu_mv2 <- mvn(data = complete(dfttu_wide_cross_complete_final[,4:5]), subset = NULL, mvnTest = "mardia",
covariance = TRUE, tol = 1e-25, alpha = 0.5,
scale = FALSE, desc = FALSE, transform = "none", R = 1000,
univariateTest = "SW", univariatePlot = "qq", multivariatePlot = "qq",
multivariateOutlierMethod = "adj", bc = FALSE, bcType = "rounded",
showOutliers = TRUE, showNewData = FALSE)
# re-create the row index (row positions shifted after the first removal)
dfttu_wide_cross_complete_final$index <- seq.int(nrow(dfttu_wide_cross_complete_final))
# extract the second-round outlier list
outlier_list2 <- dfttu_mv2$multivariateOutliers$Observation
length(outlier_list2) # 11 outliers
## [1] 11
# drop the second-round outliers as well
dfttu_wide_cross_complete_final2 <- subset(dfttu_wide_cross_complete_final, index %!in% outlier_list2)
# check that 11 participants have been omitted from the df
# NOTE(review): 218 -> 208 removes only 10 rows although 11 outliers were
# listed — verify whether one Observation value is absent from index
length(dfttu_wide_cross_complete_final$id) # 218 participants
## [1] 218
length(dfttu_wide_cross_complete_final2$id) # 208 participants
## [1] 208
# re-run diagnostics
# third check: Mardia's tests still indicate multivariate normality
# (see output below), so no further trimming is needed
mvn(data = complete(dfttu_wide_cross_complete_final2[,4:5]), subset = NULL, mvnTest = "mardia",
covariance = TRUE, tol = 1e-25, alpha = 0.5,
scale = FALSE, desc = FALSE, transform = "none", R = 1000,
univariateTest = "SW", univariatePlot = "qq", multivariatePlot = "qq",
multivariateOutlierMethod = "adj", bc = FALSE, bcType = "rounded",
showOutliers = TRUE, showNewData = FALSE)
## $multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 3.92084064631468 0.416824894001297 YES
## 2 Mardia Kurtosis -0.377220097864481 0.706010045724341 YES
## 3 MVN <NA> <NA> YES
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk autonomy 0.9686 1e-04 NO
## 2 Shapiro-Wilk competence 0.9572 <0.001 NO
##
## $multivariateOutliers
## Observation Mahalanobis Distance Outlier
## 109 109 12.358 TRUE
## 174 174 12.358 TRUE
## 183 183 12.358 TRUE
## 98 98 10.914 TRUE
## 221 221 10.338 TRUE
## 127 127 9.618 TRUE
## 90 90 9.498 TRUE
## 110 110 8.626 TRUE
## 190 190 8.626 TRUE
# we can stop here, the worst cases have been omitted ...
# write the final data frame out for the next phases of statistical analyses
write.table(dfttu_wide_cross_complete_final2,
            file = "dfttu_wide_cross_complete_final2.txt",
            row.names = FALSE, sep = "\t")
Analysis continues in
R session 2:
Rmd Statistics_R_02.Rmd
HTML Statistics_R_02.html