Course homepage:
https://homepages.tuni.fi/petri.nokelainen/mmx9120/
R session 1:
Rmd Statistics_R_01.Rmd
HTML Statistics_R_01.html
R session 2 (this file):
Rmd Statistics_R_02.Rmd
HTML Statistics_R_02.html
R session 3:
Rmd Statistics_R_03.Rmd
HTML Statistics_R_03.html
# get the current working directory (the location from where you opened this file)
getwd() # e.g. [1] "/Users/nokelaip"
## [1] "/Users/nokelaip/Dropbox (CharveLEAD)/home1_cloud/home1/koulutus/2023/TTU/test_R_teaching_materials/2_basic_statistics"
# put that location into variable "wd"
wd <_ getwd()
# set the working directory to the value of variable "wd"
setwd(wd) # copy paste "wd" value
# you may also use RStudio menu: Session _ Set Working Directory _ To Source File location
# read in the data, this time we will use a shorter name
dfttu_wide <_ read.table("dfttu_wide_cross_complete_final2.txt", header=TRUE, sep= "\t")
# ________________
# CENTRAL TENDENCY
# what is the average (mean) mathematics score?
dfttu_m_score <_ round(mean(dfttu_wide$score, na.rm = TRUE), digits = 2)
#
# median?
dfttu_med_score <_ median(dfttu_wide$score, na.rm = TRUE)
#
# mode?
# install.packages("DescTools")
library(DescTools)
## Registered S3 method overwritten by 'DescTools':
## method from
## reorder.factor gdata
##
## Attaching package: 'DescTools'
## The following object is masked from 'package:car':
##
## Recode
## The following objects are masked from 'package:psych':
##
## AUC, ICC, SD
## The following objects are masked from 'package:extraoperators':
##
## %c%, %nin%
## The following object is masked from 'package:data.table':
##
## %like%
dfttu_mod_score <_ Mode(dfttu_wide$score, na.rm = TRUE)
#
# __________
# DISPERSION
# what is the average absolute deviation (AAD) of math score? (mean of absolute deviations from the mean)
# install.packages("lsr")
library(lsr)
dfttu_aad_score <_ aad(dfttu_wide$score, na.rm = TRUE)
#
# what is the standard deviation of the average score?
dfttu_sd_score <_ sd(dfttu_wide$score, na.rm = TRUE)
#
# how about the variance?
dfttu_var_score <_ var(dfttu_wide$score, na.rm = TRUE)
#
# median absolute deviation (MAD)? (median of absolute deviations from the median)
dfttu_mad_score <_ mad(dfttu_wide$score, na.rm = TRUE)
# install.packages("psych")
library(psych) # https://rdrr.io/cran/psych/man/describe.by.html
# descriptives for the whole df
describe(dfttu_wide)
# save descriptives into df
dfttu_desc <_ describe(dfttu_wide)
# investigate the structure of df
str(dfttu_desc)
## Classes 'psych', 'describe' and 'data.frame': 14 obs. of 13 variables:
## $ vars : int 1 2 3 4 5 6 7 8 9 10 ...
## $ n : num 208 208 208 208 208 208 208 208 208 208 ...
## $ mean : num 147.88 0.514 0.697 3.786 3.907 ...
## $ sd : num 84.384 0.501 0.461 0.607 0.605 ...
## $ median : num 147.5 1 1 3.75 4 ...
## $ trimmed : num 147.845 0.518 0.744 3.79 3.923 ...
## $ mad : num 105.265 0 0 0.371 0.371 ...
## $ min : num 1 0 0 2.25 2.5 2 1.25 0 1 0 ...
## $ max : num 294 1 1 5 5 5 5 100 3 2 ...
## $ range : num 293 1 1 2.75 2.5 3 3.75 100 2 2 ...
## $ skew : num 0.00641 _0.0573 _0.85177 _0.17761 _0.23328 ...
## $ kurtosis: num _1.1754 _2.0063 _1.2806 0.0306 _0.3043 ...
## $ se : num 5.851 0.0347 0.0319 0.0421 0.042 ...
# view df
view(dfttu_desc)
# extract mean (average) score (that is the 8th value in the "mean" vector/list)
dfttu_desc$mean[8]
## [1] 47.75641
# install.packages("stargazer")
library(stargazer)
# descriptives for the whole df
stargazer(dfttu_wide,
type="text") # omitting this produces latex
##
## ==============================================
## Statistic N Mean St. Dev. Min Max
## ______________________________________________
## id 208 147.880 84.384 1 294
## group 208 0.514 0.501 0 1
## gender 208 0.697 0.461 0 1
## autonomy 208 3.786 0.607 2.250 5.000
## competence 208 3.907 0.605 2.500 5.000
## deep 208 4.023 0.507 2.000 5.000
## organised 208 3.739 0.678 1.250 5.000
## score 208 47.756 29.118 0.000 100.000
## score3class 208 1.014 0.813 0 2
## index 208 108.524 63.331 1 218
## ______________________________________________
# first five rows
stargazer(dfttu_wide[1:5,],
summary=FALSE,
rownames=FALSE,
type="text") # omitting this produces latex
##
## ==============================================================================================================================
## id group gender autonomy competence deep organised score score3class_chr score3class autonomy_chr gender_chr group_chr index
## ______________________________________________________________________________________________________________________________
## 1 0 1 3.750 4.250 4 3 16.667 Low score 0 Agree Male Control 1
## 3 1 1 3.250 3.750 4 3.500 41.667 Medium score 1 Agree Male Treatment 2
## 4 0 1 4 4 4 4 88.889 High score 2 Agree Male Control 3
## 6 0 1 4 4.250 4.250 4.500 61.111 Medium score 1 Agree Male Control 4
## 7 0 0 3.500 3.750 4.500 5 38.889 Medium score 1 Agree Female Control 5
## ______________________________________________________________________________________________________________________________
# Open Jamovi and load in the "dfttu_wide_cross_complete_final2.txt" data
# install.packages("jmv")
# install.packages("jmvconnect")
library(jmv)
library(jmvconnect)
# We need Jamovi's "Syntax mode" to see the R code behind all Jamovi actions
# Set it on by
# 1) going to Jamovi window,
# 2) clicking the "three dots" in the upper right corner and
# 3) selecting "Syntax mode"
# Now you can see the R code that produces Jamovi output!
# Go back to RStudio and click yourself into console view
# To see the see available data in Jamovi type there
# what() # outcome is the name of data frames that are open currently in Jamovi, e.g., "dfttu_wide_cross_complete_final2"
# df_Jamovi <_ read(2)
# NOTE: You may also work with the data frame name that you use in RStudio ("dfttu_wide")
# After these steps we are able to work between Jamovi and RStudio
# to reproduce several analyses
# _______________________________
# FIRST, no_grouping descriptives
# this is the "default" Jamovi descriptives
# N, Missing, Mean, Median, SD, Min, Max
jmv::descriptives(
vars = vars(competence, autonomy, score),
data = dfttu_wide # or df_Jamovi if you have used what()
)
##
## DESCRIPTIVES
##
## Descriptives
## _____________________________________________________________
## competence autonomy score
## _____________________________________________________________
## N 208 208 208
## Missing 0 0 0
## Mean 3.907452 3.786058 47.75641
## Median 4.000000 3.750000 50.00000
## Standard deviation 0.6050550 0.6068433 29.11842
## Minimum 2.500000 2.250000 0.000000
## Maximum 5.000000 5.000000 100.0000
## _____________________________________________________________
# this is a simple/minimal descriptives table of "competence", "autonomy" and "score"
# where we usually report at least
# N of participants, N of missing data, Mean, Standard Deviation, Std Error of Mean
jmv::descriptives(
vars = vars(competence, autonomy, score),
data = dfttu_wide, # or df_Jamovi if you have used what()
median = FALSE, # override Jamovi default
min = FALSE, # override Jamovi default
max = FALSE, # override Jamovi default
se = TRUE) # override Jamovi default
##
## DESCRIPTIVES
##
## Descriptives
## ______________________________________________________________
## competence autonomy score
## ______________________________________________________________
## N 208 208 208
## Missing 0 0 0
## Mean 3.907452 3.786058 47.75641
## Std. error mean 0.04195302 0.04207701 2.018999
## Standard deviation 0.6050550 0.6068433 29.11842
## ______________________________________________________________
# if we wish to control all parameters of the output, we produce an identical table with
jmv::descriptives(
vars = vars(competence, autonomy, score),
data = dfttu_wide, # or df_Jamovi if you have used what()
n = TRUE,
missing = TRUE,
mean = TRUE,
median = FALSE, # override Jamovi default
min = FALSE, # override Jamovi default
max = FALSE, # override Jamovi default
se = TRUE, # override Jamovi default
sd = TRUE) # override Jamovi default
##
## DESCRIPTIVES
##
## Descriptives
## ______________________________________________________________
## competence autonomy score
## ______________________________________________________________
## N 208 208 208
## Missing 0 0 0
## Mean 3.907452 3.786058 47.75641
## Std. error mean 0.04195302 0.04207701 2.018999
## Standard deviation 0.6050550 0.6068433 29.11842
## ______________________________________________________________
library(psych)
# descriptives by gender
describeBy(dfttu_wide, dfttu_wide$gender_chr) # one grouping variable ... try this in console!
##
## Descriptive statistics by group
## group: Female
## _____________________________________________________________________________________________
## group: Male
# describeBy(dfttu_wide ~ gender) # formula input for one grouping var
# descriptives by gender and group
describeBy(dfttu_wide, list(dfttu_wide$gender_chr, dfttu_wide$group_chr)) # two grouping variables
##
## Descriptive statistics by group
## : Female
## : Control
## _____________________________________________________________________________________________
## : Male
## : Control
## _____________________________________________________________________________________________
## : Female
## : Treatment
## _____________________________________________________________________________________________
## : Male
## : Treatment
# describeBy(dfttu_wide ~ gender + group) # formula input for two grouping variables
library(psych)
# are females (0) doing better in math than males (1)?
describeBy(score ~ gender_chr, data = dfttu_wide) # try this command in the console!
##
## Descriptive statistics by group
## gender_chr: Female
## _____________________________________________________________________________________________
## gender_chr: Male
# are males (1) in control group (0)) doing better in math than males in treatment group (1))?
describeBy(score ~ gender_chr + group_chr, data = dfttu_wide)
##
## Descriptive statistics by group
## gender_chr: Female
## group_chr: Control
## _____________________________________________________________________________________________
## gender_chr: Male
## group_chr: Control
## _____________________________________________________________________________________________
## gender_chr: Female
## group_chr: Treatment
## _____________________________________________________________________________________________
## gender_chr: Male
## group_chr: Treatment
library(psych)
# create new df's from the psych outputs
dfttu_desc_score_by_gender <_ describeBy(score ~ gender_chr,
mat=TRUE, # create df
data = dfttu_wide)
dfttu_desc_score_by_gender_group <_ describeBy(score ~ gender_chr + group_chr,
data=dfttu_wide,
mat=TRUE,
digits=2) # output rounded to 2 decimals
# view df's
dfttu_desc_score_by_gender
dfttu_desc_score_by_gender_group
library(jmv)
library(jmvconnect)
# _____________________________
# SECOND, grouping descriptives
# # see available data in Jamovi
# what() # e.g., "dfttu_wide_cross_complete_final2"
# df_Jamovi <_ read(2)
# reproduce descriptive analysis from Jamovi
# Jamovi relies on "default" values to produce a basic descriptives table
jmv::descriptives(
formula = competence ~ gender_chr,
data = dfttu_wide # or df_Jamovi if you have used what()
)
##
## DESCRIPTIVES
##
## Descriptives
## __________________________________________________
## gender_chr competence
## __________________________________________________
## N Female 63
## Male 145
## Missing Female 0
## Male 0
## Mean Female 3.694444
## Male 4.000000
## Median Female 3.750000
## Male 4.000000
## Standard deviation Female 0.6559966
## Male 0.5590170
## Minimum Female 2.500000
## Male 2.750000
## Maximum Female 5.000000
## Male 5.000000
## __________________________________________________
# we modify some parameters to produce a typical "journal"
# (N, Missing, M, SD, SE) descriptives table where "competence" is grouped by "gender_chr:
jmv::descriptives(
formula = competence ~ gender_chr,
data = dfttu_wide, # or df_Jamovi if you have used what()
median = FALSE, # override Jamovi default
min = FALSE, # override Jamovi default
max = FALSE, # override Jamovi default
se = TRUE, # override Jamovi default
sd = TRUE) # override Jamovi default
##
## DESCRIPTIVES
##
## Descriptives
## __________________________________________________
## gender_chr competence
## __________________________________________________
## N Female 63
## Male 145
## Missing Female 0
## Male 0
## Mean Female 3.694444
## Male 4.000000
## Std. error mean Female 0.08264780
## Male 0.04642383
## Standard deviation Female 0.6559966
## Male 0.5590170
## __________________________________________________
# this is the "plots only" mod of Jamovi's "descriptives,
# so we do not wish to see a table, but only several plots:
jmv::descriptives(
data = dfttu_wide, # or df_Jamovi if you have used what()
formula = competence ~ gender_chr,
n = FALSE, # remove table
missing = FALSE, # remove table
mean = FALSE, # remove table
median = FALSE, # remove table
sd = FALSE, # remove table
min = FALSE, # remove table
max = FALSE, # remove table
bar = TRUE, # show bar plot
hist = TRUE, # show histogram
dens = TRUE, # show density plot
box = TRUE, # show box plot
violin = TRUE, # show violin plot
dot = TRUE, # show data as dots
dotType = "jitter",# which are jittered
boxMean = TRUE, # show group means
qq = TRUE) # show Q_Q plot
## Warning in FUN(X[[i]], ...): no non_missing arguments to max; returning _Inf
##
## DESCRIPTIVES
# this is a "breakout" of all options of Jamovi's "descriptives":
jmv::descriptives(
formula = competence ~ gender_chr, # to compare both gender and group "gender_chr:group_chr"
data = dfttu_wide, # or place "df_Jamovi" here if you have used "what()"
#
# FIGURE SETTINGS (Jamovi: "Plots") start from here
#
# First figure is BAR PLOT
# https://chartio.com/learn/charts/bar_chart_complete_guide/
bar = TRUE, # show bar plot?
#
# Second figure is HISTOGRAM
# https://keydifferences.com/difference_between_histogram_and_bar_graph.html
hist = TRUE, # show histogram?
dens = TRUE, # show density?
#
# Third figure is BOX PLOT
# https://towardsdatascience.com/understanding_boxplots_5e2df7bcbd51
box = TRUE, # show box plot?
violin = TRUE, # show violin plot? https://datavizcatalogue.com/methods/violin_plot.html
dot = TRUE, # show the actual data points?
dotType = "jitter", # options are "jitter" (add random noise to get better visualization) or "stack"
boxMean = TRUE, # show group means?
#
# Fourth figure is Q_Q PLOT
# https://data.library.virginia.edu/understanding_q_q_plots/
qq = TRUE, # show Q_Q plot?
#
#
# TABLE SETTINGS (Jamovi: "Statistics") start from here
#
n = TRUE, # show n of females and males?
missing = TRUE, # show how many responses are missing?
min = TRUE, # show lowest response values by females and males?
max = TRUE, # show highest response values by females and males?
sum = TRUE, # show sum of females and males values (greater for males as their n is bigger)
range = TRUE, # show max_min? (the response scale was from 1 to 5: max "theoretical"range is 4)
mean = TRUE, # show average values of females and males?
median = TRUE, # show median? ("Q2", central value: min(0%) __ Q1(25%) __ Q2(50%) __ Q3(75%) __ max(100%)
pc = TRUE, # show percentiles? (default: 25%(Q1), 50%(Q2), 75%(Q3))
pcValues = "25,50,75",# percentiles values
iqr = TRUE, # show interquartile range? (Q3_Q1)
mode = TRUE, # show mode (the most typical value)?
variance = TRUE, # show variance? ("u2", sigma2" or "s^2", dispersion: how far values are from the mean)
sd = TRUE, # show standard deviation? ("sigma" or "s", square root of variance sqrt(s^2))
se = TRUE, # show standard error of the mean? (s/sqrt(n), that is, how far the sample mean
# is likely to be from the population mean _ so the smaller is better!)
skew = TRUE, # show skewness? ("u3", left" or "positive": too many "low" values; "right" or "negative":
# too many "high" values)
kurt = TRUE, # show kurtosis? ("u4"; kurtosis of normal distribution is 3 (mesokurtic);
# if u4 < 3 (like we have here), then "tails" have less values (platykurtic);
# if u4 > 3, there are more extreme values than expected, (leptokurtic))
sw = TRUE) # show Shapiro_Wilk W and p? (tests if the data is normally distributed;
## Warning in FUN(X[[i]], ...): no non_missing arguments to max; returning _Inf
##
## DESCRIPTIVES
##
## Descriptives
## ___________________________________________________
## gender_chr competence
## ___________________________________________________
## N Female 63
## Male 145
## Missing Female 0
## Male 0
## Mean Female 3.694444
## Male 4.000000
## Std. error mean Female 0.08264780
## Male 0.04642383
## Median Female 3.750000
## Male 4.000000
## Mode Female 4.000000
## Male 4.000000
## Sum Female 232.7500
## Male 580.0000
## Standard deviation Female 0.6559966
## Male 0.5590170
## Variance Female 0.4303315
## Male 0.3125000
## IQR Female 0.7500000
## Male 0.7500000
## Range Female 2.500000
## Male 2.250000
## Minimum Female 2.500000
## Male 2.750000
## Maximum Female 5.000000
## Male 5.000000
## Skewness Female _0.1426493
## Male _0.1322616
## Std. error skewness Female 0.3015886
## Male 0.2013565
## Kurtosis Female _0.5205238
## Male _0.3077235
## Std. error kurtosis Female 0.5948406
## Male 0.4000953
## Shapiro_Wilk W Female 0.9335264
## Male 0.9595119
## Shapiro_Wilk p Female 0.0021087
## Male 0.0002889
## 25th percentile Female 3.250000
## Male 3.750000
## 50th percentile Female 3.750000
## Male 4.000000
## 75th percentile Female 4.000000
## Male 4.500000
## ___________________________________________________
# that is the case if p > .05; sensitive to sample size, Q_Q plot is better)
Analysis continues in
R session 3:
Rmd Statistics_R_03.Rmd
HTML Statistics_R_03.html