Statistics with R session 2

Course homepage:
https://homepages.tuni.fi/petri.nokelainen/mmx9120/

R session 1:
Rmd Statistics_R_01.Rmd
HTML Statistics_R_01.html

R session 2 (this file):
Rmd Statistics_R_02.Rmd
HTML Statistics_R_02.html

R session 3:
Rmd Statistics_R_03.Rmd
HTML Statistics_R_03.html

# get the current working directory (the location from where you opened this file)
getwd() # e.g. [1] "/Users/nokelaip"

## [1] "/Users/nokelaip/Dropbox (CharveLEAD)/home1_cloud/home1/koulutus/2023/TTU/test_R_teaching_materials/2_basic_statistics"

# put that location into variable "wd"
wd <_ getwd()
# set the working directory to the value of variable "wd"
setwd(wd) # copy paste "wd" value

# you may also use RStudio menu: Session _ Set Working Directory _ To Source File location

# read in the data, this time we will use a shorter name
dfttu_wide <_ read.table("dfttu_wide_cross_complete_final2.txt", header=TRUE, sep= "\t")

2. Descriptive statistics

2.1 General descriptives

# ________________
# CENTRAL TENDENCY
# what is the average (mean) mathematics score?
dfttu_m_score <_ round(mean(dfttu_wide$score, na.rm = TRUE), digits = 2)
#
# median?
dfttu_med_score <_ median(dfttu_wide$score, na.rm = TRUE)
#
# mode?
# install.packages("DescTools")
library(DescTools)

## Registered S3 method overwritten by 'DescTools':
##   method         from 
##   reorder.factor gdata

## 
## Attaching package: 'DescTools'

## The following object is masked from 'package:car':
## 
##     Recode

## The following objects are masked from 'package:psych':
## 
##     AUC, ICC, SD

## The following objects are masked from 'package:extraoperators':
## 
##     %c%, %nin%

## The following object is masked from 'package:data.table':
## 
##     %like%

dfttu_mod_score <_ Mode(dfttu_wide$score, na.rm = TRUE)
#
# __________
# DISPERSION
# what is the average absolute deviation (AAD) of math score? (mean of absolute deviations from the mean)
# install.packages("lsr")
library(lsr)
dfttu_aad_score <_ aad(dfttu_wide$score, na.rm = TRUE)
#
# what is the standard deviation of the average score?
dfttu_sd_score <_ sd(dfttu_wide$score, na.rm = TRUE)
#
# how about the variance?
dfttu_var_score <_ var(dfttu_wide$score, na.rm = TRUE)
#
# median absolute deviation (MAD)? (median of absolute deviations from the median)
dfttu_mad_score <_ mad(dfttu_wide$score, na.rm = TRUE)

2.1.1 Psych package

# install.packages("psych")
library(psych) # https://rdrr.io/cran/psych/man/describe.by.html

# descriptives for the whole df
describe(dfttu_wide)
# save descriptives into df
dfttu_desc <_ describe(dfttu_wide)
# investigate the structure of df
str(dfttu_desc)

## Classes 'psych', 'describe' and 'data.frame':    14 obs. of  13 variables:
##  $ vars    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ n       : num  208 208 208 208 208 208 208 208 208 208 ...
##  $ mean    : num  147.88 0.514 0.697 3.786 3.907 ...
##  $ sd      : num  84.384 0.501 0.461 0.607 0.605 ...
##  $ median  : num  147.5 1 1 3.75 4 ...
##  $ trimmed : num  147.845 0.518 0.744 3.79 3.923 ...
##  $ mad     : num  105.265 0 0 0.371 0.371 ...
##  $ min     : num  1 0 0 2.25 2.5 2 1.25 0 1 0 ...
##  $ max     : num  294 1 1 5 5 5 5 100 3 2 ...
##  $ range   : num  293 1 1 2.75 2.5 3 3.75 100 2 2 ...
##  $ skew    : num  0.00641 _0.0573 _0.85177 _0.17761 _0.23328 ...
##  $ kurtosis: num  _1.1754 _2.0063 _1.2806 0.0306 _0.3043 ...
##  $ se      : num  5.851 0.0347 0.0319 0.0421 0.042 ...

# view df
view(dfttu_desc)
# extract mean (average) score (that is the 8th value in the "mean" vector/list)
dfttu_desc$mean[8]

## [1] 47.75641

2.1.2 Stargazer package

# install.packages("stargazer")
library(stargazer) 

# descriptives for the whole df
stargazer(dfttu_wide, 
          type="text") # omitting this produces latex

## 
## ==============================================
## Statistic    N   Mean   St. Dev.  Min    Max  
## ______________________________________________
## id          208 147.880  84.384    1     294  
## group       208  0.514   0.501     0      1   
## gender      208  0.697   0.461     0      1   
## autonomy    208  3.786   0.607   2.250  5.000 
## competence  208  3.907   0.605   2.500  5.000 
## deep        208  4.023   0.507   2.000  5.000 
## organised   208  3.739   0.678   1.250  5.000 
## score       208 47.756   29.118  0.000 100.000
## score3class 208  1.014   0.813     0      2   
## index       208 108.524  63.331    1     218  
## ______________________________________________

# first five rows
stargazer(dfttu_wide[1:5,], 
          summary=FALSE, 
          rownames=FALSE, 
          type="text")  # omitting this produces latex

## 
## ==============================================================================================================================
## id group gender autonomy competence deep  organised score  score3class_chr score3class autonomy_chr gender_chr group_chr index
## ______________________________________________________________________________________________________________________________
## 1    0     1     3.750     4.250      4       3     16.667    Low score         0         Agree        Male     Control    1  
## 3    1     1     3.250     3.750      4     3.500   41.667  Medium score        1         Agree        Male    Treatment   2  
## 4    0     1       4         4        4       4     88.889   High score         2         Agree        Male     Control    3  
## 6    0     1       4       4.250    4.250   4.500   61.111  Medium score        1         Agree        Male     Control    4  
## 7    0     0     3.500     3.750    4.500     5     38.889  Medium score        1         Agree       Female    Control    5  
## ______________________________________________________________________________________________________________________________

2.1.3 Jamovi package

# Open Jamovi and load in the "dfttu_wide_cross_complete_final2.txt" data

# install.packages("jmv")
# install.packages("jmvconnect")
library(jmv)
library(jmvconnect)

# We need Jamovi's "Syntax mode" to see the R code behind all Jamovi actions
# Set it on by 
# 1) going to Jamovi window, 
# 2) clicking the "three dots" in the upper right corner and 
# 3) selecting "Syntax mode"
# Now you can see the R code that produces Jamovi output!

# Go back to RStudio and click yourself into console view
# To see the see available data in Jamovi type there
# what() # outcome is the name of data frames that are open currently in Jamovi, e.g., "dfttu_wide_cross_complete_final2"
# df_Jamovi <_ read(2)

# NOTE: You may also work with the data frame name that you use in RStudio ("dfttu_wide")

# After these steps we are able to work between Jamovi and RStudio
# to reproduce several analyses

# _______________________________
# FIRST, no_grouping descriptives

# this is the "default" Jamovi descriptives
# N, Missing, Mean, Median, SD, Min, Max
jmv::descriptives(
    vars = vars(competence, autonomy, score),
    data = dfttu_wide # or df_Jamovi if you have used what()
    )

## 
##  DESCRIPTIVES
## 
##  Descriptives                                                  
##  _____________________________________________________________ 
##                          competence    autonomy     score      
##  _____________________________________________________________ 
##    N                            208          208         208   
##    Missing                        0            0           0   
##    Mean                    3.907452     3.786058    47.75641   
##    Median                  4.000000     3.750000    50.00000   
##    Standard deviation     0.6050550    0.6068433    29.11842   
##    Minimum                 2.500000     2.250000    0.000000   
##    Maximum                 5.000000     5.000000    100.0000   
##  _____________________________________________________________

# this is a simple/minimal descriptives table of "competence", "autonomy" and "score"
# where we usually report at least
# N of participants, N of missing data, Mean, Standard Deviation, Std Error of Mean
jmv::descriptives(
    vars = vars(competence, autonomy, score),
    data = dfttu_wide, # or df_Jamovi if you have used what()
    median = FALSE, # override Jamovi default
    min = FALSE,    # override Jamovi default
    max = FALSE,    # override Jamovi default
    se = TRUE)      # override Jamovi default

## 
##  DESCRIPTIVES
## 
##  Descriptives                                                   
##  ______________________________________________________________ 
##                          competence    autonomy      score      
##  ______________________________________________________________ 
##    N                            208           208         208   
##    Missing                        0             0           0   
##    Mean                    3.907452      3.786058    47.75641   
##    Std. error mean       0.04195302    0.04207701    2.018999   
##    Standard deviation     0.6050550     0.6068433    29.11842   
##  ______________________________________________________________

# if we wish to control all parameters of the output, we produce an identical table with
jmv::descriptives(
    vars = vars(competence, autonomy, score),
    data = dfttu_wide, # or df_Jamovi if you have used what()
    n = TRUE,
    missing = TRUE,
    mean = TRUE,
    median = FALSE, # override Jamovi default
    min = FALSE,    # override Jamovi default
    max = FALSE,    # override Jamovi default
    se = TRUE,      # override Jamovi default
    sd = TRUE)      # override Jamovi default

## 
##  DESCRIPTIVES
## 
##  Descriptives                                                   
##  ______________________________________________________________ 
##                          competence    autonomy      score      
##  ______________________________________________________________ 
##    N                            208           208         208   
##    Missing                        0             0           0   
##    Mean                    3.907452      3.786058    47.75641   
##    Std. error mean       0.04195302    0.04207701    2.018999   
##    Standard deviation     0.6050550     0.6068433    29.11842   
##  ______________________________________________________________

2.2 Descriptive statistics by group

2.2.1 Psych package

library(psych)

# descriptives by gender
describeBy(dfttu_wide, dfttu_wide$gender_chr) # one grouping variable ... try this in console!

## 
##  Descriptive statistics by group 
## group: Female
## _____________________________________________________________________________________________ 
## group: Male

# describeBy(dfttu_wide ~ gender) # formula input for one grouping var

# descriptives by gender and group
describeBy(dfttu_wide, list(dfttu_wide$gender_chr, dfttu_wide$group_chr))  # two grouping variables

## 
##  Descriptive statistics by group 
## : Female
## : Control
## _____________________________________________________________________________________________ 
## : Male
## : Control
## _____________________________________________________________________________________________ 
## : Female
## : Treatment
## _____________________________________________________________________________________________ 
## : Male
## : Treatment

# describeBy(dfttu_wide ~ gender + group) # formula input for two grouping variables

library(psych)

# are females (0) doing better in math than males (1)? 
describeBy(score ~ gender_chr, data = dfttu_wide) # try this command in the console!

## 
##  Descriptive statistics by group 
## gender_chr: Female
## _____________________________________________________________________________________________ 
## gender_chr: Male

# are males (1) in control group (0)) doing better in math than males in treatment group (1))? 
describeBy(score ~ gender_chr + group_chr, data = dfttu_wide)

## 
##  Descriptive statistics by group 
## gender_chr: Female
## group_chr: Control
## _____________________________________________________________________________________________ 
## gender_chr: Male
## group_chr: Control
## _____________________________________________________________________________________________ 
## gender_chr: Female
## group_chr: Treatment
## _____________________________________________________________________________________________ 
## gender_chr: Male
## group_chr: Treatment

library(psych)

# create new df's from the psych outputs
dfttu_desc_score_by_gender <_ describeBy(score ~ gender_chr,
                                         mat=TRUE, # create df
                                         data = dfttu_wide)

dfttu_desc_score_by_gender_group <_ describeBy(score ~  gender_chr + group_chr, 
                                         data=dfttu_wide,
                                         mat=TRUE, 
                                         digits=2)  # output rounded to 2 decimals  

# view df's
dfttu_desc_score_by_gender
dfttu_desc_score_by_gender_group

2.2.2 Jamovi package

library(jmv)
library(jmvconnect)

# _____________________________
# SECOND, grouping descriptives

# # see available data in Jamovi
# what() # e.g., "dfttu_wide_cross_complete_final2"
# df_Jamovi <_ read(2)

# reproduce descriptive analysis from Jamovi

# Jamovi relies on "default" values to produce a basic descriptives table
jmv::descriptives(
    formula = competence ~ gender_chr,
    data = dfttu_wide # or df_Jamovi if you have used what()
    )

## 
##  DESCRIPTIVES
## 
##  Descriptives                                       
##  __________________________________________________ 
##                          gender_chr    competence   
##  __________________________________________________ 
##    N                     Female                63   
##                          Male                 145   
##    Missing               Female                 0   
##                          Male                   0   
##    Mean                  Female          3.694444   
##                          Male            4.000000   
##    Median                Female          3.750000   
##                          Male            4.000000   
##    Standard deviation    Female         0.6559966   
##                          Male           0.5590170   
##    Minimum               Female          2.500000   
##                          Male            2.750000   
##    Maximum               Female          5.000000   
##                          Male            5.000000   
##  __________________________________________________

# we modify some parameters to produce a typical "journal" 
# (N, Missing, M, SD, SE) descriptives table where "competence" is grouped by "gender_chr:
jmv::descriptives(
    formula = competence ~ gender_chr,
    data = dfttu_wide, # or df_Jamovi if you have used what()
    median = FALSE, # override Jamovi default
    min = FALSE,    # override Jamovi default
    max = FALSE,    # override Jamovi default
    se = TRUE,      # override Jamovi default
    sd = TRUE)      # override Jamovi default

## 
##  DESCRIPTIVES
## 
##  Descriptives                                       
##  __________________________________________________ 
##                          gender_chr    competence   
##  __________________________________________________ 
##    N                     Female                63   
##                          Male                 145   
##    Missing               Female                 0   
##                          Male                   0   
##    Mean                  Female          3.694444   
##                          Male            4.000000   
##    Std. error mean       Female        0.08264780   
##                          Male          0.04642383   
##    Standard deviation    Female         0.6559966   
##                          Male           0.5590170   
##  __________________________________________________

# this is the "plots only" mod of Jamovi's "descriptives, 
# so we do not wish to see a table, but only several plots:
jmv::descriptives(
    data = dfttu_wide, # or df_Jamovi if you have used what()
    formula = competence ~ gender_chr,
    n = FALSE,         # remove table
    missing = FALSE,   # remove table
    mean = FALSE,      # remove table
    median = FALSE,    # remove table
    sd = FALSE,        # remove table
    min = FALSE,       # remove table
    max = FALSE,       # remove table
    bar = TRUE,        # show bar plot
    hist = TRUE,       # show histogram
    dens = TRUE,       # show density plot
    box = TRUE,        # show box plot
    violin = TRUE,     # show violin plot
    dot = TRUE,        # show data as dots
    dotType = "jitter",# which are jittered
    boxMean = TRUE,    # show group means
    qq = TRUE)         # show Q_Q plot

## Warning in FUN(X[[i]], ...): no non_missing arguments to max; returning _Inf

## 
##  DESCRIPTIVES

# this is a "breakout" of all options of Jamovi's "descriptives":
jmv::descriptives(
    formula = competence ~ gender_chr, # to compare both gender and group "gender_chr:group_chr"
    data = dfttu_wide, # or place "df_Jamovi" here if you have used "what()"
                          # 
                          # FIGURE SETTINGS (Jamovi: "Plots") start from here
                          #
                          # First figure is BAR PLOT
                          # https://chartio.com/learn/charts/bar_chart_complete_guide/
    bar = TRUE,           # show bar plot?
                          #
                          # Second figure is HISTOGRAM
                          # https://keydifferences.com/difference_between_histogram_and_bar_graph.html
    hist = TRUE,          # show histogram?
    dens = TRUE,          # show density?
                          #
                          # Third figure is BOX PLOT
                          # https://towardsdatascience.com/understanding_boxplots_5e2df7bcbd51
    box = TRUE,           # show box plot?
    violin = TRUE,        # show violin plot? https://datavizcatalogue.com/methods/violin_plot.html
    dot = TRUE,           # show the actual data points? 
    dotType = "jitter",   # options are "jitter" (add random noise to get better visualization) or "stack"
    boxMean = TRUE,       # show group means?
                          #
                          # Fourth figure is Q_Q PLOT
                          # https://data.library.virginia.edu/understanding_q_q_plots/
    qq = TRUE,            # show Q_Q plot?
                          # 
                          # 
                          # TABLE SETTINGS (Jamovi: "Statistics") start from here
                          #
    n = TRUE,             # show n of females and males?
    missing = TRUE,       # show how many responses are missing?
    min = TRUE,           # show lowest response values by females and males?
    max = TRUE,           # show highest response values by females and males?
    sum = TRUE,           # show sum of females and males values (greater for males as their n is bigger)
    range = TRUE,         # show max_min? (the response scale was from 1 to 5: max "theoretical"range is 4)
    mean = TRUE,          # show average values of females and males?
    median = TRUE,        # show median? ("Q2", central value: min(0%) __ Q1(25%) __ Q2(50%) __ Q3(75%) __ max(100%)
    pc = TRUE,            # show percentiles? (default: 25%(Q1), 50%(Q2), 75%(Q3))
    pcValues = "25,50,75",# percentiles values
    iqr = TRUE,           # show interquartile range? (Q3_Q1)
    mode = TRUE,          # show mode (the most typical value)?
    variance = TRUE,      # show variance? ("u2", sigma2" or "s^2", dispersion: how far values are from the mean)
    sd = TRUE,            # show standard deviation? ("sigma" or "s", square root of variance sqrt(s^2))
    se = TRUE,            # show standard error of the mean? (s/sqrt(n), that is, how far the sample mean 
                          # is likely to be from the population mean _ so the smaller is better!)
    skew = TRUE,          # show skewness? ("u3", left" or "positive": too many "low" values; "right" or "negative": 
                          # too many "high" values)
    kurt = TRUE,          # show kurtosis? ("u4"; kurtosis of normal distribution is 3 (mesokurtic); 
                          # if u4 < 3 (like we have here), then "tails" have less values (platykurtic); 
                          # if u4 > 3, there are more extreme values than expected, (leptokurtic))
    sw = TRUE)            # show Shapiro_Wilk W and p? (tests if the data is normally distributed;

## Warning in FUN(X[[i]], ...): no non_missing arguments to max; returning _Inf

## 
##  DESCRIPTIVES
## 
##  Descriptives                                        
##  ___________________________________________________ 
##                           gender_chr    competence   
##  ___________________________________________________ 
##    N                      Female                63   
##                           Male                 145   
##    Missing                Female                 0   
##                           Male                   0   
##    Mean                   Female          3.694444   
##                           Male            4.000000   
##    Std. error mean        Female        0.08264780   
##                           Male          0.04642383   
##    Median                 Female          3.750000   
##                           Male            4.000000   
##    Mode                   Female          4.000000   
##                           Male            4.000000   
##    Sum                    Female          232.7500   
##                           Male            580.0000   
##    Standard deviation     Female         0.6559966   
##                           Male           0.5590170   
##    Variance               Female         0.4303315   
##                           Male           0.3125000   
##    IQR                    Female         0.7500000   
##                           Male           0.7500000   
##    Range                  Female          2.500000   
##                           Male            2.250000   
##    Minimum                Female          2.500000   
##                           Male            2.750000   
##    Maximum                Female          5.000000   
##                           Male            5.000000   
##    Skewness               Female        _0.1426493   
##                           Male          _0.1322616   
##    Std. error skewness    Female         0.3015886   
##                           Male           0.2013565   
##    Kurtosis               Female        _0.5205238   
##                           Male          _0.3077235   
##    Std. error kurtosis    Female         0.5948406   
##                           Male           0.4000953   
##    Shapiro_Wilk W         Female         0.9335264   
##                           Male           0.9595119   
##    Shapiro_Wilk p         Female         0.0021087   
##                           Male           0.0002889   
##    25th percentile        Female          3.250000   
##                           Male            3.750000   
##    50th percentile        Female          3.750000   
##                           Male            4.000000   
##    75th percentile        Female          4.000000   
##                           Male            4.500000   
##  ___________________________________________________

                          # that is the case if p > .05; sensitive to sample size, Q_Q plot is better)

Analysis continues in

R session 3:
Rmd Statistics_R_03.Rmd
HTML Statistics_R_03.html