Statistical Inference

Applied Multiple Regression/Correlation Analysis for the Behavioral by Jacob Cohen, Patricia Cohen, Stephen G. West, Leona S. AikenSciences

Author

Sungkyun Cho

Published

February 17, 2023

acad0 <- read_csv("data/c0301dt.csv")
acad0
A spec_tbl_df: 15 × 3
time pubs salary
<dbl> <dbl> <dbl>
3 18 51876
6 3 54511
3 2 53425
8 17 61863
6 21 47047
7 10 39115
11 27 59677
18 37 61458
# create a linear model with time as the predictor time, and the outcome being the salary
# the model is called mod1
mod1 <- lm(salary ~ time, data = acad0)

# print the model
mod1

# print the model summary
summary(mod1)

Call:
lm(formula = salary ~ time, data = acad0)

Coefficients:
(Intercept)         time  
      43659         1224  

Call:
lm(formula = salary ~ time, data = acad0)

Residuals:
     Min       1Q   Median       3Q      Max 
-13114.3  -3964.4     51.4   4025.1   8409.3 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  43658.6     2978.0  14.660 1.83e-09 ***
time          1224.4      336.5   3.639    0.003 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 5763 on 13 degrees of freedom
Multiple R-squared:  0.5046,    Adjusted R-squared:  0.4665 
F-statistic: 13.24 on 1 and 13 DF,  p-value: 0.003
# r squared of mod1 and round to 2 decimal places
round(summary(mod1)$r.squared, 2)
0.5
# correlation between all variables in acad0 with corr.test
# corr.test is a function from the psych package
library(psych)
corr.test(acad0)
Call:corr.test(x = acad0)
Correlation matrix 
       time pubs salary
time   1.00 0.66   0.71
pubs   0.66 1.00   0.59
salary 0.71 0.59   1.00
Sample Size 
[1] 15
Probability values (Entries above the diagonal are adjusted for multiple tests.) 
       time pubs salary
time   0.00 0.02   0.01
pubs   0.01 0.00   0.02
salary 0.00 0.02   0.00

 To see confidence intervals of the correlations, print with the short=FALSE option
# buile a linear model with time an pubs as the predictor and salary as the outcome with interaction terms between time and pubs
mod2 <- lm(salary ~ time + pubs + time:pubs, data = acad0)

# visualize the interaction between time and pubs
ggplot(acad0, aes(x = time, y = pubs, fill = salary)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") +
  labs(x = "Time", y = "Pubs", fill = "Salary") +
  theme_minimal()

# visualize the interaction between time and pubs with a package visreg
library(visreg)
visreg(mod2, "time", "pubs")

# visualize the interaction between time and pubs with a package car using effects function
library(car)
effects(mod2, "time", "pubs")

ERROR: Error in if (set.sign) {: argument is not interpretable as logical

Error in if (set.sign) {: argument is not interpretable as logical
Traceback:

1. effects(mod2, "time", "pubs")
2. effects.lm(mod2, "time", "pubs")

# regression diagnostics for mod2
library(car)

# plot the residuals against the fitted values
plot(mod2)

# plot the residuals against the fitted values with a package car
library(car)
residualPlots(mod2)

           Test stat Pr(>|Test stat|)
time         -0.9563           0.3615
pubs          0.6334           0.5407
Tukey test   -1.1784           0.2386

helping <- read_csv("data/altruism.csv")
# summary of helping
summary(helping)
       id             pho_1            pho_2            pho_3       
 Min.   :  1.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
 1st Qu.: 30.75   1st Qu.: 60.50   1st Qu.: 62.50   1st Qu.: 50.00  
 Median : 60.50   Median : 76.00   Median : 77.00   Median : 66.50  
 Mean   : 60.50   Mean   : 70.89   Mean   : 70.25   Mean   : 61.58  
 3rd Qu.: 90.25   3rd Qu.: 94.50   3rd Qu.: 91.50   3rd Qu.: 84.25  
 Max.   :120.00   Max.   :100.00   Max.   :100.00   Max.   :100.00  
                  NA's   :1        NA's   :1        NA's   :2       
      sex             age           emp_q20          emp_q22      
 Min.   :0.000   Min.   :  203   Min.   :  0.00   Min.   :  6.00  
 1st Qu.:0.000   1st Qu.: 2003   1st Qu.: 62.50   1st Qu.: 65.25  
 Median :1.000   Median : 2003   Median : 80.00   Median : 80.00  
 Mean   :0.678   Mean   : 2100   Mean   : 78.24   Mean   : 78.27  
 3rd Qu.:1.000   3rd Qu.: 2004   3rd Qu.: 91.50   3rd Qu.: 93.00  
 Max.   :1.000   Max.   :20004   Max.   :100.00   Max.   :100.00  
 NA's   :2       NA's   :3       NA's   :1        NA's   :2       
    emp_q23          emp_q24          emp_q25          emp_q26     
 Min.   :  0.00   Min.   :  0.00   Min.   :  4.00   Min.   :  2.0  
 1st Qu.: 56.50   1st Qu.: 60.00   1st Qu.: 64.50   1st Qu.: 59.5  
 Median : 70.00   Median : 71.00   Median : 74.00   Median : 75.0  
 Mean   : 67.49   Mean   : 73.98   Mean   : 74.46   Mean   : 73.5  
 3rd Qu.: 85.00   3rd Qu.: 90.50   3rd Qu.: 88.00   3rd Qu.: 91.0  
 Max.   :100.00   Max.   :100.00   Max.   :100.00   Max.   :100.0  
 NA's   :1        NA's   :1        NA's   :1        NA's   :1      
# rename the column pho_1 to pho1, pho_2 to pho2, pho_3 to pho3
helping <- helping %>% rename(pho1 = pho_1, pho2 = pho_2, pho3 = pho_3)

# add a column pho to helping that is the average of pho1, pho2, and pho3 with na.rm = TRUE
helping <- helping %>% mutate(pho = mean(c(pho1, pho2, pho3), na.rm = TRUE))
helping["pho_mean2"] <- rowMeans(helping[, c("pho1", "pho2", "pho3")], na.rm = TRUE)
helping
A tibble: 120 × 14
id pho1 pho2 pho3 sex age emp_q20 emp_q22 emp_q23 emp_q24 emp_q25 emp_q26 pho pho_mean2
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 95 95 95 1 2004 80 NA 80 80 70 70 67.58989 95.00000
2 58 62 NA 0 2003 62 58 59 57 56 59 67.58989 60.00000
3 100 50 50 NA 2003 90 51 51 51 52 100 67.58989 66.66667
4 77 77 64 1 2004 66 72 88 82 67 69 67.58989 72.66667
117 50 50 76 0 2003 52 100 0 26 48 45 67.58989 58.66667
118 92 76 94 0 1108 55 51 51 60 53 64 67.58989 87.33333
119 100 100 100 0 2004 68 75 55 72 75 63 67.58989 100.00000
120 60 78 26 0 2004 86 50 5 50 100 90 67.58989 54.66667
helping["pho_mean3"] <-    # "phone"이라는 새로운 변수에 assign!
  helping |>
  select(pho1:pho3) |>
  rowMeans(na.rm = TRUE)
helping
A tibble: 120 × 15
id pho1 pho2 pho3 sex age emp_q20 emp_q22 emp_q23 emp_q24 emp_q25 emp_q26 pho pho_mean2 pho_mean3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 95 95 95 1 2004 80 NA 80 80 70 70 67.58989 95.00000 95.00000
2 58 62 NA 0 2003 62 58 59 57 56 59 67.58989 60.00000 60.00000
3 100 50 50 NA 2003 90 51 51 51 52 100 67.58989 66.66667 66.66667
4 77 77 64 1 2004 66 72 88 82 67 69 67.58989 72.66667 72.66667
117 50 50 76 0 2003 52 100 0 26 48 45 67.58989 58.66667 58.66667
118 92 76 94 0 1108 55 51 51 60 53 64 67.58989 87.33333 87.33333
119 100 100 100 0 2004 68 75 55 72 75 63 67.58989 100.00000 100.00000
120 60 78 26 0 2004 86 50 5 50 100 90 67.58989 54.66667 54.66667
# load a dataset penguins from the palmerpenguins package
library(palmerpenguins)

# print the first 6 rows of penguins
head(penguins)
A tibble: 6 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
<fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
Adelie Torgersen 39.1 18.7 181 3750 male 2007
Adelie Torgersen 39.5 17.4 186 3800 female 2007
Adelie Torgersen 40.3 18.0 195 3250 female 2007
Adelie Torgersen NA NA NA NA NA 2007
Adelie Torgersen 36.7 19.3 193 3450 female 2007
Adelie Torgersen 39.3 20.6 190 3650 male 2007
# draw a scatterplot of flipper length and body mass with penguins with a facet wrap of species, with a number of columns of 2
penguins %>% ggplot(aes(x = flipper_length_mm, y = body_mass_g)) + geom_point() + facet_wrap(~species, ncol = 2)

# Tukey's multiple comparison test of flipper length between species
TukeyHSD(aov(flipper_length_mm ~ species, data = penguins))
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = flipper_length_mm ~ species, data = penguins)

$species
                      diff       lwr       upr p adj
Chinstrap-Adelie  5.869887  3.586583  8.153191     0
Gentoo-Adelie    27.233349 25.334376 29.132323     0
Gentoo-Chinstrap 21.363462 19.000841 23.726084     0
# Tukey's multiple comparison test of flipper length between species using multicomp package, glht function
library(multcomp)
glht(aov(flipper_length_mm ~ species, data = penguins), linfct = mcp(species = "Tukey"))

# summary of the Tukey's multiple comparison test of flipper length between species using multicomp package, glht function
summary(glht(aov(flipper_length_mm ~ species, data = penguins), linfct = mcp(species = "Tukey")))

# Tukey's multiple comparison test of flipper length between species using emmeans package
library(emmeans)
emmeans(aov(flipper_length_mm ~ species, data = penguins), "species")

     General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Linear Hypotheses:
                        Estimate
Chinstrap - Adelie == 0     5.87
Gentoo - Adelie == 0       27.23
Gentoo - Chinstrap == 0    21.36

     Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: aov(formula = flipper_length_mm ~ species, data = penguins)

Linear Hypotheses:
                        Estimate Std. Error t value Pr(>|t|)    
Chinstrap - Adelie == 0   5.8699     0.9699   6.052 1.06e-08 ***
Gentoo - Adelie == 0     27.2333     0.8067  33.760  < 1e-08 ***
Gentoo - Chinstrap == 0  21.3635     1.0036  21.286  < 1e-08 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Adjusted p values reported -- single-step method)
 species   emmean    SE  df lower.CL upper.CL
 Adelie       190 0.540 339      189      191
 Chinstrap    196 0.805 339      194      197
 Gentoo       217 0.599 339      216      218

Confidence level used: 0.95 
# import the data from the file "data/students-shorter.sav" using haven package
library(haven)
students <- read_spss("data/students-shorter.sav")

# print the first 6 rows of students
head(students)
A tibble: 6 × 93
stu_id sch_id sstratid sex race ethnic bys42a bys42b bys44a bys44b f1s83 ffugrad f1cncpt1 f1cncpt2 f1locus1 f1locus2 f1txrstd f1txmstd f1txsstd f1txhstd
<dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>
124966 1249 1 2 4 1 3 4 2 4 3 5.25 -0.33 -0.10 0.03 -0.14 48.29 63.61 57.73 61.71
124972 1249 1 1 4 1 4 5 1 3 3 3.00 -0.33 -0.45 -0.43 -0.58 36.05 47.65 53.36 46.98
175551 1755 1 2 3 0 NA 3 2 3 2 2.50 0.42 0.33 -0.45 -0.59 55.13 43.44 46.39 50.48
180660 1806 1 1 4 1 2 NA 1 4 2 6.50 0.43 -0.02 0.03 0.07 42.54 56.19 40.14 56.48
180672 1806 1 2 4 1 2 3 1 4 2 4.25 0.02 -0.09 -0.88 -0.85 52.96 47.36 46.01 55.32
298885 2988 2 1 3 0 5 4 2 3 2 6.00 -0.33 -0.28 0.03 0.07 44.24 45.25 41.88 39.67