자세한 데이터 import에 대해서는 링크 참고
Text files: csv
readr
패키지(tidyverse
에 포함)
read_csv()
, write_csv()
- R 기본 함수
read.csv()
를 개선
- 다양한 옵션은
?read_csv
, ?write_csv
참고
csv 파일 읽기
altruism.csv 파일 링크
library(tidyverse)
helping <- read_csv("data/altruism.csv")
helping |> print()
# A tibble: 120 × 12
id pho_1 pho_2 pho_3 sex age emp_q20 emp_q22 emp_q23 emp_q24 emp_q25
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 250 95 95 95 1 2004 80 NA 80 80 70
2 32 58 62 NA 0 2003 62 58 59 57 56
3 109 100 50 50 NA 2003 90 51 51 51 52
4 209 77 77 64 1 2004 66 72 88 82 67
5 94 77 50 77 1 2003 100 100 100 51 78
6 260 100 75 100 0 2004 100 60 70 55 70
7 258 77 94 86 1 2004 91 93 85 91 73
8 244 90 68 20 0 2004 67 66 31 67 63
9 180 100 79 77 0 2003 61 51 30 51 51
10 182 75 50 64 1 2003 80 80 70 65 70
# … with 110 more rows, and 1 more variable: emp_q26 <dbl>
read_csv()
의 자주 사용되는 옵션
read_csv("data/file.csv", skip = 2) # 첫 2줄 스킵
read_csv("data/file.csv", na = ".") # 결측치가 .으로 기록된 파일
csv 파일 쓰기
write_csv()
: 단, csv는 일반 텍스트 형식이므로 저장하면 변수 타입 정보(예: factor, 날짜 형식)가 사라짐
write_csv(helping, file="data/helping_new.csv")
Excel spreadsheets
readxl
package
read_excel()
, read_xlsx()
, read_xls()
엑셀 파일 읽기
students.xlsx 파일 링크
library(readxl) # install.packages("readxl")
stud <- read_xlsx("data/students.xlsx")
stud |> print()
# A tibble: 1,000 × 93
stu_id sch_id sstratid sex race ethnic bys42a bys42b bys44a bys44b bys44c
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 124966 1249 1 2 4 1 3 4 2 4 4
2 124972 1249 1 1 4 1 4 5 1 3 3
3 175551 1755 1 2 3 0 NA 3 2 3 3
4 180660 1806 1 1 4 1 2 NA 1 4 4
5 180672 1806 1 2 4 1 2 3 1 4 3
6 298885 2988 2 1 3 0 5 4 2 3 3
# … with 994 more rows, and 82 more variables: bys44d <dbl>, bys44e <dbl>,
# bys44f <dbl>, bys44g <dbl>, bys44h <dbl>, bys44i <dbl>, bys44j <dbl>,
# bys44k <dbl>, bys44l <dbl>, bys44m <dbl>, bys48a <dbl>, bys48b <dbl>,
# bys79a <dbl>, byfamsiz <dbl>, famcomp <dbl>, bygrads <dbl>, byses <dbl>,
# byfaminc <dbl>, parocc <dbl>, bytxrstd <dbl>, bytxmstd <dbl>,
# bytxsstd <dbl>, bytxhstd <dbl>, bypared <dbl>, bytests <dbl>,
# par_inv <dbl>, f1s36a1 <dbl>, f1s36a2 <dbl>, f1s36b1 <dbl>, …
Specify sheet either by position or by name
read_excel("salaries.xlsx", sheet = 2) # The default is sheet = 1
read_excel("salaries.xlsx", sheet = "personnel")
Statistical packages
SPSS의 데이터: read_sav()
students-shorter.sav 파일 링크
library(haven) # install.packages("haven")
stud_spss <- read_sav("data/students-shorter.sav")
stud_spss |> print()
# A tibble: 1,000 x 93
stu_id sch_id sstratid sex race ethnic bys42a bys42b bys44a
<dbl+lbl> <dbl+lb> <dbl+lb> <dbl+l> <dbl+l> <dbl+l> <dbl+lb> <dbl+lb> <dbl+l>
1 124966 1249 1 2 [Fem~ 4 [Whi~ 1 [whi~ 3 [2-3~ 4 [3-4~ 2 [Agr~
2 124972 1249 1 1 [Mal~ 4 [Whi~ 1 [whi~ 4 [3-4~ 5 [4-5~ 1 [Str~
3 175551 1755 1 2 [Fem~ 3 [Bla~ 0 [blk~ NA 3 [2-3~ 2 [Agr~
4 180660 1806 1 1 [Mal~ 4 [Whi~ 1 [whi~ 2 [1-2~ NA 1 [Str~
5 180672 1806 1 2 [Fem~ 4 [Whi~ 1 [whi~ 2 [1-2~ 3 [2-3~ 1 [Str~
6 298885 2988 2 1 [Mal~ 3 [Bla~ 0 [blk~ 5 [4-5~ 4 [3-4~ 2 [Agr~
7 604419 6044 6 2 [Fem~ 4 [Whi~ 1 [whi~ 4 [3-4~ 5 [4-5~ 2 [Agr~
8 605355 6053 6 2 [Fem~ 4 [Whi~ 1 [whi~ 2 [1-2~ 3 [2-3~ 1 [Str~
9 605377 6053 6 2 [Fem~ 4 [Whi~ 1 [whi~ 3 [2-3~ 5 [4-5~ 2 [Agr~
10 637529 6375 6 1 [Mal~ 3 [Bla~ 0 [blk~ 5 [4-5~ 6 [Ove~ 2 [Agr~
# i 990 more rows
# i 84 more variables: bys44b <dbl+lbl>, bys44c <dbl+lbl>, bys44d <dbl+lbl>,
# bys44e <dbl+lbl>, bys44f <dbl+lbl>, bys44g <dbl+lbl>, bys44h <dbl+lbl>,
# bys44i <dbl+lbl>, bys44j <dbl+lbl>, bys44k <dbl+lbl>, bys44l <dbl+lbl>,
# bys44m <dbl+lbl>, bys48a <dbl+lbl>, bys48b <dbl+lbl>, bys79a <dbl+lbl>,
# byfamsiz <dbl+lbl>, famcomp <dbl+lbl>, bygrads <dbl+lbl>, byses <dbl+lbl>,
# byfaminc <dbl+lbl>, parocc <dbl>, bytxrstd <dbl+lbl>, ...
stud_spss |>
select(ethnic) |>
print()
# A tibble: 1,000 x 1
ethnic
<dbl+lbl>
1 1 [white-asian]
2 1 [white-asian]
3 0 [blk,namer,hisp]
4 1 [white-asian]
5 1 [white-asian]
6 0 [blk,namer,hisp]
# i 994 more rows
labelled 데이터 참고
install.packages("labelled")
library(labelled)
# labelled 변수를 factor로 변환
stud_spss |>
unlabelled() |>
print()
# A tibble: 1,000 x 93
stu_id sch_id sstratid sex race ethnic bys42a bys42b bys44a bys44b bys44c
<dbl> <dbl> <dbl> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
1 124966 1249 1 Female White~ white~ 2-3 h~ 3-4 h~ Agree Stron~ Stron~
2 124972 1249 1 Male White~ white~ 3-4 h~ 4-5 h~ Stron~ Disag~ Disag~
3 175551 1755 1 Female Black~ blk,n~ NA 2-3 h~ Agree Disag~ Disag~
4 180660 1806 1 Male White~ white~ 1-2 h~ NA Stron~ Stron~ Stron~
5 180672 1806 1 Female White~ white~ 1-2 h~ 2-3 h~ Stron~ Stron~ Disag~
6 298885 2988 2 Male Black~ blk,n~ 4-5 h~ 3-4 h~ Agree Disag~ Disag~
# i 994 more rows
# i 82 more variables: bys44d <fct>, bys44e <fct>, bys44f <fct>, bys44g <fct>,
# bys44h <fct>, bys44i <fct>, bys44j <fct>, bys44k <fct>, bys44l <fct>,
# bys44m <fct>, bys48a <fct>, bys48b <fct>, bys79a <fct>, byfamsiz <fct>,
# famcomp <fct>, bygrads <dbl>, byses <dbl>, byfaminc <fct>, parocc <dbl>,
# bytxrstd <dbl>, bytxmstd <dbl>, bytxsstd <dbl>, bytxhstd <dbl>,
# bypared <fct>, bytests <dbl>, par_inv <dbl>, f1s36a1 <fct>, ...
Labels 제거하기
stud <- stud_spss |>
remove_val_labels()
stud |> print()
# A tibble: 1,000 x 93
stu_id sch_id sstratid sex race ethnic bys42a bys42b bys44a bys44b bys44c
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 124966 1249 1 2 4 1 3 4 2 4 4
2 124972 1249 1 1 4 1 4 5 1 3 3
3 175551 1755 1 2 3 0 NA 3 2 3 3
4 180660 1806 1 1 4 1 2 NA 1 4 4
5 180672 1806 1 2 4 1 2 3 1 4 3
6 298885 2988 2 1 3 0 5 4 2 3 3
# i 994 more rows
# i 82 more variables: bys44d <dbl>, bys44e <dbl>, bys44f <dbl>, bys44g <dbl>,
# bys44h <dbl>, bys44i <dbl>, bys44j <dbl>, bys44k <dbl>, bys44l <dbl>,
# bys44m <dbl>, bys48a <dbl>, bys48b <dbl>, bys79a <dbl>, byfamsiz <dbl>,
# famcomp <dbl>, bygrads <dbl>, byses <dbl>, byfaminc <dbl>, parocc <dbl>,
# bytxrstd <dbl>, bytxmstd <dbl>, bytxsstd <dbl>, bytxhstd <dbl>,
# bypared <dbl>, bytests <dbl>, par_inv <dbl>, f1s36a1 <dbl>, ...