자세한 데이터 import에 대해서는 링크
Text files: csv
readr
패키지(tidyverse
에 포함)
read_csv()
, write_csv()
R 기본 함수 read.csv()
를 개선
다양한 옵션은 ?read_csv
, ?write_csv
참고
csv 파일 읽기
altruism.csv 파일 링크
# Attach the tidyverse; read_csv() comes from its readr package.
library(tidyverse)

# Read altruism.csv into a tibble and display it.
helping <- read_csv(file = "data/altruism.csv")
print(helping)
# A tibble: 120 x 12
id pho_1 pho_2 pho_3 sex age emp_q20 emp_q22 emp_q23 emp_q24 emp_q25
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 95 95 95 1 2004 80 NA 80 80 70
2 2 58 62 NA 0 2003 62 58 59 57 56
3 3 100 50 50 NA 2003 90 51 51 51 52
4 4 77 77 64 1 2004 66 72 88 82 67
5 5 NA NA NA NA NA NA NA NA NA NA
6 6 100 75 100 0 2004 100 60 70 55 70
# i 114 more rows
# i 1 more variable: emp_q26 <dbl>
read_csv()
의 자주 사용되는 옵션
# Skip the first 2 lines of the file before reading.
read_csv(file = "data/file.csv", skip = 2)
# Read a file in which missing values are recorded as ".".
read_csv(file = "data/file.csv", na = ".")
csv 파일 쓰기
write_csv()
: 단, csv로 저장하면 변수 타입 정보는 함께 저장되지 않고 사라짐
# Save the helping tibble as a new CSV file.
helping |>
  write_csv(file = "data/helping_new.csv")
Excel spreadsheets
readxl
package
read_excel()
, read_xlsx()
, read_xls()
엑셀 파일 읽기
students.xlsx 파일 링크
# readxl must be installed first: install.packages("readxl")
library(readxl)

# Read the Excel workbook through the attached package ...
stud <- read_xlsx(path = "data/students.xlsx")
# ... or make the same call with an explicit namespace prefix.
stud <- readxl::read_xlsx(path = "data/students.xlsx")
print(stud)
# A tibble: 1,000 x 93
stu_id sch_id sstratid sex race ethnic bys42a bys42b bys44a bys44b bys44c
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 124966 1249 1 2 4 1 3 4 2 4 4
2 124972 1249 1 1 4 1 4 5 1 3 3
3 175551 1755 1 2 3 0 NA 3 2 3 3
4 180660 1806 1 1 4 1 2 NA 1 4 4
5 180672 1806 1 2 4 1 2 3 1 4 3
6 298885 2988 2 1 3 0 5 4 2 3 3
# i 994 more rows
# i 82 more variables: bys44d <dbl>, bys44e <dbl>, bys44f <dbl>, bys44g <dbl>,
# bys44h <dbl>, bys44i <dbl>, bys44j <dbl>, bys44k <dbl>, bys44l <dbl>,
# bys44m <dbl>, bys48a <dbl>, bys48b <dbl>, bys79a <dbl>, byfamsiz <dbl>,
# famcomp <dbl>, bygrads <dbl>, byses <dbl>, byfaminc <dbl>, parocc <dbl>,
# bytxrstd <dbl>, bytxmstd <dbl>, bytxsstd <dbl>, bytxhstd <dbl>,
# bypared <dbl>, bytests <dbl>, par_inv <dbl>, f1s36a1 <dbl>, ...
Specify sheet either by position or by name
# Pick a sheet by position; the first sheet is read when `sheet` is omitted.
read_excel(path = "salaries.xlsx", sheet = 2)
# Pick a sheet by its name.
read_excel(path = "salaries.xlsx", sheet = "personnel")
Statistical packages
SPSS의 데이터: read_sav()
students-shorter.sav 파일 링크
# haven must be installed first: install.packages("haven")
library(haven)

# Read the SPSS .sav file through the attached package ...
stud_spss <- read_sav(file = "data/students-shorter.sav")
# ... or make the same call with an explicit namespace prefix.
stud_spss <- haven::read_sav(file = "data/students-shorter.sav")
print(stud_spss)
# A tibble: 1,000 x 93
stu_id sch_id sstratid sex race ethnic bys42a bys42b bys44a
<dbl+lbl> <dbl+lbl> <dbl+lb> <dbl+l> <dbl+l> <dbl+l> <dbl+lb> <dbl+lb> <dbl+l>
1 124966 1249 1 2 [Fem~ 4 [Whi~ 1 [whi~ 3 [2-3~ 4 [3-4~ 2 [Agr~
2 124972 1249 1 1 [Mal~ 4 [Whi~ 1 [whi~ 4 [3-4~ 5 [4-5~ 1 [Str~
3 175551 1755 1 2 [Fem~ 3 [Bla~ 0 [blk~ NA 3 [2-3~ 2 [Agr~
4 180660 1806 1 1 [Mal~ 4 [Whi~ 1 [whi~ 2 [1-2~ NA 1 [Str~
5 180672 1806 1 2 [Fem~ 4 [Whi~ 1 [whi~ 2 [1-2~ 3 [2-3~ 1 [Str~
6 298885 2988 2 1 [Mal~ 3 [Bla~ 0 [blk~ 5 [4-5~ 4 [3-4~ 2 [Agr~
# i 994 more rows
# i 84 more variables: bys44b <dbl+lbl>, bys44c <dbl+lbl>, bys44d <dbl+lbl>,
# bys44e <dbl+lbl>, bys44f <dbl+lbl>, bys44g <dbl+lbl>, bys44h <dbl+lbl>,
# bys44i <dbl+lbl>, bys44j <dbl+lbl>, bys44k <dbl+lbl>, bys44l <dbl+lbl>,
# bys44m <dbl+lbl>, bys48a <dbl+lbl>, bys48b <dbl+lbl>, bys79a <dbl+lbl>,
# byfamsiz <dbl+lbl>, famcomp <dbl+lbl>, bygrads <dbl+lbl>, byses <dbl+lbl>,
# byfaminc <dbl+lbl>, parocc <dbl>, bytxrstd <dbl+lbl>, ...
# Display only the ethnic column, which prints codes together with labels.
print(select(stud_spss, ethnic))
# A tibble: 1,000 x 1
ethnic
<dbl+lbl>
1 1 [white-asian]
2 1 [white-asian]
3 0 [blk,namer,hisp]
4 1 [white-asian]
5 1 [white-asian]
6 0 [blk,namer,hisp]
# i 994 more rows
labelled 데이터 참고
# labelled must be installed first: install.packages("labelled")
library(labelled)

# Show the value labels attached to the race variable.
stud_spss$race |>
  val_labels() |>
  print()
Asian/Pacific islndr Hispanic Black not Hispanic
1 2 3
White not Hispanic Amer ind/AK Native MISSING
4 5 8
# Show the value labels attached to the ethnic variable.
stud_spss$ethnic |>
  val_labels() |>
  print()
blk,namer,hisp white-asian missing
0 1 8
# Convert the labelled variables to factors, then display the result.
stud <- unlabelled(stud_spss)
print(stud)
# A tibble: 1,000 x 93
stu_id sch_id sstratid sex race ethnic bys42a bys42b bys44a bys44b bys44c
<dbl> <dbl> <dbl> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
1 124966 1249 1 Female White~ white~ 2-3 h~ 3-4 h~ Agree Stron~ Stron~
2 124972 1249 1 Male White~ white~ 3-4 h~ 4-5 h~ Stron~ Disag~ Disag~
3 175551 1755 1 Female Black~ blk,n~ NA 2-3 h~ Agree Disag~ Disag~
4 180660 1806 1 Male White~ white~ 1-2 h~ NA Stron~ Stron~ Stron~
5 180672 1806 1 Female White~ white~ 1-2 h~ 2-3 h~ Stron~ Stron~ Disag~
6 298885 2988 2 Male Black~ blk,n~ 4-5 h~ 3-4 h~ Agree Disag~ Disag~
# i 994 more rows
# i 82 more variables: bys44d <fct>, bys44e <fct>, bys44f <fct>, bys44g <fct>,
# bys44h <fct>, bys44i <fct>, bys44j <fct>, bys44k <fct>, bys44l <fct>,
# bys44m <fct>, bys48a <fct>, bys48b <fct>, bys79a <fct>, byfamsiz <fct>,
# famcomp <fct>, bygrads <dbl>, byses <dbl>, byfaminc <fct>, parocc <dbl>,
# bytxrstd <dbl>, bytxmstd <dbl>, bytxsstd <dbl>, bytxhstd <dbl>,
# bypared <fct>, bytests <dbl>, par_inv <dbl>, f1s36a1 <fct>, ...
# Count the rows at each level of race and display the table.
print(count(stud, race))
# A tibble: 6 x 2
race n
<fct> <int>
1 Asian/Pacific islndr 61
2 Hispanic 114
3 Black not Hispanic 100
4 White not Hispanic 704
5 Amer ind/AK Native 10
6 NA 11
Labels 제거하기
# Remove the value labels, then display the result.
stud2 <- remove_val_labels(stud_spss)
print(stud2)
# A tibble: 1,000 x 93
stu_id sch_id sstratid sex race ethnic bys42a bys42b bys44a bys44b bys44c
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 124966 1249 1 2 4 1 3 4 2 4 4
2 124972 1249 1 1 4 1 4 5 1 3 3
3 175551 1755 1 2 3 0 NA 3 2 3 3
4 180660 1806 1 1 4 1 2 NA 1 4 4
5 180672 1806 1 2 4 1 2 3 1 4 3
6 298885 2988 2 1 3 0 5 4 2 3 3
# i 994 more rows
# i 82 more variables: bys44d <dbl>, bys44e <dbl>, bys44f <dbl>, bys44g <dbl>,
# bys44h <dbl>, bys44i <dbl>, bys44j <dbl>, bys44k <dbl>, bys44l <dbl>,
# bys44m <dbl>, bys48a <dbl>, bys48b <dbl>, bys79a <dbl>, byfamsiz <dbl>,
# famcomp <dbl>, bygrads <dbl>, byses <dbl>, byfaminc <dbl>, parocc <dbl>,
# bytxrstd <dbl>, bytxmstd <dbl>, bytxsstd <dbl>, bytxhstd <dbl>,
# bypared <dbl>, bytests <dbl>, par_inv <dbl>, f1s36a1 <dbl>, ...