Import

R for Data Science by Wickham & Grolemund

Published

March 13, 2025

자세한 데이터 import 방법은 링크 참고

Text files: csv

readr 패키지(tidyverse에 포함)

read_csv(), write_csv()

R 기본 함수 read.csv()를 개선
다양한 옵션은 ?read_csv, ?write_csv 참고

csv 파일 읽기

altruism.csv 파일 링크

library(tidyverse)

# Read the CSV file into a tibble and print it.
# read_csv() comes from readr, which library(tidyverse) attaches.
helping <- read_csv("data/altruism.csv")  # function from the tidyverse package
helping |> print()

# A tibble: 120 x 12
     id pho_1 pho_2 pho_3   sex   age emp_q20 emp_q22 emp_q23 emp_q24 emp_q25
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1     1    95    95    95     1  2004      80      NA      80      80      70
2     2    58    62    NA     0  2003      62      58      59      57      56
3     3   100    50    50    NA  2003      90      51      51      51      52
4     4    77    77    64     1  2004      66      72      88      82      67
5     5    NA    NA    NA    NA    NA      NA      NA      NA      NA      NA
6     6   100    75   100     0  2004     100      60      70      55      70
# i 114 more rows
# i 1 more variable: emp_q26 <dbl>

Note

read_csv()의 자주 사용되는 옵션

read_csv("data/file.csv", skip = 2) # skip the first 2 lines
read_csv("data/file.csv", na = ".") # file records missing values as "."

csv 파일 쓰기

write_csv(): 단, csv는 일반 텍스트 파일이므로 저장하면 변수 타입 정보(예: factor)가 소멸되고, 다시 읽을 때 타입이 새로 추정됨

# Save the tibble as a CSV file; column type information is not kept in it.
write_csv(helping, file = "data/helping_new.csv")

Excel spreadsheets

readxl package

read_excel(), read_xlsx(), read_xls()

엑셀 파일 읽기

students.xlsx 파일 링크

library(readxl) # install.packages("readxl")
# Read the Excel workbook into a tibble (uses the attached readxl package).
stud <- read_xlsx("data/students.xlsx")

# Or: call the function through its namespace instead of relying on library()
stud <- readxl::read_xlsx("data/students.xlsx")

stud |> print()

# A tibble: 1,000 x 93
  stu_id sch_id sstratid   sex  race ethnic bys42a bys42b bys44a bys44b bys44c
   <dbl>  <dbl>    <dbl> <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
1 124966   1249        1     2     4      1      3      4      2      4      4
2 124972   1249        1     1     4      1      4      5      1      3      3
3 175551   1755        1     2     3      0     NA      3      2      3      3
4 180660   1806        1     1     4      1      2     NA      1      4      4
5 180672   1806        1     2     4      1      2      3      1      4      3
6 298885   2988        2     1     3      0      5      4      2      3      3
# i 994 more rows
# i 82 more variables: bys44d <dbl>, bys44e <dbl>, bys44f <dbl>, bys44g <dbl>,
#   bys44h <dbl>, bys44i <dbl>, bys44j <dbl>, bys44k <dbl>, bys44l <dbl>,
#   bys44m <dbl>, bys48a <dbl>, bys48b <dbl>, bys79a <dbl>, byfamsiz <dbl>,
#   famcomp <dbl>, bygrads <dbl>, byses <dbl>, byfaminc <dbl>, parocc <dbl>,
#   bytxrstd <dbl>, bytxmstd <dbl>, bytxsstd <dbl>, bytxhstd <dbl>,
#   bypared <dbl>, bytests <dbl>, par_inv <dbl>, f1s36a1 <dbl>, ...

Note

Specify sheet either by position or by name

# `sheet` takes either a position (number) or a sheet name (string).
read_excel("salaries.xlsx", sheet = 2) # The default is sheet = 1
read_excel("salaries.xlsx", sheet = "personnel")

Statistical packages

SPSS의 데이터: read_sav()

students-shorter.sav 파일 링크

library(haven) # install.packages("haven")
# Read the SPSS .sav file into a tibble (uses the attached haven package).
stud_spss <- read_sav("data/students-shorter.sav")

# Or: call the function through its namespace instead of relying on library()
stud_spss <- haven::read_sav("data/students-shorter.sav")

stud_spss |> print()

# A tibble: 1,000 x 93
  stu_id    sch_id    sstratid sex     race    ethnic  bys42a   bys42b   bys44a 
  <dbl+lbl> <dbl+lbl> <dbl+lb> <dbl+l> <dbl+l> <dbl+l> <dbl+lb> <dbl+lb> <dbl+l>
1 124966    1249      1        2 [Fem~ 4 [Whi~ 1 [whi~  3 [2-3~  4 [3-4~ 2 [Agr~
2 124972    1249      1        1 [Mal~ 4 [Whi~ 1 [whi~  4 [3-4~  5 [4-5~ 1 [Str~
3 175551    1755      1        2 [Fem~ 3 [Bla~ 0 [blk~ NA        3 [2-3~ 2 [Agr~
4 180660    1806      1        1 [Mal~ 4 [Whi~ 1 [whi~  2 [1-2~ NA       1 [Str~
5 180672    1806      1        2 [Fem~ 4 [Whi~ 1 [whi~  2 [1-2~  3 [2-3~ 1 [Str~
6 298885    2988      2        1 [Mal~ 3 [Bla~ 0 [blk~  5 [4-5~  4 [3-4~ 2 [Agr~
# i 994 more rows
# i 84 more variables: bys44b <dbl+lbl>, bys44c <dbl+lbl>, bys44d <dbl+lbl>,
#   bys44e <dbl+lbl>, bys44f <dbl+lbl>, bys44g <dbl+lbl>, bys44h <dbl+lbl>,
#   bys44i <dbl+lbl>, bys44j <dbl+lbl>, bys44k <dbl+lbl>, bys44l <dbl+lbl>,
#   bys44m <dbl+lbl>, bys48a <dbl+lbl>, bys48b <dbl+lbl>, bys79a <dbl+lbl>,
#   byfamsiz <dbl+lbl>, famcomp <dbl+lbl>, bygrads <dbl+lbl>, byses <dbl+lbl>,
#   byfaminc <dbl+lbl>, parocc <dbl>, bytxrstd <dbl+lbl>, ...

# Show a single labelled column: each value prints with its label in brackets.
stud_spss |>
  select(ethnic) |>
  print()

# A tibble: 1,000 x 1
  ethnic            
  <dbl+lbl>         
1 1 [white-asian]   
2 1 [white-asian]   
3 0 [blk,namer,hisp]
4 1 [white-asian]   
5 1 [white-asian]   
6 0 [blk,namer,hisp]
# i 994 more rows

labelled 데이터 참고

# install.packages("labelled")
library(labelled)

# Inspect the value labels (label text and its numeric code) stored on race
val_labels(stud_spss$race) |> print()

Asian/Pacific islndr             Hispanic   Black not Hispanic 
                   1                    2                    3 
  White not Hispanic   Amer ind/AK Native              MISSING 
                   4                    5                    8

# Inspect the value labels (label text and its numeric code) stored on ethnic
val_labels(stud_spss$ethnic) |> print()

blk,namer,hisp    white-asian        missing 
             0              1              8

# Convert the labelled variables to factors
# NOTE(review): this reuses the name `stud`, replacing the tibble read from
# the Excel file earlier on the page.
stud <- stud_spss |>
  unlabelled()
stud |> print()

# A tibble: 1,000 x 93
  stu_id sch_id sstratid sex    race   ethnic bys42a bys42b bys44a bys44b bys44c
   <dbl>  <dbl>    <dbl> <fct>  <fct>  <fct>  <fct>  <fct>  <fct>  <fct>  <fct> 
1 124966   1249        1 Female White~ white~ 2-3 h~ 3-4 h~ Agree  Stron~ Stron~
2 124972   1249        1 Male   White~ white~ 3-4 h~ 4-5 h~ Stron~ Disag~ Disag~
3 175551   1755        1 Female Black~ blk,n~ NA     2-3 h~ Agree  Disag~ Disag~
4 180660   1806        1 Male   White~ white~ 1-2 h~ NA     Stron~ Stron~ Stron~
5 180672   1806        1 Female White~ white~ 1-2 h~ 2-3 h~ Stron~ Stron~ Disag~
6 298885   2988        2 Male   Black~ blk,n~ 4-5 h~ 3-4 h~ Agree  Disag~ Disag~
# i 994 more rows
# i 82 more variables: bys44d <fct>, bys44e <fct>, bys44f <fct>, bys44g <fct>,
#   bys44h <fct>, bys44i <fct>, bys44j <fct>, bys44k <fct>, bys44l <fct>,
#   bys44m <fct>, bys48a <fct>, bys48b <fct>, bys79a <fct>, byfamsiz <fct>,
#   famcomp <fct>, bygrads <dbl>, byses <dbl>, byfaminc <fct>, parocc <dbl>,
#   bytxrstd <dbl>, bytxmstd <dbl>, bytxsstd <dbl>, bytxhstd <dbl>,
#   bypared <fct>, bytests <dbl>, par_inv <dbl>, f1s36a1 <fct>, ...

# Count the rows in each race category (race is a factor after unlabelled()).
stud |> count(race) |> print()

# A tibble: 6 x 2
  race                     n
  <fct>                <int>
1 Asian/Pacific islndr    61
2 Hispanic               114
3 Black not Hispanic     100
4 White not Hispanic     704
5 Amer ind/AK Native      10
6 NA                      11

Labels 제거하기

# Remove the value labels from the SPSS data, keeping the numeric codes.
stud2 <- stud_spss |>
  remove_val_labels()
stud2 |> print()

# A tibble: 1,000 x 93
  stu_id sch_id sstratid   sex  race ethnic bys42a bys42b bys44a bys44b bys44c
   <dbl>  <dbl>    <dbl> <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
1 124966   1249        1     2     4      1      3      4      2      4      4
2 124972   1249        1     1     4      1      4      5      1      3      3
3 175551   1755        1     2     3      0     NA      3      2      3      3
4 180660   1806        1     1     4      1      2     NA      1      4      4
5 180672   1806        1     2     4      1      2      3      1      4      3
6 298885   2988        2     1     3      0      5      4      2      3      3
# i 994 more rows
# i 82 more variables: bys44d <dbl>, bys44e <dbl>, bys44f <dbl>, bys44g <dbl>,
#   bys44h <dbl>, bys44i <dbl>, bys44j <dbl>, bys44k <dbl>, bys44l <dbl>,
#   bys44m <dbl>, bys48a <dbl>, bys48b <dbl>, bys79a <dbl>, byfamsiz <dbl>,
#   famcomp <dbl>, bygrads <dbl>, byses <dbl>, byfaminc <dbl>, parocc <dbl>,
#   bytxrstd <dbl>, bytxmstd <dbl>, bytxsstd <dbl>, bytxhstd <dbl>,
#   bypared <dbl>, bytests <dbl>, par_inv <dbl>, f1s36a1 <dbl>, ...