Import

R for Data Science by Wickham & Grolemund

Author

Sungkyun Cho

Published

February 17, 2023

자세한 데이터 import에 대해서는 링크

Text files: csv

readr 패키지(tidyverse에 포함)

read_csv(), write_csv()

  • R 기본 함수 read.csv()를 개선
  • 다양한 옵션은 ?read_csv, ?write_csv 참고

csv 파일 읽기

altruism.csv 파일 링크

library(tidyverse)

helping <- read_csv("data/altruism.csv")
helping |> print()
# A tibble: 120 × 12
      id pho_1 pho_2 pho_3   sex   age emp_q20 emp_q22 emp_q23 emp_q24 emp_q25
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
 1   250    95    95    95     1  2004      80      NA      80      80      70
 2    32    58    62    NA     0  2003      62      58      59      57      56
 3   109   100    50    50    NA  2003      90      51      51      51      52
 4   209    77    77    64     1  2004      66      72      88      82      67
 5    94    77    50    77     1  2003     100     100     100      51      78
 6   260   100    75   100     0  2004     100      60      70      55      70
 7   258    77    94    86     1  2004      91      93      85      91      73
 8   244    90    68    20     0  2004      67      66      31      67      63
 9   180   100    79    77     0  2003      61      51      30      51      51
10   182    75    50    64     1  2003      80      80      70      65      70
# … with 110 more rows, and 1 more variable: emp_q26 <dbl>
Note

read_csv()의 자주 사용되는 옵션

read_csv("data/file.csv", skip = 2) # 첫 2줄 스킵
read_csv("data/file.csv", na = ".") # 결측치가 .으로 기록된 파일

csv 파일 쓰기

write_csv(): 단, csv는 단순 텍스트 파일이므로 저장하면서 변수 타입 정보가 소멸

write_csv(helping, file="data/helping_new.csv")

Excel spreadsheets

readxl package

read_excel(), read_xlsx(), read_xls()

엑셀 파일 읽기

students.xlsx 파일 링크

library(readxl) # install.packages("readxl")

stud <- read_xlsx("data/students.xlsx")
stud |> print()
# A tibble: 1,000 × 93
  stu_id sch_id sstratid   sex  race ethnic bys42a bys42b bys44a bys44b bys44c
   <dbl>  <dbl>    <dbl> <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
1 124966   1249        1     2     4      1      3      4      2      4      4
2 124972   1249        1     1     4      1      4      5      1      3      3
3 175551   1755        1     2     3      0     NA      3      2      3      3
4 180660   1806        1     1     4      1      2     NA      1      4      4
5 180672   1806        1     2     4      1      2      3      1      4      3
6 298885   2988        2     1     3      0      5      4      2      3      3
# … with 994 more rows, and 82 more variables: bys44d <dbl>, bys44e <dbl>,
#   bys44f <dbl>, bys44g <dbl>, bys44h <dbl>, bys44i <dbl>, bys44j <dbl>,
#   bys44k <dbl>, bys44l <dbl>, bys44m <dbl>, bys48a <dbl>, bys48b <dbl>,
#   bys79a <dbl>, byfamsiz <dbl>, famcomp <dbl>, bygrads <dbl>, byses <dbl>,
#   byfaminc <dbl>, parocc <dbl>, bytxrstd <dbl>, bytxmstd <dbl>,
#   bytxsstd <dbl>, bytxhstd <dbl>, bypared <dbl>, bytests <dbl>,
#   par_inv <dbl>, f1s36a1 <dbl>, f1s36a2 <dbl>, f1s36b1 <dbl>, …
Note

Specify sheet either by position or by name

read_excel("salaries.xlsx", sheet = 2) # The default is sheet = 1
read_excel("salaries.xlsx", sheet = "personnel")

Statistical packages

SPSS의 데이터: read_sav()

students-shorter.sav 파일 링크

library(haven) # install.packages("haven")

stud_spss <- read_sav("data/students-shorter.sav")
stud_spss |> print()
# A tibble: 1,000 x 93
   stu_id    sch_id   sstratid sex     race    ethnic  bys42a   bys42b   bys44a 
   <dbl+lbl> <dbl+lb> <dbl+lb> <dbl+l> <dbl+l> <dbl+l> <dbl+lb> <dbl+lb> <dbl+l>
 1 124966    1249     1        2 [Fem~ 4 [Whi~ 1 [whi~  3 [2-3~  4 [3-4~ 2 [Agr~
 2 124972    1249     1        1 [Mal~ 4 [Whi~ 1 [whi~  4 [3-4~  5 [4-5~ 1 [Str~
 3 175551    1755     1        2 [Fem~ 3 [Bla~ 0 [blk~ NA        3 [2-3~ 2 [Agr~
 4 180660    1806     1        1 [Mal~ 4 [Whi~ 1 [whi~  2 [1-2~ NA       1 [Str~
 5 180672    1806     1        2 [Fem~ 4 [Whi~ 1 [whi~  2 [1-2~  3 [2-3~ 1 [Str~
 6 298885    2988     2        1 [Mal~ 3 [Bla~ 0 [blk~  5 [4-5~  4 [3-4~ 2 [Agr~
 7 604419    6044     6        2 [Fem~ 4 [Whi~ 1 [whi~  4 [3-4~  5 [4-5~ 2 [Agr~
 8 605355    6053     6        2 [Fem~ 4 [Whi~ 1 [whi~  2 [1-2~  3 [2-3~ 1 [Str~
 9 605377    6053     6        2 [Fem~ 4 [Whi~ 1 [whi~  3 [2-3~  5 [4-5~ 2 [Agr~
10 637529    6375     6        1 [Mal~ 3 [Bla~ 0 [blk~  5 [4-5~  6 [Ove~ 2 [Agr~
# i 990 more rows
# i 84 more variables: bys44b <dbl+lbl>, bys44c <dbl+lbl>, bys44d <dbl+lbl>,
#   bys44e <dbl+lbl>, bys44f <dbl+lbl>, bys44g <dbl+lbl>, bys44h <dbl+lbl>,
#   bys44i <dbl+lbl>, bys44j <dbl+lbl>, bys44k <dbl+lbl>, bys44l <dbl+lbl>,
#   bys44m <dbl+lbl>, bys48a <dbl+lbl>, bys48b <dbl+lbl>, bys79a <dbl+lbl>,
#   byfamsiz <dbl+lbl>, famcomp <dbl+lbl>, bygrads <dbl+lbl>, byses <dbl+lbl>,
#   byfaminc <dbl+lbl>, parocc <dbl>, bytxrstd <dbl+lbl>, ...
stud_spss |>
    select(ethnic) |>
    print()
# A tibble: 1,000 x 1
  ethnic            
  <dbl+lbl>         
1 1 [white-asian]   
2 1 [white-asian]   
3 0 [blk,namer,hisp]
4 1 [white-asian]   
5 1 [white-asian]   
6 0 [blk,namer,hisp]
# i 994 more rows

labelled 데이터 참고

install.packages("labelled")
library(labelled)
# labelled 변수를 factor로 변환
stud_spss |>
    unlabelled() |>
    print()
# A tibble: 1,000 x 93
  stu_id sch_id sstratid sex    race   ethnic bys42a bys42b bys44a bys44b bys44c
   <dbl>  <dbl>    <dbl> <fct>  <fct>  <fct>  <fct>  <fct>  <fct>  <fct>  <fct> 
1 124966   1249        1 Female White~ white~ 2-3 h~ 3-4 h~ Agree  Stron~ Stron~
2 124972   1249        1 Male   White~ white~ 3-4 h~ 4-5 h~ Stron~ Disag~ Disag~
3 175551   1755        1 Female Black~ blk,n~ NA     2-3 h~ Agree  Disag~ Disag~
4 180660   1806        1 Male   White~ white~ 1-2 h~ NA     Stron~ Stron~ Stron~
5 180672   1806        1 Female White~ white~ 1-2 h~ 2-3 h~ Stron~ Stron~ Disag~
6 298885   2988        2 Male   Black~ blk,n~ 4-5 h~ 3-4 h~ Agree  Disag~ Disag~
# i 994 more rows
# i 82 more variables: bys44d <fct>, bys44e <fct>, bys44f <fct>, bys44g <fct>,
#   bys44h <fct>, bys44i <fct>, bys44j <fct>, bys44k <fct>, bys44l <fct>,
#   bys44m <fct>, bys48a <fct>, bys48b <fct>, bys79a <fct>, byfamsiz <fct>,
#   famcomp <fct>, bygrads <dbl>, byses <dbl>, byfaminc <fct>, parocc <dbl>,
#   bytxrstd <dbl>, bytxmstd <dbl>, bytxsstd <dbl>, bytxhstd <dbl>,
#   bypared <fct>, bytests <dbl>, par_inv <dbl>, f1s36a1 <fct>, ...

Labels 제거하기

stud <- stud_spss |>
    remove_val_labels()
stud |> print()
# A tibble: 1,000 x 93
  stu_id sch_id sstratid   sex  race ethnic bys42a bys42b bys44a bys44b bys44c
   <dbl>  <dbl>    <dbl> <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
1 124966   1249        1     2     4      1      3      4      2      4      4
2 124972   1249        1     1     4      1      4      5      1      3      3
3 175551   1755        1     2     3      0     NA      3      2      3      3
4 180660   1806        1     1     4      1      2     NA      1      4      4
5 180672   1806        1     2     4      1      2      3      1      4      3
6 298885   2988        2     1     3      0      5      4      2      3      3
# i 994 more rows
# i 82 more variables: bys44d <dbl>, bys44e <dbl>, bys44f <dbl>, bys44g <dbl>,
#   bys44h <dbl>, bys44i <dbl>, bys44j <dbl>, bys44k <dbl>, bys44l <dbl>,
#   bys44m <dbl>, bys48a <dbl>, bys48b <dbl>, bys79a <dbl>, byfamsiz <dbl>,
#   famcomp <dbl>, bygrads <dbl>, byses <dbl>, byfaminc <dbl>, parocc <dbl>,
#   bytxrstd <dbl>, bytxmstd <dbl>, bytxsstd <dbl>, bytxhstd <dbl>,
#   bypared <dbl>, bytests <dbl>, par_inv <dbl>, f1s36a1 <dbl>, ...