Skip to content

Latest commit

 

History

History
145 lines (128 loc) · 3.13 KB

R_Study_04.md

File metadata and controls

145 lines (128 loc) · 3.13 KB

데이터 전처리

library(dplyr)
# Read ./Data/csv_exam.csv into `exam`, then print its first ten rows.
exam <- read.csv(file = "./Data/csv_exam.csv")
head(x = exam, n = 10)
  • %>% : 파이프 연산자(pipe operator)로 함수들을 연결하는 기능
  • filter(조건) : 행 추출
    • & : and
    • | : or
    • %in% : 포함
# Rows where class equals 1.
exam %>%
  filter(class == 1)
# Rows where class equals 2.
exam %>%
  filter(class == 2)
# Rows where class is anything other than 1.
exam %>%
  filter(class != 1)
# Rows whose english value is at most 80.
exam %>%
  filter(english <= 80)
# Rows in class 1 whose math value is at least 50 (both conditions hold).
exam %>%
  filter(class == 1 & math >= 50)
# Rows where english or math (or both) is at least 90.
exam %>%
  filter(english >= 90 | math >= 90)
# Rows whose class is one of 1, 3 or 5.
exam %>%
  filter(class %in% c(1, 3, 5))
# Store the class-1 rows in class1, then average their math column.
class1 <- filter(exam, class == 1)
mean(class1[["math"]])
  • 연산자

    • ^, ** : 제곱
    • %/% : 몫
    • %% : 나머지
  • select(열 이름) : 열(변수) 추출

# Keep only the class and math columns (several columns at once).
select(exam, class, math)
# Keep every column except math; a leading '-' excludes a column.
select(exam, -math)
  • dplyr 함수 조합
# english column of the class-1 rows.
exam %>% filter(class == 1) %>% select(english)
# math column of the class-2 rows.
exam %>% filter(class == 2) %>% select(math)

# id and math columns, limited to the leading rows returned by head().
exam %>% select(id, math) %>% head()
  • arrange(열 이름) : 지정한 열을 기준으로 행 정렬
# Sort rows by math in ascending order.
exam %>%
  arrange(math)
# Sort rows by math in descending order.
exam %>%
  arrange(desc(math))

# NOTE(review): mpg is not created anywhere in this file; presumably it is
# ggplot2::mpg loaded beforehand -- confirm.
# First five audi rows after sorting by hwy descending, as one nested call.
head(arrange(filter(mpg, manufacturer == "audi"), desc(hwy)), n = 5)
# The same query written as a single pipe chain.
mpg %>%
  filter(manufacturer == "audi") %>%
  arrange(desc(hwy)) %>%
  head(n = 5)
  • mutate(새로운 열 = 계산식) : 새로운 열 추가
# Add total = math + english + science and show the leading rows.
exam %>%
  mutate(total = math + english + science) %>%
  head()
# Add sum plus a mean derived from it, then sort ascending by sum.
exam %>%
  mutate(
    sum = math + english + science,
    mean = sum / 3
  ) %>%
  arrange(sum)
# Label each row "pass" when science >= 60 and "fail" otherwise.
exam %>%
  mutate(test = ifelse(science >= 60, "pass", "fail")) %>%
  head()
  • summarise(요약 열 = 요약 함수(열)) : 데이터 요약 (group_by()와 함께 쓰면 집단별로 요약)
  • summarise 함수
    • n() : 빈도수
    • mean() : 평균
    • var() : 분산
    • sd() : 표준편차
    • sum() : 합계
    • max() : 최대값
    • min() : 최소값
    • median() : 중위값
    • IQR() : 사분위 범위(Q3 - Q1)
    • mad() : 중위절대편차
# Mean of math over the whole table.
summarise(exam, mean_math = mean(math))
# Mean of math within each class.
summarise(group_by(exam, class), mean_math = mean(math))
# Several per-class summaries of math: mean, sum, median and row count.
exam %>%
  group_by(class) %>%
  summarise(
    mean_math = mean(math), sum_math = sum(math),
    median_math = median(math), n = n()
  )
# NOTE(review): mpg is not created anywhere in this file; presumably it is
# ggplot2::mpg loaded beforehand -- confirm.
# Mean cty for each manufacturer/drv combination, first ten rows.
mpg %>% group_by(manufacturer, drv) %>%
  summarise(mean_cty = mean(cty)) %>% head(10)
# Five manufacturers with the highest mean of (hwy + cty) among rows whose
# class is "suv".
# NOTE(review): mpg is not created anywhere in this file; presumably it is
# ggplot2::mpg loaded beforehand -- confirm.
# The row filter runs before group_by(): the predicate does not depend on
# the groups, so the result is unchanged while only the SUV rows get grouped.
# The helper column is named `combined` rather than `sum` so it does not
# mask base::sum() inside the pipeline; summarise() drops it, so the output
# still has just the manufacturer and mean columns.
mpg %>%
  filter(class == "suv") %>%
  group_by(manufacturer) %>%
  mutate(combined = hwy + cty) %>%
  summarise(mean = mean(combined)) %>%
  arrange(desc(mean)) %>%
  head(5)
  • left_join : 가로로 데이터 프레임 합치기
  • bind_rows : 세로로 데이터 프레임 합치기
    • 세로로 합칠 변수명이 다를 경우 rename()을 통해 변수를 통일
# Midterm scores for ids 1 to 5.
test1 <- data.frame(
  id = c(1, 2, 3, 4, 5),
  midterm = c(60, 80, 70, 90, 85)
)
# Final scores for the same ids.
test2 <- data.frame(
  id = c(1, 2, 3, 4, 5),
  final = c(70, 83, 65, 95, 80)
)
# Midterm scores for a different set of ids (6 to 10).
test3 <- data.frame(
  id = c(6, 7, 8, 9, 10),
  midterm = c(60, 80, 70, 90, 85)
)
# Column-wise merge: attach final to midterm by matching id.
total <- test1 %>% left_join(test2, by = "id")
total

# Lookup table pairing each id with a teacher name.
name <- data.frame(
  id = c(1, 2, 3, 4, 5),
  teacher = c("kim", "choi", "park", "jung", "lee")
)
# Add the teacher column to the merged scores, again matching on id.
total <- total %>% left_join(name, by = "id")
total
# Row-wise merge: stack test3 underneath test1 (both share column names).
total2 <- test1 %>% bind_rows(test3)
total2