Dplyr: 그룹을 샘플링할 수 있어야 함

에 만든 2014년 03월 28일  ·  9코멘트  ·  출처: tidyverse/dplyr

그룹 내 개인뿐만 아니라

가장 유용한 댓글

@drhagen์˜ ์œ„ ๋‹ต๋ณ€์€

# Sample whole groups from a grouped data frame.
#
# Draws `size` groups at random and returns every row of `tbl` belonging to
# a drawn group. With `replace = TRUE`, a group drawn more than once
# contributes its rows once per draw.
#
# Arguments:
#   tbl     A data frame grouped with group_by().
#   size    Number of groups to draw; forwarded to sample_n().
#   replace Draw groups with replacement? Forwarded to sample_n().
#   weight  Sampling weights, forwarded to sample_n(). The table being
#           sampled holds only the grouping columns (one row per group), so
#           this is presumably one weight per group in group-key order --
#           confirm against sample_n() before relying on it.
#
# Returns the rows of the sampled groups, grouped by the same variables as
# `tbl`. Stops with an error when `tbl` has no grouping variables.
#
# Requires dplyr >= 1.0.0 (across(), all_of()).
sample_n_groups <- function(tbl, size, replace = FALSE, weight = NULL) {
  # Grouping column names: used as join keys and to regroup when done.
  grps <- group_vars(tbl)
  if (length(grps) == 0L) {
    # Without groups there are no join keys; fail with a clear message
    # instead of an obscure join error.
    stop("`tbl` must be grouped; call `group_by()` first.", call. = FALSE)
  }
  # One ungrouped row per group, so groups are sampled as units rather than
  # rows being sampled within groups.
  keep <- tbl %>%
    group_keys() %>%
    sample_n(size, replace, weight)
  # right_join() returns a group's rows once per matching row of `keep`, so
  # repeated draws are preserved. Regroup explicitly so the result carries
  # the original grouping variables.
  tbl %>%
    right_join(keep, by = grps) %>%
    group_by(across(all_of(grps)))
}

모든 9 댓글

# Draw five species with replacement, weighting each species by its total
# sepal length, then drop the helper weight column.
# (`%.%` has been removed from dplyr; `%>%` is its replacement.)
species <- iris %>%
  group_by(Species) %>%
  summarise(wt = sum(Sepal.Length)) %>%
  sample_n(5, replace = TRUE, weight = wt) %>%
  select(-wt)

# Pull every row of each drawn species; a species drawn k times appears
# k times. The join key is spelled out to avoid the implicit natural join.
inner_join(species, iris, by = "Species")

이게 왜 닫혔는지 궁금합니다. 잠재적으로 유용한 기능인 것 같습니다

# Within-group sampling: draws one row from each species.
sample_n(group_by(iris, Species), 1)

์ž„์˜์˜ ์ข…์—์„œ ๋ชจ๋“  ๋ฐ์ดํ„ฐ๋ฅผ ์„ ํƒํ•˜๋ ค๋ฉด, ์˜ˆ๋ฅผ ๋“ค์–ด

그룹 내 샘플링이 직관적인 동작이기 때문에 sample_n 의 동작이 그룹에 대해 변경되어야 한다고 생각하지 않습니다. 그러나 그룹 전체를 샘플링할 수 있는 것이 편리한 경우가 많습니다. 이것은 두 번째 기능이어야 합니다. 내 구현은 다음과 같습니다.

# Sample whole groups from a grouped data frame: draw `size` groups and keep
# only the rows of `tbl` that belong to a drawn group.
sample_n_groups <- function(tbl, size, replace = FALSE, weight = NULL) {
  # Grouping columns as a character vector, needed to regroup at the end.
  group_names <- as.character(unlist(groups(tbl)))
  # Collapse to one row per group, then draw `size` of those rows.
  drawn <- sample_n(summarise(tbl), size, replace, weight)
  # semi_join() only filters `tbl` (joining on the shared columns), so a
  # group drawn more than once still appears just once in the result.
  group_by_(semi_join(tbl, drawn), group_names)
}

@rcorty 의 예제는 예상대로 작동합니다.

# Whole-group sampling: draw one species and keep its rows.
sample_n_groups(group_by(iris, Species), 1)

+1

편집: dplyr 대한 변경으로 인해 이 솔루션이 중단되었습니다.


์ด ๊ธฐ๋Šฅ์„ ์ฐพ๊ณ  ์žˆ๋Š” ๊ฒ€์ƒ‰ ์—”์ง„์„ ํ†ตํ•ด ์—ฌ๊ธฐ์— ๋„์ฐฉํ•œ ๋ถ„๋“ค์„ ์œ„ํ•ด @MarcusWalz์— ์˜ํ•œ ๊ตฌํ˜„์€ replace = TRUE ๋•Œ ๋Œ€์ฒด๋กœ ์ƒ˜ํ”Œ๋งํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค. ์ค‘๋ณต์„ ์œ ์ง€ํ•˜๋ ค๋ฉด ๊ตฌํ˜„์—์„œ right_join (๋˜๋Š” left_join ๋˜๋Š” inner_join )๋ฅผ ์‚ฌ์šฉํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.

# Sample whole groups from a grouped data frame, keeping one copy of a
# group's rows for every time that group is drawn.
sample_n_groups <- function(tbl, size, replace = FALSE, weight = NULL) {
  # Grouping columns as a character vector: the join keys and the variables
  # to regroup by at the end.
  group_names <- as.character(unlist(groups(tbl)))
  # Collapse to one row per group, then draw `size` of those rows.
  drawn <- sample_n(summarise(tbl), size, replace, weight)
  # right_join() returns a group's rows once per matching row of `drawn`,
  # so repeated draws under replace = TRUE are kept.
  rows <- right_join(tbl, drawn, by = group_names)
  group_by_(rows, group_names)
}

클러스터 부트스트래핑은 이 기능의 광범위한 사용 사례입니다.

@drhagen , ๊ตฌํ˜„ ์‹œ ์ƒˆ๋กœ์šด ๊ณ ์œ  ๊ทธ๋ฃน ID๋ฅผ ์ƒ์„ฑํ•˜๋Š” ๋ฐฉ๋ฒ•์— ๋Œ€ํ•œ ์ œ์•ˆ ์‚ฌํ•ญ์ด ์žˆ์Šต๋‹ˆ๊นŒ?

사실 이것은 아주 쉽습니다.

# Sample whole groups from a grouped data frame and number each draw.
#
# Draws `size` groups, tags every draw with a `unique_id`, and returns the
# rows of `tbl` for each draw (with the `unique_id` column attached), so that
# repeated draws of the same group remain distinguishable.
#
# Arguments:
#   tbl     A grouped data frame.
#   size    Number of groups to draw; forwarded to sample_n().
#   replace Draw groups with replacement? Forwarded to sample_n().
#   weight  Sampling weights; forwarded to sample_n().
sample_n_groups <- function(tbl, size, replace = FALSE, weight = NULL) {
  # Grouping column names: the join keys and the variables to regroup by.
  grps <- tbl %>% groups %>% unlist %>% as.character
  # One row per group, `size` of them drawn at random, numbered per draw.
  # seq_len() rather than 1:NROW(.): when no rows are drawn, 1:0 would give
  # c(1, 0) and break mutate(), whereas seq_len(0) is an empty vector.
  keep <- tbl %>%
    summarise() %>%
    sample_n(size, replace, weight) %>%
    mutate(unique_id = seq_len(NROW(.)))
  # right_join() returns a group's rows once per matching row of `keep`;
  # regroup by the original grouping variables afterwards.
  tbl %>% right_join(keep, by = grps) %>% group_by_(grps)
}

@drhagen์˜ ์œ„ ๋‹ต๋ณ€์€

# Sample whole groups from a grouped data frame.
#
# Draws `size` groups at random and returns every row of `tbl` belonging to
# a drawn group. With `replace = TRUE`, a group drawn more than once
# contributes its rows once per draw.
#
# Arguments:
#   tbl     A data frame grouped with group_by().
#   size    Number of groups to draw; forwarded to sample_n().
#   replace Draw groups with replacement? Forwarded to sample_n().
#   weight  Sampling weights, forwarded to sample_n(). The table being
#           sampled holds only the grouping columns (one row per group), so
#           this is presumably one weight per group in group-key order --
#           confirm against sample_n() before relying on it.
#
# Returns the rows of the sampled groups, grouped by the same variables as
# `tbl`. Stops with an error when `tbl` has no grouping variables.
#
# Requires dplyr >= 1.0.0 (across(), all_of()).
sample_n_groups <- function(tbl, size, replace = FALSE, weight = NULL) {
  # Grouping column names: used as join keys and to regroup when done.
  grps <- group_vars(tbl)
  if (length(grps) == 0L) {
    # Without groups there are no join keys; fail with a clear message
    # instead of an obscure join error.
    stop("`tbl` must be grouped; call `group_by()` first.", call. = FALSE)
  }
  # One ungrouped row per group, so groups are sampled as units rather than
  # rows being sampled within groups.
  keep <- tbl %>%
    group_keys() %>%
    sample_n(size, replace, weight)
  # right_join() returns a group's rows once per matching row of `keep`, so
  # repeated draws are preserved. Regroup explicitly so the result carries
  # the original grouping variables.
  tbl %>%
    right_join(keep, by = grps) %>%
    group_by(across(all_of(grps)))
}
이 페이지가 도움이 되었나요?
0 / 5 - 0 등급