안녕하세요!
`by` 여러 그룹에 대해 `shift`는 수동 이동보다 훨씬 느립니다.
참조: http://stackoverflow.com/questions/35179911/shift-in-data-table-v1-9-6-is-slow-for-many-groups
자세한 속도 https://github.com/nachti/datatable_test/blob/master/leadtest.R 에
건배,
게르하르트
놀라운 일이 아닙니다. `gforce`가 `:=`에 대해 최적화되면 이 문제가 사라집니다. 이 릴리스의 목록에 있다고 생각합니다.
이 성능 향상에 대해 +1입니다. `shift()`는 많은 코드에서 주요 병목 현상입니다. 고정된 수의 행에서 `shift()`를 실행하는 데 걸리는 시간은 데이터의 그룹 수에 비례하는 것 같습니다.
library(data.table)

# Benchmark: time a grouped shift() on a table with a fixed number of rows
# while the number of groups varies, then plot elapsed time against groups.

# Build table to store timings: one row per experiment, 10^7 rows split into
# 10^0 .. 10^7 equally sized groups.
timings <- CJ(RowCount = 10^7, Groups = 10^(0:7))
timings[, SizePerGroup := RowCount / Groups]

# Loop through each experiment. The bound is the number of rows in `timings`;
# `dt` is only created inside the loop body, so it cannot drive the iteration.
for (i in seq_len(nrow(timings))) {
  print(paste0("Iteration: ", i))

  # Build dataset: consecutive blocks of SizePerGroup rows share a group id
  timings_i <- timings[i]
  dt <- data.table(
    Grp = rep(seq_len(timings_i$Groups), each = timings_i$SizePerGroup)
  )
  dt[, Value := sample(100, size = .N, replace = TRUE)]

  # Measure the time it takes to insert a column indicating the previous value
  # by group
  elapsed <- system.time(
    dt[, PrevValueByGrp := shift(Value, type = "lag"), by = Grp]
  )[["elapsed"]]

  # Store the wall-clock seconds for this experiment
  timings[i, Elapsed := elapsed]
}

library(ggplot2)
ggplot(timings, aes(x = Groups, y = Elapsed)) +
  geom_line() +
  geom_point()
@ben519 참고로, 코드가 다음과 같은 특수한 경우에는 바로 가기가 있습니다.
library(data.table)

# Sample data: 1e6 group ids, each repeated for 10 consecutive rows, plus a
# random integer value per row.
dt <- data.table(Grp = rep(seq_len(1e6), each = 10L))
dt[, Value := sample(100L, size = .N, replace = TRUE)]

# Baseline: lag computed with by = Grp (timing as recorded in the original post)
system.time(
  dt[, PrevValueByGrp := shift(Value, type = "lag"), by = Grp][]
)
# user system elapsed
# 19.50 0.80 20.34

# Shortcut: lag the whole column in one call, then set the first row of every
# group (rowid(Grp) == 1L) to NA, because its lagged value came from the
# preceding group. This relies on each group's rows being contiguous, as they
# are in the data built above.
system.time(
  dt[, v := shift(Value, type = "lag")][rowid(Grp) == 1L, v := NA][]
)
# user system elapsed
# 1.00 0.87 1.25

# Both approaches produce the same column
all.equal(dt[["v"]], dt[["PrevValueByGrp"]])
# [1] TRUE
가장 유용한 댓글
@ben519 참고로, 코드가 다음과 같은 특수한 경우에는 바로 가기가 있습니다.