※ 引述《yummy7922 (crucify)》之銘言:
: [問題敘述]:
: 我的資料是重複測量的資料,資料中有13820位病人的多次測量值,
: 但不是每位病人的觀察筆數都相同,
: 我想要針對每一位病人,將每三筆資料計算一個平均值,
: 最後不到三筆的資料也算一個平均值,
: 不過我不知道該如何做,想請教各位高手們,謝謝。
library(plyr) # only used in data generation
library(dplyr)
library(data.table)
library(magrittr)
# data generation
n = 13620
dat = data.table(id = 1:n, len = sample(2:15, n, replace = TRUE)) %>%
mdply(function(id, len) data.table(id = rep(id, len), values = rnorm(len)))
# mean
k = 3
dat = select(dat, c(id, values)) # 你可以省略這行 只是刪掉len而已
start_time = Sys.time()
result = dat %>% group_by(id) %>% mutate(newgroup = rep(1:length(values),
each = k, length = length(values))) %>% group_by(id, newgroup) %>%
summarise(mean(values))
Sys.time() - start_time
# Time difference of 0.558032 secs
# Other method
k = 3
dat = select(dat, c(id, values)) # 你可以省略這行 只是移除前面的變更
# library(reshape2) # 如果不用data.table 請跑這一行
start_time = Sys.time()
dat$newgroup = unlist(tapply(dat$values, dat$id, function(x){
rep(1:length(x), each = k, length = length(x))
}))
result2 = melt(tapply(dat$values, list(dat$id, dat$newgroup), mean))
result2 = result2[!is.na(result2$value),]
Sys.time() - start_time
# Time difference of 1.556089 secs
result2 = result2[order(result2$Var1),]
all.equal(result$value, result2$value)
# TRUE
根據版友aaron77217提供的資料格式,新增:
library(dplyr)
library(data.table)
library(magrittr)
dat_gen_f = function(N_patient, max_obs_time, n_vars){
dat = sample(max_obs_time, N_patient, replace = TRUE) %>% {
cbind(rep(1:N_patient, times=.), sapply(., seq, from = 1) %>% unlist())
} %>% cbind(matrix(rnorm(nrow(.)*n_vars),, n_vars)) %>% data.table()
setnames(dat, c("id", "obs_times", paste0("V", 1:n_vars)))
}
mean_dat_f = function(dat, k){
result = dat %>% group_by(id) %>%
mutate(newgroup = rep(1:ceiling(length(obs_times)/k), each = k,
length=length(obs_times)),
n_combine = (length(obs_times) %/% k) %>% {c(rep(k, . * k),
rep(length(obs_times) - . * k, length(obs_times) - . * k))}) %>%
ungroup() %>% mutate(times_combine = paste((newgroup-1)*3+1,
(newgroup-1)*3 + n_combine, sep="-"))
result = result %>% select(match(c(names(dat)[names(dat)!="obs_times"],
"times_combine"), names(result))) %>% extract(, lapply(.SD, mean),
by = "id,times_combine")
result
}
start_time = Sys.time()
dat = dat_gen_f(30000, 20, 15)
Sys.time() - start_time
# Time difference of 1.503086 secs
start_time = Sys.time()
result = mean_dat_f(dat, 3)
Sys.time() - start_time
# Time difference of 4.236243 secs
start_time = Sys.time()
dat = dat_gen_f(13820, 15, 1)
Sys.time() - start_time
# Time difference of 0.4750271 secs
start_time = Sys.time()
result = mean_dat_f(dat, 3)
Sys.time() - start_time
# Time difference of 1.848106 secs