2014/8/21 22:28更新:
以一個 10000 => 12 的 case 測試(dat 的 row 數 => ref_m 的 row 數),這個寫法比下面原本的寫法快 1230 倍,是目前測試過的寫法中最快的。
# Vectorized assignment of probes to genes.
#
# ref_m: one row per gene (name, chromosome, start and end coordinate).
# dat:   one row per probe (name, chromosome, start and end coordinate).
# A probe is assigned to a gene when both are on the same chromosome and the
# probe interval [Chr_s, Chr_e] lies inside [gene_start, gene_end].
ref_m <- data.frame(
  gene_a = c("A", "B", "C"),
  Chromosome = c("1", "X", "2"),
  gene_start = as.integer(c(25000, 1000, 0)),
  gene_end = as.integer(c(50000, 2000, 800)),
  stringsAsFactors = FALSE
)
dat <- data.frame(
  Probe_b = c("a1", "a2", "a3", "a4", "a5"),
  Chromosome2 = c("2", "4", "1", "X", "1"),
  Chr_s = as.integer(c(175, 600, 23575, 1010, 30000)),
  Chr_e = as.integer(c(200, 625, 23600, 1035, 30025)),
  stringsAsFactors = FALSE
)
dat$Probe_b <- as.character(dat$Probe_b)
ref_m$gene_a <- as.character(ref_m$gene_a)

# Row of ref_m whose chromosome equals each probe's chromosome (NA when the
# chromosome is absent from ref_m). match() reports only the FIRST matching
# row, so at most one gene per chromosome is considered.
loc_ref_v <- match(dat$Chromosome2, ref_m$Chromosome)

# Mask over the rows of dat: TRUE where the probe's chromosome was found.
# It must keep one entry per dat row; dropping the NA entries from the mask
# itself would shorten it and break the alignment with dat.
loc_dat_v <- !is.na(loc_ref_v)
loc_ref_v <- loc_ref_v[loc_dat_v]

# Among the chromosome-matched probes, keep those fully inside the gene.
loc <- dat$Chr_s[loc_dat_v] >= ref_m$gene_start[loc_ref_v] &
  dat$Chr_e[loc_dat_v] <= ref_m$gene_end[loc_ref_v]

# Group the surviving probe names by the gene that contains them.
probes_by_gene <- tapply(
  dat$Probe_b[loc_dat_v][loc],
  ref_m$gene_a[loc_ref_v[loc]],
  c
)
probes_by_gene
之前的:
借上面的資料一用,code如下:
# Sample inputs for the row-by-row version.
# ref_m: one row per gene -- name, chromosome, start and end coordinate.
ref_m <- data.frame(
  gene_a = c("A", "B", "C"),
  Chromosome = c(1, "X", 2),
  gene_s = c(25000, 1000, 0),
  gene_e = c(50000, 2000, 800)
)
# dat: one row per probe -- name, chromosome, start and end coordinate.
dat <- data.frame(
  Probe_b = c("a1", "a2", "a3", "a4", "a5"),
  Chromosome2 = c(2, 4, 1, "X", 1),
  chr_s = c(175, 600, 23575, 1010, 30000),
  chr_e = c(200, 625, 23600, 1035, 30025)
)
# Find the gene whose interval fully contains a single probe.
#
# Args:
#   v:     data.frame holding one probe, with columns Chromosome2, chr_s
#          and chr_e.
#   ref_m: data.frame of genes with columns Chromosome, gene_s, gene_e and
#          gene_a.
#
# Returns:
#   gene_a (as character) of the first ref_m row whose Chromosome equals
#   v$Chromosome2, provided chr_s >= gene_s and chr_e <= gene_e.
#   NA_character_ when the chromosome is not in ref_m, when the probe is not
#   inside the gene, when a coordinate is NA, or when v has no rows.
check_f <- function(v, ref_m) {
  loc <- match(v$Chromosome2, ref_m$Chromosome, nomatch = 0L)
  if (length(loc) > 1L) {
    # The scalar tests below are only meaningful for one probe at a time.
    stop("check_f() expects `v` to hold a single probe (one row)",
         call. = FALSE)
  }
  # length 0: v had no rows; 0: chromosome not present in ref_m.
  if (length(loc) == 0L || loc == 0L) {
    return(NA_character_)
  }
  inside <- v$chr_s >= ref_m$gene_s[loc] && v$chr_e <= ref_m$gene_e[loc]
  # isTRUE() turns an NA comparison (missing coordinate) into "no match"
  # instead of letting `if` fail on a missing value.
  if (isTRUE(inside)) {
    as.character(ref_m$gene_a[loc])
  } else {
    NA_character_
  }
}
# Assign every probe (row of dat) to a gene one row at a time, then group the
# probe names by the gene they fall in.
# result[i] holds the gene name for dat row i, or NA when check_f() finds none.
result <- rep(NA_character_, nrow(dat))
# seq_len() gives an empty sequence for a zero-row dat, whereas 1:nrow(dat)
# would iterate over c(1, 0).
for (i in seq_len(nrow(dat))) {
  result[i] <- check_f(dat[i, ], ref_m)
}
# Probes whose result is NA belong to no group and are left out by tapply().
tapply(as.character(dat$Probe_b), result, c)