################################################# ### 000000000000000000000000000000000000000000 ################################################# #factorial: factorial<-function(n) gamma(n + 1) #bin: bin<-function(x, p) { y <- 1:p for(i in 1:p) y[i] <- 1 - (floor(x/2^(i - 1)) == 2 * floor(x/2^(i))) y } #haplo: haplo<-function(marker) { #marker=number of marker allhaplo <- as.matrix(0:(2^marker - 1)) allhaplo <- apply(allhaplo, 1, bin, p = marker) if(marker == 1) allhaplo <- t(as.matrix(allhaplo)) t(allhaplo) } ################################################ ## 1111111111111111111111111111111111111111111 ################################################ add.locus<- function(x, y, z, K = 2) { ###x: old (existing) haplo, y: new genotype, z: position of y, K: pool size. ### this function is designed for getting haplos based on ### existing haplo locus x. The general strategy is to obtain the ### haplotype configuration locus by locus. For example, The geno of first locus ### produce a simple haplotype (length 1, actually it is allele, the shortest ### haplotype, then add the second locus (z=2) genotype, y. Sometimes (for missing ### data) we need to add haplos based on known haplo. tmp <- haplo(2 * K) temp <- apply(tmp, 1, sum) all.permutations <- tmp[temp == y, ] all.permutations <- all.permutations * 2^(z - 1) if(!is.matrix(all.permutations)) all.permutations <- t(as.matrix(all.permutations)) Haplo <- NULL for(i in 1:nrow(x)) { x1 <- x[i, ] if(!is.matrix(x1)) x1 <- t(as.matrix(x1)) x2 <- sweep(all.permutations, 2, x1, "+") x2 <- t(apply(x2, 1, sort)) tmp <- apply(sweep(x2, 2, (max(x2) + 1)^(0:(2 * K - 1)), "*"), 1, sum) nd <- !duplicated(tmp) x21 <- x2[nd, ] if(!is.matrix(x21)) x21 <- t(as.matrix(x21)) Haplo <- rbind(Haplo, x21) } if(!is.matrix(Haplo)) Haplo <- t(as.matrix(Haplo)) label <- apply(sweep(Haplo, 2, (max(Haplo) + 1)^(0:(2 * K - 1)), "*"), 1, sum) o <- order(label) Haplo <- Haplo[o, ] if(!is.matrix(Haplo)) Haplo <- t(as.matrix(Haplo)) nd <- !duplicated(label) label.1 <- label[nd] Haplo <- Haplo[nd, ] if(!is.matrix(Haplo)) Haplo <- t(as.matrix(Haplo)) Haplo } ############################### #####22222222222222222222222222222 ################################ p2h <-function(geno, y = geno[-1], z = 2:length(geno), K = 2) { x <- c(rep(1, geno[1]), rep(0, 2 * K - geno[1])) #x: initial haplo configuration n <- length(geno) x <- t(as.matrix(x)) for(i in 1:(n - 1)) x <- add.locus(x, y[i], z[i], K = K) dimnames(x) <- list(NULL, paste("H", 1:(2 * K), sep = "")) ##x: haplo configurations for all loci. return(x + 1) } ############################################################# ########333333333333333333333333333333333333333333333333333 ############################################################## get.haplo <- function(geno, K.default = 2, sample.size = ifelse(is.matrix(geno), nrow(geno), 1), pool.size = rep(K.default, sample.size) ) { ## construct hapltyoe configuration for genotype samples "geno" of all pools. ## it deals wih missing data ## geno: matrix n*m, n: sample size, m: number of loci add.count <- function(x, K) { x <- x[1:(2 * K)] group.size <- table(x) count <- factorial(2 * K)/(prod(factorial(group.size))) c(x, count) } if(!is.matrix(geno)) geno <- t(as.matrix(geno)) # geno<-apply(geno, 2, as.numeric) m <- ncol(geno) n <- nrow(geno) n.h <- 2^m haplo.index <- 1:n.h # base <- 2^(0:(m - 1)) config <- haplo(m) ALL.HAPLO <- as.list(1:nrow(geno)) ## For data with missing values..start to impute for(i in 1:nrow(geno)) { K<-pool.size[i] x <- geno[i, ] complete.missing <- x == -1 partial.missing <- x == 2 * K + 1 missing <- complete.missing | partial.missing cm <- sum(complete.missing) pm <- sum(partial.missing) loci.cm <- (1:m)[complete.missing] loci.pm <- (1:m)[partial.missing] x.not.missing <- x.nm <- x[!missing] N <- (2 * K + 1)^sum(complete.missing) * (2 * K - 1)^sum(partial.missing) ### impute missing data... X <- matrix(rep(x, N), nrow = N, byrow = T) if(pm + cm == 0) X <- t(as.matrix(x)) if(pm + cm >= 1) { missing.pattern <- list(1:(cm + pm)) for(u in 1:(cm + pm)) { if(u <= cm) missing.pattern[[u]] <- 0:(2 * K) else missing.pattern[[u]] <- 1:(2 * K - 1) } X.missing <- expand.grid(missing.pattern) X.missing <- matrix(unlist(as.vector(X.missing) ), nrow = N) if(cm > 0) X[, complete.missing] <- X.missing[, 1: cm] if(pm > 0) X[, partial.missing] <- X.missing[, (( cm + 1):(cm + pm))] } #end of pm+cm>1 ## end of impute (X: imputed genotypes) ## FOR each possible imputed data (genotype), compute haplotypes... all.haplo <- NULL for(r in 1:nrow(X)) { x.imp <- X[r, ] x.imputed <- x.imp[missing] p2h(x.imp, K=K)->HAP if(!is.matrix(HAP)) HAP <- t(as.matrix(HAP)) all.haplo <- rbind(all.haplo, HAP) } # end of r in 1:nrow(X); tmp <- all.haplo[, 1:(2 * K)] if(!is.matrix(tmp)) tmp <- t(as.matrix(tmp)) B <- max(tmp) + 1 label <- apply(sweep(tmp, 2, B^(0:(2 * K - 1)), "*"), 1, sum) tmp.1 <- tmp[!duplicated(label), ] if(!is.matrix(tmp.1)) tmp.1 <- t(as.matrix(tmp.1)) label.1 <- sort(unique(label)) o <- order(unique(label)) if(is.matrix(tmp.1)) all.haplo.1 <- tmp.1[o, ] else { all.haplo.1 <- tmp.1 all.haplo.1 <- t(as.matrix(all.haplo.1)) } if(!is.matrix(all.haplo.1)) all.haplo.1 <- t(as.matrix(all.haplo.1)) dimnames(all.haplo.1) <- list(NULL, paste("h", 1:(2 * K ), sep = "")) # ALL.HAPLO[[i]] <- all.haplo.1 ALL.HAPLO[[i]] <- t(apply(all.haplo.1, 1, add.count, K = K)) } # end of i=1,...nrow ALL.HAPLO } ################################################# ## 44444444444444444444444444444444444444444444444 ################################################# ehv <- function(geno, pool.size = rep(K.default, sample.size), K.default = 2, sample.size = ifelse(is.matrix(geno), nrow(geno), 1), epsilon = 1e-06, iter.tolerance = 1e-05) { #### Function for computing mle for haplotype frequencies #### by using EM algorithm ALL.HAPLO <- get.haplo(geno, pool.size = pool.size) if(!is.matrix(geno)) geno <- t(as.matrix(geno)) # geno <- apply(geno, 2, as.numeric) n <- nrow(geno) m <- ncol(geno) n.h <- 2^m W <- rbind(diag(n.h - 1), rep(-1, n.h - 1)) h <- rep(1, n.h)/n.h #initial info.compute <- STOP <- F it <- 0 repeat { it <- it + 1 EH <- rep(0, n.h) score <- Score <- 0 INFO <- INFO1 <- 0 logL <- 0 for(i in 1:n) { K<- pool.size[i] HAPLO <- ALL.HAPLO[[i]][, - (2 * K + 1)] ### + 1 count <- ALL.HAPLO[[i]][, 2 * K + 1] if(!is.matrix(HAPLO)) HAPLO <- t(as.matrix(HAPLO)) tmp1 <- apply(HAPLO, 1, function(x, prob.h) { prod(prob.h[x]) } , prob.h = h) tmp1 <- tmp1 * count logL <- logL + log(sum(tmp1)) prob.of.haplo <- tmp1/sum(tmp1) Si <- eehh <- rep(0, n.h) Ai <- Bi <- matrix(0, n.h, n.h) for(k in 1:nrow(HAPLO)) { S.local <- rep(0, n.h) A.local <- B.local <- matrix(0, n.h, n.h) eh <- rep(0, 2^m) z <- HAPLO[k, ] count.local <- table(z) subscript <- sort(unique(z)) eh[subscript] <- eh[subscript] + count.local * prob.of.haplo[k] eehh <- eehh + eh if(info.compute) { S.local[subscript] <- count.local * prob.of.haplo[k] S.local[h > 0] <- S.local[h > 0]/h[h > 0] Si <- Si + S.local count.local <- as.vector(count.local) A.local[subscript, subscript] <- count.local %*% t(count.local) * prob.of.haplo[k] A.local[h > 0, h > 0] <- A.local[h > 0, h > 0 ]/(h[h > 0] %*% t(h[h > 0])) Ai <- Ai + (-1) * A.local b.local <- S.local b.local[h > 0] <- b.local[h > 0]/h[h > 0] diag(B.local) <- b.local Bi <- Bi + B.local STOP <- T } } # end of k EH <- EH + eehh Score <- Score + Si INFO <- INFO + Bi + Ai + Si %*% t(Si) } # end of i in 1:nrow(X)... h.new <- EH/(2 *sum(pool.size)) delta <- h.new - h h <- h.new # cat("iter=", it, "\t", "delta=", sum(abs(delta)), "\n") if(sum(abs(delta)) < iter.tolerance) info.compute <- T if(STOP) break } #end of repeat h[h < epsilon] <- 0 #nonzero<-PRESENT nonzero <- h != 0 n.nonzero <- sum(nonzero) info <- INFO[nonzero, nonzero] tmp <- rbind(cbind(info, rep(1, n.nonzero)), c(rep(1, n.nonzero), 0)) w <- rbind(diag(n.nonzero - 1), rep(-1, n.nonzero - 1)) V1 <- solve(tmp)[1:(n.nonzero), 1:(n.nonzero)] V <- matrix(0, n.h, n.h) V[nonzero, nonzero] <- V1 list(h = h, logL = logL, info = INFO, V = V) } ############################################## ###555555555555555 ############################################## #### no pool (classsical case) used to check the validity of eh eh1 <-function(geno, iter.tolerance=1e-6) { if(!is.matrix(geno)) geno <- t(as.matrix(geno)) m <- ncol(geno) n <- nrow(geno) n.h <- 2^m base <- 2^(0:(m - 1)) all.haplo <- as.list(1:n) names(all.haplo) <- paste(paste("HaploConfig for", 1:n), "-th individual", sep = "") for(i in 1:n) { x <- geno[i, ] ho2 <- x == 2 ho0 <- x == 0 x1 <- x[!ho2 & !ho0] excluded <- sum(base[ho2]) base1 <- base[!ho2 & !ho0] if(sum(!ho2 & !ho0) > 0) { m1 <- length(x1) tmp <- haplo(m1) tmp <- apply(sweep(tmp, 2, base1, "*"), 1, sum) k <- 2^(m1 - 1) all.haplo[[i]] <- cbind(tmp[1:k], tmp[(2 * k):(k + 1)]) + excluded } if(sum(!ho2 & !ho0) == 0) all.haplo[[i]] <- rep(excluded, 2) } haplo.pro <- function(x) { x <- x + 1 if(!is.matrix(x)) x <- t(as.matrix(x)) B <- max(x)+1 tmp <- apply(sweep(x, 2, B^(0:1), "*"), 1, sum) tmp <- x[!duplicated(tmp), ] if(!is.matrix(tmp)) tmp <- t(as.matrix(tmp)) tmp } all.haplo <- lapply(all.haplo, haplo.pro) ##done all.haplo h <- rep(1/n.h, n.h) info.compute <- STOP <- F repeat { EH <- eh <- rep(0, n.h) INFO <- Score <- 0 logL <- 0 for(i in 1:n) { HAPLO <- all.haplo[[i]] w <- rep(1, nrow(HAPLO)) eq <- HAPLO[, 1] == HAPLO[, 2] w[!eq] <- 2 if(!is.matrix(HAPLO)) HAPLO <- t(as.matrix(HAPLO)) tmp1 <- apply(HAPLO, 1, function(x, prob.h) { prod(prob.h[x]) } , prob.h = h) prob.of.haplo <- (tmp1 * w)/sum(tmp1 * w) ####then logL #logl <- log(sum(h[HAPLO[, 1]] * h[HAPLO[, 2]] * w)) logl <- log(sum(tmp1 * w)) logL <- logL + logl eh <- rep(0, n.h) eh[HAPLO[, 1]] <- eh[HAPLO[, 1]] + prob.of.haplo eh[HAPLO[, 2]] <- eh[HAPLO[, 2]] + prob.of.haplo EH <- EH + eh if(info.compute) { Si <- rep(0, n.h) SSi <- Bi <- matrix(0, n.h, n.h) for(k in 1:nrow(HAPLO)) { S.local <- rep(0, n.h) SS.local <- B.local <- matrix(0, n.h, n.h) z <- HAPLO[k, ] count.local <- table(z) subscript <- sort(unique(z)) S.local[subscript] <- count.local * prob.of.haplo[k] S.local[h > 0] <- S.local[h > 0]/h[h > 0] Si <- Si + S.local count.local <- as.vector(count.local) SS.local[subscript, subscript] <- count.local %*% t(count.local) * prob.of.haplo[k] SS.local[h > 0, h > 0] <- SS.local[h > 0, h > 0]/(h[h > 0] %*% t(h[h > 0])) SSi <- SSi + (-1) * SS.local b.local <- S.local b.local[h > 0] <- b.local[h > 0]/h[h > 0] diag(B.local) <- b.local Bi <- Bi + B.local } ## end in k Score <- Score + Si INFO <- INFO + Bi + SSi + Si %*% t(Si) STOP <- T } ###end of computing INFO and logL. } #### <-------- end of i. h.new <- EH/(2 * n) delta <- h.new - h h <- h.new # print(sum(abs(delta))) if(sum(abs(delta)) < iter.tolerance) info.compute <- T if(STOP) break } h[h < 1e-06] <- 0 zero <- (h == 0) INFO <- INFO[!zero, !zero] n.h <- sum(!zero) W <- rbind(diag(n.h - 1), rep(-1, n.h - 1)) A <- t(W) %*% INFO %*% W if(all((eigen(A)[[1]]) != 0)) V.nonzero <- W %*% solve(A) %*% t(W) else V <- "sigular informatrix" V <- matrix(0, 2^m, 2^m) V[!zero, !zero] <- V.nonzero list(h = h, V = V, logL = logL) } ############################################## ### 666666666666666666666666666666666666666 ############################################## eh<- function(geno, pool.size = rep(K.default, sample.size), K.default=2, sample.size = ifelse(is.matrix(geno), nrow(geno), 1), epsilon = 1e-06, iter.tolerance = 1e-05) { #### Function for computing mle for haplotype frequencies by using EM algorithm ALL.HAPLO <- get.haplo(geno, pool.size = pool.size) if(!is.matrix(geno)) geno <- t(as.matrix(geno)) #geno <- apply(geno, 2, as.numeric) n <- nrow(geno) m <- ncol(geno) n.h <- 2^m W <- rbind(diag(n.h - 1), rep(-1, n.h - 1)) h <- rep(1, n.h)/n.h #initial it <- 0 repeat { it <- it + 1 EH <- rep(0, n.h) logL <- 0 for(i in 1:n) { K <- pool.size[i] HAPLO <- ALL.HAPLO[[i]][, - (2 * K + 1)] ### + 1 count <- ALL.HAPLO[[i]][, 2 * K + 1] if(!is.matrix(HAPLO)) HAPLO <- t(as.matrix(HAPLO)) tmp1 <- apply(HAPLO, 1, function(x, prob.h) { prod(prob.h[x]) } , prob.h = h) tmp1 <- tmp1 * count logL <- logL + log(sum(tmp1)) prob.of.haplo <- tmp1/sum(tmp1) eehh <- rep(0, n.h) for(k in 1:nrow(HAPLO)) { eh <- rep(0, 2^m) z <- HAPLO[k, ] count.local <- table(z) subscript <- sort(unique(z)) eh[subscript] <- eh[subscript] + count.local * prob.of.haplo[k] eehh <- eehh + eh } # end of k EH <- EH + eehh } # end of i in 1:nrow(X)... h.new <- EH/(2 * sum(pool.size)) delta <- h.new - h h <- h.new # cat("iter=", it, "\t", "delta=", sum(abs(delta)), "\n") if(sum(abs(delta)) < iter.tolerance) break } #end of repeat h[h < epsilon] <- 0 estimate <- data.frame(haplo(m), h) dimnames(estimate)[[2]] <- c(paste("snp", 1:m, sep=""), "freq") estimate[h>0, ]->estimate list(estimate=estimate, logL = logL) } ################################################ ## FUNCTION: LD.pair ## Description: Compute pairwise LD coefficient ## Input: haplotype frequencies (vector). ################################################ LD.pair <- function(freq,ld="dprime") { p <- log(length(freq), base=2) two.locus <- function(x, ld="dprime") { pA <- x[1, 1] + x[1, 2] pB <- x[1, 1] + x[2, 1] pAB <- x[1, 1] numerator <- pAB - pA * pB if (ld=="dprime"){ if(numerator >= 0) denominator <- min(pA * (1 - pB), pB * (1 - pA)) else denominator <- min(pA * pB, (1 - pA) * (1 - pB)) } else denominator <- sqrt(pA*pB*(1-pA)*(1-pB)) return(abs(numerator)/denominator) } D <- matrix(1, p, p) x <- cbind(1:(2^p), freq) haplos <- NULL for(i in 1:(2^p)) haplos <- rbind(haplos, bin(x[i, 1] - 1, p)) x <- cbind(haplos, x[, 2]) for(i in 1:(p - 1)) { for(j in (i + 1):p) { haplo <- x[, c(i, j)] h <- x[, p + 1] table <- matrix(0, 2, 2) table[1, 1] <- sum(h[haplo[, 1] == 0 & haplo[, 2] == 0]) table[1, 2] <- sum(h[haplo[, 1] == 0 & haplo[, 2] == 1]) table[2, 1] <- sum(h[haplo[, 1] == 1 & haplo[, 2] == 0]) table[2, 2] <- sum(h[haplo[, 1] == 1 & haplo[, 2] == 1]) D[i, j] <- D[j, i] <- two.locus(table, ld=ld) } } D } ######################################################