# famFLM: Region-based association test for familial data under functional linear models # 02-10-2014 # Requires R-libraries: 'GenABEL' (for null model estimation) and 'fda' (for functional basis estimation). # To enable parallel calculation, libraries 'parallel' and 'doSNOW' should be installed. # USAGE: # famFLM (formula, phenodata, genodata, positions, kin, nullmod, return.nullmod = FALSE, reg = NULL, # sliding.window = c(20, 10), mode = 'add', betaspline = 'bspline', bbasis = 15, order = 4, fbasis = 25, # genospline = FALSE, stat = 'F', return.time = FALSE, ncores = 1) # ARGUMENTS: # 'formula': referring to the column(s) in 'phenodata' to be analyzed as outcome and, # if needed, covariates. # 'phenodata': a data frame containing columns mentioned in 'formula' (trait to analyze and, # if needed, covariates). # 'genodata': a data frame or matrix (with ids given in rows) containing genotypes # coded as AA = 0, Aa = 1 and aa = 2, where a is a minor allele. NAs in 'genodata' # will be imputed by the mean values. Monomorphic and duplicated variants will be omitted. # 'genodata' object can also be of 'gwaa.data-class' or 'snp.data-class'. # 'positions': a vector of physical positions for genetic variants in 'genodata'. # 'kin': a kinship matrix to evaluate the null model. It can be calculated from a pedigree # (e.g. using 'kinship2' package) or from genotypic data (with 'ibs' function of GenABEL). # 'nullmod': an object of 'polygenic' data class. Setting 'nullmod' allows # to avoid re-estimation of the null model that does not depend on genotypes and can be # calculated once for a trait. 'nullmod' object in proper format can be obtained by running # 'famFLM' with 'return.nullmod = TRUE'. # 'return.nullmod': logical value indicating whether a 'nullmod' object should be returned. # 'reg': a vector assigning a region for each genetic marker in 'genodata'. # If NULL, 'sliding.window' parameters are used. # 'sliding.window': the sliding window size and step. Has no effect if 'reg' is defined. # 'mode': the mode of inheritance ('add', 'dom' or 'rec' for additive, dominant or recessive modes, # respectively). For dominant (recessive) mode genotypes will be recoded as AA = 0, Aa = 1 and aa = 1 # (AA = 0, Aa = 0 and aa = 1), where a is a minor allele. Default mode is additive. # 'betaspline': a basis function type used for beta-smooth, 'bspline' (B-spline basis, default) or # 'fspline' (Fourier basis). # 'bbasis': the number of basis functions to be used in 'bspline' (default = 15). # 'order': a polynomial order to be used in 'bspline' (default = 4 corresponds to the cubic B-splines). # 'fbasis': the number of basis functions to be used in 'fspline' (default = 25). # 'genospline': a basis function type used for genetic variant functions (GVF). Can be set to # 'bspline' or 'fspline'. The default 'genospline' = FALSE assumes beta-smooth only. # 'stat': the statistic to be used to calculate the P values. One of 'F' (default), 'Chisq', 'LRT'. # 'return.time': a logical value indicating whether the running time should be returned. # 'ncores': the number of CPUs for parallel calculations (default = 1). # VALUES: # 'results': a data frame containing P value, the number of variants and informative polymorphic # variants for each one of analyzed regions. # 'sample.size': after omitting NA's in trait and, if used, covariates. # 'nullmod': a null model object of class 'polygenic', returned if 'return.nullmod = TRUE'. # 'time': real and CPU time (in seconds) taken to perform the famFLM analysis ('total') # and in particular to calculate the null model ('null') and regional analysis ('regions'). # See the description of 'proc.time()' R-function for details. Returned if 'return.time = TRUE'. # AUTHORS: # Nadezhda Belonogova belon@bionet.nsc.ru # Gulnara Svishcheva gulsvi@mail.ru # EXAMPLES # source('famFLM.R') # load('example.data.RData') ### Run famFLM with B-spline basis and beta-smooth only (default): # out1 <- famFLM(trait ~ age + sex, phenodata, genodata, snpdata$position, kin, # return.nullmod = TRUE, reg = snpdata$gene) ### Run famFLM with B-spline basis for beta-smooth, Fourier basis for genetic variant functions, ### and using null model obtained in first run: # out2 <- famFLM(nullmod = out1$nullmod, phenodata = phenodata, # genodata = genodata, positions = snpdata$position, reg = snpdata$gene, genospline = 'fspline') ############################## famFLM <- function(formula, phenodata, genodata, positions, kin, nullmod, return.nullmod = FALSE, reg = NULL, sliding.window = c(20, 10), mode = 'add', betaspline = 'bspline', bbasis = 15, order = 4, fbasis = 25, genospline = FALSE, stat = 'F', return.time = FALSE, ncores = 1) { t0 <- proc.time() library(fda) library(GenABEL) if (!mode %in% c('add', 'dom', 'rec')) stop("Mode should be one of 'add', 'dom', 'rec'") if (class(genodata) == 'snp.data') { typ <- 1 } else if (class(genodata) == 'gwaa.data') { genodata <- genodata@gtdata typ <- 1 } else if (class(genodata) %in% c('matrix', 'data.frame')) { typ <- 2 } else { stop("genodata should be of gwaa.data, matrix or data.frame class") } if (!missing(phenodata)) { if (class(phenodata) == 'gwaa.data') { phenodata <- phenodata@phdata } else if (!class(phenodata) %in% c('matrix', 'data.frame')) { stop("phenodata should be of gwaa.data or data.frame class") } if (dim(genodata)[1] != dim(phenodata)[1]) stop("Dimensions of phenodata and genodata do not match") } k <- dim(genodata)[2] if (!is.null(reg)) { if (length(reg) != k) stop("Dimensions of reg and genodata do not match") l <- unique(reg) nreg <- length(l) } else { if (k < sliding.window[1]) stop("Window size should not exceed the number of genetic variants") if (sliding.window[1] < 1) stop("Window size should be >= 1") if (sliding.window[2] < 1) stop("Step should be >= 1") nreg <- floor((k - sliding.window[1]) / sliding.window[2] + 1) p1 <- sapply(1:nreg, function(i) (i - 1)*sliding.window[2] + 1) p2 <- p1 + sliding.window[1] - 1 l <- paste(p1, p2, sep = "_") } if (missing(nullmod) & missing(formula)) stop("Either formula or nullmod should be given") if (!missing(formula)) { if (is(try(formula, silent = TRUE), "try-error")) { formula <- phenodata[[as(match.call()[["formula"]], "character")]] } else { if (!is(try(as.formula(formula), silent = TRUE), "try-error")) {formula <- as.formula(formula)} } } run.null <- 0 if (!missing(nullmod)) { if (!missing(formula)) { if (is.null(nullmod$call$formula)) { warning("nullmod object has no 'call' info, running null model...") run.null <- 1 } else { run.null <- 1 if (!is(try(nullmod$call$formula, silent = TRUE), "try-error") & !is(try(formula, silent = TRUE), "try-error")) { if (length(nullmod$call$formula ) == length(formula)) { if (nullmod$call$formula == formula) run.null <- 0 } } if (!is(try(as.formula(formula), silent = TRUE), "try-error") & !is(try(as.formula(nullmod$call$formula), silent = TRUE), "try-error")) { if (as.formula(formula) == as.formula(nullmod$call$formula)) run.null <- 0 } if (!is(try(phenodata[, as.character(nullmod$call$formula)], silent = TRUE), "try-error")) { nf <- phenodata[, as.character(nullmod$call$formula)] if (length(nf) == length(formula)) { if (all.equal(nf, formula)) run.null <- 0 } } if (run.null) warning("nullmod object has different call, running null model...") } } else { if (!is.null(nullmod$call$formula)) { if (length(as.character(nullmod$call$formula)) > 1) { formula <- nullmod$call$formula cat("Formula from nullmod call is in effect\n") } } } if (!run.null) { if (is.null(nullmod$res) & is.null(nullmod$Inv)) { warning("nullmod object has no InvSigma and residualY data, running null model...") run.null <- 1 } } } else { run.null <- 1 } if (run.null) { t00 <- proc.time() if (!missing(formula)) { if (missing(kin)) stop("Kinship matrix is missing") if (missing(phenodata)) stop("Phenotype data is missing") if (!missing(kin) & !missing(phenodata)) { phenodata <- as.data.frame(phenodata) if (!is(try(as.formula(formula), silent = TRUE), "try-error")) { nullmod <- polygenic(as.formula(formula), data = phenodata, kinship.matrix = kin, quiet = TRUE) nullmod$call$formula <- as.formula(formula) } else { nullmod <- polygenic(formula, data = phenodata, kinship.matrix = kin, quiet = TRUE) nullmod$call$formula <- formula } } } else { stop("No formula to run null model") } ttt0 <- (proc.time() - t00) } else { ttt0 <- 'not run' if (!missing(phenodata)) { if (dim(phenodata)[1] != length(nullmod$res)) stop("Dimensions of nullmod phenotype and covariates do not match") } if (dim(genodata)[1] != length(nullmod$res)) stop("Dimensions of nullmod phenotype and genodata do not match") } SIGMAi <- nullmod$Inv res <- nullmod$res if (dim(SIGMAi)[1] != length(res)) { if (!is.null(rownames(SIGMAi)) & !is.null(rownames(genodata)) & !is.null(names(nullmod$res))) { vec <- match(rownames(SIGMAi), names(nullmod$res)) res <- nullmod$res[vec] vec <- match(rownames(SIGMAi), rownames(genodata)) genodata <- genodata[vec, ] } else { if (is(try(nullmod$measuredIDs, silent = TRUE), "try-error")) stop("Dimensions of nullmod residuals and InvSigma do not match") res <- nullmod$res[nullmod$measuredIDs] genodata <- genodata[nullmod$measuredIDs, ] } } n <- length(res) X <- matrix(rep(1, n), n, 1) if (!missing(formula)) { if (!is(try(as.formula(formula), silent = TRUE), "try-error")) { if (is(try(covariates <- as.matrix(model.frame(formula, phenodata)), silent = TRUE), "try-error")) stop("Check whether all covariates are given in phenodata") covariates <- as.matrix(model.frame(formula, phenodata, na.action = na.omit, drop.unused.levels = TRUE)[, -1]) if (dim(covariates)[1] != n) stop("Dimensions of nullmod phenotype and covariates do not match") X <- as.matrix(cbind(matrix(rep(1, n), n, 1), covariates)) } } if (ncores < 1) ncores <- 1 if (ncores > 1) { library(parallel) library(doSNOW) ncores <- min(ncores,nreg) if (ncores > detectCores()) { ncores <- detectCores() warning("Number of CPUs available: ", ncores) } } t00 <- proc.time() dis <- nullmod$h2an$est[length(nullmod$h2an$est)] Cori <- dis * SIGMAi CholInvCor <- chol(Cori) covariate <- CholInvCor %*% X pheno <- CholInvCor %*% res # making independent pheno P11 <- diag(n) - covariate %*% solve(t(covariate) %*% covariate, t(covariate)) P11CholInvCor <- P11 %*% CholInvCor ### definition of set of basis functions 'genobasis' and 'betabasis' to transform genotypes and their effects g <- TRUE if (missing(genospline)) g <- FALSE if (is.null(genospline)) g <- FALSE if (is.logical(genospline)) { if (!genospline) g <- FALSE } if (betaspline == 'bspline') { betabasis0 <- create.bspline.basis(norder = order, nbasis = bbasis) order0 <- order bbasis0 <- bbasis fbasis0 <- FALSE } else if (betaspline == 'fspline') { betabasis0 <- create.fourier.basis(c(0, 1), nbasis = fbasis) order0 <- FALSE bbasis0 <- FALSE fbasis0 <- fbasis } else { stop("betaspline should be either bspline or fspline") } if (g) { if (genospline == 'bspline') { genobasis0 <- create.bspline.basis(norder = order, nbasis = bbasis) if (!bbasis0) { order0 <- order bbasis0 <- bbasis } } else if (genospline == 'fspline') { genobasis0 <- create.fourier.basis(c(0, 1), nbasis = fbasis) if (!fbasis0) fbasis0 <- fbasis } else { stop("genospline should be either bspline or fspline") } J0 <- inprod(genobasis0, betabasis0) } #===================================================================================== analyse_FLM <- function(region, pheno, covariate) { if (typ == 1) Z <- as.numeric(genodata[, region]) if (typ == 2) Z <- genodata[, region] pos <- positions[region] Z <- as.matrix(Z) m0 <- dim(Z)[2] v <- sapply(1:dim(Z)[2], function(x) var(Z[, x], na.rm = TRUE)) Z <- as.matrix(Z[, v > 0]) pos <- pos[v > 0] if (dim(Z)[2] == 0) { warning("No polymorphic variants in region ", r, ", skipped") return(c('NA', m0, 0)) } MAF <- colMeans(Z, na.rm = TRUE) / 2 if (any(MAF > .5)) warning("Check whether minor allele homozygotes are coded as 2 at region ", r) if (mode == 'dom') { Z <- round(Z) Z[Z == 2] <- 1 } if (mode == 'rec') { Z <- round(Z) Z[Z == 1] <- 0 Z[Z == 2] <- 1 } # impute missing genotypes for (i in 1:dim(Z)[2]) { Z[is.na(Z[, i]), i] <- mean(Z[, i], na.rm = TRUE) } # detection and elimination of genetic variants, which are linear-dependent from other dqr <- qr(Z) index <- dqr$pivot[1:dqr$rank] Z <- as.matrix(Z[, index]) pos <- as.vector(pos[index]) if (dim(Z)[2] == 0) { warning("No polymorphic variants in region ", r, ", skipped") return(c('NA', m0, 0)) } #----------------------------------------------------------------- if ((bbasis0 & (dim(Z)[2] < bbasis0)) | (fbasis0 & (dim(Z)[2] < fbasis0))) { if ((betaspline == 'bspline') & (dim(Z)[2] < bbasis0)) { order <- min(order0, dim(Z)[2]) betabasis <- create.bspline.basis(norder = order, nbasis = dim(Z)[2]) } else if ((betaspline == 'fspline') & (dim(Z)[2] < fbasis0)) { if (dim(Z)[2] %% 2) { betabasis <- create.fourier.basis(c(0, 1), nbasis = dim(Z)[2]) } else { betabasis <- create.fourier.basis(c(0, 1), nbasis = dim(Z)[2] + 1) } } else { betabasis <- betabasis0 } if (g) { if ((genospline == 'bspline') & (dim(Z)[2] < bbasis0)) { order <- min(order0, dim(Z)[2]) genobasis <- create.bspline.basis(norder = order, nbasis = dim(Z)[2]) } else if ((genospline == 'fspline') & (dim(Z)[2] < fbasis0)) { if (dim(Z)[2] %% 2) { genobasis <- create.fourier.basis(c(0, 1), nbasis = dim(Z)[2]) } else { genobasis <- create.fourier.basis(c(0, 1), nbasis = dim(Z)[2] + 1) } } else { genobasis <- genobasis0 } J <- inprod(genobasis, betabasis) } } else { if (g) { genobasis <- genobasis0 J <- J0 } else { betabasis <- betabasis0 } } geno <- P11CholInvCor %*% Z # making centralized and independent genotypes ### Calculation of matrix FI in given positions of a region if (max(pos) > min(pos)) pos <- (pos - min(pos)) / (max(pos) - min(pos)) if (max(pos) == min(pos)) pos <- .5 if (g) { B <- eval.basis(pos, genobasis) ### Calculation of formula (1) for functional genotypes (depend on ONLY positions) FFF <- ginv(t(B) %*% B) %*% t(B) U <- geno %*% t(FFF) UJ <- matrix( U %*% J, ncol = ncol(J)) } else { B <- eval.basis(pos, betabasis) UJ <- geno %*% B } fit <- glm(pheno ~ covariate + UJ - 1, family = "gaussian") if (stat == 'F') { p <- anova(fit, test = stat)[3, 6] } else { p <- anova(fit, test = stat)[3, 5] } c(p, m0, dim(Z)[2]) } #================================================================================ if (ncores == 1) { out <- data.frame(region = l, pvalue = rep(NA, nreg), markers = rep(NA, nreg), polymorphic.markers = rep(NA, nreg)) for (i in 1:nreg) { r <- as.character(l[i]) if (!is.null(reg)) { region = reg == l[i] } else { region = p1[i]:p2[i] } out[i, 2:4] <- analyse_FLM(region, pheno, covariate) } } else { nj <- floor(nreg / ncores) vj <- rep(nj, ncores) nadd <- nreg %% ncores if (nadd > 0) vj[1:nadd] <- nj + 1 cl <- makeCluster(ncores, type = 'SOCK') registerDoSNOW(cl) out <- foreach(j = 1:ncores, .combine = rbind, .inorder = F) %dopar% { if (typ == 1) { library(GenABEL) } else if (g) { library(MASS) } library(fda) i0 <- sum(vj[1:(j-1)]) * (j > 1) + 1 out <- c() for (i in i0:(i0 + vj[j] - 1)) { r <- as.character(l[i]) if (!is.null(reg)) { region = reg == l[i] } else { region = p1[i]:p2[i] } out <- rbind(out, c(r, analyse_FLM(region, pheno, covariate))) } out } stopCluster(cl) out <- as.data.frame(out) out[, 2:4] <- sapply(2:4, function(x) as.numeric(as.character(out[, x]))) colnames(out) <- c('region', 'pvalue', 'markers', 'polymorphic.markers') out <- out[match(l, out$region), ] } if (is.null(reg)) { if (p2[nreg] < k) { out <- rbind(out, rep(NA)) i <- nreg + 1 p1 <- (i - 1) * sliding.window[2] + 1 p2 <- k r <- rownames(out)[i] <- paste(p1, p2, sep = "_") out$pvalue[i] <- analyse_FLM(p1:p2, pheno, covariate) } } ttt <- (proc.time() - t00) out0 <- out out <- c() out$results <- out0 out$sample.size <- n if (return.nullmod) out$nullmod <- nullmod if (return.time) { out$time$regions <- ttt out$time$null <- ttt0 ttt <- (proc.time() - t0) out$time$total <- ttt } out }