# famFLM: Region-based association test for familial data under functional linear models
# 02-10-2014

# Requires R-libraries: 'GenABEL' (for null model estimation) and 'fda' (for functional basis estimation).
# To enable parallel calculation, libraries 'parallel' and 'doSNOW' should be installed.

# USAGE:

# famFLM (formula, phenodata, genodata, positions, kin, nullmod, return.nullmod = FALSE, reg = NULL,
# sliding.window = c(20, 10), mode = 'add', betaspline = 'bspline', bbasis = 15, order = 4, fbasis = 25,
# genospline = FALSE, stat = 'F', return.time = FALSE, ncores = 1)

# ARGUMENTS:

#	'formula': referring to the column(s) in 'phenodata' to be analyzed as outcome and,
#	if needed, covariates.

#	'phenodata': a data frame containing columns mentioned in 'formula' (trait to analyze and,
#	if needed, covariates).

#	'genodata': a data frame or matrix (with ids given in rows) containing genotypes 
#	coded as AA = 0, Aa = 1 and aa = 2, where a is a minor allele. NAs in 'genodata'
#	will be imputed by the mean values. Monomorphic and duplicated variants will be omitted.
#	'genodata' object can also be of 'gwaa.data-class' or 'snp.data-class'. 

#	'positions': a vector of physical positions for genetic variants in 'genodata'.

#	'kin': a kinship matrix to evaluate the null model. It can be calculated from a pedigree
#	(e.g. using 'kinship2' package) or from genotypic data (with 'ibs' function of GenABEL).

#	'nullmod': an object of 'polygenic' data class. Setting 'nullmod' allows
#	to avoid re-estimation of the null model that does not depend on genotypes and can be
#	calculated once for a trait. 'nullmod' object in proper format can be obtained by running
#	'famFLM' with 'return.nullmod = TRUE'.

#	'return.nullmod': logical value indicating whether a 'nullmod' object should be returned.

#	'reg': a vector assigning a region for each genetic marker in 'genodata'.
#	If NULL, 'sliding.window' parameters are used.

#	'sliding.window': the sliding window size and step. Has no effect if 'reg' is defined.

#	'mode': the mode of inheritance ('add', 'dom' or 'rec' for additive, dominant or recessive modes,
#	respectively). For dominant (recessive) mode genotypes will be recoded as AA = 0, Aa = 1 and aa = 1
#	(AA = 0, Aa = 0 and aa = 1), where a is a minor allele. Default mode is additive.

#	'betaspline': a basis function type used for beta-smooth, 'bspline' (B-spline basis, default) or
#	'fspline' (Fourier basis).

#	'bbasis': the number of basis functions to be used in 'bspline' (default = 15).

#	'order': a polynomial order to be used in 'bspline' (default = 4 corresponds to the cubic B-splines).

#	'fbasis': the number of basis functions to be used in 'fspline' (default = 25).

#	'genospline': a basis function type used for genetic variant functions (GVF). Can be set to
#	'bspline' or 'fspline'. The default 'genospline' = FALSE assumes beta-smooth only.

#	'stat': the statistic to be used to calculate the P values. One of 'F' (default), 'Chisq', 'LRT'.

#	'return.time': a logical value indicating whether the running time should be returned.

#	'ncores': the number of CPUs for parallel calculations (default = 1).

#	VALUES:

#	'results': a data frame containing P value, the number of variants and informative polymorphic
#	variants for each one of analyzed regions.

#	'sample.size': after omitting NA's in trait and, if used, covariates.

#	'nullmod': a null model object of class 'polygenic', returned if 'return.nullmod = TRUE'.

#	'time': real and CPU time (in seconds) taken to perform the famFLM analysis ('total')
#	and in particular to calculate the null model ('null') and regional analysis ('regions').
#	See the description of 'proc.time()' R-function for details. Returned if 'return.time = TRUE'.

#	AUTHORS:

#	Nadezhda Belonogova belon@bionet.nsc.ru
#	Gulnara Svishcheva gulsvi@mail.ru

#	EXAMPLES

# source('famFLM.R')
# load('example.data.RData')

### Run famFLM with B-spline basis and beta-smooth only (default):

# out1 <- famFLM(trait ~ age + sex, phenodata, genodata, snpdata$position, kin,
#	return.nullmod = TRUE, reg = snpdata$gene)

### Run famFLM with B-spline basis for beta-smooth, Fourier basis for genetic variant functions,
### and using null model obtained in first run:

# out2 <- famFLM(nullmod = out1$nullmod, phenodata = phenodata,
#	genodata = genodata, positions = snpdata$position, reg = snpdata$gene, genospline = 'fspline')

##############################

famFLM <- function(formula, phenodata, genodata, positions, kin, nullmod, return.nullmod = FALSE, reg = NULL,
sliding.window = c(20, 10), mode = 'add', betaspline = 'bspline', bbasis = 15, order = 4, fbasis = 25,
genospline = FALSE, stat = 'F', return.time = FALSE, ncores = 1) {

t0 <- proc.time()

library(fda)
library(GenABEL)

if (!mode %in% c('add', 'dom', 'rec')) stop("Mode should be one of 'add', 'dom', 'rec'")

if (class(genodata) == 'snp.data') {
	typ <- 1
} else if (class(genodata) == 'gwaa.data') {
	genodata <- genodata@gtdata
	typ <- 1
} else if (class(genodata) %in% c('matrix', 'data.frame')) {
	typ <- 2
} else {
	stop("genodata should be of gwaa.data, matrix or data.frame class")
}

if (!missing(phenodata)) {
	if (class(phenodata) == 'gwaa.data') {
		phenodata <- phenodata@phdata
	} else if (!class(phenodata) %in% c('matrix', 'data.frame')) {
		stop("phenodata should be of gwaa.data or data.frame class")
	}
	if (dim(genodata)[1] != dim(phenodata)[1]) stop("Dimensions of phenodata and genodata do not match")
}

k <- dim(genodata)[2]

if (!is.null(reg)) {
	if (length(reg) != k) stop("Dimensions of reg and genodata do not match")
	l <- unique(reg)
	nreg <- length(l)
} else {
	if (k < sliding.window[1]) stop("Window size should not exceed the number of genetic variants")
	if (sliding.window[1] < 1) stop("Window size should be >= 1")
	if (sliding.window[2] < 1) stop("Step should be >= 1")
	nreg <- floor((k - sliding.window[1]) / sliding.window[2] + 1)
	p1 <- sapply(1:nreg, function(i) (i - 1)*sliding.window[2] + 1)
	p2 <- p1 + sliding.window[1] - 1
	l <- paste(p1, p2, sep = "_")
}

if (missing(nullmod) & missing(formula)) stop("Either formula or nullmod should be given")

if (!missing(formula)) {
	if (is(try(formula, silent = TRUE), "try-error")) {
		formula <- phenodata[[as(match.call()[["formula"]], "character")]]
	} else {
		if (!is(try(as.formula(formula), silent = TRUE), "try-error")) {formula <- as.formula(formula)}
	}
}

run.null <- 0

if (!missing(nullmod)) {
	if (!missing(formula)) {
		if (is.null(nullmod$call$formula)) {
			warning("nullmod object has no 'call' info, running null model...")
			run.null <- 1
		} else {
			run.null <- 1
			if (!is(try(nullmod$call$formula, silent = TRUE), "try-error") & !is(try(formula, silent = TRUE), "try-error")) {
				if (length(nullmod$call$formula ) == length(formula)) {
					if (nullmod$call$formula == formula) run.null <- 0
				}
			}
			if (!is(try(as.formula(formula), silent = TRUE), "try-error") & !is(try(as.formula(nullmod$call$formula), silent = TRUE), "try-error")) {
				if (as.formula(formula) == as.formula(nullmod$call$formula)) run.null <- 0
			}
			if (!is(try(phenodata[, as.character(nullmod$call$formula)], silent = TRUE), "try-error")) {
				nf <- phenodata[, as.character(nullmod$call$formula)]
				if (length(nf) == length(formula)) {
					if (all.equal(nf, formula)) run.null <- 0
				}
			}
			if (run.null) warning("nullmod object has different call, running null model...")
		}
	} else {
		if (!is.null(nullmod$call$formula)) {
			if (length(as.character(nullmod$call$formula)) > 1) {
				formula <- nullmod$call$formula
				cat("Formula from nullmod call is in effect\n")
			}
		}
	}
	if (!run.null) {
		if (is.null(nullmod$res) & is.null(nullmod$Inv)) {
			warning("nullmod object has no InvSigma and residualY data, running null model...")
			run.null <- 1
		}
	}
} else { run.null <- 1 }

if (run.null) {
	t00 <- proc.time()
	if (!missing(formula)) {
		if (missing(kin)) stop("Kinship matrix is missing")
		if (missing(phenodata)) stop("Phenotype data is missing")
		if (!missing(kin) & !missing(phenodata)) {
			phenodata <- as.data.frame(phenodata)
			if (!is(try(as.formula(formula), silent = TRUE), "try-error")) {
				nullmod <- polygenic(as.formula(formula), data = phenodata, kinship.matrix = kin, quiet = TRUE)
				nullmod$call$formula <- as.formula(formula)
			} else {
				nullmod <- polygenic(formula, data = phenodata, kinship.matrix = kin, quiet = TRUE)
				nullmod$call$formula <- formula
			}
		}
	} else { stop("No formula to run null model") } 
	ttt0 <- (proc.time() - t00)
} else {
	ttt0 <- 'not run'
	if (!missing(phenodata)) {
		if (dim(phenodata)[1] != length(nullmod$res)) stop("Dimensions of nullmod phenotype and covariates do not match")
	}
	if (dim(genodata)[1] != length(nullmod$res)) stop("Dimensions of nullmod phenotype and genodata do not match")
}

SIGMAi <- nullmod$Inv
res <- nullmod$res

if (dim(SIGMAi)[1] != length(res)) {
	if (!is.null(rownames(SIGMAi)) & !is.null(rownames(genodata)) & !is.null(names(nullmod$res))) {
		vec <- match(rownames(SIGMAi), names(nullmod$res))
		res <- nullmod$res[vec]
		vec <- match(rownames(SIGMAi), rownames(genodata))
		genodata <- genodata[vec, ]
	} else {
		if (is(try(nullmod$measuredIDs, silent = TRUE), "try-error")) stop("Dimensions of nullmod residuals and InvSigma do not match")
		res <- nullmod$res[nullmod$measuredIDs]
		genodata <- genodata[nullmod$measuredIDs, ]
	}
}
n <- length(res)

X <- matrix(rep(1, n), n, 1)

if (!missing(formula)) {
	if (!is(try(as.formula(formula), silent = TRUE), "try-error")) {
		if (is(try(covariates <- as.matrix(model.frame(formula, phenodata)), silent = TRUE), "try-error")) stop("Check whether all covariates are given in phenodata")
		covariates <- as.matrix(model.frame(formula, phenodata, na.action = na.omit, drop.unused.levels = TRUE)[, -1])
		if (dim(covariates)[1] != n) stop("Dimensions of nullmod phenotype and covariates do not match")
		X <- as.matrix(cbind(matrix(rep(1, n), n, 1), covariates))
	}
}

if (ncores < 1) ncores <- 1
if (ncores > 1) {
	library(parallel)
	library(doSNOW)
	ncores <- min(ncores,nreg)
	if (ncores > detectCores()) {
		ncores <- detectCores()
		warning("Number of CPUs available: ", ncores)
	}
}

t00 <- proc.time()

dis <- nullmod$h2an$est[length(nullmod$h2an$est)]
Cori <- dis * SIGMAi
CholInvCor <- chol(Cori)
covariate <- CholInvCor %*% X
pheno <- CholInvCor %*% res # making independent pheno
P11 <- diag(n) - covariate %*% solve(t(covariate) %*% covariate, t(covariate))
P11CholInvCor <- P11 %*% CholInvCor

### definition of set of basis functions 'genobasis' and 'betabasis' to transform genotypes and their effects   
g <- TRUE
if (missing(genospline)) g <- FALSE
if (is.null(genospline)) g <- FALSE
if (is.logical(genospline)) { if (!genospline) g <- FALSE }

if (betaspline == 'bspline') {
	betabasis0 <- create.bspline.basis(norder = order, nbasis = bbasis)
	order0 <- order
	bbasis0 <- bbasis
	fbasis0 <- FALSE
} else if (betaspline == 'fspline') {
	betabasis0 <- create.fourier.basis(c(0, 1), nbasis = fbasis)
	order0 <- FALSE
	bbasis0 <- FALSE
	fbasis0 <- fbasis
} else { stop("betaspline should be either bspline or fspline") }

if (g) {
	if (genospline == 'bspline') {
		genobasis0 <- create.bspline.basis(norder = order, nbasis = bbasis)
		if (!bbasis0) {
			order0 <- order
			bbasis0 <- bbasis
		}
	} else if (genospline == 'fspline') {
		genobasis0 <- create.fourier.basis(c(0, 1), nbasis = fbasis)
		if (!fbasis0) fbasis0 <- fbasis
	} else { stop("genospline should be either bspline or fspline") }
	J0 <- inprod(genobasis0, betabasis0)
}

#=====================================================================================
analyse_FLM <- function(region, pheno, covariate) {

	if (typ == 1) Z <- as.numeric(genodata[, region])
	if (typ == 2) Z <- genodata[, region]

	pos <- positions[region]

	Z <- as.matrix(Z)
	m0 <- dim(Z)[2]
	v <- sapply(1:dim(Z)[2], function(x) var(Z[, x], na.rm = TRUE))
	Z <- as.matrix(Z[, v > 0])
	pos <- pos[v > 0]

	if (dim(Z)[2] == 0) {
		warning("No polymorphic variants in region ", r, ", skipped")
		return(c('NA', m0, 0))
	}

	MAF <- colMeans(Z, na.rm = TRUE) / 2
	if (any(MAF > .5)) warning("Check whether minor allele homozygotes are coded as 2 at region ", r)

	if (mode == 'dom') {
		Z <- round(Z)
		Z[Z == 2] <- 1
	}
	if (mode == 'rec') {
		Z <- round(Z)
		Z[Z == 1] <- 0
		Z[Z == 2] <- 1
	}

	# impute missing genotypes
	for (i in 1:dim(Z)[2]) { Z[is.na(Z[, i]), i] <- mean(Z[, i], na.rm = TRUE) }

	# detection and elimination of genetic variants, which are linear-dependent from other
	dqr <- qr(Z)
	index <- dqr$pivot[1:dqr$rank]
	Z <- as.matrix(Z[, index])
	pos <- as.vector(pos[index])

	if (dim(Z)[2] == 0) {
		warning("No polymorphic variants in region ", r, ", skipped")
		return(c('NA', m0, 0))
	}

	#-----------------------------------------------------------------

	if ((bbasis0 & (dim(Z)[2] < bbasis0)) | (fbasis0 & (dim(Z)[2] < fbasis0))) {
		if ((betaspline == 'bspline') & (dim(Z)[2] < bbasis0)) {
			order <- min(order0, dim(Z)[2])
			betabasis <- create.bspline.basis(norder = order, nbasis = dim(Z)[2])
		} else if ((betaspline == 'fspline') & (dim(Z)[2] < fbasis0)) {
			if (dim(Z)[2] %% 2) {
				betabasis <- create.fourier.basis(c(0, 1), nbasis = dim(Z)[2])
			} else { betabasis <- create.fourier.basis(c(0, 1), nbasis = dim(Z)[2] + 1) }
		} else { betabasis <- betabasis0 }
		if (g) {
			if ((genospline == 'bspline') & (dim(Z)[2] < bbasis0)) {
				order <- min(order0, dim(Z)[2])
				genobasis <- create.bspline.basis(norder = order, nbasis = dim(Z)[2])
			} else if ((genospline == 'fspline') & (dim(Z)[2] < fbasis0)) {
				if (dim(Z)[2] %% 2) { genobasis <- create.fourier.basis(c(0, 1), nbasis = dim(Z)[2])
				} else { genobasis <- create.fourier.basis(c(0, 1), nbasis = dim(Z)[2] + 1) }
			} else { genobasis <- genobasis0 }
			J <- inprod(genobasis, betabasis)
		}
	} else {
		if (g) {
			genobasis <- genobasis0
			J <- J0
		} else {
			betabasis <- betabasis0
		}
	}

	geno <- P11CholInvCor %*% Z # making centralized and independent genotypes
	### Calculation of matrix FI in given positions of a region
	if (max(pos) > min(pos)) pos <- (pos - min(pos)) / (max(pos) - min(pos))
	if (max(pos) == min(pos)) pos <- .5
	if (g) {
		B <- eval.basis(pos, genobasis)
		### Calculation of formula (1) for functional genotypes (depend on ONLY positions)
		FFF <- ginv(t(B) %*% B) %*% t(B)
		U <- geno %*% t(FFF)
		UJ <- matrix( U %*% J, ncol = ncol(J))
	} else {
		B <- eval.basis(pos, betabasis)
		UJ <- geno %*% B
	}

	fit <- glm(pheno ~ covariate + UJ - 1, family = "gaussian")
	if (stat == 'F') {
		p <- anova(fit, test = stat)[3, 6]
	} else { p <- anova(fit, test = stat)[3, 5] }
	c(p, m0, dim(Z)[2])

}

#================================================================================

if (ncores == 1) {

	out <- data.frame(region = l, pvalue = rep(NA, nreg), markers = rep(NA, nreg), polymorphic.markers = rep(NA, nreg))

	for (i in 1:nreg) {
		r <- as.character(l[i])
		if (!is.null(reg)) { region = reg == l[i] } else { region = p1[i]:p2[i] }
		out[i, 2:4] <- analyse_FLM(region, pheno, covariate)
	}

} else {

	nj <- floor(nreg / ncores)
	vj <- rep(nj, ncores)
	nadd <- nreg %% ncores
	if (nadd > 0) vj[1:nadd] <- nj + 1

	cl <- makeCluster(ncores, type = 'SOCK')
	registerDoSNOW(cl)

	out <- foreach(j = 1:ncores, .combine = rbind, .inorder = F) %dopar% {
		if (typ == 1) { library(GenABEL) } else if (g) { library(MASS) }
		library(fda)
		i0 <- sum(vj[1:(j-1)]) * (j > 1) + 1
		out <- c()
		for (i in i0:(i0 + vj[j] - 1)) {
			r <- as.character(l[i])
			if (!is.null(reg)) { region = reg == l[i] } else { region = p1[i]:p2[i] }
			out <- rbind(out, c(r, analyse_FLM(region, pheno, covariate)))
		}
		out
	}

	stopCluster(cl)
	out <- as.data.frame(out)
	out[, 2:4] <- sapply(2:4, function(x) as.numeric(as.character(out[, x])))
	colnames(out) <- c('region', 'pvalue', 'markers', 'polymorphic.markers')
	out <- out[match(l, out$region), ]

}

if (is.null(reg)) {
	if (p2[nreg] < k) {
		out <- rbind(out, rep(NA))
		i <- nreg + 1
		p1 <- (i - 1) * sliding.window[2] + 1
		p2 <- k
		r <- rownames(out)[i] <- paste(p1, p2, sep = "_")
		out$pvalue[i] <- analyse_FLM(p1:p2, pheno, covariate)
	}
}

ttt <- (proc.time() - t00)

out0 <- out
out <- c()
out$results <- out0
out$sample.size <- n
if (return.nullmod) out$nullmod <- nullmod
if (return.time) {
	out$time$regions <- ttt
	out$time$null <- ttt0
	ttt <- (proc.time() - t0)
	out$time$total <- ttt
}
out
}