#' goseq Gene Ontology analyser
#' 
#' Does selection-unbiased testing for category enrichment amongst
#' differentially expressed (DE) genes for RNA-seq data.  By default, tests
#' gene ontology (GO) categories, but any categories may be tested.
#' 
#' The \code{pwf} argument is almost always the output of the function
#' \code{\link{nullp}}.  This is a data frame with 3 columns, named "DEgenes",
#' "bias.data" and "pwf" with the rownames set to the gene names.  Each row
#' corresponds to a gene with the DEgenes column specifying if the gene is DE
#' (1 for DE, 0 for not DE), the bias.data column giving the numeric value of
#' the DE bias being accounted for (usually the gene length or number of
#' counts) and the pwf column giving the gene's value on the probability
#' weighting function.
#' 
#' \code{goseq} obtains length data from UCSC and GO mappings from the organism
#' packages (see \code{\link{getgo}} and \code{\link{getlength}} for details).
#' If your data is in an unsupported format you will need to obtain the GO
#' category mapping and supply them to the \code{goseq} function using the
#' \code{gene2cat} argument.
#' 
#' To use your own gene to category mapping with \code{goseq}, use the
#' \code{gene2cat} argument.  This argument takes a data.frame, with one column
#' containing gene IDs and the other containing the associated categories.  As
#' the mapping from gene <-> category is in general many to many there will be
#' multiple rows containing the same gene identifier.  Alternatively,
#' \code{gene2cat} can take a list, where the names are the genes and the
#' entries are the GO categories associated with the genes.  This is the format
#' produced by the \code{\link{getgo}} function and is more space efficient
#' than the data.frame representation.
#' 
#' If \code{gene2cat} is left as \code{NULL}, \code{goseq} attempts to use
#' \code{\link{getgo}} to fetch GO category to gene identifier mappings.
#' 
#' The PWF is usually calculated using the \code{\link{nullp}} function to
#' correct for length bias. However, \code{goseq} will work with any vector of
#' weights.  Any bias can be accounted for so long as a weight for each gene is
#' supplied using this argument.  \code{NA}s are allowed in the "pwf" and
#' "bias.data" columns of the PWF data frame (these usually occur as a result
#' of missing length data for some genes).  Any entry which is \code{NA} is set
#' to the weighting of the median gene.
#' 
#' Valid options for the \code{test.cats} argument are any combination of
#' "GO:CC", "GO:BP", "GO:MF" & "KEGG".  The three GO terms refer to the
#' Cellular Component, Biological Process and Molecular Function respectively.
#' "KEGG" refers to KEGG pathways.
#' 
#' The three methods, "Wallenius", "Sampling" & "Hypergeometric", calculate the
#' p-values as follows.
#' 
#' "Wallenius" approximates the true distribution of numbers of members of a
#' category amongst DE genes by the Wallenius non-central hypergeometric
#' distribution.  This distribution assumes that within a category all genes
#' have the same probability of being chosen.  Therefore, this approximation
#' works best when the range in probabilities obtained by the probability
#' weighting function is small. "Wallenius" is the recommended method for
#' calculating p-values.
#' 
#' "Sampling" uses random sampling to approximate the true distribution and
#' uses it to calculate the p-values for over (and under) representation of
#' categories. In practice, its use quickly becomes computationally prohibitive
#' because \code{repcnt} would need to be set very high for most applications.
#' 
#' CAUTION: "Hypergeometric" should NEVER be used for producing results for
#' biological interpretation.  If there is genuinely no bias in power to detect
#' DE in your experiment, the PWF will reflect this and the other methods will
#' produce accurate results.
#' 
#' "Hypergeometric" assumes there is no bias in power to detect differential
#' expression at all and calculates the p-values using a standard
#' hypergeometric distribution.  Useful if you wish to test the effect of
#' selection bias on your results.
#' 
#' @param pwf An object containing gene names, DE calls and the probability
#' weighting function. Usually generated by \code{\link{nullp}}.
#' @param genome A string identifying the genome that the genes in \code{pwf}
#' (its row names) refer to.  For a list of supported organisms run
#' \code{\link{supportedGenomes}}.
#' @param id A string identifying the gene identifier used by the genes in
#' \code{pwf} (its row names).  For a list of supported gene IDs run
#' \code{\link{supportedGeneIDs}}.
#' @param gene2cat A data frame with two columns containing the mapping between
#' genes and the categories of interest.  Alternatively, a list where the names
#' are genes and each entry is a vector containing GO categories associated
#' with that gene (this is the output produced by \code{\link{getgo}}).  If set
#' to \code{NULL} goseq attempts to fetch GO categories automatically using
#' \code{\link{getgo}}.
#' @param test.cats A vector specifying which categories to test for over
#' representation amongst DE genes.  See details for allowed options.
#' @param method The method to use to calculate the unbiased category
#' enrichment scores.  Valid options are "Wallenius", "Sampling" &
#' "Hypergeometric".  "Hypergeometric" and "Sampling" should almost never be
#' used (see details).
#' @param repcnt Number of random samples to be calculated when random sampling
#' is used.  Ignored unless \code{method="Sampling"}.
#' @param use_genes_without_cat A boolean to indicate whether genes without a
#' category should still be used. For example, a large number of genes may have
#' no GO term annotated. If this option is set to FALSE, those genes will be
#' ignored in the calculation of p-values (default behaviour). If this option
#' is set to TRUE, then these genes will count towards the total number of
#' genes outside the category being tested (default behaviour prior to version
#' 1.15.2).
#' @return goseq returns a data frame with several columns.  The first column
#' gives the name of the category, the second gives the p-value for the
#' associated category being over represented amongst DE genes.  The third
#' column gives the p-value for the associated category being under represented
#' amongst DE genes.  The p-values have not been corrected for multiple
#' hypothesis testing. The fourth and fifth columns give the number of
#' differentially expressed genes in the category and total genes in the
#' category respectively. If any of the categories was a GO term, there will be
#' two additional columns for the GO term and its ontology.
#' 
#' @author Matthew D. Young \email{myoung@@wehi.edu.au}
#' 
#' @seealso \code{\link{nullp}}, \code{\link{getgo}}, \code{\link{getlength}}
#' 
#' @export
#' 
#' @references Young, M. D., Wakefield, M. J., Smyth, G. K., Oshlack, A. (2010)
#' \emph{Gene ontology analysis for RNA-seq: accounting for selection bias}
#' Genome Biology Date: Feb 2010 Vol: 11 Issue: 2 Pages: R14
#' 
#' @examples
#' 
#' data(genes)
#' pwf <- nullp(genes,'hg19','ensGene')
#' pvals <- goseq(pwf,'hg19','ensGene')
#' head(pvals)
#' 
goseq <- function(pwf, genome, id, gene2cat = NULL, test.cats = c("GO:CC", "GO:BP", "GO:MF"), method = "Wallenius", repcnt = 2000, use_genes_without_cat = FALSE) {
  # Tests every category for over/under representation amongst the DE genes
  # listed in `pwf`, weighting genes by `pwf$pwf`.  Returns a data frame with
  # one row per category (p-values, DE count, category size), sorted by the
  # over-representation p-value.

  ################# Input pre-processing and validation ###################
  if (any(!test.cats %in% c("GO:CC", "GO:BP", "GO:MF", "KEGG"))) {
    stop("Invalid category specified.  Valid categories are GO:CC, GO:BP, GO:MF or KEGG")
  }
  if (missing(genome) || missing(id)) {
    if (is.null(gene2cat)) {
      stop("You must specify the genome and gene ID format when automatically fetching gene to GO category mappings.")
    }
    # With user supplied mappings the genome/id are never looked at, so any
    # placeholder value will do.
    genome <- "dummy"
    id <- "dummy"
  }
  # `method` is compared with `==` inside `if` further down, so it has to be
  # exactly one of the supported strings.
  if (length(method) != 1L || !method %in% c("Wallenius", "Sampling", "Hypergeometric")) {
    stop("Invalid calculation method selected.  Valid options are Wallenius, Sampling & Hypergeometric.")
  }
  if (!is.null(gene2cat) && !is.data.frame(gene2cat) && !is.list(gene2cat)) {
    stop("Was expecting a dataframe or a list mapping categories to genes.  Check gene2cat input and try again.")
  }

  # Factors are evil
  pwf <- unfactor(pwf)
  gene2cat <- unfactor(gene2cat)

  ###################### Data fetching and processing ########################
  if (is.null(gene2cat)) {
    # When we fetch the data using getgo it will be in the list format
    message("Fetching GO annotations...")
    gene2cat <- getgo(rownames(pwf), genome, id, fetch.cats = test.cats)
    names(gene2cat) <- rownames(pwf)
    # Do the two rebuilds to remove any nulls
    cat2gene <- reversemapping(gene2cat)
    gene2cat <- reversemapping(cat2gene)
  } else {
    # The gene2cat input accepts a number of formats, we need to check each of them in turn
    message("Using manually entered categories.")
    # The options are a flat mapping (a data frame) or a list, where the list
    # can be either gene->categories or category->genes.
    # is.data.frame() is used rather than comparing class() to "list" because
    # class() may have more than one element (e.g. for tibbles).
    if (is.data.frame(gene2cat)) {
      # Work out which column contains the genes: count, per column, how many
      # entries are known gene names.
      genecol_sum <- as.numeric(apply(gene2cat, 2, function(u) {
        sum(u %in% rownames(pwf))
      }))
      genecol <- which(genecol_sum != 0)
      if (length(genecol) > 1) {
        # Several candidate columns: take the one with the most matches.
        # which.max() indexes the columns directly (first maximum wins).
        genecol <- which.max(genecol_sum)
        warning(paste("More than one possible gene column found in gene2cat, using the one headed", colnames(gene2cat)[genecol]))
      }
      if (length(genecol) == 0) {
        genecol <- 1
        warning(paste("Gene column could not be identified in gene2cat conclusively, using the one headed", colnames(gene2cat)[genecol]))
      }
      # The category column is the first column that is not the gene column
      othercol <- 1
      if (genecol == 1) {
        othercol <- 2
      }
      # Now put it into list format (names = genes, entries = categories)
      gene2cat <- split(gene2cat[, othercol], gene2cat[, genecol])
      # Do the appropriate builds
      cat2gene <- reversemapping(gene2cat)
      gene2cat <- reversemapping(cat2gene)
    }
    # !!!!
    # The following conditional has been flagged as a potential issue when using certain
    # types of input where the category names are the same as gene names (which seems like
    # something you should avoid anyway...).  Leave it for now
    # !!!!
    # We're now guaranteed to have a list (unless the user screwed up the input) but it could
    # be category->genes rather than the gene->categories that we want.
    if (sum(unique(unlist(gene2cat, use.names = FALSE)) %in% rownames(pwf)) > sum(unique(names(gene2cat)) %in% rownames(pwf))) {
      gene2cat <- reversemapping(gene2cat)
    }
    # We now have a list going in the direction we want.  Throw out genes which we will not use
    gene2cat <- gene2cat[names(gene2cat) %in% rownames(pwf)]

    # Rebuild both directions so they only contain the retained genes
    cat2gene <- reversemapping(gene2cat)
    gene2cat <- reversemapping(cat2gene)

    ## make sure we remove duplicate entries .. e.g. see
    ## http://permalink.gmane.org/gmane.science.biology.informatics.conductor/46876
    cat2gene <- lapply(cat2gene, unique)
    gene2cat <- lapply(gene2cat, unique)
  }

  nafrac <- (sum(is.na(pwf$pwf)) / nrow(pwf)) * 100
  if (nafrac > 50) {
    warning(paste0("Missing length data for ", round(nafrac), "% of genes.  Accuracy of GO test will be reduced."))
  }
  # Give the genes with unknown weight the weight used by the median gene
  # (the gene with the median bias.data, not the median weighting!)
  missing_weight <- is.na(pwf$pwf)
  if (any(missing_weight)) {
    known_bias <- pwf$bias.data[!is.na(pwf$bias.data)]
    median_bias <- sort(known_bias)[ceiling(length(known_bias) / 2)]
    pwf$pwf[missing_weight] <- pwf$pwf[match(median_bias, pwf$bias.data)]
  }

  ###################### Calculating the p-values ########################
  # Remove all the genes with unknown GOterms
  unknown_go_terms <- nrow(pwf) - length(gene2cat)
  if ((!use_genes_without_cat) && unknown_go_terms > 0) {
    message(paste("For", unknown_go_terms, "genes, we could not find any categories. These genes will be excluded."))
    message("To force their use, please run with use_genes_without_cat=TRUE (see documentation).")
    message("This was the default behavior for version 1.15.1 and earlier.")
    pwf <- pwf[rownames(pwf) %in% names(gene2cat), ]
  }
  # A few variables are always useful so calculate them
  cats <- names(cat2gene)
  DE <- rownames(pwf)[pwf$DEgenes == 1]
  num_de <- length(DE)
  num_genes <- nrow(pwf)
  pvals <- data.frame(category = cats, over_represented_pvalue = NA, under_represented_pvalue = NA, stringsAsFactors = FALSE, numDEInCat = NA, numInCat = NA)

  # Integer row indices into pwf are used instead of gene names because
  # comparing integers is faster than comparing strings.  Both objects are
  # shared by the analytic tests and by the count columns below.
  degenesnum <- which(pwf$DEgenes == 1)
  cat2genenum <- relist(match(unlist(cat2gene), rownames(pwf)), cat2gene)

  if (method == "Sampling") {
    # We need to know the number of DE genes in each category, make this as a mask that we can use later...
    num_DE_mask <- rep(0, length(cats))
    a <- table(unlist(gene2cat[DE], FALSE, FALSE))
    num_DE_mask[match(names(a), cats)] <- as.numeric(a)
    num_DE_mask <- as.integer(num_DE_mask)
    # We have to ensure that genes not associated with a category are included in the simulation, to do this they need an empty entry in the gene2cat list
    gene2cat <- gene2cat[rownames(pwf)]
    names(gene2cat) <- rownames(pwf)
    message("Running the simulation...")
    # One row per random sample, one column per category
    lookup <- matrix(0, nrow = repcnt, ncol = length(cats))
    # seq_len() keeps the loop empty when repcnt is 0 (1:0 would run twice)
    for (i in seq_len(repcnt)) {
      # A more efficient way of doing weighted random sampling without replacement than the built in function
      # The order(runif...)[seq_len(n)] bit picks n genes at random, weighting them by the PWF;
      # seq_len() ensures that no gene is drawn when there are no DE genes.
      # The table(as.character(unlist(...))) bit then counts the number of times this random set occurred in each category
      sampled <- order(runif(num_genes)^(1 / pwf$pwf), decreasing = TRUE)[seq_len(num_de)]
      a <- table(as.character(unlist(gene2cat[sampled], FALSE, FALSE)))
      lookup[i, match(names(a), cats)] <- a
      pp(repcnt)
    }
    message("Calculating the p-values...")
    # Empirical p-values with a +1 pseudo-count in numerator and denominator
    observed <- outer(rep(1, repcnt), num_DE_mask)
    pvals[, 2] <- (colSums(lookup >= observed) + 1) / (repcnt + 1)
    pvals[, 3] <- (colSums(lookup <= observed) + 1) / (repcnt + 1)
  }
  if (method == "Wallenius") {
    message("Calculating the p-values...")
    # This value is used in every calculation, by storing it we need only calculate it once
    alpha <- sum(pwf$pwf)
    # Each category will have a different weighting so needs its own test
    pvals[, 2:3] <- t(vapply(cat2genenum, function(u) {
      # The number of DE genes in this category
      num_de_incat <- sum(degenesnum %in% u)
      # The total number of genes in this category
      num_incat <- length(u)
      # This is just a quick way of calculating weight=avg(PWF within category)/avg(PWF outside of category)
      avg_weight <- mean(pwf$pwf[u])
      weight <- if (num_incat == num_genes) {
        # case for the root GO terms: there is nothing outside the category
        1
      } else {
        (avg_weight * (num_genes - num_incat)) / (alpha - num_incat * avg_weight)
      }
      # Now calculate the sum of the tails of the Wallenius distribution (the p-values)
      c(
        dWNCHypergeo(num_de_incat, num_incat, num_genes - num_incat, num_de, weight)
        + pWNCHypergeo(num_de_incat, num_incat, num_genes - num_incat, num_de, weight, lower.tail = FALSE),
        pWNCHypergeo(num_de_incat, num_incat, num_genes - num_incat, num_de, weight)
      )
    }, numeric(2)))
  }
  if (method == "Hypergeometric") {
    message("Calculating the p-values...")
    # Simple hypergeometric test, one category at a time
    pvals[, 2:3] <- t(vapply(cat2genenum, function(u) {
      # The number of DE genes in this category
      num_de_incat <- sum(degenesnum %in% u)
      # The total number of genes in this category
      num_incat <- length(u)
      # Calculate the sum of the tails of the hypergeometric distribution (the p-values)
      c(
        dhyper(num_de_incat, num_incat, num_genes - num_incat, num_de)
        + phyper(num_de_incat, num_incat, num_genes - num_incat, num_de, lower.tail = FALSE),
        phyper(num_de_incat, num_incat, num_genes - num_incat, num_de)
      )
    }, numeric(2)))
  }
  # Populate the count columns (DE genes in category, total genes in category)
  pvals[, 4:5] <- t(vapply(cat2genenum, function(u) {
    c(sum(degenesnum %in% u), length(u))
  }, integer(2)))

  # Finally, sort by p-value
  pvals <- pvals[order(pvals$over_represented_pvalue), ]

  # Supplement the table with the GO term name and ontology group
  # but only if the enrichment categories are actually GO terms
  if (any(grepl("^GO:", pvals$category))) {
    GOnames <- select(GO.db, keys = pvals$category, columns = c("TERM", "ONTOLOGY"))[, 2:3]
    colnames(GOnames) <- tolower(colnames(GOnames))
    pvals <- cbind(pvals, GOnames)
  }

  # And return
  return(pvals)
}
