diff --git a/docs/GSEA.fsx b/docs/GSEA.fsx index cdcd2bc..a9fe9c3 100644 --- a/docs/GSEA.fsx +++ b/docs/GSEA.fsx @@ -106,7 +106,7 @@ let expandedItems = OntologyEnrichment.expandOntologyTree items //minNumberInTerm: what is the minimal number of items that should be // within a bin to consider the bin at all (default 2). //data: sequence of expanded and split ontology items -let gseaResult = OntologyEnrichment.CalcOverEnrichment 1 (Some 5) (Some 2) expandedItems +let gseaResult = OntologyEnrichment.calcOverEnrichment 1 (Some 5) (Some 2) expandedItems let filterAndSortEnrichedModules = diff --git a/src/BioFSharp.Stats/OntologyEnrichment.fs b/src/BioFSharp.Stats/OntologyEnrichment.fs index fc15057..35751c4 100644 --- a/src/BioFSharp.Stats/OntologyEnrichment.fs +++ b/src/BioFSharp.Stats/OntologyEnrichment.fs @@ -32,13 +32,39 @@ module OntologyEnrichment = TotalNumberOfDE : int ///Number of all items (expanded) TotalUniverse : int + ///p value as calculated by hypergeometric test PValue : float - } + } with + static member Create(ontologyTerm, itemsInBin, numberOfDEsInBin, numberInBin, totalNumberOfDE, totalUnivers, pValue) = + {OntologyTerm = ontologyTerm; ItemsInBin = itemsInBin; NumberOfDEsInBin = numberOfDEsInBin; + NumberInBin = numberInBin; TotalNumberOfDE = totalNumberOfDE; TotalUniverse = totalUnivers; PValue = pValue} - /// Creates a gene set enrichment result - let createGseaResult ontologyTerm desInBin numberOfDEsInBin numberInBin totalNumberOfDE totalUnivers pValue = - {OntologyTerm = ontologyTerm;ItemsInBin = desInBin; NumberOfDEsInBin = numberOfDEsInBin; - NumberInBin = numberInBin; TotalNumberOfDE = totalNumberOfDE; TotalUniverse = totalUnivers; PValue = pValue} + /// Represents a gene set enrichment result with extended statistics + type GseaResultFdrExtended<'a> = { + ///Ontology term e.g. MapMan term, GO term ... + OntologyTerm : string + ///Sequence of single items associated to the ontology term + ItemsInBin : seq> + ///Number of significantly altered items in 'OntologyTerm' bin + NumberOfDEsInBin : int + ///Number of items in 'OntologyTerm' bin + NumberInBin : int + ///Number of significantly altered items within the total data set + TotalNumberOfDE : int + ///Number of all items (expanded) + TotalUniverse : int + ///p value as calculated by hypergeometric test + PValue : float + ///false discovery rate (FDR) using Benjamini-Hochberg method + FDR_BH : float + ///q value (FDR) as calculated by Storey method + QValue : float + ///pi0 (proportion of null hypotheses) as calculated by Storey method + Pi0 : float + } with + static member Create(ontologyTerm, itemsInBin, numberOfDEsInBin, numberInBin, totalNumberOfDE, totalUnivers, pValue, fdr: float, qVal: float, pi0: float) = + {OntologyTerm = ontologyTerm; ItemsInBin = itemsInBin; NumberOfDEsInBin = numberOfDEsInBin; + NumberInBin = numberInBin; TotalNumberOfDE = totalNumberOfDE; TotalUniverse = totalUnivers; PValue = pValue; FDR_BH = fdr; QValue = qVal; Pi0 = pi0} ///Splits an OntologyEntry with seperator concatenated TermIds let splitMultipleAnnotationsBy (separator:char) (item:OntologyItem<'A>) = @@ -62,8 +88,27 @@ module OntologyEnrichment = |> Seq.map (fun sTerm -> createOntologyItem oi.Id sTerm oi.GroupIndex oi.Item) ) - - /// Extends leaf OntologyEntries to their full tree + /// Extends leaf OntologyEntries to their full tree + /// Only works for hierarchical ontologies that mark their hierarchies by `.` (e.g. PS.lightreaction.LHCII) + /// collection of OntologyItems + /// A collection of `OntologyItems` in which each original element is expanded to each hierarchy level. + /// + /// + /// let data = [| + /// createOntologyItem "id1" "photosynthesis.lightreaction" 0 "item1" + /// createOntologyItem "id2" "protein.degradation" 0 "item2" + /// createOntologyItem "id3" "photosynthesis.lightreaction.LHCI" 1 "item3" + /// let result = expandOntologyTree data + /// //result: seq> = seq [| + /// // {Id = "id1"; OntologyTerm = "photosynthesis.lightreaction"; GroupIndex = 0; Item = "item1"} + /// // {Id = "id1"; OntologyTerm = "photosynthesis"; GroupIndex = 0; Item = "item1"} + /// // {Id = "id2"; OntologyTerm = "protein.degradation"; GroupIndex = 0; Item = "item2"} + /// // {Id = "id2"; OntologyTerm = "protein"; GroupIndex = 0; Item = "item2"} + /// // {Id = "id3"; OntologyTerm = "photosynthesis.lightreaction.LHCI"; GroupIndex = 1; Item = "item3"} + /// // {Id = "id3"; OntologyTerm = "photosynthesis.lightreaction"; GroupIndex = 1; Item = "item3"} + /// // {Id = "id3"; OntologyTerm = "photosynthesis"; GroupIndex = 1; Item = "item3"} + /// + /// let expandOntologyTree (data:seq>) = data |> Seq.collect (fun oi -> @@ -71,72 +116,58 @@ module OntologyEnrichment = expandenTermIds |> Seq.map (fun sTerm -> createOntologyItem oi.Id sTerm oi.GroupIndex oi.Item) ) - - // ########################################################################################################### - // the hypergeometric distribution is a discrete probability distribution that describes the probability of - // k successes in - // n draws from a finite - // x population of size containing - // m successes without replacement (successes states) - /// Calculates p value based on hypergeometric distribution (pValue <= k) - let CalcHyperGeoPvalue numberOfDEsInBin numberInBin totalUnivers totalNumberOfDE (splitPvalueThreshold:int) = - if (numberOfDEsInBin > 1) then - let hp = FSharp.Stats.Distributions.Discrete.Hypergeometric.Init totalUnivers totalNumberOfDE numberInBin - if numberInBin > splitPvalueThreshold then - // Calculate normal pValue - 1. - hp.CDF (float (numberOfDEsInBin - 1)) - else - // Calculate split pValue - 0.5 * ((1. - hp.CDF(float(numberOfDEsInBin - 1)) ) + ( (1. - hp.CDF(float(numberOfDEsInBin))) ) ) + /// Calculates p value based on hypergeometric distribution (pValue <= k) + /// the hypergeometric distribution is a discrete probability distribution that describes the probability of + /// k successes in + /// n draws from a finite + /// N population of size containing + /// K successes without replacement (successes states) + /// (k) number of significantly altered entities in the respective 'OntologyTerm' bin + /// (n) number of entities within the respective 'OntologyTerm' bin + /// (N) total number of entities in the experiment + /// (K) total number of significantly altered entities in the dataset + /// threshold until mid p values are calculated (default=5) + /// A pvalue determined by hypergeometric test. If the number of entities within a bin is below the splitPValue + /// threshold a mid p value is calculated that is lower than its default + let calcHyperGeoPvalue numberOfDEsInBin numberInBin totalUnivers totalNumberOfDE (splitPvalueThreshold:int) = + let hp = FSharp.Stats.Distributions.Discrete.Hypergeometric.Init totalUnivers totalNumberOfDE numberInBin + if numberInBin > splitPvalueThreshold then + // Calculate normal pValue + 1. - hp.CDF (float (numberOfDEsInBin - 1)) else - nan - - - // ####################################################### - // functional term enrichment is calculated according to following publication - // http://bioinformatics.oxfordjournals.org/cgi/content/abstract/23/4/401 - // also includes mid-pValues - /// Calculates functional term enrichment - let CalcSimpleOverEnrichment (deGroupIndex:int) (splitPvalueThreshold:option) (data:seq>) = - let _splitPvalueThreshold = defaultArg splitPvalueThreshold 5 - - let totalUnivers = data |> Seq.length - let totalNumberOfDE = data |> Seq.filter (fun oi -> oi.GroupIndex = deGroupIndex) |> Seq.length - - // returns (DE count, all count) - let countDE (subSet:seq>) = - let countMap = - subSet - |> Seq.countBy (fun oi -> if oi.GroupIndex = deGroupIndex then true else false ) - |> Map.ofSeq - (countMap.TryFindDefault 0 true,(countMap.TryFindDefault 0 true) + (countMap.TryFindDefault 0 false)) - - data - |> Seq.groupBy ( fun oi -> oi.OntologyTerm) - |> Seq.map (fun (oTerm,values) -> - let numberOfDEsInBin,numberInBin = countDE values - let pValue = CalcHyperGeoPvalue numberOfDEsInBin numberInBin totalUnivers totalNumberOfDE _splitPvalueThreshold - createGseaResult oTerm values numberOfDEsInBin numberInBin totalNumberOfDE totalUnivers pValue) - - - // ####################################################### - // functional term enrichment is calculated according to following publication - // http://bioinformatics.oxfordjournals.org/cgi/content/abstract/23/4/401 - // also includes mid-pValues - /// Calculates functional term enrichment - let CalcOverEnrichment (deGroupIndex:int) (splitPvalueThreshold:option) (minNumberInTerm:option) (data:seq>) = - let _splitPvalueThreshold = defaultArg splitPvalueThreshold 5 - let _minNumberInTerm = defaultArg minNumberInTerm 2 + // Calculate split pValue + 0.5 * ((1. - hp.CDF(float(numberOfDEsInBin - 1)) ) + ( (1. - hp.CDF(float(numberOfDEsInBin))) ) ) + + + /// Calculates functional term enrichment + /// http://bioinformatics.oxfordjournals.org/cgi/content/abstract/23/4/401 + /// defines the index of the differential expression group + /// threshold until mid p values are calculated (default=5) + /// describes how many items are required in a bin to be taken into account for multiple testing correction (default = 2) + /// sequence of ontology items + /// sequence of GseaResult + /// + /// + /// let data = [| + /// createOntologyItem "id1" "photosynthesis.lightreaction" 0 "item1" + /// createOntologyItem "id2" "protein.degradation" 0 "item2" + /// createOntologyItem "id3" "photosynthesis.lightreaction.LHCI" 1 "item3" // significantly different gene/protein/metabolite + /// calcOverEnrichment 1 (Some 5) (Some 2) data + /// + /// + let calcOverEnrichment (deGroupIndex:int) (splitPvalueThreshold:option) (minNumberInTerm:option) (data:seq>) = + let _splitPvalueThreshold = defaultArg splitPvalueThreshold 0 + let _minNumberInBin = defaultArg minNumberInTerm 0 // Distinct by term and gene name // Has to be done by an ouside function //let distinctData = data |> Seq.distinctBy (fun o -> o.displayID) let gData = data |> Seq.groupBy ( fun o -> o.OntologyTerm) // reduce to terms at least annotated with 2 items - let fData = gData |> Seq.filter ( fun (key:string,values:seq>) -> Seq.length(values) >= _minNumberInTerm) + let fData = gData |> Seq.filter ( fun (key:string,values:seq>) -> Seq.length(values) >= _minNumberInBin) let groupCount = fData |> Seq.collect (fun (key:string,values:seq>) -> values ) |> Seq.countBy (fun o -> o.GroupIndex) - let totalUnivers = groupCount |> Seq.fold (fun (acc:int) (index:int,count:int) -> acc + count) 0 + let totalUniverse = groupCount |> Seq.fold (fun (acc:int) (index:int,count:int) -> acc + count) 0 let totalNumberOfDE = let tmp = groupCount |> Seq.tryFind (fun (key,v) -> key = deGroupIndex) if tmp.IsNone then @@ -152,10 +183,69 @@ module OntologyEnrichment = |> Map.ofSeq (countMap.TryFindDefault 0 true,(countMap.TryFindDefault 0 true) + (countMap.TryFindDefault 0 false)) - fData - |> Seq.map (fun (oTerm,values) -> - let numberOfDEsInBin,numberInBin = countDE values - let pValue = CalcHyperGeoPvalue numberOfDEsInBin numberInBin totalUnivers totalNumberOfDE _splitPvalueThreshold - createGseaResult oTerm values numberOfDEsInBin numberInBin totalNumberOfDE totalUnivers pValue) - - + let results = + fData + |> Seq.map (fun (oTerm,values) -> + let numberOfDEsInBin,numberInBin = countDE values + let pValue = calcHyperGeoPvalue numberOfDEsInBin numberInBin totalUniverse totalNumberOfDE _splitPvalueThreshold + GseaResult<'a>.Create(oTerm, values, numberOfDEsInBin, numberInBin, totalNumberOfDE, totalUniverse, pValue)) + + results + + [] + let CalcOverEnrichment (deGroupIndex:int) (splitPvalueThreshold:option) (minNumberInTerm:option) (data:seq>)= + calcOverEnrichment deGroupIndex splitPvalueThreshold minNumberInTerm data + + [] + let CalcSimpleOverEnrichment (deGroupIndex:int) (splitPvalueThreshold:option) (minNumberInTerm:option) (data:seq>)= + calcOverEnrichment deGroupIndex splitPvalueThreshold (Some 0) data + + /// Calculates functional term enrichment with additional multiple testing correction + /// http://bioinformatics.oxfordjournals.org/cgi/content/abstract/23/4/401, storey q value, benjamini hochberg FDR + /// defines the index of the differential expression group + /// threshold until mid p values are calculated (default=5) + /// describes how many items are required in a bin to be taken into account for multiple testing correction (default = 2) + /// sequence of ontology items + /// sequence of GseaResultFdrExtended + /// + /// + /// let data = [| + /// createOntologyItem "id1" "photosynthesis.lightreaction" 0 "item1" + /// createOntologyItem "id2" "protein.degradation" 0 "item2" + /// createOntologyItem "id3" "photosynthesis.lightreaction.LHCI" 1 "item3" // significantly different gene/protein/metabolite + /// calcOverEnrichmentIncludeFDR 1 (Some 5) (Some 2) data + /// + /// + let calcOverEnrichmentIncludeFDR (deGroupIndex:int) (splitPvalueThreshold:option) (minNumberInTerm:option) (data:seq>) = + let results = + calcOverEnrichment deGroupIndex splitPvalueThreshold minNumberInTerm data + + let pvalues = + results + |> Seq.map _.PValue + |> Array.ofSeq + + // fdr calculation + let fdr_bh = + FSharp.Stats.Testing.MultipleTesting.benjaminiHochbergFDR pvalues + |> Array.ofSeq + + // q value calculation + let pi0 = FSharp.Stats.Testing.MultipleTesting.Qvalues.pi0Bootstrap pvalues + let qval = FSharp.Stats.Testing.MultipleTesting.Qvalues.ofPValues pi0 pvalues + results + |> Seq.mapi (fun i res -> + let fdr = fdr_bh.[i] + let qval = qval.[i] + let pi0 = pi0 + GseaResultFdrExtended<'a>.Create( + res.OntologyTerm, + res.ItemsInBin, + res.NumberOfDEsInBin, + res.NumberInBin, + res.TotalNumberOfDE, + res.TotalUniverse, + res.PValue, + fdr, qval, pi0) + + ) \ No newline at end of file diff --git a/tests/BioFSharp.Stats.Tests/BioFSharp.Stats.Tests.fsproj b/tests/BioFSharp.Stats.Tests/BioFSharp.Stats.Tests.fsproj index 965e4b6..16f2c25 100644 --- a/tests/BioFSharp.Stats.Tests/BioFSharp.Stats.Tests.fsproj +++ b/tests/BioFSharp.Stats.Tests/BioFSharp.Stats.Tests.fsproj @@ -10,6 +10,7 @@ + diff --git a/tests/BioFSharp.Stats.Tests/OntologyEnrichmentTests.fs b/tests/BioFSharp.Stats.Tests/OntologyEnrichmentTests.fs new file mode 100644 index 0000000..eeaf88b --- /dev/null +++ b/tests/BioFSharp.Stats.Tests/OntologyEnrichmentTests.fs @@ -0,0 +1,81 @@ +namespace BioFSharp.Stats.Tests + +open Xunit +open BioFSharp +open BioFSharp.Stats +open OntologyEnrichment + +module OntologyEnrichmentTests = + + let data = [| + createOntologyItem "id01" "PS.lightreaction.LHCII;protein.degradation.ubiquitin" 1 "description01" + createOntologyItem "id02" "PS.lightreaction.LHCII" 0 "description02" + createOntologyItem "id03" "PS.lightreaction.LHCII" 1 "description03" + createOntologyItem "id04" "PS.lightreaction.LHCII" 0 "description04" + createOntologyItem "id05" "PS.lightreaction.LHCII" 0 "description05" + createOntologyItem "id06" "PS.lightreaction.LHCII" 1 "description06" + createOntologyItem "id07" "PS.lightreaction.LHCII" 0 "description07" + createOntologyItem "id08" "PS.lightreaction.LHCII" 1 "description08" + createOntologyItem "id09" "PS.lightreaction.LHCII" 0 "description09" + createOntologyItem "id10" "PS.lightreaction.LHCII" 1 "description10" + createOntologyItem "id11" "PS.lightreaction.LHCII" 0 "description11" //36 + createOntologyItem "id12" "PS.lightreaction" 0 "description12" + createOntologyItem "id13" "PS.lightreaction" 0 "description13" + createOntologyItem "id14" "PS.lightreaction" 0 "description14" + createOntologyItem "id15" "PS.lightreaction" 1 "description15" //8 + createOntologyItem "id16" "protein.degradation.ubiquitin" 0 "description16" + createOntologyItem "id17" "protein.degradation.ubiquitin" 1 "description17" + createOntologyItem "id18" "protein.degradation.ubiquitin" 0 "description18" + createOntologyItem "id19" "protein.degradation.ubiquitin" 0 "description19" + createOntologyItem "id20" "protein.degradation.ubiquitin" 1 "description20" + createOntologyItem "id21" "protein.degradation.ubiquitin" 0 "description21" //18 + createOntologyItem "id22" "protein.degradation.ubiquitin.e1" 0 "description22" + createOntologyItem "id23" "protein.degradation.ubiquitin.e1" 0 "description23" + createOntologyItem "id24" "protein.degradation.ubiquitin.e1" 0 "description24" //12 + createOntologyItem "id25" "protein.synthesis.initiation" 1 "description25" + createOntologyItem "id26" "protein.synthesis.initiation" 0 "description26" + createOntologyItem "id27" "protein.synthesis.initiation" 1 "description27" + createOntologyItem "id28" "protein.synthesis.initiation" 1 "description28" + createOntologyItem "id29" "protein.synthesis.initiation" 1 "description29" + createOntologyItem "id30" "protein.synthesis" 0 "description30" //17 + createOntologyItem "id31" "singletest" 1 "description31" //1 + |] + + let dataSplit = + data + |> Seq.collect (splitMultipleAnnotationsBy ';') + + let dataExtended = + expandOntologyTree dataSplit + + let enrichmentResult = + calcOverEnrichmentIncludeFDR 1 (Some 0) (Some 0) dataExtended + + [] + let ``splitOntologyItems`` () = + let actual = dataSplit |> Seq.length + let expected = data.Length + 1 //first one is split into 2, all others remain the same + Assert.Equal(expected, actual) + + [] + let ``expandOntologyTree`` () = + let actual = dataExtended |> Seq.length + let expected = 92 // 36 + 8 + 18 + 12 + 18 + Assert.Equal(expected, actual) + + [] + let ``ontologyEnrichment`` () = + // per bin one result + let numberOfBins_actual = enrichmentResult |> Seq.length + let numberOfBins_expected = 10 + Assert.Equal(numberOfBins_expected, numberOfBins_actual) + let bin1 = enrichmentResult |> Seq.find (fun x -> x.OntologyTerm = "PS.lightreaction.LHCII") + Assert.Equal(11, bin1.NumberInBin) + Assert.Equal(5, bin1.NumberOfDEsInBin) + Assert.Equal(39, bin1.TotalNumberOfDE) + Assert.Equal(92, bin1.TotalUniverse) + Assert.Equal(0.5367813329, System.Math.Round(bin1.PValue, 10)) //https://systems.crump.ucla.edu/hypergeometric/index.php + let testbin = + enrichmentResult |> Seq.find (fun x -> x.OntologyTerm = "singletest") + // test for a singleton bin of which valid p values exist + Assert.Equal(0.4239130435, System.Math.Round(testbin.PValue, 10)) \ No newline at end of file