diff --git a/Project.toml b/Project.toml index 9270203..ef0c9e4 100644 --- a/Project.toml +++ b/Project.toml @@ -31,7 +31,7 @@ DataStructures = "0.18" JLD2 = "0.4" Loess = "0.5" MultivariateStats = "0.10" -OmicsProfiles = "0.1" +OmicsProfiles = "0.2" Plots = "1" Reexport = "1" StatsBase = "0.33" @@ -40,8 +40,9 @@ UMAP = "0.1" julia = "1.6" [extras] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Distributions", "Test"] +test = ["CUDA", "Distributions", "Test"] diff --git a/src/preprocess/highly_variable.jl b/src/preprocess/highly_variable.jl index 7e67862..e2012a3 100644 --- a/src/preprocess/highly_variable.jl +++ b/src/preprocess/highly_variable.jl @@ -116,23 +116,24 @@ function highly_variable_genes!(df::DataFrame, X::AbstractMatrix, ::Seuratv3HVG; error("This method is not implemented.") end -function highly_variable_genes!(X::AbstractMatrix, var::DataFrame, ::SeuratHVG; +function highly_variable_genes!(X::AbstractMatrix{T}, var::DataFrame, ::SeuratHVG; ntop_genes::Int=-1, min_disp=0.5, max_disp=Inf, min_mean=0.0125, max_mean=3., - nbins::Int=20) + nbins::Int=20) where {T} X = expm1.(X) μ, σ² = mean_and_var(X, 2, corrected=true) μ, σ² = vec(μ), vec(σ²) # compute dispersion - replace!(μ, 0. => 1e-12) - dispersion = replace!(σ² ./ μ, 0. => NaN) + μ[μ .== 0.] .= T(1e-12) + dispersion = σ² ./ μ + dispersion[dispersion .== 0.] .= T(NaN) dispersion .= log.(dispersion) μ .= log1p.(μ) - var[!, :means] = μ - var[!, :dispersions] = dispersion - h = fit(Histogram, μ; nbins=nbins) - var[!, :mean_bin] = map(x -> StatsBase.binindex(h, x), μ) + var[!, :means] = collect(μ) + var[!, :dispersions] = collect(dispersion) + h = fit(Histogram, var[!, :means]; nbins=nbins) + var[!, :mean_bin] = map(x -> StatsBase.binindex(h, x), var[!, :means]) gdf = groupby(var, :mean_bin) disp_bin = combine(gdf, [:dispersions => mean, :dispersions => std]) disp_bin = fill_missing_bins(disp_bin, :mean_bin, :dispersions_mean => 0, @@ -152,26 +153,26 @@ function highly_variable_genes!(X::AbstractMatrix, var::DataFrame, ::SeuratHVG; disp_bin.dispersions_std, var.mean_bin) var[!, :dispersions_norm] = disp_norm - var[!, :highly_variable] = select_ntop_genes(μ, disp_norm, ntop_genes, nrow(var), - min_mean, max_mean, min_disp, max_disp) + var[!, :highly_variable] = select_ntop_genes(var[!, :means], disp_norm, ntop_genes, + nrow(var), min_mean, max_mean, min_disp, max_disp) select!(var, Not(:mean_bin)) return var end -function highly_variable_genes!(X::AbstractMatrix, var::DataFrame, ::CellRangerHVG; - ntop_genes::Int=-1, min_disp=0.5, max_disp=Inf, min_mean=0.0125, max_mean=3.) +function highly_variable_genes!(X::AbstractMatrix{T}, var::DataFrame, ::CellRangerHVG; + ntop_genes::Int=-1, min_disp=0.5, max_disp=Inf, min_mean=0.0125, max_mean=3.) where {T} μ, σ² = mean_and_var(X, 2, corrected=true) μ, σ² = vec(μ), vec(σ²) # compute dispersion - replace!(μ, 0. => 1e-12) + μ[μ .== 0.] .= T(1e-12) dispersion = σ² ./ μ - var[!, :means] = μ - var[!, :dispersions] = dispersion - bins = [-Inf, percentile(μ, 10:5:100)..., Inf] - h = fit(Histogram, μ, bins) - var[!, :mean_bin] = map(x -> StatsBase.binindex(h, x), μ) + var[!, :means] = collect(μ) + var[!, :dispersions] = collect(dispersion) + bins = [-Inf, percentile(var[!, :means], 10:5:100)..., Inf] + h = fit(Histogram, var[!, :means], bins) + var[!, :mean_bin] = map(x -> StatsBase.binindex(h, x), var[!, :means]) gdf = groupby(var, :mean_bin) disp_bin = combine(gdf, [:dispersions => median, :dispersions => mad]) disp_bin = fill_missing_bins(disp_bin, :mean_bin, :dispersions_median => 0, @@ -181,8 +182,8 @@ function highly_variable_genes!(X::AbstractMatrix, var::DataFrame, ::CellRangerH disp_bin.dispersions_mad, var.mean_bin) var[!, :dispersions_norm] = disp_norm - var[!, :highly_variable] = select_ntop_genes(μ, disp_norm, ntop_genes, nrow(var), - min_mean, max_mean, min_disp, max_disp) + var[!, :highly_variable] = select_ntop_genes(var[!, :means], disp_norm, ntop_genes, + nrow(var), min_mean, max_mean, min_disp, max_disp) select!(var, Not(:mean_bin)) return var end diff --git a/src/preprocess/methodtype.jl b/src/preprocess/methodtype.jl index 014e85a..41304f1 100644 --- a/src/preprocess/methodtype.jl +++ b/src/preprocess/methodtype.jl @@ -28,6 +28,7 @@ struct CustomNormalization <: NormalizationMethod end Base.show(io::IO, ::CustomNormalization) = print(io, "custom normalization") Base.Symbol(::CustomNormalization) = :custom + abstract type HighlyVariableMethod end """ diff --git a/test/cuda.jl b/test/cuda.jl new file mode 100644 index 0000000..1a1f7c7 --- /dev/null +++ b/test/cuda.jl @@ -0,0 +1,60 @@ +@testset "cuda" begin + ngenes, ncells = (100, 500) + + @testset "normalize" begin + X = rand(ngenes, ncells) * 20 |> cu + + @testset "relative normalization" begin + normX = SnowyOwl.Preprocess.normalize(X, SnowyOwl.Preprocess.RelativeNormalization()) + library_size = collect(sum(normX, dims=1)) + @test all(library_size .≈ 1e4) + end + + @testset "log normalization" begin + normX = SnowyOwl.Preprocess.normalize(X, SnowyOwl.Preprocess.LogNormalization()) + library_size = collect(sum(expm1.(normX), dims=1)) + @test all(library_size .≈ 1e4) + end + + @testset "clr normalization" begin + normX = SnowyOwl.Preprocess.normalize(X, SnowyOwl.Preprocess.CenteredLogRatioNormalization()) + library_size = collect(sum(expm1.(normX), dims=1)) + @test_skip all(library_size .≈ 1e4) # TODO: not sure how to test + end + end + + @testset "logarithmize" begin + X = fill(2., ngenes, ncells) |> cu + + logX = SnowyOwl.Preprocess.logarithmize(X) + @test collect(logX) == fill(log1p(2.f0), ngenes, ncells) + end + + @testset "highly_variable" begin + nhvgs = 10 + X = Float32.(rand(NegativeBinomial(1, 0.8), ngenes, ncells)) |> cu + var = DataFrame(gene_symbols=1:ngenes, A=rand(ngenes)) + + @testset "Seurat method" begin + min_disp, max_disp = 0.5, Inf + min_mean, max_mean = 0.0125, 3. + hvg = SnowyOwl.Preprocess.highly_variable_genes(X, var, SnowyOwl.Preprocess.SeuratHVG(); + varname=:gene_symbols, + min_disp=min_disp, max_disp=max_disp, + min_mean=min_mean, max_mean=max_mean) + top_genes = (min_mean .< hvg.means .< max_mean) .& + (min_disp .< hvg.dispersions_norm .< max_disp) + @test names(hvg) == ["gene_symbols", "means", "dispersions", "dispersions_norm", + "highly_variable"] + @test all(top_genes .== hvg.highly_variable) + end + + @testset "cell ranger method" begin + hvg = SnowyOwl.Preprocess.highly_variable_genes(X, var, SnowyOwl.Preprocess.CellRangerHVG(); + ntop_genes=nhvgs) + @test names(hvg) == ["gene_symbols", "means", "dispersions", "dispersions_norm", + "highly_variable"] + @test count(hvg.highly_variable) == nhvgs + end + end +end diff --git a/test/highly_variable.jl b/test/highly_variable.jl index dbd0319..6b2348e 100644 --- a/test/highly_variable.jl +++ b/test/highly_variable.jl @@ -1,7 +1,7 @@ @testset "highly_variable" begin ngenes, ncells = (100, 500) nhvgs = 10 - X = rand(NegativeBinomial(1, 0.8), ngenes, ncells) + X = Float64.(rand(NegativeBinomial(1, 0.8), ngenes, ncells)) var = DataFrame(gene_symbols=1:ngenes, A=rand(ngenes)) min_disp, max_disp = 0.5, Inf diff --git a/test/runtests.jl b/test/runtests.jl index 50a0894..8fe85ac 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,14 +5,18 @@ using SparseArrays using DataFrames using Distributions using StatsBase -using Test -using OmicsProfiles +using CUDA using Plots +using Test const TEST_PATH = @__DIR__ ENV["DATADEPS_ALWAYS_ACCEPT"] = true +cuda_tests = [ + "cuda", +] + tests = [ "datasets", # "filter", @@ -24,6 +28,13 @@ tests = [ "plots", ] +if CUDA.functional() + CUDA.allowscalar(false) + append!(tests, cuda_tests) +else + @warn "CUDA unavailable, not testing GPU support" +end + @testset "SnowyOwl.jl" begin for t in tests include("$(t).jl")