Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support preprocessing calculation on cuda #48

Merged
merged 1 commit into from
Feb 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ DataStructures = "0.18"
JLD2 = "0.4"
Loess = "0.5"
MultivariateStats = "0.10"
OmicsProfiles = "0.1"
OmicsProfiles = "0.2"
Plots = "1"
Reexport = "1"
StatsBase = "0.33"
Expand All @@ -40,8 +40,9 @@ UMAP = "0.1"
julia = "1.6"

[extras]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Distributions", "Test"]
test = ["CUDA", "Distributions", "Test"]
41 changes: 21 additions & 20 deletions src/preprocess/highly_variable.jl
Original file line number Diff line number Diff line change
Expand Up @@ -116,23 +116,24 @@ function highly_variable_genes!(df::DataFrame, X::AbstractMatrix, ::Seuratv3HVG;
error("This method is not implemented.")
end

function highly_variable_genes!(X::AbstractMatrix, var::DataFrame, ::SeuratHVG;
function highly_variable_genes!(X::AbstractMatrix{T}, var::DataFrame, ::SeuratHVG;
ntop_genes::Int=-1, min_disp=0.5, max_disp=Inf, min_mean=0.0125, max_mean=3.,
nbins::Int=20)
nbins::Int=20) where {T}
X = expm1.(X)
μ, σ² = mean_and_var(X, 2, corrected=true)
μ, σ² = vec(μ), vec(σ²)

# compute dispersion
replace!(μ, 0. => 1e-12)
dispersion = replace!(σ² ./ μ, 0. => NaN)
μ[μ .== 0.] .= T(1e-12)
dispersion = σ² ./ μ
dispersion[dispersion .== 0.] .= T(NaN)
dispersion .= log.(dispersion)
μ .= log1p.(μ)

var[!, :means] = μ
var[!, :dispersions] = dispersion
h = fit(Histogram, μ; nbins=nbins)
var[!, :mean_bin] = map(x -> StatsBase.binindex(h, x), μ)
var[!, :means] = collect(μ)
var[!, :dispersions] = collect(dispersion)
h = fit(Histogram, var[!, :means]; nbins=nbins)
var[!, :mean_bin] = map(x -> StatsBase.binindex(h, x), var[!, :means])
gdf = groupby(var, :mean_bin)
disp_bin = combine(gdf, [:dispersions => mean, :dispersions => std])
disp_bin = fill_missing_bins(disp_bin, :mean_bin, :dispersions_mean => 0,
Expand All @@ -152,26 +153,26 @@ function highly_variable_genes!(X::AbstractMatrix, var::DataFrame, ::SeuratHVG;
disp_bin.dispersions_std, var.mean_bin)
var[!, :dispersions_norm] = disp_norm

var[!, :highly_variable] = select_ntop_genes(μ, disp_norm, ntop_genes, nrow(var),
min_mean, max_mean, min_disp, max_disp)
var[!, :highly_variable] = select_ntop_genes(var[!, :means], disp_norm, ntop_genes,
nrow(var), min_mean, max_mean, min_disp, max_disp)
select!(var, Not(:mean_bin))
return var
end

function highly_variable_genes!(X::AbstractMatrix, var::DataFrame, ::CellRangerHVG;
ntop_genes::Int=-1, min_disp=0.5, max_disp=Inf, min_mean=0.0125, max_mean=3.)
function highly_variable_genes!(X::AbstractMatrix{T}, var::DataFrame, ::CellRangerHVG;
ntop_genes::Int=-1, min_disp=0.5, max_disp=Inf, min_mean=0.0125, max_mean=3.) where {T}
μ, σ² = mean_and_var(X, 2, corrected=true)
μ, σ² = vec(μ), vec(σ²)

# compute dispersion
replace!(μ, 0. => 1e-12)
μ[μ .== 0.] .= T(1e-12)
dispersion = σ² ./ μ

var[!, :means] = μ
var[!, :dispersions] = dispersion
bins = [-Inf, percentile(μ, 10:5:100)..., Inf]
h = fit(Histogram, μ, bins)
var[!, :mean_bin] = map(x -> StatsBase.binindex(h, x), μ)
var[!, :means] = collect(μ)
var[!, :dispersions] = collect(dispersion)
bins = [-Inf, percentile(var[!, :means], 10:5:100)..., Inf]
h = fit(Histogram, var[!, :means], bins)
var[!, :mean_bin] = map(x -> StatsBase.binindex(h, x), var[!, :means])
gdf = groupby(var, :mean_bin)
disp_bin = combine(gdf, [:dispersions => median, :dispersions => mad])
disp_bin = fill_missing_bins(disp_bin, :mean_bin, :dispersions_median => 0,
Expand All @@ -181,8 +182,8 @@ function highly_variable_genes!(X::AbstractMatrix, var::DataFrame, ::CellRangerH
disp_bin.dispersions_mad, var.mean_bin)
var[!, :dispersions_norm] = disp_norm

var[!, :highly_variable] = select_ntop_genes(μ, disp_norm, ntop_genes, nrow(var),
min_mean, max_mean, min_disp, max_disp)
var[!, :highly_variable] = select_ntop_genes(var[!, :means], disp_norm, ntop_genes,
nrow(var), min_mean, max_mean, min_disp, max_disp)
select!(var, Not(:mean_bin))
return var
end
Expand Down
1 change: 1 addition & 0 deletions src/preprocess/methodtype.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ struct CustomNormalization <: NormalizationMethod end
Base.show(io::IO, ::CustomNormalization) = print(io, "custom normalization")

Base.Symbol(::CustomNormalization) = :custom

abstract type HighlyVariableMethod end

"""
Expand Down
60 changes: 60 additions & 0 deletions test/cuda.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
@testset "cuda" begin
ngenes, ncells = (100, 500)

@testset "normalize" begin
X = rand(ngenes, ncells) * 20 |> cu

@testset "relative normalization" begin
normX = SnowyOwl.Preprocess.normalize(X, SnowyOwl.Preprocess.RelativeNormalization())
library_size = collect(sum(normX, dims=1))
@test all(library_size .≈ 1e4)
end

@testset "log normalization" begin
normX = SnowyOwl.Preprocess.normalize(X, SnowyOwl.Preprocess.LogNormalization())
library_size = collect(sum(expm1.(normX), dims=1))
@test all(library_size .≈ 1e4)
end

@testset "clr normalization" begin
normX = SnowyOwl.Preprocess.normalize(X, SnowyOwl.Preprocess.CenteredLogRatioNormalization())
library_size = collect(sum(expm1.(normX), dims=1))
@test_skip all(library_size .≈ 1e4) # TODO: not sure how to test
end
end

@testset "logarithmize" begin
X = fill(2., ngenes, ncells) |> cu

logX = SnowyOwl.Preprocess.logarithmize(X)
@test collect(logX) == fill(log1p(2.f0), ngenes, ncells)
end

@testset "highly_variable" begin
nhvgs = 10
X = Float32.(rand(NegativeBinomial(1, 0.8), ngenes, ncells)) |> cu
var = DataFrame(gene_symbols=1:ngenes, A=rand(ngenes))

@testset "Seurat method" begin
min_disp, max_disp = 0.5, Inf
min_mean, max_mean = 0.0125, 3.
hvg = SnowyOwl.Preprocess.highly_variable_genes(X, var, SnowyOwl.Preprocess.SeuratHVG();
varname=:gene_symbols,
min_disp=min_disp, max_disp=max_disp,
min_mean=min_mean, max_mean=max_mean)
top_genes = (min_mean .< hvg.means .< max_mean) .&
(min_disp .< hvg.dispersions_norm .< max_disp)
@test names(hvg) == ["gene_symbols", "means", "dispersions", "dispersions_norm",
"highly_variable"]
@test all(top_genes .== hvg.highly_variable)
end

@testset "cell ranger method" begin
hvg = SnowyOwl.Preprocess.highly_variable_genes(X, var, SnowyOwl.Preprocess.CellRangerHVG();
ntop_genes=nhvgs)
@test names(hvg) == ["gene_symbols", "means", "dispersions", "dispersions_norm",
"highly_variable"]
@test count(hvg.highly_variable) == nhvgs
end
end
end
2 changes: 1 addition & 1 deletion test/highly_variable.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
@testset "highly_variable" begin
ngenes, ncells = (100, 500)
nhvgs = 10
X = rand(NegativeBinomial(1, 0.8), ngenes, ncells)
X = Float64.(rand(NegativeBinomial(1, 0.8), ngenes, ncells))
var = DataFrame(gene_symbols=1:ngenes, A=rand(ngenes))

min_disp, max_disp = 0.5, Inf
Expand Down
15 changes: 13 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,18 @@ using SparseArrays
using DataFrames
using Distributions
using StatsBase
using Test
using OmicsProfiles
using CUDA
using Plots
using Test

const TEST_PATH = @__DIR__

ENV["DATADEPS_ALWAYS_ACCEPT"] = true

cuda_tests = [
"cuda",
]

tests = [
"datasets",
# "filter",
Expand All @@ -24,6 +28,13 @@ tests = [
"plots",
]

if CUDA.functional()
CUDA.allowscalar(false)
append!(tests, cuda_tests)
else
@warn "CUDA unavailable, not testing GPU support"
end

@testset "SnowyOwl.jl" begin
for t in tests
include("$(t).jl")
Expand Down