From 393fcf145272a437c28be06aa819a826ccf369e2 Mon Sep 17 00:00:00 2001 From: jkobject Date: Tue, 18 Feb 2025 14:15:21 +0100 Subject: [PATCH 1/4] dbug scprint --- src/methods/scprint/config.vsh.yaml | 12 ++++-------- src/methods/scprint/script.py | 9 ++++++--- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/methods/scprint/config.vsh.yaml b/src/methods/scprint/config.vsh.yaml index 0efe1d84..5fc8c98d 100644 --- a/src/methods/scprint/config.vsh.yaml +++ b/src/methods/scprint/config.vsh.yaml @@ -57,7 +57,7 @@ arguments: - name: --batch_size type: integer description: The size of the batches to be used in the DataLoader. - default: 64 + default: 32 - name: --max_len type: integer description: The maximum length of the gene sequence. @@ -75,19 +75,15 @@ engines: setup: - type: python pip: - - huggingface_hub - # Can be unpinned after https://github.com/cantinilab/scPRINT/issues/14 is resolved - - scprint==1.6.2 - - scdataloader==1.6.4 + - scprint - type: docker run: lamin init --storage ./main --name main --schema bionty - - type: python - script: import bionty as bt; bt.core.sync_all_sources_to_latest() - type: docker run: lamin load anonymous/main - type: python script: from scdataloader.utils import populate_my_ontology; populate_my_ontology() - + - type: python + script: import bionty as bt; bt.core.sync_all_sources_to_latest() runners: - type: executable - type: nextflow diff --git a/src/methods/scprint/script.py b/src/methods/scprint/script.py index 5f0c95e8..adb040e9 100644 --- a/src/methods/scprint/script.py +++ b/src/methods/scprint/script.py @@ -74,16 +74,19 @@ print("CUDA is not available, using CPU", flush=True) precision = "32" dtype = torch.float32 -n_cores_available = len(os.sched_getaffinity(0)) -print(f"Using {n_cores_available} worker cores") +n_cores = min(len(os.sched_getaffinity(0)), 24) +print(f"Using {n_cores} worker cores") embedder = Embedder( how="random expr", batch_size=par["batch_size"], max_len=par["max_len"], 
add_zero_genes=0, - num_workers=n_cores_available, + num_workers=n_cores, doclass=False, doplot=False, + pred_embedding=["cell_type_ontology_term_id"], + keep_all_cls_pred=False, + output_expression="none", precision=precision, dtype=dtype, ) From 0bc9c5ab855e7fbe9c84ba82b6d6db660fd6d647 Mon Sep 17 00:00:00 2001 From: jkobject Date: Tue, 18 Feb 2025 14:31:17 +0100 Subject: [PATCH 2/4] allowing flash attn --- src/methods/scprint/script.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/methods/scprint/script.py b/src/methods/scprint/script.py index adb040e9..7ec38d3e 100644 --- a/src/methods/scprint/script.py +++ b/src/methods/scprint/script.py @@ -58,22 +58,26 @@ model_checkpoint_file = hf_hub_download( repo_id="jkobject/scPRINT", filename=f"{par['model_name']}.ckpt" ) -print(f"Model checkpoint file: '{model_checkpoint_file}'", flush=True) -model = scPrint.load_from_checkpoint( - model_checkpoint_file, - transformer="normal", # Don't use this for GPUs with flashattention - precpt_gene_emb=None, -) print("\n>>> Embedding data...", flush=True) if torch.cuda.is_available(): print("CUDA is available, using GPU", flush=True) precision = "16" dtype = torch.float16 + transformer="flash" else: print("CUDA is not available, using CPU", flush=True) precision = "32" dtype = torch.float32 + transformer="normal" + +print(f"Model checkpoint file: '{model_checkpoint_file}'", flush=True) +model = scPrint.load_from_checkpoint( + model_checkpoint_file, + transformer=transformer, # "flash" when CUDA is available, "normal" on CPU (selected above) + precpt_gene_emb=None, +) + n_cores = min(len(os.sched_getaffinity(0)), 24) print(f"Using {n_cores} worker cores") embedder = Embedder( From 3cb94a302e5ec114174df5a3e6666733c81e84bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20Kalfon?= Date: Thu, 20 Feb 2025 12:44:50 +0100 Subject: [PATCH 3/4] Update _viash.yaml --- _viash.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/_viash.yaml b/_viash.yaml index 36dd957a..5b612d43 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -91,7 +91,11 @@ authors: info: github: sainirmayi orcid: 0009-0003-6319-9803 - + - name: Jeremie Kalfon + roles: [contributor] + info: + github: jkobject + orcid: 0000-0002-2818-9728 config_mods: | .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } From 086946bcfd57b7540ff76ae6b3f09ad8ead5c453 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Fri, 21 Feb 2025 08:47:45 +0100 Subject: [PATCH 4/4] Update CHANGELOG --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 48f7bceb..53a22533 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +# task_batch_integration devel + +## Minor changes + +* Un-pin the scPRINT version and update parameters (PR #51) + # task_batch_integration 2.0.0 A major update to the OpenProblems framework, switching from a Python-based framework to a Viash + Nextflow-based framework. This update features the same concepts as the previous version, but with a new implementation that is more flexible, scalable, and maintainable.