Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev #3

Merged
merged 2 commits into from
Jun 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions R/cross_validation.R
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,6 @@ fn_cross_validation_preparation = function(list_merged, cv_type=1, n_folds=10, n
n_folds=n_folds,
n_reps=n_reps,
memory_requested_Gb=max_mem_Gb,
memory_multiplier=40,
verbose=verbose)
} else if (cv_type == 2) {
############################################
Expand Down Expand Up @@ -455,7 +454,6 @@ fn_cross_validation_preparation = function(list_merged, cv_type=1, n_folds=10, n
n_folds=n_folds,
n_reps=1,
memory_requested_Gb=max_mem_Gb,
memory_multiplier=40,
verbose=verbose)
} else if (cv_type == 3) {
#################################################
Expand Down Expand Up @@ -486,7 +484,6 @@ fn_cross_validation_preparation = function(list_merged, cv_type=1, n_folds=10, n
n_folds=n_folds,
n_reps=1,
memory_requested_Gb=max_mem_Gb,
memory_multiplier=40,
verbose=verbose)
} else {
error = methods::new("gpError",
Expand Down
7 changes: 4 additions & 3 deletions R/io.R
Original file line number Diff line number Diff line change
Expand Up @@ -2276,7 +2276,7 @@ fn_subset_merged_genotype_and_phenotype = function(list_merged, vec_idx, verbose
#' @param n_folds number of cross-validation folds (Default=10)
#' @param n_reps number of cross-validation replication (Default=10)
#' @param memory_requested_Gb memory requested or available for use (Default=400)
#' @param memory_multiplier estimated memory usage multiplier (Default=40)
#' @param memory_multiplier estimated memory usage multiplier (Default=50)
#' @param verbose show memory usage estimation messages? (Default=FALSE)
#' @returns
#' - Ok:
Expand All @@ -2288,15 +2288,16 @@ fn_subset_merged_genotype_and_phenotype = function(list_merged, vec_idx, verbose
#' list_mem = fn_estimate_memory_footprint(X=rnorm(10000), verbose=TRUE)
#' @export
fn_estimate_memory_footprint = function(X, n_models=7, n_folds=10, n_reps=10,
memory_requested_Gb=400, memory_multiplier=40, verbose=FALSE) {
memory_requested_Gb=400, memory_multiplier=50, verbose=FALSE)
{
###################################################
### TEST
# X = matrix(0.0, nrow=492, ncol=455255)
# n_models = 7
# n_reps = 10
# n_folds = 10
# memory_requested_Gb = 400
# memory_multiplier = 40
# memory_multiplier = 50
# verbose = TRUE
###################################################
if ((as.numeric(utils::object.size(X)) == 0) | (n_models <= 0) | (n_folds <= 0) | (n_reps <= 0) | (memory_requested_Gb <= 0) | (memory_multiplier <= 0)) {
Expand Down
95 changes: 95 additions & 0 deletions exec/00_gp_slurm_job_wrapper-LUCERNE.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/bin/bash

### Load the conda environment
module load Miniconda3/22.11.1-1
conda init bash
source ~/.bashrc
conda activate genomic_selection

################################################################
### TOP-LEVEL SLURM ARRAY JOB SUBMISSION SCRIPT
### Please edit the input variables below to match your dataset:
################################################################

### Input variables (use the absolute path to files to be precise)
### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci.
### Should have no missing data or else will be imputed via mean value imputation.
GENOTYPE_DATA_RDS='/group/pasture/Jeff/lucerne/workdir/FINAL-IMPUTED-noTrailingAllele-filteredSNPlist.Rds'
### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
PHENOTYPE_DATA_TSV='/group/pasture/Jeff/lucerne/workdir/Lucerne_PhenomicsDB_2024-05-27-BiomassPredicted.tsv'
### (3) Number of folds for k-fold cross-validation.
KFOLDS=5
### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
NREPS=5
### (5) Full path to the location of the executable Rscript gp.R
DIR='/group/pasture/Jeff/gp/exec'

### Check if the genotype file exists
if [ ! -f $GENOTYPE_DATA_RDS ]
then
echo "Error: The genotype file: $GENOTYPE_DATA_RDS does not exist. Are you specifying the full path? Is the name correct?"
exit 101
fi
### Check if the phenotype file exists
if [ ! -f $PHENOTYPE_DATA_TSV ]
then
echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV does not exist. Are you specifying the full path? Is the name correct?"
exit 102
fi
### Check if the genotype file is a valid Rds file
echo 'args = commandArgs(trailingOnly=TRUE)
geno = suppressWarnings(tryCatch(readRDS(args[1]), error=function(e){print("Error loading genotype file.")}))
' > test_geno_rds.R
if [ $(Rscript test_geno_rds.R $GENOTYPE_DATA_RDS | grep -i "error" | wc -l) -eq 1 ]
then
echo "Error: The genotype file: $GENOTYPE_DATA_RDS is not an Rds file."
exit 103
fi
rm test_geno_rds.R
### Check if the phenotype file is formatted according to the required specifications
echo 'args = commandArgs(trailingOnly=TRUE)
pheno = suppressWarnings(tryCatch(read.delim(args[1], sep="\t", header=TRUE), error=function(e){print("Error loading phenotype file.")}))
' > test_pheno_rds.R
if [ $(Rscript test_pheno_rds.R $PHENOTYPE_DATA_TSV | grep -i "error" | wc -l) -eq 1 ]
then
echo "Error: The phenotype file: $GENOTYPE_DATA_RDS is not formatted according to specifications. It should be tab-delimited and a header line must be present."
exit 104
fi
rm test_pheno_rds.R
### Check if the genomic_selection repo folder exists
if [ ! -d $DIR ]
then
echo "Error: The genotype_selection directory: $DIR does not exist. Are you specifying the full path? Is the name correct?"
exit 105
fi
### Check if the genomic_selection repo belongs to the user
if [ ! -w $DIR ]
then
echo "Error: You do not have permission to write in the genotype_selection directory: $DIR. Did you clone the genomic_selection repository into a directory you have write-access to?"
exit 106
fi
### Check if the genomic_selection repo has contains the slurm array job submission script
if [ ! -f ${DIR}/01_gp_slurm_job.sh ]
then
echo "Error: The genotype_selection directory: $DIR does not contain the script: 01_gp_slurm_job.sh. Are you sure this is the genomic_selection repo directory?"
exit 107
fi
### Initialise the output directory which will contain all the output Rds files across populations and traits
if [ ! -d ${DIR}/output ]
then
mkdir ${DIR}/output
fi
### Submit an array of jobs equivalent to the number of traits in the phenotype file
cd $DIR/
N_TRAITS=$(echo $(head -n1 $PHENOTYPE_DATA_TSV | awk '{print NF}') - 2 | bc)
N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
sbatch --array 1-$(echo "${N_TRAITS} * ${N_POPS}" | bc) \
01_gp_slurm_job-LUCERNE.sh \
${GENOTYPE_DATA_RDS} \
${PHENOTYPE_DATA_TSV} \
${KFOLDS} \
${NREPS} \
${DIR}
95 changes: 95 additions & 0 deletions exec/00_gp_slurm_job_wrapper-RYEGRASS.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/bin/bash

### Load the conda environment
module load Miniconda3/22.11.1-1
conda init bash
source ~/.bashrc
conda activate genomic_selection

################################################################
### TOP-LEVEL SLURM ARRAY JOB SUBMISSION SCRIPT
### Please edit the input variables below to match your dataset:
################################################################

### Input variables (use the absolute path to files to be precise)
### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci.
### Should have no missing data or else will be imputed via mean value imputation.
GENOTYPE_DATA_RDS='/group/pasture/Jeff/ryegrass/workdir/STR_NUE_WUE_HS-1717536141.3435302.3200855812-IMPUTED.Rds'
### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
PHENOTYPE_DATA_TSV='/group/pasture/Jeff/ryegrass/workdir/STR_NUE_WUE_PhenomicsDB_2024-05-20-BiomassPredictedAndGroudtruth.tsv'
### (3) Number of folds for k-fold cross-validation.
KFOLDS=5
### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
NREPS=5
### (5) Full path to the location of the executable Rscript gp.R
DIR='/group/pasture/Jeff/gp/exec'

### Check if the genotype file exists
if [ ! -f $GENOTYPE_DATA_RDS ]
then
echo "Error: The genotype file: $GENOTYPE_DATA_RDS does not exist. Are you specifying the full path? Is the name correct?"
exit 101
fi
### Check if the phenotype file exists
if [ ! -f $PHENOTYPE_DATA_TSV ]
then
echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV does not exist. Are you specifying the full path? Is the name correct?"
exit 102
fi
### Check if the genotype file is a valid Rds file
echo 'args = commandArgs(trailingOnly=TRUE)
geno = suppressWarnings(tryCatch(readRDS(args[1]), error=function(e){print("Error loading genotype file.")}))
' > test_geno_rds.R
if [ $(Rscript test_geno_rds.R $GENOTYPE_DATA_RDS | grep -i "error" | wc -l) -eq 1 ]
then
echo "Error: The genotype file: $GENOTYPE_DATA_RDS is not an Rds file."
exit 103
fi
rm test_geno_rds.R
### Check if the phenotype file is formatted according to the required specifications
echo 'args = commandArgs(trailingOnly=TRUE)
pheno = suppressWarnings(tryCatch(read.delim(args[1], sep="\t", header=TRUE), error=function(e){print("Error loading phenotype file.")}))
' > test_pheno_rds.R
if [ $(Rscript test_pheno_rds.R $PHENOTYPE_DATA_TSV | grep -i "error" | wc -l) -eq 1 ]
then
echo "Error: The phenotype file: $GENOTYPE_DATA_RDS is not formatted according to specifications. It should be tab-delimited and a header line must be present."
exit 104
fi
rm test_pheno_rds.R
### Check if the genomic_selection repo folder exists
if [ ! -d $DIR ]
then
echo "Error: The genotype_selection directory: $DIR does not exist. Are you specifying the full path? Is the name correct?"
exit 105
fi
### Check if the genomic_selection repo belongs to the user
if [ ! -w $DIR ]
then
echo "Error: You do not have permission to write in the genotype_selection directory: $DIR. Did you clone the genomic_selection repository into a directory you have write-access to?"
exit 106
fi
### Check if the genomic_selection repo has contains the slurm array job submission script
if [ ! -f ${DIR}/01_gp_slurm_job.sh ]
then
echo "Error: The genotype_selection directory: $DIR does not contain the script: 01_gp_slurm_job.sh. Are you sure this is the genomic_selection repo directory?"
exit 107
fi
### Initialise the output directory which will contain all the output Rds files across populations and traits
if [ ! -d ${DIR}/output ]
then
mkdir ${DIR}/output
fi
### Submit an array of jobs equivalent to the number of traits in the phenotype file
cd $DIR/
N_TRAITS=$(echo $(head -n1 $PHENOTYPE_DATA_TSV | awk '{print NF}') - 2 | bc)
N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
sbatch --array 1-$(echo "${N_TRAITS} * ${N_POPS}" | bc) \
01_gp_slurm_job-RYEGRASS.sh \
${GENOTYPE_DATA_RDS} \
${PHENOTYPE_DATA_TSV} \
${KFOLDS} \
${NREPS} \
${DIR}
120 changes: 120 additions & 0 deletions exec/01_gp_slurm_job-LUCERNE.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/bin/bash
#SBATCH --job-name="GS"
#SBATCH --account="dbiopast2" ### EDIT ME: Pick the appropriate account name, e.g. dbiopast1 or dbiopast2
#SBATCH --ntasks=1 ### LEAVE ME:Request a single task as we will be submitting this as an array job where each job corresponds to a trait
#SBATCH --cpus-per-task=32 ### EDIT ME: Parallelisation across replications, folds and models (more cpu means faster execution time but probably longer time to wait for the Slurm scheduler to find resources to allocate to the job)
#SBATCH --mem=400G ### EDIT ME: Proportional to the input data (will need to test the appropriate memory required, hint use `seff ${JOBID}`)
#SBATCH --time=7-0:0:00 ### EDIT ME: Proportional to the input data, number of folds, replications, and models to be used
###################################################################################################
### Edit the Slurm settings above to match your requirements.
###################################################################################################

###################################################################################################
### The variables below will be exported from `00_gs_slurm_job_wrapper.sh`:
###################################################################################################
### Input variables (use the absolute path to files to be precise)
### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci.
### Should have no missing data or else will be imputed via mean value imputation.
# GENOTYPE_DATA_RDS='/group/pasture/Jeff/genomic_selection/tests/grape.rds'
GENOTYPE_DATA_RDS=$1
### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
# PHENOTYPE_DATA_TSV='/group/pasture/Jeff/genomic_selection/tests/grape_pheno.txt'
PHENOTYPE_DATA_TSV=$2
### (3) Number of folds for k-fold cross-validation.
# KFOLDS=5
KFOLDS=$3
### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
# NREPS=3
NREPS=$4
### (5) Full path to the location of the executable Rscript gp.R
# DIR='/group/pasture/Jeff/gp/exec'
DIR=$5
###################################################################################################
### Edit the code below, if and only if you have read the documentation or familiar with `src/*.R`:
###################################################################################################
### Define the trait and population to include
N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
TRAIT_IDX=$(echo "((${SLURM_ARRAY_TASK_ID}-1) / ${N_POPS}) + 1" | bc)
POP_IDX=$(echo "${SLURM_ARRAY_TASK_ID} % ${N_POPS}" | bc)
if [ "${POP_IDX}" -eq 0 ]
then
POP_IDX=${N_POPS}
fi
COLUMN_ID=$(echo 2 + ${TRAIT_IDX} | bc)
TRAIT=$(head -n1 $PHENOTYPE_DATA_TSV | cut -f${COLUMN_ID})
POP=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | head -n${POP_IDX} | tail -n1)
### Skip leave-one-population-out cross-validation if there is only one population
if [ "${N_POPS}" -eq 1 ]
then
BOOL_ACROSS=FALSE
else
if [ "${POP_IDX}" -eq 1 ]
then
BOOL_ACROSS=TRUE
else
BOOL_ACROSS=FALSE
fi
fi
### Output directories
DIR_OUT_MAIN=${DIR}/output
DIR_OUT=${DIR_OUT_MAIN}/output-${TRAIT}-${POP}
mkdir $DIR_OUT
### Log messages
echo JOB_${SLURM_ARRAY_TASK_ID}-TRAIT_${TRAIT}-POP_${POP} > ${DIR_OUT}/job_info-${TRAIT}-${POP}.log
echo "==========================================
-------------------------------------------
Job Info
-------------------------------------------
SLURM_JOB_ID = $SLURM_JOB_ID
SLURM_JOB_NAME = $SLURM_JOB_NAME
SLURM_JOB_NODELIST = $SLURM_JOB_NODELIST
SLURM_SUBMIT_HOST = $SLURM_SUBMIT_HOST
SLURM_SUBMIT_DIR = $SLURM_SUBMIT_DIR
SLURM_NTASKS = $SLURM_NTASKS
SLURM_ARRAY_TASK_ID = $SLURM_ARRAY_TASK_ID
SLURM_MEM_PER_NODE = $(echo "$SLURM_MEM_PER_NODE / (2^10)" | bc) GB
SLURM_CPUS_PER_TASK = $SLURM_CPUS_PER_TASK
-------------------------------------------
Variables
-------------------------------------------
GENOTYPE_DATA_RDS : $GENOTYPE_DATA_RDS
PHENOTYPE_DATA_TSV : $PHENOTYPE_DATA_TSV
KFOLDS : $KFOLDS
NREPS : $NREPS
TRAIT : $TRAIT
POPULATION : $POP
-------------------------------------------
Output directory
-------------------------------------------
${DIR_OUT}
==========================================" >> ${DIR_OUT}/job_info-${TRAIT}-${POP}.log

### Load the conda environment
module load Miniconda3/22.11.1-1
conda init bash
source ~/.bashrc
conda activate genomic_selection

### Run within and across population replicated k-fold cross-validation and prediction of missing phenotypes
time \
Rscript ${DIR}/gp.R \
--fname-geno $GENOTYPE_DATA_RDS \
--fname-pheno $PHENOTYPE_DATA_TSV \
--population $POP \
--dir-output $DIR_OUT \
--pheno-idx-col-y $COLUMN_ID \
--bool-within TRUE \
--bool-across $BOOL_ACROSS \
--n-folds $KFOLDS \
--n-reps $NREPS \
--bool-parallel TRUE \
--max-mem-Gb $(echo "$SLURM_MEM_PER_NODE / (2^10)" | bc) \
--n-threads $SLURM_CPUS_PER_TASK \
--verbose TRUE >> ${DIR_OUT}/job_info-${TRAIT}-${POP}.log
### Clean-up
mv ${DIR_OUT}/GENOMIC_PREDICTIONS_OUTPUT-*.Rds ${DIR_OUT_MAIN}
mv ${DIR_OUT}/job_info-${TRAIT}-${POP}.log ${DIR_OUT_MAIN}
rm -R ${DIR_OUT}
Loading
Loading