diff --git a/bayes_ca/data/tlae-embeddings.py b/bayes_ca/data/tlae-embeddings.py
new file mode 100644
index 0000000..c7ccac5
--- /dev/null
+++ b/bayes_ca/data/tlae-embeddings.py
@@ -0,0 +1,55 @@
+import json
+from pathlib import Path
+
+import click
+import whisper
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+
+@click.command()
+@click.option("--file", required=True)
+@click.option("--datadir", required=True)
+@click.option("--outdir", required=True)
+def main(file, datadir, outdir):
+    """
+    Transcribe a stimulus file with Whisper and embed its sentences with SBert.
+
+    This script assumes you have access to the copyrighted stimuli
+    and that you are running in an environment with SBert (i.e.,
+    sentence-transformers) and OpenAI's Whisper installed.
+
+    Params
+    ------
+    file : str
+        Stimulus file to transcribe and generate embedding
+    datadir : str
+        Local path to the stimuli files
+    outdir : str
+        Local path to store transcriptions and embeddings
+    """
+    # Create the output directory up front so a missing path does not
+    # surface only after the (slow) transcription has finished.
+    Path(outdir).mkdir(parents=True, exist_ok=True)
+
+    transcriber = whisper.load_model("medium.en")
+    result = transcriber.transcribe(str(Path(datadir, file)))
+
+    # Keep the complete Whisper result (the "text" key is reused below).
+    whisper_outname = "whisper-" + str(Path(file).with_suffix(".json"))
+    with open(Path(outdir, whisper_outname), "w", encoding="utf-8") as outfile:
+        json.dump(result, outfile, indent=4)
+
+    # Naive sentence segmentation: split the transcript on ". ".
+    sentences = result["text"].split(". ")
+    encoder = SentenceTransformer("all-mpnet-base-v2")
+
+    # NOTE: np.save appends ".npy" when the name lacks it, so the file on
+    # disk is "sbert-whisper-NAME.json.npy".
+    embeddings = encoder.encode(sentences)
+    sbert_outname = Path(outdir, f"sbert-{whisper_outname}")
+    np.save(sbert_outname, embeddings, allow_pickle=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bayes_ca/data/tlae-fmriprep.sh b/bayes_ca/data/tlae-fmriprep.sh
new file mode 100644
index 0000000..a9de4f0
--- /dev/null
+++ b/bayes_ca/data/tlae-fmriprep.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#
+#SBATCH --job-name=tlae_fMRIPrep
+#SBATCH --output=tlae_fmriprep.%j.out
+#SBATCH --time=1-00:00
+#SBATCH --cpus-per-task=16
+#SBATCH --mem-per-cpu=8GB
+#SBATCH --array=0-25
+#SBATCH -p russpold,owners
+
+# Define directories
+
+DATADIR="$OAK/users/emdupre/think-like-an-expert/ds003233"
+OUTDIR="$SCRATCH/think-like-an-expert"
+SIFDIR="$OAK/users/emdupre/think-like-an-expert/"
+LICENSE="$HOME/submission_scripts"
+
+# Begin work section
+# One array task per subject: list the sub-s* directories in sorted order
+# and pick the entry at this task's array index.
+subj_list=($(find "$DATADIR" -maxdepth 1 -type d -name 'sub-s*' -printf '%f\n' | sort -n -ts -k2.1))
+sub="${subj_list[$SLURM_ARRAY_TASK_ID]}"
+echo "SUBJECT_ID: " "$sub"
+
+# Fail early with a clear message when the array index has no matching
+# subject (e.g. fewer subject directories than array tasks).
+if [ -z "$sub" ]; then
+    echo "No subject found for array index ${SLURM_ARRAY_TASK_ID}" >&2
+    exit 1
+fi
+
+singularity run --cleanenv -B "${DATADIR}":/data:ro \
+    -B "${OUTDIR}":/out \
+    -B "${LICENSE}"/license.txt:/license/license.txt:ro \
+    "${SIFDIR}"/fmriprep-23-2-0.sif \
+    /data /out participant \
+    --participant-label "${sub}" \
+    --output-spaces fsaverage5 MNI152NLin2009cAsym:res-2 \
+    -w /out/workdir \
+    --notrack \
+    --fs-license-file /license/license.txt