Code for processing TLAE dataset
emdupre committed May 9, 2024
1 parent 21a781f commit fd9eacb
Showing 3 changed files with 204 additions and 0 deletions.
47 changes: 47 additions & 0 deletions bayes_ca/data/tlae-embeddings.py
@@ -0,0 +1,47 @@
import json
from pathlib import Path

import click
import whisper
import numpy as np
from sentence_transformers import SentenceTransformer


@click.command()
@click.option("--file")
@click.option("--datadir")
@click.option("--outdir")
def main(file, datadir, outdir):
"""
This script assumes you have access to the copyrighted stimuli
and that you are running in an environment with SBert (i.e.,
sentence-transformers) and OpenAI's Whisper installed.
Params
------
file : str
Stimulus file to transcribe and generate embedding
datadir : str
Local path to the stimuli files
outdir : str
Local path to store transcriptions and embeddings
"""
model = whisper.load_model("medium.en")
result = model.transcribe(str(Path(datadir, file)))

whisper_outname = "whisper-" + str(Path(file).with_suffix(".json"))
with open(Path(outdir, whisper_outname), "w") as outfile:
json.dump(result, outfile, indent=4)

sentences = result["text"].split(". ")
model = SentenceTransformer("all-mpnet-base-v2")

embeddings = model.encode(sentences)
sbert_outname = Path(outdir, f"sbert-{whisper_outname}")
np.save(sbert_outname, embeddings, allow_pickle=False)

return


if __name__ == "__main__":
main()
32 changes: 32 additions & 0 deletions bayes_ca/data/tlae-fmriprep.sh
@@ -0,0 +1,32 @@
#!/bin/bash
#
#SBATCH --job-name=tlae_fMRIPrep
#SBATCH --output=tlae_fmriprep.%j.out
#SBATCH --time=1-00:00
#SBATCH --cpus-per-task=16
#SBATCH --mem-per-cpu=8GB
#SBATCH --array=0-25
#SBATCH -p russpold,owners

# Define directories

DATADIR=$OAK/users/emdupre/think-like-an-expert/ds003233
OUTDIR=$SCRATCH/think-like-an-expert
SIFDIR=$OAK/users/emdupre/think-like-an-expert/
LICENSE=$HOME/submission_scripts

# Begin work section
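# Enumerate BIDS subject directories matching sub-s* and sort them, so that
# each SLURM array index maps to a fixed subject label.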
subj_list=(`find $DATADIR -maxdepth 1 -type d -name 'sub-s*' -printf '%f\n' | sort -n -ts -k2.1`)
sub="${subj_list[$SLURM_ARRAY_TASK_ID]}"
echo "SUBJECT_ID: " $sub

singularity run --cleanenv -B ${DATADIR}:/data:ro \
-B ${OUTDIR}:/out \
-B ${LICENSE}/license.txt:/license/license.txt:ro \
${SIFDIR}/fmriprep-23-2-0.sif \
/data /out participant \
--participant-label ${sub} \
--output-space fsaverage5 MNI152NLin2009cAsym:res-2 \
-w /out/workdir \
--notrack \
--fs-license-file /license/license.txt
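
# Submit with `sbatch tlae-fmriprep.sh`; SLURM array indices 0-25 select
# subjects from subj_list above.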
125 changes: 125 additions & 0 deletions bayes_ca/data/tlae-postproc.py
@@ -0,0 +1,125 @@
from pathlib import Path

import click
import numpy as np
import nibabel as nib
from scipy import stats
from sklearn import linear_model
from nilearn import image, masking


def _regress_confounds(data, confounds):
"""
Regress out confounds from data.
"""
lr = linear_model.LinearRegression()
lr.fit(confounds, data.T)
regr_data = data - np.dot(lr.coef_, confounds.T) - lr.intercept_[:, None]
    # Note that some percentage of values on the cortical surface are NaNs,
    # so z-scoring propagates NaNs here; they are zeroed out below
zscore_data = stats.zscore(regr_data, axis=1)
return np.nan_to_num(zscore_data)


def create_sessions(postproc_dir, subject_id):
"""
Row stack individual fMRI files to yield one time x voxel matrix per session
"""
p = Path(postproc_dir)
for i in range(1, 6):
ses_id = f"ses-wk{i}"
        files = list(p.glob(f"{subject_id}_{ses_id}_task-*_AG_roi.npy"))
        files = sorted(files)
        ses_data = np.row_stack([np.load(f) for f in files])
        # save the stacked session matrix alongside the per-run files
        np.save(Path(postproc_dir, f"{subject_id}_{ses_id}_AG_roi"), ses_data)
return


@click.command()
@click.option("--datadir")
@click.option("--outdir")
@click.option("--subject")
def main(datadir, outdir, subject):
"""
Known issues identified in Lee et al., 2024, NeurIPS :
- sub-s103, no ses-wk3
- after sub-s106 w4recap, w5recap was skipped
- sub-s112, no ses-wk6 placement
- sub-s201, no ses-wk6 placement
"""
s = Path(datadir, subject)

vols = sorted(s.rglob("*preproc_bold.nii.gz"))
for vol in vols:
sub, ses, task, space, res, _, _ = vol.name.split("_")
print(f"Processing : {vol}")
mask = s.rglob(f"**/func/*{ses}_{task}_{space}_{res}_desc-brain_mask.nii.gz")
surfs = s.rglob(f"{sub}_{ses}_{task}*bold.func.gii")
cfile = s.rglob(f"{sub}_{ses}_{task}_desc-confounds_timeseries.tsv")

# load confounds
conf = np.genfromtxt(next(cfile).as_posix(), names=True)
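        # These column names follow fMRIPrep's confounds output: motion
        # parameters and their temporal derivatives, framewise displacement,
        # the first six aCompCor components, and cosine drift regressors.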
conf_keys = [
"trans_x", # Motion and motion derivatives
"trans_x_derivative1",
"trans_y",
"trans_y_derivative1",
"trans_z",
"trans_z_derivative1",
"rot_x",
"rot_x_derivative1",
"rot_y",
"rot_y_derivative1",
"rot_z",
"rot_z_derivative1",
"framewise_displacement",
]
conf_keys += [f"a_comp_cor_{i:02d}" for i in range(6)]
conf_keys += ["cosine00", "cosine01"]

try:
regrs = np.column_stack([conf[c] for c in conf_keys])
        except ValueError:  # scan too short; no cosine01 generated
conf_keys = conf_keys[:-1]
regrs = np.column_stack([conf[c] for c in conf_keys])
regrs = np.nan_to_num(regrs)

# clean data using linear regression of confounds
mask = next(mask)
masked_vol = masking.apply_mask(vol, mask)
clean_vol = _regress_confounds(masked_vol.T, regrs)

for surf in surfs:
surf_data = np.column_stack([x.data for x in nib.load(surf).darrays])
if "hemi-L" in str(surf):
clean_surf_l = _regress_confounds(surf_data, regrs)
elif "hemi-R" in str(surf):
clean_surf_r = _regress_confounds(surf_data, regrs)

# we'll also extract data for ang. gyr. ROI in volumetric space
roi = Path(datadir, "..", "AG_mask.nii.gz")
unmask_vol = masking.unmask(clean_vol.T, mask)
# paper reports 3mm isotropic voxels, but 2mm acquired so resample
rs_vol = image.resample_img(unmask_vol, np.eye(3) * 3)
rs_mask = image.resample_to_img(roi, rs_vol, interpolation="nearest")
ang_roi = masking.apply_mask(rs_vol, rs_mask)

        # Save hdf5 file with all fields as n_samples x n_features
# savepath = Path(outdir, f"{sub}_{ses}_{task}.h5")
# with h5py.File(savepath, "w") as hf:
# grp = hf.create_group(ses)
# grp.create_dataset("surf-l", data=clean_surf_l.T)
# grp.create_dataset("surf-r", data=clean_surf_r.T)
# grp.create_dataset("vol", data=clean_vol.T)
# grp.create_dataset("regrs", data=regrs)
# # grp.create_dataset("ag-roi", data=ang_roi)

np.save(Path(outdir, f"{sub}_{ses}_{task}_AG_roi"), ang_roi)

    # Inefficient but safer: re-load individual task files and combine
# into larger session-specific time x voxel matrices
create_sessions(outdir, subject)


if __name__ == "__main__":
main()
