Code for processing TLAE dataset
emdupre committed May 9, 2024
1 parent 21a781f commit fd9eacb
Showing 3 changed files with 204 additions and 0 deletions.
47 changes: 47 additions & 0 deletions bayes_ca/data/tlae-embeddings.py
@@ -0,0 +1,47 @@
import json
from pathlib import Path

import click
import whisper
import numpy as np
from sentence_transformers import SentenceTransformer


@click.command()
@click.option("--file")
@click.option("--datadir")
@click.option("--outdir")
def main(file, datadir, outdir):
"""
This script assumes you have access to the copyrighted stimuli
and that you are running in an environment with SBert (i.e.,
sentence-transformers) and OpenAI's Whisper installed.
Params
------
file : str
Stimulus file to transcribe and generate embedding
datadir : str
Local path to the stimuli files
outdir : str
Local path to store transcriptions and embeddings
"""
model = whisper.load_model("medium.en")
result = model.transcribe(str(Path(datadir, file)))

whisper_outname = "whisper-" + str(Path(file).with_suffix(".json"))
with open(Path(outdir, whisper_outname), "w") as outfile:
json.dump(result, outfile, indent=4)

sentences = result["text"].split(". ")
model = SentenceTransformer("all-mpnet-base-v2")

embeddings = model.encode(sentences)
sbert_outname = Path(outdir, f"sbert-{whisper_outname}")
np.save(sbert_outname, embeddings, allow_pickle=False)

return


if __name__ == "__main__":
main()
32 changes: 32 additions & 0 deletions bayes_ca/data/tlae-fmriprep.sh
@@ -0,0 +1,32 @@
#!/bin/bash
#
#SBATCH --job-name=tlae_fMRIPrep
#SBATCH --output=tlae_fmriprep.%j.out
#SBATCH --time=1-00:00
#SBATCH --cpus-per-task=16
#SBATCH --mem-per-cpu=8GB
#SBATCH --array=0-25
#SBATCH -p russpold,owners

# Define directories

DATADIR=$OAK/users/emdupre/think-like-an-expert/ds003233
OUTDIR=$SCRATCH/think-like-an-expert
SIFDIR=$OAK/users/emdupre/think-like-an-expert/
LICENSE=$HOME/submission_scripts

# Begin work section
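# Enumerate BIDS subject directories matching sub-s* and sort them, so that
# each SLURM array index maps to a fixed subject label.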
subj_list=(`find $DATADIR -maxdepth 1 -type d -name 'sub-s*' -printf '%f\n' | sort -n -ts -k2.1`)
sub="${subj_list[$SLURM_ARRAY_TASK_ID]}"
echo "SUBJECT_ID: " $sub

singularity run --cleanenv -B ${DATADIR}:/data:ro \
-B ${OUTDIR}:/out \
-B ${LICENSE}/license.txt:/license/license.txt:ro \
${SIFDIR}/fmriprep-23-2-0.sif \
/data /out participant \
--participant-label ${sub} \
--output-space fsaverage5 MNI152NLin2009cAsym:res-2 \
-w /out/workdir \
--notrack \
--fs-license-file /license/license.txt
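
# Submit with `sbatch tlae-fmriprep.sh`; SLURM array indices 0-25 select
# subjects from subj_list above.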
125 changes: 125 additions & 0 deletions bayes_ca/data/tlae-postproc.py
@@ -0,0 +1,125 @@
from pathlib import Path

import click
import numpy as np
import nibabel as nib
from scipy import stats
from sklearn import linear_model
from nilearn import image, masking


def _regress_confounds(data, confounds):
"""
Regress out confounds from data.
"""
lr = linear_model.LinearRegression()
lr.fit(confounds, data.T)
regr_data = data - np.dot(lr.coef_, confounds.T) - lr.intercept_[:, None]
    # Note that some percentage of values on the cortical surface are NaNs,
    # so z-scoring propagates NaNs here; they are zeroed out below
zscore_data = stats.zscore(regr_data, axis=1)
return np.nan_to_num(zscore_data)


def create_sessions(postproc_dir, subject_id):
"""
Row stack individual fMRI files to yield one time x voxel matrix per session
"""
p = Path(postproc_dir)
for i in range(1, 6):
ses_id = f"ses-wk{i}"
        files = list(p.glob(f"{subject_id}_{ses_id}_task-*_AG_roi.npy"))
        files = sorted(files)
        ses_data = np.row_stack([np.load(f) for f in files])
        # save the stacked session matrix alongside the per-run files
        np.save(Path(postproc_dir, f"{subject_id}_{ses_id}_AG_roi"), ses_data)
return


@click.command()
@click.option("--datadir")
@click.option("--outdir")
@click.option("--subject")
def main(datadir, outdir, subject):
"""
Known issues identified in Lee et al., 2024, NeurIPS :
- sub-s103, no ses-wk3
- after sub-s106 w4recap, w5recap was skipped
- sub-s112, no ses-wk6 placement
- sub-s201, no ses-wk6 placement
"""
s = Path(datadir, subject)

vols = sorted(s.rglob("*preproc_bold.nii.gz"))
for vol in vols:
sub, ses, task, space, res, _, _ = vol.name.split("_")
print(f"Processing : {vol}")
mask = s.rglob(f"**/func/*{ses}_{task}_{space}_{res}_desc-brain_mask.nii.gz")
surfs = s.rglob(f"{sub}_{ses}_{task}*bold.func.gii")
cfile = s.rglob(f"{sub}_{ses}_{task}_desc-confounds_timeseries.tsv")

# load confounds
conf = np.genfromtxt(next(cfile).as_posix(), names=True)
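        # These column names follow fMRIPrep's confounds output: motion
        # parameters and their temporal derivatives, framewise displacement,
        # the first six aCompCor components, and cosine drift regressors.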
conf_keys = [
"trans_x", # Motion and motion derivatives
"trans_x_derivative1",
"trans_y",
"trans_y_derivative1",
"trans_z",
"trans_z_derivative1",
"rot_x",
"rot_x_derivative1",
"rot_y",
"rot_y_derivative1",
"rot_z",
"rot_z_derivative1",
"framewise_displacement",
]
conf_keys += [f"a_comp_cor_{i:02d}" for i in range(6)]
conf_keys += ["cosine00", "cosine01"]

try:
regrs = np.column_stack([conf[c] for c in conf_keys])
        except ValueError:  # scan too short; no cosine01 generated
conf_keys = conf_keys[:-1]
regrs = np.column_stack([conf[c] for c in conf_keys])
regrs = np.nan_to_num(regrs)

# clean data using linear regression of confounds
mask = next(mask)
masked_vol = masking.apply_mask(vol, mask)
clean_vol = _regress_confounds(masked_vol.T, regrs)

for surf in surfs:
surf_data = np.column_stack([x.data for x in nib.load(surf).darrays])
if "hemi-L" in str(surf):
clean_surf_l = _regress_confounds(surf_data, regrs)
elif "hemi-R" in str(surf):
clean_surf_r = _regress_confounds(surf_data, regrs)

# we'll also extract data for ang. gyr. ROI in volumetric space
roi = Path(datadir, "..", "AG_mask.nii.gz")
unmask_vol = masking.unmask(clean_vol.T, mask)
# paper reports 3mm isotropic voxels, but 2mm acquired so resample
rs_vol = image.resample_img(unmask_vol, np.eye(3) * 3)
rs_mask = image.resample_to_img(roi, rs_vol, interpolation="nearest")
ang_roi = masking.apply_mask(rs_vol, rs_mask)

        # Save hdf5 file with all fields as n_samples x n_features
# savepath = Path(outdir, f"{sub}_{ses}_{task}.h5")
# with h5py.File(savepath, "w") as hf:
# grp = hf.create_group(ses)
# grp.create_dataset("surf-l", data=clean_surf_l.T)
# grp.create_dataset("surf-r", data=clean_surf_r.T)
# grp.create_dataset("vol", data=clean_vol.T)
# grp.create_dataset("regrs", data=regrs)
# # grp.create_dataset("ag-roi", data=ang_roi)

np.save(Path(outdir, f"{sub}_{ses}_{task}_AG_roi"), ang_roi)

    # Inefficient but safer: re-load individual task files and combine
# into larger session-specific time x voxel matrices
create_sessions(outdir, subject)


if __name__ == "__main__":
main()
