Skip to content

Commit

Permalink
add aditional R scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
drcandacemakedamoore committed Nov 23, 2024
1 parent bbac9f1 commit 89a951c
Show file tree
Hide file tree
Showing 13 changed files with 904 additions and 0 deletions.
21 changes: 21 additions & 0 deletions dwi_2_environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# dwi environment


name: witty
channels:
- conda-forge

dependencies:
- dipy=1.7.0
- fury=0.7.1
- jupyter
- jupyterlab
- matplotlib=3.5.3
- nilearn=0.7.0
- osfclient=0.0.5
- python>=3.10
- pybids

# if environment does not resolve try installing everything but pybids, then `conda install conda-forge::pybids`


22 changes: 22 additions & 0 deletions dwi_3_environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# dwi environment


name: wittier
channels:
- conda-forge

dependencies:
- dipy=1.7.0
- fury=0.7.1
- jupyter
- jupyterlab
- matplotlib=3.5.3
- nilearn=0.7.0
- numexpr=2.8.4
- osfclient=0.0.5
- python>=3.10
- pybids

# if environment does not resolve try installing everything but pybids, then `conda install conda-forge::pybids`


154 changes: 154 additions & 0 deletions r_scripts/1162024Tables_paper_1.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
## in this script, I will try to clean up the dataexport of castor on november 11th. At this thime, score repeated data, volbrain data and freesurfer data have not been uploaded yet
#weirdly enough, the age_T0_MRI_months is exported incorrectly by castor. I decided to limit my next export to just the baseline_MRI form at T0 and to merge it later
library(dplyr)
library(writexl) #to write the tables to excel files
#read in the raw data file. For now in my personal folder, will get moved around when we arrive at the final datastructure
raw <- read.csv("Z:/Aida_experiment/6112024_brick_castor_data/BRICK_export_20241106.csv", sep= ";")
ages <-read.csv("Z:/Aida_experiment/combined_BRICK_marjolein.csv", sep=",") #this file contains the ages and recon-all data, as I apparently cannot export calculated fields from castor
volbrain <- read.csv("Z:/castor_proof_files/csv_castor/current/volbrains_castor.csv") #contains volbrain output

#rename first columns in order to merge later.
raw <- raw %>%
rename(Participant_Id = `ï..Participant.Id`)

ages <- ages %>%
rename(Participant_Id = Participant.Id)

volbrain <- volbrain %>%
rename(Participant_Id = Participant.Id)

#I decided not to clean it up. Instead. Just start with creating the tables.

table1 <- raw %>% select(Participant_Id, Hydrea_at_scan_T0, brick_genotype, mothersbirth, fathersbirth)

#import age in months from the ages df
table1 <- table1 %>%
left_join(select(ages, Participant_Id, Age_at_scan_m_T0X, gender_BRICK), by = "Participant_Id")

#add a column in years. now we have the ingredients for table1
table1 <- table1 %>%
mutate(Age_at_scan_years = round(Age_at_scan_m_T0X / 12, 1))

#Table 2: Lab values including HbF

table2 <- raw %>% select(Participant_Id, ERY0, HB0, MCV0, HT0, reticulocyte_count_percentage_1, LEU0, TROMBO0, FE0, FERT0, TRAF0, TSAT0, ALAT0, LDH0, TBIL0, DBIL0, KREA0, SCHWARTZ_Bedsite0, FOLZ0, UREU0, NA0, K00, VITD0, ASAT0, AFOS0, GGT0, CRP0, KREA_U0, TE_U0, TE_U_KR0, ALB_U0, ALB_U_KR0, B120, HPLC_HBF_T0, HPLC_HBF_date_T0, HPLC_HbS_T0, HPLC_HbS_date_T0)

#Table 3: Descriptives on radiology reports
table3 <- raw %>% select(Participant_Id,Measurement_moment_QC_T1w_T0, Score_QC_T1w_T0, Exclude_Score_QC_T1w_T0, Remarks_Score_QC_T1w_T0, Screened_for_WMH_T0, WMH_observed_T0, Vasculature_examined_T0, Vascular_malformations_T0, Microbleeds_screened_T0, Microbleeds_present_T0, Incidental_finding_T0, Marjolein_T0, Remarks_MRI_T0)

#Table 4: White matter hyperintensities
table4 <- volbrain %>%
select(
Participant_Id, dl_Sex_T0, dl_Age_T0,
dl_Total_lesion_count_T0, dl_Total_lesion_volume_.absolute._cm3_T0,
dl_Total_lesion_volume_.normalized._._T0, dl_Total_lesion_burden_T0,
dl_Periventricular_lesion_count_T0, dl_Periventricular_lesion_volume_.absolute._cm3_T0,
dl_Periventricular_lesion_volume_.normalized._._T0, dl_Periventricular_lesion_burden_T0,
dl_Deep_white_lesion_count_T0, dl_Deep_white_lesion_volume_.absolute._cm3_T0,
dl_Deep_white_lesion_volume_.normalized._._T0, dl_Deep_white_lesion_burden_T0,
dl_Juxtacortical_lesion_count_T0, dl_Juxtacortical_lesion_volume_.absolute._cm3_T0,
dl_Juxtacortical_lesion_volume_.normalized._._T0, dl_Juxtacortical_lesion_burden_T0,
dl_Infratentorial_lesion_count_T0, dl_Infratentorial_lesion_volume_.absolute._cm3_T0,
dl_Infratentorial_lesion_volume_.normalized._._T0, dl_Infratentorial_lesion_burden_T0
)
#Table 5: Recon-all volumetrics (white, grey and subcortical matter) with z-scores for 2 different kinds of growth charts.
# Calculate the Total Cortical Volume and Total Cerebellar Volume
table5 <- ages %>%
mutate(
Total_Cortical_Volume = CortexVol + Left.Cerebellum.Cortex + Right.Cerebellum.Cortex,
Total_Cerebellar_Volume = Left.Cerebellum.Cortex + Right.Cerebellum.Cortex
) %>%

# Select relevant columns for the table and rename for clarity
select(
`Participant ID` = Participant_Id,
`Age (years)` = Age_at_scan_y_T0X,
`Gender` = gender_BRICK,

# Total Volumes
`Total Cortical Volume` = Total_Cortical_Volume,
`Total White Matter Volume` = CerebralWhiteMatterVol,
`Total Gray Matter Volume` = TotalGrayVol,
`Total Cerebellar Volume` = Total_Cerebellar_Volume,

# Subcortical Structures
`Thalamus Left` = Left.Thalamus.Proper,
`Thalamus Right` = Right.Thalamus.Proper,
`Caudate Left` = Left.Caudate,
`Caudate Right` = Right.Caudate,
`Putamen Left` = Left.Putamen,
`Putamen Right` = Right.Putamen,
`Pallidum Left` = Left.Pallidum,
`Pallidum Right` = Right.Pallidum,
`Hippocampus Left` = Left.Hippocampus,
`Hippocampus Right` = Right.Hippocampus
)
#Table 5_1: Neuropsychologic outcomes(WISCV&WAISIV) + Education level parents
#WiscV
table6_wisc <- raw %>%
select(
Participant_Id,
scorePrimIndex_TIQ_S_1, # Full Scale IQ
scorePrimIndex_VBI_IQ_1, # Verbal Comprehension Index
scorePrimIndex_VRI_IQ_1, # Visual Spatial Index
scorePrimIndex_FRI_IQ_1, # Fluid Reasoning Index
scorePrimIndex_WgI_IQ_1, # Working Memory Index
scorePrimIndex_VsI_IQ_1, # Processing Speed Index
scoreSecIndex_KRI_IQ_1, # Quantitative Reasoning Index
scoreSecIndex_AWI_IQ_1, # Auditory Working Memory Index
scoreSecIndex_NVI_IQ_1, # Nonverbal Index
scoreSecIndex_AVI_IQ_1, # General Ability Index
)

#waisIV

table6_wais <- raw %>%
select(
"Participant_Id",
"ScoreTIQ_1", # Full Scale IQ (FSIQ)
"ScoreSomVBI_1", # Verbal Comprehension Index (VCI)
"ScorePerVBI_1", # Perceptual Verbal Comprehension Index (related to FRI)
"ScoreBIVBI_1", # Blocked Verbal Comprehension Index (related to FRI)
"ScoreWgI_1", # Working Memory Index (WMI)
"ScoreVsI_1" # Processing Speed Index (PSI)
)

#create a real table1
library(tableone)

# Specify the categorical and continuous variables
categorical_vars <- c("Hydrea_at_scan_T0", "brick_genotype", "gender_BRICK")
continuous_vars <- c("Age_at_scan_years")

# Specify the categorical and continuous variables for CreateTableOne
vars <- c(categorical_vars, continuous_vars)

# Create the table using CreateTableOne
table1_summary <- CreateTableOne(vars = vars, data = combined_data, factorVars = categorical_vars)

# Print the table1 summary
print(table1_summary)

#I want the median and IQR for HbS
# Manually calculate median and IQR for 'HPLC_HbS_T0'
hplc_summary <- combined_data %>%
summarise(
Median_HPLC_HbS_T0 = median(HPLC_HbS_T0, na.rm = TRUE),
IQR_HPLC_HbS_T0 = IQR(HPLC_HbS_T0, na.rm = TRUE)
)

# Manually calculate mean and SD for 'HB0'
hb0_summary <- combined_data %>%
summarise(
Mean_HB0 = mean(HB0, na.rm = TRUE),
SD_HB0 = sd(HB0, na.rm = TRUE)
)

# Print the custom summaries for HPLC_HbS_T0 and HB0
print("HPLC_HbS_T0 - Median and IQR:")
print(hplc_summary)

print("HB0 - Mean and SD:")
print(hb0_summary)


78 changes: 78 additions & 0 deletions r_scripts/1172024_Table2_Growthcurves_script_attempt.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
library(dplyr)
library(tableone)
library(writexl)
library(tibble) # For rownames_to_column()

# Select relevant columns from table2
table2_filtered <- table2 %>%
select(Participant_Id, ERY0, HB0, MCV0, HT0, reticulocyte_count_percentage_1, LEU0,
TROMBO0, FE0, FERT0, TRAF0, TSAT0, ALAT0, LDH0, TBIL0, DBIL0, KREA0,
FOLZ0, UREU0, NA0, K00, VITD0, ASAT0, AFOS0, GGT0, CRP0, KREA_U0,
TE_U0, TE_U_KR0, ALB_U0, ALB_U_KR0, B120)


# Define continuous variables
continuous_vars <- c("ERY0", "HB0", "MCV0", "HT0", "reticulocyte_count_percentage_1",
"LEU0", "TROMBO0", "FE0", "FERT0", "TRAF0", "TSAT0", "ALAT0",
"LDH0", "TBIL0", "DBIL0", "KREA0", "FOLZ0", "UREU0", "NA0",
"K00", "VITD0", "ASAT0", "AFOS0", "GGT0", "CRP0", "KREA_U0",
"TE_U0", "TE_U_KR0", "ALB_U0", "ALB_U_KR0", "B120")

# Create a named vector of labels for the continuous variables, with units
labels <- c(
"ERY0" = "Red blood cell (RBC) count (x10^12/L)",
"HB0" = "Hemoglobin (g/dL)",
"MCV0" = "Mean corpuscular volume (MCV) (fL)",
"HT0" = "Hematocrit (HCT) (%)",
"reticulocyte_count_percentage_1" = "Reticulocyte count (%)",
"LEU0" = "White blood cell (WBC) count (x10^9/L)",
"TROMBO0" = "Platelet count (x10^9/L)",
"FE0" = "Serum iron (µg/dL)",
"FERT0" = "Ferritin serum (ng/mL)",
"TRAF0" = "Transferrin (mg/dL)",
"TSAT0" = "Transferrin saturation (%)",
"ALAT0" = "Alanine transaminase (ALT) (U/L)",
"LDH0" = "Lactate Dehydrogenase (LDH) (U/L)",
"TBIL0" = "Total bilirubin (mg/dL)",
"DBIL0" = "Conjugated/direct bilirubin (mg/dL)",
"KREA0" = "Creatinine (mg/dL)",
"FOLZ0" = "Folate (ng/mL)",
"UREU0" = "Urea (mg/dL)",
"NA0" = "Sodium (mmol/L)",
"K00" = "Potassium (mmol/L)",
"VITD0" = "Vitamin D (ng/mL)",
"ASAT0" = "Aspartate transaminase (AST) (U/L)",
"AFOS0" = "Alkaline phosphatase (ALP) (U/L)",
"GGT0" = "Gamma-glutamyl transferase (GGT) (U/L)",
"CRP0" = "C-reactive protein (CRP) (mg/L)",
"KREA_U0" = "Creatinine (Urine) (mg/dL)",
"TE_U0" = "Iron (Urine) (µg/dL)",
"TE_U_KR0" = "Iron (Urine, Kr) (µg/dL)",
"ALB_U0" = "Albumin (Urine) (g/dL)",
"ALB_U_KR0" = "Albumin (Urine, Kr) (g/dL)",
"B120" = "Bilirubin 120 (mg/dL)"
)

# Create a summary table with continuous variables
table2_summary <- CreateTableOne(
vars = continuous_vars,
data = table2_filtered,
factorVars = character(0) # No factor variables
)

# Convert the table summary to a data frame
table2_df <- as.data.frame(print(table2_summary, quote = FALSE, noSpaces = TRUE))

# Add the labels as a new column by matching with continuous_vars
table2_df <- table2_df %>%
rownames_to_column(var = "Variable") %>%
mutate(Variable = labels[continuous_vars])

# Rename the 'Overall' column to "Mean (SD)" to indicate the values
colnames(table2_df)[colnames(table2_df) == "Overall"] <- "Mean (SD)"


# View the final table
print(table2_df)
#question: weird results, how do we impute missing values?

117 changes: 117 additions & 0 deletions r_scripts/12112024_Identify_genotypes_bridge_sample.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#this script is to find the gentotypes that apply to the bridge subset of brick for the ash poster 2024
library(dplyr)
library(readr)
library(tableone)

#load dataset with genotypes
genotype <-read.csv("Z:/Aida_experiment/combined_BRICK_marjolein_with_genotype.csv")

# Create a new column with rounded "Age_at_scan_y_T0X" values in the genotype DataFrame. This is needed for an exact match
genotype <- genotype %>%
mutate(Age_at_scan_y_T0X = round(Age_at_scan_y_T0X, 2))

# Define ages for each gender
ages_male <- c(9.94, 11.74, 12.73, 12.81, 13.94, 15.09, 15.22, 16.02, 16.65, 17.4, 17.48)
ages_female <- c(8.05, 9.16, 10.32, 13.72, 13.77, 14.85, 16.25)

# Filter genotype DataFrame based on matching ages for males and females separately
matched_data_male <- genotype %>%
filter(Age_at_scan_y_T0X %in% ages_male & gender_BRICK == 1) %>%
select(Participant_Id, brick_genotype, Age_at_scan_y_T0X, gender_BRICK)

matched_data_female <- genotype %>%
filter(Age_at_scan_y_T0X %in% ages_female & gender_BRICK == 2) %>%
select(Participant_Id, brick_genotype, Age_at_scan_y_T0X, gender_BRICK)

# Assign to BRIDGE_male and BRIDGE_female DataFrames
BRIDGE_male <- matched_data_male
BRIDGE_female <- matched_data_female

# Display the resulting DataFrames
print("BRIDGE_male:")
print(BRIDGE_male)

print("BRIDGE_female:")
print(BRIDGE_female)

#not an exact match. let's just compare it manually

genotype_male <- genotype %>% filter(gender_BRICK == 1)

genotype_female <- genotype %>% filter(gender_BRICK == 2)

#now some relevant extra rows are the hydrea use row, Hb and HbF that are interesting for table 1
#load the bridge subset of participants, I manually added gender, brick number and genotype to the bridge subset. I've also ran th script of 1162024Tables_paper_1.R and 1172024_Table2_Growthcurves_script
#beforehand
male_brick <- read.csv("Z:/Aida_experiment/ASH_poster_2024/12112024_TCV_BRICK_BRIDGE_gen_boys.csv")
female_brick <- read.csv("Z:/Aida_experiment/ASH_poster_2024/12112024_TCV_BRICK_BRIDGE_gen_girls.csv")

#from df table1 I need "Hydrea_at_scan_T0" and from dataframe "table2_filtered" I need HB0. The NA values need to be kicked out. merge on Participant_Id into new df for males and females

# Extract the relevant columns from table1, table2_filtered, and raw for males
table1_filtered_male <- table1 %>%
select(Participant_Id, Hydrea_at_scan_T0) %>%
filter(!is.na(Hydrea_at_scan_T0))

table2_filtered_male <- table2_filtered %>%
select(Participant_Id, HB0) %>%
filter(!is.na(HB0))

raw_filtered_male <- raw %>%
select(Participant_Id, HPLC_HbS_T0) %>%
filter(!is.na(HPLC_HbS_T0))

# Merge male data with additional columns
male_data_merged <- male_brick %>%
inner_join(table1_filtered_male, by = "Participant_Id") %>%
inner_join(table2_filtered_male, by = "Participant_Id") %>%
inner_join(raw_filtered_male, by = "Participant_Id")

# Repeat the process for females
table1_filtered_female <- table1 %>%
select(Participant_Id, Hydrea_at_scan_T0) %>%
filter(!is.na(Hydrea_at_scan_T0))

table2_filtered_female <- table2_filtered %>%
select(Participant_Id, HB0) %>%
filter(!is.na(HB0))

raw_filtered_female <- raw %>%
select(Participant_Id, HPLC_HbS_T0) %>%
filter(!is.na(HPLC_HbS_T0))

# Merge female data with additional columns
female_data_merged <- female_brick %>%
inner_join(table1_filtered_female, by = "Participant_Id") %>%
inner_join(table2_filtered_female, by = "Participant_Id") %>%
inner_join(raw_filtered_female, by = "Participant_Id")

# Display the resulting data frames
print("Male Data Merged:")
print(male_data_merged)

print("Female Data Merged:")
print(female_data_merged)


#now make the dfs one and create a table 1
# Merge male and female brick datasets into one, keeping the 'gender' column intact
combined_data <- bind_rows(male_data_merged, female_data_merged)

# Define categorical and continuous variables
categorical_vars <- c("Hydrea_at_scan_T0", "genotype", "gender_brick")
continuous_vars <- c("AgeChild", "HB0", "HPLC_HbS_T0")

# Use the table1 package to create a summary table
table1_summary <- CreateTableOne(~ Hydrea_at_scan_T0 + genotype + gender_brick +
AgeChild + HB0 + HPLC_HbS_T0,
data = combined_data,
render.categorical = "Freq",
render.continuous = c("Median", "IQR"))

# Print the table1 summary
print(table1_summary)




Loading

0 comments on commit 89a951c

Please sign in to comment.