% GM_ThesisBiometris.tex

%%%%%%%%%-----------------------------------------------------------------
%%%%%%%%% Thesis template, v1
%%%%%%%%% Mathematical & Statistical Methods group - Biometris
%%%%%%%%% Wageningen University & Research
%%%%%%%%%-----------------------------------------------------------------


\documentclass{amsart}
%  Font & Formatting
\usepackage{lmodern}
\usepackage[x11names]{xcolor}

%  Text encoding
\usepackage[utf8]{inputenc}
\usepackage{booktabs}
% \usepackage{lipsum}
\setcounter{tocdepth}{2}
\usepackage{enumitem}

% Setting margins
\usepackage[a4paper, left=3cm,right=3cm]{geometry}

% Packages for the titlepage
\usepackage{tikz}
\usetikzlibrary{calc}
\usepackage{graphicx}
\usepackage{newtxtext}
\usepackage{float}
\usepackage[inkscapeformat=png]{svg}
% \parasep: ornamental separator between paragraphs.
% Typesets the image figures/eYzOK.png, centred, at 30% of its natural size.
% The trailing % signs keep the line ends inside the definition from
% turning into space tokens in the macro expansion.
\newcommand{\parasep}{%
  \begin{center}%
    \includegraphics[scale=.3]{figures/eYzOK.png}%
  \end{center}%
}

% Packages for tables
\usepackage{multicol}
\usepackage{multirow}
\usepackage{array}
\usepackage{ragged2e}
\usepackage{colortbl}
\usepackage{threeparttable}

% Packages for mathematical writing
\usepackage{amsmath}                           % Enables the align enviroment
\usepackage{amssymb}                           % Math symbols (e.g. \mathbb{})
\usepackage{dsfont} 	                         % For \mathds{1} blackboard bold 1
\usepackage{bm}                                % Bold math symbols via \bm{...}
\usepackage{mathtools}                         % Better math

% For algorithms
\usepackage[ruled,vlined,linesnumbered]{algorithm2e}
% \usepackage{algpseudocode}
% \renewcommand{\algorithmicrequire}{\textbf{Input:}}
% \renewcommand{\algorithmicensure}{\textbf{Output:}}

% For scientific notation, script fonts, and hyperlinks
\usepackage[scientific-notation=true, allow-number-unit-breaks=true]{siunitx} % For scientific notation
\usepackage{euscript}[mathcal]
\usepackage{txfonts}
\usepackage[
  hypertexnames=false,
  colorlinks= true,
  linkcolor=.,
  citecolor={Turquoise4},
  urlcolor={Blue4}]{hyperref} 

% For bibliography
\usepackage[backend=biber,
    style=nature, 
    maxcitenames=1,
    mincitenames=1,
    safeinputenc]
    {biblatex}
\addbibresource{literature/references.bib}
\def\bibfont{\footnotesize}

% Abbreviations
% The glossaries package is loaded with three options:
%   acronym      - provide a separate list of acronyms,
%   nomain       - do not create the main glossary,
%   nonumberlist - omit the page-number lists after each entry.
% \makeglossaries must be issued before the first entry is defined.
% Each entry is \newacronym{<label>}{<short form>}{<long form>}; the label is
% what \acrshort, \acrlong and \acrfull refer to in the text.
\usepackage[acronym, nomain, nonumberlist]{glossaries}
\makeglossaries
\newacronym{ad}{AD}{Alzheimer's disease}
\newacronym{sad}{SAD}{sporadic Alzheimer's disease}
\newacronym{fad}{FAD}{familial Alzheimer's disease}
\newacronym{apoe}{ApoE}{apolipoprotein E}
\newacronym{load}{LOAD}{late-onset Alzheimer's disease}
\newacronym{idl}{IDL}{intermediate density lipoprotein}
\newacronym{vldl}{VLDL}{very low density lipoprotein}
\newacronym{abca1}{ABCA1}{ATP-binding cassette transporter A1}
\newacronym{lrp}{LRP1}{low density lipoprotein receptor-related protein 1}
\newacronym{hdl}{HDL}{high density lipoprotein}
\newacronym{bbb}{BBB}{blood-brain barrier}
\newacronym{tlr4}{TLR4}{toll-like receptor 4}
\newacronym{nfkb}{NFkB}{nuclear factor kappa B}
\newacronym{atp}{ATP}{adenosine triphosphate}
\newacronym{vcs}{VCS}{version control system}
\newacronym{cv}{CV}{cross-validation}
\newacronym{gsk}{GSK-3$\beta$}{glycogen synthase kinase-3$\beta$}
\newacronym{csf}{CSF}{cerebrospinal fluid}
\newacronym{rss}{RSS}{residual sum of squares}
\newacronym{ancova}{ANCOVA}{analysis of covariance}
\newacronym{smote}{SMOTE}{synthetic minority over-sampling technique}
\newacronym{ml}{ML}{maximum likelihood}
\newacronym{lasso}{LASSO}{least absolute shrinkage and selection operator}
% NOTE(review): the label is `mnl' but the short form is MLR -- keep the label
% as is, since it may be referenced elsewhere in the document.
\newacronym{mnl}{MLR}{multinomial logistic regression}
\newacronym{roc}{ROC}{receiver-operating characteristics}
\newacronym{auc}{AUC}{area under the ROC curve}
\newacronym{ggm}{GGM}{Gaussian graphical model}
\newacronym{app}{APP}{amyloid precursor protein}
\newacronym{ldlr}{LDLR}{low-density lipoprotein receptor}
\newacronym{isf}{ISF}{interstitial fluid}
\newacronym{il}{IL}{interleukin}
\newacronym{ros}{ROS}{reactive oxygen species}
\newacronym{tnf}{TNF-$\alpha$}{tumour necrosis factor $\alpha$}
\newacronym{xgb}{XGBoost}{eXtreme gradient boosting}
\newacronym{dt}{DT}{decision tree}
\newacronym{mci}{MCI}{mild cognitive impairment}
\newacronym{scd}{SCD}{subjective cognitive decline}
\newacronym{nir}{NIR}{no-information rate}
\newacronym{ssr}{SSR}{sum of squares of the regression}
\newacronym{map}{MAP}{mean arterial pressure}
\newacronym{bmi}{BMI}{body mass index}
\newacronym{uplcms}{UPLC-MS/MS}{ultra high-performance liquid chromatography tandem mass spectrometry}
% NOTE(review): GC-MS is expanded here with "tandem"; confirm whether the
% instrument was GC-MS or GC-MS/MS.
\newacronym{gcms}{GC-MS}{gas chromatography tandem mass spectrometry}
\newacronym{uplcesitof}{UPLC-ESI-Q-TOF-MS}{ultra high-performance liquid chromatography electrospray tandem quadrupole time-of-flight mass spectrometry}
% Redefine \glossarymark to take its one argument and discard it, so that
% whatever it would otherwise do with the glossary title is suppressed.
\renewcommand{\glossarymark}[1]{}


%--------------------------------------------------------------------
%--------------- Front and Main matter style ------------------------
% \frontmatter: number the pages that follow with lower-case roman numerals.
% Line ends inside the definition are guarded with % so that the expansion
% carries no stray space tokens.
\newcommand{\frontmatter}{%
    \pagenumbering{roman}%
}
% \mainmatter: start a new page and number the pages that follow with arabic
% numerals. Line ends inside the definition are guarded with % so that the
% expansion carries no stray space tokens.
\newcommand{\mainmatter}{%
    \newpage
    \pagenumbering{arabic}%
}
%--------------- Front and Main matter style ------------------------
%--------------------------------------------------------------------

\begin{document}
% \input{Thesis_Template_Biometris-concordance}


% Add title page:
%------------------------------------------------------------------------------
% In this segment, enter the desired student data to be shown at the title page
% Author's full name
\newcommand*{\thesisAuthor}{George Miliarakis}
% Thesis title
\newcommand*{\thesisTitle}{Mechanistic links between ApoE4 dose and serum metabolome in Alzheimer's disease}
% Thesis subtitle
\newcommand*{\thesisSubTitle}{A data science approach}
% Type of thesis
\newcommand*{\thesisDegree}{MSc Thesis}
% Awarding university (normally left unchanged)
\newcommand*{\university}{Wageningen University \& Research}
% Place and date shown on the title page
\newcommand*{\thesisPlaceDate}{Wageningen, 18 March 2024}
%------------------------------------------------------------------------------


%------------------------------------------------------------------------------
% In this segment, enter the desired supervisor data to be shown at the title page
% Name of the supervisor
\newcommand*{\supervisor}{C.F.W. Peeters}
% Department of the supervisor (generally Biometris)
\newcommand*{\departmentSUP}{Mathematical \& Statistical Methods (Biometris)}
% University of the supervisor (generally WUR)
\newcommand*{\universitySUP}{Wageningen University \& Research}
%------------------------------------------------------------------------------


%------------------------------------------------------------------------------
% In this segment, enter the desired co-supervisor data to be shown at the title page
% Name of the co-supervisor
\newcommand*{\cosupervisor}{Yannick Vermeiren}
% Department or division of the co-supervisor
\newcommand*{\departmentCOSUP}{Nutritional Biology, Human Nutrition \& Health}
% University or company of the co-supervisor
\newcommand*{\universityCOSUP}{Wageningen University \& Research}
%------------------------------------------------------------------------------


%------------------------------------------------------------------------------
% Logos and visuals
% Title page. Laid out top to bottom: two logos side by side, a background
% picture pinned to the bottom of the page, the title block between two
% rules, the author block, and the two right-aligned supervision blocks.
% All displayed text comes from the \thesis..., \supervisor... and
% \cosupervisor... macros defined above.
\begin{titlepage}
\thispagestyle{empty}

% Adding logos
% [H] (float package) keeps the figure at this exact position.
\begin{figure} [H]
\vspace{-3cm}
 \centering
\begin{minipage}[t]{.45\linewidth}
  \raggedright
  % Left logo (WUR), shifted 2cm to the left:
  \hspace*{-2cm}\includegraphics[width=\linewidth]{figures/WURlogo.png}

\end{minipage}%
  \begin{minipage}[t]{.45\linewidth}
  \vspace{-2.6cm}
 \raggedleft
%% Right logo (Biometris), shifted 2cm to the right
 \hspace*{+2cm}\includegraphics[width =.95\textwidth]{figures/biometris_logo.png}

\end{minipage}
\end{figure}

% Bottom/background picture
% Placed relative to the page itself: the node's south-west corner is
% anchored at the page's south-west corner and then shifted.
\begin{tikzpicture}[overlay, remember picture]
\node[anchor=south west,
      xshift=+6.5cm,
      yshift=-0.2cm]
     at (current page.south west)
     {\includegraphics[width = 0.5\textwidth]{figures/torontodeclaration.png}};
\end{tikzpicture}
%------------------------------------------------------------------------------


%------------------------------------------------------------------------------
% Title information
% Title in \Huge bold and subtitle in \LARGE, framed by two horizontal rules.
\vspace{1cm}
\begin{center}
\par
\noindent
\rule[0.2cm]{\linewidth}{1.5pt}
\Huge
\textbf{\thesisTitle}
\vspace{0.2cm}
\LARGE
\par
\noindent
\thesisSubTitle\\
\rule[0.2cm]{\linewidth}{1.5pt}
\Large
\end{center}
%------------------------------------------------------------------------------


%------------------------------------------------------------------------------
% Author information
% Author name in \LARGE; degree, university and place/date in \small below it.
\vspace{2cm}
\noindent
\LARGE
\thesisAuthor\\
\vspace{.2 cm}
\small
\par \noindent
\thesisDegree
\par \noindent
\university
\par \noindent
\thesisPlaceDate
%------------------------------------------------------------------------------


%------------------------------------------------------------------------------
% Supervision information
\vspace{4cm}
\begin{flushright}
\emph{Supervisor} \\
\textbf{\supervisor} \\
\departmentSUP\\
\universitySUP
\end{flushright}

% NOTE(review): this block prints the \cosupervisor data but is also headed
% "Supervisor" -- confirm whether "Co-supervisor" was intended.
\vspace{.5cm}
\begin{flushright}
\emph{Supervisor} \\
\textbf{\cosupervisor} \\
\departmentCOSUP \\
\universityCOSUP
\end{flushright}
\end{titlepage}
%------------------------------------------------------------------------------
%--------------- Front matter ---------------------------------------
%-------------------------------------------------------------------
\frontmatter
% Foreword
\section*{Foreword}
This report represents the culmination of my thesis journey, as part of the M.Sc. programme Data Science for Food and Health at Wageningen University \& Research. Throughout this journey I acquired a deeper understanding of Alzheimer's disease pathology, as well as several new data science skills (network analysis in R, scientific writing in \LaTeX{}, among others). Given the several scientific disciplines brought together in this project, I regret that this report requires a strong background in human pathophysiology, biochemistry, statistics and machine learning to be comprehended.

I am incredibly grateful to my supervisors, Dr. Carel F.W. Peeters and Dr. Yannick Vermeiren. Their guidance proved instrumental throughout this project. The depth and breadth of their combined academic and scientific expertise fostered an unparalleled learning environment. Dr. Peeters' brilliance was evident in the insightful feedback he always provided and the packages \textsf{rags2ridges} and \textsf{FMradio} he developed, that streamlined my analyses. My appreciation for Dr. Vermeiren's indispensable contributions is equally profound. His expertise in AD, coupled with his consistently constructive feedback, played a pivotal role in shaping this project. The latter would not have been possible without the invaluable data provided by the Amsterdam Dementia Cohort (confidentiality maintained).\\
\parasep
In loving memory of a cherished one who struggled with Alzheimer's disease (AD). Witnessing their struggle (and ultimate passing a month before I embarked on this project) ignited a profound desire in me to contribute to AD research and advocate for improved dementia care.
\begin{figure}[!b]
\includesvg[width=0.7\textwidth]{figures/commpos.svg}
\end{figure}
% Abstract
\newpage
\newgeometry{bottom=0.5cm}
\section*{Abstract}
\textbf{Background}\hspace{.1cm}Alzheimer's disease (AD), the most common form of dementia, represents a significant global healthcare challenge. Apolipoprotein-$\varepsilon$4 (ApoE4) is the strongest genetic risk factor for late-onset AD, with nuanced impacts based on sex and ancestry. It is reported that ApoE4 is not metabolized and recycled adequately in the brain, and over time induces the deposition of amyloid-$\beta$ and hyperphosphorylated tau proteins, impairs mitochondrial glucose metabolism and promotes neuroinflammation. There is limited evidence, however, on the effects of ApoE4 on peripheral metabolism. This study explored mechanistic links between the ApoE4 haplotype and serum metabolome in AD and subjective cognitive decline (SCD) patients via three approaches: ApoE4 dose-effects (0, 1, 2 alleles) on individual metabolite levels, multi-class classification of ApoE4 status and AD (4 possible phenotypes: AD without ApoE4, AD with at least 1 ApoE4, SCD without ApoE4, SCD with at least 1 ApoE4), as well as metabolite precision network analysis among ApoE4 carriers and non-carriers in AD.

\textbf{Methods}\hspace{.1cm} To screen for ApoE4 dose-effects on serum metabolites, two methods were applied: a global test (correcting only for sex) and nested linear model comparison using ANCOVA F-tests (correcting for several confounding clinical factors), both adjusting for multiple testing via FDR. To test the (added) classification potential of serum metabolites against ApoE4 and AD status, several multi-class classification models (multinomial logistic regression, decision tree and \acrlong{xgb}) were fitted considering the bias-variance trade-off and model interpretability. Model performance was evaluated via confusion matrices, ROC curves and AUC, as well as accuracy and no-information rate obtained from repeated (100 times) 10-fold CV. To visualize and compare the metabolite (precision) network topologies among ApoE4 carriers and non-carriers, the high-dimensional matrices were first regularized with a Ridge penalty, sparsified and then pruned at a local FDR threshold of 0.001. Network topologies were plotted in Gaussian graphical models (GGMs) and communities of conditionally dependent metabolites were identified using the Girvan-Newman algorithm. The centrality measures were calculated and compared between the ApoE4 carriers and non-carriers using the Wilcoxon signed-rank test.

\textbf{Results}\hspace{.1cm} 
In total, 120 AD patients and 127 SCD individuals ($n = 247$) from the Amsterdam Dementia Cohort were included in the analysis. The global test revealed significantly higher tri- and diglyceride levels in AD, while no significant differences were observed in the SCD group. After adjusting for FDR, no significant ApoE4 dose effects were observed via nested linear model comparison in AD or SCD. However, consistent with the global test, the AD group presented a more pronounced deregulated lipid signature, while the SCD group presented a mainly aminic one. Serum metabolites slightly ($\sim0.02\%$ AUC) improved model performance and all models had higher accuracy compared to the no-information rate. Putrescine and sphingosine-1-phosphate (oxidative stress compound) emerged as top metabolites in predicting ApoE4 and AD status. The metabolite network of ApoE4 carriers was less random and had higher centrality measures, compared to that of the non-carriers. Lastly, the former network could be separated into communities of conditionally dependent metabolites, while the latter was considerably sparser.

\textbf{Discussion}\hspace{.1cm} These findings suggest, beyond perturbed triglyceride metabolism, pronounced aminic and oxidative stress signatures at the systemic level associated with ApoE4 presence and (increasing) dose in AD. Further, impaired glucose metabolism (ketoglutarate) and increased kynurenine pathway catabolism of tryptophan were observed in AD and SCD, respectively. By incorporating additional ``omics'' data over time, researchers can potentially develop more sensitive disease-predicting models and stage-dependent biomarkers through large-scale clinical studies. Future investigations could also explore the ApoE4 lipoprotein levels in cerebrospinal fluid (CSF) and serum across the AD continuum, considering as many confounding factors as possible.

\textbf{Conclusion}\hspace{.1cm} Peripheral metabolism appears to be shifted by (the presence or dose of) ApoE4 in AD, considering individual metabolite levels, an ApoE4 and AD status classification signature and network analysis. The proposed methodology paves the way for more elaborate and efficient ``omics'' data analysis in AD research, potentially leading to the development of improved diagnosis support tools and therapeutic strategies.\\

\textbf{Keywords:}\hspace{.1cm} Alzheimer's disease, ApoE4, metabolomics, machine learning, network analysis 

\restoregeometry
% Glossary
\clearpage
\printacronyms[title = Abbreviations, toctitle = ABBREVIATIONS]

\newpage
% Inserting table of contents
\tableofcontents

%--------------------------------------------------------------------
%--------------- Main matter ----------------------------------------
\mainmatter

\newpage
\section{Introduction}\label{Intro}
\subsection{Alzheimer’s disease}
\acrfull{ad} is a complex, progressive neurodegenerative disorder and the most common form of dementia \cite{Penke2023NewDisease}. It was considered the 6th leading cause of death in the US in 2019, with an overall increase of 145\% in mortality from 2000 to 2019 \cite{20232023Figures}. The impact AD has on patients, their caregivers, and healthcare systems is detrimental. Hence, a considerable amount of research has been performed in an effort to understand, prevent, impede or cure it. Nonetheless, important aspects of its systematic manifestations remain unknown.

Age is the main risk factor for AD, while several genetic and lifestyle risk factors, as well as biochemical pathways, contribute to its development \cite{Penke2023NewDisease}. AD manifests in various histopathological phenotypes and presents a broad spectrum of clinical signs and symptoms \cite{Heneka2015NeuroinflammationDisease, Edwards2019ANeurodegeneration}. The AD continuum starts with \acrfull{scd}, followed by \acrfull{mci} \cite*{AALDIJK2022101556}, and continues with progressive loss of global cognition, particularly of memory, processing speed and executive functioning, spanning a total period of 10--15 years \cite{Scheltens2016AlzheimersDisease}. 

Sporadic or late-onset AD (\acrshort{load}; $>$95\% of cases) is the most frequent phenotype, typically appearing after 65 years of age \cite{Beydoun2014EpidemiologicMeta-analysis}. A rarer phenotype ($\sim1\%$) is early-onset familial AD (\acrshort{fad}), usually starting at ages 30–65 and passed on in an autosomal dominant fashion \cite{VanCauwenberghe2015ThePerspectives}.

\subsubsection{Amyloid cascade hypothesis}
The amyloid cascade hypothesis has dominated the scientific discussion on the pathogenesis of AD. In historical terms, its impact was profound, as it helped distinguish and identify AD as a singular disease that may be studied for treatment \cite{Hardy2006AlzheimersReappraisal}. It suggests that chronic neuroinflammation promotes protein misfolding and accumulation in the brain, forming plaques (consisting of oligomerized amyloid A$\beta_{42}$) and tangles (consisting of hyperphosphorylated tau protein) \cite{Edwards2019ANeurodegeneration}. Nevertheless, it does not necessarily cover all AD cases. Clinical trials with A$\beta$ antibodies as a potential disease-modifying treatment have so far failed to demonstrate any clinical improvement in patients, despite successful removal of plaques from AD brains \cite{Kepp2023TheReview,Kurkinen2023TheThinking}. Evidence suggests that oxidative stress, metabolic abnormalities, atherosclerosis, cardiovascular effects, imbalances of intra-neuronal calcium and metal ion depositioning might also contribute to the development of AD \cite{Kepp2023TheReview}. \citeauthor{Kepp2023TheReview} propose a more complex and holistic view of AD pathology, by integrating (epi)genetic, environmental, vascular, neuro-inflammatory and metabolic factors in disease-predicting models \cite{Kepp2023TheReview}.

\subsection{Human apolipoprotein-E gene}
LOAD has been associated with various genetic risk factors, but primarily with the gene that encodes for \acrfull{apoe} \cite{Corder1993GeneFamilies}. The structure and function of ApoE, as well as how the former defines the latter, are described in Section \ref{ApoEprot}.

\subsubsection{ApoE4 and Alzheimer's disease}
Due to two point mutations in the ApoE gene (rs7412 C/T, rs429358 C/T), humans present three common variants: $\varepsilon_2$, $\varepsilon_3$ and $\varepsilon_4$ \cite{Husain2021APOETherapeutics, Yang2023ApolipoproteinDisease}. The three ApoE haplotypes are phased in six genotypes (Fig.~\ref{fig1}). In Caucasian populations, the most abundant allele is ApoE3 (rs7412 C, rs429358 T), with a frequency of 78\%, and is considered neutral regarding the risk of \acrshort{ad} \cite{Liu2013ApolipoproteinTherapy}. ApoE4 (rs7412 C, rs429358 C) has a frequency of 14\% and represents the strongest genetic risk factor for LOAD, with gene dose effects \cite{Strittmatter1993ApolipoproteinDisease}. Conversely, ApoE2 (rs7412 T, rs429358 T) is found in 8\% of Caucasian populations, and is associated with a reduced risk of LOAD \cite{Liu2013ApolipoproteinTherapy}. The ApoE haplotypes are strongly linked to the primary pathological features of LOAD, namely A$\beta$ and phosphorylated tau \cite{Deming2017Genome-wideModifiers}. While the association between ApoE alleles and LOAD risk or protection is observed across diverse ancestral backgrounds, the strength of this association varies \cite{Belloy2019AForward, Farrer1997EffectsMeta-analysis}, see Section \ref{ancestry}.

Carrying a single $\varepsilon_4$ allele implies a risk of developing AD of about 20\% \cite{Bookheimer2009APOE4GA}. A heterozygous ApoE4 allelic composition (e.g., $\varepsilon_4$/$\varepsilon_x$) is associated with approximately 3-4 times increased life-time risk of developing AD, while the homozygous composition ($\varepsilon_4$/$\varepsilon_4$) is associated with 12 times increased risk \cite{Kim2009TheRO}. Even though ApoE4 is the strongest known genetic risk factor for LOAD, it is neither necessary nor sufficient to cause AD, and it is certainly not the only genetic risk factor \cite{SerranoPozo2019IsAD}.

\begin{figure}[H]
  \includegraphics[width=0.7\textwidth]{figures/ApoE@2x.png}
    \caption{Sankey chart representing the contribution of the three common ApoE alleles in forming the $\binom{3}{2} = 6$ ApoE genotypes. Heterozygotes ($\varepsilon_2$/$\varepsilon_3$,  $\varepsilon_2$/$\varepsilon_4$ and $\varepsilon_3$/$\varepsilon_4$) carry two distinct alleles, while homozygotes ($\varepsilon_2$/$\varepsilon_2$, $\varepsilon_3$/$\varepsilon_3$ and $\varepsilon_4$/$\varepsilon_4$) two copies of the same allele.}
  \label{fig1}
\end{figure}

\subsubsection{ApoE and sex synergy in Alzheimer's disease}
Notably, sex (60\% females) and ApoE4 allelic composition (50\% has at least one $\varepsilon_4$ allele) are the strongest genetic risk factors for \acrshort{load} \cite{Arnold2020SexMetabolome}. In this regard, it is shown that the ApoE4 genotype has a larger impact on females, as they present greater impairment of mitochondrial energy production, compared to males \cite{Arnold2020SexMetabolome, Yassine2020APOEDisease}.

\subsubsection{ApoE and ancestry in Alzheimer's disease} \label{ancestry}
The majority of studies exploring the relationship between ApoE polymorphism and the genetic aspect of LOAD have primarily focused on Northern European populations~\cite{Yang2023ApolipoproteinDisease}. However, smaller studies involving diverse ancestral backgrounds show variations in ApoE4 allele frequencies \cite{Yang2023ApolipoproteinDisease}. While ApoE4 is present in 14\% of Caucasian Americans, its prevalence increases to 40\% among African Americans, 37\% in Oceania, and 26\% in Australia. Southern Asia and Europe exhibit ApoE4 allelic frequencies of less than 10\%, compared to Northern Europe where it rises to 25\% \cite{Belloy2019AForward, Egert2012ApoEFactors, Eisenberg2010WorldwideHistory, Logue2011AAmericans}.

ApoE alleles have different impact on distinct populations. ApoE4 implies a higher risk of LOAD for Korean, Japanese, and Japanese-American, compared to Caucasian populations \cite{Farrer1997EffectsMeta-analysis}. Conversely, ApoE4 is associated with a lower risk of LOAD in Native Americans, Hispanic Americans, African Americans, and those of African descent, compared to Caucasian-American populations \cite{Farrer1997EffectsMeta-analysis, Blue2019LocalHispanics, Suchy-Dicey2022APOEStudy, Rajabli2018AncestralPopulations, Naslavsky2022GlobalSample}. Notably, ApoE3 is more protective than ApoE2 against AD, as reported by a study in a Chinese population \cite{Chen2011ApolipoproteinDisease}. Some of these population-specific effects are attributed to the ApoE haplotype and ancestral variations in the ApoE gene beyond the $\varepsilon_2/\varepsilon_3/\varepsilon_4$ haplotypes \cite{Blue2019LocalHispanics, Rajabli2018AncestralPopulations}.

\subsubsection{Evolution of ApoE over time}
Interestingly, humans are the only species exhibiting polymorphism in the ApoE gene \cite{Yassine2020APOEDisease}. All other animal species have one ApoE variant, which resembles the human ApoE3 allele \cite{Hunsberger2019TheInterventions}. Nonetheless, ApoE4 is the oldest human allele, followed by ApoE3 and ApoE2 in age \cite{Yassine2020APOEDisease}. ApoE4 may be adaptive by reducing mortality in highly infectious environments, with food scarcity and shorter lifespans \cite{Trumble2017ApolipoproteinBurden}. However, as human environments became less septic and provided abundance of food, thus extending life expectancies,  ApoE4 started to burden the arteries and brain, increasing the risk of diseases related to ageing \cite{Yassine2020APOEDisease}. The emergence of ApoE3 from ApoE4 putatively reflects the shift in human diet from a plant-based one to a meat-rich one, where genes adaptive to high meat consumption were, and still are vital to regulate increased cholesterol levels \cite{Finch1999TheIsoforms}. 

\subsubsection{Tissue expression}
The principal producers of ApoE are hepatocytes in the liver \cite{Mahley2016CentralMetabolism}. In the CNS, ApoE is primarily expressed in glia (non-neuronal cells of the brain that support the neurons) and particularly in astrocytes (metabolic homeostasis and neuronal communication modulators), followed by microglia (the brain immune cells) \cite{Lanfranco2021ExpressionInflammation}. Each genotype is linked to different expression levels \cite{Husain2021APOETherapeutics}. ApoE2 carriers seem to have higher \acrfull{csf} levels of ApoE, compared to ApoE4 carriers \cite{Castellano2011HumanClearance, Cruchaga2012CerebrospinalDisease}. 

\subsection{Human Apolipoprotein-E}\label{ApoEprot}
\subsubsection{Structure and function}
\acrshort{apoe} is a brain-specific lipid-binding glycoprotein of 299 amino acids (34 kDa; 1 dalton is $1/12$ of the mass of a $^{12}$C atom) that is a constituent of several types of lipoproteins, i.e., chylomicrons, \acrlong{idl} and \acrlong{vldl} \cite{Husain2021APOETherapeutics}. Its main function in the brain is the transport of lipids (mainly cholesterol) through membrane receptors \cite{Yang2023ApolipoproteinDisease}. ApoE isoforms have an effect on diverse cellular functions, e.g.\ synaptic integrity, glucose metabolism, A$\beta$ clearance, \acrlong{bbb} integrity and mitochondrial regulation \cite{Husain2021APOETherapeutics}, see Figs.~\ref{ApoeEffectsA} and~\ref{ApoeEffectsB}. How these relate to \acrshort{ad} pathology will be elaborated in Section \ref{ApoEAD}.

\begin{figure}[htb]
  \includegraphics[width=0.8\textwidth]{figures/ApoEprot.png}
    \caption{Linear representation of the ApoE protein. Three structural domains are highlighted: N-terminal, hinge and C-terminal domains. The different amino acids at positions 112 and 158 are shown for the common alleles and amino acids at positions 136, 236 and 251 encoded by rarer alleles. Source: \citetitle{Bu2022APOEVariants}, \Citeauthor{Bu2022APOEVariants} (\citeyear{Bu2022APOEVariants}) \cite{Bu2022APOEVariants}.}
  \label{fig2}
\end{figure}

\subsubsection{ApoE isoforms}
The nuances in the amino acid composition of ApoE, specifically the presence of cysteine or arginine at positions 112 and 158, significantly change its structure and, thus its binding with lipids and receptors \cite{Yassine2020APOEDisease}. The most prevalent isoform, ApoE3, features cysteine at position 112 and arginine at position 158  \cite{Yassine2020APOEDisease}, as shown in Fig. \ref{fig2}. ApoE2 has two cysteines, while ApoE4 has two arginines at these positions \cite{Yassine2020APOEDisease}. The C-terminal domain of ApoE (positions 273–299) is crucial for its lipidation specificity and efficiency \cite{Hu2015OpposingMice}.

\subsubsection{Lipidation nuances of ApoE isoforms}
For ApoE to exert its effects, it needs to be lipidated \cite{Husain2021APOETherapeutics}. ApoE undergoes lipidation via \acrfull{abca1}, a lipid efflux protein \cite{Flowers2020APOEBrain, Courtney2016LXRDisease}. The lipidation degree varies among ApoE isoforms, with ApoE4 exhibiting the least efficient lipidation \cite{Hu2015OpposingMice, Heinsinger2016ApolipoproteinFluid}. This discrepancy in lipidation has been linked to alterations in lipoprotein size and type, in that ApoE4 "prefers" large triglyceride-rich VLDL, while ApoE2 and ApoE3 have a higher affinity for phospholipid-rich \acrshort{hdl} particles \cite{Nguyen2010MolecularE4}. The lower affinity of ApoE4 for HDL particles leads to increased levels of unlipidated ApoE, resulting in its aggregation \cite{Hatters2006ApolipoproteinFunction}. Moreover, ApoE4 fibrils are more neurotoxic than those of ApoE2 and ApoE3 \cite{Hatters2006Amino-terminalFibrils}.

Poor lipidation leads to poor ApoE recycling \cite{Yassine2020APOEDisease} (Fig. \ref{ApoeEffectsA}). The latter favours the entrapment of ABCA1 in endosomes, away from the cell membrane, thereby pooling cholesterol in the cell membrane rather than attaching it to HDL particles \cite{Rawat2019ApoE4Astrocytes}. This increased cholesterol content in the cell membrane amplifies \acrfull{tlr4} signalling in macrophages, activating \acrshort{nfkb} and inducing an inflammatory gene response \cite{Yassine2020APOEDisease}.  ApoE4 accumulation also sequestrates the insulin receptor (IR) in endosomes, impacting cellular energy preferences \cite{Zhao2017ApolipoproteinEndosomes}. This leads to a decrease in glucose utilization for \acrshort{atp} production (Fig. \ref{ApoeEffectsB}) and an increase in fatty acid oxidation \cite{Svennerholm1997ChangesSwedes}. 

\begin{figure}[t]
  \includegraphics[width=0.8\textwidth]{figures/ApoEeffectsB.jpg}
    \caption{Effects of ApoE isoforms on the metabolism and removal of A$\beta$. In the brain, ApoE is mainly expressed in astrocytes and microglia, and undergoes lipidation by \acrfull{abca1} creating lipoprotein particles. ApoE increases the accumulation and build up of A$\beta$, or promotes cellular uptake of A$\beta$ by astrocytes or microglia via endocytosis of the lipidated ApoE-A$\beta$ in an isoform-specific manner. This process involves several receptors, such as the low-density lipoprotein receptor and \acrfull{lrp}. ApoE also facilitates the breakdown of A$\beta$ outside the cells in an isoform-specific manner. At the \acrlong{bbb}, soluble A$\beta$ is predominantly transported from the \acrfull{isf} into the bloodstream via LRP1 and P-glycoproteins. Additionally, ApoE plays a role in the perivascular drainage of A$\beta$. Insufficient clearance of A$\beta$ can lead to its accumulation in the brain tissue, contributing to the formation of amyloid plaques. Source: \citetitle{Husain2021APOETherapeutics}, \Citeauthor{Husain2021APOETherapeutics} (\citeyear{Husain2021APOETherapeutics}) \cite{Husain2021APOETherapeutics}}
  \label{ApoeEffectsA}
\end{figure}

\begin{figure}[b]
  \includegraphics[width=0.8\textwidth]{figures/ApoEeffectsA.jpg}
    \caption{Schematic overview of A$\beta$-independent effects of ApoE in AD pathology. ApoE4 increases the phosphorylation of tau proteins, leading to the creation of tangles, inducing neurodegeneration, synaptic deficits and neurotoxicity. Moreover, ApoE4 is associated with decreased cholesterol efflux and neuroinflammation mediated by interleukin, tumour necrosis factor $\alpha$ and \acrfull{ros}. In the mitochondria, ApoE4 impairs glucose metabolism and ATP production.  Source: \citetitle{Husain2021APOETherapeutics}, \Citeauthor{Husain2021APOETherapeutics} (\citeyear{Husain2021APOETherapeutics}) \cite{Husain2021APOETherapeutics}.}
  \label{ApoeEffectsB}
\end{figure}

\subsubsection{Interplay between ApoE lipidation and Alzheimer's disease pathophysiology}\label{ApoEAD}
As mentioned earlier, ApoE isoforms have differential pleiotropic effects on various cellular functions. ApoE4 induces a pro-inflammatory response, leading to the dysfunction of the blood-brain barrier, which in turn impairs cognitive functions \cite{Marottoli2017PeripheralDysfunction, Teng2017ApoEInjury, Kloske2020TheDisease}. Moreover, ApoE modulates the primary neuropathological hallmarks of \acrshort{ad}: neuroinflammation, A$\beta$ plaques and tau tangles \cite{Husain2021APOETherapeutics}. Evidence from human and transgenic mice studies reveals increased brain A$\beta$ and amyloid plaque loads in ApoE4 carriers, compared to ApoE3; with the lowest levels in ApoE2 carriers \cite{Huang2017ApoE2Secretion, Tachibana2016RescuingLRP1, Safieh2019ApoE4:Disease}. ApoE4 has a higher binding affinity for A$\beta$ which leads to impaired clearance, its intracellular aggregation and higher plaque loads \cite{Kloske2020TheDisease} (Fig. \ref{ApoeEffectsA}). Additionally, high levels of ApoE4 in neurons remarkably increase tau protein phosphorylation, while high concentrations of ApoE3 do not seem to have an effect \cite{Cao2017ApoE4-associatedInjury, Shi2017ApoE4Tauopathy, Vasilevskaya2020InteractionAthletes, Wang2018GainCorrector}. Notably,  ApoE directly inhibits phosphorylation of tau by glycogen synthase kinase-3$\beta$ \cite{Hoe2006ApolipoproteinNeurons}. An overview of the A$\beta$-(in)dependent effects of ApoE is shown in Fig. \ref{ApoeEffectsB}.

\subsection{ApoE4-mediated metabolic changes in Alzheimer's disease}
Metabolism entails the repertoire of chemical reactions that keep living organisms alive. Metabolites---the molecules involved---and especially lipids \cite{Barupal2019SetsPathophysiology,Fernandez-Calle2022APOEDiseases, Proitsi2017AssociationAnalysis}, perceived as functional intermediates, are rigorously studied as biomarkers or therapeutic targets in AD \cite{Oeckl2019GlialImpairment}.
 
\subsubsection{Measured in \textit{post-mortem} brain tissue} A metabolomic profiling of brain tissue, obtained \textit{post-mortem} from \acrshort{ad} patients and healthy controls revealed  pronounced impairments in sterol and sphingolipid levels in ApoE4 carriers with \acrshort{ad}  \cite{Bandaru2009ApoE4Brain}. However, another \textit{post-mortem} metabolomic analysis did not reveal nuances significantly correlated with ApoE4 \cite{Novotny2023MetabolomicBrains}, although they showed trends of increased cholesterol esters, unsaturated lipids, and sphingomyelin species.

\subsubsection{Measured in blood}
Transcriptomics and lipidomics analyses in humanized ApoE mice associated ApoE4 with decreased free fatty acid levels, many increased  tricarboxylic acid (TCA) cycle metabolites, as well as changes in plasma levels of phosphatidylcholines and unsaturated fatty acids \cite{Area-Gomez2020APOE4Mice, Zhao2020AlzheimersPathways}. A recent study on 58 individuals found six downregulated plasma metabolites (including lysophospholipids and cardiolipin) in ApoE4 carriers \cite{pena-bautista2020MetabolomicsEffect}. Further, the plasma metabolome of the latter reveals a preference for aerobic glycolysis \cite{Farmer2021APO4Glycolysis}. Significant correlations of ApoE genotype and sex with metabolites were observed, i.e. several phosphatidylcholines were found in a large study of more than 1500 individuals \cite{Arnold2020SexMetabolome}.

Perturbed serum metabolites associated with \acrshort{ad} are amino acids, amines \cite{deLeeuw2017Blood-basedDisease, Green2023InvestigatingDisease}, cholesteryl esters \cite{Proitsi2017AssociationAnalysis}, sphingolipids \cite{Varma2018BrainStudy,Sun2022AssociationDisease,Green2023InvestigatingDisease,Oeckl2019GlialImpairment,Barupal2019SetsPathophysiology}, fatty acids \cite{Fernandez-Calle2022APOEDiseases,deLeeuw2017Blood-basedDisease}, glycerophospholipids \cite{Varma2018BrainStudy, Jia2022ATypes,Huo2020BrainAnalysis, Weng2019TheImpairment}, phosphatidylcholines \cite{Simpson2016BloodAging} and lipid peroxidation compounds \cite{Fernandez-Calle2022APOEDiseases}. These molecules are usually identified via high-throughput metabolomic pipelines (coupled with Mass Spectrometry (MS) detectors) that trace all compounds in a sample and result in high-dimensional data \cite{Oka2023MultiomicsCohort}. The latter often require advanced statistical methods, e.g.\ projection to latent structures \cite{Weng2019TheImpairment, Peeters2019StableData} or graphical models \cite{Peeters2022Rags2ridges:Matrices}, in order to extract putatively meaningful information.
With such techniques, \citeauthor{deLeeuw2017Blood-basedDisease} discovered distinct serum metabolic signatures among \acrshort{ad} patients-controls and those carrying at least one ApoE4 allele \cite{deLeeuw2017Blood-basedDisease}, as they are shown in Fig. \ref{netan17}. The different intra-group metabolic profiles, however, among ApoE4 carriers and non-carriers remain obscure.

\begin{figure}[htb]
\vspace*{-0.2cm}
  \includegraphics[width=0.9\textwidth]{figures/network.jpeg}
    \caption{Mutual (left-hand panel) and distinct (right-hand panel) metabolic network topologies between ApoE4 carriers with AD and non-carriers with SCD, as published by \citeauthor{deLeeuw2017Blood-basedDisease}. Red edges represent links that are present exclusively in ApoE4 carriers with AD. Green edges represent connections found in the SCD group without ApoE4. Solid edges represent positive partial correlations, while dashed edges represent negative partial correlations. Abbreviations: SCD, subjective cognitive decline. Source: \citetitle{deLeeuw2017Blood-basedDisease}, \citeauthor{deLeeuw2017Blood-basedDisease} (\citeyear{deLeeuw2017Blood-basedDisease}) \cite{deLeeuw2017Blood-basedDisease}.}
  \label{netan17}
\end{figure}

\newpage
\subsection{Research Questions}
ApoE4 carriers---particularly females---experience metabolic disturbances and are at increased risk of \acrshort{load}. The mechanistic links, however, between ApoE4 dose, metabolism and \acrshort{ad} development are not entirely known \cite{Fernandez-Calle2022APOEDiseases}. Tracking ApoE4 dose effects on serum metabolites might reveal metabolic perturbations at the systemic level, preceding or concurring with AD. To this end, one may focus on shifted individual metabolite levels, a classification signature, a network signature or all of them. Considering all three approaches, one could state the following overarching research question and subquestions:

Are there mechanistic links between ApoE4 dose and serum metabolome in AD?
\begin{enumerate}[label=\roman*]
    \item Are there ApoE4 dose effects on serum metabolite levels in AD?
    \item What is the (added) classification potential of serum metabolites in predicting ApoE4 and AD status?   
    \item How do the metabolite network topologies differ between ApoE4 carriers and non-carriers?
\end{enumerate}


\subsection{Approach and Overview}
To facilitate statistical analysis, two new features were created as outcome measures for the first two research questions: ``ApoE4dose'' (0, 1, 2) and ``target'' (4 possible phenotypes: AD without ApoE4, AD with at least 1 ApoE4, SCD without ApoE4, SCD with at least 1 ApoE4), respectively, as described in Section \ref{featureeng}. Next, an overview of the approaches taken to address the research subquestions is provided.

\textit{Are there ApoE4 dose effects on serum metabolite levels in AD?} 
To investigate ApoE4 dose effects on individual metabolites two methods were applied: a global test (correcting only for sex) and nested linear model comparison using ANCOVA F-tests (correcting for several factors: Table \ref{tab:clin}, except CSF markers). 

\textit{What is the (added) classification potential of serum metabolites in predicting ApoE4 and AD status?}
To test the (added) classification potential of serum metabolites against the target, several multi-class classification models were fitted considering the bias-variance trade-off. First, a benchmark multinomial logistic regression model was fitted using only clinical background data as predictors. Second, the full metabolomic panel was added on top of the clinical data in the same model and feature importance was calculated. Third, the metabolites were projected to a latent orthogonal space, where 6 meta-metabolites (accounting for around 30\% of the variance) replaced the original metabolites. Finally, the meta-metabolites were fitted on top of the clinical data in a multinomial logistic regression model, a decision tree and a \acrfull{xgb} model. The discriminatory performance of the aforementioned models was comprehensively evaluated and compared.

\textit{How do the metabolite network topologies differ between ApoE4 carriers and non-carriers?}
To visualize and compare the (precision) metabolite network topologies among ApoE4 carriers and non-carriers, the high-dimensional matrices were first regularized with a Ridge penalty, sparsified and pruned controlling the False Discovery Rate. The metabolites were plotted in GGMs, their centrality measures were calculated and compared between the two groups using Wilcoxon Signed Rank test.

An introduction to the data used in this study is found in Section \ref{subjects}. A general overview of the software is found in Section \ref{datamanagement}, while the R session information is found in Appendix \ref{appendixB}. The results of the analysis are presented in Section \ref{results} and discussed in Section \ref{discuss}. The study is concluded in Section \ref{concl}.


\newpage
\section{Methods}\label{methods}

\subsection{Subjects}\label{subjects}
The data were collected from 120 AD patients and 127 SCD ($n$ = 247 in total) individuals recruited within the Amsterdam Dementia Cohort \cite{VanDerFlier2018AmsterdamCare, deLeeuw2017Blood-basedDisease}. The AD diagnosis was confirmed with a ratio of CSF markers t-tau/A$\beta_{42}>$ 0.52.  In this study two data sets were analysed: a semi-targeted metabolomics panel and clinical background data, i.e. age at diagnosis, sex, smoking status, alcohol consumption, hypertension (and medication), hyperlipidemia (and medication), anticoagulant medication, antidepressants, \acrfull{map} and \acrfull{bmi} (see Table \ref{tab:clin}). The metabolomics set contains $p =$ 230 metabolites (amines, organic acids, lipids and oxidative stress compounds) measured in serum obtained from peripheral blood (see Fig. \ref{metabolomics}). Amines and oxidative stress compounds were measured via \acrfull{uplcms}, organic acids with \acrfull{gcms} and lipids with \acrfull{uplcesitof}. The detailed methodology for the metabolomic analysis and ApoE genotyping can be found in de Leeuw et al.'s Blood-based metabolic signatures in Alzheimer's disease: SMT1 \cite{deLeeuw2017Blood-basedDisease}. The data were cleaned as described in the same article. The metabolomics datasets for AD and SCD are high-dimensional, in the sense that they contain more variables than observations ($p > n$). Another particularity of the data is the variable collinearity. Therefore, appropriate measures need to be taken to prevent model over-fitting --the algorithm being unable to distinguish signal from noise and fitting the latter--  and to correct for spurious correlations.

\begin{figure}[h]
  \includegraphics[width=\textwidth]{figures/metabolomics.png}
    \caption{Schematic overview of the study methodology. CSF markers supported the diagnosis of AD or SCD. Peripheral blood was obtained from both groups for a semi-targeted metabolomics analysis and ApoE genotyping. For the metabolomics, the blood samples were centrifuged, the serum supernatant was obtained and processed distinctly for each metabolite class. Metabolite levels were measured using UPLC/GC-MS$^2$ and recorded in the data used in this study. The latter pertains to the Statistical Analysis part of the process.}
  \label{metabolomics}
\end{figure}

\begin{table}[htb]
\caption{Clinical background characteristics used as control variables of ApoE4 (dose) effects.}
\label{tab:clin}
\begin{threeparttable}
\begin{tabular}{clcc} \toprule
                                & \textbf{Clinical feature}   & \textbf{Values} & \textbf{Data type} \\ \midrule
\multirow{2}{*}{Anthropometric} & Age                   & years   & Discrete   \\
                                & Sex                   & m, f    & Binary     \\
\multirow{2}{*}{Intoxications}  & Smoking               & past, current, no & Nominal\\
                                & Alcohol               & yes, no & Binary     \\
\multirow{3}{*}{Comorbidities}  & Hypertension          & yes, no & Binary     \\
                                & Diabetes mellitus     & yes, no & Binary     \\
                                & Hypercholesterolemia  & yes, no & Binary     \\
\multirow{3}{*}{Medication}     & Cholesterol lowering  & yes, no & Binary     \\
                                & Antidepressants       & yes, no & Binary     \\
                                & Antiplatelet          & yes, no & Binary     \\
\multirow{3}{*}{CSF markers$\ast$} & A$\beta_{42}$      & pg/mL   & Continuous \\
                                & tau                   & pg/mL   & Continuous \\
                                & p-tau                 & pg/mL   & Continuous \\\bottomrule
\end{tabular}
\begin{tablenotes}
  \item[$\ast$] Were only used in the ApoE4 and AD status multi-class classification.
\end{tablenotes}
\end{threeparttable}
\end{table}

\subsection{Data management and software}\label{datamanagement}
The FAIR principles for data management and stewardship in science were published by~\citeauthor{Wilkinson2016TheStewardship} in 2016 \cite{Wilkinson2016TheStewardship}. FAIR stands for Findable, Accessible, Interoperable, and Reusable data; the intention is to create and use data that are well-documented and reproducible. These principles were considered in every aspect of the study and implemented when applicable.

The statistical analyses were performed in \href{https://www.r-project.org}{\textsf{R}} (version 4.3.2), and the current report was written in \href{https://ctan.org/tex}{\LaTeX} (\TeX{} Live version 2023), both in \href{https://vscodium.com}{\textsf{VSCodium}} (the open-source version of VSCode) as integrated development environment (IDE). Vector graphics (e.g. metabolite network analysis plots) were edited using \href{https://inkscape.org}{\textsf{Inkscape}}. The schematic overview of the study was created in Biorender. All files (except for the data) are stored in a private \href{https://github.com}{\textsf{github}} repository.

\subsection{Outcome measures}\label{featureeng}
The information of the ApoE genotypes is valuable, and it might be interesting to screen for metabolic nuances between them. However, the genotypes were not equally represented in the data and hence, testing for differences among them would not be possible. The following two subsections describe the features ApoE4dose and target that were created as outcome measures to balance the genotype counts.

\begin{table}
  \caption{Observed ApoE genotype (top part), allele (middle part) and ApoE4 allelic (bottom part) counts in AD and SCD}
\begin{threeparttable}
\centering 
\label{tab:ApoEfreq}
  \begin{tabular}{crrrr} \toprule
    \multicolumn{1}{l}{}                & \multicolumn{1}{l}{}               & AD (\%)    & SCD (\%)   & Total (\%)  \\ \midrule
    \multirow{6}{*}{\textit{genotypes}} & $\varepsilon_2/\varepsilon_2$        & 2 (1.7)    & 0 (0.0)    & 2 (0.8)     \\
                                        & $\varepsilon_2/\varepsilon_3$      & 15 (12.5)  & 3 (2.4)    & 18 (7.3)    \\
                                        & $\varepsilon_2/\varepsilon_4$      & 5 (4.2)    & 2 (1.6)    & 7 (2.8)     \\
                                        & $\varepsilon_3/\varepsilon_3$      & 69 (57.5)  & 37 (29.1)  & 106 (42.9)  \\
                                        & $\varepsilon_3/\varepsilon_4$      & 26 (21.7)  & 59 (46.5)  & 85 (34.4)   \\
                                        & $\varepsilon_4/\varepsilon_4$      & 3 (2.5)    & 26 (20.5)  & 29 (11.7)   \\ \midrule
   \multirow{3}{*}{\textit{alleles}}    & $\varepsilon_2$    & 24 (10.0)  & 5 (2.0)    & 29 (6.0)    \\
                                        & $\varepsilon_3$    & 179 (74.6) & 136 (53.5) & 315 (64.0)    \\
                                        & $\varepsilon_4$   & 37 (15.4)  & 113 (44.5) & 150 (30.0)    \\ \midrule
    \multirow{4}{*}{$\varepsilon_4$}   & 1x$^\ast$            & 31 (25.8)  & 61 (48.0)  & 92 (37.2)   \\
                                        & 2x$^\ast$            & 3 (2.5)    & 26 (20.5)  & 29 (11.7)   \\
                                        & $\geq$ 1x$ ^\dagger$ & 34 (28.3)  & 87 (68.5)  & 121 (49.0)  \\
                                        & No$^{\ast\dagger}$     & 86 (71.7)  & 40 (31.5)  & 126 (51.0)  \\ \midrule
    \multicolumn{2}{r}{Total}                                                & 120 (49.0)       & 127 (51.0)       & 247         \\ \bottomrule
  \end{tabular}
  \begin{tablenotes}
    \item[$\ast$] rows used for ApoE4dose
    \item[$\dagger$] rows used for target 
  \end{tablenotes}
\end{threeparttable}
\end{table}

\subsubsection{ApoE4 dose effects}
\citeauthor{deLeeuw2017Blood-basedDisease} divided the subjects into two classes: those carrying at least one $\varepsilon_4$ allele, and $\varepsilon_4$ non-carriers. In order to study the $\varepsilon_4$ \textit{dose} effects, the genotypes can be categorized into groups, based on the number of $\varepsilon_4$ alleles they carry: 0, 1 or 2. The number of ApoE4 allele doses is shown in the bottom part of Table \ref{tab:ApoEfreq}.

\subsubsection{Classification of ApoE4 and AD status}
In order to incorporate ApoE4 status, as well as the diagnosis of AD, a four-level feature, ``target'' (AD without ApoE4, AD with at least 1 ApoE4, SCD without ApoE4, SCD with at least 1 ApoE4), was created, as shown in Table \ref{tab:ApoEfreq}.

\newpage \subsection{Statistical Analysis} \label{stats}
\subsubsection{ApoE4 dose effects on serum metabolite levels in AD} \label{rq1}
To test if the number of ApoE4 alleles has an effect on mean metabolite levels in \acrshort{ad}, two methods were applied: a global test and nested linear model comparison with ANCOVA.

\leavevmode\newline\textbf{Global Test}\hspace{.25cm} The concept of a global test was first introduced by \citeauthor{Simon2004DesignHealth} to cater to the high dimensionality of gene expression data \cite{Simon2004DesignHealth}. The R package \textsf{globaltest} features a multinomial logistic regression model, fitting genes to predict clinical or biological group membership \cite{Goeman2004AOutcome, Goeman2006TestingAlternative, Goeman2023ThePackage}, i.e. the number of ApoE4 alleles in this case. This method is also appropriate for other types of -omics data, such as metabolomics in this study \cite{Goeman2023ThePackage}. The null hypothesis is that the ApoE4 dose $Y$ is independent of the metabolite levels $X$, i.e. $H_0 : P(Y|X) = P(Y)$, where $X \in \mathbb{R}^{n \times p}$ is the metabolite data matrix. The test statistic under $H_0$ follows, asymptotically, a normal distribution. The \texttt{gt} function of the package was used to screen for nuances in metabolite levels on the number of ApoE4 alleles, correcting for sex. To assess ApoE4 dose effects correcting for clinical data, nested linear model comparison was performed, as described in the next section.

\leavevmode\newline \textbf{Nested linear models}\hspace{.25cm}
A linear regression model using the least squares principle estimates the \acrfull{ssr}
\[\mathrm{SSR} = \sum_{i=1}^{n} (\hat{y}_i-\bar{y})^2\] where $\bar{y}$ is the sample mean and $\hat{y}_i$ is the estimate for the $i$-th observation, using categorical and numerical variables. It does so while minimizing the \acrfull{rss} \cite{ott2015introduction},
\[\mathrm{RSS} = \sum_{i=1}^{n} (y_i - \hat{y}_i)^2\] where $y_i$ is the $i$-th observation \cite{ott2015introduction}. Two models were fitted for each metabolite, a full and a nested model. The dependent variable in each model was a metabolite; the nested model \eqref{rm} has only clinical variables (Table \ref{tab:clin} except CSF markers) while the full model \eqref{fm} features the clinical variables, plus the number of ApoE4 alleles (0, 1 or 2 -nominal) as explanatory variables. Let $y_j$ represent the $j$-th metabolite, $x_k$ the $k$-th clinical variable, and $D_{\varepsilon_4}$ the ApoE4 dose; the nested linear regression models are:
\begin{align}
    & y_j = \beta_0 + \sum_{k=1}^m\beta_kx_k +\epsilon \label{rm} \\
    & y_j = \beta_0 + \sum_{k=1}^m\beta_kx_k + \beta_{m+1}D_{\varepsilon_4} + \epsilon \label{fm}
\end{align}
For every metabolite $j$ = 1,...,$p$ the hypothesis test is 
\[H_0: \beta_{m+1} = 0 \; \mathrm{vs} \; H_\alpha: \beta_{m+1} \neq 0\]
The test statistic is the ANCOVA $F$-statistic:
\[ F = \frac{(\mathrm{SSR}_{\mathrm{full}}-\mathrm{SSR}_{\mathrm{nested}})/(df_{\mathrm{full}}-df_{\mathrm{nested}})}{\mathrm{RSS}_{\mathrm{full}}/(n-(m+2))}\]
Under H$_0$ $F \sim \mathcal{F}_{1, n-(m+2)}$ \cite{ott2015introduction}. The null hypothesis is rejected for large values of $F$.

This implies $p = 230$ hypothesis tests. The probability of incorrectly rejecting at least one H$_0$ (type I error) increases monotonically with every additional test. Multiple testing procedures provide tools to correct for the increase in type I errors, e.g. by controlling the False Discovery Rate (FDR) \cite{Benjamini1995ControllingTesting}. That is, controlling the expected proportion of incorrectly rejected null hypotheses among all rejected hypotheses, e.g. at a level $q$ of 0.05. With the Benjamini--Hochberg approach (see Algorithm \ref{alg:fdr}), the p-values are first sorted in ascending order. Then, for the $j$-th ordered p-value, $q$ is multiplied by $j$ over the total number of tests \cite{Benjamini1995ControllingTesting}, $m=230$ in the AD group in this study. In other words, after adjustment, the $j$-th null hypothesis may be rejected only if its associated F-test's p-value is less than or equal to a fraction ($j/m$) of $q$.

\begin{algorithm}
\caption{Benjamini–Hochberg's procedure to control FDR. Source: \cite{James2023AnEdition}}\label{alg:fdr}
Specify $q$, the level at which to control the FDR.\\
Compute p-values, $p_1, ... , p_m$, for the $m$ null hypotheses $H_{01},...,H_{0m}$. \\
Order the $m$ p-values so that $p_{(1)} \leq p_{(2)} \leq \dots \leq p_{(m)}$.\\
Define
\[L = \max\left\{j : p_{(j)} \leq \frac{j}{m}q \right\}\] \\
Reject all null hypotheses $H_{0j}$ for which $p_j \leq p_{(L)}$.
\end{algorithm}

To test for ApoE4 dose effects on each metabolite in AD and SCD, $2m=460$ model comparisons were needed in total, hence a function was created to iterate over the metabolites, fit the nested models, compare them using ANCOVA F-tests, store and adjust the p-values, filter those below .05, consolidate the coefficients of the meaningful full models and the p-values of their t-tests in a table and display it. To decrease run time, as well as harness the power of multiple CPU cores, \textsf{furrr}'s \texttt{future\_map} function was used for parallel iterations.

\newpage \subsubsection{Classification of ApoE4 and AD status}\label{rq2}
In statistical learning, a class denotes a group of objects that share common characteristics, such as having \acrshort{ad} or ApoE4 \cite*{Drummond2010}. Classification, in this context, denotes training a (supervised) learning algorithm on labelled data (containing their class) \cite*{Drummond2010}. The classifier learns patterns in the latter and may be able to predict class membership for unknown data \cite*{Drummond2010}.

\leavevmode\newline \textbf{Bias-Variance trade-off}\hspace{.25cm}The degree to which a user can understand and interpret the prediction or decisions made by a statistical model is defined as \textit{interpretability} \cite{Elshawi2019OnHypertension}. It was of interest in this study to find the optimal balance between the performance of a model and its interpretability. The \textit{bias-variance trade-off} was formally introduced by \citeauthor{Geman1992NeuralDilemma} and refers to the trade-off between the accuracy (opposite of bias) and precision (opposite of variance) of a prediction. It also refers to the trade-off between model flexibility (or complexity) and interpretability \cite{Geman1992NeuralDilemma}. One may consider this trade-off during model and evaluation method selection, as some impose more bias or variance than others.

\leavevmode\newline \textbf{Multi-class classification models}\hspace{.25cm}Considering interpretability, Multinomial Logistic Regression (\acrshort{mnl}) is inherently interpretable. Let $y$ be a response with $K$ classes, $k \in \{1,\dots,K\}$ (here $K=4$) representing the $k$-th class of the target and $\beta_{kj}$ its set of coefficients, and $\beta_{lj}$ the coefficients of the $l$-th class ($l = 1,\dots,K$) for the $j$-th metabolite; a \acrlong{mnl} model calculates the probability

\[\textrm{P}(y=k|X=x) =  \dfrac{e^{\sum_{j=1}^{p}\beta_{kj}x_j}}{\sum_{l=1}^{K}e^{\sum_{j=1}^{p}\beta_{lj}x_j}}\]

In the $K$-way classification, the problem is reduced to $\binom{K}{2}$ binary classifications with the one-vs-one method. ApoE4 and AD status contains $K =4$ classes, so 6 binary classifications were performed \cite{James2023AnEdition}. The function \texttt{multinom} of the package \textsf{nnet} was used to fit a shallow neural network (with a single hidden layer, but allowing skip-layer connections) \cite{nnet}. It calculates the weights by minimizing the negative conditional log-likelihood (cross-entropy) of the observed class memberships, summed over the observations $i$ and classes $k$,
\[E = \sum_{i}\sum_{k}-y_{ik}\log\hat{y}_{ik}, \; \;  \hat{y}_{ik} =  \dfrac{e^{\sum_{j=1}^{p}\hat{\beta}_{kj}x_{ij}}}{\sum_{l=1}^{K}e^{\sum_{j=1}^{p}\hat{\beta}_{lj}x_{ij}}}\]
where $y_{ik}$ equals one if the $i$-th observation belongs to class $k$ and zero otherwise \cite{nnet}.

When $p > n$, it is not possible to perform classification fitting all the predictors \cite{James2023AnEdition}. Regularization trades off a small increase in bias for a great decrease in variance, by shrinking the low coefficients towards zero \cite{James2023AnEdition}. Ridge or L2 regularization \cite{Cessie1992RidgeRegression} shrinks the coefficients without setting them to exactly zero \cite{Cessie1992RidgeRegression}. The predictors were regularized with weight decay, a Ridge-like penalty based on the sum of squares of the weights \cite{nnet}: 
\[\lambda\sum_{j=1}^{p}\beta_j^2 \]where $\lambda \geq 0$ is a tuning parameter that balances the coefficient shrinking effect.

Another method to treat multi-collinearity and high dimensionality is a 2-stage \acrfull{ml} Factor Projection to a Latent orthogonal space, such as the one the package \textsf{FMradio} \cite{Peeters2019StableData} performs. In the 1st stage, a ML estimation is used to filter out redundant features from the data matrix. In the second stage, the latter is projected to an orthogonal space where the features are replaced by -fewer- L2-regularised  factors explaining their covariance. One can then use the produced factor scores as predictors in any model.

Decision Trees (\acrshort{dt}) are inherently interpretable, non-parametric models, that fit well large and complicated data sets. They have a tree-like structure that splits the data based on a threshold into branches and leaves (nodes) \cite{Song2015DecisionPrediction}. Several studies used decision trees to support AD diagnosis \cite{saputra2020detecting,dana2014using,kumar2017new,mofrad2019decision}. The \texttt{rpart2} \cite{rpart} function of \textsf{rpart} was used.

Boosting models are ensemble models that fit several weak learners (such as linear/logistic regression or DTs) sequentially, reweighting the data, and take their weighted majority vote \cite{Friedman2000boosting,Friedman2001gbm}. Even though Boosting tends to outperform DTs, it often operates as a \textit{black box} and is poorly interpretable. The state-of-the-art \acrlong{xgb} (\texttt{xgbTree}) of the package \textsf{xgboost} \cite{Chen2016XGBoost:System} was used. Ensemble learners, and \acrshort{xgb} in particular, are also being used to predict multiclass AD status \cite{zhang2024multiclass,app13148298}.

\leavevmode\newline \textbf{Implementation}\hspace{.25cm} 
The R package \textsf{caret} streamlined the training and comparison of classification and regression models, offering broad model training and evaluation options, as well as feature importance estimation \cite{Kuhn2008BuildingPackage}. The functions \texttt{trainControl} and \texttt{train} were invoked to fit the models. The sampling method used to handle the unbalanced classes was \acrshort{smote} (Synthetic Minority Over-Sampling Technique), as implemented by the package \textsf{DMwR2} \cite{DMwR2}. Considering the bias-variance trade-off, three models were implemented: multinomial logistic regression (parametric, interpretable), decision tree (non-parametric, sufficiently interpretable), XGBoost (non-parametric, not interpretable).

First, a benchmark model was created fitting the clinical background data to predict the target in an unpenalized \acrshort{mnl} model ($\lambda$ = 0) (Fig. \ref{fig:flow}). Second, the 230-metabolite panel was fitted on top of the clinical background data in a penalized \acrshort{mnl} model ($\lambda$ = 9.187724, obtained from repeated 10-fold Cross-Validation (CV)) and feature importance was calculated invoking \textsf{caret}'s \texttt{varImp}. Next, the full metabolite panel was projected into 6 latent factors (meta-metabolites), explaining around 30\% of their variance. The number of latent factors was selected assuming four meta-metabolites account for the variance of the four metabolite classes (amines, lipids, organic acids and oxidative stress compounds) while balancing model complexity and cumulative explained variance. The 6-factor metabolite projection was then fitted on top of the clinical data in an unpenalized \acrshort{mnl} ($\lambda$ = 0), a \acrlong{dt} and an XGBoost model. The aforementioned models were hyperparameter-tuned over a grid of values and the best ``tune'' was selected via repeated (100 times) 10-fold \acrshort{cv}. The full metabolite panel was initially fitted in a \acrshort{dt} and a \acrshort{xgb} as well, but was omitted from further runs, as their added value was deemed small considering that more parsimonious models with only 19 predictors (clinical data + 6 meta-metabolites) would be fitted next.
\begin{figure}[ht]
  \includegraphics[width=0.9\textwidth]{figures/flowchart.pdf}
  \caption{\label{fig:flow} Multi-class classification of ApoE4 and AD status pipeline. First, fit a benchmark model: clinical data only in MLR with $\lambda=0$. Second, fit clinical data + 230 metabolites in MLR, hyperparameter tune, use the best $\lambda$, evaluate performance and get feature importance scores. Third, project the 230 metabolites to 6 latent factors (meta-metabolites), fit these on top of the clinical data in MLR and evaluate performance. Fourth, fit the same data in a decision tree, hyperparameter tune, use the best hyperparameters and evaluate performance. Fifth, fit the same data in an \acrfull{xgb}, hyperparameter tune, use the best hyperparameters and evaluate performance. Performance was evaluated for all models using P(Accuracy$\geq$NIR), ROC, AUC, confusion matrices obtained from repeated (100 times) 10-fold CV.\\
  MLR: \acrlong{mnl}}
\end{figure}

\leavevmode\newline \textbf{Evaluation}\hspace{.25cm} The models were fitted with the optimal hyperparameters. Model performance was assessed and compared with repeated (100 times) 10-fold CV-obtained \acrfull{roc} curves and their respective \acrfull{auc} using the \textsf{pROC} package \cite{pROC} and other metrics such as accuracy and \acrfull{nir} from \textsf{caret}'s \texttt{confusionMatrix} output. The confusion matrices were also plotted using the R package \textsf{cvms}.

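ROC plots feature sensitivity (recall, true positive rate) \[ \mathrm{sensitivity} = \frac{\mathrm{TP}}{\mathrm{TP}+\mathrm{FN}}\] on the y-axis against 1-specificity (false positive rate)
\[ \mathrm{specificity} = \frac{\mathrm{TN}}{\mathrm{FP}+\mathrm{TN}}\]
\[ 1-\mathrm{specificity} = \frac{\mathrm{FP}}{\mathrm{FP}+\mathrm{TN}}\] on the x-axis \cite{James2023AnEdition}, where TP, TN, FP and FN denote the numbers of true positives, true negatives, false positives and false negatives, respectively.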
A well-performing classifier has high sensitivity for low false positive rates, resulting in a curve that tends to stay close to the top-left corner of the graph and encloses a large AUC \cite{James2023AnEdition}. The latter represents the probability that the classifier will correctly distinguish the classes. An AUC of 1 indicates perfect performance, while an AUC of 0.5 is equivalent to random chance \cite{James2023AnEdition}. 

Accuracy represents the percentage of correct predictions over all predictions: 
\[ accuracy = \frac{TP+TN}{TP+FP+TN+FN}\]

NIR is the expected accuracy of a classifier that assigns the most frequent class to all observations \cite{Kuhn2008BuildingPackage}. Considering the correct predictions $k = \mathit{TP}+\mathit{TN}$ as successes over the total of $n$ predictions, a one-sided Binomial test was also performed: $H_0: \text{accuracy} = \text{NIR}$ against $H_a: \text{accuracy} > \text{NIR}$. The associated $p$-value is:

\[p = \sum_{i=k}^{n} \binom{n}{i} \textrm{NIR}^{i} (1-\textrm{NIR})^{n-i}\]

Under $H_0$, the number of correct predictions follows $k \sim \mathrm{B}(n,\textrm{NIR})$. Under $H_a$, $p$ tends to smaller values, in which case the model classifies better than ``blindly'' assigning the majority class. In other words, it represents the probability of obtaining at least the observed number of correct predictions if the true accuracy were equal to the NIR, i.e.\ if the difference between accuracy and NIR were due to chance \cite{NIR2023}.

\newpage\subsubsection{Metabolite Network Analysis}\label{rq3}
Network science presents a unifying framework for data and system representation, applicable to any domain \cite{Barabasi2015NetworkScience}. A network, in an abstract sense, consists of nodes connected with links, also referred to as edges. In data science, a network whose nodes represent random features and whose joint probability distribution is defined by the ensemble of their edges is called a \textit{graphical model} \cite{Peeters2022Rags2ridges:Matrices}. A metabolomic covariance (correlation) network represents the ensemble of metabolites based on their covariance, showing nuances among the samples that non-graphical statistical methods on individual metabolites may fail to detect \cite{PerezDeSouza2020Network-basedInterpretation}. It may provide insights into correlated metabolites that do not belong in the same metabolic pathway \cite{PerezDeSouza2020Network-basedInterpretation}.


A \textit{Gaussian graphical model} (\acrshort{ggm}) is an undirected graph that represents the conditional independence properties of the features \cite{KollerProbabilisticTechniques}. The statistic employed by GGMs is the partial correlation, which also adjusts for indirect correlation, i.e.\ the case where two metabolites are each correlated with a third one and therefore appear correlated with each other \cite{Amara2022NetworksInterpretation}. Formally, let $\mathcal{V}$ be a set of $p$ vertices, representing random features $Y_1,\dots,Y_p$ with joint probability distribution $P \sim N_p(\mathbf{0}, \mathbf{\Sigma})$, and $\mathcal{E}$ a set of edges; then $\mathcal{G}=(\mathcal{V},\mathcal{E})$ is a GGM if for all pairs $\{Y_i , Y_j\}$ with $i\neq j$:

\[ (\mathbf{\Sigma}^{-1})_{ij} = \mathbf{\Omega}_{ij} = 0 \Longleftrightarrow Y_i \Perp Y_j\mid\{Y_k : k \neq i,j\} \Longleftrightarrow (i, j) \notin \mathcal{E}.\]

In natural language, a zero value in the inverse covariance matrix (usually referred to as precision matrix $\mathbf{\Omega}$) implies that the respective random features are independent, given the rest of features, and they are not connected by an undirected edge $((i, j) \notin \mathcal{E})$ \cite{Peeters2022Rags2ridges:Matrices}.

For the metabolite network analysis, the package \textsf{rags2ridges} \cite{Peeters2022Rags2ridges:Matrices} was used to generate the feature covariance matrices of ApoE4 carriers and non-carriers with AD. Their precision matrices were \textit{fused} to incorporate information from both groups, regularized with Ridge penalty and sparsified at a local FDR threshold $q=0.999$ using the \texttt{.fused} functions of \textsf{rags2ridges}. The optimal penalty was jointly estimated with 10-fold CV. The sparsified networks of each group, as well as their differential edges were plotted in GGMs using \texttt{Ugraph}.  Lastly, communities of strongly correlated metabolites were identified using the community search algorithm by Girvan-Newman \cite{PhysRevE.69.026113} as implemented in \textsf{rags2ridges}'s \texttt{Communities}. These metabolite ensembles may reflect conditionally dependent metabolite groups.

The network centrality measures (degree, betweenness, eigenvector centrality, number of positive and negative edges, mutual information and (partial) variance) were calculated using \texttt{GGMnetworkStats.fused}. Degree (of centrality) represents the number of vertices connected to a node \cite{newman2010Networks}; each neighbouring node contributes one ``point''. Eigenvector centrality, instead of awarding a vertex just one point per neighbour, gives each vertex a score proportional to the sum of the scores of its neighbours \cite{newman2010Networks}. Betweenness reflects the extent to which a node lies on paths between other vertices \cite{newman2010Networks}. Mutual information represents the amount of information we obtain for a node considering the rest of the nodes. The aforementioned measures aid in highlighting hub (central) metabolites with a regulatory potential. Wilcoxon Signed Rank tests were performed to test for differences between the network statistics and corrected for FDR at 0.05.    \clearpage
\section{Results} \label{results}
The ApoE genotype frequencies in \acrshort{ad} and SCD are shown in Table \ref{tab:ApoEfreq}. The most common genotype was $\varepsilon_3/\varepsilon_3$, followed by $\varepsilon_3/\varepsilon_4$ and $\varepsilon_4/\varepsilon_4$. Cumulatively, the most abundant allele is $\varepsilon_3$ (64\%), followed by $\varepsilon_4$ (30\%) and $\varepsilon_2$ (6\%). In the AD group, the allelic frequencies were 74.6\% for $\varepsilon_3$, followed by 15.4\% for $\varepsilon_4$ and 10\% for $\varepsilon_2$. In the SCD group, $\varepsilon_3$, $\varepsilon_4$ and $\varepsilon_2$ exhibited relative frequencies of 53.5\%, 44.5\% and 2\%, respectively. The increased $\varepsilon_4$ (Fig. \ref{plot:sankey}) frequency in SCD patients might reflect compensatory mechanisms in place in this group or an increased risk for AD in the future.
\begin{figure}[H]
  \includegraphics[width=0.65\textwidth]{figures/sankey@2x.png}
    \caption{\label{plot:sankey} Sankey charts representing the ApoE allelic composition of the sample based on the genotype counts in AD (top) and SCD (bottom) (Table \ref{tab:ApoEfreq}). Fig. \ref{fig1} reflects the theoretical distribution of the three common ApoE alleles in $\binom{3}{2}$ combinations (genotypes). This chart represents the relative ApoE allelic frequencies based on the 6 genotype counts from the data. ApoE3 is the most common allele, followed by ApoE4 and ApoE2 in both groups. A higher frequency of ApoE4 is observed in the SCD group compared to AD. This is attributed to the higher count of $\varepsilon_3/\varepsilon_4$ and homozygous $\varepsilon_4$.}
\end{figure}


\newpage \subsection{ApoE4 dose effects on serum metabolite levels in AD}
\subsubsection{Global Test}
Testing for ApoE4 dose-effects on serum metabolites of AD patients, correcting for sex ($H_0$: ApoE4 dose has no effect on mean metabolite levels, $H_a$: it has an effect), revealed a significant global difference in metabolites (p = 0.017). The most significantly altered metabolites are triglycerides and diglycerides (FDR-adjusted p-value $<$0.05); see Table \ref{tab:gt} and Fig. \ref{plot:gt}. In SCD individuals, the global test showed no significant differences among ApoE4 doses (p=0.544). Generally, fewer metabolites were altered in this group and none of the associations remained significant after adjusting for FDR. Interestingly, the affected metabolites are different between AD and SCD.
\begin{figure}[H]
  \includegraphics[width=0.8\textwidth]{figures/gt2.png}
    \caption{Covariates plot showing the metabolites affected by the number of ApoE4 alleles in AD. The contribution of each such metabolite is itself a test, with a p-value against the alternative hypothesis (bottom). The plotted metabolites are ordered in a hierarchical clustering graph (top). The thick black line represents the metabolites and clusters of metabolites that are most clearly associated with ApoE4 dose, correcting for family-wise error rate at 0.05. This can be interpreted as that the multiple testing algorithm employed can confidently infer that at least one of the metabolites below the last significant branch is associated with the number of ApoE4 alleles, without being able to pinpoint with enough confidence which one(s).}
  \label{plot:gt}
\end{figure}
\begin{table} 
\vspace{-0.5cm}
\makebox[\linewidth]{\begin{threeparttable}
\caption{Metabolites affected by ApoE4 dose, correcting for sex as per globaltest \cite{Goeman2023ThePackage}.}
\label{tab:gt}
\begin{tabular}{cclccrr} \toprule
  & Class & Metabolite & \multicolumn{1}{l}{Inheritance} & Assoc. with & \multicolumn{1}{l}{p-value} & \multicolumn{1}{l}{FDR} \\ \midrule
  \multirow{39}{*}{AD} & Lipid & TG (56:2) & 0.046 & 1 ApoE4 & $<$0.001 & 0.016 \\
   & Lipid & TG (58:1) & 0.117 & 1 ApoE4 & $<$0.001 & 0.021 \\
   & Lipid & DG (36:3) & 0.19 & 1 ApoE4 & $<$0.001 & 0.021 \\
   & Lipid & TG (56:3) & 0.244 & 1 ApoE4 & $<$0.001 & 0.021 \\
   & Lipid & TG (52:3) & 0.424 & 1 ApoE4 & 0.001 & 0.031 \\
   & Lipid & TG (54:5) & 0.48 & 1 ApoE4 & 0.001 & 0.027 \\
   & Lipid & TG (58:2) & 0.722 & 1 ApoE4 & 0.001 & 0.027 \\
   & Lipid & TG (54:4) & 0.761 & 1 ApoE4 & 0.002 & 0.033 \\
   & Lipid & TG (58:9) & 1 & 1 ApoE4 & 0.001 & 0.027 \\
   & Lipid & TG (56:7) & 1 & 1 ApoE4 & 0.001 & 0.027 \\
   & Lipid & TG (58:8) & 1 & 1 ApoE4 & 0.001 & 0.032 \\
   & Lipid & TG (54:6) & 1 & 1 ApoE4 & 0.002 & 0.035 \\
   & Lipid & TG (56:8) & 1 & 1 ApoE4 & 0.002 & 0.037 \\
   & Lipid & TG (52:4) & 1 & 1 ApoE4 & 0.003 & 0.055 \\
   & Lipid & TG (54:3) & 1 & 1 ApoE4 & 0.004 & 0.055 \\
   & Lipid & TG (56:6) & 1 & 1 ApoE4 & 0.004 & 0.059 \\
   & Lipid & TG (56:1) & 1 & 1 ApoE4 & 0.004 & 0.059 \\
   & Lipid & TG (52:2) & 1 & 1 ApoE4 & 0.005 & 0.063 \\
   & Lipid & TG (54:2) & 1 & 1 ApoE4 & 0.005 & 0.063 \\
   & Lipid & TG (60:2) & 1 & 1 ApoE4 & 0.007 & 0.077 \\
   & Lipid & TG (58:10) & 1 & 1 ApoE4 & 0.011 & 0.124 \\
   & Lipid & TG (51:3) & 1 & 1 ApoE4 & 0.015 & 0.154 \\
   & Lipid & SM (d18:1/18:1) & 1 & 2 ApoE4 & 0.018 & 0.169 \\
   & Organic acid & 2-ketoglutaric.acid & 1 & 2 ApoE4 & 0.019 & 0.169 \\
   & Lipid & TG (52:0) & 1 & 1 ApoE4 & 0.019 & 0.169 \\
   & Lipid & TG (50:3) & 1 & 2 ApoE4 & 0.02 & 0.169 \\
   & Organic acid & Uracil & 1 & no ApoE4 & 0.02 & 0.169 \\
   & Lipid & TG (52:5) & 1 & 1 ApoE4 & 0.021 & 0.174 \\
   & Lipid & TG (54:0) & 1 & 1 ApoE4 & 0.022 & 0.174 \\
   & Lipid & TG (50:2) & 1 & 2 ApoE4 & 0.022 & 0.174 \\
   & Lipid & TG (51:2) & 1 & 1 ApoE4 & 0.023 & 0.174 \\
   & Aminoacid & L-Glutamine & 1 & no ApoE4 & 0.024 & 0.174 \\
   & Lipid & TG (50:1) & 1 & 2 ApoE4 & 0.025 & 0.174 \\
   & Lipid & TG (50:4) & 1 & 1 ApoE4 & 0.026 & 0.176 \\
   & Lipid & TG (52:1) & 1 & 1 ApoE4 & 0.027 & 0.176 \\
   & Lipid & TG (54:1) & 1 & 1 ApoE4 & 0.031 & 0.203 \\
   & Lipid & TG (50:0) & 1 & 1 ApoE4 & 0.037 & 0.229 \\
   & Organic acid & Malic.acid & 1 & 2 ApoE4 & 0.038 & 0.234 \\
   & Lipid & DG (36:2) & 1 & 1 ApoE4 & 0.039 & 0.235 \\
   & Ox. Stress & PAF (16:0)& 1 & 2 ApoE4 & 0.049 & 0.283 \\ \midrule
  \multirow{10}{*}{SCD} & Lipid & LPC (20:5) & 1 & no ApoE4 & 0.012 & 0.828 \\
   & Lipid & SM (d18:1/23:0) & 1 & 1 ApoE4 & 0.015 & 0.828 \\
   & Aminoacid & L-Tryptophan & 1 & no ApoE4 & 0.024 & 0.828 \\
   & Aminoacid & Putrescine & 1 & 1 ApoE4 & 0.028 & 0.828 \\
   & Aminoacid & Glycine & 1 & 1 ApoE4 & 0.035 & 0.828 \\
   & Lipid & CE (18:2) & 1 & 1 ApoE4 & 0.038 & 0.828 \\
   & Lipid & SM (d18:1/22:0) & 1 & 1 ApoE4 & 0.039 & 0.828 \\
   & Ox. Stress & LPA (20:5) & 1 & no ApoE4 & 0.040 & 0.828 \\
   & Lipid & PC (36:3) & 1 & 1 ApoE4 & 0.043 & 0.828 \\
   & Organic acid & Succinic acid & 1 & 1 ApoE4 & 0.043 & 0.828 \\
   & Amine & Citrulline & 1 & 1 ApoE4 & 0.043 & 0.828 \\   \bottomrule  
  \end{tabular}
  \begin{tablenotes}
    \item[] CE: Cholesteryl ester, DG: Diglyceride, LPA: Lysophosphatidic acid, LPC: Lysophosphatidylcholine, PAF: Platelet activating factor, SM: Sphingomyelin, TG: Triglyceride
  \end{tablenotes}
\end{threeparttable}}
  \end{table}
\clearpage  
\subsubsection{Nested Linear Models}
Several metabolites from all classes were altered by ApoE4 dose, both in AD and SCD. However, after controlling FDR, none of the effects remained significant at $\alpha=0.05$. Among AD patients, metabolites showing trends of a positive effect were several triglycerides, diglycerides, putrescine, 2-ketoglutaric acid, lysophosphatidylcholine, platelet activating factor (16:0) and lyso-phosphatidic acid (18:0) (Table \ref{tab:nested}). Among individuals with SCD, lipid metabolites were not affected as much as in the AD group, with only two sphingomyelin species showing a difference. Aminoacids L-serine, L-tryptophan, glycine, L-homoserine and putrescine exhibited trends of effect in this group. L-Tryptophan is negatively associated with ApoE4 dose (at 1x and 2x ApoE4), while L-serine, glycine and L-homoserine are negatively associated only with ApoE4 homozygotes (Table \ref{tab:nested}).
\begin{table}[H]
\caption{ApoE4 dose effects on serum metabolites in AD and SCD: results from nested linear model comparison. Full model: clinical background variables and number of ApoE4 alleles, Nested model: clinical background variables only. Colour range green-red reflects trends in positive-negative correlations between ApoE4 dose and metabolites.}
\label{tab:nested}
\centering
\begin{threeparttable}
  \begin{tabular}{clrrrrrrrr} \toprule
    \multicolumn{1}{l}{} & \multicolumn{1}{c}{\multirow{2}{*}{Metabolite}} & \multicolumn{1}{c}{\multirow{2}{*}{P($>$F)}} & \multicolumn{1}{c}{\multirow{2}{*}{FDR}} & \multicolumn{2}{c}{No $\varepsilon_4$} & \multicolumn{2}{c}{1x $\varepsilon_4$} & \multicolumn{2}{c}{2x $\varepsilon_4$} \\
\multicolumn{1}{l}{} & \multicolumn{1}{c}{} & \multicolumn{1}{c}{} & \multicolumn{1}{c}{} & \multicolumn{1}{l}{Coef.} & \multicolumn{1}{l}{P($>$t)$^\ast$} & \multicolumn{1}{l}{Coef.} & \multicolumn{1}{l}{P($>$t)$^\ast$} & \multicolumn{1}{l}{Coef.} & \multicolumn{1}{l}{P($>$t)$^\ast$} \\ \midrule
  \multirow{19}{*}{AD} & Lip.TG (52:3) & 0.001 & 0.156 & {\cellcolor[rgb]{0.98,0.8,0.808}}-1.4 & 0.818 & {\cellcolor[rgb]{0.949,0.973,0.969}}2.6 & 0.004 & {\cellcolor[rgb]{0.929,0.965,0.949}}3.7 & 0.001 \\
   & Lip.TG (52:4) & 0.003 & 0.156 & {\cellcolor[rgb]{0.984,0.988,0.996}}0.6 & 0.888 & {\cellcolor[rgb]{0.969,0.98,0.98}}1.6 & 0.008 & {\cellcolor[rgb]{0.953,0.976,0.973}}2.4 & 0.002 \\
   & DG (36:3) & 0.002 & 0.156 & {\cellcolor[rgb]{0.984,0.957,0.969}}0.0 & 0.631 & {\cellcolor[rgb]{0.984,0.957,0.965}}0.0 & 0.012 & {\cellcolor[rgb]{0.984,0.957,0.969}}0.0 & 0.001 \\
   & OS.HpH.PAF (16:0) & 0.002 & 0.156 & {\cellcolor[rgb]{0.388,0.745,0.482}}34.8 & $<$0.001 & {\cellcolor[rgb]{0.949,0.973,0.969}}2.6 & 0.017 & {\cellcolor[rgb]{0.914,0.961,0.937}}4.6 & 0.001 \\
   & Lip.TG (52:2) & 0.006 & 0.255 & {\cellcolor[rgb]{0.973,0.412,0.42}}-5.1 & 0.469 & {\cellcolor[rgb]{0.957,0.976,0.976}}2.1 & 0.03 & {\cellcolor[rgb]{0.929,0.965,0.949}}3.7 & 0.002 \\
   & Lip.TG (54:5) & 0.01 & 0.373 & {\cellcolor[rgb]{0.965,0.98,0.98}}1.9 & 0.496 & {\cellcolor[rgb]{0.98,0.984,0.992}}0.9 & 0.016 & {\cellcolor[rgb]{0.973,0.984,0.988}}1.3 & 0.006 \\
   & Lip.TG (50:2) & 0.012 & 0.398 & {\cellcolor[rgb]{0.973,0.471,0.478}}-4.5 & 0.297 & {\cellcolor[rgb]{0.98,0.984,0.992}}0.9 & 0.141 & {\cellcolor[rgb]{0.957,0.976,0.973}}2.2 & 0.003 \\
   & Lip.TG (50:1) & 0.018 & 0.45 & {\cellcolor[rgb]{0.98,0.706,0.714}}-2.3 & 0.539 & {\cellcolor[rgb]{0.984,0.988,0.996}}0.7 & 0.179 & {\cellcolor[rgb]{0.965,0.98,0.98}}1.8 & 0.005 \\
   & Lip.TG (54:4) & 0.02 & 0.45 & {\cellcolor[rgb]{0.937,0.969,0.953}}3.5 & 0.318 & {\cellcolor[rgb]{0.976,0.984,0.988}}1.2 & 0.015 & {\cellcolor[rgb]{0.973,0.984,0.984}}1.4 & 0.02 \\
   & Lip.TG (54:6) & 0.017 & 0.45 & {\cellcolor[rgb]{0.984,0.906,0.918}}-0.4 & 0.795 & {\cellcolor[rgb]{0.988,0.988,1}}0.5 & 0.027 & {\cellcolor[rgb]{0.984,0.988,0.996}}0.7 & 0.009 \\
   & Am.Putrescine & 0.029 & 0.515 & {\cellcolor[rgb]{0.984,0.953,0.965}}0.0 & $<$0.001 & {\cellcolor[rgb]{0.984,0.953,0.965}}0.0 & 0.035 & {\cellcolor[rgb]{0.984,0.953,0.965}}0.0 & 0.017 \\
   & OA.2.ketoglutaric.acid & 0.026 & 0.515 & {\cellcolor[rgb]{0.984,0.953,0.965}}0.0 & 0.366 & {\cellcolor[rgb]{0.984,0.953,0.965}}0.0 & 0.953 & {\cellcolor[rgb]{0.984,0.953,0.965}}0.0 & 0.014 \\
   & Lip.TG (50:3) & 0.028 & 0.515 & {\cellcolor[rgb]{0.98,0.824,0.835}}-1.2 & 0.668 & {\cellcolor[rgb]{0.984,0.988,0.996}}0.6 & 0.127 & {\cellcolor[rgb]{0.973,0.984,0.988}}1.3 & 0.008 \\
   & Lip.TG (52:5) & 0.042 & 0.538 & {\cellcolor[rgb]{0.988,0.988,1}}0.5 & 0.789 & {\cellcolor[rgb]{0.988,0.988,1}}0.4 & 0.134 & {\cellcolor[rgb]{0.984,0.988,0.996}}0.7 & 0.014 \\
   & Lip.TG (56:7) & 0.042 & 0.538 & {\cellcolor[rgb]{0.976,0.698,0.706}}-2.4 & 0.095 & {\cellcolor[rgb]{0.988,0.988,1}}0.5 & 0.015 & {\cellcolor[rgb]{0.988,0.988,1}}0.4 & 0.105 \\
   & Lip.TG (56:8) & 0.044 & 0.538 & {\cellcolor[rgb]{0.98,0.804,0.816}}-1.4 & 0.055 & {\cellcolor[rgb]{0.984,0.98,0.992}}0.2 & 0.014 & {\cellcolor[rgb]{0.984,0.973,0.984}}0.2 & 0.151 \\
   & Lip.TG (58:9) & 0.042 & 0.538 & {\cellcolor[rgb]{0.984,0.906,0.918}}-0.4 & 0.068 & {\cellcolor[rgb]{0.984,0.961,0.973}}0.1 & 0.013 & {\cellcolor[rgb]{0.984,0.957,0.969}}0.0 & 0.41 \\
   & LPC (16:0) & 0.033 & 0.538 & {\cellcolor[rgb]{0.922,0.961,0.945}}4.2 & 0.028 & {\cellcolor[rgb]{0.988,0.988,1}}0.3 & 0.237 & {\cellcolor[rgb]{0.98,0.988,0.992}}0.9 & 0.009 \\
   & OS.HpH.LPA (18:0) & 0.044 & 0.538 & {\cellcolor[rgb]{0.988,0.988,1}}0.5 & 0.038 & {\cellcolor[rgb]{0.984,0.957,0.969}}0.0 & 0.178 & {\cellcolor[rgb]{0.984,0.965,0.976}}0.1 & 0.013 \\ \midrule
  \multirow{7}{*}{SCD} & Am.L-Serine & 0.002 & 0.285 & {\cellcolor[rgb]{0.914,0.961,0.937}}4.7 & $<$0.001 & {\cellcolor[rgb]{0.984,0.973,0.984}}0.2 & 0.115 & {\cellcolor[rgb]{0.98,0.839,0.851}}-1.1 & 0.003 \\
   & Am.L-Tryptophan & 0.002 & 0.285 & {\cellcolor[rgb]{0.929,0.965,0.949}}3.7 & 0.004 & {\cellcolor[rgb]{0.984,0.922,0.933}}-0.3 & 0.047 & {\cellcolor[rgb]{0.98,0.808,0.82}}-1.4 & 0.002 \\
   & Am.Glycine & 0.006 & 0.45 & {\cellcolor[rgb]{0.922,0.961,0.941}}4.3 & 0.001 & {\cellcolor[rgb]{0.988,0.988,1}}0.4 & 0.015 & {\cellcolor[rgb]{0.984,0.859,0.871}}-0.9 & 0.052 \\
   & Am.L-homoserine & 0.047 & 0.931 & {\cellcolor[rgb]{0.984,0.953,0.965}}0.0 & 0.001 & {\cellcolor[rgb]{0.984,0.953,0.965}}0.0 & 0.723 & {\cellcolor[rgb]{0.984,0.953,0.965}}0.0 & 0.014 \\
   & Am.Putrescine & 0.045 & 0.931 & {\cellcolor[rgb]{0.984,0.953,0.965}}0.0 & 0.969 & {\cellcolor[rgb]{0.984,0.953,0.965}}0.0 & 0.013 & {\cellcolor[rgb]{0.984,0.953,0.965}}0.0 & 0.927 \\
   & Lip.SM (d18:1/22:0) & 0.043 & 0.931 & {\cellcolor[rgb]{0.937,0.969,0.957}}3.3 & 0.002 & {\cellcolor[rgb]{0.984,0.984,0.996}}0.3 & 0.015 & {\cellcolor[rgb]{0.984,0.98,0.992}}0.3 & 0.448 \\
   & Lip.SM (d18:1/23:0) & 0.029 & 0.931 & {\cellcolor[rgb]{0.973,0.984,0.988}}1.2 & 0.01 & {\cellcolor[rgb]{0.984,0.969,0.98}}0.1 & 0.012 & {\cellcolor[rgb]{0.984,0.973,0.984}}0.2 & 0.277 \\ \bottomrule
\end{tabular}
\begin{tablenotes}
  \item[$\ast$] Not corrected for multiple testing.
  \item[] Am: Aminoacid, Lip: Lipid, DG: Diglyceride, LPA: Lysophosphatidic acid, LPC: Lysophosphatidylcholine, SM: Sphingomyelin, TG: Triglyceride, OA: Organic Acid, OS: Oxidative Stress compound, PAF: Platelet activating factor
\end{tablenotes}
\end{threeparttable}
\end{table}
\newpage
\subsection{Classification of ApoE4 and AD status}
All classification models had a multi-class ROC \acrshort{auc} above 0.8 and accuracies significantly higher than the No-Information Rate ($p<2.2\times10^{-16}$) (Table \ref{tab:class_results}) in predicting AD without ApoE4, AD with at least 1 ApoE4, SCD without ApoE4, SCD with at least 1 ApoE4. The worst-performing model was Multinomial Logistic Regression fitting the clinical data only, while the best-performing one was XGBoost fitting the clinical data and the 6 meta-metabolites (obtained from the Latent Factor Projection). Adding serum metabolite information (either the full 230-metabolite matrix or its 6-factor projection) seems to slightly increase the discriminatory power of the models.
Notably, looking at the individual ROC curves (Fig. \ref{roc:full}, \ref{roc:mlr6} and Appendix \ref{appendixA}), as well as the confusion matrices (Fig. \ref{cm:full} and \ref{cm:xgboost}), all models were able to discriminate better among certain classes (AD+E4 vs. SCD+E4, AD+E4 vs. SCD-E4, AD-E4 vs. SCD+E4 and AD-E4 vs. SCD-E4) compared to others (AD+E4 vs. AD-E4 and SCD+E4 vs. SCD-E4).
\begin{table}[H]
  \centering
\caption{Performance metrics of multi-class classification of ApoE4 and AD status, obtained from 10-fold CV repeated 100 times. The multi-class AUC is the average of the 6 one-vs-one classifications (see ROC curves (Fig. \ref{roc:full}, \ref{roc:xgb} and Appendix \ref{appendixA}) for the binary AUCs)} 
\label{tab:class_results}
\begin{threeparttable}
\begin{tabular}{clllll}\toprule
Model  & Fitting & AUC & Accuracy (95\% CI) & P(Acc.$\geq$NIR) & NIR \\ \midrule
\multirow{3}{*}{MLR} & Clinical features only & 0.814 & 0.4807 (0.4744, 0.4869) & $<2.2\times10^{-16}$ & \multirow{5}{*}{0.3522} \\
& Clinical features + 230 metabolites & 0.834 & 0.5753 (0.5692, 0.5815) & $<2.2\times10^{-16}$ &  \\
& \multirow{3}{*}{Clinical features + 6 latent factors} & 0.818 & 0.5313 (0.525, 0.5375) & $<2.2\times10^{-16}$ &  \\
Decision Tree & & 0.818 & 0.5082 (0.502, 0.5145) & $<2.2\times10^{-16}$ &  \\
XGBoost & & 0.836 & 0.5944 (0.5882, 0.6005) & $<2.2\times10^{-16}$ &  \\ \bottomrule
\end{tabular}
\begin{tablenotes}
  \item[]  AUC: multi-class Area Under the (ROC) Curve, MLR: Multinomial Logistic Regression, NIR: No-Information Rate
\end{tablenotes}
\end{threeparttable}
\end{table}
\begin{figure}[h]
  \includesvg[width=0.5\linewidth]{figures/mlr.svg}
  \caption{\label{roc:full}ROC curves of Multinomial Logistic Regression fitting the 230 metabolites on top of clinical background variables, obtained from repeated (100 times) 10-fold CV. }
\end{figure} \clearpage
\begin{figure}[h]
  \includesvg[width=0.5\linewidth]{figures/xgb.svg}
  \caption{\label{roc:xgb} ROC curves of \acrlong{xgb} fitting the 6 ML-estimated latent factors on top of the clinical background variables, obtained from repeated (100 times) 10-fold CV. }
  \end{figure} 
\begin{figure}[h]
  \includesvg[width=0.6\linewidth]{figures/cm_mlr.svg}
  \caption{Confusion matrix of the true (Target) and predicted (Prediction) classes of Multinomial Logistic Regression fitting the 230 metabolites on top of clinical background variables, obtained from repeated (100 times) 10-fold CV. Each tile displays the \textit{normalized} count (overall percentage) and the count of predicted classes in the middle. In the bottom of each tile the \textit{column normalized} count reflects the proportion of the prediction over all predictions per true class. On the right of the tile, the row percentage shows the fraction of the prediction over the rest of true classes. The diagonal represents the correctly predicted classes (True Positive and True Negative).}
  \label{cm:full}
\end{figure} \clearpage
\begin{figure}[ht]
    \includesvg[width=0.6\linewidth]{figures/cm_xgb.svg}
    \caption{Confusion matrix of the true (Target) and predicted (Prediction) classes of XGBoost fitting the 6 ML-estimated latent factors on top of the clinical background variables, obtained from repeated (100 times) 10-fold CV.}
  \label{cm:xgboost}
\end{figure}
\begin{table}[H] 
	\centering
	\caption{\label{tab:fimp} Features with the highest feature importance scores from a Multinomial Logistic Regression model fitting clinical background data and 230 metabolites to predict ApoE4 and AD status, obtained via repeated (100 times) 10-fold CV.}
	\begin{tabular}{clc}
		\toprule
      Class &	Feature & Overall (\%) \\ \midrule
    Amine & Putrescine            & 100.00 \\
    Ox. stress & HpH.Spha.1.P.C18.0    & 97.43  \\
    Organic acid & Uracil         & 91.78  \\
    Lipid & TG (56:0)             & 88.86  \\
    Organic acid & 3.Hydroxybutyric.acid & 87.29  \\
    Clinical & Cholesterol medication   & 84.54  \\
    Amine & Sarcosine             & 83.54  \\
    Lipid & PE.O. (38:5)          & 81.60  \\
    Ox. stress & HpH.LPA.C20.5    & 72.43  \\
    Amine & L-Tryptophan          & 71.34  \\
    Lipid & SM (d18:1/20:1)       & 70.52  \\
    Amino acid & Glutathione      & 69.96  \\
    Amino acid & L-Serine         & 69.49  \\
    Amino acid & L-Histidine      & 69.41  \\
    Ox. stress & HpH.LPA.C18.0    & 68.81  \\
    Amino acid & Histamine        & 67.99  \\
    Amino acid & L-Threonine      & 67.85  \\
    Amino acid & L-Glutamine      & 67.70  \\
    Lipid & PC (38:3)             & 65.30  \\
    Organic acid & Pyruvic.acid   & 64.96 \\ \bottomrule
	\end{tabular}
\end{table}
The most important metabolites in identifying the four ApoE4 and AD status phenotypes were putrescine, sphingosine 1-phosphate (18:0), uracil, triglyceride (56:0), 3-hydroxybutyric acid, sarcosine, phosphoethanolamine (38:5), LPA (20:5), L-tryptophan, sphingomyelin (d18:1/20:1), glutathione, L-serine, L-histidine, LPA (18:0), histamine, L-threonine, L-glutamine, phosphatidylcholine (38:3), see Table \ref{tab:fimp}. 

\subsection{Metabolite Network Analysis}
The Gaussian graphical modelling between ApoE4 carriers and non-carriers with AD unveiled distinct correlations between the metabolites (Fig. \ref{netAD}). The ApoE4 carriers' network was more sparse, having fewer edges compared to the non-carriers'. Moreover, the metabolites could be clearly separated in communities based on their covariances in ApoE4 carriers, as shown in Fig. \ref{comms}. The communities found contain similar compounds (phosphatidylcholines, triglycerides, sphingomyelins, prostaglandins) or molecules that share metabolic pathways (amines, amines-organic acids, aspartic-glutamic acid, oxidative stress compounds-lipids). The ApoE4 non-carriers' network appeared less cohesive, with only a few small aminic and organic acid communities.

The Wilcoxon Signed Rank tests revealed most network statistics to be larger in ApoE4 carriers compared to non-carriers (p-values $<$0.05) with AD (Table \ref{tab:netstats}). In the ApoE4 positive group, top central metabolites were amines (dopamine and citrulline) (Table \ref{tab:degrees}). ApoE4 non-carriers had 3-methoxytyrosine as top central metabolite, followed by the oxidative stress compound nitro fatty acid (C18:3).
\begin{table}[H]
  \caption{\label{tab:netstats} p-values of Wilcoxon Signed Rank test (alternative hypothesis: network statistics of ApoE4 carriers $>$ ApoE4 non-carriers)}
  \begin{tabular}{lcc}
  \toprule
  Statistic & p-value & FDR \\ \midrule
  Centrality                   & $8.459\times10^{-26}$ & $1.973\times10^{-25}$    \\
  Betweenness                  & $4.110\times10^{-18}$ & $5.754\times10^{-18}$    \\
  Eigenvector Centrality       & $1.366\times10^{-06}$ & $1.594\times10^{-06}$    \\
  Number of negative edges     & $8.837\times10^{-06}$ & $8.837\times10^{-06}$    \\
  Number of positive edges     & $2.712\times10^{-23}$ & $4.745\times10^{-23}$    \\
  Mutual Information           & $4.087\times10^{-30}$ & $1.603\times10^{-29}$    \\
  Variance                     & $4.582\times10^{-30}$ & $1.603\times10^{-29}$    \\ \bottomrule
  \end{tabular}
  \end{table}
\clearpage
\begin{table}[H]
  \caption{\label{tab:degrees} Top central metabolites in ApoE4 carriers and non-carriers with AD.}
  \begin{threeparttable}
  \begin{tabular}{lclc}\toprule
  \multicolumn{2}{c}{ApoE4 carriers} & \multicolumn{2}{c}{ApoE4 non-carriers} \\
  Metabolite & Degree & Metabolite & Degree \\ \midrule
  Am.Dopamine                    & 6      & Am.X3.Methoxytyrosine         & 5      \\
  Am.Citrulline                  & 5      & OS.LpH.NO2.OA                 & 4      \\
  Am.X3.Methoxytyramine          & 4      & Am.Cysteine                   & 3      \\
  Am.ADMA                        & 4      & Am.Dopamine                   & 3      \\
  Am.DL.3.aminoisobutyric.acid   & 4      & Am.Glutathione                & 3      \\
  Am.Ethanolamine                & 4      & Am.L.Aspartic.acid            & 3      \\
  Am.L.4.hydroxy.proline         & 4      & Am.X3.Methylhistidine         & 2      \\
  Am.L.Kynurenine                & 4      & Am.Hydroxylysine              & 2      \\
  Am.SDMA                        & 4      & Am.L.4.hydroxy.proline        & 2      \\
  OA.Glycolic.acid               & 4      & Am.L.carnosine                & 2      \\
  OA.3.Hydroxybutyric.acid       & 4      & Am.Methyldopa                 & 2      \\
  OA.Aspartic.acid               & 4      & OA.Citric.acid                & 2      \\
  OA.Glyceric.acid               & 4      & OA.Pyruvic.acid               & 2      \\
  Lip.TG.56.8.                   & 4      & OA.Aspartic.acid              & 2      \\
  Lip.SM.d18.1.20.1.             & 4      & OA.Iminodiacetate             & 2      \\
  OS.LpH.NO2.LA                  & 4      & OA.Uracil                     & 2      \\
  OS.HpH.PAF.C16.0               & 4      & Lip.TG.60.2.                  & 2      \\
  OS.HpH.cLPA.C18.2              & 4      & Lip.SM.d18.1.24.0.            & 2      \\
  Am.X3.Methoxytyrosine          & 3      & OS.LpH.NO2.LA                 & 2      \\
  Am.X3.Methylhistidine          & 3      & OS.LpH.NO2.aLA                & 2      \\
  Am.gamma.aminobutyric.acid     & 3      & OS.LpH.PGE2                   & 2      \\
  Am.Glycine                     & 3      & OS.cLpH.PGA2                  & 2      \\
  Am.L.2.aminoadipic.acid        & 3      & OS.cLpH.PGF2a                 & 2      \\
  Am.L.Arginine                  & 3      & Am.X1.Methylhistidine         & 1      \\
  Am.L.Aspartic.acid             & 3      & Am.Citrulline                 & 1      \\
  Am.L.homoserine                & 3      & Am.DL.3.aminoisobutyric.acid  & 1      \\
  Am.N6.N6.N6.Trimethyl.L.lysine & 3      & Am.Ethanolamine               & 1      \\
  Am.Sarcosine                   & 3      & Am.gamma.aminobutyric.acid    & 1      \\
  Am.Serotonine                  & 3      & Am.gamma.L.glutamyl.L.alanine & 1      \\
  OA.Lactic.acid                 & 3      & Am.Glycylglycine              & 1      \\
  OA.Malic.acid                  & 3      & Am.Histamine                  & 1      \\
  OA.2.ketoglutaric.acid         & 3      & Am.L.2.aminoadipic.acid       & 1      \\
  OA.Fumaric.acid                & 3      & Am.L.Alanine                  & 1      \\
  OA.Pyruvic.acid                & 3      & Am.L.Alpha.aminobutyric.acid  & 1      \\
  OA.Pyroglutamic.acid           & 3      & Am.L.Asparagine               & 1      \\
  OA.3.hydroxyisovaleric.acid    & 3      & Am.L.Glutamic.acid            & 1      \\
  OA.Uracil                      & 3      & Am.L.homoserine               & 1      \\
  Lip.TG.42.0.                   & 3      & Am.L.Kynurenine               & 1      \\ \bottomrule
  \end{tabular}
\begin{tablenotes}
\item[] ADMA: Asymmetric Dimethylarginine, Am: Aminoacid, Lip: Lipid, DG: Diglyceride, LPA: Lysophosphatidic acid, SM: Sphingomyelin, TG: Triglyceride, OA: Organic Acid, OS: Oxidative Stress compound, PAF: Platelet Activating Factor, PGA2-PGE2: isoprostanes, NO2(a)La: nitro fatty acid, NO2.OA: nitro organic acid, SDMA: Symmetric Dimethylarginine. \end{tablenotes}
\end{threeparttable}
  \end{table} \clearpage
\begin{figure}[!h]
  \centerline{\includesvg[width=1.2\textwidth]{figures/NetAD.svg}}
  \caption{\label{netAD} Metabolite precision network topologies among ApoE4 carriers (top-left), non-carriers (top-right) and differential edge network (bottom) in AD.}  
  \end{figure}
\begin{figure}
  \includesvg[width=0.7\textwidth]{figures/comms.svg}
  \caption{\label{comms} Communities of (densely correlated) metabolites in ApoE4 carriers (top) and ApoE4 non-carriers with AD (bottom) representing functional metabolic groups. The metabolome of ApoE4 carriers can be separated in communities, while those of ApoE4 non-carriers are less cohesive.}
\end{figure}

\clearpage
\section{Discussion} \label{discuss}
This study had two objectives: to unveil mechanistic links between systemic metabolism and ApoE4 (number of alleles or status) in AD, and to propose a comprehensive omics data analysis methodology. To this end, three serum metabolic signatures were assessed among SCD and AD patients: individual metabolite levels, multi-class classification and network analysis. Individual metabolites were highlighted whose levels were shifted by ApoE4 dose. The multi-class classification revealed metabolites that, on top of clinical background variables, can be fitted to predict ApoE4 and AD status. Metabolite network analysis unravelled metabolic interdependencies among ApoE4 carriers and non-carriers in AD. The serum metabolites reported here might serve as a \textit{proxy} of metabolic perturbations in the brain. The metabolites were measured in serum obtained from peripheral blood, an (almost) non-invasive alternative to spinal aspiration for \acrshort{csf}.

\subsection{Mechanistic links between ApoE4 dose and serum metabolome in AD}
The metabolites that emerged from the analysis were generally different among the approaches, and among AD-SCD. Nevertheless, certain metabolites appeared in all, or many of the metabolic signatures. The differential expression signature -as expected- revealed increased triglycerides and diglycerides, while the classification and network analysis -strikingly- presented a more pronounced aminic and organic acid signature.

A metabolite that appeared in all three metabolic signatures was putrescine: it showed trends of positive ApoE4 dose effects in AD, was the most important feature in determining ApoE4 and AD status, and was a central metabolite (5 dependent metabolites regardless of ApoE4 status) in AD. Putrescine appeared increased in a recent study \cite{Ju2021AstrocyticUC}, where it is described as a toxic by-product of activated urea cycle in A$\beta$-reactive astrocytes of AD brains \cite*{Ju2021AstrocyticUC,Wong2022PathogenicP}.

Trends of decreased putrescine and four amino acids with increasing ApoE4 presence were observed in the SCD group: L-serine, L-tryptophan, glycine and L-homoserine. Tryptophan and serine proved also important in predicting ApoE4 and AD status. In the brain, tryptophan is catabolized to kynurenine by the enzymes indoleamine 2,3-dioxygenase and tryptophan 2,3-dioxygenase, through the kynurenine pathway \cite*{Liang2022KynureninePM}. L-tryptophan levels were positively correlated with no ApoE4 in this study, and negatively correlated with 1 or 2 ApoE4 alleles. This could be potentially explained by increased catabolism via the kynurenine pathway. In this study, L-kynurenine was found central in the serum metabolome of ApoE4 carriers with AD, branched with five  metabolites. Recent reviews discussed kynurenine pathway metabolites to be closely associated with AD pathogenesis \cite{Liang2022KynureninePM,Sharma2022KynurenineMA}. Increased kynurenine pathway metabolites may reflect the extent of neuroinflammation in amyotrophic lateral sclerosis, frontotemporal dementia and early onset AD \cite*{Heylen2023BrainKP}. A whole-blood targeted metabolomics panel also revealed increased kynurenine pathway metabolites \cite{Teruya2021WholebloodMO}. Conversely, L-serine levels appear to be deficient in AD brains due to impaired glycolysis \cite{LeDouce2020ImpairmentOG}. 

Ketoglutarate exhibited trends of decreased levels with increasing ApoE4 presence in the global test and the ANCOVA F-tests exclusively in the AD group. The metabolite network analysis revealed it is also a central metabolite, driving the variance of six other metabolites. In the brain, the $\alpha$-ketoglutarate dehydrogenase complex is perceived as a ``hub of plasticity in neurodegeneration and regeneration'', as it reflects the impaired glucose metabolism and key enzymes of the tricarboxylic acid cycle \cite*{Hansen2022TheD}.

\subsubsection{Are there ApoE4 dose effects on serum metabolite levels in AD?}
The global test for ApoE4 dose effects on serum metabolite levels revealed positive dose effects on lipids in AD, mainly triglycerides and diglycerides. This is consistent with evidence from Caucasian \cite{Maxwell2013APOEMT,CARVALHOWELLS20121447,Bernath2020SerumTI} and southern Chinese populations \cite{Gan2022EffectsPopulation} that shows elevated triglycerides associated with ApoE4. The same test was not significant in the SCD group; mostly amines and amino acids showed trends of an effect. This could putatively reflect compensatory mechanisms still in place for the debilitating effects of ApoE4.

The nested linear model comparison did not reveal any significant ApoE4 dose effects after FDR correction. Even though the methods are fundamentally different, this can be partially explained considering the covariates in each approach. The global test was corrected for sex only, while the nested models fitted several background clinical factors. That is, the metabolic variance explained by ApoE4 dose in the global test might be partially attributed to other clinical factors in the nested models, such as hypercholesterolemia. Notably, the metabolites showing trends of an ApoE4 dose effect were different between the AD and SCD group. Consistent with the global test, the AD group presented mainly a lipid (tri- and diglycerides) signature, while the SCD group presented an amino one (serine, tryptophan, glycine, homoserine and putrescine).

\subsubsection{What is the (added) classification potential of serum metabolites in predicting ApoE4 and AD
status?}
Serum metabolic information, in the form of a semi-targeted metabolomic panel or its latent 6-factor projection seems to slightly increase the classification performance of the models. The added performance, however, is minor. As expected, \acrshort{xgb} on clinical data and 6 meta-metabolites performed marginally better than penalized multinomial logistic regression on the clinical data and 230 metabolites, at the cost, however, of a significant loss of interpretability. Interestingly, all models discriminated better among certain classes (AD+E4 vs. SCD+E4, AD+E4 vs. SCD, AD-E4 vs. SCD+E4 and AD-E4 vs. SCD-E4) compared to others (AD+E4 vs. AD-E4 and SCD+E4 vs. SCD-E4). This can be putatively attributed to the class imbalance between AD+E4 (n = 34) and AD (n = 86), as well as SCD+E4 (n = 87) and SCD (n = 40). It might also show that the metabolic nuances among ApoE4 carriers and non-carriers are not as pronounced as among AD and SCD.

Each of the top four metabolites in delineating ApoE4 and AD status belonged to one of the major metabolite classes measured: amines (putrescine), oxidative stress compounds (sphingosine-1-phosphate (C18:0)), organic acids (uracil) and lipids (TG (56:0)). Perturbations in sphingolipid metabolism are observed in AD \cite{mielke2010alterations}, even in the stage of MCI \cite{den2023sphingolipids}. \Citeauthor{den2023sphingolipids} reported increased sphingosine-1-phosphate in CSF and plasma of homozygous ApoE4 carriers \cite{den2023sphingolipids}. Another recent study highlights sphingolipids and especially sphingosine-1-phosphate as diagnostic markers for AD \cite{d2022sphingolipid}.

\subsubsection{How do the metabolite network topologies differ between ApoE4 carriers and non-carriers?}
The GGMs revealed largely different metabolite network topologies between ApoE4 carriers and non-carriers. The metabolite network of ApoE4 carriers was less cohesive compared to non-carriers', in line with \citeauthor{deLeeuw2017Blood-basedDisease}'s findings. Further, the metabolite communities of the former include either molecules of the same biochemical subclass (e.g. phosphatidylcholines, sphingomyelins) or molecules sharing metabolic pathways (amines-organic acids, lipids-oxidative stress compounds) and might reflect functional groups. All the calculated metabolite network statistics were significantly larger in ApoE4 carriers compared to non-carriers. Amines are the top central metabolites in ApoE4 carriers, followed by organic acids. Dopamine was the most central metabolite in ApoE4 carriers. In this regard, evidence shows decreased dopaminergic neurotransmitters in AD \cite{shaikh2023targeting,pan2019dopamine}.

\subsection{Strengths and Limitations} This analytical study was based on -and thus enabled by- uniquely detailed datasets in AD research. The $n = 247$ subjects included in this study were screened for AD biomarkers in CSF, possible comorbidities, as well as several other risk factors. The clinical data alone were able to predict ApoE4 and AD status with an AUC of 81\%. In this regard, another strength of this study is the comprehensive and efficient methodology it presents for -omics data analysis. The employed statistical methods here comprise a state-of-the-art analytical framework for metabolomics (or any -omics) data, to extract insights or complement diagnostic protocols. An innovation of this study is the use of a four-class target (outcome, response) in classification models, instead of a binary one (AD yes or no). To reach a high multi-class AUC, all the binary AUCs should be large. In this sense, the multi-class classification models predicting ApoE4 and AD status reached similar AUCs, either fitting the 230 metabolites (in multinomial logistic regression) or six meta-metabolites accounting for $\sim$30\% of their variance (in \acrlong{xgb}).

AD is a complex and multifaceted disease with several proposed pathophysiological mechanisms. Hence, focusing on links between ApoE4 (status or dose) and serum metabolome in AD provides a limited view of the underlying pathologies at brain level. In this sense, gaining insights in ApoE genotype effects via an untargeted metabolomics panel might prove more informative. Nevertheless, the relative genotype counts did not allow further analysis without grouping. Further, the metabolites were measured in blood -rather than CSF- which only reveals metabolic perturbations at a systemic level. Another limitation is the fact that the metabolomics data were semi-targeted, thus several potentially important metabolites were not measured. Additionally, the significantly increased tri- and diglycerides in the global test might be attributed to hypertriglyceridemia, which often coexists with hypercholesterolemia and was not measured in this study. Moreover, it is important to note that the SCD group in the data is not a \textit{control} group, but it represented subjects with varying degrees of subjective cognitive impairment, who might or might not develop AD or another neurodegenerative disease at later stages. Lastly, another limitation of the study is that the blood samples were obtained without prior fasting.

\subsection{Future directions}
The metabolites reported in this study (putrescine and sphingosine 1-phosphate) can be targeted in cohort studies to measure their levels in CSF and blood, throughout the AD continuum. In an effort to further elucidate molecular mechanisms implicated in AD development, further research should integrate other -omics data, over time. Such studies might provide additional stage-dependent AD biomarkers that can then be monitored in large-scale cohorts. Regarding the ApoE4 lipoprotein, it could be insightful to measure its levels in CSF and serum in different stages of AD, while considering other risk factors that modify the increased LOAD risk (ancestry, sex, or other inherited gene variants). Moreover, to allow further assessment of its effects, it is proposed to stratify per ApoE4 gene dose, status, or ApoE genotype during participant selection and sampling. Lastly, it could be insightful for disease predicting models to integrate as many of the reported AD-related biomarkers as possible. In this fashion, their relative diagnostic potential can be assessed.

\subsubsection{Statistical Analysis}Multi-class classification models in AI-assisted differential diagnosis support should be preferred to binary, as they can predict a plurality of disease outcomes and better reflect reality. The R package \textsf{globaltest} offers a simple but robust approach to test for nuances in metabolite levels (or any biological high-dimensional data) attributed to a (multi-class) clinical feature -e.g. number of ApoE4 alleles. Even though \textsf{caret} is not the newest package for classification and regression pipelines in R, it is very simple, intuitive, well documented and maintained and is thus recommended for statistical learning research.
\clearpage
\section{Conclusion} \label{concl}
The serum metabolome is reported shifted by ApoE4 presence or dose. The multi-class classification of ApoE4 status and AD, as well as the metabolite network analysis provide additional metabolic signatures in the periphery. Tri- and diglycerides seem to be increased with increasing number of ApoE4 alleles in AD. The added classification potential of serum metabolome in identifying ApoE4 and AD status is small, but significant. Nevertheless, putrescine was the top metabolite in predicting ApoE4 and AD status, and thus might be worthwhile to assess in CSF at different stages of AD progression. Finally, the metabolite network analysis revealed distinct constellations of metabolites between ApoE4 carriers and non-carriers: metabolites of ApoE4 carriers could be grouped in communities of correlated metabolites, while those of ApoE4 non-carriers could not. Dopamine seems to be the most central metabolite in ApoE4 carriers with AD. Nevertheless, further investigation is needed to validate these findings, both in CSF and \textit{post-mortem} brain tissues of AD patients, as well as in age-matched healthy controls having ApoE4 allelic presence, to serve as a control group.
%--------------- Main matter ----------------------------------------
%--------------------------------------------------------------------


%--------------------------------------------------------------------
%--------------- References -----------------------------------------
\newpage
\section*{References}
\printbibliography[heading=none]
\clearpage
%--------------- References -----------------------------------------
%--------------------------------------------------------------------
\appendix 
\clearpage
\section{Code and Results} \label{code}
An HTML document with the code and the output of the analysis can be found at \href{https://gmiliarakis.github.io}{gmiliarakis.github.io}.
\section{Multiclass ROC curves} \label{appendixA}
\begin{figure}[htb]
  \includesvg[width=.6\textwidth]{figures/bench.svg}
  \caption{ROC curves of benchmark model: Multinomial Logistic Regression fitting the clinical background features only, obtained from repeated (100 times) 10-fold CV.}
  \label{roc:bench}
\end{figure}
\begin{figure}[htb]
\includesvg[width=.6\textwidth]{figures/mlr6.svg}
\caption{ROC curves of \acrlong*{mnl} fitting the 6 ML-estimated latent factors on top of the clinical background variables, obtained from repeated (100 times) 10-fold CV.}
\label{roc:mlr6}
\end{figure}
\newpage
\begin{figure}[htb]
\includesvg[width=.6\textwidth]{figures/tree.svg}
\caption{ROC curves of Decision Tree fitting the 6 ML-estimated latent factors on top of the clinical background variables, obtained from repeated (100 times) 10-fold CV.}
\label{roc:tree}
\end{figure}

\clearpage  
\section{R Session Information} \label{appendixB}

\begin{verbatim}
R version 4.3.2 (2023-10-31)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.2.1

Matrix products: default
BLAS:   .../vecLib.framework/Versions/A/libBLAS.dylib 
LAPACK: LAPACK version 3.11.0

locale:
[1] en_US.UTF-8

time zone: Europe/Amsterdam
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices datasets  utils     methods   base     

other attached packages:
 [1] rags2ridges_2.2.7       nnet_7.3-19             DMwR2_0.0.2                       
 [4] dplyr_1.1.4             caret_6.0-94            lattice_0.21-9                    
 [7] globaltest_5.56.0       survival_3.5-7          heatmaply_1.5.0               
[10] xgboost_1.7.6.1         pROC_1.18.5             FMradio_1.1.1       
[13] future_1.33.0           ggthemes_5.0.0          corpcor_1.6.10
[16] viridisLite_0.4.2       plotly_4.10.3           ggplot2_3.4.4
[19] rpart_4.1.21            furrr_0.3.1             viridis_0.6.4 
[22] e1071_1.7-14            cvms_1.6.0 

loaded via a namespace (and not attached):
 [1] splines_4.3.2           bitops_1.0-7            tibble_3.2.1                              
 [4] XML_3.99-0.16           lifecycle_1.0.4         globals_0.16.2                            
 [7] magrittr_2.0.3          Hmisc_5.1-1             rmarkdown_2.25                        
[10] lubridate_1.9.3         zlibbioc_1.48.0         sfsmisc_1.1-16                                 
[13] RCurl_1.98-1.13         ipred_0.9-14            lava_1.7.3                               
[16] S4Vectors_0.40.2        listenv_0.9.0           gRbase_2.0.1                                
[19] codetools_0.2-19        tidyselect_1.2.0        TSP_1.2-4                                             
[22] jsonlite_1.8.8          Formula_1.2-5           iterators_1.0.14                     
[25] snowfall_1.84-6.3       Rcpp_1.0.11             glue_1.6.2                           
[28] tufte_0.13              TTR_0.24.4              GenomeInfoDb_1.38.2         
[31] fastmap_1.1.1           fansi_1.0.6             digest_0.6.33                  
[34] RSQLite_2.3.4           utf8_1.2.4              tidyr_1.3.0                  
[37] recipes_1.0.9           class_7.3-22            httr_1.4.7                      
[40] gtable_0.3.4            timeDate_4032.109       blob_1.2.4                     
[43] RBGL_1.78.0             GSEABase_1.64.0         scales_1.3.0                        
[46] knitr_1.45              rstudioapi_0.15.0       tzdb_0.4.0                         
[49] curl_5.2.0              proxy_0.4-27            cachem_1.0.8                     
[52] foreign_0.8-85          AnnotationDbi_1.64.1    pillar_1.9.0                        
[55] VGAM_1.1-9              xtable_1.8-4            cluster_2.1.4                       
 [58] cli_3.6.2               compiler_4.3.2          rlang_1.1.2                     
 [61] plyr_1.8.9              stringi_1.8.3           assertthat_0.2.1               
 [64] Matrix_1.6-1.1          hms_1.1.3               bit64_4.0.5                      
 [67] quantmod_0.4.25         bit_4.0.5               hardhat_1.3.0 
 [70] graph_1.80.0            xts_0.13.1              MASS_7.3-60 
 [73] dendextend_1.17.1       backports_1.4.1         yaml_2.3.8   
 [76] DBI_1.1.3               RColorBrewer_1.1-3      expm_0.999-8   
 [79] purrr_1.0.2             BiocGenerics_0.48.1     seriation_1.5.4 
 [82] IRanges_2.36.0          RSpectra_0.16-1         GenomeInfoDbData_1.2.11
 [85] annotate_1.80.0         parallelly_1.36.0       stats4_4.3.2 
 [88] foreach_1.5.2           tools_4.3.2             snow_0.4-4
 [91] prodlim_2023.08.28      gridExtra_2.3           xfun_0.41 
 [94] ca_0.71.1               withr_2.5.2             BiocManager_1.30.22
 [97] timechange_0.2.0        R6_2.5.1                colorspace_2.1-0
[100] generics_0.1.3          renv_1.0.3              data.table_1.14.10
[103] htmlwidgets_1.6.4       ModelMetrics_1.2.2.2    pkgconfig_2.0.3
[106] registry_0.5-1          XVector_0.42.0          htmltools_0.5.7
[109] Biobase_2.62.0          png_0.1-8               gower_1.0.1
[112] reshape2_1.4.4          checkmate_2.3.1         nlme_3.1-163
[115] zoo_1.8-12              stringr_1.5.1           parallel_4.3.2
[118] grid_4.3.2              reshape_0.8.9           vctrs_0.6.5
[121] htmlTable_2.4.2         evaluate_0.23           readr_2.1.4
[124] crayon_1.5.2            future.apply_1.11.0     fdrtool_1.2.17 
[127] munsell_0.5.0           Biostrings_2.70.1       lazyeval_0.2.2 
[130] KEGGREST_1.42.0         igraph_1.6.0            memoise_2.0.1 
[133] base64enc_0.1-3         webshot_0.5.5
\end{verbatim}


\end{document}
% This is a template for a thesis at Biometris created by C.F.W. Peeters in 2021