m_DAGs_similarity.qmd

```{r  include=FALSE,message=FALSE}
knitr::opts_chunk$set(echo = TRUE,
                      cache=TRUE,
                      warning = FALSE,
                      message = FALSE, out.width = "100%",cache.lazy=FALSE)
pdf="TRUE"

library(tidyverse)
library(igraph)
library(ComplexHeatmap)
library(viridis)
library(circlize)
library(plotly)
library(randomcoloR)
library(factoextra)
library(RColorBrewer)
library(kableExtra)
library(igraph)
library(GGally)
```

```{r}
load(file='metadag_work_space.RData')
```

# m-DAGs similarities and Metadata

First, we will load the metadata and adjust them to match the structure of the similarities. This will facilitate the creation of graphs and statistics.

Keep in mind the path of the experiment:

```{r}
experiment=
  "0a845f74-826e-3b46-aed9-e7ecf74db262/"
path_exp=paste0("data/",experiment)
```

## MSA & Munkres similarities

In this section, we will present the similarities between m-DAGs considering the two similarity meausures described in the paper. Namely, the MSA and Munkres similarities.

The experimental data set consists of `r nrow(meta_taxo)` Eukaryotes from the animal, plant, fungus, and protists kingdoms.

```{r, echo=FALSE}
knitr::kable(table(meta_taxo$Kingdom),col.names = c("Kingdom","Abs. Freq."))
```

The similarity values are provided in the following files:

```{r}
list_Sim=dir(path_exp,pattern="^Similarities")
list_Sim
```

Load the m-DAGs similarities

```{r}
n=884# no synthetic mDGAs
Sim_MSA_mDAG=read_csv(paste0(path_exp,
                             "Similarities_mDAG_MSAMethod.csv"))
Sim_MSA_mDAG=as.matrix(Sim_MSA_mDAG[,-1])
rownames(Sim_MSA_mDAG)=colnames(Sim_MSA_mDAG)
Sim_MSA_mDAG=Sim_MSA_mDAG[meta_taxo$mDAG_Id[1:n],
                          meta_taxo$mDAG_Id[1:n]]
```

```{r}
Sim_Mun_mDAG=read_csv(paste0(path_exp,"Similarities_mDAG_MunkresMethod.csv"))
Sim_Mun_mDAG=as.matrix(Sim_Mun_mDAG[,-1])
rownames(Sim_Mun_mDAG)=colnames(Sim_Mun_mDAG)
Sim_Mun_mDAG=Sim_Mun_mDAG[meta_taxo$mDAG_Id[1:n],meta_taxo$mDAG_Id[1:n]]
```

## Heatmaps

Here, we provide examples of heatmaps to visualize the similarities betweem m-DAGs. We again consider colors to represent the different Kingdoms.

```{r, fig.align='center', fig.height=5, cache=TRUE}
dff<-meta_taxo[1:884,] %>% select(Kingdom)  %>% as.data.frame()
colorsK <- list(Kingdom= c("Animals"="red",
                           "Plants"="green",
                           "Fungi"="yellow",
                           "Protists"="black"))
annotationK <- HeatmapAnnotation(df=dff, col = colorsK,show_legend = TRUE)

MSA_heat_1 <- Heatmap(matrix = Sim_MSA_mDAG, 
                      column_title=
                        "m-DAGs MSA-similarity Eukaryotes by Kingdoms",
                      heatmap_legend_param=list(
                        title="Similarity",
                        at = seq(0,1,by=0.1)),
                      col=rev(viridis(256)),
                      cluster_rows = FALSE,
                      cluster_columns = FALSE,
                      top_annotation = annotationK,
                      show_column_names = FALSE, 
                      show_row_names = FALSE,
                      left_annotation =
                        rowAnnotation(df = dff,
                                      col = colorsK,
                                    show_annotation_name=FALSE,
                                    show_legend=FALSE
                                      ))


Mun_heat_1 <- Heatmap(matrix = Sim_Mun_mDAG, 
             column_title="m-DAGs Munkres-similarity  Eukaryotes by Kingdoms",
            name = "Munkres Similarity",
            heatmap_legend_param=list(
                        title="Similarity",
                        at = seq(0,1,by=0.1)),
                      col=rev(viridis(256)),
                      cluster_rows = FALSE,
                      cluster_columns = FALSE,
                      top_annotation = annotationK,
                      show_column_names = FALSE, 
                      show_row_names = FALSE,
                      left_annotation =
                        rowAnnotation(df = dff,
                                      col = colorsK,
                                    show_annotation_name=FALSE,
                                    show_legend=FALSE                                                                        ))
```

```{r, fig.align='center', fig.height=5, cache=TRUE}
##  Animals by phylum

meta_animals  = meta_taxo %>% filter(Kingdom=="Animals")
namesP=names(rev(sort(table(meta_animals$Phylum))))
namesP
dff=data.frame(Phylum=meta_animals$Phylum)
Phylum=ordered(meta_animals$Phylum,levels=namesP)
numbersP=paste(c(paste0(0,1:9),10:14),namesP,sep="-")
levels(Phylum)=numbersP
dff$Phylum=Phylum
col=rainbow(length(namesP))

colorsP=list(Phylum=col)
names(colorsP$Phylum)=numbersP

annot <- HeatmapAnnotation(df = dff, 
                               col = colorsP,
                               annotation_name_side = "left",
                               show_annotation_name=TRUE )

MSA_heat_2 <-  Heatmap(
  matrix = Sim_MSA_mDAG[meta_animals$mDAG_Id,
                        meta_animals$mDAG_Id],
  name = "MSA similarity",
  column_title = "m-DAGs MSA-similarity  Animals by Phyla",
  col = rev(viridis(256)),
  cluster_rows = FALSE,
  show_heatmap_legend = FALSE,
  cluster_columns = FALSE,
  top_annotation = annot,
  show_column_names = FALSE,
  show_row_names = FALSE,
  left_annotation =
    rowAnnotation(
      df = dff,
      col = colorsP,
      show_annotation_name = FALSE
    )
)


Mun_heat_2 <- Heatmap(
  matrix = Sim_Mun_mDAG[meta_animals$mDAG_Id,
                        meta_animals$mDAG_Id],
  column_title = "m-DAGs Munkres-similarity  Animals by Phyla",
 col = rev(viridis(256)),
  cluster_rows = FALSE,
  show_heatmap_legend = FALSE,
  cluster_columns = FALSE,
  top_annotation = annot,
  show_column_names = FALSE,
  show_row_names = FALSE,
  left_annotation =
    rowAnnotation(
      df = dff,
      col = colorsP,
      show_annotation_name = FALSE
    )
)

```

```{r heatmaps}
draw(MSA_heat_1,merge_legend=TRUE)
draw(MSA_heat_2,merge_legend=TRUE)
draw(Mun_heat_1,merge_legend=TRUE)
draw(Mun_heat_2,merge_legend=TRUE)
```

```{r echo=FALSE, include=FALSE,eval=FALSE}
pdf("data_appendix/MSA_kingdoms.pdf",width=5,height=5)
draw(MSA_heat_1,merge_legend=TRUE)
dev.off()
pdf("data_appendix/MSA_phyla.pdf")
draw(MSA_heat_2,merge_legend=TRUE)
dev.off()
pdf("data_appendix/Mun_kingdoms.pdf")
draw(Mun_heat_1,merge_legend=TRUE)
dev.off()
pdf("data_appendix/Mun_phyla.pdf")
draw(Mun_heat_2,merge_legend=TRUE)
dev.off()
```

## MDS (Multidimensional Scaling) MSA & Munkres similarities

Multi-dimensional Scaling (MDS) is a classic multivariate data analysis technique that allows for obtaining a low-dimensional representation of the observed similarities. First, we transform each similarity measure into a distance measure as follows: let $s_{ij}$ be a similarity measure between a pair $i,j$, we define its distance measure as $d_{ij}=\sqrt{1-s_{ij}^2}$.

The following is the MDS for the MSA distance:

```{r, fig.align='center', fig.height=5, cache=TRUE}
## Metric multidimensional scaling (mMDS)
mds7 <- cmdscale(sqrt(1-Sim_MSA_mDAG^2),k=7,eig=TRUE)
mds7$GOF
mds <- mds7$points %>%  as_tibble()
colnames(mds) <-paste0("Dim.",1:dim(mds7$points)[2])


cooordinates=as_tibble(mds7$points)
colnames(cooordinates)=paste("Component",1:7)
ggpairs(cooordinates,columns=1:4,
        aes(color=meta_taxo$Kingdom[1:884],
            title="MDS 4 dimensions projection",legend=1),
        lower=list(continuous="points")) + 
  scale_fill_manual(values = colorsK$Kingdom) + 
  theme(legend.position = "left")
```

The following is the MDS for the Munkres distance:

```{r, fig.align='center', fig.height=5}
## Metric multidimensional scaling
mds7 <- cmdscale(sqrt(1-Sim_Mun_mDAG^2),k=7,eig=TRUE)
mds7$GOF
mds <- mds7$points %>%  as_tibble()
colnames(mds) <-paste0("Dim.",1:dim(mds7$points)[2])

cooordinates=as_tibble(mds7$points)
colnames(cooordinates)=paste("Component",1:7)
ggpairs(cooordinates,columns=1:4,
        aes(color=meta_taxo$Kingdom[1:884],
            title="MDS 4 dimensions projection",legend=1),
        lower=list(continuous="points")) + 
  scale_fill_manual(values = colorsK$Kingdom) + 
  theme(legend.position = "left")
```

## Hierarchical clustering MSA similarity

Through hierarchical clustering using the Ward method, we have derived a partition of the m-DAGs into 4, 5, and 6 clusters, respectively. The corresponding information has been organized into a table, as follows:

```{r}
D=as.dist(sqrt(1-Sim_MSA_mDAG^2))
hc_MSA=hclust(as.dist(D),method ="ward.D")
clust4_MSA=cutree(hc_MSA,4)
table(clust4_MSA,meta_taxo$Kingdom[1:884])
```

```{r}
clust5_MSA=cutree(hc_MSA,5)
table(clust5_MSA,meta_taxo$Kingdom[1:884])
```

```{r}
clust6_MSA=cutree(hc_MSA,6)
table(clust6_MSA,meta_taxo$Kingdom[1:884])
```

We can also create a table that correlates the clusters (in this case, two clusters) with the Phylum classification:

```{r}
aux=meta_taxo[1:884,] %>%
  select(Organism,Kingdom,Phylum,Class,Full_Name)
aux$clust4_MSA=clust4_MSA
aux_Animals_cluster_1_2 = aux %>%
  filter(Kingdom=="Animals",clust4_MSA %in% c(1,2))

table(aux_Animals_cluster_1_2$Phylum,aux_Animals_cluster_1_2$clust4_MSA)
```

We can retrieve the information of the elements belonging to a specific classification (Animals and Plants) that are part of a particular cluster as follows:

```{r}
aux_7_Animals_cluster_3= filter(aux,
                                clust4_MSA==3,
                                Kingdom=="Animals")
aux_7_Animals_cluster_3

aux_14_Plants_clust_3= filter(aux,clust4_MSA==3,
                             Kingdom=="Plants")
aux_14_Plants_clust_3
```

We can retrieve the information of the elements from a specific Phylum or Class, and the cluster they belong to, as follows:

```{r}
aux_all_Nematodes_Flatworns= aux %>% 
  filter(Kingdom=="Animals",
         Phylum %in% c("Nematodes","Flatworms"))
aux_all_Nematodes_Flatworns
```

The class Algae are all in the same cluster:

```{r}
aux_all_algae_class= aux %>% 
  filter(Kingdom=="Plants",
         Class %in% c("algae"))
aux_all_algae_class
```

## Hierarchical clustering Munkres similarity

Analogous to the MSA similarity we obtain a classification of the m-DAGs into different clusters and retrieve the cluster's information as follows:

```{r}
D=as.dist(sqrt(1-Sim_Mun_mDAG^2))
hc_Mun=hclust(as.dist(D),method ="ward.D")
```

```{r}
clust4_Mun=cutree(hc_Mun,4)
table(clust4_Mun,meta_taxo$Kingdom[1:884])
```

```{r}
aux=meta_taxo[1:884,] %>%
  select(Organism,Kingdom,Phylum,Class,Full_Name)
aux$clust4_Mun=clust4_Mun
aux_Animals_cluster_1_2_Mun = aux %>%
  filter(Kingdom=="Animals",clust4_Mun %in% c(1,2))

table(aux_Animals_cluster_1_2_Mun$Phylum,
      aux_Animals_cluster_1_2_Mun$clust4_Mun)

aux_7_Animals_cluster_3_Mun= filter(aux,
                                clust4_Mun==3,
                                Kingdom=="Animals")
aux_7_Animals_cluster_3_Mun
aux_all_Nematodes_Flatworns= aux %>% 
  filter(Kingdom=="Animals",
         Phylum %in% c("Nematodes","Flatworms"))
aux_all_Nematodes_Flatworns

aux_14_Plants_clust2_Mun= filter(aux,clust4_Mun==3,
                             Kingdom=="Plants")
aux_14_Plants_clust2_Mun
aux_all_algae_class= aux %>% 
  filter(Kingdom=="Plants",
         Class %in% c("algae"))
aux_all_algae_class
```

## Comparison between MSA and Munkres similarities

In order to compare the two similarities we consider the Spearman and Pearson correlation. First, we load the similarities for every pair of m-DAG and each similarity measure.

```{r}
n=length(meta_taxo$mDAG_Id[1:884])
n
dim(Sim_MSA_mDAG)

aux=as_tibble(Sim_MSA_mDAG)
aux$mDag=names(aux)
aux=aux %>% pivot_longer(cols=`0313`:`0300`,
                         names_to="mDag_2",
                         values_to="Sim_MSA")

aux_2= aux %>%  mutate(i=pmax(as.integer(mDag),
                              as.integer(mDag_2)),
                       j=pmin(as.integer(mDag),
                       as.integer(mDag_2))) %>% unite("ij",i:j) %>%
  filter(duplicated(ij))


aux=as_tibble(Sim_Mun_mDAG)
aux$mDag=names(aux)
aux=aux %>% pivot_longer(cols=`0313`:`0300`,
                         names_to="mDag_2",
                         values_to="Sim_Mun")
aux_2 = aux_2 %>% left_join(aux)

Sim_comp=aux_2
rm(aux,aux_2)
```

Next we obtain the Spearman and Pearson correlations as follows:

```{r}
cor(Sim_comp[,c(3,5)],method="spearman")
cor(Sim_comp[,c(3,5)],method="pearson")
```

```{r}
GGally::ggpairs(Sim_comp[,c(3,5)])
```

```{r}
sim_pairs= Sim_comp%>% pivot_longer(
  cols=c(Sim_MSA,Sim_Mun),
  names_to="Method",
  values_to="Similarity")

ggstatsplot::ggbetweenstats(
  data = sim_pairs,
  x = Method,
  y = Similarity,
  centrality.plotting=TRUE)
```

```{r}
library(hrbrthemes)
library(viridis)
sim_pairs %>%
  ggplot( aes(x=Method, y=Similarity, fill=Method)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha=0.6) +
  theme(legend.position="none")+
  ggtitle("Boxplot diagram of the similarities between the two aproaches") 
```

Basic statistics similarity

```{r}
sim_pairs %>% group_by(Method) %>% 
  summarise(N=n(),
          mean=mean(Similarity),
          sd=sd(Similarity),
          Q1=quantile(Similarity,0.25),
          median=quantile(Similarity,0.5),
          Q3=quantile(Similarity,0.75))
```

```{r pasar_load, include=FALSE,cache=FALSE}
save.image(file='metadag_work_space.RData')
```