Skip to content
This repository has been archived by the owner on Apr 16, 2024. It is now read-only.

Commit

Permalink
Merge pull request #72 from biotaphy/version-2.0.0
Browse files Browse the repository at this point in the history
Version 2.0.0
  • Loading branch information
cjgrady authored Nov 22, 2019
2 parents 7b400de + 969a929 commit 8172665
Show file tree
Hide file tree
Showing 17 changed files with 414 additions and 315 deletions.
196 changes: 196 additions & 0 deletions analyses/phylo_beta_diversity/phylo_beta_diversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np
import itertools as it # new dependency.
import dendropy
import random

from lmpy import Matrix, TreeWrapper

Expand Down Expand Up @@ -39,13 +40,23 @@ def pdnew(pam, tree):
# Pull out the data for current sample.
my_samp = pam[sample]

# print my_samp
# print pam.get_row_headers(),"\n"

# Pull out which spp are present & which are absent from the sample.
# yields lists of strings --> spp names.
sp_pres = list(it.compress(pam.get_column_headers(), my_samp))
# Dendropy does not retain underscores in labels
sp_pres = [i.replace('_', ' ') for i in sp_pres]

# print "The following spp are present in the sample:\n", sp_pres,"\n"
# print tree

# Get a tree of the spp present in the sample.
tree_pres = tree.extract_tree_with_taxa_labels(sp_pres)

# print tree_pres,"\n**********\n"

# Get sum of edge lengths for each sub tree.
PD_pres = tree_pres.length()

Expand Down Expand Up @@ -184,6 +195,8 @@ def core_PD_calc(pam, tree):
com_tot_multi = [1 if i > 0 else 0 for i in com_tot_multi]
# Calculate the PD.
sp_pres = list(it.compress(pam.get_column_headers(), com_tot_multi))
# Dendropy labels don't retain '_'
sp_pres = [i.replace('_', ' ') for i in sp_pres]
tree_pres = tree.extract_tree_with_taxa_labels(sp_pres)
pd_tot_multi = tree_pres.length()

Expand Down Expand Up @@ -294,6 +307,10 @@ def calculate_phylo_beta_diversity_jaccard(pam, tree):

num_sites = pam.shape[0] # Get the number of sites in the PAM

# print pam.data, "\n"
# print pam.get_column_headers(),"\n"
# print pam.get_row_headers(),"\n"

# Note: For ease of development, use these numpy arrays for the
# computations. They will be wrapped into a Matrix object when they are
# returned from the function.
Expand Down Expand Up @@ -478,3 +495,182 @@ def calculate_phylo_beta_diversity_sorensen(pam, tree):
Matrix(phylo_beta_sne_data, headers=mtx_headers),
Matrix(beta_sor_data, headers=mtx_headers),
Matrix(phylo_beta_sor_data, headers=mtx_headers))


# .............................................................................
def calc_phylo_jac_distr(pam, tree, nrand=5): # pragma: no cover
"""Calculates distribution of jaccard metrics based on randomization of
phylogenetic relationships.
Args:
pam (:obj:`Matrix`): A Lifemapper Matrix object with presence absence
values (site rows by species columns).
tree (:obj:`TreeWrapper`): A TreeWrapper object for a wrapped Dendropy
phylogenetic tree.
Returns:
Mean & SD of distribution of Jaccard-based metrics from randomizations.
Note:
* It looks like the scipy.spatial.distance.jaccard method may be useful
here.
Todo:
* Fill in method documentation
* Fill in method
* Fill in tests and method documentation in sphinx
"""
# Get a lookup dictionary for the matrix index of each species in the PAM
# in case they are not in the same order as the taxa in the tree
species_lookup = get_species_index_lookup(pam)

# Build a header dictionary, all of the returned matricies will have the
# same headers, site rows by site columns.
# Note: This will differ from the R method because each site will be
# present in both the rows and the columns.
mtx_headers = {
'0': pam.get_row_headers(), # Row headers
'1': pam.get_row_headers() # Column headers
}

num_sites = pam.data.shape[0] # Get the number of sites in the PAM

# print pam.data, "\n"
# print pam.get_column_headers(),"\n"
# print pam.get_row_headers(),"\n"

# These matrices will serve as running average placeholders.
pjtu_av = np.zeros((num_sites, num_sites), dtype=np.float)
pjne_av = np.zeros((num_sites, num_sites), dtype=np.float)
pjac_av = np.zeros((num_sites, num_sites), dtype=np.float)

# TODO: Randomize tree, calc. metrics, save running average.
# for trial in range(len(nrand)):
# Randomize tip labels of tree.

# Get core metrics related to phylogeny.
core_calc = core_PD_calc(pam, tree) # Matrix object.

# This loop will populate arrays with all beta diversity metrics.
for my_row in range(core_calc.data.shape[0]):
# Pull out the phylogentic core numeric values.
my_dat = core_calc.data[my_row, 0:4]
# Get index values for placing into output arrays.
my_dim = core_calc.get_row_headers()[my_row]

# Populate arrays.
phylo_beta_jtu_data[my_dim[0], my_dim[1]] = (
2*my_dat[0]) / ((2*my_dat[0]) + my_dat[3])

phylo_beta_jtu_data[my_dim[1], my_dim[0]] = phylo_beta_jtu_data[
my_dim[0], my_dim[1]]

phylo_beta_jac_data[my_dim[0], my_dim[1]] = (
my_dat[2] / (my_dat[3] + my_dat[2]))

phylo_beta_jac_data[my_dim[1], my_dim[0]] = phylo_beta_jac_data[
my_dim[0], my_dim[1]]

phylo_beta_jne_data[my_dim[0], my_dim[1]] = (
(my_dat[1] - my_dat[0]) / (my_dat[3] + my_dat[2])) * (
my_dat[3] / ((2 * my_dat[0]) + my_dat[3]))

phylo_beta_jne_data[my_dim[1], my_dim[0]] = phylo_beta_jne_data[
my_dim[0], my_dim[1]]

# Ensure diagonals are 1 just to match Biotaphy test file expectations.
for i in range(num_sites):
phylo_beta_jtu_data[i, i] = 1.
phylo_beta_jac_data[i, i] = 1.
phylo_beta_jne_data[i, i] = 1.

return (
Matrix(phylo_beta_jtu_data, headers=mtx_headers),
Matrix(phylo_beta_jne_data, headers=mtx_headers),
Matrix(phylo_beta_jac_data, headers=mtx_headers))


# .............................................................................
def calc_sig_phylo_sor(pam, tree): # pragma: no cover
"""Calculates phylogenetic beta diversity for the sorensen index family.
Args:
pam (:obj:`Matrix`): A Lifemapper Matrix object with presence absence
values.
tree (:obj:`TreeWrapper`): A TreeWrapper object for a wrapped Dendropy
phylogenetic tree.
Returns:
Phylogenetic beta diversity matrics (species by species)
* beta_sim: ADD DESCRIPTION
* phylo_beta_sim: ADD DESCRIPTION
* beta_sne: ADD DESCRIPTION
* phylo_beta_sne: ADD DESCRIPTION
* beta_sor: ADD DESCRIPTION
* phylo_beta_sor: ADD DESCRIPTION
Todo:
* Fill in method documentation
* Fill in method
* Fill in tests / documentation
"""
# Get a lookup dictionary for the matrix index of each species in the PAM
# in case they are not in the same order as the taxa in the tree
species_lookup = get_species_index_lookup(pam)

# Build a header dictionary, all of the returned matricies will have the
# same headers, site rows by site columns.
# Note: This will differ from the R method because each site will be
# present in both the rows and the columns.
mtx_headers = {
'0': pam.get_row_headers(), # Row headers
'1': pam.get_row_headers() # Column headers
}

num_sites = pam.data.shape[0] # Get the number of sites in the PAM

# Note: For ease of development, use these numpy arrays for the
# computations. They will be wrapped into a Matrix object when they are
# returned from the function.
phylo_beta_sim_data = np.zeros((num_sites, num_sites), dtype=np.float)
phylo_beta_sne_data = np.zeros((num_sites, num_sites), dtype=np.float)
phylo_beta_sor_data = np.zeros((num_sites, num_sites), dtype=np.float)

# TODO: Compute phylo beta diversity for sorensen index family
core_calc = core_PD_calc(pam, tree)

# This loop will populate arrays with beta diversity metrics.
for my_row in range(core_calc.data.shape[0]):

my_dat = core_calc.data[my_row, 0:4]
my_dim = core_calc.get_row_headers()[my_row]

phylo_beta_sim_data[my_dim[0], my_dim[1]] = (
my_dat[0] / (my_dat[0] + my_dat[3]))

phylo_beta_sim_data[my_dim[1], my_dim[0]] = phylo_beta_sim_data[
my_dim[0], my_dim[1]]

phylo_beta_sor_data[my_dim[0], my_dim[1]] = (
my_dat[2] / ((2*my_dat[3]) + my_dat[2]))

phylo_beta_sor_data[my_dim[1], my_dim[0]] = phylo_beta_sor_data[
my_dim[0], my_dim[1]]

phylo_beta_sne_data[my_dim[0], my_dim[1]] = (
(my_dat[1] - my_dat[0]) / ((2*my_dat[3]) + my_dat[2])) * (
my_dat[3] / (my_dat[0] + my_dat[3]))

phylo_beta_sne_data[my_dim[1], my_dim[0]] = phylo_beta_sne_data[
my_dim[0], my_dim[1]]

# Just to match formatting across scripts.
for i in range(num_sites):
phylo_beta_sim_data[i, i] = 1.
phylo_beta_sne_data[i, i] = 1.
phylo_beta_sor_data[i, i] = 1.

return (
Matrix(phylo_beta_sim_data, headers=mtx_headers),
Matrix(phylo_beta_sne_data, headers=mtx_headers),
Matrix(phylo_beta_sor_data, headers=mtx_headers))
Loading

0 comments on commit 8172665

Please sign in to comment.