
Commit

Merge pull request #7 from CamiloPosso/main
Improving detail in Kaiko intermediate output
CamiloPosso authored May 9, 2023
2 parents 4117f38 + 13a656d commit 8fc509c
Showing 8 changed files with 396 additions and 153 deletions.
5 changes: 2 additions & 3 deletions .gitignore
@@ -18,6 +18,7 @@ Kaiko_volume/Kaiko_stationary_files/*
/Kaiko_denovo/src-*
/Kaiko_old
/Dockerfile-py27
/random txt files/*

!Kaiko_volume/Kaiko_intermediate/denovo_output/README.txt
!Kaiko_volume/Kaiko_stationary_files/README.txt
@@ -26,9 +27,6 @@ Kaiko_volume/Kaiko_stationary_files/*
!Kaiko_volume/Kaiko_stationary_files/ExtractUniRefMembers.py
!Kaiko_volume/config.yaml

!Kaiko_denovo/model/get_data.sh
!Kaiko_denovo/model/README.txt

# pipeline outputs and inputs
pipeline_input/*.MGF
pipeline_intermediary/denovo_output/*.txt
@@ -169,3 +167,4 @@ Kaiko_denovo_tf1.15.0

tf2.0.0_variable_names.txt
tf_updgrade_report.txt
*.pptx
204 changes: 105 additions & 99 deletions Kaiko_3.py
@@ -3,92 +3,106 @@
import pandas as pd
import numpy as np
import time
import re

from pathlib import Path, PureWindowsPath

# @profile
def run_diamond_tally(diamond_output, ntops, ncbi_taxa_folder, mode, fout, pident):

filterby={}
if pident: filterby['pident'] = float(pident)
# # if FLAGS.evalue: filterby['evalue'] = FLAGS.evalue
# # if FLAGS.mismatch: filterby['mismatch'] = FLAGS.mismatch
print('filterby:', filterby)

############################################################
# retrieve taxa lineage info such as taxa rank and lineage
############################################################
print("Retrieving taxa lineage info...")
taxa_table = read_ncbi_taxa_lineage(ncbi_taxa_folder / 'rankedlineage.dmp',
ncbi_taxa_folder / 'nodes.dmp')

############################################################
# read diamond output file
############################################################
print("Reading diamond output file...")
dmd = read_dmd(diamond_output)

############################################################
# filter by quality and taxa
############################################################
print("Filtering by quality and taxa...")
filtered_dmd = dmd_filter(dmd, filterby=filterby)
filtered_dmd = collect_taxid(filtered_dmd)
filtered_dmd['uniref_id'] = [i[0] for i in filtered_dmd.uniref_seq.str.split(" ",1)]

############################################################
# garbage collection
############################################################
print("Garbage collection...")
del dmd
import gc
gc.collect()

############################################################
# retrieve UniRef100 representative taxa and its members
############################################################
print("Retrieving UniRef100 representative taxa and members...")
unique_unirefs = set(filtered_dmd.uniref_id.drop_duplicates())
taxa_member_tbl = get_taxa_members(ncbi_taxa_folder / 'uniref100_member_taxa_tbl.csv',
unique_unirefs)

############################################################
# merge
############################################################
print("Adding taxa members...")
merged = filtered_dmd.merge(taxa_member_tbl, left_on="uniref_id", right_on="uid", how="inner")
print(" Final table size:{}".format(merged.shape))

if merged.shape[0]!=filtered_dmd.shape[0]:
print("[WARN] You might use the different version of UniRef100.fasta and .xml")

# get the member taxa of each (scan, uid)
unique_members = []
scanids = merged.scans.tolist()
uids = merged.uid.tolist()
commons = merged.common_taxa.tolist()
for ii, members in enumerate(merged.members.str.split(":").tolist()):
for mm in members:
unique_members.append({"scan":scanids[ii], "uid":uids[ii], "member_taxa":int(mm), "common_taxa":int(commons[ii])})
print(" #members:{}".format(len(unique_members)))
members_df = pd.DataFrame(unique_members)
# @profile
def run_diamond_tally(diamond_output, ntops, ncbi_taxa_folder, mode, fout, pident, n_protein_cutoff):
nprot = '{:.5e}'.format(n_protein_cutoff)
detailed_fout = fout.parent / (re.sub("_kaiko_prediction_.*.csv$", "", fout.name) + '_detailed.csv')
taxa_stats_path = Path(PureWindowsPath('Kaiko_volume/Kaiko_stationary_files/uniref100_stats.txt'))
taxa_stats = pd.read_csv(taxa_stats_path, sep = '\t')

############################################################
# top-rank taxa
############################################################
print("Filtering top-rank taxa by hits...")

if mode=="member":
taxa_col = 'member_taxa'
elif mode=="common":
taxa_col = 'common_taxa'
else:
print(" [ERR] please select the mode to be either 'member' or 'common'.")
return

unique_pepseq_taxa = members_df.drop_duplicates(subset=['scan',taxa_col])
if not detailed_fout.exists():

filterby={}
if pident: filterby['pident'] = float(pident)
# # if FLAGS.evalue: filterby['evalue'] = FLAGS.evalue
# # if FLAGS.mismatch: filterby['mismatch'] = FLAGS.mismatch
print('filterby:', filterby)

############################################################
# read diamond output file
############################################################
print("Reading diamond output file...")
dmd = read_dmd(diamond_output)

############################################################
# filter by quality and taxa
############################################################
print("Filtering by quality and taxa...")
filtered_dmd = dmd_filter(dmd, filterby=filterby)
filtered_dmd = collect_taxid(filtered_dmd)
filtered_dmd['uniref_id'] = [i[0] for i in filtered_dmd.uniref_seq.str.split(" ",1)]

############################################################
# garbage collection
############################################################
print("Garbage collection...")
del dmd
import gc
gc.collect()

############################################################
# retrieve UniRef100 representative taxa and its members
############################################################
print("Retrieving UniRef100 representative taxa and members...")
unique_unirefs = set(filtered_dmd.uniref_id.drop_duplicates())
taxa_member_tbl = get_taxa_members(ncbi_taxa_folder / 'uniref100_member_taxa_tbl.csv',
unique_unirefs)

############################################################
# merge
############################################################
print("Adding taxa members...")
merged = filtered_dmd.merge(taxa_member_tbl, left_on="uniref_id", right_on="uid", how="inner")
print(" Final table size:{}".format(merged.shape))

if merged.shape[0]!=filtered_dmd.shape[0]:
print("[WARN] You might use the different version of UniRef100.fasta and .xml")

# get the member taxa of each (scan, uid)
unique_members = []
scanids = merged.scans.tolist()
uids = merged.uid.tolist()
commons = merged.common_taxa.tolist()
for ii, members in enumerate(merged.members.str.split(":").tolist()):
for mm in members:
unique_members.append({"scan":scanids[ii], "uid":uids[ii], "member_taxa":int(mm), "common_taxa":int(commons[ii])})
print(" #members:{}".format(len(unique_members)))
members_df = pd.DataFrame(unique_members)

############################################################
# top-rank taxa
############################################################
print("Filtering top-rank taxa by hits...")

detailed_output = members_df
detailed_output["protein"] = [re.sub("UniRef100_", "", detailed_output.uid[i]) for i in range(0, len(detailed_output))]
detailed_output = detailed_output[['scan', 'uid', 'protein', 'member_taxa', 'common_taxa']]
detailed_output = detailed_output.merge(taxa_stats, left_on = taxa_col, right_on = 'taxid', how = 'left')
detailed_output.to_csv(detailed_fout, index = False)
else:
print("Loading |scan|protein|taxa| table" + detailed_fout.name + "\n")
detailed_output = pd.read_csv(detailed_fout)
detailed_output = detailed_output[['scan', 'uid', 'protein', 'member_taxa', 'common_taxa']]
detailed_output = detailed_output.merge(taxa_stats, left_on = taxa_col, right_on = 'taxid', how = 'left')

detailed_output = detailed_output[detailed_output['n_protein'] < n_protein_cutoff]
unique_pepseq_taxa = detailed_output.drop_duplicates(subset=['scan',taxa_col])
pepcount_taxid = unique_pepseq_taxa[taxa_col].value_counts()

print(' unique peptides: {}'.format(members_df.scan.value_counts().shape[0]))
print(' unique peptides: {}'.format(detailed_output.scan.value_counts().shape[0]))
print(' unique taxa: {}'.format(pepcount_taxid.shape[0]))

def besthit(row):
@@ -105,37 +119,17 @@ def besthit(row):
_ntops = besthits.shape[0]

top_taxa = besthits.nlargest(_ntops, keep='all').to_frame(name='hits')
top_taxa['taxid'] = top_taxa.index

############################################################
# save top-rank taxa info
############################################################
print("Saving top-rank taxa info...")
df = pd.concat([taxa_table, top_taxa], join='inner', axis=1)
df.index.name = 'tax_id'
if fout: df.sort_values(by="hits", ascending=False).to_csv(fout)

# taxon info
def read_ncbi_taxa_lineage(rankedlineage_file, nodes_file):
taxon_info = pd.read_csv(rankedlineage_file, sep='|', header=None,
names=['tax_id','tax_name','species','genus','family','order',
'class','phylum','kingdom','superkingdom','null'])
taxon_info = taxon_info.replace('\t\t', np.nan)
for col in ['tax_name','species','genus','family','order','class','phylum','kingdom','superkingdom']:
taxon_info[col] = taxon_info[col].str.extract(r'^\t(.*)\t$')[0].tolist()
# taxon_info[col] = taxon_info[col].str.extract(r'^\t(.*)\t$').tolist()
del taxon_info['null']
print(' taxon_info:', taxon_info.shape, taxon_info.tax_id.drop_duplicates().shape)

rank = pd.read_csv(nodes_file, sep='|', usecols=[0,2], header=None,
names=['tax_id','rank'])
rank = rank.replace('\t\t', np.nan)
rank['rank'] = rank['rank'].str.extract(r'^\t(.*)\t$')[0].tolist()
# rank['rank'] = rank['rank'].str.extract(r'^\t(.*)\t$').tolist()
print(' rank:', rank.shape, rank.tax_id.drop_duplicates().shape)

final_df = taxon_info.merge(rank, left_on='tax_id', right_on='tax_id')
print(' final_df:', final_df.shape, final_df.tax_id.drop_duplicates().shape)
return final_df.set_index('tax_id')
# df = pd.concat([top_taxa, taxa_stats], join='inner', axis=1)
df = top_taxa.merge(taxa_stats[['taxid', 'tax_name', 'rank', 'n_protein']], left_on = 'taxid', right_on = 'taxid')
df = df[['taxid', 'tax_name', 'rank', 'hits', 'n_protein']]
df.index.name = 'taxid'
if fout: df.sort_values(by="hits", ascending=False).to_csv(fout, index = False)

def read_dmd(diamond_output):
dmd_colnames = ['scans','uniref_seq','pident','evalue','mismatch']
@@ -207,3 +201,15 @@ def get_taxa_members(member_tbl_file, unique_unirefs):
return df


# prefix = "S1_NM0001_NMDC_MixedCommunities_R3_mgf"
# diamond_search_out = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_diamond_search_output.dmd")
# kaiko_tally = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_kaiko_prediction_top_taxa.csv")
# ncbi_taxa_folder = Path(PureWindowsPath("Kaiko_volume/Kaiko_stationary_files/ncbi_taxa").as_posix())

# run_diamond_tally(diamond_search_out,
# -1,
# ncbi_taxa_folder,
# "member",
# kaiko_tally,
# 100,
# 150000)
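
# Note (not part of the commit): run_diamond_tally now writes and reuses a
# *_detailed.csv intermediate (|scan|uid|protein|member_taxa|common_taxa|
# merged with per-taxon stats) and drops taxa whose proteomes exceed
# n_protein_cutoff before tallying hits. A minimal sketch of tallying that
# intermediate outside the pipeline, assuming a hypothetical file path with
# the prefix from the commented-out example above:

import pandas as pd

detailed = pd.read_csv("Kaiko_volume/Kaiko_intermediate/"
                       "S1_NM0001_NMDC_MixedCommunities_R3_mgf_detailed.csv")

# Drop taxa with oversized proteomes, as the new n_protein_cutoff does.
detailed = detailed[detailed["n_protein"] < 150000]

# One vote per (scan, taxon) pair, then tally hits per member taxon.
votes = detailed.drop_duplicates(subset=["scan", "member_taxa"])
print(votes["member_taxa"].value_counts().head(10))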
88 changes: 58 additions & 30 deletions Kaiko_4.py
@@ -25,7 +25,7 @@ def aggregate_fasta(ref_fasta, diamon_tally, fout, ntops, taxa_key, kingdom_list

print(tdf.head(20))

taxids = set(tdf.tax_id.tolist())
taxids = set(tdf.taxid.tolist())
print("NCBI-TaxIDs:", taxids)

key_for_taxid = '{}='.format(taxa_key)
@@ -64,6 +64,48 @@

ofile.close()

# def write_proteins(database_file, taxa, fout):
# index_s_path = Path('Kaiko_volume/Kaiko_stationary_files/uniref100_index_s.txt')
# index_path = Path('Kaiko_volume/Kaiko_stationary_files/uniref100_index.txt')
# index_s = pd.read_csv(index_s_path, sep = '\t', header = None)
# index_s.index = index_s[0]
# index_pos = int(index_s.loc[f'taxid_{taxa}_positions'][1])
# index = index_path.open('r')
# index.seek(index_pos)
# protein_pos = index.readline()
# protein_pos = protein_pos.split(';;')[1].split(';')[:-1]

# with fout.open('a') as output_fasta:
# for pos in protein_pos:
# database_file.seek(int(pos))

# line = database_file.readline()
# line = line.decode("utf-8")

# reading_protein = True
# while reading_protein:
# output_fasta.write(line)
# line = database_file.readline()
# if line[0] == 62: # to find `>`
# reading_protein = False
# else:
# line = line.decode("utf-8")


# def write_single_protein(database_file, pos, output_fasta):
# database_file.seek(pos)

# line = database_file.readline()
# line = line.decode("utf-8")

# reading_protein = True
# while reading_protein:
# output_fasta.write(line)
# line = database_file.readline()
# if line[0] == 62: # to find `>`
# reading_protein = False
# else:
# line = line.decode("utf-8")

def rank_to_lineage(df):
coverage = {}
@@ -77,37 +119,23 @@ def rank_to_lineage(df):

EXCLUDED_RANKS = ['family','order','class','phylum','kingdom','superkingdom']

###############################################################
# parser = argparse.ArgumentParser()

# parser.add_argument(
# '--taxafile', type=str, required=True,
# help='top-rank taxa file.')

# parser.add_argument(
# '--ref_fasta', type=str, required=True,
# help='reference fasta file')

# parser.add_argument(
# '--fout', type=str, required=True,
# help='output fasta file.')

# parser.add_argument(
# '--key', type=str, default="TaxID",
# help='key value for the taxa ID. `TaxID` for UniRef fasta and `OX` for UniProtKB fasta')

# parser.add_argument('-l','--list', nargs='+', required=False)

# # parser.add_argument('-r','--rank', nargs='+', required=True)

# parser.add_argument(
# '--ntops', type=int, default=5, required=True,
# help='top n taxa. -1 for all')

# FLAGS = parser.parse_args()
# print(FLAGS)
# from pathlib import Path, PureWindowsPath

###############################################################
# prefix = "S1_NM0001_NMDC_MixedCommunities_R3_mgf"
# diamond_search_out = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_diamond_search_output.dmd")
# kaiko_tally = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_kaiko_prediction_top_taxa.csv")
# ncbi_taxa_folder = Path(PureWindowsPath("Kaiko_volume/Kaiko_stationary_files/ncbi_taxa").as_posix())
# nprot = '{:.5e}'.format(int(300000))
# kaiko_tally = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_kaiko_prediction" + f'_top_taxa_nprot_{nprot}.csv')
# ref_fasta = Path(PureWindowsPath('Kaiko_volume/Kaiko_stationary_files/uniref100.fasta.gz').as_posix())
# kaiko_final_output = Path("Kaiko_volume/Kaiko_output/" + prefix + "_kaiko_output.fasta")



# aggregate_fasta(ref_fasta,
# kaiko_tally,
# kaiko_final_output,
# 5,
# 'TaxID',
# [])
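
# Note (not part of the commit): the commented-out write_proteins above
# sketches an index-based fast path — byte offsets into the UniRef FASTA are
# read from an index file, then each record is copied by seeking to its
# offset and reading until the next '>' header. A self-contained sketch of
# that seek-and-copy step, assuming a plain (uncompressed) FASTA and
# illustrative file names:

def copy_fasta_records(database_path, offsets, out_path):
    # Copy the FASTA records that start at the given byte offsets.
    with open(database_path, "rb") as db, open(out_path, "a") as out:
        for pos in offsets:
            db.seek(int(pos))
            out.write(db.readline().decode("utf-8"))  # header line ('>...')
            line = db.readline()
            while line and not line.startswith(b">"):  # stop at next record
                out.write(line.decode("utf-8"))
                line = db.readline()

# e.g. copy_fasta_records("uniref100.fasta", [0, 52431], "subset.fasta")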