
Commit

Merge pull request #7 from CamiloPosso/main
Improving detail in Kaiko intermediate output
CamiloPosso authored May 9, 2023
2 parents 4117f38 + 13a656d commit 8fc509c
Showing 8 changed files with 396 additions and 153 deletions.
5 changes: 2 additions & 3 deletions .gitignore
@@ -18,6 +18,7 @@ Kaiko_volume/Kaiko_stationary_files/*
/Kaiko_denovo/src-*
/Kaiko_old
/Dockerfile-py27
/random txt files/*

!Kaiko_volume/Kaiko_intermediate/denovo_output/README.txt
!Kaiko_volume/Kaiko_stationary_files/README.txt
@@ -26,9 +27,6 @@ Kaiko_volume/Kaiko_stationary_files/*
!Kaiko_volume/Kaiko_stationary_files/ExtractUniRefMembers.py
!Kaiko_volume/config.yaml

!Kaiko_denovo/model/get_data.sh
!Kaiko_denovo/model/README.txt

# pipeline outputs and inputs
pipeline_input/*.MGF
pipeline_intermediary/denovo_output/*.txt
@@ -169,3 +167,4 @@ Kaiko_denovo_tf1.15.0

tf2.0.0_variable_names.txt
tf_updgrade_report.txt
*.pptx
204 changes: 105 additions & 99 deletions Kaiko_3.py
@@ -3,92 +3,106 @@
import pandas as pd
import numpy as np
import time
import re

from pathlib import Path, PureWindowsPath

# @profile
def run_diamond_tally(diamond_output, ntops, ncbi_taxa_folder, mode, fout, pident):

filterby={}
if pident: filterby['pident'] = float(pident)
# # if FLAGS.evalue: filterby['evalue'] = FLAGS.evalue
# # if FLAGS.mismatch: filterby['mismatch'] = FLAGS.mismatch
print('filterby:', filterby)

############################################################
# retrieve taxa lineage info such as taxa rank and lineage
############################################################
print("Retrieving taxa lineage info...")
taxa_table = read_ncbi_taxa_lineage(ncbi_taxa_folder / 'rankedlineage.dmp',
ncbi_taxa_folder / 'nodes.dmp')

############################################################
# read diamond output file
############################################################
print("Reading diamond output file...")
dmd = read_dmd(diamond_output)

############################################################
# filter by quality and taxa
############################################################
print("Filtering by quality and taxa...")
filtered_dmd = dmd_filter(dmd, filterby=filterby)
filtered_dmd = collect_taxid(filtered_dmd)
filtered_dmd['uniref_id'] = [i[0] for i in filtered_dmd.uniref_seq.str.split(" ",1)]

############################################################
# garbage collection
############################################################
print("Garbage collection...")
del dmd
import gc
gc.collect()

############################################################
# retrieve UniRef100 representative taxa and its members
############################################################
print("Retrieving UniRef100 representative taxa and members...")
unique_unirefs = set(filtered_dmd.uniref_id.drop_duplicates())
taxa_member_tbl = get_taxa_members(ncbi_taxa_folder / 'uniref100_member_taxa_tbl.csv',
unique_unirefs)

############################################################
# merge
############################################################
print("Adding taxa members...")
merged = filtered_dmd.merge(taxa_member_tbl, left_on="uniref_id", right_on="uid", how="inner")
print(" Final table size:{}".format(merged.shape))

if merged.shape[0]!=filtered_dmd.shape[0]:
print("[WARN] You might use the different version of UniRef100.fasta and .xml")

# get the member taxa of each (scan, uid)
unique_members = []
scanids = merged.scans.tolist()
uids = merged.uid.tolist()
commons = merged.common_taxa.tolist()
for ii, members in enumerate(merged.members.str.split(":").tolist()):
for mm in members:
unique_members.append({"scan":scanids[ii], "uid":uids[ii], "member_taxa":int(mm), "common_taxa":int(commons[ii])})
print(" #members:{}".format(len(unique_members)))
members_df = pd.DataFrame(unique_members)
# @profile
def run_diamond_tally(diamond_output, ntops, ncbi_taxa_folder, mode, fout, pident, n_protein_cutoff):
nprot = '{:.5e}'.format(n_protein_cutoff)
detailed_fout = fout.parent / (re.sub("_kaiko_prediction_.*.csv$", "", fout.name) + '_detailed.csv')
taxa_stats_path = Path(PureWindowsPath('Kaiko_volume/Kaiko_stationary_files/uniref100_stats.txt'))
taxa_stats = pd.read_csv(taxa_stats_path, sep = '\t')

############################################################
# top-rank taxa
############################################################
print("Filtering top-rank taxa by hits...")

if mode=="member":
taxa_col = 'member_taxa'
elif mode=="common":
taxa_col = 'common_taxa'
else:
print(" [ERR] please select the mode to be either 'member' or 'common'.")
return

unique_pepseq_taxa = members_df.drop_duplicates(subset=['scan',taxa_col])
if not detailed_fout.exists():

filterby={}
if pident: filterby['pident'] = float(pident)
# # if FLAGS.evalue: filterby['evalue'] = FLAGS.evalue
# # if FLAGS.mismatch: filterby['mismatch'] = FLAGS.mismatch
print('filterby:', filterby)

############################################################
# read diamond output file
############################################################
print("Reading diamond output file...")
dmd = read_dmd(diamond_output)

############################################################
# filter by quality and taxa
############################################################
print("Filtering by quality and taxa...")
filtered_dmd = dmd_filter(dmd, filterby=filterby)
filtered_dmd = collect_taxid(filtered_dmd)
filtered_dmd['uniref_id'] = [i[0] for i in filtered_dmd.uniref_seq.str.split(" ",1)]

############################################################
# garbage collection
############################################################
print("Garbage collection...")
del dmd
import gc
gc.collect()

############################################################
# retrieve UniRef100 representative taxa and its members
############################################################
print("Retrieving UniRef100 representative taxa and members...")
unique_unirefs = set(filtered_dmd.uniref_id.drop_duplicates())
taxa_member_tbl = get_taxa_members(ncbi_taxa_folder / 'uniref100_member_taxa_tbl.csv',
unique_unirefs)

############################################################
# merge
############################################################
print("Adding taxa members...")
merged = filtered_dmd.merge(taxa_member_tbl, left_on="uniref_id", right_on="uid", how="inner")
print(" Final table size:{}".format(merged.shape))

if merged.shape[0]!=filtered_dmd.shape[0]:
print("[WARN] You might use the different version of UniRef100.fasta and .xml")

# get the member taxa of each (scan, uid)
unique_members = []
scanids = merged.scans.tolist()
uids = merged.uid.tolist()
commons = merged.common_taxa.tolist()
for ii, members in enumerate(merged.members.str.split(":").tolist()):
for mm in members:
unique_members.append({"scan":scanids[ii], "uid":uids[ii], "member_taxa":int(mm), "common_taxa":int(commons[ii])})
print(" #members:{}".format(len(unique_members)))
members_df = pd.DataFrame(unique_members)

############################################################
# top-rank taxa
############################################################
print("Filtering top-rank taxa by hits...")

detailed_output = members_df
detailed_output["protein"] = [re.sub("UniRef100_", "", detailed_output.uid[i]) for i in range(0, len(detailed_output))]
detailed_output = detailed_output[['scan', 'uid', 'protein', 'member_taxa', 'common_taxa']]
detailed_output = detailed_output.merge(taxa_stats, left_on = taxa_col, right_on = 'taxid', how = 'left')
detailed_output.to_csv(detailed_fout, index = False)
else:
print("Loading |scan|protein|taxa| table" + detailed_fout.name + "\n")
detailed_output = pd.read_csv(detailed_fout)
detailed_output = detailed_output[['scan', 'uid', 'protein', 'member_taxa', 'common_taxa']]
detailed_output = detailed_output.merge(taxa_stats, left_on = taxa_col, right_on = 'taxid', how = 'left')

detailed_output = detailed_output[detailed_output['n_protein'] < n_protein_cutoff]
unique_pepseq_taxa = detailed_output.drop_duplicates(subset=['scan',taxa_col])
pepcount_taxid = unique_pepseq_taxa[taxa_col].value_counts()

print(' unique peptides: {}'.format(members_df.scan.value_counts().shape[0]))
print(' unique peptides: {}'.format(detailed_output.scan.value_counts().shape[0]))
print(' unique taxa: {}'.format(pepcount_taxid.shape[0]))

def besthit(row):
@@ -105,37 +119,17 @@ def besthit(row):
_ntops = besthits.shape[0]

top_taxa = besthits.nlargest(_ntops, keep='all').to_frame(name='hits')
top_taxa['taxid'] = top_taxa.index

############################################################
# save top-rank taxa info
############################################################
print("Saving top-rank taxa info...")
df = pd.concat([taxa_table, top_taxa], join='inner', axis=1)
df.index.name = 'tax_id'
if fout: df.sort_values(by="hits", ascending=False).to_csv(fout)

# taxon info
def read_ncbi_taxa_lineage(rankedlineage_file, nodes_file):
taxon_info = pd.read_csv(rankedlineage_file, sep='|', header=None,
names=['tax_id','tax_name','species','genus','family','order',
'class','phylum','kingdom','superkingdom','null'])
taxon_info = taxon_info.replace('\t\t', np.nan)
for col in ['tax_name','species','genus','family','order','class','phylum','kingdom','superkingdom']:
taxon_info[col] = taxon_info[col].str.extract(r'^\t(.*)\t$')[0].tolist()
# taxon_info[col] = taxon_info[col].str.extract(r'^\t(.*)\t$').tolist()
del taxon_info['null']
print(' taxon_info:', taxon_info.shape, taxon_info.tax_id.drop_duplicates().shape)

rank = pd.read_csv(nodes_file, sep='|', usecols=[0,2], header=None,
names=['tax_id','rank'])
rank = rank.replace('\t\t', np.nan)
rank['rank'] = rank['rank'].str.extract(r'^\t(.*)\t$')[0].tolist()
# rank['rank'] = rank['rank'].str.extract(r'^\t(.*)\t$').tolist()
print(' rank:', rank.shape, rank.tax_id.drop_duplicates().shape)

final_df = taxon_info.merge(rank, left_on='tax_id', right_on='tax_id')
print(' final_df:', final_df.shape, final_df.tax_id.drop_duplicates().shape)
return final_df.set_index('tax_id')
# df = pd.concat([top_taxa, taxa_stats], join='inner', axis=1)
df = top_taxa.merge(taxa_stats[['taxid', 'tax_name', 'rank', 'n_protein']], left_on = 'taxid', right_on = 'taxid')
df = df[['taxid', 'tax_name', 'rank', 'hits', 'n_protein']]
df.index.name = 'taxid'
if fout: df.sort_values(by="hits", ascending=False).to_csv(fout, index = False)

def read_dmd(diamond_output):
dmd_colnames = ['scans','uniref_seq','pident','evalue','mismatch']
@@ -207,3 +201,15 @@ def get_taxa_members(member_tbl_file, unique_unirefs):
return df


# prefix = "S1_NM0001_NMDC_MixedCommunities_R3_mgf"
# diamond_search_out = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_diamond_search_output.dmd")
# kaiko_tally = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_kaiko_prediction_top_taxa.csv")
# ncbi_taxa_folder = Path(PureWindowsPath("Kaiko_volume/Kaiko_stationary_files/ncbi_taxa").as_posix())

# run_diamond_tally(diamond_search_out,
# -1,
# ncbi_taxa_folder,
# "member",
# kaiko_tally,
# 100,
# 150000)
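
# Note (not part of the commit): run_diamond_tally now writes and reuses a
# *_detailed.csv intermediate (|scan|uid|protein|member_taxa|common_taxa|
# merged with per-taxon stats) and drops taxa whose proteomes exceed
# n_protein_cutoff before tallying hits. A minimal sketch of tallying that
# intermediate outside the pipeline, assuming a hypothetical file path with
# the prefix from the commented-out example above:

import pandas as pd

detailed = pd.read_csv("Kaiko_volume/Kaiko_intermediate/"
                       "S1_NM0001_NMDC_MixedCommunities_R3_mgf_detailed.csv")

# Drop taxa with oversized proteomes, as the new n_protein_cutoff does.
detailed = detailed[detailed["n_protein"] < 150000]

# One vote per (scan, taxon) pair, then tally hits per member taxon.
votes = detailed.drop_duplicates(subset=["scan", "member_taxa"])
print(votes["member_taxa"].value_counts().head(10))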
88 changes: 58 additions & 30 deletions Kaiko_4.py
@@ -25,7 +25,7 @@ def aggregate_fasta(ref_fasta, diamon_tally, fout, ntops, taxa_key, kingdom_list

print(tdf.head(20))

taxids = set(tdf.tax_id.tolist())
taxids = set(tdf.taxid.tolist())
print("NCBI-TaxIDs:", taxids)

key_for_taxid = '{}='.format(taxa_key)
@@ -64,6 +64,48 @@

ofile.close()

# def write_proteins(database_file, taxa, fout):
# index_s_path = Path('Kaiko_volume/Kaiko_stationary_files/uniref100_index_s.txt')
# index_path = Path('Kaiko_volume/Kaiko_stationary_files/uniref100_index.txt')
# index_s = pd.read_csv(index_s_path, sep = '\t', header = None)
# index_s.index = index_s[0]
# index_pos = int(index_s.loc[f'taxid_{taxa}_positions'][1])
# index = index_path.open('r')
# index.seek(index_pos)
# protein_pos = index.readline()
# protein_pos = protein_pos.split(';;')[1].split(';')[:-1]

# with fout.open('a') as output_fasta:
# for pos in protein_pos:
# database_file.seek(int(pos))

# line = database_file.readline()
# line = line.decode("utf-8")

# reading_protein = True
# while reading_protein:
# output_fasta.write(line)
# line = database_file.readline()
# if line[0] == 62: # to find `>`
# reading_protein = False
# else:
# line = line.decode("utf-8")


# def write_single_protein(database_file, pos, output_fasta):
# database_file.seek(pos)

# line = database_file.readline()
# line = line.decode("utf-8")

# reading_protein = True
# while reading_protein:
# output_fasta.write(line)
# line = database_file.readline()
# if line[0] == 62: # to find `>`
# reading_protein = False
# else:
# line = line.decode("utf-8")

def rank_to_lineage(df):
coverage = {}
@@ -77,37 +119,23 @@ def rank_to_lineage(df):

EXCLUDED_RANKS = ['family','order','class','phylum','kingdom','superkingdom']

###############################################################
# parser = argparse.ArgumentParser()

# parser.add_argument(
# '--taxafile', type=str, required=True,
# help='top-rank taxa file.')

# parser.add_argument(
# '--ref_fasta', type=str, required=True,
# help='reference fasta file')

# parser.add_argument(
# '--fout', type=str, required=True,
# help='output fasta file.')

# parser.add_argument(
# '--key', type=str, default="TaxID",
# help='key value for the taxa ID. `TaxID` for UniRef fasta and `OX` for UniProtKB fasta')

# parser.add_argument('-l','--list', nargs='+', required=False)

# # parser.add_argument('-r','--rank', nargs='+', required=True)

# parser.add_argument(
# '--ntops', type=int, default=5, required=True,
# help='top n taxa. -1 for all')

# FLAGS = parser.parse_args()
# print(FLAGS)
# from pathlib import Path, PureWindowsPath

###############################################################
# prefix = "S1_NM0001_NMDC_MixedCommunities_R3_mgf"
# diamond_search_out = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_diamond_search_output.dmd")
# kaiko_tally = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_kaiko_prediction_top_taxa.csv")
# ncbi_taxa_folder = Path(PureWindowsPath("Kaiko_volume/Kaiko_stationary_files/ncbi_taxa").as_posix())
# nprot = '{:.5e}'.format(int(300000))
# kaiko_tally = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_kaiko_prediction" + f'_top_taxa_nprot_{nprot}.csv')
# ref_fasta = Path(PureWindowsPath('Kaiko_volume/Kaiko_stationary_files/uniref100.fasta.gz').as_posix())
# kaiko_final_output = Path("Kaiko_volume/Kaiko_output/" + prefix + "_kaiko_output.fasta")



# aggregate_fasta(ref_fasta,
# kaiko_tally,
# kaiko_final_output,
# 5,
# 'TaxID',
# [])
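
# Note (not part of the commit): the commented-out write_proteins above
# sketches an index-based fast path — byte offsets into the UniRef FASTA are
# read from an index file, then each record is copied by seeking to its
# offset and reading until the next '>' header. A self-contained sketch of
# that seek-and-copy step, assuming a plain (uncompressed) FASTA and
# illustrative file names:

def copy_fasta_records(database_path, offsets, out_path):
    # Copy the FASTA records that start at the given byte offsets.
    with open(database_path, "rb") as db, open(out_path, "a") as out:
        for pos in offsets:
            db.seek(int(pos))
            out.write(db.readline().decode("utf-8"))  # header line ('>...')
            line = db.readline()
            while line and not line.startswith(b">"):  # stop at next record
                out.write(line.decode("utf-8"))
                line = db.readline()

# e.g. copy_fasta_records("uniref100.fasta", [0, 52431], "subset.fasta")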