diff --git a/.gitignore b/.gitignore
index 5d8f928..b78802a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@ Kaiko_volume/Kaiko_stationary_files/*
 /Kaiko_denovo/src-*
 /Kaiko_old
 /Dockerfile-py27
+/random txt files/*
 !Kaiko_volume/Kaiko_intermediate/denovo_output/README.txt
 !Kaiko_volume/Kaiko_stationary_files/README.txt
@@ -26,9 +27,6 @@ Kaiko_volume/Kaiko_stationary_files/*
 !Kaiko_volume/Kaiko_stationary_files/ExtractUniRefMembers.py
 !Kaiko_volume/config.yaml
-!Kaiko_denovo/model/get_data.sh
-!Kaiko_denovo/model/README.txt
-
 # pipeline outputs and inputs
 pipeline_input/*.MGF
 pipeline_intermediary/denovo_output/*.txt
@@ -169,3 +167,4 @@ Kaiko_denovo_tf1.15.0
 tf2.0.0_variable_names.txt
 tf_updgrade_report.txt
+*.pptx
diff --git a/Kaiko_3.py b/Kaiko_3.py
index aab3562..382a520 100644
--- a/Kaiko_3.py
+++ b/Kaiko_3.py
@@ -3,92 +3,106 @@ import pandas as pd
 import numpy as np
 import time
+import re
+from pathlib import Path, PureWindowsPath
 
-# @profile
-def run_diamond_tally(diamond_output, ntops, ncbi_taxa_folder, mode, fout, pident):
-
-    filterby={}
-    if pident: filterby['pident'] = float(pident)
-    # # if FLAGS.evalue: filterby['evalue'] = FLAGS.evalue
-    # # if FLAGS.mismatch: filterby['mismatch'] = FLAGS.mismatch
-    print('filterby:', filterby)
-
-    ############################################################
-    # retrieve taxa lineage info such as taxa rank and lineage
-    ############################################################
-    print("Retrieving taxa lineage info...")
-    taxa_table = read_ncbi_taxa_lineage(ncbi_taxa_folder / 'rankedlineage.dmp',
-                                        ncbi_taxa_folder / 'nodes.dmp')
-
-    ############################################################
-    # read diamond output file
-    ############################################################
-    print("Reading diamond output file...")
-    dmd = read_dmd(diamond_output)
-    ############################################################
-    # filter by quality and taxa
-    ############################################################
-    print("Filtering by quality and taxa...")
-    filtered_dmd = dmd_filter(dmd, filterby=filterby)
-    filtered_dmd = collect_taxid(filtered_dmd)
-    filtered_dmd['uniref_id'] = [i[0] for i in filtered_dmd.uniref_seq.str.split(" ",1)]
-
-    ############################################################
-    # garbage collection
-    ############################################################
-    print("Garbage collection...")
-    del dmd
-    import gc
-    gc.collect()
-
-    ############################################################
-    # retrieve UniRef100 representative taxa and its members
-    ############################################################
-    print("Retrieving UniRef100 representative taxa and members...")
-    unique_unirefs = set(filtered_dmd.uniref_id.drop_duplicates())
-    taxa_member_tbl = get_taxa_members(ncbi_taxa_folder / 'uniref100_member_taxa_tbl.csv',
-                                       unique_unirefs)
-
-    ############################################################
-    # merge
-    ############################################################
-    print("Adding taxa members...")
-    merged = filtered_dmd.merge(taxa_member_tbl, left_on="uniref_id", right_on="uid", how="inner")
-    print("  Final table size:{}".format(merged.shape))
-
-    if merged.shape[0]!=filtered_dmd.shape[0]:
-        print("[WARN] You might use the different version of UniRef100.fasta and .xml")
-
-    # get the member taxa of each (scan, uid)
-    unique_members = []
-    scanids = merged.scans.tolist()
-    uids = merged.uid.tolist()
-    commons = merged.common_taxa.tolist()
-    for ii, members in enumerate(merged.members.str.split(":").tolist()):
-        for mm in members:
-            unique_members.append({"scan":scanids[ii], "uid":uids[ii], "member_taxa":int(mm), "common_taxa":int(commons[ii])})
-    print("  #members:{}".format(len(unique_members)))
-    members_df = pd.DataFrame(unique_members)
+# @profile
+def run_diamond_tally(diamond_output, ntops, ncbi_taxa_folder, mode, fout, pident, n_protein_cutoff):
+    nprot = '{:.5e}'.format(n_protein_cutoff)
+    detailed_fout = fout.parent / (re.sub("_kaiko_prediction_.*.csv$", "", fout.name) + '_detailed.csv')
+    taxa_stats_path = Path(PureWindowsPath('Kaiko_volume/Kaiko_stationary_files/uniref100_stats.txt'))
+    taxa_stats = pd.read_csv(taxa_stats_path, sep = '\t')
 
-    ############################################################
-    # top-rank taxa
-    ############################################################
-    print("Filtering top-rank taxa by hits...")
-    if mode=="member":
+    if mode=="member":
         taxa_col = 'member_taxa'
     elif mode=="common":
         taxa_col = 'common_taxa'
     else:
         print(" [ERR] please select the mode to be either 'member' or 'common'.")
         return
 
-    unique_pepseq_taxa = members_df.drop_duplicates(subset=['scan',taxa_col])
+    if not detailed_fout.exists():
+
+        filterby={}
+        if pident: filterby['pident'] = float(pident)
+        # # if FLAGS.evalue: filterby['evalue'] = FLAGS.evalue
+        # # if FLAGS.mismatch: filterby['mismatch'] = FLAGS.mismatch
+        print('filterby:', filterby)
+
+        ############################################################
+        # read diamond output file
+        ############################################################
+        print("Reading diamond output file...")
+        dmd = read_dmd(diamond_output)
+
+        ############################################################
+        # filter by quality and taxa
+        ############################################################
+        print("Filtering by quality and taxa...")
+        filtered_dmd = dmd_filter(dmd, filterby=filterby)
+        filtered_dmd = collect_taxid(filtered_dmd)
+        filtered_dmd['uniref_id'] = [i[0] for i in filtered_dmd.uniref_seq.str.split(" ",1)]
+
+        ############################################################
+        # garbage collection
+        ############################################################
+        print("Garbage collection...")
+        del dmd
+        import gc
+        gc.collect()
+
+        ############################################################
+        # retrieve UniRef100 representative taxa and its members
+        ############################################################
+        print("Retrieving UniRef100 representative taxa and members...")
+        unique_unirefs = set(filtered_dmd.uniref_id.drop_duplicates())
+        taxa_member_tbl = get_taxa_members(ncbi_taxa_folder / 'uniref100_member_taxa_tbl.csv',
+                                           unique_unirefs)
+
+        ############################################################
+        # merge
+        ############################################################
+        print("Adding taxa members...")
+        merged = filtered_dmd.merge(taxa_member_tbl, left_on="uniref_id", right_on="uid", how="inner")
+        print("  Final table size:{}".format(merged.shape))
+
+        if merged.shape[0]!=filtered_dmd.shape[0]:
+            print("[WARN] You might use the different version of UniRef100.fasta and .xml")
+
+        # get the member taxa of each (scan, uid)
+        unique_members = []
+        scanids = merged.scans.tolist()
+        uids = merged.uid.tolist()
+        commons = merged.common_taxa.tolist()
+        for ii, members in enumerate(merged.members.str.split(":").tolist()):
+            for mm in members:
+                unique_members.append({"scan":scanids[ii], "uid":uids[ii], "member_taxa":int(mm), "common_taxa":int(commons[ii])})
+        print("  #members:{}".format(len(unique_members)))
+        members_df = pd.DataFrame(unique_members)
+
+        ############################################################
+        # top-rank taxa
+        ############################################################
+        print("Filtering top-rank taxa by hits...")
+
+        detailed_output = members_df
+        detailed_output["protein"] = [re.sub("UniRef100_", "", detailed_output.uid[i]) for i in range(0, len(detailed_output))]
+        detailed_output = detailed_output[['scan', 'uid', 'protein', 'member_taxa', 'common_taxa']]
+        detailed_output = detailed_output.merge(taxa_stats, left_on = taxa_col, right_on = 'taxid', how = 'left')
+        detailed_output.to_csv(detailed_fout, index = False)
+    else:
+        print("Loading |scan|protein|taxa| table" + detailed_fout.name + "\n")
+        detailed_output = pd.read_csv(detailed_fout)
+        detailed_output = detailed_output[['scan', 'uid', 'protein', 'member_taxa', 'common_taxa']]
+        detailed_output = detailed_output.merge(taxa_stats, left_on = taxa_col, right_on = 'taxid', how = 'left')
+
+    detailed_output = detailed_output[detailed_output['n_protein'] < n_protein_cutoff]
+    unique_pepseq_taxa = detailed_output.drop_duplicates(subset=['scan',taxa_col])
     pepcount_taxid = unique_pepseq_taxa[taxa_col].value_counts()
-    print('  unique peptides: {}'.format(members_df.scan.value_counts().shape[0]))
+    print('  unique peptides: {}'.format(detailed_output.scan.value_counts().shape[0]))
     print('  unique taxa: {}'.format(pepcount_taxid.shape[0]))
 
     def besthit(row):
@@ -105,37 +119,17 @@ def besthit(row):
         _ntops = besthits.shape[0]
 
     top_taxa = besthits.nlargest(_ntops, keep='all').to_frame(name='hits')
+    top_taxa['taxid'] = top_taxa.index
 
     ############################################################
     # save top-rank taxa info
     ############################################################
     print("Saving top-rank taxa info...")
-    df = pd.concat([taxa_table, top_taxa], join='inner', axis=1)
-    df.index.name = 'tax_id'
-    if fout: df.sort_values(by="hits", ascending=False).to_csv(fout)
-
-# taxon info
-def read_ncbi_taxa_lineage(rankedlineage_file, nodes_file):
-    taxon_info = pd.read_csv(rankedlineage_file, sep='|', header=None,
-                             names=['tax_id','tax_name','species','genus','family','order',
-                                    'class','phylum','kingdom','superkingdom','null'])
-    taxon_info = taxon_info.replace('\t\t', np.nan)
-    for col in ['tax_name','species','genus','family','order','class','phylum','kingdom','superkingdom']:
-        taxon_info[col] = taxon_info[col].str.extract(r'^\t(.*)\t$')[0].tolist()
-        # taxon_info[col] = taxon_info[col].str.extract(r'^\t(.*)\t$').tolist()
-    del taxon_info['null']
-    print('  taxon_info:', taxon_info.shape, taxon_info.tax_id.drop_duplicates().shape)
-
-    rank = pd.read_csv(nodes_file, sep='|', usecols=[0,2], header=None,
-                       names=['tax_id','rank'])
-    rank = rank.replace('\t\t', np.nan)
-    rank['rank'] = rank['rank'].str.extract(r'^\t(.*)\t$')[0].tolist()
-    # rank['rank'] = rank['rank'].str.extract(r'^\t(.*)\t$').tolist()
-    print('  rank:', rank.shape, rank.tax_id.drop_duplicates().shape)
-
-    final_df = taxon_info.merge(rank, left_on='tax_id', right_on='tax_id')
-    print('  final_df:', final_df.shape, final_df.tax_id.drop_duplicates().shape)
-    return final_df.set_index('tax_id')
+    # df = pd.concat([top_taxa, taxa_stats], join='inner', axis=1)
+    df = top_taxa.merge(taxa_stats[['taxid', 'tax_name', 'rank', 'n_protein']], left_on = 'taxid', right_on = 'taxid')
+    df = df[['taxid', 'tax_name', 'rank', 'hits', 'n_protein']]
+    df.index.name = 'taxid'
+    if fout: df.sort_values(by="hits", ascending=False).to_csv(fout, index = False)
 
 def read_dmd(diamond_output):
     dmd_colnames = ['scans','uniref_seq','pident','evalue','mismatch']
@@ -207,3 +201,15 @@ def get_taxa_members(member_tbl_file, unique_unirefs):
 
     return df
 
+# prefix = "S1_NM0001_NMDC_MixedCommunities_R3_mgf"
+# diamond_search_out = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_diamond_search_output.dmd")
+# kaiko_tally = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_kaiko_prediction_top_taxa.csv")
+# ncbi_taxa_folder = Path(PureWindowsPath("Kaiko_volume/Kaiko_stationary_files/ncbi_taxa").as_posix())
+
+# run_diamond_tally(diamond_search_out,
+#                   -1,
+#                   ncbi_taxa_folder,
+#                   "member",
+#                   kaiko_tally,
+#                   100,
+#                   150000)
\ No newline at end of file
diff --git a/Kaiko_4.py b/Kaiko_4.py
index a644329..cdb418c 100644
--- a/Kaiko_4.py
+++ b/Kaiko_4.py
@@ -25,7 +25,7 @@ def aggregate_fasta(ref_fasta, diamon_tally, fout, ntops, taxa_key, kingdom_list
 
     print(tdf.head(20))
 
-    taxids = set(tdf.tax_id.tolist())
+    taxids = set(tdf.taxid.tolist())
     print("NCBI-TaxIDs:", taxids)
 
     key_for_taxid = '{}='.format(taxa_key)
@@ -64,6 +64,48 @@ def aggregate_fasta(ref_fasta, diamon_tally, fout, ntops, taxa_key, kingdom_list
 
     ofile.close()
 
+# def write_proteins(database_file, taxa, fout):
+#     index_s_path = Path('Kaiko_volume/Kaiko_stationary_files/uniref100_index_s.txt')
+#     index_path = Path('Kaiko_volume/Kaiko_stationary_files/uniref100_index.txt')
+#     index_s = pd.read_csv(index_s_path, sep = '\t', header = None)
+#     index_s.index = index_s[0]
+#     index_pos = int(index_s.loc[f'taxid_{taxa}_positions'][1])
+#     index = index_path.open('r')
+#     index.seek(index_pos)
+#     protein_pos = index.readline()
+#     protein_pos = protein_pos.split(';;')[1].split(';')[:-1]
+
+#     with fout.open('a') as output_fasta:
+#         for pos in protein_pos:
+#             database_file.seek(int(pos))
+
+#             line = database_file.readline()
+#             line = line.decode("utf-8")
+
+#             reading_protein = True
+#             while reading_protein:
+#                 output_fasta.write(line)
+#                 line = database_file.readline()
+#                 if line[0] == 62: # to find `>`
+#                     reading_protein = False
+#                 else:
+#                     line = line.decode("utf-8")
+
+
+# def write_single_protein(database_file, pos, output_fasta):
+#     database_file.seek(pos)
+
+#     line = database_file.readline()
+#     line = line.decode("utf-8")
+
+#     reading_protein = True
+#     while reading_protein:
+#         output_fasta.write(line)
+#         line = database_file.readline()
+#         if line[0] == 62: # to find `>`
+#             reading_protein = False
+#         else:
+#             line = line.decode("utf-8")
 
 def rank_to_lineage(df):
     coverage = {}
@@ -77,37 +119,23 @@ def rank_to_lineage(df):
 
 EXCLUDED_RANKS = ['family','order','class','phylum','kingdom','superkingdom']
 
-###############################################################
-# parser = argparse.ArgumentParser()
-
-# parser.add_argument(
-#     '--taxafile', type=str, required=True,
-#     help='top-rank taxa file.')
-
-# parser.add_argument(
-#     '--ref_fasta', type=str, required=True,
-#     help='reference fasta file')
-
-# parser.add_argument(
-#     '--fout', type=str, required=True,
-#     help='output fasta file.')
-
-# parser.add_argument(
-#     '--key', type=str, default="TaxID",
-#     help='key value for the taxa ID. `TaxID` for UniRef fasta and `OX` for UniProtKB fasta')
-
-# parser.add_argument('-l','--list', nargs='+', required=False)
-
-# # parser.add_argument('-r','--rank', nargs='+', required=True)
-
-# parser.add_argument(
-#     '--ntops', type=int, default=5, required=True,
-#     help='top n taxa. -1 for all')
-# FLAGS = parser.parse_args()
-# print(FLAGS)
+# from pathlib import Path, PureWindowsPath
 
-###############################################################
+# prefix = "S1_NM0001_NMDC_MixedCommunities_R3_mgf"
+# diamond_search_out = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_diamond_search_output.dmd")
+# kaiko_tally = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_kaiko_prediction_top_taxa.csv")
+# ncbi_taxa_folder = Path(PureWindowsPath("Kaiko_volume/Kaiko_stationary_files/ncbi_taxa").as_posix())
+# nprot = '{:.5e}'.format(int(300000))
+# kaiko_tally = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_kaiko_prediction" + f'_top_taxa_nprot_{nprot}.csv')
+# ref_fasta = Path(PureWindowsPath('Kaiko_volume/Kaiko_stationary_files/uniref100.fasta.gz').as_posix())
+# kaiko_final_output = Path("Kaiko_volume/Kaiko_output/" + prefix + "_kaiko_output.fasta")
 
+# aggregate_fasta(ref_fasta,
+#                 kaiko_tally,
+#                 kaiko_final_output,
+#                 5,
+#                 'TaxID',
+#                 [])
\ No newline at end of file
diff --git a/Kaiko_parse_uniref.py b/Kaiko_parse_uniref.py
new file mode 100644
index 0000000..7fb9a2d
--- /dev/null
+++ b/Kaiko_parse_uniref.py
@@ -0,0 +1,207 @@
+import gzip
+import time
+import re
+import gc
+
+import pandas as pd
+import numpy as np
+
+from pathlib import Path, PureWindowsPath
+
+## Moving this function here, as the lineage info can be included in the taxa stats file.
+## taxon info
+def read_ncbi_taxa_lineage(rankedlineage_file, nodes_file):
+    taxon_info = pd.read_csv(rankedlineage_file, sep='|', header=None,
+                             names=['tax_id','tax_name','species','genus','family','order',
+                                    'class','phylum','kingdom','superkingdom','null'])
+    taxon_info = taxon_info.replace('\t\t', np.nan)
+    for col in ['tax_name','species','genus','family','order','class','phylum','kingdom','superkingdom']:
+        taxon_info[col] = taxon_info[col].str.extract(r'^\t(.*)\t$')[0].tolist()
+        # taxon_info[col] = taxon_info[col].str.extract(r'^\t(.*)\t$').tolist()
+    del taxon_info['null']
+    print('  taxon_info:', taxon_info.shape, taxon_info.tax_id.drop_duplicates().shape)
+
+    rank = pd.read_csv(nodes_file, sep='|', usecols=[0,2], header=None,
+                       names=['tax_id','rank'])
+    rank = rank.replace('\t\t', np.nan)
+    rank['rank'] = rank['rank'].str.extract(r'^\t(.*)\t$')[0].tolist()
+    # rank['rank'] = rank['rank'].str.extract(r'^\t(.*)\t$').tolist()
+    print('  rank:', rank.shape, rank.tax_id.drop_duplicates().shape)
+
+    final_df = taxon_info.merge(rank, left_on='tax_id', right_on='tax_id')
+    print('  final_df:', final_df.shape, final_df.tax_id.drop_duplicates().shape)
+    return final_df.set_index('tax_id')
+
+## Part 1 of the index process.
+## This makes a partial index. Takes a few hours. A complete index in one pass would take too much time.
+## We are keeping track of the location of each protein in the FASTA file, and the taxa it belongs to.
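+##
+## Example of one flushed line of the partial index, as written by the loop below
+## (accession, offsets, and lengths here are hypothetical, for illustration only):
+##   taxid_562<TAB>A0A023GPI8;0;512:A0A023GPJ0;75019;287
+## i.e. an NCBI taxid followed by colon-separated protein_id;byte_offset_in_fasta;sequence_length triplets.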
+def gather_taxa_stats_1(ref_fasta, fout, taxa_key = 'TaxID', protein_key = '>UniRef100_'):
+    """
+    python -u ExtractProteinSequenceByTaxID.py --ref_fasta uniref100.fasta.gz --taxafile Weintraub_kaiko_25p_UniRef100_toptaxa.csv --fout Weintraub_kaiko_25p_UniRef100_Bacteria_top100.fasta --ntops 100 -l Bacteria --key TaxID
+    """
+
+    key_for_taxid = '{}='.format(taxa_key)
+    print("Key for parsing Tax IDs:", key_for_taxid)
+    # ofile = open(fout, 'w')
+    ofile = fout.open('w')
+    ofile.write('protein_id' + '\t' + 'taxid' + '\t' + 'seq_length' + '\n')
+
+    num_seqs = 0
+    start_time = time.time()
+
+    with gzip.open(ref_fasta, 'rb') as file:
+        try:
+            reading_protein = False
+            current_position = 0
+            protein_position = 0
+            output = open("UniRef100_index.txt", "w")
+            universe = {}
+            for bline in file:
+
+                # if line.startswith('>UniRef'):
+                if bline[0] == 62: # to find `>`
+                    if reading_protein:
+                        if taxid in universe.keys():
+                            universe[taxid] = universe[taxid] + f':{protein_id};{protein_position};{seq_length}'
+                        else:
+                            universe[taxid] = f'{protein_id};{protein_position};{seq_length}'
+                        if (num_seqs % 100000) == 0:
+                            for k, v in universe.items():
+                                output.write(f'taxid_{k}\t{v}\n')
+                            print("{}M sequences has been parsed. {:.1f}min".format(num_seqs//1e6, (time.time()-start_time)/60))
+                            universe = {}
+                    protein_position = current_position
+                    seq_length = 0
+                    num_seqs += 1
+                    line = bline.decode("utf-8")
+                    taxid = line.split(key_for_taxid)[1].split(' ')[0]
+                    protein_id = line.split(protein_key)[1].split(' ')[0]
+                    if taxid in ["", "N/A"]:
+                        reading_protein = False
+                    else:
+                        reading_protein = True
+                else:
+                    if reading_protein:
+                        seq_length += (len(bline) - 1)
+                current_position += len(bline)
+
+        except Exception as e:
+            print(line, e)
+
+    output.close()
+
+def rank_to_lineage(df):
+    coverage = {}
+    for c in ['species','genus','family','order','class','phylum','kingdom','superkingdom']:
+        idx = df['rank']==c
+        df.loc[idx, c] = df.loc[idx, 'tax_name']
+    for c in ['species','genus','family','order','class','phylum','superkingdom']:
+        coverage[c] = 100*df[c].dropna().shape[0]/df.shape[0]
+        print("{}: {:.2f}%".format(c, coverage[c]))
+    return df, coverage
+
+
+## Part two of the index process. This makes the complete index from the partial index above. Takes about ~10 mins.
+## The result is:
+# taxid            protein1;protein2;...
+# taxid_position   pos1;pos2;...
+# taxid_size       (n_proteins, n_AA)
+def gather_taxa_stats_2(parse1_output, parse2_output):
+    start_time = time.time()
+    universe = dict()
+    index = 0
+    with parse1_output.open('r') as file:
+        for line in file:
+            index = index + 1
+            taxid = line.split('_')[1].split('\t')[0]
+            if not taxid in universe.keys():
+                universe[taxid] = 'uniref100;;'
+                ## first coordinate is protein number, second is total protein length (fasta size)
+                universe[taxid + '_positions'] = 'uniref100.fasta;;'
+                universe[taxid + '_size'] = (0, 0)
+            info_list = re.split('[: ;]', line.split('_')[1].split('\t')[1])
+
+            # protein is index of the form 3n (start at zero)
+            # protein position in fasta is 3n + 1
+            # protein length is 3n + 2
+            num_proteins = len(info_list)//3
+            new_proteins = [info_list[3*i] for i in range(num_proteins)]
+            prot_pos = [info_list[3*i+1] for i in range(num_proteins)]
+            new_len = sum([int(info_list[3*i+2]) for i in range(num_proteins)])
+            new_proteins = ';'.join(new_proteins)
+            universe[taxid] = universe[taxid] + new_proteins + ';'
+            universe[taxid + '_size'] = (universe[taxid + '_size'][0] + num_proteins, universe[taxid + '_size'][1] + new_len)
+            universe[taxid + '_positions'] = universe[taxid + '_positions'] + ';'.join(prot_pos) + ';'
+            if (index % 100000) == 0:
+                print("{} total partial taxa found. {:.1f}min".format(len(universe.keys())//3, (time.time()-start_time)/60))
+    with parse2_output.open('w') as file:
+        for k, v in universe.items():
+            file.write(f'taxid_{k}\t{v}\n')
+
+## The index is quite large on its own.
+## To speed up parsing and allow for fast and dynamic making of FASTA files,
+## we make an index for the index. Yes.
+def gather_taxa_stats_3(parse2_output, parse3_output):
+    index = 0
+    pos = 0
+    with parse2_output.open('r') as file:
+        output = parse3_output.open("w")
+        for line in file:
+            if index % 3 == 0:
+                taxid = line.split('\t')[0]
+                output.write(f'{taxid}\t{pos}\n')
+            ## have to add +1 to account for the \n character.
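+            ## (Added note, hedged: `pos` is the byte offset of the line just indexed, so each row of
+            ## uniref100_index_s.txt maps a key such as taxid_562 or taxid_562_positions to its offset in
+            ## uniref100_index.txt; gather_taxa_stats_4 below and the commented-out write_proteins() in
+            ## Kaiko_4.py can then seek() straight to the row they need instead of scanning the whole file.)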
+            pos = pos + len(line) + 1
+
+## Get taxa stats in an easy to open table
+def gather_taxa_stats_4(parse2_output, parse3_output, stats_output):
+    index_file = parse2_output.open('r')
+    output = stats_output.open('w')
+    output.write('taxid\tn_protein\tn_AA\n')
+    with parse3_output.open('r') as file:
+        for line in file:
+            line_start = line.split('\t')[0].split('_')
+            if len(line_start) > 2:
+                taxid = line_start[1]
+                if line_start[2] == "size":
+                    pos = int(line.split('\t')[1])
+                    index_file.seek(pos)
+                    xx = index_file.readline()
+                    xx = xx.split('\t')[1]
+                    n_protein = re.sub(r'\(([0-9]+), ([0-9]+)\)\n$', "\\1", xx)
+                    n_AA = re.sub(r'\(([0-9]+), ([0-9]+)\)\n$', "\\2", xx)
+                    output.write(f'{taxid}\t{n_protein}\t{n_AA}\n')
+
+def gather_taxa_stats_5(stats_output, ncbi_taxa_folder):
+    ############################################################
+    # retrieve taxa lineage info such as taxa rank and lineage
+    ############################################################
+    print("Retrieving taxa lineage info...")
+    taxa_table = read_ncbi_taxa_lineage(ncbi_taxa_folder / 'rankedlineage.dmp',
+                                        ncbi_taxa_folder / 'nodes.dmp')
+    taxa_stats = pd.read_csv(stats_output, sep = '\t')
+    print(len(taxa_stats))
+    taxa_stats = taxa_stats.merge(taxa_table, left_on="taxid", right_on="tax_id", how="left")
+    taxa_stats = taxa_stats[['taxid', 'tax_name', 'rank', 'n_protein', 'n_AA']]
+    taxa_stats.to_csv(stats_output, sep = '\t', index = False)
+
+
+
+parse1_output = Path(PureWindowsPath('Kaiko_volume/Kaiko_stationary_files/uniref100_index_intermediate.txt'))
+parse2_output = Path(PureWindowsPath('Kaiko_volume/Kaiko_stationary_files/uniref100_index.txt'))
+parse3_output = Path(PureWindowsPath('Kaiko_volume/Kaiko_stationary_files/uniref100_index_s.txt'))
+stats_output = Path(PureWindowsPath('Kaiko_volume/Kaiko_stationary_files/uniref100_stats.txt'))
+ref_fasta = Path(PureWindowsPath('Kaiko_volume/Kaiko_stationary_files/uniref100.fasta.gz'))
+ncbi_taxa_folder = Path(PureWindowsPath('Kaiko_volume/Kaiko_stationary_files/ncbi_taxa').as_posix())
+
+# gather_taxa_stats_1(ref_fasta, parse1_output)
+
+# gc.collect()
+# gather_taxa_stats_2(parse1_output, parse2_output)
+
+# gc.collect()
+# gather_taxa_stats_3(parse2_output, parse3_output)
+
+gather_taxa_stats_4(parse2_output, parse3_output, stats_output)
+
+gather_taxa_stats_5(stats_output, ncbi_taxa_folder)
\ No newline at end of file
diff --git a/Kaiko_pipeline_main.py b/Kaiko_pipeline_main.py
index 0d50c81..3d3d6ad 100644
--- a/Kaiko_pipeline_main.py
+++ b/Kaiko_pipeline_main.py
@@ -55,10 +55,11 @@ print("DeNovo: Running the following command:\n")
 for i in range(len(kaiko_1_args)):
     kaiko_1_args[i] = str(kaiko_1_args[i])
 
-print(" ".join(kaiko_1_args) + "\n")
-subprocess.run(kaiko_1_args, cwd = "Kaiko_denovo")
-# subprocess.call(kaiko_1_args, cwd = "Kaiko_denovo")
+if not config['denovo']['cached']:
+    print(" ".join(kaiko_1_args) + "\n")
+    subprocess.run(kaiko_1_args, cwd = "Kaiko_denovo")
+    # subprocess.call(kaiko_1_args, cwd = "Kaiko_denovo")
 
 if (config['denovo'])['profile']:
     profiler = cProfile.Profile()
@@ -84,18 +85,17 @@ diamond_search_out.resolve().as_posix(),
                 "-f", "6", "qseqid", "stitle", "pident", "evalue", "mismatch"]
 
+if not config['diamond tally']['cached']:
+    print("DeNovo: Running the following command:\n")
+    print(" ".join(diamond_args) + "\n")
+    # os.chdir("Kaiko_volume/Kaiko_stationary_files/diamond-2.0.15")
+    os.chdir(diamond_folder)
+    print(os.getcwd())
+    os.system(" ".join(diamond_args))
+    os.chdir(working_dir)
 
-print("DeNovo: Running the following command:\n")
-print(" ".join(diamond_args) + "\n")
-
-
-# os.chdir("Kaiko_volume/Kaiko_stationary_files/diamond-2.0.15")
-os.chdir(diamond_folder)
-print(os.getcwd())
-os.system(" ".join(diamond_args))
-os.chdir(working_dir)
-
-kaiko_tally = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_kaiko_prediction_top_taxa.csv")
+nprot = '{:.5e}'.format(int(config['diamond tally']['n_protein_cutoff']))
+kaiko_tally = Path("Kaiko_volume/Kaiko_intermediate/" + prefix + "_kaiko_prediction" + f'_top_taxa_nprot_{nprot}.csv')
 
 # Step 4. Tallying the diamond results
 run_diamond_tally(diamond_search_out,
@@ -103,12 +103,12 @@ ncbi_taxa_folder,
                   config['diamond tally']['mode'],
                   kaiko_tally,
-                  float(config['diamond tally']['pident']))
+                  float(config['diamond tally']['pident']),
+                  int(config['diamond tally']['n_protein_cutoff']))
 
 
 ## Step 5. Putting together the final fasta file.
-
 if config['taxa to fasta']['kingdom_list'] != "":
     kingdom_list = config['taxa to fasta']['kingdom_list'].split(', ')
     print(kingdom_list)
diff --git a/Parsing.py b/Parsing.py
index f28b618..1e3a370 100644
--- a/Parsing.py
+++ b/Parsing.py
@@ -13,6 +13,7 @@
                     'profile' : False,
                     'beam_size' : 5,
                     'mgf_dir' : 'Kaiko_volume/Kaiko_input_files/',
+                    'cached' : False,
                     # 'train_dir' : 'model/'
                     # 'decode_dir' : 'Kaiko_volume/Kaiko_intermediate/denovo_output/',
                     }
@@ -24,7 +25,9 @@
                     'mode' : 'member',
                     # 'fout' : 'Kaiko_volume/Kaiko_intermediate/kaiko_prediction_top_taxa.csv',
                     # 'diamond_output' : "Kaiko_volume/Kaiko_intermediate/denovo_output/diamond_search_output.dmd",
-                    'pident' : 100}
+                    'n_protein_cutoff' : 300000,
+                    'pident' : 100,
+                    'cached' : False}
 
 
 config['taxa to fasta'] = {'ref_fasta' : "Kaiko_volume/Kaiko_stationary_files/uniref100.fasta.gz",
diff --git a/README.md b/README.md
index 99e73e5..d9fefc7 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,8 @@
 
 Put simply, this tool takes raw proteomic input and outputs a FASTA file of those organisms most likely to be present in the proteomic input.
 
-The pipeline uses a neural network to identify peptide sequences from raw proteomic input, which are then aligned against all protein sequences using a diamond search. This offers us a view of those organisms most likely to be present in the proteomic samples, with which we make a FASTA file from the most likely organisms identified.
+The pipeline uses neural networks to identify peptide sequences from raw proteomic input, which are then aligned against all protein sequences using a diamond search. This offers us a view of those organisms most likely to be present in the proteomic samples, with which we make a FASTA file from the most likely organisms identified.
 
-This is the pipeline discussed in the following paper: https://pubs.acs.org/doi/full/10.1021/acs.jproteome.2c00334
 
 ## Setup
 
@@ -24,8 +23,6 @@ Download the following files to the ```Kaiko_volume/Kaiko_stationary_files``` fo
 
 4) [Diamond search](https://github.com/bbuchfink/diamond/releases), choosing the appropriate system. If using Docker, get the Linux version.
 
-5) Denovo model: Within the ```Kaiko_denovo/model``` folder, open the ```get_data.sh``` script to download the trained model.
-
 ### Processing
 
 1) Extract the diamond file from step 4 into its own folder within ```Kaiko_volume/Kaiko_stationary_files```, eg ```Kaiko_volume/Kaiko_stationary_files/diamond```.
diff --git a/kaiko_defaults.yaml b/kaiko_defaults.yaml
index 0b7e717..616722c 100644
--- a/kaiko_defaults.yaml
+++ b/kaiko_defaults.yaml
@@ -1,13 +1,16 @@
 denovo:
   beam_search: true
   beam_size: 5
+  cached: false
   mgf_dir: Kaiko_volume/Kaiko_input_files/
   multi_decode: true
   profile: false
   topk: false
 
 diamond tally:
+  cached: false
   diamond_folder: Kaiko_volume/Kaiko_stationary_files/diamond
   mode: member
+  n_protein_cutoff: 300000
   ncbi_taxa_folder: Kaiko_volume/Kaiko_stationary_files/ncbi_taxa
   ntops: -1
   pident: 100
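Note: a minimal sketch of how the two new config keys are consumed, mirroring the guards added in Kaiko_pipeline_main.py. This is not code from the repo; it assumes PyYAML is installed and kaiko_defaults.yaml is in the working directory. `cached: true` skips a step and reuses its existing intermediate output, while `n_protein_cutoff` is passed to run_diamond_tally and embedded in the tally file name.

    import yaml

    with open("kaiko_defaults.yaml") as fh:
        config = yaml.safe_load(fh)

    # same gating pattern as the new `if not config[...]['cached']:` blocks in the pipeline
    if not config["denovo"]["cached"]:
        print("de novo step would run here")
    if not config["diamond tally"]["cached"]:
        print("diamond search would run here")

    # cutoff handed to run_diamond_tally(..., n_protein_cutoff) and used in the tally file name
    n_protein_cutoff = int(config["diamond tally"]["n_protein_cutoff"])
    print(n_protein_cutoff)  # 300000 with the defaults above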