-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathParsing Enzyme KEGG.cpp
138 lines (122 loc) · 2.99 KB
/
Parsing Enzyme KEGG.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
// Parsing Enzyme KEGG.cpp : Defines the entry point for the console application.
//
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <vector>
#include "linkedlist.h"
using namespace std;
// Reports whether `piece` occurs anywhere inside `str`.
//
// Parameters:
//   str   - the text to search in.
//   piece - the text to look for.
// Returns:
//   true if `piece` is a contiguous substring of `str` (at the start, in the
//   middle, or at the end); false otherwise. An empty `piece` is considered
//   to be contained in every string, including the empty string.
bool Find(const std::string& str, const std::string& piece)
{
	// std::string::find searches every starting offset and compares exactly
	// piece.size() characters. A hand-rolled substr(i, i + L) would pass an
	// end index where substr expects a length, and so would miss any match
	// that is neither a prefix nor a suffix of str.
	return str.find(piece) != std::string::npos;
}
// Computes a polynomial hash of `word` and reduces it to a bucket index.
//
// The hash is sum(byte[i] * 26^i) taken modulo the prime 22543, and the
// result is then reduced modulo `m`.
//
// Parameters:
//   word - the key to hash; every byte of the string contributes.
//   m    - the number of buckets; must be non-zero (it is used as a divisor).
// Returns:
//   A value in [0, m) for positive `m`.
long long ComputeHash(const std::string& word, int m)
{
	const long long p = 22543; // prime modulus of the polynomial hash
	const long long x = 26;    // polynomial base

	long long hash = 0;
	long long x_powered = 1;   // x^i mod p for the current position i
	for (const char ch : word)
	{
		// Hash the byte as unsigned: a plain char may be signed, and a
		// negative term would make the final result a negative index.
		const long long code = static_cast<unsigned char>(ch);
		// Both factors are below p, so the product stays far inside the
		// range of long long.
		hash = (hash + code * x_powered) % p;
		x_powered = (x_powered * x) % p;
	}
	return hash % m;
}
int main()
{
vector <LinkedList> Hash_Table;
vector < pair<long long, string> > Gene_Bank;
vector <string> Enzyme_File;
vector <string> Gene_Names;
vector <int> Enzyme_Hash;
string line;
string temp;
string tempone;
string temptwo;
string tempthree;
int count = 0;
ifstream myfile("ko01000.txt");
if (myfile.is_open())
{
while (getline(myfile, line))
{
stringstream ss(line);
getline(ss, temp, ' ');
if (temp == "E")
{
getline(ss, temp, ' ');
getline(ss, temp, ' ');
getline(ss, temp, ' ');
getline(ss, temp, ' ');
getline(ss, temp, ' ');
getline(ss, temp, ' ');
getline(ss, temp, ' ');
getline(ss, temp, ' ');
getline(ss, tempone, ' ');
getline(ss, tempone, ':');
stringstream sq(tempone);
getline(sq, temptwo, ';');
getline(ss, tempthree, ']');
Enzyme_File.push_back(tempthree);
Gene_Names.push_back(temptwo);
count++;
}
}
myfile.close();
}
cout << "P0" << endl;
for (int i = 0; i < Gene_Names.size(); i++)
{
string copy = Gene_Names[i];
stringstream ss(copy);
getline(ss, temp, ',');
Gene_Names[i] = temp;
}
cout << "P1" << endl;
Hash_Table.resize(count);
for (int i = 0; i < Enzyme_File.size(); i++)
{
//construct hash table for enzymes using n.m.l.k string as hash value
if (!Find(Enzyme_File[i], "-"))
{
long long hash = ComputeHash(Enzyme_File[i], count);
Hash_Table[hash].push_front(Gene_Names[i], Enzyme_File[i]);
Enzyme_Hash.push_back(hash);
}
}
cout << "P1.5" << endl;
for (int i = 0; i < Enzyme_Hash.size(); i++)
{
Gene_Bank.push_back(make_pair(Enzyme_Hash[i], Gene_Names[i]));
}
cout << "P2" << endl;
ofstream output_file("Gene Bank.txt");
//upload hash table
if (output_file.is_open())
{
for (int i = 0; i < Hash_Table.size(); i++)
{
output_file << i << ":";
for (int j = 0; j < Hash_Table[i].size(); j++)
{
output_file << Hash_Table[i].top_front() << " " << Hash_Table[i].name() << ",";
Hash_Table[i].pop_front();
}
output_file << endl;
}
output_file.close();
}
return 0;
}