-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdatabase.py
349 lines (294 loc) · 11.7 KB
/
database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
#!/bin/python
import os
import mysql.connector as database
def databasemaken(cursor, connection):
"""
Creates the tables for the database in SQL command format.
Desired values for the tables are given and are put in a list.
Looping through all SQL tables in the list, the tables will be created with
cursor.execute(...).
This function is not called when the data tables already exist!
:param cursor: a variable to communicate with the database.
:param connection: a variable to communicate with the database.
:return: N/A
"""
sql_lijst = []
sql1 = """
CREATE TABLE CANCER (
cancer_id VARCHAR(25) PRIMARY KEY,
name TEXT
);
"""
sql_lijst.append(sql1)
sql2 = """
CREATE TABLE SAMPLE (
sample_id VARCHAR(50) PRIMARY KEY,
cancer_id VARCHAR(25) REFERENCES CANCER(cancer_id),
normal BOOLEAN,
tumor BOOLEAN);
"""
sql_lijst.append(sql2)
sql3 = """
CREATE TABLE GENES (
gene_id INT PRIMARY KEY,
gene_name VARCHAR(30));
"""
sql_lijst.append(sql3)
sql4 = """
CREATE TABLE GENE_EXP (
gene_id INT REFERENCES GENES(gene_id),
sample_id VARCHAR(50) REFERENCES SAMPLE(sample_id),
gene_expression FLOAT);
"""
sql_lijst.append(sql4)
sql5 = """
CREATE TABLE COUNTS (
substitution VARCHAR(2),
sample_id VARCHAR(50) REFERENCES SAMPLE(sample_id),
count INT);
"""
sql_lijst.append(sql5)
sql6 = """
CREATE TABLE CLUSTER_NORMAL (
cancer_id VARCHAR(25) REFERENCES CANCER(cancer_id),
gene_id INT REFERENCES GENES(gene_id),
wf_up FLOAT,
wf_down FLOAT,
wy_up FLOAT,
wy_down FLOAT,
num_up FLOAT,
num_down FLOAT);
"""
sql_lijst.append(sql6)
sql7 = """
CREATE TABLE CLUSTER_TUMOR (
cancer_id VARCHAR(25) REFERENCES CANCER(cancer_id),
gene_id INT REFERENCES GENES(gene_id),
wf_up FLOAT,
wf_down FLOAT,
wy_up FLOAT,
wy_down FLOAT,
num_up FLOAT,
num_down FLOAT);
"""
sql_lijst.append(sql7)
for z in sql_lijst:
cursor.execute(z)
connection.commit()
def dictionaries():
"""
Creates dictionaries for all data required for each table.
(With each cancer type there will be common genes, but also other (new)
genes not found in other cancer type data. To only add new genes to the
database and avoid errors, all unique (new) genes will be written to
'genes.txt' file. When a gene is not already in the 'genes.txt' file,
the gene will be added to the database and also to the 'genes.txt' file.)
:return:
cancer_dict: Dictionary for CANCER table
sample_dict: Dictionary for SAMPLE table
counts_dict: Dictionary for COUNTS table
genes_exp_dict: Dictionary for GENE_EXP table
new_genes_dict: Dictionary for GENES table
cluster_normal: Dictionary for CLUSTER_NORMAL table
cluster_tumor: Dictionary for CLUSTER_TUMOR table
"""
cancer_dict = {'LSCC': ['LSCC', 'Lung Squamous Cell Carcinoma']}
sample_dict = {}
genes_exp_dict = {}
genes_dict = {}
new_genes_dict = {}
counts_dict = {}
cluster_normal = {}
cluster_tumor = {}
dict = {}
dict2 = {}
count2 = 1
count3 = 1
file_genes1 = open('genes.txt', 'r')
for gene in file_genes1.readlines():
count = int(gene.strip().split('\t')[0])
genes_dict[gene.strip().split('\t')[1]] = gene.strip().split('\t')
file_genes1.close()
file_genes2 = open('genes.txt', 'a')
with open("/DATA/Jasmine/LSCC/cluster_tumor/TUMOR.txt") as file:
while True:
line = file.readline()
if not line:
break
if line.split('\t')[0].startswith('id'):
for column in line.split('\t'):
if not column.startswith('id') and len(column) == 2 and column.startswith('w'):
dict[str(line.split('\t').index(column))] = column.upper()
if not column.startswith('id') and not column.startswith('w'):
dict2[str(line.split('\t').index(column))] = column.rstrip('\n')
if column.rstrip('\n') not in genes_dict.keys():
count += 1
genes_dict[column.rstrip('\n')] = [count, column.rstrip('\n')]
file_genes2.write(str(count) + "\t" + column.rstrip('\n') +"\n")
new_genes_dict[column.rstrip('\n')] = [count, column.rstrip('\n')]
if not line.split('\t')[0].startswith('id'):
sample_dict[line.split('\t')[0]] = [line.split('\t')[0], 'LSCC', 0, 1]
for key, value in dict.items():
counts_dict[count2] = [dict[key], line.split('\t')[0], int(line.split('\t')[int(key)])]
count2 += 1
for key, value in dict2.items():
try:
genes_exp_dict[count3] = [genes_dict[value][0], line.split('\t')[0],
float(line.split('\t')[int(key)])]
except ValueError:
genes_exp_dict[count3] = [genes_dict[value][0], line.split('\t')[0], None]
count3 += 1
with open("/DATA/Jasmine/LSCC/cluster_normal/Normal.txt") as file:
while True:
line = file.readline()
if not line:
break
if not line.split('\t')[0].startswith('id'):
sample_dict[line.split('\t')[0]] = [line.split('\t')[0], 'LSCC', 1, 0]
for key, value in dict.items():
counts_dict[count2] = [dict[key], line.split('\t')[0], int(line.split('\t')[int(key)])]
count2 += 1
for key, value in dict2.items():
count3 += 1
try:
genes_exp_dict[count3] = [genes_dict[value][0], line.split('\t')[0],
float(line.split('\t')[int(key)])]
except ValueError:
genes_exp_dict[count3] = [genes_dict[value][0], line.split('\t')[0], None]
with open("/DATA/Jasmine/LSCC/cluster_tumor/GENES_WF_WY_expe_added_2.txt") as file:
while True:
line = file.readline()
if not line:
break
lijst = line.strip().split('\t')
if line.strip().split('\t')[1] == 'NA':
lijst[1] = None
if line.strip().split('\t')[2] == 'NA':
lijst[2] = None
if line.strip().split('\t')[3] == 'NA':
lijst[3] = None
if line.strip().split('\t')[4] == 'NA':
lijst[4] = None
if line.strip().split('\t')[5] == 'NA':
lijst[5] = None
if line.strip().split('\t')[6] == 'NA':
lijst[6] = None
try:
cluster_tumor[lijst[0]] = ['LSCC', genes_dict[lijst[0]][0],
lijst[1], lijst[2],
lijst[3], lijst[4],
lijst[5], lijst[6]]
except KeyError:
pass
with open("/DATA/Jasmine/LSCC/cluster_normal/GENES_WF_WY_expe_added.txt") as file:
while True:
line = file.readline()
if not line:
break
lijst = line.strip().split('\t')
if line.strip().split('\t')[1] == 'NA':
lijst[1] = None
if line.strip().split('\t')[2] == 'NA':
lijst[2] = None
if line.strip().split('\t')[3] == 'NA':
lijst[3] = None
if line.strip().split('\t')[4] == 'NA':
lijst[4] = None
if line.strip().split('\t')[5] == 'NA':
lijst[5] = None
if line.strip().split('\t')[6] == 'NA':
lijst[6] = None
try:
cluster_normal[lijst[0]] = ['LSCC', genes_dict[lijst[0]][0],
lijst[1], lijst[2],
lijst[3], lijst[4],
lijst[5], lijst[6]]
except KeyError:
pass
return cancer_dict, sample_dict, counts_dict, genes_exp_dict, \
new_genes_dict, cluster_normal, cluster_tumor
def wegschrijven(cursor, connection, cancer_dict, sample_dict, counts_dict,
genes_exp_dict, genes_dict, cluster_normal, cluster_tumor):
"""
Loops through dictionary and creates the insert commands for the
corresponding table. Data in the dictionary will also be inserted in this
loop with cursor.execute(sql, ...).
:param cursor: a variable to communicate with the database.
:param connection: a variable to communicate with the database.
:param cancer_dict: Dictionary with data to insert in CANCER table
:param sample_dict: Dictionary with data to insert in SAMPLE table
:param counts_dict: Dictionary with data to insert in COUNTS table
:param genes_exp_dict: Dictionary with data to insert in GENE_EXP table
:param genes_dict: Dictionary with data to insert in GENES table
:param cluster_normal: Dictionary with data to insert in CLUSTER_NORMAL
table
:param cluster_tumor: Dictionary with data to insert in CLUSTER_TUMOR
table
:return: N/A
"""
sql = """
INSERT INTO CANCER VALUES (%s, %s);
"""
for i in cancer_dict:
cursor.execute(sql, cancer_dict[i])
print(cancer_dict[i])
sql = """
INSERT INTO SAMPLE VALUES (%s, %s, %s, %s);
"""
for j in sample_dict:
cursor.execute(sql, sample_dict[j])
print(sample_dict[j])
sql = """
INSERT INTO GENES VALUES (%s, %s);
"""
for i in genes_dict:
cursor.execute(sql, genes_dict[i])
print(genes_dict[i])
sql = """
INSERT INTO GENE_EXP VALUES (%s, %s, %s);
"""
for k in genes_exp_dict:
cursor.execute(sql, genes_exp_dict[k])
print(genes_exp_dict[k])
sql = """
INSERT INTO COUNTS VALUES (%s, %s, %s)
"""
for i in counts_dict:
cursor.execute(sql, counts_dict[i])
print(counts_dict[i])
sql = """
INSERT INTO CLUSTER_NORMAL VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
"""
for j in cluster_normal:
cursor.execute(sql, cluster_normal[j])
print(cluster_normal[j])
sql = """
INSERT INTO CLUSTER_TUMOR VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
"""
for j in cluster_tumor:
cursor.execute(sql, cluster_tumor[j])
print(cluster_tumor[j])
print("Done")
connection.commit()
def main():
"""
Opens connection with the database. Function databasemaken(...) creates the
database. This function won't be called when the database already exists.
Function dictionaries()
:return: N/A
"""
connection = database.connect(
user="agami-admin",
password="XMCpw9F5",
host="localhost",
database="substitutions")
cursor = connection.cursor()
print("Connected!\n")
#databasemaken(cursor, connection)
cancer_dict, sample_dict, counts_dict, genes_exp_dict, genes_dict, \
cluster_normal, cluster_tumor = dictionaries()
wegschrijven(cursor, connection, cancer_dict, sample_dict, counts_dict,
genes_exp_dict, genes_dict,
cluster_normal, cluster_tumor)
connection.close()
main()