-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfeaturize_custom.py
270 lines (180 loc) · 9.58 KB
/
featurize_custom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
"""
Module to create chemical and symmetry-based features using custom functions
Author: Son Gyo Jung
Email: sgj13@cam.ac.uk
"""
import os
import pandas as pd
import numpy as np
import chemparse
import requests
import joblib
import pathlib
from pymatgen import MPRester
from pymatgen.core import Composition, Element
from lxml import html
class use_custom_descriptors():
    """
    Class to generate custom chemical and symmetry-based features.

    On construction the chemical data csv found at
    '<cwd>/retrieved_data/<name_of_parent_folder>/<name_of_parent_folder>.csv'
    is loaded and left-joined with the space-group reference table
    ('space_group_ref.pkl', read from the current working directory) on
    'spacegroup.number' == 'sg_no'.

    args:
        (1) name_of_parent_folder (type:str) - must match name of the chemical data file generated by the 'retrieve_data' module
        (2) csv (type:bool) - whether to save data as csv (in addition to pkl)

    return:
        (1) pandas.Dataframe of custom features (pkl and/or csv)
    """

    # Element symbols in order of atomic number (Z = 1..118); one '_af' and
    # one '_wf' feature column is generated per symbol, in this order.
    _ELEMENT_SYMBOLS = (
        'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar',
        'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br',
        'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te',
        'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm',
        'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn',
        'Fr', 'Ra', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr',
        'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl', 'Mc', 'Lv', 'Ts', 'Og',
    )

    # Category names passed to Composition.contains_element_type; each one
    # becomes a column of the same name.
    _ELEMENT_CATEGORIES = (
        'noble_gas', 'transition_metal', 'post_transition_metal', 'rare_earth_metal', 'metal', 'metalloid',
        'alkali', 'alkaline', 'halogen', 'chalcogen', 'lanthanoid', 'actinoid', 'quadrupolar', 's-block', 'p-block',
        'd-block', 'f-block',
    )

    def __init__(self, name_of_parent_folder, csv):
        self.name_of_parent_folder = name_of_parent_folder
        self.csv = csv

        self.cur_dir = pathlib.Path().resolve()
        self.directory = os.path.join(self.cur_dir, 'retrieved_data', self.name_of_parent_folder)

        # Import chemical data
        self.df_chem = pd.read_csv(os.path.join(self.directory, str(self.name_of_parent_folder) + '.csv'))

        # Import space group ref data (resolved against the current working directory).
        # NOTE: joblib.load unpickles the file; only use it on trusted files.
        self.df_sg = joblib.load('space_group_ref.pkl')

        # Drop unwanted columns
        self.df_sg = self.df_sg.drop(columns=['Unnamed: 0', 'full_name', 'x', 'y', 'z'])

        # Join the two dataframes using the space group number
        self.df = pd.merge(
            self.df_chem,
            self.df_sg,
            how='left',
            left_on=['spacegroup.number'],
            right_on=['sg_no'],
        )

    def movecol(self, dataframe, cols_to_move=None, ref_col='', place='after'):
        """
        Function to rearrange columns

        arg:
            (a) dataframe (pandas.Dataframe) - dataframe whose columns are rearranged
            (b) cols_to_move (list) - list of columns to move (None is treated as an empty list)
            (c) ref_col (type:str) - reference column
            (d) place (type:str) - whether to move the specified columns 'before' or 'after' the reference column (set to 'after' as default)

        return:
            (a) pandas.Dataframe with the reordered columns (the input is not modified)

        raises:
            ValueError - if ref_col is not a column of dataframe, or place is neither 'after' nor 'before'
        """
        cols = dataframe.columns.tolist()
        moved = [] if cols_to_move is None else list(cols_to_move)
        ref_position = cols.index(ref_col)

        if place == 'after':
            head = cols[:ref_position + 1]
            middle = moved
        elif place == 'before':
            head = cols[:ref_position]
            middle = moved + [ref_col]
        else:
            raise ValueError("place must be 'after' or 'before', got {!r}".format(place))

        # Columns being moved must not also appear in the leading section
        head = [col for col in head if col not in middle]
        tail = [col for col in cols if col not in head and col not in middle]

        return dataframe[head + middle + tail]

    def featurize(self):
        """
        Create custom features and save them to disk.

        The resulting dataframe (stored as self.df4) holds, in order:
            (a) every column of the merged chemical / space-group data
                (the symmetry-based features come from that merge), with a
                'composition' column (list of element symbols parsed from
                'pretty_formula') placed right after 'task_id',
            (b) atomic fraction of each element ('<symbol>_af'),
            (c) weight fraction of each element ('<symbol>_wf'),
            (d) 'weight', 'total_e' (total electrons) and 'avg_electroneg',
            (e) one column per element category: noble_gas, transition_metal,
                post_transition_metal, rare_earth_metal, metal, metalloid,
                alkali, alkaline, halogen, chalcogen, lanthanoid, actinoid,
                quadrupolar, s-block, p-block, d-block, f-block.

        The dataframe is always saved as
        'custom_features_<name_of_parent_folder>.pkl' in self.directory, and
        additionally as csv when self.csv is truthy.
        """
        # Create composition column: element symbols present in each formula
        self.df['composition'] = self.df['pretty_formula'].apply(
            lambda formula: list(chemparse.parse_formula(formula).keys())
        )

        # Move column
        self.df = self.movecol(
            dataframe=self.df,
            cols_to_move=['composition'],
            ref_col='task_id',
            place='after',
        )

        # Parse every formula once and reuse it for all features below
        # (https://pymatgen.org/pymatgen.core.composition.html)
        formulas = [str(formula) for formula in self.df['pretty_formula']]
        compositions = [Composition(formula) for formula in formulas]
        rows = list(zip(formulas, compositions))
        index = self.df.index

        # Generate atomic and weight fraction features. The substring test is a
        # cheap pre-filter only: it can match elements that are absent (e.g.
        # 'H' in 'He'), in which case pymatgen itself reports a zero fraction.
        atomic_fraction = {}
        wt_fraction = {}
        for symbol in self._ELEMENT_SYMBOLS:
            element = Element(symbol)
            atomic_fraction[symbol + '_af'] = [
                comp.get_atomic_fraction(element) if symbol in formula else 0
                for formula, comp in rows
            ]
            wt_fraction[symbol + '_wf'] = [
                comp.get_wt_fraction(element) if symbol in formula else 0
                for formula, comp in rows
            ]

        df_atomic_fraction = pd.DataFrame(atomic_fraction, index=index)
        df_wt_fraction = pd.DataFrame(wt_fraction, index=index)

        # Generate total molecular weight, total electrons and average
        # electronegativity of each composition
        summary = {
            'weight': [comp.weight for comp in compositions],
            'total_e': [comp.total_electrons for comp in compositions],
            'avg_electroneg': [comp.average_electroneg for comp in compositions],
        }

        # Check if Composition contains any elements matching a given category
        for category in self._ELEMENT_CATEGORIES:
            summary[category] = [comp.contains_element_type(category) for comp in compositions]

        df_summary = pd.DataFrame(summary, index=index)

        # Concatenate the original data with all generated features
        self.df4 = pd.concat(
            [self.df, df_atomic_fraction, df_wt_fraction, df_summary],
            axis=1,
            sort=False,
        )

        # Save data as pkl (always) and as csv (optional)
        file_stem = 'custom_features_' + str(self.name_of_parent_folder)

        joblib.dump(self.df4, os.path.join(self.directory, file_stem + '.pkl'))
        print('Successfully saved data as: ', file_stem + '.pkl')

        if self.csv:
            self.df4.to_csv(os.path.join(self.directory, file_stem + '.csv'))
            print('Successfully saved data as: ', file_stem + '.csv')

    def join(self):
        """
        Join custom features with CFID features

        The CFID features are read from
        'CFID_features_<name_of_parent_folder>.pkl' in self.directory, falling
        back to the csv file of the same name when the pickle cannot be
        loaded. They are left-joined with self.df4 on 'task_id', and the
        result is saved as 'cfid_and_custom_features_<name_of_parent_folder>'
        (pkl always, csv when self.csv is truthy).

        raises:
            AttributeError - if featurize() has not been called yet (self.df4 is missing)
            FileNotFoundError - if neither the pkl nor the csv file of CFID features exists
        """
        # Fail early, before any file I/O, if the custom features are missing
        if not hasattr(self, 'df4'):
            raise AttributeError(
                "custom features are not available: call featurize() before join()"
            )

        # Import CFID features
        cfid_stem = 'CFID_features_' + str(self.name_of_parent_folder)

        try:
            # NOTE: joblib.load unpickles the file; only use it on trusted files.
            df_cfid = joblib.load(os.path.join(self.directory, cfid_stem + '.pkl'))
        except Exception:
            # Any failure to load the pickle falls back to the csv copy
            print('Pickle of CFID features does not exist')
            print('Checking for csv file')

            try:
                df_cfid = pd.read_csv(os.path.join(self.directory, cfid_stem + '.csv'))
            except FileNotFoundError:
                print('CFID feature file does not exist')
                raise

        # Join the two featurised dataframes
        df_joined = pd.merge(df_cfid, self.df4, how='left', left_on=['task_id'], right_on=['task_id'])

        df_joined = self.movecol(
            dataframe=df_joined,
            cols_to_move=['composition', 'pretty_formula'],
            ref_col='task_id',
            place='after',
        )

        # Save data as pkl (always) and as csv (optional)
        joined_stem = 'cfid_and_custom_features_' + str(self.name_of_parent_folder)

        joblib.dump(df_joined, os.path.join(self.directory, joined_stem + '.pkl'))
        print('Data saved as: "' + joined_stem + '.pkl"')

        if self.csv:
            df_joined.to_csv(os.path.join(self.directory, joined_stem + '.csv'))
            print('Data saved as: "' + joined_stem + '.csv"')