"""A class that parses SED data. It will either load in the masterlists (from a specified
directory) and pull pandas dataframes out of these, or it will create the dataframes by
scraping Vizier for data for an individual source."""
import pandas as pd
import numpy as np
from ast import literal_eval
from tqdm import tqdm
import astropy.units as u
import astropy.coordinates as coord
from astroquery.vizier import Vizier

# silence warning about pandas copies
pd.options.mode.chained_assignment = None


class SEDDataParser:
def __init__(
self,
use_local=True,
masterlist_filepath="./data/allsky_masterlist.csv",
peak_masterlist_filepath="./data/allsky_peak_masterlist.csv",
almacal_masterlist_filepath="./data/allsky_almacal_masterlist.csv",
survey_info_filepath="./data/included_surveys_final.csv",
):
self.survey_info = pd.read_csv(survey_info_filepath, header=0)
# TODO: check that survey_info does not contain any NaNs - these are bad!
if use_local:
################################################################################
# read in the master file
self.masterlist = pd.read_csv(masterlist_filepath, header=0)
self.peak_masterlist = pd.read_csv(peak_masterlist_filepath, header=0)
# plus the almacal masterlist!
self.almacal_masterlist = pd.read_csv(almacal_masterlist_filepath, header=0)
#####################################
# init tqdm for nice progress bars while parsing
print("initialising masterlist data, please be patient...")
# the survey_* columns store python lists serialised as strings (e.g. "[1.2, nan, 0.5]");
# parse them back into lists, mapping 'nan' -> None where fluxes/errors may be missing
literal_columns = [
    ("survey_fluxes", "fluxes", True),
    ("survey_freqs", "frequencies", False),
    ("survey_flux_errs", "flux errors", True),
    ("survey_bibcodes", "bibcodes", False),
    ("survey_names", "survey names", False),
    ("survey_vizier", "vizier locations", False),
]
for df, prefix in ((self.masterlist, ""), (self.peak_masterlist, "peak ")):
    for col, desc, may_contain_nan in literal_columns:
        tqdm.pandas(desc="parsing {}{}".format(prefix, desc))
        if may_contain_nan:
            df[col] = df[col].progress_apply(
                lambda x: literal_eval(x.replace("nan", "None"))
            )
        else:
            df[col] = df[col].progress_apply(literal_eval)
print("=== masterlists ready ===")
else:
print(
    "SEDDataParser initialised for scraping data from the web only. "
    "To use local data, re-initialise this class with use_local=True."
)
return
def retrieve_fluxdata_local(self, iau_name=None, racs_id=None, idx=None):
if idx is not None:
# extract from masterlist (keep a one-row DataFrame so .squeeze()/.values below work)
datarow = self.masterlist.loc[[idx], :]
racs_id = datarow["source_id"].iloc[0]
# same for peak data!
peak_datarow = self.peak_masterlist.loc[
self.peak_masterlist["source_id"] == racs_id, :
]
else:
# extract from masterlist
datarow = self.masterlist.loc[self.masterlist["source_id"] == racs_id, :]
# same for peak data!
peak_datarow = self.peak_masterlist.loc[
self.peak_masterlist["source_id"] == racs_id, :
]
# there should be exactly one matching peak-flux row; multiple matches only occur
# for complex sources, which are flagged elsewhere, so treat this as an error
if peak_datarow.shape[0] > 1:
    print(peak_datarow[["survey_fluxes", "survey_names"]])
    raise ValueError("there should only be 1 matching peak flux row!")
peak_datarow = peak_datarow.squeeze()
# if uncertainties are not the same length as fluxes, check for the CCA survey,
# which doesn't have errors on Vizier
if len(datarow["survey_fluxes"].squeeze()) != len(datarow["survey_flux_errs"].squeeze()):
    if "CCA" in datarow["survey_names"].squeeze():
        # CCA has no quoted errors, so insert an assumed 10% flux error
        first_surv_idx = datarow["survey_names"].squeeze().index("CCA")
        datarow["survey_flux_errs"].squeeze().insert(
            first_surv_idx, datarow["survey_fluxes"].squeeze()[first_surv_idx] * 0.1
        )
        # and the same for the second CCA flux
        datarow["survey_flux_errs"].squeeze().insert(
            first_surv_idx + 1,
            datarow["survey_fluxes"].squeeze()[first_surv_idx + 1] * 0.1,
        )
    else:
        print(datarow["survey_fluxes"], datarow["survey_flux_errs"])
        print(datarow["survey_names"])
        raise Exception("flux lengths do not match when building table!")
# now make into a flux table!
flux_data = pd.DataFrame(
    {
        "Frequency (Hz)": datarow["survey_freqs"].squeeze(),
        "Flux Density (Jy)": datarow["survey_fluxes"].squeeze(),
        "Uncertainty": datarow["survey_flux_errs"].squeeze(),
        "Refcode": datarow["survey_bibcodes"].squeeze(),
        "Survey quickname": datarow["survey_names"].squeeze(),
    }
)
# drop vlssr as it is peak flux!
if "vlssr" in flux_data["Survey quickname"].apply(lambda x: x.lower()).tolist():
# get index and drop because it's peak!
drop_idx = (
flux_data["Survey quickname"]
.apply(lambda x: x.lower())
.tolist()
.index("vlssr")
)
flux_data = flux_data.drop(index=drop_idx)
flux_data = flux_data.reset_index(drop=True)
# add racs info
append_flux_idx = flux_data.shape[0]
flux_data.loc[append_flux_idx, "Frequency (Hz)"] = 888e6
flux_data.loc[append_flux_idx, "Flux Density (Jy)"] = (
    datarow["total_flux_source"].values[0] / 1e3
)
flux_data.loc[append_flux_idx, "Uncertainty"] = (
    datarow["e_total_flux_source"].values[0] / 1e3
)
flux_data.loc[append_flux_idx, "Refcode"] = "2021PASA...38...58H"
flux_data.loc[append_flux_idx, "Survey quickname"] = "RACS"
# now add ALMA calibrator data if available!
almacal_pts = self.almacal_masterlist[
self.almacal_masterlist["source_id"] == racs_id
]
# if ALMA has multiple fluxes at the same frequency (to the nearest 5 GHz) that vary
# by more than 10 sigma, flag this source as variable and exclude its fluxes from fitting
alma_variable = -1
alma_vi = -1
if almacal_pts.shape[0] > 0:
alma_variable = self.is_alma_variable(almacal_pts)
alma_vi = self.get_alma_variability_index(almacal_pts)
if not alma_variable:
    # add the almacal points to flux_data
    alma_count = 0
    for alma_idx in almacal_pts.index.tolist():
        row_idx = append_flux_idx + 1 + alma_count
        flux_data.loc[row_idx, "Frequency (Hz)"] = (
            almacal_pts.loc[alma_idx, "Freq"] * 1e9
        )
        flux_data.loc[row_idx, "Flux Density (Jy)"] = almacal_pts.loc[alma_idx, "Flux"]
        flux_data.loc[row_idx, "Uncertainty"] = almacal_pts.loc[alma_idx, "e_Flux"]
        flux_data.loc[row_idx, "Refcode"] = "2019MNRAS.485.1188B"
        flux_data.loc[row_idx, "Survey quickname"] = "ALMACAL"
        alma_count += 1
# resort by frequency
flux_data = flux_data.sort_values(by="Frequency (Hz)")
# remove rows with missing flux densities
flux_data = flux_data[~pd.isnull(flux_data["Flux Density (Jy)"])]
# remove negative fluxes - shouldn't be necessary but sometimes GLEAM forced
# fluxes drop to negatives
flux_data = flux_data[flux_data["Flux Density (Jy)"] > 0]
if len(peak_datarow["survey_freqs"]) > 0:
    # now make into a peak flux table!
    peak_flux_data = pd.DataFrame(
        {
            "Frequency (Hz)": peak_datarow["survey_freqs"],
            "Flux Density (Jy)": peak_datarow["survey_fluxes"],
            "Uncertainty": peak_datarow["survey_flux_errs"],
            "Refcode": peak_datarow["survey_bibcodes"],
            "Survey quickname": peak_datarow["survey_names"],
        }
    )
else:
    peak_flux_data = pd.DataFrame(
        columns=[
            "Frequency (Hz)",
            "Flux Density (Jy)",
            "Uncertainty",
            "Refcode",
            "Survey quickname",
        ]
    )
if peak_flux_data.shape[0] > 0:
    # append racs info to the existing peak measurements
    append_peak_idx = peak_flux_data.shape[0]
    peak_flux_data.loc[append_peak_idx, "Frequency (Hz)"] = 888e6
    peak_flux_data.loc[append_peak_idx, "Flux Density (Jy)"] = (
        peak_datarow["peak_flux"] / 1e3
    )
    peak_flux_data.loc[append_peak_idx, "Uncertainty"] = (
        peak_datarow["e_peak_flux"] / 1e3
    )
    peak_flux_data.loc[append_peak_idx, "Refcode"] = "2021PASA...38...58H"
    peak_flux_data.loc[append_peak_idx, "Survey quickname"] = "RACS"
    # resort by frequency
    peak_flux_data = peak_flux_data.sort_values(by="Frequency (Hz)")
else:
    # table is empty, so racs provides the only peak measurement
    peak_flux_data.loc[0, "Frequency (Hz)"] = 888e6
    peak_flux_data.loc[0, "Flux Density (Jy)"] = peak_datarow["peak_flux"] / 1e3
    peak_flux_data.loc[0, "Uncertainty"] = peak_datarow["e_peak_flux"] / 1e3
    peak_flux_data.loc[0, "Refcode"] = "2021PASA...38...58H"
    peak_flux_data.loc[0, "Survey quickname"] = "RACS"
return flux_data, peak_flux_data, alma_variable, racs_id, alma_vi
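# a minimal usage sketch for the local path (assumptions: the default masterlist
# CSVs exist and contain the source; the source_id value is a hypothetical placeholder):
#   parser = SEDDataParser(use_local=True)
#   flux_data, peak_flux_data, alma_variable, racs_id, alma_vi = (
#       parser.retrieve_fluxdata_local(racs_id="RACS_0000-00A_1234")
#   )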
def query_vizier(self, cat, ra, dec, columns, radius):
# convert the search radius from arcsec to degrees
rad = radius / 3600
coords = coord.SkyCoord(ra=ra, dec=dec, unit=(u.deg, u.deg), frame="icrs")
if columns is None:
    result = Vizier.query_region(coords, radius=rad * u.deg, catalog=cat)
else:
    # NOTE: this currently requests all columns ("**") rather than the
    # subset passed in via 'columns'
    viz_instance = Vizier(columns=["**"])
    result = viz_instance.query_region(coords, radius=rad * u.deg, catalog=cat)
return result
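# example cone search (hypothetical target; the NVSS catalogue ID is shown
# purely for illustration):
#   result = parser.query_vizier(
#       cat="VIII/65/nvss", ra=187.2779, dec=2.0524, columns=None, radius=10
#   )
# note that astroquery's Vizier caps returned rows at Vizier.ROW_LIMIT (50 by
# default), which is ample for single-source cone searches like these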
def retrieve_fluxdata_remote(
self,
iau_name=None,
racs_id=None,
ra=None,
dec=None,
reliable_only=True,
use_island_radius=True,
):
"""
Essentially a wrapper for astroquery's Vizier.query_region() function.
Output is a pandas table of (integrated) flux data, peak flux data
(where this is provided in addition) and a flag for alma_variable sources.
Takes ra,dec coordinates as input
"""
photometry_table = pd.DataFrame(
columns=[
"Frequency (Hz)",
"Flux Density (Jy)",
"Uncertainty",
"Survey quickname",
"Refcode",
]
)
peak_phot_table = pd.DataFrame(
columns=[
"Frequency (Hz)",
"Flux Density (Jy)",
"Uncertainty",
"Survey quickname",
"Refcode",
]
)
# filter so that we only use the surveys we want
if reliable_only:
    rel_threshold = 1
else:
    rel_threshold = 99
surveys_used = self.survey_info[
    self.survey_info["reliability"] <= rel_threshold
]
# default almacal info, in case ALMACAL is filtered out of the survey list
alma_cat = None
# query each survey in turn
for idx in surveys_used.index.tolist():
radius = surveys_used.loc[idx, "match_radius_95"]
# SKIP IF ALMACAL
if surveys_used.loc[idx, "Name"] == "ALMACAL":
alma_cat = surveys_used.loc[idx, "Vizier name"]
alma_columns = ["Flux", "e_Flux", "Band"]
alma_radius = radius
alma_refcode = surveys_used.loc[idx, "bibcode"]
continue
# simple RACS info
if surveys_used.loc[idx, "Name"] == "RACS":
res = Vizier.query_constraints(
catalog=surveys_used.loc[idx, "Vizier name"],
ID="={}".format(racs_id),
)[0].to_pandas()
single_row = pd.DataFrame(
    {
        "Frequency (Hz)": [
            float(surveys_used.loc[idx, "frequencies (MHz)"])
        ],
        "Flux Density (Jy)": [res["Ftot"].values[0] / 1e3],
        "Uncertainty": [res["e_Ftot"].values[0] / 1e3],
        "Refcode": [surveys_used.loc[idx, "bibcode"]],
        "Survey quickname": [surveys_used.loc[idx, "Name"]],
    }
)
photometry_table = (single_row.copy() if photometry_table.empty
else pd.concat(
[photometry_table, single_row], ignore_index=True, axis=0)
)
peak_single_row = pd.DataFrame(
    {
        "Frequency (Hz)": [
            float(surveys_used.loc[idx, "frequencies (MHz)"])
        ],
        "Flux Density (Jy)": [res["Fpk"].values[0] / 1e3],
        "Uncertainty": [res["e_Fpk"].values[0] / 1e3],
        "Refcode": [surveys_used.loc[idx, "bibcode"]],
        "Survey quickname": [surveys_used.loc[idx, "Name"]],
    }
)
peak_phot_table = (peak_single_row.copy() if peak_phot_table.empty
else pd.concat([peak_phot_table, peak_single_row], ignore_index=True, axis=0)
)
continue
# get column names to request from the Vizier query
if ";" in surveys_used.loc[idx, "flux_columns"] and not pd.isnull(
    surveys_used.loc[idx, "e_flux_columns"]
):
    colnames = surveys_used.loc[idx, "flux_columns"].split(
        ";"
    ) + surveys_used.loc[idx, "e_flux_columns"].split(";")
elif not pd.isnull(surveys_used.loc[idx, "e_flux_columns"]):
    # single flux column plus a single error column
    colnames = [
        surveys_used.loc[idx, "flux_columns"],
        surveys_used.loc[idx, "e_flux_columns"],
    ]
else:
    colnames = surveys_used.loc[idx, "flux_columns"].split(";")
cat_table = self.query_vizier(
surveys_used.loc[idx, "Vizier name"],
ra,
dec,
columns=colnames,
radius=radius,
)
if len(cat_table) == 0:
    # no catalogues matched for this survey
    continue
# query_vizier returns a list of catalogues, but we only query one at a time
cat_table = cat_table[0]
# now append the photometry values if the returned table is non-empty
if len(cat_table) == 0:
    continue
else:
# now we add the new photometry values: we want to fill Uncertainty,
# Frequency, and Flux Density
if len(cat_table) > 1:
# if VLASS, check whether multiple entries are duplicates, and if they are pick the preferred one
if (surveys_used.loc[idx, "Name"] == "VLASS") and (
len(np.unique(cat_table["CompName"])) == 1
):
cat_table = cat_table[cat_table["DupFlag"] == 1]
else:
print(
"MORE THAN 1 ENTRY FOUND WITHIN 99% MATCH RADIUS FOR THIS OBJECT"
)
cat_table.pprint(max_lines=-1, max_width=-1)
cat_table = cat_table[
cat_table["_r"] == np.min(cat_table["_r"])
]
# otherwise extract the useful information from the prefilled survey table
cat_freqs = surveys_used.loc[idx, "frequencies (MHz)"].split(";")
cat_fluxes = surveys_used.loc[idx, "flux_columns"].split(";")
try:
cat_flux_errs = surveys_used.loc[idx, "e_flux_columns"].split(";")
except AttributeError:
cat_flux_errs = ""
# loop through all the frequencies in this survey, and extract data from the query
for i in range(len(cat_fluxes)):
# assign frequency - this allows us to have peak and integrated at the same frequency!
if len(cat_freqs) == 1:
current_freq = cat_freqs[0]
else:
current_freq = cat_freqs[i]
current_flux_col = cat_fluxes[i]
current_flux_val = cat_table[current_flux_col].data[0]
# if C1813_new then skip unless marked as new measurement!
if (
    surveys_used.loc[idx, "Name"] == "C1813_new"
    and cat_table["n_" + current_flux_col].data[0] != "*"
):
    continue
# if actual error column exists, get it
if cat_flux_errs != "":
current_err_col = cat_flux_errs[i]
current_flux_err = cat_table[current_err_col].data[0]
else:
    # no error column available; assume 15% flux errors
    current_flux_err = 0.15 * cat_table[current_flux_col].data[0]
# if VLASS, bump up the error! (no longer needed as of VLASS version 2, but that is not yet on Vizier)
if surveys_used.loc[idx, "Name"] == "VLASS":
current_err_col = cat_flux_errs[i]
current_flux_err = (
0.15 * cat_table[current_flux_col].data[0]
+ cat_table[current_err_col].data[0]
)
# account for cats where measurement is in mJy
if surveys_used.loc[idx, "flux units"] == "mJy":
current_flux_val = current_flux_val / 1000
current_flux_err = current_flux_err / 1000
# skip VLSSr (which is peak flux only) and masked (NaN) flux values from the query
if not surveys_used.loc[idx, "Name"] == "VLSSr" and not isinstance(
    current_flux_val, np.ma.core.MaskedConstant
):
    # append to the table to return
    single_row = pd.DataFrame(
        {
            "Frequency (Hz)": [current_freq],
            "Flux Density (Jy)": [current_flux_val],
            "Uncertainty": [current_flux_err],
            "Refcode": [surveys_used.loc[idx, "bibcode"]],
            "Survey quickname": [surveys_used.loc[idx, "Name"]],
        }
    )
photometry_table = pd.concat(
[photometry_table, single_row], ignore_index=True, axis=0
)
##############################################################################################
# now do the same for the peak flux if it exists
# get column names for peak fluxes
if not pd.isnull(surveys_used.loc[idx, "peak_fluxcol"]):
peak_colnames = surveys_used.loc[idx, "peak_fluxcol"].split(
";"
) + surveys_used.loc[idx, "peak_efluxcol"].split(";")
peak_cat_table = self.query_vizier(
surveys_used.loc[idx, "Vizier name"],
ra,
dec,
columns=peak_colnames,
radius=radius,
)
if len(peak_cat_table) == 0:
    continue
# because query_vizier returns a list of catalogues, but we are only querying 1 at a time
peak_cat_table = peak_cat_table[0]
# now append photometry val if it exists
if len(peak_cat_table) == 0:
continue
# if more than one match within the radius only append the closest
if len(peak_cat_table) > 1:
# if VLASS, check whether multiple entries are duplicates, and if they are pick the preferred one
if (surveys_used.loc[idx, "Name"] == "VLASS") and (
len(np.unique(peak_cat_table["CompName"])) == 1
):
peak_cat_table = peak_cat_table[
peak_cat_table["DupFlag"] == 1
]
else:
    # more than one match within the radius: keep only the closest
    peak_cat_table = peak_cat_table[
        peak_cat_table["_r"] == np.min(peak_cat_table["_r"])
    ]
# otherwise extract the useful information from the prefilled survey table
peak_cat_freqs = surveys_used.loc[
idx, "frequencies (MHz)"
].split(";")
peak_cat_fluxes = surveys_used.loc[idx, "peak_fluxcol"].split(
";"
)
peak_cat_flux_errs = surveys_used.loc[
idx, "peak_efluxcol"
].split(";")
# read and append
# loop through all the frequencies in this survey, and extract data from the query
for i in range(len(peak_cat_fluxes)):
# assign frequency - this allows us to have peak and integrated at the same frequency!
if len(peak_cat_freqs) == 1:
peak_current_freq = peak_cat_freqs[0]
else:
peak_current_freq = peak_cat_freqs[i]
peak_current_flux_col = peak_cat_fluxes[i]
peak_current_flux_val = peak_cat_table[
peak_current_flux_col
].data[0]
# if C1813_new then skip unless marked as a new measurement!
if (
    surveys_used.loc[idx, "Name"] == "C1813_new"
    and peak_cat_table["n_" + peak_current_flux_col].data[0] != "*"
):
    continue
# if an actual error column exists, use it
if peak_cat_flux_errs != "":
    peak_current_err_col = peak_cat_flux_errs[i]
    peak_current_flux_err = peak_cat_table[
        peak_current_err_col
    ].data[0]
else:
    # no error column available; assume 15% flux errors,
    # as for the integrated fluxes above
    peak_current_flux_err = (
        0.15 * peak_cat_table[peak_current_flux_col].data[0]
    )
# if VLASS, bump up the error!
if surveys_used.loc[idx, "Name"] == "VLASS":
    peak_current_flux_err = (
        0.15 * peak_cat_table[peak_current_flux_col].data[0]
        + peak_cat_table[peak_current_err_col].data[0]
    )
# fix mJy fluxes to be Jy
if surveys_used.loc[idx, "flux units"] == "mJy":
peak_current_flux_val = peak_current_flux_val / 1000
peak_current_flux_err = peak_current_flux_err / 1000
# check to remove masked fluxes
if not isinstance(peak_current_flux_val, np.ma.core.MaskedConstant):
peak_single_row = pd.DataFrame(
    {
        "Frequency (Hz)": [peak_current_freq],
        "Flux Density (Jy)": [peak_current_flux_val],
        "Uncertainty": [peak_current_flux_err],
        "Refcode": [surveys_used.loc[idx, "bibcode"]],
        "Survey quickname": [surveys_used.loc[idx, "Name"]],
    }
)
peak_phot_table = pd.concat(
[peak_phot_table, peak_single_row],
ignore_index=True,
axis=0,
)
# now add almacal data, if the catalogue was included in the survey list!
if alma_cat is not None:
    alma_fluxes = self.query_vizier(
        cat=alma_cat, ra=ra, dec=dec, columns=alma_columns, radius=alma_radius
    )
else:
    alma_fluxes = []
alma_variable = -1
alma_vi = -1
if len(alma_fluxes) > 0:
alma_fluxes = alma_fluxes[0].to_pandas()
# check if the source is variable
alma_variable = self.is_alma_variable(alma_fluxes)
alma_vi = self.get_alma_variability_index(alma_fluxes)
flux_idx = photometry_table.shape[0]
if alma_fluxes.shape[0] > 0 and not alma_variable:
for alma_idx in alma_fluxes.index.tolist():
    # ALMACAL frequencies are in GHz; convert to MHz to match the rest of
    # the table (everything is converted to Hz at the end)
    photometry_table.loc[flux_idx + alma_idx + 1, "Frequency (Hz)"] = (
        alma_fluxes.loc[alma_idx, "Freq"] * 1e3
    )
photometry_table.loc[
flux_idx + alma_idx + 1, "Flux Density (Jy)"
] = alma_fluxes.loc[alma_idx, "Flux"]
photometry_table.loc[
flux_idx + alma_idx + 1, "Uncertainty"
] = alma_fluxes.loc[alma_idx, "e_Flux"]
photometry_table.loc[
flux_idx + alma_idx + 1, "Survey quickname"
] = "ALMACAL"
photometry_table.loc[flux_idx + alma_idx + 1, "Refcode"] = alma_fluxes.loc[alma_idx, "_tab2_10"][0:4] + '/' + alma_refcode
# convert frequencies from MHz to Hz
photometry_table["Frequency (Hz)"] = (
    photometry_table["Frequency (Hz)"].astype(float) * 1e6
)
peak_phot_table["Frequency (Hz)"] = (
    peak_phot_table["Frequency (Hz)"].astype(float) * 1e6
)
# now sort the tables into ascending frequency order
photometry_table.sort_values(by="Frequency (Hz)", inplace=True)
peak_phot_table.sort_values(by="Frequency (Hz)", inplace=True)
# now remove any entries that do not have a frequency
photometry_table = photometry_table[~pd.isnull(photometry_table["Frequency (Hz)"])]
peak_phot_table = peak_phot_table[~pd.isnull(peak_phot_table["Frequency (Hz)"])]
# and remove any negative fluxes (the GLEAM forced photometry sometimes
# returns these in some bands)
photometry_table = photometry_table[photometry_table["Flux Density (Jy)"] > 0]
peak_phot_table = peak_phot_table[peak_phot_table["Flux Density (Jy)"] > 0]
return photometry_table, peak_phot_table, alma_variable, alma_vi
def is_alma_variable(self, almacal_data: pd.DataFrame):
"""Function to determine whether the source exhibits variability
in the ALMA band, based on flux density measurements from the ALMA
calibrator catalogue (Bonato+2019). This is methodologically simple,
we just check to see if there is variability at the 10 sigma level
over multiple epochs of ALMA observations in small bands"""
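# worked example (hypothetical numbers): fluxes of 0.80 Jy and 1.40 Jy measured at
# 91.2 GHz and 92.4 GHz both round into the 90 GHz bin; their 0.60 Jy peak-to-peak
# spread exceeds 10 x the maximum quoted error of 0.05 Jy, so the source is
# flagged as variable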
if almacal_data.shape[0] > 0:
# round so that we are looking in bins of 5 GHz
almacal_data["Freq_rounded"] = almacal_data["Freq"].apply(
lambda x: 5 * round(x / 5)
)
max_flux_diff = almacal_data.groupby("Freq_rounded")["Flux"].agg(np.ptp)
max_err = almacal_data.groupby("Freq_rounded")["e_Flux"].max()
if (max_flux_diff > 10 * max_err).any():
return True
return False
def get_alma_variability_index(self, almacal_data: pd.DataFrame):
"""Function to determine whether the source exhibits variability
in the ALMA band, based on flux density measurements from the ALMA
calibrator catalogue (Bonato+2019). This is methodologically simple,
we just check to see if there is variability at the 10 sigma level
over multiple epochs of ALMA observations in small bands"""
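# the variability index computed below (expressed in per cent) is, per band:
#   VI = sign(x) * (100 / mean(S)) * sqrt(|x|) / sqrt(N),
#   with x = sum_i (S_i - mean(S))^2 - sum_i sigma_i^2
# for the N fluxes S_i with uncertainties sigma_i in that band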
if almacal_data.shape[0] > 0:
# calculate the variability index for each band
vi_dict = dict()
for alma_band in almacal_data['Band'].unique():
if not almacal_data[almacal_data['Band'] == alma_band].shape[0] > 1:
continue
fluxes = almacal_data.loc[almacal_data['Band'] == alma_band, 'Flux']
uncertainties = almacal_data.loc[almacal_data['Band'] == alma_band, 'e_Flux']
mean_flux = np.mean(fluxes)
# follow Barvainis et al. (2005) and Sadler et al. (2006) and set the VI
# to be negative if the value inside the square root is negative
sqrt_term = np.sum((fluxes - mean_flux)**2) - np.sum(uncertainties**2)
var_index = (
    np.sign(sqrt_term)
    * (100 / mean_flux)
    * np.sqrt(np.abs(sqrt_term))
    / np.sqrt(fluxes.shape[0])
)
vi_dict[alma_band] = var_index
return vi_dict
return np.nan
def add_survey_radius(self, survey_viz):
"""Function to add info automatically to the list of included surveys. Currently
this only works for surveys hosted on Vizier, but could be easily extended to those
on CIRADA. Anything else is only for the adventurous!"""
# NOTE that you will need to add the other info manually - there's no nice way to scrape
# all of this from the catalogue info online.
return
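

if __name__ == "__main__":
    # a minimal end-to-end sketch of the remote path (the RACS ID and coordinates
    # below are hypothetical placeholders; the survey info CSV must exist locally)
    parser = SEDDataParser(use_local=False)
    flux_data, peak_flux_data, alma_variable, alma_vi = parser.retrieve_fluxdata_remote(
        racs_id="RACS_0000-00A_1234", ra=187.2779, dec=2.0524
    )
    print(flux_data)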