Skip to content

Commit

Permalink
experiment post processing #79
Browse files Browse the repository at this point in the history
  • Loading branch information
dvitel committed Aug 17, 2022
1 parent 4820455 commit 2e30d60
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ testing/deca.json
testing/evo.json
data
figures
db.log
101 changes: 97 additions & 4 deletions evopie/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -820,19 +820,112 @@ def get_space_params(file_name):
f[space] = Series([v2 for v in g for v2 in v[1]])
f.astype('float')
# print(f'Debug {param} and {f}')
stats_frames = []
for param, frame in frames.items():
sorting_values = sorted(frame.columns, key = lambda c: (-frame[c].median(), *c[0], c[1], c[2]))
sorting_values = sorted(frame.columns, key = lambda c: (*c[0], c[1], c[2]))
frame = frame.reindex(sorting_values, axis=1)
frame.rename(columns = lambda c: f"{c[0]} {c[1]} {c[2]}".strip(), inplace=True)
stats = DataFrame({"mean": frame.mean(), "std": frame.std() })
# print(f"Frame\n{frame}\n-----------")
print(f"{param} stats:\n{stats}\n--------")
mn = frame.mean()
std = frame.std()
clr = ((0.3 + mn * 0.7) if param in ['dim_coverage', 'arr'] else (1 - mn * 0.7)).round(2).astype(str)
stats = DataFrame({param: "cellcolor[rgb]{" + clr + "," + clr + "," + clr + "}" + mn.round(2).astype(str) + " ± " + std.round(2).astype(str) })
stats_frames.append(stats)
# print(f"Frame\n{frame}\n-----------")
boxplot = frame.boxplot(figsize=(12, 7), column = list(frame.columns), fontsize='small')
boxplot.set_xticklabels(boxplot.get_xticklabels(), rotation=-60)
fig = boxplot.get_figure()
fig.set_tight_layout(True)
fig.savefig(os.path.join(figure_folder, f"{param}.png"), format='png')
plt.close(fig)
stats = pandas.concat(stats_frames, axis=1)
latex_table = stats.to_latex().replace("cellcolor[rgb]\\{", "\\cellcolor[rgb]{").replace("\\}", "}")
print(f"Stats:\n{latex_table}\n--------")

@quiz_cli.command("plot-metric-vs-num-of-dims")
@click.option("-p", "--param", required=True)
@click.option("-pt", "--param-title")
@click.option("--data-folder", required=True)
@click.option("--path-suffix", required=True)
@click.option("--figure-folder", required=True)
@click.option("--file-name-pattern")
@click.option("--fixate-range", is_flag=True)
def post_process(data_folder, path_suffix, figure_folder, param, param_title, file_name_pattern, fixate_range):
    """Plot the mean of one metric against the number of axes, one line per axis size.

    Every entry of ``data_folder`` is expected to be named ``<prefix>-<num_dims>``;
    the CSV files are read from ``<data_folder>/<entry>/<path_suffix>``. The axis
    size is parsed from each CSV file name (see ``get_axis_size`` below).

    The figure is saved as ``<figure_folder>/<param>.png`` and a LaTeX table with
    ``mean ± std`` per (number of axes, axis size) is printed to stdout.

    Args:
        data_folder: Folder containing one sub-folder per number of axes.
        path_suffix: Path, relative to each sub-folder, that holds the CSV files.
        figure_folder: Folder in which the PNG figure is written.
        param: Name of the CSV column to aggregate.
        param_title: Y-axis label of the plot (may be None).
        file_name_pattern: Regular expression applied with ``re.match`` to every
            file name; when omitted, every file is accepted.
        fixate_range: When set, the y-axis is fixed to ``[0, 1.05]``.

    Raises:
        click.ClickException: If no file contributed any data.
    """
    import re

    # An empty pattern matches every name, so an omitted --file-name-pattern
    # means "take all files" instead of crashing in re.compile(None).
    pattern = re.compile(file_name_pattern if file_name_pattern is not None else "")

    def get_axis_size(file_name):
        # After '-on-', the 2nd and 3rd dash-separated fields are the axes
        # description and the spanned description; only the first
        # underscore-separated integer of the axes description is used.
        [axes, _spanned] = file_name.split('-on-')[1].split('-')[1:3]
        return int(axes.split('_')[0])

    # axis size -> number of axes -> all samples of `param`
    frames_data = {}
    for dim_data_dir in os.listdir(data_folder):
        num_dims = int(dim_data_dir.split('-')[1])
        file_path = os.path.join(data_folder, dim_data_dir, path_suffix)
        for file in os.listdir(file_path):
            if not pattern.match(file):
                continue
            dim_size = get_axis_size(file)
            file_frame = pandas.read_csv(os.path.join(file_path, file))
            frames_data.setdefault(dim_size, {}).setdefault(num_dims, []).extend(file_frame[param].to_list())

    if not frames_data:
        raise click.ClickException(
            f"No files matching {file_name_pattern!r} found under {data_folder}/*/{path_suffix}")

    # Wrapping each list in a Series lets columns have different sample counts
    # (shorter ones are padded with NaN, which mean()/std() skip). Columns are
    # created in ascending order of the number of axes.
    frames = {
        dim_size: DataFrame({num_dims: Series(samples) for num_dims, samples in sorted(by_num_dims.items())})
        for dim_size, by_num_dims in frames_data.items()
    }

    markers = ["o", "s", "x"]
    frames_list = []
    legend_labels = []
    plot = None
    for dim_size_id, dim_size in enumerate(sorted(frames)):
        frame = frames[dim_size]
        mn = frame.mean()
        std = frame.std()
        # Gray level of the table cell; the direction of the scale depends on the
        # metric (presumably metrics lie in [0, 1] - TODO confirm).
        clr = ((0.3 + mn * 0.7) if param in ['dim_coverage', 'arr'] else (1 - mn * 0.7)).round(2).astype(str)
        frames_list.append(DataFrame({dim_size: "cellcolor[rgb]{" + clr + "," + clr + "," + clr + "}" + mn.round(2).astype(str) + " ± " + std.round(2).astype(str)}))
        legend_labels.append(f"Axis size {dim_size}")
        # Markers are reused cyclically when there are more axis sizes than markers.
        plot = mn.plot(xlabel="Number of DECA axes", ylabel=param_title,
                       marker=markers[dim_size_id % len(markers)], fontsize=16)

    if fixate_range:
        plot.set_ylim([0, 1.05])
    plot.legend(legend_labels, fontsize=16)
    plot.xaxis.label.set_fontsize(18)
    plot.yaxis.label.set_fontsize(18)
    fig = plot.get_figure()
    fig.set_tight_layout(True)
    fig.savefig(os.path.join(figure_folder, f"{param}.png"), format='png')
    plt.close(fig)

    stat_frame = pandas.concat(frames_list, axis=1)
    # escape=True is requested explicitly: the replacements below rely on braces
    # being escaped by to_latex, and the default of `escape` differs between
    # pandas versions.
    latex_table = stat_frame.to_latex(escape=True).replace("cellcolor[rgb]\\{", "\\cellcolor[rgb]{").replace("\\}", "}")
    print(f"Frame:\n{latex_table}\n")

APP.cli.add_command(quiz_cli)
APP.cli.add_command(student_cli)
Expand Down
2 changes: 1 addition & 1 deletion evopie/deca.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ def redundancy(space, population_distractors):
'''
population_spanned_distractors = [did for _, point in space['spanned'].items() for _, did in point['dids'] if did in population_distractors]
pop_r = len(population_spanned_distractors) / len(population_distractors)
deca_r = len(space['spanned']) / (len(space['spanned']) + sum([len(points) for points in space["axes"].values()]) + len(space["zero"]["dids"]))
deca_r = len(space['spanned']) / (len(space['spanned']) + sum([len(points) for points in space["axes"].values()]))
return {"population_redundancy":pop_r, "deca_redundancy":deca_r, 'num_spanned': len(space['spanned']) }

# redundancy(space, [5,6,14])
Expand Down
69 changes: 68 additions & 1 deletion experiment.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,17 @@ set -e
pipenv shell
flask DB-reboot
flask quiz init -nq 4 -nd 25 #search space size

flask quiz init -nq 2 -nd 10
# TODO - estimate search space size
# TODO - estimate sampled number of individuals

# flask student init -ns 100

flask student init -ns 20

# flask student knows -kr -ef 's{}@usf.edu' -k '{"sid":{"range":[1,10]},"qid":1,"did":4,"step":1,"chance":1}' -k '{"sid":{"range":[1,20]},"qid":1,"did":8,"step":1,"chance":1}' -k '{"sid":{"range":[11,20]},"qid":2,"did":12,"step":1,"chance":1}' -k '{"sid":{"range":[1,20]},"qid":2,"did":16,"step":1,"chance":1}'

# flask deca init -q 1 -o deca-spaces -a 2 -a 3 --spanned 1 --best-students-percent 0.1 --spanned-geometric-p 0.8 --noninfo 0.1 -n 1
flask deca init-many -ns 100 -nq 4 -nd 25 \
-an 3 -an 7 -an 10 \
Expand Down Expand Up @@ -42,4 +48,65 @@ flask quiz post-process --result-folder data/data-8/results --figure-folder figu
--group-by-space

flask quiz post-process --result-folder data/data-2/results --figure-folder figures --file-name-pattern '.*_0-\d+.csv' \
-p population_duplication --group-by-space
-p population_duplication --group-by-space


# --- Box plots and stats tables: spaces without spanned points (file names '*_0-<n>.csv') ---
flask quiz post-process --result-folder data/data-2022-08-08/data-2/results --figure-folder figures --file-name-pattern '.*_0-\d+.csv' \
    -p dim_coverage -p dim_coverage_with_spanned -p arr -p arr_with_spanned -p population_redundancy -p deca_redundancy -p num_spanned \
    -p deca_duplication -p population_duplication -p noninfo

flask quiz post-process --result-folder data/data-2022-08-08/data-2/results --figure-folder figures --file-name-pattern '.*_0-\d+.csv' \
    -p dim_coverage -p arr -p population_duplication -p noninfo

flask quiz post-process --result-folder data/data-2022-08-08/data-10/results --figure-folder figures --file-name-pattern '.*_0-\d+.csv' \
    -p dim_coverage -p arr -p population_duplication -p noninfo

# --- Metric vs. number of axes, file names '*-s_0-<n>.csv' ---
# NOTE: plot-metric-vs-num-of-dims takes a single -p; when -p is repeated only
# the last value is used, so exactly one -p is passed per call.
flask quiz plot-metric-vs-num-of-dims --data-folder data/data-2022-08-08 --path-suffix results --figure-folder figures --file-name-pattern '.*-s_0-\d+.csv' \
    -p dim_coverage -pt 'Dimension coverage'

flask quiz plot-metric-vs-num-of-dims --data-folder data/data-2022-08-08 --path-suffix results --figure-folder figures --file-name-pattern '.*-s_0-\d+.csv' \
    -p arr -pt 'Average rank of representatives'

flask quiz plot-metric-vs-num-of-dims --data-folder data/data-2022-08-08 --path-suffix results --figure-folder figures --file-name-pattern '.*-s_0-\d+.csv' \
    -p noninfo -pt 'Non-informativeness'

# --- Box plots and stats tables, file names '*_2-<n>.csv' and '*_20-<n>.csv' ---
flask quiz post-process --result-folder data/data-2022-08-08/data-5/results --figure-folder figures --file-name-pattern '.*_2-\d+.csv' \
    -p dim_coverage -p arr -p population_redundancy -p population_duplication -p noninfo

flask quiz post-process --result-folder data/data-2022-08-08/data-10/results --figure-folder figures --file-name-pattern '.*_2-\d+.csv' \
    -p dim_coverage -p arr -p population_redundancy -p population_duplication -p noninfo

flask quiz post-process --result-folder data/data-2022-08-08/data-2/results --figure-folder figures --file-name-pattern '.*_20-\d+.csv' \
    -p dim_coverage -p arr -p population_redundancy -p population_duplication -p noninfo


# --- Metric vs. number of axes, file names '*-s_2-<n>.csv' ---
flask quiz plot-metric-vs-num-of-dims --data-folder data/data-2022-08-08 --path-suffix results --figure-folder figures --file-name-pattern '.*-s_2-\d+.csv' \
    -p dim_coverage -pt 'Dimension coverage'

flask quiz plot-metric-vs-num-of-dims --data-folder data/data-2022-08-08 --path-suffix results --figure-folder figures --file-name-pattern '.*-s_2-\d+.csv' \
    -p arr -pt 'Average rank of representatives'

flask quiz plot-metric-vs-num-of-dims --data-folder data/data-2022-08-08 --path-suffix results --figure-folder figures --file-name-pattern '.*-s_2-\d+.csv' \
    -p population_redundancy -pt 'Redundancy' --fixate-range

flask quiz plot-metric-vs-num-of-dims --data-folder data/data-2022-08-08 --path-suffix results --figure-folder figures --file-name-pattern '.*-s_2-\d+.csv' \
    -p population_duplication -pt 'Duplication' --fixate-range

flask quiz plot-metric-vs-num-of-dims --data-folder data/data-2022-08-08 --path-suffix results --figure-folder figures --file-name-pattern '.*-s_2-\d+.csv' \
    -p noninfo -pt 'Non-informativeness' --fixate-range


# --- Metric vs. number of axes, file names '*-s_20-<n>.csv' ---
flask quiz plot-metric-vs-num-of-dims --data-folder data/data-2022-08-08 --path-suffix results --figure-folder figures --file-name-pattern '.*-s_20-\d+.csv' \
    -p dim_coverage -pt 'Dimension coverage'

flask quiz plot-metric-vs-num-of-dims --data-folder data/data-2022-08-08 --path-suffix results --figure-folder figures --file-name-pattern '.*-s_20-\d+.csv' \
    -p arr -pt 'Average rank of representatives' --fixate-range

flask quiz plot-metric-vs-num-of-dims --data-folder data/data-2022-08-08 --path-suffix results --figure-folder figures --file-name-pattern '.*-s_20-\d+.csv' \
    -p population_redundancy -pt 'Redundancy' --fixate-range

flask quiz plot-metric-vs-num-of-dims --data-folder data/data-2022-08-08 --path-suffix results --figure-folder figures --file-name-pattern '.*-s_20-\d+.csv' \
    -p population_duplication -pt 'Duplication' --fixate-range

flask quiz plot-metric-vs-num-of-dims --data-folder data/data-2022-08-08 --path-suffix results --figure-folder figures --file-name-pattern '.*-s_20-\d+.csv' \
    -p noninfo -pt 'Non-informativeness' --fixate-range

0 comments on commit 2e30d60

Please sign in to comment.