-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpymfe-runner.py
executable file
·85 lines (65 loc) · 2.88 KB
/
pymfe-runner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python3.10
"""Creates a CSV file that can be used to train a safeguard system predictor from a given slurm_file."""
import argparse
import pandas
from pymfe.mfe import MFE
from sklearn.datasets import fetch_openml
from linear_ensemble_basis import time_limit
from read_out_txt import OpenMLDatasetResult
def fetch_dataset(openml_dataset):
fetched_dataset = fetch_openml(name=openml_dataset, as_frame=True, parser='auto')
combined_df = fetched_dataset.frame
return fetched_dataset, openml_dataset, combined_df
def run(dataset, groups):
try:
fetched_dataset, openml_dataset, combined_df = fetch_dataset(dataset.openml_dataset)
print(f"{openml_dataset=}")
# get accuracy diff and if >=0 save as plus, else minus
pos_or_neg = "minus"
if dataset.get_accuracy()[1] - dataset.get_accuracy()[0] >= 0:
pos_or_neg = "plus"
class_name = fetched_dataset.target_names[0]
# mfe requires list or numpy array
X = combined_df.drop(columns=[class_name]).to_numpy()
y = combined_df[class_name].to_numpy()
mfe = MFE(groups=groups)
# sometimes fit takes too long to run
with time_limit(10 * 60): # 10 minutes, same as Autogluon run
mfe.fit(X, y)
ft = mfe.extract(out_type=pandas.DataFrame)
# NOTE: move this?
ft["class"] = pos_or_neg
return ft
# IndexError e.g. for sylva_agnostic
except (ValueError, RecursionError, IndexError, TimeoutError) as e:
print(f"ignoring {dataset.openml_dataset}, reason: {e}")
return pandas.DataFrame()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', type=str, help='slurm_output file name',
default="../auto-gluon/slurm_output/ag8tpymf.4022621.i23r06c04s12.out.txt")
parser.add_argument("-j", '--json', type=str, help='json input file name', default=None)
parser.add_argument('-o', '--output', type=str, help='csv output file name', required=True)
args = parser.parse_args()
groups = ["clustering", "model-based"]
print(f"\n\n{groups=}\n\n")
if args.json is None:
file_name = args.input
else:
file_name = args.json
output: list[OpenMLDatasetResult] = OpenMLDatasetResult.read_json_or_txt(file_name)
total_todo = len(output)
current = 0
ft = pandas.DataFrame()
for item in output:
# see https://stackoverflow.com/a/75956237
ft = pandas.concat([ft, run(item, groups)], ignore_index=True)
current += 1
print(f"{current} of {total_todo}")
# from https://saturncloud.io/blog/how-to-remove-index-column-while-saving-csv-in-pandas/
ft_csv = ft.to_csv(index=False)
# print(ft_csv)
file_name_csv = args.output
with open(file_name_csv, "w") as f:
f.write(ft_csv)
print(f"wrote to {file_name_csv}")