-
Notifications
You must be signed in to change notification settings - Fork 48
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
missing taxonomy python notebook to script #245
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
import argparse | ||
import tdtax | ||
import os | ||
import scope_phenom # https://github.com/bfhealy/scope-phenomenology | ||
bfhealy marked this conversation as resolved.
Show resolved
Hide resolved
|
||
from scope.utils import read_parquet | ||
import pandas as pd | ||
import numpy as np | ||
|
||
phenom = scope_phenom.taxonomy | ||
sitewide = tdtax.taxonomy | ||
|
||
# Recursive functions to perform a reverse lookup in taxonomy dictionaries | ||
|
||
def trace_path(dct, key): | ||
if dct == key: | ||
return [dct] | ||
elif isinstance(dct, dict): | ||
for k, v in dct.items(): | ||
p = trace_path(v, key) | ||
if p: | ||
return [k] + p | ||
elif isinstance(dct, list): | ||
lst = dct | ||
for i in range(len(lst)): | ||
p = trace_path(lst[i], key) | ||
if p: | ||
return [str(i)] + p | ||
|
||
def get_class_path(trail, dct, key): | ||
stepdown = dct[trail[0]][int(trail[1])] | ||
cls = stepdown['class'] | ||
if cls == key: | ||
return [cls] | ||
else: | ||
trail = trail[2:] | ||
p = get_class_path(trail, stepdown, key) | ||
return [cls] + p | ||
|
||
def missing_taxonomy(parquet, mapper): | ||
# Read in golden dataset (downloaded from Fritz), mapper | ||
parquet_path = os.path.join(os.path.dirname(__file__), parquet) | ||
mapper_path = os.path.join(os.path.dirname(__file__), mapper) | ||
output_path = os.path.join(os.path.dirname(__file__), "golden_missing_labels.csv") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be best to allow the user to customize the name of this output file using another argument. |
||
gold_new = read_parquet(parquet_path) | ||
golden_dataset_mapper = pd.read_json(mapper_path) | ||
gold_map = golden_dataset_mapper.copy() | ||
none_cols = gold_map.loc['fritz_label'] == 'None' | ||
gold_map = gold_map.drop(columns=none_cols.index[none_cols.values]) | ||
|
||
# Manipulate golden_dataset_mapper to flip keys and values | ||
gold_map = gold_map.transpose() | ||
gold_map.index.rename('trainingset_label', inplace=True) | ||
gold_map = gold_map.reset_index(drop=False).set_index('fritz_label') | ||
gold_dict = gold_map.transpose().to_dict() | ||
|
||
labels_gold = gold_new.set_index('obj_id')[gold_new.columns[1:54]] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The columns corresponding to the labels may not always be 1:54. Perhaps we could use the mapper's keys or the config file to generate a list of classifications? |
||
|
||
classes = labels_gold.columns.values.tolist() | ||
|
||
missing_df = pd.DataFrame({'obj_id':labels_gold.index, 'missing_descr':np.zeros(len(labels_gold),dtype=str)}).set_index('obj_id') | ||
|
||
values = [] | ||
missing_items = [] | ||
cnt = 0 | ||
|
||
for index, row in labels_gold.iterrows(): | ||
nonzero_vals = row[row>0].index.values | ||
for value in nonzero_vals: | ||
mapped_c = golden_dataset_mapper[value]['fritz_label'] | ||
try: | ||
trail = trace_path(sitewide, mapped_c) | ||
class_path = get_class_path(trail, sitewide, mapped_c) | ||
except TypeError: | ||
trail = trace_path(phenom, mapped_c) | ||
class_path = get_class_path(trail, phenom, mapped_c) | ||
for item in class_path: | ||
if (item != mapped_c) & (item in classes): | ||
mapped_item = gold_dict[item]['trainingset_label'] | ||
if labels_gold.loc[index, mapped_item] == 0: | ||
cnt += 1 | ||
print(cnt, index, value, 'missing', mapped_item, ';') | ||
values += [value] | ||
missing_items += [mapped_item] | ||
missing_df.loc[index, 'missing_descr'] = missing_df.loc[index, 'missing_descr'] + f"{value} missing {mapped_item};" | ||
missing_df.loc[index, 'missing_descr'] = missing_df.loc[index, 'missing_descr'][:-1] | ||
missing_df[missing_df['missing_descr']!=''].reset_index().to_csv(output_path,index=False) | ||
return None | ||
|
||
if __name__ == "__main__": | ||
|
||
parser = argparse.ArgumentParser() | ||
parser.add_argument("-parquet", type=str, help="path to parquet") | ||
parser.add_argument("-mapper", type=str, help="path to mapper") | ||
parser.add_argument( | ||
"-merge_features", | ||
type=bool, | ||
nargs='?', | ||
const=True, | ||
default=False, | ||
help="merge downloaded results with features from Kowalski", | ||
) | ||
parser.add_argument( | ||
"-features_catalog", | ||
type=str, | ||
default='ZTF_source_features_DR5', | ||
help="catalog of features on Kowalski", | ||
) | ||
Comment on lines
+94
to
+107
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These arguments are not used by the code above, so they can be removed. |
||
|
||
args = parser.parse_args() | ||
|
||
missing_taxonomy(args.parquet, args.mapper) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Above the first import, add this commented line:
#!/usr/bin/env python
This allows users to run the script using
./missing_taxonomy.py
in addition topython missing_taxonomy.py
.