Commit

Merge pull request #46 from HDI-Project/2020.1.9-20200212
2020.1.9 20200212
csala authored Feb 21, 2020
2 parents 06604bb + 8ba30b5 commit c985da2
Showing 552 changed files with 6,729 additions and 929 deletions.
1 change: 1 addition & 0 deletions .dockerignore
@@ -8,3 +8,4 @@ static/
notebooks/
.tox/
.git/
primitives/
6 changes: 3 additions & 3 deletions .gitignore
@@ -105,8 +105,8 @@ ENV/
# vim
.*.swp

input*
input
output/
static/
notebooks/
templates.bak/
primitives/
static
15 changes: 4 additions & 11 deletions Dockerfile
@@ -1,5 +1,4 @@
FROM registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2019.11.10-20191127-050901
# FROM registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2019.11.10
FROM registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9-20200212-063959

ARG UID=1000
ARG D3MPORT=45042
@@ -10,10 +9,6 @@ EXPOSE $D3MPORT

RUN mkdir -p /user_dev

# RUN mkdir -p /user_dev && \
# mkdir -p /user_dev/output && \
# mkdir -p /user_dev/input && \
# mkdir -p /user_dev/static && \
RUN ln -s /output /user_dev/output && \
ln -s /input /user_dev/input && \
ln -s /static /user_dev/static
@@ -24,11 +19,9 @@ RUN pip3 install -r /user_dev/requirements.txt

# Copy code
COPY setup.py MANIFEST.in /user_dev/
COPY ta2 /user_dev/ta2
RUN chown -R $UID:$UID /user_dev
RUN pip3 install -e /user_dev ipdb

# Install project
RUN pip3 install /user_dev
RUN pip3 install ipdb
COPY ta2 /user_dev/ta2
# RUN chown -R $UID:$UID /user_dev

CMD ["python3", "/user_dev/ta2/ta3/server.py", "-v"]
4 changes: 2 additions & 2 deletions Makefile
@@ -231,8 +231,8 @@ build: ## build the mit-d3m-ta2 docker image

.PHONY: submit
submit: login build ## push to TA2 submission registry
docker tag mit-d3m-ta2:latest registry.datadrivendiscovery.org/ta2-submissions/ta2-mit/december2019
docker push registry.datadrivendiscovery.org/ta2-submissions/ta2-mit/december2019
docker tag mit-d3m-ta2:latest registry.datadrivendiscovery.org/ta2-submissions/ta2-mit/winter2020evaluation
docker push registry.datadrivendiscovery.org/ta2-submissions/ta2-mit/winter2020evaluation

.PHONY: submit-ci
submit-ci: login build ## push to TA2 submission registry
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1 +1,2 @@
-e git+https://gitlab.com/datadrivendiscovery/ta3ta2-api.git@0494c7088542c79c4d8eed0059d512bc08414e42#egg=ta3ta2-api
-e git+https://gitlab.com/datadrivendiscovery/ta3ta2-api.git@1214abaac7cccd3f578e9589509b279bd820a758#egg=ta3ta2-api
-e git+https://github.com/HDI-Project/BTB.git@31c6349932accd6b168ad2d00af6b4110e8c4a66#egg=baytune
2 changes: 1 addition & 1 deletion run_docker.sh
@@ -10,7 +10,6 @@ rm -r output
mkdir -p output
chown $USER output


function echodo() {
echo $*
$*
@@ -22,6 +21,7 @@ echodo docker run -i -t --rm \
-e D3MINPUTDIR=/input \
-e D3MOUTPUTDIR=/output \
-e D3MSTATICDIR=/static \
-v $(pwd)/ta2:/user_dev/ta2 \
-v $(pwd)/input:/input \
-v $(pwd)/output:/output \
-v $(pwd)/static:/static \
9 changes: 9 additions & 0 deletions scripts/evaluate_templates.sh
@@ -0,0 +1,9 @@
#!/bin/bash

docker build --build-arg UID=$UID -t mit-d3m-ta2 .

COMMANDS=${*:-/bin/bash}
DATASETS=/home/pythia/Projects/d3m/datasets/seed_datasets_current/

docker run -i -t --rm -v $DATASETS:/input -v $(pwd):/home/user -w /home/user -u $UID mit-d3m-ta2 \
python3 run_templates.py templates /input/LL1_terra_canopy_height_long_form_s4_90_MIN_METADATA
306 changes: 306 additions & 0 deletions scripts/generate_templates.py
@@ -0,0 +1,306 @@
import argparse
import glob
import json
import logging
import os
import sys
import traceback
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from d3m.metadata.pipeline import Pipeline
from d3m.metadata.problem import Problem
from d3m.utils import yaml_load_all

LOGGER = logging.getLogger(__name__)
TUNING_PARAMETER = 'https://metadata.datadrivendiscovery.org/types/TuningParameter'


def load_pipeline(pipeline):
with open(pipeline) as _pipeline:
if pipeline.endswith('.json'):
pipeline = Pipeline.from_json(_pipeline)
else:
pipeline = Pipeline.from_yaml(_pipeline)

return pipeline


def get_default_step_hyperparams(step):
default_tunable_hyperparams = {}
for name, hp in step.get_all_hyperparams().items():
if TUNING_PARAMETER not in hp.semantic_types:
continue

default_tunable_hyperparams[name] = hp.get_default()

return default_tunable_hyperparams


def clean_hyperparams(pipeline):
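    """Reset every tunable hyperparameter of every pipeline step to its default value."""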
for step in pipeline.steps:
default_tunable_hyperparams = get_default_step_hyperparams(step)

for name, value in step.hyperparams.items():
if name in default_tunable_hyperparams.keys():
value['data'] = default_tunable_hyperparams[name]

return pipeline


def pipeline_to_template(pipeline_path):
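    """Turn a stored pipeline into a template with default tunable hyperparameters, an empty id and a fixed creation date."""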
pipeline = load_pipeline(pipeline_path)
template = clean_hyperparams(pipeline)

template.id = ''
template.schema = 'https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json'
template.created = datetime(2016, 11, 11, 12, 30, tzinfo=timezone.utc)

return template


def write_template(templates_path, template):
template_id = template.get_digest()[:12]
template_path = os.path.join(templates_path, template_id + '.json')

with open(template_path, 'w') as template_file:
print("Creating template {}".format(template_path))
template.to_json(template_file)


def generate_templates(pipelines_path, templates_path):
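    """Convert every pipeline in the pipelines folder into a template and write it to the templates folder."""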
for pipeline in os.listdir(pipelines_path):
pipeline_path = os.path.join(pipelines_path, pipeline)
try:
template = pipeline_to_template(pipeline_path)
write_template(templates_path, template)
except Exception as ex:
print(ex)


def read_pipeline_run(pipeline_run_path):
    """Load all the YAML documents contained in a pipeline run file."""
    with open(pipeline_run_path) as data:
        return list(yaml_load_all(stream=data))


def load_problem(root_path, phase):
path = os.path.join(root_path, phase, 'problem_' + phase, 'problemDoc.json')
return Problem.load(problem_uri=path)


def detect_data_modality(dataset_doc):
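    """Infer the data modality (single_table, multi_table, graph or another resource type) from the dataset document."""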
with open(dataset_doc) as f:
dataset_doc = json.load(f)

resources = list()
for resource in dataset_doc['dataResources']:
resources.append(resource['resType'])

if len(resources) == 1:
return 'single_table'
else:
for resource in resources:
if resource == 'edgeList':
return 'graph'
elif resource not in ('table', 'raw'):
return resource

return 'multi_table'


def get_dataset_info(dataset_name, datasets_path):
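    """Load the TRAIN problem of a dataset and return its data modality, task type and task subtype."""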

dataset_root = os.path.join(datasets_path, dataset_name)

if not os.path.exists(dataset_root):
dataset_root += '_MIN_METADATA'

dataset_doc = os.path.join(dataset_root, 'TRAIN', 'dataset_TRAIN', 'datasetDoc.json')
dataset_root = 'file://' + os.path.abspath(dataset_root)
problem = load_problem(dataset_root, 'TRAIN')

# Dataset Meta
data_modality = detect_data_modality(dataset_doc)
task_type = problem['problem']['task_keywords'][0].name.lower()
task_subtype = problem['problem']['task_keywords'][1].name.lower()

return data_modality, task_type, task_subtype


def get_template_id(pipeline_id, pipelines_path, templates_path):
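    """Write the template that corresponds to a pipeline and return the first 12 characters of its digest."""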

pipeline_path = os.path.join(pipelines_path, '{}.json'.format(pipeline_id))
if not os.path.isfile(pipeline_path):
raise ValueError('Can not find: {}'.format(pipeline_path))

template = pipeline_to_template(pipeline_path)
write_template(templates_path, template)
return template.get_digest()[:12]


def produce_phase(pipeline_run):
"""Produce result with Produce phase data."""
scores = pipeline_run['run']['results']['scores']

if len(scores) > 1:
raise ValueError('This run has more than one score!')

scores = scores[0]

return {
'metric': scores['metric']['metric'],
'context': pipeline_run['context'],
'normalized_score': scores['normalized']
}


def extract_pipeline_run(pipeline_run, pipelines_path, templates_path, datasets_path):
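    """Extract dataset, template, modality and scoring information from a single pipeline run document."""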
dataset_id = pipeline_run['datasets'][0]['id']
phase = pipeline_run['run']['phase']
succeed = pipeline_run.get('status').get('state')
pipeline_id = pipeline_run['pipeline']['id']

if dataset_id.endswith('TRAIN'):
dataset_name = dataset_id.replace('_dataset_TRAIN', '')
else:
dataset_name = dataset_id.replace('_dataset_SCORE', '')

# TODO: Lazy Loader
data_modality, task_type, task_subtype = get_dataset_info(dataset_name, datasets_path)

template_id = get_template_id(pipeline_id, pipelines_path, templates_path)

result = {
'dataset': dataset_name,
'pipeline_id': pipeline_id,
'template_id': template_id,
'modality': data_modality,
'type': task_type,
'subtype': task_subtype,
'phase': phase,
'succeed': succeed,
}

if phase == 'PRODUCE' and succeed != 'FAILURE':
try:
score = produce_phase(pipeline_run)
result.update(score)
        except Exception:
# Timeout
result['phase'] = 'TIMEOUT'

return result, succeed


def extract_meta_information(pipeline_runs, pipelines_path, templates_path, datasets_path):
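    """Collect run metadata from every pipeline run file, keeping track of runs that errored or were discarded."""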
pipeline_runs_path = os.path.join(pipeline_runs, '*')

results = []
errored = []
discarded = []

for pipeline_run_path in glob.glob(pipeline_runs_path):
        pipeline_runs = read_pipeline_run(pipeline_run_path)

data_extracted = []

failed = False

for pipeline_run in pipeline_runs:
try:
run_data, run_status = extract_pipeline_run(
pipeline_run, pipelines_path, templates_path, datasets_path)

                failed = failed or run_status == 'FAILURE'

data_extracted.append(run_data)

            except Exception as e:
                LOGGER.warning('Failed %s with: %s', pipeline_run_path, e)
                errored.append(pipeline_run_path)
                continue

if not failed:
results.extend(data_extracted)

else:
LOGGER.warning('Pipeline run %s discarded.', pipeline_run_path)
discarded.append(data_extracted)

    return results, errored, discarded


def apply_mean_score(df):
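    """Add a mean_score column with the average normalized score per pipeline and context."""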
mean_score = df.groupby(['pipeline_id', 'context'])['normalized_score'].mean()
mean_score = mean_score.reset_index()
mean_score.rename(columns={'normalized_score': 'mean_score'}, inplace=True)
return df.merge(mean_score, on=['pipeline_id', 'context'], how='left')


def z_score(x):
if len(x) == 1 or x.std() == 0:
return pd.Series(np.zeros(len(x)), index=x.index)

return (x - x.mean()) / x.std()


def apply_z_score(df):
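    """Compute per-dataset z-scores of the normalized score, average them per template and merge them back as a z_score column."""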
z_scores = df.groupby('dataset').normalized_score.apply(z_score)
df['z_score'] = z_scores
templates_z_score = df.groupby('template_id').z_score.mean()
del df['z_score']

return df.merge(templates_z_score, how='left', left_on='template_id', right_index=True)


def generate_metadata_report(pipeline_runs, pipelines_path, templates_path, datasets_path, report):

    results, errored, discarded = extract_meta_information(
        pipeline_runs, pipelines_path, templates_path, datasets_path)

if report is None:
report = os.path.join(templates_path, 'templates.csv')

df = pd.DataFrame(results)
df = apply_mean_score(df)
df = apply_z_score(df)
df.to_csv(report, index=False)

if errored:
with open('errors.txt', 'w') as f:
for error in errored:
f.write('{}\n'.format(error))


def parse_args():
    parser = argparse.ArgumentParser(
        description='Generate new templates from pipeline runs and the metadata associated with them.')
parser.add_argument('pipeline_runs_path', help='Path to the pipeline runs folder')
parser.add_argument('pipelines_path', help='Path to the pipelines folder')
parser.add_argument('templates_path', help='Path to the templates folder')
parser.add_argument('datasets_path', help='Path where the datasets are located')
parser.add_argument('-r', '--report', help='Path to the CSV file where scores will be dumped.')

return parser.parse_args()


def main():
args = parse_args()
generate_metadata_report(
args.pipeline_runs_path,
        args.pipelines_path,
args.templates_path,
args.datasets_path,
args.report,
)


if __name__ == '__main__':
main()
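
A minimal invocation sketch (the folder names below are illustrative, not part of this commit), matching the positional arguments and the --report option defined in parse_args:

    python3 scripts/generate_templates.py pipeline_runs/ pipelines/ templates/ /path/to/datasets -r templates/report.csv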