Commit

Merge pull request #46 from HDI-Project/2020.1.9-20200212
2020.1.9 20200212
csala authored Feb 21, 2020
2 parents 06604bb + 8ba30b5 commit c985da2
Showing 552 changed files with 6,729 additions and 929 deletions.
1 change: 1 addition & 0 deletions .dockerignore
@@ -8,3 +8,4 @@ static/
notebooks/
.tox/
.git/
primitives/
6 changes: 3 additions & 3 deletions .gitignore
@@ -105,8 +105,8 @@ ENV/
# vim
.*.swp

input*
input
output/
static/
notebooks/
templates.bak/
primitives/
static
15 changes: 4 additions & 11 deletions Dockerfile
@@ -1,5 +1,4 @@
FROM registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2019.11.10-20191127-050901
# FROM registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2019.11.10
FROM registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9-20200212-063959

ARG UID=1000
ARG D3MPORT=45042
@@ -10,10 +9,6 @@ EXPOSE $D3MPORT

RUN mkdir -p /user_dev

# RUN mkdir -p /user_dev && \
# mkdir -p /user_dev/output && \
# mkdir -p /user_dev/input && \
# mkdir -p /user_dev/static && \
RUN ln -s /output /user_dev/output && \
ln -s /input /user_dev/input && \
ln -s /static /user_dev/static
@@ -24,11 +19,9 @@ RUN pip3 install -r /user_dev/requirements.txt

# Copy code
COPY setup.py MANIFEST.in /user_dev/
COPY ta2 /user_dev/ta2
RUN chown -R $UID:$UID /user_dev
RUN pip3 install -e /user_dev ipdb

# Install project
RUN pip3 install /user_dev
RUN pip3 install ipdb
COPY ta2 /user_dev/ta2
# RUN chown -R $UID:$UID /user_dev

CMD ["python3", "/user_dev/ta2/ta3/server.py", "-v"]
4 changes: 2 additions & 2 deletions Makefile
@@ -231,8 +231,8 @@ build: ## build the mit-d3m-ta2 docker image

.PHONY: submit
submit: login build ## push to TA2 submission registry
docker tag mit-d3m-ta2:latest registry.datadrivendiscovery.org/ta2-submissions/ta2-mit/december2019
docker push registry.datadrivendiscovery.org/ta2-submissions/ta2-mit/december2019
docker tag mit-d3m-ta2:latest registry.datadrivendiscovery.org/ta2-submissions/ta2-mit/winter2020evaluation
docker push registry.datadrivendiscovery.org/ta2-submissions/ta2-mit/winter2020evaluation

.PHONY: submit-ci
submit-ci: login build ## push to TA2 submission registry
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1 +1,2 @@
-e git+https://gitlab.com/datadrivendiscovery/ta3ta2-api.git@0494c7088542c79c4d8eed0059d512bc08414e42#egg=ta3ta2-api
-e git+https://gitlab.com/datadrivendiscovery/ta3ta2-api.git@1214abaac7cccd3f578e9589509b279bd820a758#egg=ta3ta2-api
-e git+https://github.com/HDI-Project/BTB.git@31c6349932accd6b168ad2d00af6b4110e8c4a66#egg=baytune
2 changes: 1 addition & 1 deletion run_docker.sh
@@ -10,7 +10,6 @@ rm -r output
mkdir -p output
chown $USER output


function echodo() {
echo $*
$*
@@ -22,6 +21,7 @@ echodo docker run -i -t --rm \
-e D3MINPUTDIR=/input \
-e D3MOUTPUTDIR=/output \
-e D3MSTATICDIR=/static \
-v $(pwd)/ta2:/user_dev/ta2 \
-v $(pwd)/input:/input \
-v $(pwd)/output:/output \
-v $(pwd)/static:/static \
9 changes: 9 additions & 0 deletions scripts/evaluate_templates.sh
@@ -0,0 +1,9 @@
#!/bin/bash

docker build --build-arg UID=$UID -t mit-d3m-ta2 .

COMMANDS=${*:-/bin/bash}
DATASETS=/home/pythia/Projects/d3m/datasets/seed_datasets_current/

docker run -i -t --rm -v $DATASETS:/input -v $(pwd):/home/user -w /home/user -u $UID mit-d3m-ta2 \
python3 run_templates.py templates /input/LL1_terra_canopy_height_long_form_s4_90_MIN_METADATA
306 changes: 306 additions & 0 deletions scripts/generate_templates.py
@@ -0,0 +1,306 @@
import argparse
import glob
import json
import logging
import os
import sys
import traceback
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from d3m.metadata.pipeline import Pipeline
from d3m.metadata.problem import Problem
from d3m.utils import yaml_load_all

LOGGER = logging.getLogger(__name__)
TUNING_PARAMETER = 'https://metadata.datadrivendiscovery.org/types/TuningParameter'


def load_pipeline(pipeline):
with open(pipeline) as _pipeline:
if pipeline.endswith('.json'):
pipeline = Pipeline.from_json(_pipeline)
else:
pipeline = Pipeline.from_yaml(_pipeline)

return pipeline


def get_default_step_hyperparams(step):
default_tunable_hyperparams = {}
for name, hp in step.get_all_hyperparams().items():
if TUNING_PARAMETER not in hp.semantic_types:
continue

default_tunable_hyperparams[name] = hp.get_default()

return default_tunable_hyperparams


def clean_hyperparams(pipeline):
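    """Reset every tunable hyperparameter of every pipeline step to its default value."""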
for step in pipeline.steps:
default_tunable_hyperparams = get_default_step_hyperparams(step)

for name, value in step.hyperparams.items():
if name in default_tunable_hyperparams.keys():
value['data'] = default_tunable_hyperparams[name]

return pipeline


def pipeline_to_template(pipeline_path):
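    """Turn a stored pipeline into a template with default tunable hyperparameters, an empty id and a fixed creation date."""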
pipeline = load_pipeline(pipeline_path)
template = clean_hyperparams(pipeline)

template.id = ''
template.schema = 'https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json'
template.created = datetime(2016, 11, 11, 12, 30, tzinfo=timezone.utc)

return template


def write_template(templates_path, template):
template_id = template.get_digest()[:12]
template_path = os.path.join(templates_path, template_id + '.json')

with open(template_path, 'w') as template_file:
print("Creating template {}".format(template_path))
template.to_json(template_file)


def generate_templates(pipelines_path, templates_path):
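    """Convert every pipeline in the pipelines folder into a template and write it to the templates folder."""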
for pipeline in os.listdir(pipelines_path):
pipeline_path = os.path.join(pipelines_path, pipeline)
try:
template = pipeline_to_template(pipeline_path)
write_template(templates_path, template)
except Exception as ex:
print(ex)


def read_pipeline_run(pipeline_run_path):
    """Load all the YAML documents contained in a pipeline run file."""
    with open(pipeline_run_path) as data:
        return list(yaml_load_all(stream=data))


def load_problem(root_path, phase):
path = os.path.join(root_path, phase, 'problem_' + phase, 'problemDoc.json')
return Problem.load(problem_uri=path)


def detect_data_modality(dataset_doc):
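    """Infer the data modality (single_table, multi_table, graph or another resource type) from the dataset document."""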
with open(dataset_doc) as f:
dataset_doc = json.load(f)

resources = list()
for resource in dataset_doc['dataResources']:
resources.append(resource['resType'])

if len(resources) == 1:
return 'single_table'
else:
for resource in resources:
if resource == 'edgeList':
return 'graph'
elif resource not in ('table', 'raw'):
return resource

return 'multi_table'


def get_dataset_info(dataset_name, datasets_path):
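    """Load the TRAIN problem of a dataset and return its data modality, task type and task subtype."""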

dataset_root = os.path.join(datasets_path, dataset_name)

if not os.path.exists(dataset_root):
dataset_root += '_MIN_METADATA'

dataset_doc = os.path.join(dataset_root, 'TRAIN', 'dataset_TRAIN', 'datasetDoc.json')
dataset_root = 'file://' + os.path.abspath(dataset_root)
problem = load_problem(dataset_root, 'TRAIN')

# Dataset Meta
data_modality = detect_data_modality(dataset_doc)
task_type = problem['problem']['task_keywords'][0].name.lower()
task_subtype = problem['problem']['task_keywords'][1].name.lower()

return data_modality, task_type, task_subtype


def get_template_id(pipeline_id, pipelines_path, templates_path):
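    """Write the template that corresponds to a pipeline and return the first 12 characters of its digest."""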

pipeline_path = os.path.join(pipelines_path, '{}.json'.format(pipeline_id))
if not os.path.isfile(pipeline_path):
raise ValueError('Can not find: {}'.format(pipeline_path))

template = pipeline_to_template(pipeline_path)
write_template(templates_path, template)
return template.get_digest()[:12]


def produce_phase(pipeline_run):
"""Produce result with Produce phase data."""
scores = pipeline_run['run']['results']['scores']

if len(scores) > 1:
raise ValueError('This run has more than one score!')

scores = scores[0]

return {
'metric': scores['metric']['metric'],
'context': pipeline_run['context'],
'normalized_score': scores['normalized']
}


def extract_pipeline_run(pipeline_run, pipelines_path, templates_path, datasets_path):
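    """Extract dataset, template, modality and scoring information from a single pipeline run document."""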
dataset_id = pipeline_run['datasets'][0]['id']
phase = pipeline_run['run']['phase']
succeed = pipeline_run.get('status').get('state')
pipeline_id = pipeline_run['pipeline']['id']

if dataset_id.endswith('TRAIN'):
dataset_name = dataset_id.replace('_dataset_TRAIN', '')
else:
dataset_name = dataset_id.replace('_dataset_SCORE', '')

# TODO: Lazy Loader
data_modality, task_type, task_subtype = get_dataset_info(dataset_name, datasets_path)

template_id = get_template_id(pipeline_id, pipelines_path, templates_path)

result = {
'dataset': dataset_name,
'pipeline_id': pipeline_id,
'template_id': template_id,
'modality': data_modality,
'type': task_type,
'subtype': task_subtype,
'phase': phase,
'succeed': succeed,
}

if phase == 'PRODUCE' and succeed != 'FAILURE':
try:
score = produce_phase(pipeline_run)
result.update(score)
        except Exception:
# Timeout
result['phase'] = 'TIMEOUT'

return result, succeed


def extract_meta_information(pipeline_runs, pipelines_path, templates_path, datasets_path):
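    """Collect run metadata from every pipeline run file, keeping track of runs that errored or were discarded."""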
pipeline_runs_path = os.path.join(pipeline_runs, '*')

results = []
errored = []
discarded = []

for pipeline_run_path in glob.glob(pipeline_runs_path):
        pipeline_runs = read_pipeline_run(pipeline_run_path)

data_extracted = []

failed = False

for pipeline_run in pipeline_runs:
try:
run_data, run_status = extract_pipeline_run(
pipeline_run, pipelines_path, templates_path, datasets_path)

                failed = failed or run_status == 'FAILURE'

data_extracted.append(run_data)

            except Exception as e:
                LOGGER.warning('Failed %s with: %s', pipeline_run_path, e)
                errored.append(pipeline_run_path)
                continue

if not failed:
results.extend(data_extracted)

else:
LOGGER.warning('Pipeline run %s discarded.', pipeline_run_path)
discarded.append(data_extracted)

    return results, errored, discarded


def apply_mean_score(df):
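    """Add a mean_score column with the average normalized score per pipeline and context."""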
mean_score = df.groupby(['pipeline_id', 'context'])['normalized_score'].mean()
mean_score = mean_score.reset_index()
mean_score.rename(columns={'normalized_score': 'mean_score'}, inplace=True)
return df.merge(mean_score, on=['pipeline_id', 'context'], how='left')


def z_score(x):
if len(x) == 1 or x.std() == 0:
return pd.Series(np.zeros(len(x)), index=x.index)

return (x - x.mean()) / x.std()


def apply_z_score(df):
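    """Compute per-dataset z-scores of the normalized score, average them per template and merge them back as a z_score column."""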
z_scores = df.groupby('dataset').normalized_score.apply(z_score)
df['z_score'] = z_scores
templates_z_score = df.groupby('template_id').z_score.mean()
del df['z_score']

return df.merge(templates_z_score, how='left', left_on='template_id', right_index=True)


def generate_metadata_report(pipeline_runs, pipelines_path, templates_path, datasets_path, report):

    results, errored, discarded = extract_meta_information(
        pipeline_runs, pipelines_path, templates_path, datasets_path)

if report is None:
report = os.path.join(templates_path, 'templates.csv')

df = pd.DataFrame(results)
df = apply_mean_score(df)
df = apply_z_score(df)
df.to_csv(report, index=False)

if errored:
with open('errors.txt', 'w') as f:
for error in errored:
f.write('{}\n'.format(error))


def parse_args():
    parser = argparse.ArgumentParser(
        description='Generate new templates from pipeline runs and the metadata associated with them.')
parser.add_argument('pipeline_runs_path', help='Path to the pipeline runs folder')
parser.add_argument('pipelines_path', help='Path to the pipelines folder')
parser.add_argument('templates_path', help='Path to the templates folder')
parser.add_argument('datasets_path', help='Path where the datasets are located')
parser.add_argument('-r', '--report', help='Path to the CSV file where scores will be dumped.')

return parser.parse_args()


def main():
args = parse_args()
generate_metadata_report(
args.pipeline_runs_path,
        args.pipelines_path,
args.templates_path,
args.datasets_path,
args.report,
)


if __name__ == '__main__':
main()
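
A minimal invocation sketch (the folder names below are illustrative, not part of this commit), matching the positional arguments and the --report option defined in parse_args:

    python3 scripts/generate_templates.py pipeline_runs/ pipelines/ templates/ /path/to/datasets -r templates/report.csv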