Skip to content

Commit

Permalink
qc: add report module
Browse files Browse the repository at this point in the history
  • Loading branch information
abhidg committed Oct 16, 2023
1 parent a129b0a commit a189345
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 20 deletions.
14 changes: 12 additions & 2 deletions adtl/qc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""
Quality Control module for ADTL
"""
import copy
import json
import functools
from typing import List, Union
from pathlib import Path
from typing import TypedDict, Dict, List, Any, Optional
from typing import List, Union, TypedDict, Any, Optional, Dict

import pandas as pd
import numpy as np
Expand Down Expand Up @@ -104,6 +105,15 @@ def schema(schema_path: Union[str, Path], pattern: str = "*.csv"):
pass


def get_result_from_insertion(data: Dict[str, Any]) -> WorkUnitResult:
    """Convert a result row as stored in the database back to in-memory form.

    Reverses the serialisation applied before insertion: ``fail_data`` is a
    JSON string that becomes a :class:`pandas.DataFrame`, and
    ``rows_fail_idx`` is a comma-separated string that becomes a list of
    ints.  Falsy values (None, empty string) are left untouched.  The input
    mapping is not modified; a deep copy is returned.
    """
    out: Dict[str, Any] = copy.deepcopy(data)  # type: ignore
    fail_data = out["fail_data"]
    if fail_data:
        out["fail_data"] = pd.DataFrame(json.loads(fail_data))
    fail_idx = out["rows_fail_idx"]
    if fail_idx:
        out["rows_fail_idx"] = list(map(int, fail_idx.split(",")))
    return out


def main(args=None):
from .runner import _main

Expand Down
97 changes: 97 additions & 0 deletions adtl/qc/report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""
Quality Control module for ADTL, report submodule
"""

import json
import sqlite3
from string import Template
from pathlib import Path
from typing import List, Any, Dict

import pandas as pd

from . import get_result_from_insertion

RULES_SUBFOLDER = "r"
DATASET_SUBFOLDER = "d"
TEMPLATES = Path(__file__).parent / "templates"
STYLE = TEMPLATES / "style.css"
INDEX = "index.html"


def render_result(result: Dict[str, Any]) -> str:
    """Render a single work unit result as an HTML ``<li>`` fragment.

    Parameters
    ----------
    result : raw result row (as returned from the results table) with at
        least the keys ``rows_success``, ``rows_fail``, ``dataset``,
        ``file`` and ``fail_data``.

    Returns
    -------
    An HTML list item showing the success ratio; when there is failing
    data, a collapsible ``<details>`` section with the failed rows.
    """
    result = get_result_from_insertion(result)  # type: ignore
    # BUGFIX: format fields may not contain spaces ("{ success }" raised
    # KeyError), "{ total }}" left a stray brace, and neither "success" nor
    # "total" is a column of the results query — use rows_success/rows_fail.
    total = result["rows_success"] + result["rows_fail"]
    out = "<li><tt>[{rows_success} / {total}]</tt> {dataset} / {file}".format(
        total=total, **result
    )
    if result["fail_data"] is not None:
        # BUGFIX: fail_data was already parsed into a DataFrame by
        # get_result_from_insertion(); calling json.loads() on it again
        # raised TypeError. Also fixed mismatched <pre>...</p> tags.
        out += """
<details>
<summary>Failed rows</summary>
<pre>{log}</pre>
</details></li>""".format(
            log=str(result["fail_data"])
        )
    else:
        out += "</li>"
    return out


def render_index(rules: List[Dict[str, Any]], datasets: List[str]) -> str:
    """Build the report landing page (index.html).

    Substitutes a list of dataset links and a list of rule links into the
    ``index.html`` template using :class:`string.Template`.
    """
    dataset_items = [
        '<li><a href="d/{0}.html">{0}</a></li>'.format(ds) for ds in datasets
    ]
    rule_items = [
        '<li><a href="r/{0}.html">{1}</a></li>'.format(r["name"], r["description"])
        for r in rules
    ]
    template = Template((TEMPLATES / "index.html").read_text())
    return template.substitute(
        dataset_index="\n".join(dataset_items),
        rule_index="\n".join(rule_items),
    )


def read_sql(
    conn: sqlite3.Connection, sql: str, columns: List[str]
) -> List[Dict[str, Any]]:
    """Execute *sql* on *conn* and return the rows as dicts keyed by *columns*.

    *columns* must match the select list of *sql* in order; values are
    zipped positionally.
    """
    rows = conn.cursor().execute(sql).fetchall()
    return [dict(zip(columns, row)) for row in rows]


def make_report(store_database: str, output_folder: Path = Path("qc_report")):
    """Make a static HTML report from a QC results database.

    Parameters
    ----------
    store_database : path to the SQLite database written by the QC runner
    output_folder : folder the report is written into; created (including
        parents) if it does not exist

    Writes ``style.css`` and ``index.html`` into *output_folder*.
    """
    # parents/exist_ok makes this robust to missing parents and races
    output_folder.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(store_database)
    try:
        datasets = read_sql(
            conn, "SELECT DISTINCT dataset FROM results", ["dataset"]
        )
        # NULL / empty dataset names are grouped under "unlabelled"
        datasets = [n["dataset"] if n["dataset"] else "unlabelled" for n in datasets]
        rules = read_sql(
            conn,
            "SELECT name, description, long_description FROM rules",
            ["name", "description", "long_description"],
        )
        (output_folder / "style.css").write_text(STYLE.read_text())
        (output_folder / INDEX).write_text(render_index(rules, datasets))
        # NOTE(review): per-rule and per-dataset pages are not generated
        # yet; results are fetched here for those pages. Removed leftover
        # debug print() calls.
        results = read_sql(
            conn,
            "SELECT * from results",
            [
                "rule",
                "dataset",
                "file",
                "rows_success",
                "rows_fail",
                "ratio_success",
                "rows_fail_idx",
                "success",
                "mostly",
                "fail_data",
            ],
        )
    finally:
        # BUGFIX: the connection was never closed
        conn.close()


# Allow running this module directly against the default QC database.
if __name__ == "__main__":
    make_report("adtl-qc.db")
16 changes: 6 additions & 10 deletions adtl/qc/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import pandas as pd

from . import Dataset, Rule, WorkUnit, WorkUnitResult
from .report import make_report

DEFAULT_PATTERN = "*.csv"

Expand All @@ -34,7 +35,7 @@
)"""

DDL_RULES = """CREATE TABLE IF NOT EXISTS rules (
rule TEXT,
name TEXT,
description TEXT,
long_description TEXT
)"""
Expand Down Expand Up @@ -125,15 +126,6 @@ def prepare_result_for_insertion(work_unit_result: WorkUnitResult) -> Dict[str,
return result


def get_result_from_insertion(data: Dict[str, Any]) -> WorkUnitResult:
result: Dict[str, Any] = copy.deepcopy(data) # type: ignore
if result["fail_data"]:
result["fail_data"] = pd.DataFrame(json.loads(result["fail_data"]))
if result["rows_fail_idx"]:
result["rows_fail_idx"] = [int(x) for x in result["rows_fail_idx"].split(",")]
return result


def process_work_unit(unit: WorkUnit, save_db: Optional[str] = None) -> WorkUnitResult:
rule = unit["rule"]
module = importlib.import_module(rule["module"])
Expand All @@ -158,6 +150,7 @@ def start(
rules_path: Path = Path("qc"),
data_file_formats: List[str] = ["csv"],
store_database: Optional[str] = None,
disable_report: bool = False,
) -> List[WorkUnitResult]:
rules = collect_rules(rules_path)
datasets = collect_datasets(data_path, data_file_formats)
Expand All @@ -174,6 +167,8 @@ def start(

pool = multiprocessing.Pool()
process_work_unit_db = functools.partial(process_work_unit, save_db=store_database)
if store_database and not disable_report:
make_report(store_database)
return pool.map(process_work_unit_db, work_units)


Expand All @@ -198,4 +193,5 @@ def _main(args=None):
Path(args.rule_root),
data_file_formats=args.format.split(","),
store_database=args.database,
disable_report=args.no_report,
)
12 changes: 4 additions & 8 deletions adtl/qc/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,19 @@
<h1>✱ adtl-qc report</h1>

<p class="runinfo">
Run on {{ date }}
Updated on {{ date }}
</p>
<h2>Datasets</h2>
<ul id="datasets">
{{#datasets}}
<li><a href="/d/{{ name }}.html">{{ name }}</a></li>
{{/datasets}}
$dataset_index
</ul>
</body>

</html>

<h2>Rules</h2>
<ul id="datasets">
{{#rules}}
<li><a href="/r/{{ rule }}.html">{{ description }}</a></li>
{{/rules}}
<ul id="rules">
$rule_index
</ul>


Expand Down

0 comments on commit a189345

Please sign in to comment.