From 8dc41980ad23f349da2ec8879c8ac993390fc70f Mon Sep 17 00:00:00 2001 From: Abhishek Dasgupta Date: Mon, 4 Nov 2024 17:48:05 +0000 Subject: [PATCH 1/4] run pre-commit hook --- PRD.md | 2 +- src/arcmapper/app.py | 33 +++++++++++++++++++++++++++++---- src/arcmapper/arc.py | 1 - src/arcmapper/dictionary.py | 11 +++++++++-- src/arcmapper/strategies.py | 2 -- src/arcmapper/util.py | 10 +++++++--- tests/test_arc.py | 7 +++++-- tests/test_util.py | 28 +++++++++++++++++++++++----- 8 files changed, 74 insertions(+), 20 deletions(-) diff --git a/PRD.md b/PRD.md index 37b94df..5debc04 100644 --- a/PRD.md +++ b/PRD.md @@ -90,4 +90,4 @@ notebook or Python interface. Figma: https://www.figma.com/design/bpaumciR1DPhGejjCXER86/ISARIC-ARCmapper -![ARCmapper UX mockup](images/arcmapper-mockup.png) \ No newline at end of file +![ARCmapper UX mockup](images/arcmapper-mockup.png) diff --git a/src/arcmapper/app.py b/src/arcmapper/app.py index d6637c7..642e4c6 100644 --- a/src/arcmapper/app.py +++ b/src/arcmapper/app.py @@ -219,7 +219,8 @@ def upload_data_dictionary( ): ok = dbc.Alert("Upload successful", color="success") - def err(msg): return dbc.Alert(msg, color="danger") + def err(msg): + return dbc.Alert(msg, color="danger") if ctx.triggered_id == "upload-btn" and upload_contents is not None: try: @@ -231,7 +232,12 @@ def err(msg): return dbc.Alert(msg, color="danger") return {}, err("Description column not found") if col_responses not in df.columns: return {}, err("Responses column not found") - data = read_data_dictionary(df, description_field=col_description, response_field=col_responses, response_func="redcap") + data = read_data_dictionary( + df, + description_field=col_description, + response_field=col_responses, + response_func="redcap", + ) return data.to_json(), ok except Exception as e: @@ -258,7 +264,10 @@ def invoke_map_arc(data, _, version, method, num_matches): dash_table.DataTable( id="mapping", data=mapped_data.to_dict("records"), - columns=[{"name": i, "id": i} for i in mapped_data.columns], + columns=[ + {"name": i, "id": i, "editable": i != "status"} + for i in mapped_data.columns + ], editable=True, style_data={ "whiteSpace": "normal", @@ -269,7 +278,23 @@ def invoke_map_arc(data, _, version, method, num_matches): page_size=25, ), ) - return html.Span("No data to see here") + else: + return html.Span("No data to see here") + +@callback( + Output("mapping", "data"), + Output("mapping", "active_cell"), + State("mapping-store", "data"), + Input("mapping", "active_cell"), +) +def handle_status(data, active_cell): + if active_cell and active_cell.get("column_id") == "status": + i = active_cell.get("row") + row = data[i] + active_cell = False # Permits the button to be clicked again straight away + row["status"] = "✓" if row["status"] == "-" else "-" + return data, active_cell + app.layout = html.Div([navbar, upload_form, arc_form, output_table]) server = app.server diff --git a/src/arcmapper/arc.py b/src/arcmapper/arc.py index 40fef0e..c97e2c6 100644 --- a/src/arcmapper/arc.py +++ b/src/arcmapper/arc.py @@ -11,7 +11,6 @@ def arc_schema_url(arc_version: str) -> str: return f"https://github.com/ISARICResearch/DataPlatform/raw/refs/heads/main/ARCH/ARCH{arc_version}/ARCH.csv" - def read_arc_schema( arc_version_or_file: str, preset: str | None = None ) -> pd.DataFrame: diff --git a/src/arcmapper/dictionary.py b/src/arcmapper/dictionary.py index e26546a..899df0d 100644 --- a/src/arcmapper/dictionary.py +++ b/src/arcmapper/dictionary.py @@ -101,9 +101,16 @@ def read_from_jsonschema(data: str | dict[str, Any]) -> pd.DataFrame: # type: i dd = [] variables = data["properties"] for v in variables: - t = variables[v].get("type","string") + t = variables[v].get("type", "string") if "enum" in variables[v]: - dd.append((v, variables[v].get("description", ""), [(x, x) for x in variables[v]["enum"]], 'categorical')) + dd.append( + ( + v, + variables[v].get("description", ""), + [(x, x) for x in variables[v]["enum"]], + "categorical", + ) + ) else: dd.append((v, variables[v].get("description", ""), None, t)) return pd.DataFrame(dd, columns=["variable", "description", "responses", "type"]) diff --git a/src/arcmapper/strategies.py b/src/arcmapper/strategies.py index 2247765..f434ca6 100644 --- a/src/arcmapper/strategies.py +++ b/src/arcmapper/strategies.py @@ -232,5 +232,3 @@ def map( return sbert(dictionary, arc, num_matches=num_matches) case _: raise ValueError(f"Unknown mapping method: {method}") - - diff --git a/src/arcmapper/util.py b/src/arcmapper/util.py index 14937a4..5064ee2 100644 --- a/src/arcmapper/util.py +++ b/src/arcmapper/util.py @@ -12,6 +12,10 @@ from .types import Responses +def ctx_trigger(ctx, event): + return any(k["prop_id"] == event for k in ctx.triggered) + + def read_data(file_or_dataframe: str | pd.DataFrame) -> pd.DataFrame: if isinstance(file_or_dataframe, pd.DataFrame): return file_or_dataframe @@ -22,8 +26,9 @@ def read_data(file_or_dataframe: str | pd.DataFrame) -> pd.DataFrame: case ".csv": return pd.read_csv(file) + def read_upload_data(contents: str, filename) -> pd.DataFrame | None: - _, content_string = contents.split(',') + _, content_string = contents.split(",") decoded = base64.b64decode(content_string) path = Path(filename) @@ -31,8 +36,7 @@ def read_upload_data(contents: str, filename) -> pd.DataFrame | None: match path.suffix: case ".csv": # Assume that the user uploaded a CSV file - df = pd.read_csv( - io.StringIO(decoded.decode('utf-8'))) + df = pd.read_csv(io.StringIO(decoded.decode("utf-8"))) case ".xlsx": df = pd.read_excel(io.BytesIO(decoded)) case _: diff --git a/tests/test_arc.py b/tests/test_arc.py index 5976d7c..75baeee 100644 --- a/tests/test_arc.py +++ b/tests/test_arc.py @@ -6,9 +6,12 @@ def test_arc_schema_url(): - assert arc_schema_url("1.0.0") == "https://github.com/ISARICResearch/DataPlatform/raw/refs/heads/main/ARCH/ARCH1.0.0/ARCH.csv" + assert ( + arc_schema_url("1.0.0") + == "https://github.com/ISARICResearch/DataPlatform/raw/refs/heads/main/ARCH/ARCH1.0.0/ARCH.csv" + ) def test_read_arc_schema(): - arc = read_arc_schema(str(Path(__file__).parent / 'data' / 'ARCH.csv')) + arc = read_arc_schema(str(Path(__file__).parent / "data" / "ARCH.csv")) print(arc) diff --git a/tests/test_util.py b/tests/test_util.py index c70999c..b7d079b 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,26 +1,44 @@ "Utility functions for arcmapper" + from pathlib import Path import pandas as pd -from arcmapper.util import read_data, read_csv_with_encoding_detection, parse_redcap_response, read_upload_data +from arcmapper.util import ( + read_data, + read_csv_with_encoding_detection, + parse_redcap_response, + read_upload_data, +) + def test_read_data(): - arc_path = str(Path(__file__).parent / 'data' / 'ARCH.csv') + arc_path = str(Path(__file__).parent / "data" / "ARCH.csv") assert isinstance(read_data(arc_path), pd.DataFrame) + def test_read_csv_with_encoding_detection(): "Reads CSV file with encoding detection" - arc_path = str(Path(__file__).parent / 'data' / 'ARCH.csv') + arc_path = str(Path(__file__).parent / "data" / "ARCH.csv") assert isinstance(read_csv_with_encoding_detection(arc_path), pd.DataFrame) + def test_read_upload_data(): contents = "something,dmFyaWFibGUsZGVzY3JpcHRpb24Kc3ViamlkLFN1YmplY3QgSUQKZ2VuLEdlbmRlciBvZiB0aGUgcGF0aWVudAo=" - expected = pd.DataFrame({'variable': ['subjid', 'gen'], 'description': ['Subject ID', 'Gender of the patient']}) + expected = pd.DataFrame( + { + "variable": ["subjid", "gen"], + "description": ["Subject ID", "Gender of the patient"], + } + ) df = read_upload_data(contents, "file.csv") assert df is not None assert df.equals(expected) + def test_parse_redcap_response(): - assert parse_redcap_response("1, male | 2, female") == [("1", "male"), ("2", "female")] + assert parse_redcap_response("1, male | 2, female") == [ + ("1", "male"), + ("2", "female"), + ] From 1fbe431ab231ac721ca39ba394efedb96eb05f01 Mon Sep 17 00:00:00 2001 From: Abhishek Dasgupta Date: Mon, 4 Nov 2024 17:46:22 +0000 Subject: [PATCH 2/4] app: allow editing table and accepting rows for final mapping --- src/arcmapper/app.py | 24 ++++++++++++++++++------ src/arcmapper/strategies.py | 2 ++ src/arcmapper/util.py | 1 - 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/arcmapper/app.py b/src/arcmapper/app.py index 642e4c6..1de5e9a 100644 --- a/src/arcmapper/app.py +++ b/src/arcmapper/app.py @@ -217,10 +217,10 @@ def upload_data_dictionary( col_responses, col_description, ): - ok = dbc.Alert("Upload successful", color="success") + ok = dbc.Alert("Upload successful", color="success", style={"marginTop": "1em"}) def err(msg): - return dbc.Alert(msg, color="danger") + return dbc.Alert(msg, color="danger", style={"marginTop": "1em"}) if ctx.triggered_id == "upload-btn" and upload_contents is not None: try: @@ -259,6 +259,7 @@ def invoke_map_arc(data, _, version, method, num_matches): if ctx.triggered_id == "map-btn": arc = read_arc_schema(version) dictionary = pd.read_json(data) + mapped_data = map_data_dictionary_to_arc(method, dictionary, arc, num_matches) return ( dash_table.DataTable( @@ -281,19 +282,30 @@ def invoke_map_arc(data, _, version, method, num_matches): else: return html.Span("No data to see here") + @callback( Output("mapping", "data"), + Output("mapping", "style_data_conditional"), Output("mapping", "active_cell"), - State("mapping-store", "data"), + Input("mapping", "data"), Input("mapping", "active_cell"), + prevent_initial_call=True, ) def handle_status(data, active_cell): if active_cell and active_cell.get("column_id") == "status": i = active_cell.get("row") row = data[i] - active_cell = False # Permits the button to be clicked again straight away - row["status"] = "✓" if row["status"] == "-" else "-" - return data, active_cell + row["status"] = "✅" if row["status"] == "-" else "-" + else: + raise dash.exceptions.PreventUpdate + highlight_row_idx = [i for i, row in enumerate(data) if row["status"] == "✅"] + return ( + data, # mapping data + [ + {"if": {"row_index": highlight_row_idx}, "backgroundColor": "bisque"} + ], # style_data_conditional + False, # unsets active cell, allowing the cell to be clicked immediately again + ) app.layout = html.Div([navbar, upload_form, arc_form, output_table]) diff --git a/src/arcmapper/strategies.py b/src/arcmapper/strategies.py index f434ca6..819ce67 100644 --- a/src/arcmapper/strategies.py +++ b/src/arcmapper/strategies.py @@ -42,6 +42,7 @@ def get_match_dataframe_from_similarity_matrix( match_df = pd.DataFrame( columns=[ + "status", "raw_variable", "raw_description", "arc_variable", @@ -52,6 +53,7 @@ def get_match_dataframe_from_similarity_matrix( [ [ [ + "-", dictionary.iloc[i].variable, dictionary.iloc[i].description, arc.iloc[k].variable, diff --git a/src/arcmapper/util.py b/src/arcmapper/util.py index 5064ee2..0c85a83 100644 --- a/src/arcmapper/util.py +++ b/src/arcmapper/util.py @@ -15,7 +15,6 @@ def ctx_trigger(ctx, event): return any(k["prop_id"] == event for k in ctx.triggered) - def read_data(file_or_dataframe: str | pd.DataFrame) -> pd.DataFrame: if isinstance(file_or_dataframe, pd.DataFrame): return file_or_dataframe From 9b312bbd7f4fd99a94776ad3b2242b3d872667ab Mon Sep 17 00:00:00 2001 From: Abhishek Dasgupta Date: Tue, 5 Nov 2024 13:13:48 +0000 Subject: [PATCH 3/4] app: add download mapping file support move forms into components --- src/arcmapper/app.py | 177 ++++++------------------------------ src/arcmapper/components.py | 151 ++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+), 149 deletions(-) diff --git a/src/arcmapper/app.py b/src/arcmapper/app.py index 1de5e9a..314511b 100644 --- a/src/arcmapper/app.py +++ b/src/arcmapper/app.py @@ -6,7 +6,7 @@ import dash_bootstrap_components as dbc -from .components import select +from .components import arc_form, upload_form from .util import read_upload_data from .dictionary import read_data_dictionary from .strategies import map as map_data_dictionary_to_arc @@ -43,159 +43,23 @@ dark=True, ) -upload_form = dbc.Container( +output_table = dbc.Container( html.Div( - dbc.Form( - [ - dbc.Row( - [ - html.P( - [ - html.Strong("SOURCE: "), - "ARCmapper supports data dictionaries in CSV, XLSX, JSON schema, or you can upload sample data.", - ] - ) - ] - ), - dbc.Row( - dbc.Col( - dbc.Switch( - id="upload-is-sample-data", - label="Uploaded file is sample data, not a data dictionary. Data will be sent to server, only use on local deployments", - disabled=True, - ) - ) - ), - dbc.Row( - [ - dbc.Label("Responses column", width="auto"), - dbc.Col( - dbc.Input( - id="upload-col-responses", - type="text", - value="Choices, Calculations, OR Slider Labels", - ), - className="me-3", - ), - dbc.Label("Description column", width="auto"), - dbc.Col( - dbc.Input( - id="upload-col-description", - type="text", - placeholder="Defaults to longest column", - value="Field Label", - ), - className="me-3", - ), - dbc.Col( - dcc.Upload( - id="upload-input-file", - children=html.Div( - "Drag and drop or select file", - style={ - "border": "1px dashed silver", - "padding": "0.3em", - }, - ), - ), - className="me-3", - ), - dbc.Col( - dbc.Button( - "Upload", id="upload-btn", color="primary", n_clicks=0 - ) - ), - dcc.Store(id="upload-data-dictionary"), - ], - className="g-2", - ), - dbc.Row(id="upload-status"), - ] - ), - style={ - "border": "1px solid silver", - "border-radius": "0.4em", - "padding": "1em", - }, - ), - style={"margin-top": "1em"}, + dbc.Row(id="output"), + style={"padding": "0.5em", "border": "1px solid silver", "borderRadius": "5px"}, + ) ) -arc_form = dbc.Container( - html.Div( - dbc.Form( +final_mapping_form = dbc.Container( + dbc.Row( + dbc.Col( [ - dbc.Row( - html.P( - [ - html.Strong("TARGET: "), - "Choose the target ARC version and select method and method parameters", - ] - ) - ), - dbc.Row( - [ - dbc.Label("Target ARC version", width="auto"), - dbc.Col( - select("arc-version", ["1.0.0", "1.0.1"]), - className="me-3", - ), - dbc.Label("Mapping method", width="auto"), - dbc.Col( - dbc.Select( - id="arc-mapping-method", - options=[ - {"label": "TF-IDF", "value": "tf-idf"}, - { - "label": "Sentence Transformers", - "value": "sbert", - }, - ], - value="tf-idf", - ), - className="me-3", - ), - dbc.Label("Number of matches", width="auto"), - dbc.Col( - dbc.Input( - id="arc-num-matches", - type="number", - min=2, - max=10, - step=1, - value=3, - ), - ), - dbc.Label("Threshold", width="auto"), - dbc.Col( - dbc.Input( - id="arc-threshold", - type="number", - min=0.1, - max=1, - step=0.1, - value=0.3, - ), - ), - dbc.Col(dbc.Button("Map to ARC", id="map-btn"), width="auto"), - ], - className="g-2", + dcc.Download(id="download-mapping"), + dbc.Button( + "Download mapping", id="download-btn", style={"marginTop": "1em"} ), ] - ), - style={ - "border": "1px solid silver", - "border-radius": "0.4em", - "padding": "1em", - }, - ), - style={"margin-top": "1em"}, -) - -output_table = dbc.Container( - html.Div( - dbc.Row(id="output"), - style={"padding": "0.5em", "border": "1px solid silver", "borderRadius": "5px"}, + ) ) ) @@ -308,5 +172,20 @@ def handle_status(data, active_cell): ) -app.layout = html.Div([navbar, upload_form, arc_form, output_table]) +@callback( + Output("download-mapping", "data"), + Input("download-btn", "n_clicks"), + State("mapping", "data"), + prevent_initial_call=True, +) +def handle_download(_, data): + if ctx.triggered_id == "download-btn": + df = pd.DataFrame(data) + df = df[df.status == "✅"].drop(columns=["status", "rank"]) + return dcc.send_data_frame(df.to_csv, "arcmapper-mapping-file.csv", index=False) + else: + raise dash.exceptions.PreventUpdate + + +app.layout = html.Div([navbar, upload_form, arc_form, output_table, final_mapping_form]) server = app.server diff --git a/src/arcmapper/components.py b/src/arcmapper/components.py index 9669cbc..afb6751 100644 --- a/src/arcmapper/components.py +++ b/src/arcmapper/components.py @@ -1,3 +1,4 @@ +from dash import html, dcc import dash_bootstrap_components as dbc @@ -7,3 +8,153 @@ def select(id: str, values: list[str], default: str | None = None) -> dbc.Select value=default if default else values[0], options=[{"label": v, "value": v} for v in values], ) + + +upload_form = dbc.Container( + html.Div( + dbc.Form( + [ + dbc.Row( + [ + html.P( + [ + html.Strong("SOURCE: "), + "ARCmapper supports data dictionaries in CSV, XLSX, JSON schema, or you can upload sample data.", + ] + ) + ] + ), + dbc.Row( + dbc.Col( + dbc.Switch( + id="upload-is-sample-data", + label="Uploaded file is sample data, not a data dictionary. Data will be sent to server, only use on local deployments", + disabled=True, + ) + ) + ), + dbc.Row( + [ + dbc.Label("Responses column", width="auto"), + dbc.Col( + dbc.Input( + id="upload-col-responses", + type="text", + value="Choices, Calculations, OR Slider Labels", + ), + className="me-3", + ), + dbc.Label("Description column", width="auto"), + dbc.Col( + dbc.Input( + id="upload-col-description", + type="text", + placeholder="Defaults to longest column", + value="Field Label", + ), + className="me-3", + ), + dbc.Col( + dcc.Upload( + id="upload-input-file", + children=html.Div( + "Drag and drop or select file", + style={ + "border": "1px dashed silver", + "padding": "0.3em", + }, + ), + ), + className="me-3", + ), + dbc.Col( + dbc.Button( + "Upload", id="upload-btn", color="primary", n_clicks=0 + ) + ), + dcc.Store(id="upload-data-dictionary"), + ], + className="g-2", + ), + dbc.Row(id="upload-status"), + ] + ), + style={ + "border": "1px solid silver", + "border-radius": "0.4em", + "padding": "1em", + }, + ), + style={"margin-top": "1em"}, +) + +arc_form = dbc.Container( + html.Div( + dbc.Form( + [ + dbc.Row( + html.P( + [ + html.Strong("TARGET: "), + "Choose the target ARC version and select method and method parameters", + ] + ) + ), + dbc.Row( + [ + dbc.Label("Target ARC version", width="auto"), + dbc.Col( + select("arc-version", ["1.0.0", "1.0.1"]), + className="me-3", + ), + dbc.Label("Mapping method", width="auto"), + dbc.Col( + dbc.Select( + id="arc-mapping-method", + options=[ + {"label": "TF-IDF", "value": "tf-idf"}, + { + "label": "Sentence Transformers", + "value": "sbert", + }, + ], + value="tf-idf", + ), + className="me-3", + ), + dbc.Label("Number of matches", width="auto"), + dbc.Col( + dbc.Input( + id="arc-num-matches", + type="number", + min=2, + max=10, + step=1, + value=3, + ), + ), + dbc.Label("Threshold", width="auto"), + dbc.Col( + dbc.Input( + id="arc-threshold", + type="number", + min=0.1, + max=1, + step=0.1, + value=0.3, + ), + ), + dbc.Col(dbc.Button("Map to ARC", id="map-btn"), width="auto"), + ], + className="g-2", + ), + ] + ), + style={ + "border": "1px solid silver", + "border-radius": "0.4em", + "padding": "1em", + }, + ), + style={"margin-top": "1em"}, +) From b5441ca6158375fcdc5f155664a3da316eade124 Mon Sep 17 00:00:00 2001 From: Abhishek Dasgupta Date: Tue, 5 Nov 2024 14:06:56 +0000 Subject: [PATCH 4/4] app: fix highlighting using row_ids row_index cannot be used as it is reset on every page turn, use stable row_ids instead for consistent marking --- src/arcmapper/app.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/arcmapper/app.py b/src/arcmapper/app.py index 314511b..d39b1c5 100644 --- a/src/arcmapper/app.py +++ b/src/arcmapper/app.py @@ -5,7 +5,6 @@ from dash import dcc, html, ctx, callback, dash_table, Input, Output, State import dash_bootstrap_components as dbc - from .components import arc_form, upload_form from .util import read_upload_data from .dictionary import read_data_dictionary @@ -15,6 +14,10 @@ app = dash.Dash("arcmapper", external_stylesheets=[dbc.themes.BOOTSTRAP]) app.title = "ARCMapper" +PAGE_SIZE = 25 +OK = "✅" +HIGHLIGHT_COLOR = "bisque" + navbar = dbc.Navbar( dbc.Container( [ @@ -125,10 +128,13 @@ def invoke_map_arc(data, _, version, method, num_matches): dictionary = pd.read_json(data) mapped_data = map_data_dictionary_to_arc(method, dictionary, arc, num_matches) + data = mapped_data.to_dict("records") + for i, row in enumerate(data): + row["id"] = i return ( dash_table.DataTable( id="mapping", - data=mapped_data.to_dict("records"), + data=data, columns=[ {"name": i, "id": i, "editable": i != "status"} for i in mapped_data.columns @@ -140,7 +146,7 @@ def invoke_map_arc(data, _, version, method, num_matches): "fontSize": "90%", }, style_table={"overflowX": "auto"}, - page_size=25, + page_size=PAGE_SIZE, ), ) else: @@ -157,16 +163,23 @@ def invoke_map_arc(data, _, version, method, num_matches): ) def handle_status(data, active_cell): if active_cell and active_cell.get("column_id") == "status": - i = active_cell.get("row") + i = active_cell.get("row_id") row = data[i] - row["status"] = "✅" if row["status"] == "-" else "-" + row["status"] = OK if row["status"] == "-" else "-" else: raise dash.exceptions.PreventUpdate - highlight_row_idx = [i for i, row in enumerate(data) if row["status"] == "✅"] + highlighted_rows = [i for i in range(len(data)) if data[i]["status"] == OK] return ( data, # mapping data [ - {"if": {"row_index": highlight_row_idx}, "backgroundColor": "bisque"} + { + "if": { + "filter_query": " || ".join( + f"{{id}} = {k}" for k in highlighted_rows + ) + }, + "backgroundColor": HIGHLIGHT_COLOR, + } ], # style_data_conditional False, # unsets active cell, allowing the cell to be clicked immediately again ) @@ -181,7 +194,7 @@ def handle_status(data, active_cell): def handle_download(_, data): if ctx.triggered_id == "download-btn": df = pd.DataFrame(data) - df = df[df.status == "✅"].drop(columns=["status", "rank"]) + df = df[df.status == OK].drop(columns=["status", "rank"]) return dcc.send_data_frame(df.to_csv, "arcmapper-mapping-file.csv", index=False) else: raise dash.exceptions.PreventUpdate