Documenting the metadata (#321)

* documentation metadata and explain command
LAAC-LSCP · Nov 24, 2021 · 8bc67dd · 8bc67dd
1 parent 74f6a77
commit 8bc67dd
Show file tree

Hide file tree

Showing 8 changed files with 190 additions and 0 deletions.
diff --git a/ChildProject/cmdline.py b/ChildProject/cmdline.py
@@ -404,6 +404,73 @@ def overview(args):
         )
 
 
+@subcommand(
+    [arg("source", help="source data path"), arg("variable", help="name of the variable")]
+)
+def explain(args):
+    """prints information about a certain metadata variable"""
+
+    variable = args.variable.lower()
+
+    project = ChildProject(args.source)
+    project.read()
+
+    documentation = project.read_documentation()
+    documentation = documentation[documentation["variable"].str.lower() == variable]
+
+    if not len(documentation):
+        documentation = [
+            {
+                'variable': col.name,
+                'description': col.description,
+                'table': 'recordings',
+                'scope': 'unknown' 
+            }
+            for col in project.RECORDINGS_COLUMNS
+        ]
+
+        documentation += [
+            {
+                'variable': col.name,
+                'description': col.description,
+                'table': 'children',
+                'scope': 'unknown' 
+            }
+            for col in project.CHILDREN_COLUMNS
+        ]
+
+        documentation += [
+            {
+                'variable': col.name,
+                'description': col.description,
+                'table': 'annotations',
+                'scope': 'unknown' 
+            }
+            for col in AnnotationManager.SEGMENTS_COLUMNS
+        ]
+
+        documentation = pd.DataFrame(documentation)
+        documentation = documentation[documentation["variable"].str.lower() == variable]
+
+
+    if not len(documentation):
+        print(f"could not find any documentation for variable '{variable}'")
+        return
+
+    print(f"Matching documentation for '{variable}':")
+    for doc in documentation.to_dict(orient = 'records'):
+        print(f"\n\033[94mtable\033[0m: {doc['table']}")
+        print(f"\033[94mdescription\033[0m: {doc['description']}")
+
+        if 'values' in doc and not pd.isnull(doc['values']):
+            print(f"\033[94mvalues\033[0m: {doc['values']}")
+
+        if 'annotation_set' in doc and not pd.isnull(doc['annotation_set']):
+            print(f"\033[94mannotation set(s)\033[0m: {doc['annotation_set']}")
+
+        if 'scope' in doc and not pd.isnull(doc['scope']):
+            print(f"\033[94mscope\033[0m: {doc['scope']}")
+
 @subcommand(
     [
         arg("source", help="source data path"),

diff --git a/ChildProject/projects.py b/ChildProject/projects.py
@@ -212,6 +212,21 @@ class ChildProject:
         ),
     ]
 
+    # Columns of the machine-readable documentation tables
+    # (``docs/children.csv``, ``docs/recordings.csv``, ``docs/annotations.csv``).
+    # Only ``variable`` and ``description`` are required.
+    DOCUMENTATION_COLUMNS = [
+        IndexColumn(
+            name="variable", description="name of the variable", unique=True, required=True
+        ),
+        IndexColumn(
+            name="description", description="a definition of this field", required=True
+        ),
+        IndexColumn(name="values", description="a summary of authorized values"),
+        IndexColumn(name="scope", description="which group of users has access to it"),
+        IndexColumn(
+            name="annotation_set",
+            description="for annotations: which set(s) contain this variable",
+        ),
+    ]
+
     RAW_RECORDINGS = "recordings/raw"
     CONVERTED_RECORDINGS = "recordings/converted"
 
@@ -584,3 +599,21 @@ def compute_recordings_duration(self, profile: str = None) -> pd.DataFrame:
         recordings["duration"] = (recordings["duration"] * 1000).astype(int)
 
         return recordings
+
+    def read_documentation(self) -> pd.DataFrame:
+        docs = ["children", "recordings", "annotations"]
+
+        documentation = []
+
+        for doc in docs:
+            path = os.path.join(self.path, "docs", f"{doc}.csv")
+
+            if not os.path.exists(path):
+                continue
+
+            table = IndexTable(f"{doc}-documentation", path, self.DOCUMENTATION_COLUMNS)
+            table.read()
+            documentation.append(table.df.assign(table=doc))
+
+        documentation = pd.concat(documentation)
+        return documentation
diff --git a/docs/source/_ext/directives.py b/docs/source/_ext/directives.py
@@ -52,6 +52,8 @@ def __init__(self, *args, **kwargs):
             table = AnnotationManager.SEGMENTS_COLUMNS
         elif array == 'annotations':
             table = [c for c in AnnotationManager.INDEX_COLUMNS if (c.generated or c.required)]
+        elif array == 'documentation':
+            table = ChildProject.DOCUMENTATION_COLUMNS
 
         if not table:
             raise Exception("invalid table '{}'".format(array))

diff --git a/docs/source/format.rst b/docs/source/format.rst
@@ -49,12 +49,17 @@ organize your files into this structure):
    │   │   └───raw
    │   │   │   │   child1_3600.TextGrid
    │
+   └───docs (*)
+   │   │   children.csv
+   │   │   recordings.csv
    └───extra
        │   notes.txt
 
 The children and recordings notebooks should be CSV dataframes formatted according to
 the standards detailed right below.
 
+   (*) The ``docs`` folder is optional.
+
 .. _format-metadata:
 
 Metadata
@@ -204,3 +209,40 @@ following format as an input:
    In order to avoid rounding errors, all timestamps are integers,
    expressed in milliseconds.
 
+Documentation
+-------------
+
+An important aspect of a dataset is its documentation.
+Documentation includes:
+
+ - authorship, references, contact information
+ - a description of the corpus (population, collection process, etc.)
+ - instructions to re-use the data
+ - description of the data itself (e.g. a definition of each metadata field)
+
+We currently do not provide a format for *all* of this information.
+It is up to you to decide how to provide users with each of these pieces of information.
+
+However, we suggest several options below.
+
+Metadata and annotations
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ChildProject package supports a machine-readable format 
+to describe the contents of the metadata and the annotations.
+
+This format consists of a CSV dataframe structured according 
+to the following table:
+
+.. index-table:: Machine-readable documentation
+   :header: documentation
+
+ - Documentation for the children metadata should be stored in ``docs/children.csv``
+ - Documentation for the recordings metadata should be stored in ``docs/recordings.csv``
+ - Documentation for annotations should be stored in ``docs/annotations.csv``
+
+Authorship
+~~~~~~~~~~
+
+We recommend DataCite's .yaml format (see `here <https://github.com/G-Node/gogs/blob/master/conf/datacite/datacite.yml>`_)
+
diff --git a/examples/valid_raw_data/docs/children.csv b/examples/valid_raw_data/docs/children.csv
@@ -0,0 +1,3 @@
+variable,description
+child_dob,The date of birth of the child!
+notes,Random notes!
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -44,3 +44,23 @@ def test_compute_durations():
     )
     assert exit_code == 0
 
+def test_explain():
+    stdout, stderr, exit_code = cli(
+        [
+            "child-project",
+            "explain",
+            "examples/valid_raw_data",
+            "notes"
+        ]
+    )
+    assert exit_code == 0
+
+    stdout, stderr, exit_code = cli(
+        [
+            "child-project",
+            "explain",
+            "examples/valid_raw_data",
+            "non-existent-variable"
+        ]
+    )
+    assert exit_code == 0
diff --git a/tests/test_documentation.py b/tests/test_documentation.py
@@ -0,0 +1,20 @@
+from ChildProject.projects import ChildProject
+import pandas as pd
+
+
+def standardize_dataframe(df, columns):
+    df = df[list(columns)]
+    return df.sort_index(axis=1).sort_values(list(columns)).reset_index(drop=True)
+
+
+def test_read():
+    project = ChildProject("examples/valid_raw_data")
+    project.read()
+
+    doc = project.read_documentation()
+    truth = pd.read_csv("tests/truth/docs.csv")
+
+    pd.testing.assert_frame_equal(
+        standardize_dataframe(doc, columns=truth.columns),
+        standardize_dataframe(truth, columns=truth.columns),
+    )
diff --git a/tests/truth/docs.csv b/tests/truth/docs.csv
@@ -0,0 +1,3 @@
+variable,description,table
+child_dob,The date of birth of the child!,children
+notes,Random notes!,children