Skip to content

Commit

Permalink
Documenting the metadata (#321)
Browse files Browse the repository at this point in the history
* documentation metadata and explain command
  • Loading branch information
lucasgautheron authored Nov 24, 2021
1 parent 74f6a77 commit 8bc67dd
Show file tree
Hide file tree
Showing 8 changed files with 190 additions and 0 deletions.
67 changes: 67 additions & 0 deletions ChildProject/cmdline.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,73 @@ def overview(args):
)


@subcommand(
    [arg("source", help="source data path"), arg("variable", help="name of the variable")]
)
def explain(args):
    """prints information about a certain metadata variable"""

    # Variable names are compared case-insensitively.
    variable = args.variable.lower()

    project = ChildProject(args.source)
    project.read()

    def matching(table):
        # Keep only the rows that document the requested variable.
        return table[table["variable"].str.lower() == variable]

    # First look into the dataset's own documentation tables.
    documentation = matching(project.read_documentation())

    if len(documentation) == 0:
        # Nothing dataset-specific: fall back to the descriptions of the
        # standard columns of each table, in the same order as before.
        builtin_tables = (
            ('recordings', project.RECORDINGS_COLUMNS),
            ('children', project.CHILDREN_COLUMNS),
            ('annotations', AnnotationManager.SEGMENTS_COLUMNS),
        )

        rows = []
        for table_name, columns in builtin_tables:
            for col in columns:
                rows.append(
                    {
                        'variable': col.name,
                        'description': col.description,
                        'table': table_name,
                        'scope': 'unknown',
                    }
                )

        documentation = matching(pd.DataFrame(rows))

    if len(documentation) == 0:
        print(f"could not find any documentation for variable '{variable}'")
        return

    print(f"Matching documentation for '{variable}':")
    for doc in documentation.to_dict(orient='records'):
        print(f"\n\033[94mtable\033[0m: {doc['table']}")
        print(f"\033[94mdescription\033[0m: {doc['description']}")

        # Optional fields are printed only when present and not null.
        if pd.notnull(doc.get('values')):
            print(f"\033[94mvalues\033[0m: {doc['values']}")

        if pd.notnull(doc.get('annotation_set')):
            print(f"\033[94mannotation set(s)\033[0m: {doc['annotation_set']}")

        if pd.notnull(doc.get('scope')):
            print(f"\033[94mscope\033[0m: {doc['scope']}")

@subcommand(
[
arg("source", help="source data path"),
Expand Down
33 changes: 33 additions & 0 deletions ChildProject/projects.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,21 @@ class ChildProject:
),
]

# Columns of the documentation tables (docs/children.csv, docs/recordings.csv,
# docs/annotations.csv) loaded by read_documentation(). Each row of such a
# table documents one variable; only "variable" and "description" are required.
DOCUMENTATION_COLUMNS = [
IndexColumn(
name="variable", description="name of the variable", unique=True, required=True
),
IndexColumn(
name="description", description="a definition of this field", required=True
),
IndexColumn(name="values", description="a summary of authorized values"),
IndexColumn(name="scope", description="which group of users has access to it"),
IndexColumn(
name="annotation_set",
description="for annotations: which set(s) contain this variable",
),
]

RAW_RECORDINGS = "recordings/raw"
CONVERTED_RECORDINGS = "recordings/converted"

Expand Down Expand Up @@ -584,3 +599,21 @@ def compute_recordings_duration(self, profile: str = None) -> pd.DataFrame:
recordings["duration"] = (recordings["duration"] * 1000).astype(int)

return recordings

def read_documentation(self) -> pd.DataFrame:
    """Read the documentation tables stored in the ``docs`` folder.

    Looks for ``docs/children.csv``, ``docs/recordings.csv`` and
    ``docs/annotations.csv`` under ``self.path``. Files that do not exist
    are skipped. Each table found is read as an ``IndexTable`` described by
    ``DOCUMENTATION_COLUMNS`` and tagged with a ``table`` column holding
    the name of the table it documents.

    :return: the concatenation of all documentation tables found; an empty
        dataframe (with the documentation columns plus ``table``) when no
        documentation file exists.
    :rtype: pd.DataFrame
    """
    tables = []

    for doc in ("children", "recordings", "annotations"):
        path = os.path.join(self.path, "docs", f"{doc}.csv")

        if not os.path.exists(path):
            continue

        table = IndexTable(f"{doc}-documentation", path, self.DOCUMENTATION_COLUMNS)
        table.read()
        tables.append(table.df.assign(table=doc))

    if not tables:
        # pd.concat() raises ValueError on an empty list; return an empty
        # frame with the expected columns so callers can still filter on them.
        columns = [col.name for col in self.DOCUMENTATION_COLUMNS] + ["table"]
        return pd.DataFrame(columns=columns)

    return pd.concat(tables)
2 changes: 2 additions & 0 deletions docs/source/_ext/directives.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ def __init__(self, *args, **kwargs):
table = AnnotationManager.SEGMENTS_COLUMNS
elif array == 'annotations':
table = [c for c in AnnotationManager.INDEX_COLUMNS if (c.generated or c.required)]
elif array == 'documentation':
table = ChildProject.DOCUMENTATION_COLUMNS

if not table:
raise Exception("invalid table '{}'".format(array))
Expand Down
42 changes: 42 additions & 0 deletions docs/source/format.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,17 @@ organize your files into this structure):
│ │ └───raw
│ │ │ │ child1_3600.TextGrid
└───docs (*)
│ │ children.csv
│ │ recordings.csv
└───extra
│ notes.txt

The children and recordings notebooks should be CSV dataframes formatted according to
the standards detailed right below.

(*) The ``docs`` folder is optional.

.. _format-metadata:

Metadata
Expand Down Expand Up @@ -204,3 +209,40 @@ following format as an input:
In order to avoid rounding errors, all timestamps are integers,
expressed in milliseconds.

Documentation
-------------

An important aspect of a dataset is its documentation.
Documentation includes:

- authorship, references, contact information
- a description of the corpus (population, collection process, etc.)
- instructions to re-use the data
- description of the data itself (e.g. a definition of each metadata field)

We currently do not provide a format for *all* of these items.
It is up to you to decide how to provide users with each of these pieces of information.

However, we suggest several options below.

Metadata and annotations
~~~~~~~~~~~~~~~~~~~~~~~~

The ChildProject package supports a machine-readable format
to describe the contents of the metadata and the annotations.

This format consists of CSV dataframes structured according
to the following table:

.. index-table:: Machine-readable documentation
:header: documentation

- Documentation for the children metadata should be stored in ``docs/children.csv``
- Documentation for the recordings metadata should be stored in ``docs/recordings.csv``
- Documentation for annotations should be stored in ``docs/annotations.csv``

Authorship
~~~~~~~~~~

We recommend DataCite's .yaml format (see `here <https://github.com/G-Node/gogs/blob/master/conf/datacite/datacite.yml>`_)

3 changes: 3 additions & 0 deletions examples/valid_raw_data/docs/children.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
variable,description
child_dob,The date of birth of the child!
notes,Random notes!
20 changes: 20 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,23 @@ def test_compute_durations():
)
assert exit_code == 0

def test_explain():
    """`explain` exits cleanly for both a documented and an unknown variable."""
    for variable in ("notes", "non-existent-variable"):
        stdout, stderr, exit_code = cli(
            ["child-project", "explain", "examples/valid_raw_data", variable]
        )
        assert exit_code == 0
20 changes: 20 additions & 0 deletions tests/test_documentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from ChildProject.projects import ChildProject
import pandas as pd


def standardize_dataframe(df, columns):
    """Return `df` restricted to `columns` in a canonical layout.

    Columns are ordered alphabetically, rows are sorted by the values of
    `columns` (in the order given) and the index is reset, so that two
    dataframes with the same content compare equal.
    """
    keys = list(columns)
    subset = df[keys]
    ordered_columns = subset.sort_index(axis=1)
    ordered_rows = ordered_columns.sort_values(keys)
    return ordered_rows.reset_index(drop=True)


def test_read():
    """The documentation of the example dataset matches the stored truth."""
    project = ChildProject("examples/valid_raw_data")
    project.read()

    expected = pd.read_csv("tests/truth/docs.csv")
    actual = project.read_documentation()

    # Compare on the truth's columns only, in a canonical row/column order.
    compared_columns = expected.columns
    pd.testing.assert_frame_equal(
        standardize_dataframe(actual, columns=compared_columns),
        standardize_dataframe(expected, columns=compared_columns),
    )
3 changes: 3 additions & 0 deletions tests/truth/docs.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
variable,description,table
child_dob,The date of birth of the child!,children
notes,Random notes!,children

0 comments on commit 8bc67dd

Please sign in to comment.