From 7b8f70e42df8a7a230038293c3bd7332f1ca5982 Mon Sep 17 00:00:00 2001 From: johentsch Date: Wed, 22 Feb 2023 18:50:11 +0100 Subject: [PATCH 01/44] end-of-day commig of dezrann.py collab with Louis Couturier --- src/ms3/dezrann.py | 243 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 src/ms3/dezrann.py diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py new file mode 100644 index 00000000..3265a8de --- /dev/null +++ b/src/ms3/dezrann.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +DCML to Dezrann +=============== + +Script to convert contiguous score annotations from a tabular format (one line per label) into +the JSON-LD format .dez, used by the Dezrann annotation tool developed at the Algomus group. + +# Intro + +The script presents a first application of what is to become a formal standard of a "measure map"; +see first discussion points at + +* https://gitlab.com/algomus.fr/dezrann/dezrann/-/issues/1030#note_1122509147) +* https://github.com/MarkGotham/bar-measure/ + +As an early proxy of a measure map, the current version uses the measure tables that each +DCML corpus provides in its `measures` folder. This is beneficial in the current context because: + +1. The files are required for correct, actionable quarter-note positions without having to re-parse + the entire score. +2. The files play an essential role for validating the conversion output. +3. They help avoiding the confusion that necessarily arises when several addressing schemes are + at play. + +In detail: + +## 1. Quarterbeats + +From a technical perspective, offsets in the sense of "distance from the origin" represent the +primary mechanism of referencing positions in a text (character counts being the default in NLP). 
+Music scores are typically aligned with a time line of "musical time", an alignment which is +frequently expressed as float values representing an event's distance from the score's beginning, +measured in quarter notes, here referred to as quarterbeats. The fundamental problem, however, is +ensuring that quarterbeat positions refer to the same time line. The commonplace +score encoding formats do not indicate quarterbeat positions. Instead, they structure +musical time in a sequence of containers, generally called "measures", each of which represents +a time line starting from 0. Counting measure units (of some kind) therefore represents the second +prevalent way of indicating positions in a score, together with an event onset indicating an +event's distance from the container's beginning. To avoid terminological confusion, we call +the distance from the beginning of a measure container "onset". + +Looking at a single score, there is an unambiguous mapping between the two types of positions: +`event_offset = measure_offset + event_onset`. Problems arise, however when information from one +score is to be set into relation with timed information from another source. This is a wide-spread +problem in the context of music research and musical corpus studies where data from different +sources with different ways of expressing timestamps frequently needs to be aligned, often in +absence of the original score that one of the source is aligned to. Currently, there is no +standardized way of storing such alignments for later re-use. Hence the idea of a central +mapping file for storing alignments between positions given as quarterbeats, measure+onset, +recording timestamps in seconds, IDs, and other data relevant for score addressability. + +**Different types of quarterbeats** + +All TSV files issued by the DCML come with the column `quarterbeats` indicating every event's +offset from the score's beginning (position 0). 
With the caveat that, in the case of first/second endings +("voltas"), the indicated values do not take into account any but the second ending, with the +rationale that they should represent the temporal proportion of a single playthrough without any +repetitions. For correct conversion, therefore, using a strict, measuring-stick-based variant +of `quarterbeats` will probably be useful. This means that the default `quarterbeats` should be +ignored (unless first endings are to be categorically excluded) in favour of a +`quarterbeats_all_endings` column. Since the DCML measure maps already come with columns of both +names, the simple formula mentioned above `quarterbeats = quarterbeats(measure) + event_onset` +has its analogue `quarterbeats_all_measures = quarterbeats_all_measures(measure) + event_onset`. + +Input: DataFrame containing DCML harmony labels as output via the command `ms3 extract -X` +(X for 'expanded'), stored by default in a folder called 'harmonies'. Using these TSV files +ensures using only valid DCML labels but in principle this script can be used for converting +labels of all kinds as long as they come in the specified tabular format. + +## 2. Validating the output + +Going from a `DcmlLabel` dictionary to a `DezrannLabel` dictionary is straightforward because +they exchange positions as quarterbeats. Validation, on the other hand, requires relating +the output .dez format with the converted score which it is layed over in Dezrann. In the +interface, positions are shown to the user in terms of `measure_count + event_onset`. Extracting +this information and comparing it to the one in the original TSVs will + +Columns: + +* `mc`: measure count (XML measures, always starting from 1) +* + + + + +Output: +JSON Dezrann file (.dez) containing all the harmony labels, aligned with the score. 
+Here is an example of Dezrann file structure: +''' +{ + "labels": [ + {"type": "Harmony", "start": 0, "duration": 4, "line": "top.3", "tag": "I{"}, + {"type": "Harmony", "start": 4, "duration": 4, "line": "top.3", "tag": "V(64)"}, + {"type": "Harmony", "start": 8, "duration": 4, "line": "top.3", "tag": "V}"}, + ... +} +''' +""" + +import json +import os +from typing import Dict, List, TypedDict, Any, Union + +from fractions import Fraction +import pandas as pd + + + +def safe_frac(s: str) -> Union[Fraction, str]: + try: + return Fraction(s) + except Exception: + return s + +class DezrannLabel(TypedDict): + type: str #= "Harmony" # Default value ? + start: float + duration: float + line: str #= "top.3" #Literal? + tag: str + +class DezrannDict(TypedDict): + labels: List[DezrannLabel] + meta: Dict + +class DcmlLabel(TypedDict): + quarterbeats: float + duration: float + label: str + + +def transform_df(labels: pd.DataFrame, + measures: pd.DataFrame, + label_column: str = 'label') -> List[DcmlLabel]: + """ + + Parameters + ---------- + labels: + Dataframe as found in the 'harmonies' folder of a DCML corpus. Needs to have columns with + the correct dtypes {'mc': int, 'mc_onset': fractions.Fraction} and no missing values. + measures: + Dataframe as found in the 'measures' folder of a DCML corpus. Requires the columns + {'mc': int, 'quarterbeats_all_endings': fractions.Fraction} + label_column: str, optional + The column that is to be used as label string. Defaults to 'label'. + + Returns + ------- + List of dictionaries where each represents one row of the input labels. 
+ """ + offset_dict = measures.set_index("mc")["quarterbeats_all_endings"] + quarterbeats = labels['mc'].map(offset_dict) + quarterbeats = quarterbeats.astype('float') + (labels.mc_onset * 4.0) + transformed_df = pd.concat([quarterbeats.rename('quarterbeats'), labels.duration_qb.rename('duration'), labels[label_column].rename('label')], axis=1) + return transformed_df.to_dict(orient='records') + +def make_dezrann_label(quarterbeats: float, duration: float, label: str) -> DezrannLabel: + return DezrannLabel(type="Harmony", start=quarterbeats, duration=duration, line="top.3", tag=label) + +def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel]) -> List[DezrannDict]: + label_list = [] + for e in values_dict: + label_list.append( + make_dezrann_label( + quarterbeats=e["quarterbeats"], + duration=e["duration"], + label=e["label"] + ) + ) + return DezrannDict(labels=label_list, meta={"layout": []}) + + +def generate_dez(path_measures, path_labels, output_path="labels.dez"): # need paths for harmony.TSV + paths for measures.TSV + """ + path_measures : :obj:`str` + Path to a TSV file as output by format_data(). + path_labels : :obj:`str` + Path to a TSV file as output by format_data(). + output_labels : :obj:`str` + Path to a TSV file as output by format_data(). + """ + harmonies = pd.read_csv( + path_labels, sep='\t', + usecols=['mc', 'mc_onset', 'duration_qb', 'label'], #'chord' + converters={'mc_onset': safe_frac} + ) + measures = pd.read_csv( + path_measures, sep='\t', + usecols=['mc', 'quarterbeats_all_endings'], + converters={'quarterbeats_all_endings': safe_frac} + ) + dcml_labels = transform_df(labels=harmonies, measures=measures) + dezrann_content = convert_dcml_list_to_dezrann_list(dcml_labels) + + # Manual post-processing #TODO: improve these cases + # 1) Avoid NaN values in "duration" (happens in second endings) + # optional : in the transform_df : transformed_df = transformed_df.replace('NaN', 0) ? 
+ for label in dezrann_content['labels']: + if pd.isnull(label['duration']): + print(f"WARNING: NaN duration detected in label {label}.") + label['duration'] = 0 + # 2) Remove "start" value in the first label ? + if dezrann_content['labels'][0]['start'] == 0.: + del dezrann_content['labels'][0]['start'] + + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(dezrann_content, f, indent=2) + + +# Test +MOZART_SONATAS = [ + 'K279-1', 'K279-2', 'K279-3', + 'K280-1', 'K280-2', 'K280-3', + 'K283-1', 'K283-2', 'K283-3', +] +MEASURE_DIR = os.path.join("src", "ms3") #to be updated +HARMONY_DIR = os.path.join("src", "ms3") #to be updated +MEASURE_PATHS = [ + os.path.join(MEASURE_DIR, f"{movement}_measures.tsv") + for movement in MOZART_SONATAS +] +HARMONY_PATHS = [ + os.path.join(HARMONY_DIR, f"{movement}_harmonies.tsv") + for movement in MOZART_SONATAS +] + +OUTPUT_DIR = "." #to be updated +def generate_all_dez(output_dir=OUTPUT_DIR): + for i_piece, piece in enumerate(MOZART_SONATAS): + generate_dez(MEASURE_PATHS[i_piece], HARMONY_PATHS[i_piece]) + + +if __name__ == "__main__": + #measures = ms3.load_tsv('src/ms3/K283-2_measures.tsv') + #harmonies = ms3.load_tsv('src/ms3/K283-2_harmonies.tsv') + #transformed = transform_df(labels=harmonies, measures=measures) + #print(transformed) + + dez = generate_dez('src/ms3/K283-2_measures.tsv', 'src/ms3/K283-2_harmonies.tsv') + #generate_all_dez() \ No newline at end of file From 0851ce25f798f0305019a49bd84b065bf7d03694 Mon Sep 17 00:00:00 2001 From: Louis Couturier Date: Thu, 23 Feb 2023 14:14:29 +0100 Subject: [PATCH 02/44] add 'origin' argument to handle 'layers' attributes in Dezrann labels --- src/ms3/dezrann.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index 3265a8de..a7e68f22 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -120,6 +120,7 @@ class DezrannLabel(TypedDict): duration: float line: str #= "top.3" #Literal? 
tag: str + layers: List[str] class DezrannDict(TypedDict): labels: List[DezrannLabel] @@ -157,23 +158,32 @@ def transform_df(labels: pd.DataFrame, transformed_df = pd.concat([quarterbeats.rename('quarterbeats'), labels.duration_qb.rename('duration'), labels[label_column].rename('label')], axis=1) return transformed_df.to_dict(orient='records') -def make_dezrann_label(quarterbeats: float, duration: float, label: str) -> DezrannLabel: - return DezrannLabel(type="Harmony", start=quarterbeats, duration=duration, line="top.3", tag=label) +def make_dezrann_label( + quarterbeats: float, duration: float, label: str, origin: List[str]) -> DezrannLabel: + return DezrannLabel( + type="Harmony", + start=quarterbeats, + duration=duration, + line="top.3", + tag=label, + layers=origin + ) -def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel]) -> List[DezrannDict]: +def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel], origin: List[str]) -> List[DezrannDict]: label_list = [] for e in values_dict: label_list.append( make_dezrann_label( quarterbeats=e["quarterbeats"], duration=e["duration"], - label=e["label"] + label=e["label"], + origin=origin ) ) return DezrannDict(labels=label_list, meta={"layout": []}) -def generate_dez(path_measures, path_labels, output_path="labels.dez"): # need paths for harmony.TSV + paths for measures.TSV +def generate_dez(path_measures, path_labels, output_path="labels.dez", origin: List[str] = ["DCML"]): """ path_measures : :obj:`str` Path to a TSV file as output by format_data(). @@ -181,6 +191,8 @@ def generate_dez(path_measures, path_labels, output_path="labels.dez"): # need p Path to a TSV file as output by format_data(). output_labels : :obj:`str` Path to a TSV file as output by format_data(). + origin : :obj:`list` + List of source(s) from which the labels originate. Defaults to ["DCML"]. 
""" harmonies = pd.read_csv( path_labels, sep='\t', @@ -193,7 +205,7 @@ def generate_dez(path_measures, path_labels, output_path="labels.dez"): # need p converters={'quarterbeats_all_endings': safe_frac} ) dcml_labels = transform_df(labels=harmonies, measures=measures) - dezrann_content = convert_dcml_list_to_dezrann_list(dcml_labels) + dezrann_content = convert_dcml_list_to_dezrann_list(dcml_labels, origin=origin) # Manual post-processing #TODO: improve these cases # 1) Avoid NaN values in "duration" (happens in second endings) From d395db5088e9fb6ede3f718bc372f668b109b88b Mon Sep 17 00:00:00 2001 From: johentsch Date: Thu, 23 Feb 2023 17:03:09 +0100 Subject: [PATCH 03/44] makes the new argument 'origin' a str|Tuple[str] (lists should be avoided as default arguments) --- src/ms3/dezrann.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index a7e68f22..53470225 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -101,7 +101,7 @@ import json import os -from typing import Dict, List, TypedDict, Any, Union +from typing import Dict, List, TypedDict, Union, Tuple from fractions import Fraction import pandas as pd @@ -123,6 +123,7 @@ class DezrannLabel(TypedDict): layers: List[str] class DezrannDict(TypedDict): + """Represents one .dez file.""" labels: List[DezrannLabel] meta: Dict @@ -159,17 +160,22 @@ def transform_df(labels: pd.DataFrame, return transformed_df.to_dict(orient='records') def make_dezrann_label( - quarterbeats: float, duration: float, label: str, origin: List[str]) -> DezrannLabel: + quarterbeats: float, duration: float, label: str, origin: Union[str, Tuple[str]]) -> DezrannLabel: + if isinstance(origin, str): + layers = [origin] + else: + layers = list(origin) return DezrannLabel( type="Harmony", start=quarterbeats, duration=duration, line="top.3", tag=label, - layers=origin + layers=layers ) -def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel], 
origin: List[str]) -> List[DezrannDict]: +def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel], + origin: Union[str, Tuple[str]] = "DCML") -> List[DezrannDict]: label_list = [] for e in values_dict: label_list.append( @@ -183,7 +189,10 @@ def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel], origin: List return DezrannDict(labels=label_list, meta={"layout": []}) -def generate_dez(path_measures, path_labels, output_path="labels.dez", origin: List[str] = ["DCML"]): +def generate_dez(path_measures: str, + path_labels: str, + output_path: str = "labels.dez", + origin: Union[str, Tuple[str]] = "DCML"): """ path_measures : :obj:`str` Path to a TSV file as output by format_data(). From 62682e1d25bde6b934702cc006ea55f074330d1b Mon Sep 17 00:00:00 2001 From: johentsch Date: Thu, 23 Feb 2023 17:03:43 +0100 Subject: [PATCH 04/44] corrects output annotation --- src/ms3/dezrann.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index 53470225..b34bb648 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -175,7 +175,7 @@ def make_dezrann_label( ) def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel], - origin: Union[str, Tuple[str]] = "DCML") -> List[DezrannDict]: + origin: Union[str, Tuple[str]] = "DCML") -> DezrannDict: label_list = [] for e in values_dict: label_list.append( From 14148c13d9a73f3cd0a6cd10625a7c3646459c30 Mon Sep 17 00:00:00 2001 From: johentsch Date: Thu, 23 Feb 2023 17:54:37 +0100 Subject: [PATCH 05/44] adds CLI skeleton --- src/ms3/dezrann.py | 53 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index b34bb648..90cf4d7a 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -253,12 +253,63 @@ def generate_all_dez(output_dir=OUTPUT_DIR): for i_piece, piece in enumerate(MOZART_SONATAS): generate_dez(MEASURE_PATHS[i_piece], HARMONY_PATHS[i_piece]) +def 
main(input_dir: str, + measures_dir: str, + output_dir: str, + harmony_layer: int, + keys_layer:int, + phrases_layer: int, + cadences_layer: int, + raw_layer: int): + pass + +def process_arguments(args) -> dict: + pass + + +def run(): + parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, + description='''\ + ----------------------------- + | DCML => Dezrann converter | + ----------------------------- + + This script converts DCML harmony annotations into the .dez JSON format used by the dezrann.net app. It is + standalone and does not require ms3 to be installed. Its only requirement is pandas. + + Apart from that, the script requires that you have previously extracted both harmonies and measures from the + annotated scores or that you are converting a DCML corpus (https://github.com/DCMLab/dcml_corpora), + where both facets are provided by default. In order to (re-) extract the labels, use the command: + + ms3 extract -X -M + + Or, if you want to convert other harmony or chord labels from your MuseScore files, use -L for labels. + ms3 extract -h will show you all options. + ''') + parser.add_argument(metavar='DIR', default=os.getcwd(), + help='Folder that will be scanned for TSV files to convert. Defaults to current working directory.') + parser.add_argument('-m', '--measures', metavar='DIR', + help='Folder(s) that will be scanned for TSV files to convert. Defaults to current working directory.') + parser.add_argument('-o', '--out', metavar='OUT_DIR', + help='Output directory for .dez files. 
Defaults to the input directory.') + parser.add_argument('-H', '--harmonies', choices=[0, 1, 2, 3, 4, 5, 6]) + parser.add_argument('-K', '--keys', choices=[0, 1, 2, 3, 4, 5, 6]) + parser.add_argument('-P', '--phrases', choices=[0, 1, 2, 3, 4, 5, 6]) + parser.add_argument('-C', '--cadences', choices=[0, 1, 2, 3, 4, 5, 6]) + parser.add_argument('--raw', choices=[0, 1, 2, 3, 4, 5, 6]) + args = parser.parse_args() + kwargs = process_arguments(args) + main(**kwargs) if __name__ == "__main__": + run() + + + #measures = ms3.load_tsv('src/ms3/K283-2_measures.tsv') #harmonies = ms3.load_tsv('src/ms3/K283-2_harmonies.tsv') #transformed = transform_df(labels=harmonies, measures=measures) #print(transformed) - dez = generate_dez('src/ms3/K283-2_measures.tsv', 'src/ms3/K283-2_harmonies.tsv') + dez = generate_dez('K283-2_measures.tsv', 'K283-2_harmonies.tsv') #generate_all_dez() \ No newline at end of file From a2b40a3c2280103feb93e54e54a7f517e5d72a55 Mon Sep 17 00:00:00 2001 From: johentsch Date: Thu, 23 Feb 2023 18:01:35 +0100 Subject: [PATCH 06/44] adds import and corrects positional argument --- src/ms3/dezrann.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index 90cf4d7a..83833f62 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -98,7 +98,7 @@ } ''' """ - +import argparse import json import os from typing import Dict, List, TypedDict, Union, Tuple @@ -286,7 +286,7 @@ def run(): Or, if you want to convert other harmony or chord labels from your MuseScore files, use -L for labels. ms3 extract -h will show you all options. ''') - parser.add_argument(metavar='DIR', default=os.getcwd(), + parser.add_argument("dir", metavar='DIR', help='Folder that will be scanned for TSV files to convert. Defaults to current working directory.') parser.add_argument('-m', '--measures', metavar='DIR', help='Folder(s) that will be scanned for TSV files to convert. 
Defaults to current working directory.') @@ -311,5 +311,5 @@ def run(): #transformed = transform_df(labels=harmonies, measures=measures) #print(transformed) - dez = generate_dez('K283-2_measures.tsv', 'K283-2_harmonies.tsv') + #dez = generate_dez('K283-2_measures.tsv', 'K283-2_harmonies.tsv') #generate_all_dez() \ No newline at end of file From 8775778698bf989c7139206c2aa1d31db787066e Mon Sep 17 00:00:00 2001 From: johentsch Date: Thu, 23 Feb 2023 18:17:12 +0100 Subject: [PATCH 07/44] streamlines CLI arguments and suggests defaults --- src/ms3/dezrann.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index 83833f62..b3a167e8 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -292,11 +292,29 @@ def run(): help='Folder(s) that will be scanned for TSV files to convert. Defaults to current working directory.') parser.add_argument('-o', '--out', metavar='OUT_DIR', help='Output directory for .dez files. 
Defaults to the input directory.') - parser.add_argument('-H', '--harmonies', choices=[0, 1, 2, 3, 4, 5, 6]) - parser.add_argument('-K', '--keys', choices=[0, 1, 2, 3, 4, 5, 6]) - parser.add_argument('-P', '--phrases', choices=[0, 1, 2, 3, 4, 5, 6]) - parser.add_argument('-C', '--cadences', choices=[0, 1, 2, 3, 4, 5, 6]) - parser.add_argument('--raw', choices=[0, 1, 2, 3, 4, 5, 6]) + parser.add_argument('-C', + '--cadences', + action="store_true", + ) + parser.add_argument('-H', + '--harmonies', + metavar="{1-6}, default: 4", + default=4, + choices=[1, 2, 3, 4, 5, 6], + ) + parser.add_argument('-K', + '--keys', + metavar="{1-6}, default: 5", + default=5, + choices=[1, 2, 3, 4, 5, 6]) + parser.add_argument('-P', + '--phrases', + metavar="{1-6}, default: 6", + default=6, + choices=[1, 2, 3, 4, 5, 6]) + parser.add_argument('--raw', + metavar="{1-6}", + choices=[1, 2, 3, 4, 5, 6]) args = parser.parse_args() kwargs = process_arguments(args) main(**kwargs) From ee7e56013f30b703e07bb42c87519f385416372d Mon Sep 17 00:00:00 2001 From: Louis Couturier Date: Thu, 23 Feb 2023 18:39:19 +0100 Subject: [PATCH 08/44] work in progress: CLI + line layout --- src/ms3/dezrann.py | 79 ++++++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 30 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index b3a167e8..6c95bf1f 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -175,6 +175,11 @@ def make_dezrann_label( ) def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel], + cadences: bool, + harmony_line: int, + keys_line: int, + phrases_line: int, + raw_line: int, origin: Union[str, Tuple[str]] = "DCML") -> DezrannDict: label_list = [] for e in values_dict: @@ -186,7 +191,14 @@ def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel], origin=origin ) ) - return DezrannDict(labels=label_list, meta={"layout": []}) + layout = [] + if raw_line > 0: + layout.append({"filter": {"type": "Harmony"}, "style": {"line": line}}) + 
#if harmony_line > 0: + # ... + #if keys_line > 0: + # ... + return DezrannDict(labels=label_list, meta={"layout": layout}) def generate_dez(path_measures: str, @@ -256,14 +268,38 @@ def generate_all_dez(output_dir=OUTPUT_DIR): def main(input_dir: str, measures_dir: str, output_dir: str, - harmony_layer: int, - keys_layer:int, - phrases_layer: int, - cadences_layer: int, - raw_layer: int): + cadences: bool, + harmony_line: Optional[str], # will transform and pass in "bot.1", None otherwise + keys_line: Optional[str], + phrases_line: Optional[str], + raw_line: Optional[str]): pass +LINE_VALUES = { + 1: "top.1", + 2: "top.2", + 3: "top.3", + 4: "bot.1", + 5: "bot.2", + 6: "bot.3" +} + +def transform_line_argument(line: Optional[Union[int, str]]) -> Optional[str]: + if line is None: + return + try: + line = int(line) + assert line in [1,2,3,4,5,6, -1, -2, -3] + except (TypeError, ValueError, AssertionError): + raise ValueError(f"{line} is not a valid argument, shoube within 1-6.") + if line < 0: + line = abs(line) + 3 + return LINE_VALUES[line] + + def process_arguments(args) -> dict: + kwargs = {} + line_args = ('harmonies', 'keys', 'phrases', 'raw') pass @@ -289,32 +325,15 @@ def run(): parser.add_argument("dir", metavar='DIR', help='Folder that will be scanned for TSV files to convert. Defaults to current working directory.') parser.add_argument('-m', '--measures', metavar='DIR', - help='Folder(s) that will be scanned for TSV files to convert. Defaults to current working directory.') + help="Folder in which to look for the corrsponding measure maps. By default, the script will try " + "to find a sibling to the source dir called 'measures'.") parser.add_argument('-o', '--out', metavar='OUT_DIR', help='Output directory for .dez files. 
Defaults to the input directory.') - parser.add_argument('-C', - '--cadences', - action="store_true", - ) - parser.add_argument('-H', - '--harmonies', - metavar="{1-6}, default: 4", - default=4, - choices=[1, 2, 3, 4, 5, 6], - ) - parser.add_argument('-K', - '--keys', - metavar="{1-6}, default: 5", - default=5, - choices=[1, 2, 3, 4, 5, 6]) - parser.add_argument('-P', - '--phrases', - metavar="{1-6}, default: 6", - default=6, - choices=[1, 2, 3, 4, 5, 6]) - parser.add_argument('--raw', - metavar="{1-6}", - choices=[1, 2, 3, 4, 5, 6]) + parser.add_argument('-H', '--harmonies', choices=[0, 1, 2, 3, 4, 5, 6]) + parser.add_argument('-K', '--keys', choices=[0, 1, 2, 3, 4, 5, 6]) + parser.add_argument('-P', '--phrases', choices=[0, 1, 2, 3, 4, 5, 6]) + parser.add_argument('-C', '--cadences', choices=[0, 1, 2, 3, 4, 5, 6]) + parser.add_argument('--raw', choices=[0, 1, 2, 3, 4, 5, 6]) args = parser.parse_args() kwargs = process_arguments(args) main(**kwargs) From 9095d0e7dc0722856ee66abad7a043fb7cd8e8c1 Mon Sep 17 00:00:00 2001 From: Louis Couturier Date: Thu, 23 Feb 2023 19:43:33 +0100 Subject: [PATCH 09/44] end-of-day commit after collab session LC & JH --- src/ms3/dezrann.py | 197 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 157 insertions(+), 40 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index 6c95bf1f..af4dfb51 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -101,7 +101,7 @@ import argparse import json import os -from typing import Dict, List, TypedDict, Union, Tuple +from typing import Dict, List, TypedDict, Union, Tuple, Optional from fractions import Fraction import pandas as pd @@ -115,10 +115,11 @@ def safe_frac(s: str) -> Union[Fraction, str]: return s class DezrannLabel(TypedDict): - type: str #= "Harmony" # Default value ? + """Represents one label in a .dez file.""" + type: str start: float duration: float - line: str #= "top.3" #Literal? 
+ #line: str # Determined by the meta-layout tag: str layers: List[str] @@ -128,13 +129,18 @@ class DezrannDict(TypedDict): meta: Dict class DcmlLabel(TypedDict): + """Represents one label from a TSV annotation file""" quarterbeats: float duration: float label: str + harmony: str + key: str + phrase: str + cadence: str -def transform_df(labels: pd.DataFrame, - measures: pd.DataFrame, +def transform_df(labels: pd.DataFrame, + measures: pd.DataFrame, label_column: str = 'label') -> List[DcmlLabel]: """ @@ -166,20 +172,19 @@ def make_dezrann_label( else: layers = list(origin) return DezrannLabel( - type="Harmony", + type="Harmony", #TODO: adapt type to current label start=quarterbeats, duration=duration, - line="top.3", tag=label, layers=layers ) def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel], - cadences: bool, - harmony_line: int, - keys_line: int, - phrases_line: int, - raw_line: int, + cadences: bool = False, + harmony_line: Optional[str] = None, + keys_line: Optional[str] = None, + phrases_line: Optional[str] = None, + raw_line: Optional[str] = None, origin: Union[str, Tuple[str]] = "DCML") -> DezrannDict: label_list = [] for e in values_dict: @@ -192,18 +197,28 @@ def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel], ) ) layout = [] - if raw_line > 0: - layout.append({"filter": {"type": "Harmony"}, "style": {"line": line}}) - #if harmony_line > 0: - # ... - #if keys_line > 0: - # ... 
+ if cadences: + layout.append({"filter": {"type": "Cadence"}, "style": {"line": "all"}}) + if harmony_line: + layout.append({"filter": {"type": "Harmony"}, "style": {"line": harmony_line}}) + if keys_line: + layout.append({"filter": {"type": "Localkey"}, "style": {"line": keys_line}}) + if phrases_line: + layout.append({"filter": {"type": "Phrase"}, "style": {"line": phrases_line}}) + if raw_line: + layout.append({"filter": {"type": "Harmony"}, "style": {"line": raw_line}}) + return DezrannDict(labels=label_list, meta={"layout": layout}) def generate_dez(path_measures: str, path_labels: str, output_path: str = "labels.dez", + cadences: bool = False, + harmonies: Optional[str] = None, + keys: Optional[str] = None, + phrases: Optional[str] = None, + raw: Optional[str] = None, origin: Union[str, Tuple[str]] = "DCML"): """ path_measures : :obj:`str` @@ -212,21 +227,33 @@ def generate_dez(path_measures: str, Path to a TSV file as output by format_data(). output_labels : :obj:`str` Path to a TSV file as output by format_data(). - origin : :obj:`list` - List of source(s) from which the labels originate. Defaults to ["DCML"]. + origin : :obj:`tuple` + Tuple of source(s) from which the labels originate. Defaults to "DCML". 
""" - harmonies = pd.read_csv( + harmonies_df = pd.read_csv( path_labels, sep='\t', usecols=['mc', 'mc_onset', 'duration_qb', 'label'], #'chord' converters={'mc_onset': safe_frac} ) - measures = pd.read_csv( - path_measures, sep='\t', - usecols=['mc', 'quarterbeats_all_endings'], - converters={'quarterbeats_all_endings': safe_frac} + try: + measures_df = pd.read_csv( + path_measures, sep='\t', + usecols=['mc', 'quarterbeats_all_endings'], + converters={'quarterbeats_all_endings': safe_frac} + ) + except ValueError as e: + raise ValueError(f"{path_measures} could not be loaded as a measure map because of the following error:\n'{e}'") + + dcml_labels = transform_df(labels=harmonies_df, measures=measures_df) + dezrann_content = convert_dcml_list_to_dezrann_list( + dcml_labels, + cadences=cadences, + harmony_line=harmonies, + keys_line=keys, + phrases_line=phrases, + raw_line=raw, + origin=origin ) - dcml_labels = transform_df(labels=harmonies, measures=measures) - dezrann_content = convert_dcml_list_to_dezrann_list(dcml_labels, origin=origin) # Manual post-processing #TODO: improve these cases # 1) Avoid NaN values in "duration" (happens in second endings) @@ -268,12 +295,45 @@ def generate_all_dez(output_dir=OUTPUT_DIR): def main(input_dir: str, measures_dir: str, output_dir: str, - cadences: bool, - harmony_line: Optional[str], # will transform and pass in "bot.1", None otherwise - keys_line: Optional[str], - phrases_line: Optional[str], - raw_line: Optional[str]): - pass + cadences: bool = False, + harmonies: Optional[str] = None, + keys: Optional[str] = None, + phrases: Optional[str] = None, + raw: Optional[str] = None): + if not cadences and all(arg is None for arg in (harmonies, keys, phrases, raw)): + print(f"Nothing to do because no features have been selected.") + return + input_files = [f for f in os.listdir(input_dir) if f.endswith('.tsv')] + # measures_files = glob.glob(f"{measures_dir}/*.tsv") + harmony_measure_matches = [] + for tsv_name in input_files: 
+ measures_file_path = os.path.join(measures_dir, tsv_name) + if os.path.isfile(measures_file_path): + harmonies_file_path = os.path.join(input_dir, tsv_name) + harmony_measure_matches.append((harmonies_file_path, measures_file_path)) + else: + print(f"No measure map found for {tsv_name}. Skipping.") + continue + for input_file, measure_file in harmony_measure_matches: + if output_dir == input_dir: + output_file_path = measure_file.replace(".tsv", ".dez") + else: + dez_file = os.path.basename(measure_file).replace(".tsv", ".dez") + output_file_path = os.path.join(output_dir, dez_file) + try: + generate_dez( + path_labels=input_file, + path_measures=measure_file, + output_path=output_file_path, + cadences=cadences, + harmonies=harmonies, + keys=keys, + phrases=phrases, + raw=raw + ) + print(f"{output_file_path} successfully written.") + except Exception as e: + print(f"Converting {input_file} failed with '{e}'") LINE_VALUES = { 1: "top.1", @@ -295,12 +355,51 @@ def transform_line_argument(line: Optional[Union[int, str]]) -> Optional[str]: if line < 0: line = abs(line) + 3 return LINE_VALUES[line] + +def resolve_dir(d): + """ Resolves '~' to HOME directory and turns ``d`` into an absolute path. + """ + if d is None: + return None + d = str(d) + if '~' in d: + return os.path.expanduser(d) + return os.path.abspath(d) def process_arguments(args) -> dict: - kwargs = {} + input_dir = resolve_dir(args.dir) + assert os.path.isdir(input_dir), f"{args.dir} is not an existing directory." 
+ if args.measures is None: + measures_dir = os.path.abspath(os.path.join(input_dir, '..', 'measures')) + if not os.path.isdir(measures_dir): + raise ValueError(f"No directory with measure maps was specified and the default path " + f"{measures_dir} does not exist.") + else: + measures_dir = resolve_dir(args.measures) + if not os.path.isdir(measures_dir): + raise ValueError(f"{measures_dir} is not an existing directory.") + if args.out is None: + output_dir = input_dir + else: + output_dir = resolve_dir(args.out) + if not os.path.isdir(output_dir): + raise ValueError(f"{output_dir} is not an existing directory.") + kwargs = dict( + input_dir=input_dir, + measures_dir=measures_dir, + output_dir=output_dir + ) line_args = ('harmonies', 'keys', 'phrases', 'raw') - pass + for arg in line_args: + arg_val = getattr(args, arg) + if arg_val is None: + continue + kwargs[arg] = transform_line_argument(arg_val) + if args.cadences: + kwargs['cadences'] = True + print(kwargs) + return kwargs def run(): @@ -329,11 +428,29 @@ def run(): "to find a sibling to the source dir called 'measures'.") parser.add_argument('-o', '--out', metavar='OUT_DIR', help='Output directory for .dez files. 
Defaults to the input directory.') - parser.add_argument('-H', '--harmonies', choices=[0, 1, 2, 3, 4, 5, 6]) - parser.add_argument('-K', '--keys', choices=[0, 1, 2, 3, 4, 5, 6]) - parser.add_argument('-P', '--phrases', choices=[0, 1, 2, 3, 4, 5, 6]) - parser.add_argument('-C', '--cadences', choices=[0, 1, 2, 3, 4, 5, 6]) - parser.add_argument('--raw', choices=[0, 1, 2, 3, 4, 5, 6]) + parser.add_argument('-C', + '--cadences', + action="store_true", + ) + parser.add_argument('-H', + '--harmonies', + metavar="{1-6}, default: 4", + default="4", + choices=["1", "2", "3", "4", "5", "6", "-1", "-2", "-3"], + ) + parser.add_argument('-K', + '--keys', + metavar="{1-6}, default: 5", + default="5", + choices=["1", "2", "3", "4", "5", "6", "-1", "-2", "-3"]) + parser.add_argument('-P', + '--phrases', + metavar="{1-6}, default: 6", + default="6", + choices=["1", "2", "3", "4", "5", "6", "-1", "-2", "-3"]) + parser.add_argument('--raw', + metavar="{1-6}", + choices=["1", "2", "3", "4", "5", "6", "-1", "-2", "-3"]) args = parser.parse_args() kwargs = process_arguments(args) main(**kwargs) From 6ec8cd2e71fe714ed71d1388d4fc715e35f7dc67 Mon Sep 17 00:00:00 2001 From: johentsch Date: Fri, 24 Feb 2023 09:45:57 +0100 Subject: [PATCH 10/44] refines commandline interface with docstrings and better argument treatment; enables deactivating the default layers by passing 0 --- src/ms3/dezrann.py | 89 +++++++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 32 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index af4dfb51..edda7e97 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -308,6 +308,9 @@ def main(input_dir: str, harmony_measure_matches = [] for tsv_name in input_files: measures_file_path = os.path.join(measures_dir, tsv_name) + if not os.path.isfile(measures_file_path): + # could be a directory + continue if os.path.isfile(measures_file_path): harmonies_file_path = os.path.join(input_dir, tsv_name) 
            harmony_measure_matches.append((harmonies_file_path, measures_file_path))
@@ -345,13 +348,16 @@ def main(input_dir: str,
 }
 
 def transform_line_argument(line: Optional[Union[int, str]]) -> Optional[str]:
+    """Takes a line number (0-6, negatives counting from below) and returns the Dezrann line name, or None for 0."""
     if line is None:
         return
    try:
         line = int(line)
-        assert line in [1,2,3,4,5,6, -1, -2, -3]
+        assert line in [1,2,3,4,5,6, 0, -1, -2, -3]
     except (TypeError, ValueError, AssertionError):
-        raise ValueError(f"{line} is not a valid argument, shoube within 1-6.")
+        raise ValueError(f"{line} is not a valid argument, should be within [0, 6].")
+    if line == 0:
+        return None
     if line < 0:
         line = abs(line) + 3
     return LINE_VALUES[line]
@@ -367,7 +373,8 @@ def resolve_dir(d):
     return os.path.abspath(d)
 
 
-def process_arguments(args) -> dict:
+def process_arguments(args: argparse.Namespace) -> dict:
+    """Transforms the user's input arguments into keyword arguments for :func:`main` or raises a ValueError."""
     input_dir = resolve_dir(args.dir)
     assert os.path.isdir(input_dir), f"{args.dir} is not an existing directory." 
if args.measures is None: @@ -395,7 +402,13 @@ def process_arguments(args) -> dict: arg_val = getattr(args, arg) if arg_val is None: continue - kwargs[arg] = transform_line_argument(arg_val) + line_arg = transform_line_argument(arg_val) + if line_arg is None: + continue + kwargs[arg] = line_arg + if len(set(kwargs.values())) < len(kwargs.values()): + selected_args = {arg: f"'{getattr(args, arg)}' => {kwargs[arg]}" for arg in line_args if arg in kwargs} + raise ValueError(f"You selected the same annotation layer more than once: {selected_args}.") if args.cadences: kwargs['cadences'] = True print(kwargs) @@ -405,25 +418,26 @@ def process_arguments(args) -> dict: def run(): parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description='''\ - ----------------------------- - | DCML => Dezrann converter | - ----------------------------- - - This script converts DCML harmony annotations into the .dez JSON format used by the dezrann.net app. It is - standalone and does not require ms3 to be installed. Its only requirement is pandas. - - Apart from that, the script requires that you have previously extracted both harmonies and measures from the - annotated scores or that you are converting a DCML corpus (https://github.com/DCMLab/dcml_corpora), - where both facets are provided by default. In order to (re-) extract the labels, use the command: - - ms3 extract -X -M - - Or, if you want to convert other harmony or chord labels from your MuseScore files, use -L for labels. - ms3 extract -h will show you all options. - ''') - parser.add_argument("dir", metavar='DIR', - help='Folder that will be scanned for TSV files to convert. Defaults to current working directory.') - parser.add_argument('-m', '--measures', metavar='DIR', +----------------------------- +| DCML => Dezrann converter | +----------------------------- + +This script converts DCML harmony annotations into the .dez JSON format used by the dezrann.net app. 
It is +standalone and does not require ms3 to be installed. Its only requirement is pandas. + +Apart from that, the script requires that you have previously extracted both harmonies and measures from the +annotated scores or that you are converting a DCML corpus (https://github.com/DCMLab/dcml_corpora), +where both facets are provided by default. In order to (re-) extract the labels, use the command: + + ms3 extract -X -M + +Or, if you want to convert other harmony or chord labels from your MuseScore files, use -L for labels. +ms3 extract -h will show you all options. +''') + parser.add_argument("dir", metavar='IN_DIR', + help='Folder that will be scanned for TSV files to convert. Defaults to current working directory. ' + 'Sub-directories are not taken into account.') + parser.add_argument('-m', '--measures', metavar='MEASURES_DIR', help="Folder in which to look for the corrsponding measure maps. By default, the script will try " "to find a sibling to the source dir called 'measures'.") parser.add_argument('-o', '--out', metavar='OUT_DIR', @@ -431,26 +445,37 @@ def run(): parser.add_argument('-C', '--cadences', action="store_true", + help="Pass this flag if you want to add time-point cadence labels to the .dez files." ) - parser.add_argument('-H', - '--harmonies', - metavar="{1-6}, default: 4", + possible_line_arguments = ("0", "1", "2", "3", "4", "5", "6", "-1", "-2", "-3") + parser.add_argument('-H', + '--harmonies', + metavar="{0-6}, default: 4", default="4", - choices=["1", "2", "3", "4", "5", "6", "-1", "-2", "-3"], + choices=possible_line_arguments, + help="By default, harmony annotations will be set on the first line under the system (layer " + "4 out of 6). Pick another layer or pass 0 to not add harmonies." 
) parser.add_argument('-K', '--keys', - metavar="{1-6}, default: 5", + metavar="{0-6}, default: 5", default="5", - choices=["1", "2", "3", "4", "5", "6", "-1", "-2", "-3"]) + choices=possible_line_arguments, + help="By default, local key segments will be set on the second line under the system (layer " + "5 out of 6). Pick another layer or pass 0 to not add key segments. Note, however, " + "that harmonies are underdetermined without their local key.") parser.add_argument('-P', '--phrases', - metavar="{1-6}, default: 6", + metavar="{0-6}, default: 6", default="6", - choices=["1", "2", "3", "4", "5", "6", "-1", "-2", "-3"]) + choices=possible_line_arguments, + help="By default, phrase annotations will be set on the third line under the system (layer " + "6 out of 6). Pick another layer or pass 0 to not add phrases.") parser.add_argument('--raw', metavar="{1-6}", - choices=["1", "2", "3", "4", "5", "6", "-1", "-2", "-3"]) + choices=possible_line_arguments, + help="Pass this argument to add a layer with the 'raw' labels, i.e. including local key, " + "cadence and phrase annotations.") args = parser.parse_args() kwargs = process_arguments(args) main(**kwargs) From 58b2cca1aa43ca61273ce74de521bd8e6bd4c5fe Mon Sep 17 00:00:00 2001 From: johentsch Date: Fri, 24 Feb 2023 10:23:19 +0100 Subject: [PATCH 11/44] this version converts raw labels, independent of the given line arguments --- src/ms3/dezrann.py | 66 ++++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index edda7e97..a86f80ff 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -140,7 +140,7 @@ class DcmlLabel(TypedDict): def transform_df(labels: pd.DataFrame, - measures: pd.DataFrame, + measures: Optional[pd.DataFrame], label_column: str = 'label') -> List[DcmlLabel]: """ @@ -148,10 +148,18 @@ def transform_df(labels: pd.DataFrame, ---------- labels: Dataframe as found in the 'harmonies' folder of a DCML corpus. 
Needs to have columns with - the correct dtypes {'mc': int, 'mc_onset': fractions.Fraction} and no missing values. + the correct dtypes {'mc': int, + 'mc_onset': fractions.Fraction, + 'duration_qb': float, + 'quarterbeats': fraction.Fraction, + 'label': str, + 'chord': str, + 'cadence': str, + 'phraseend': str} + and no missing values. measures: - Dataframe as found in the 'measures' folder of a DCML corpus. Requires the columns - {'mc': int, 'quarterbeats_all_endings': fractions.Fraction} + (optional) Dataframe as found in the 'measures' folder of a DCML corpus for computing quarterbeats for pieces with + voltas. Requires the columns {'mc': int, 'quarterbeats_all_endings': fractions.Fraction} (ms3 >= 1.0.0). label_column: str, optional The column that is to be used as label string. Defaults to 'label'. @@ -159,10 +167,16 @@ def transform_df(labels: pd.DataFrame, ------- List of dictionaries where each represents one row of the input labels. """ - offset_dict = measures.set_index("mc")["quarterbeats_all_endings"] - quarterbeats = labels['mc'].map(offset_dict) - quarterbeats = quarterbeats.astype('float') + (labels.mc_onset * 4.0) - transformed_df = pd.concat([quarterbeats.rename('quarterbeats'), labels.duration_qb.rename('duration'), labels[label_column].rename('label')], axis=1) + + if measures is None or "quarterbeats_all_endings" not in measures.columns: + assert "quarterbeats" in labels.columns, f"Labels are lacking 'quarterbeats': {labels.columns}" + quarterbeats = labels["quarterbeats"] + else: + offset_dict = measures.set_index("mc")["quarterbeats_all_endings"] + quarterbeats = labels['mc'].map(offset_dict) + quarterbeats = quarterbeats.astype('float') + (labels.mc_onset * 4.0) + quarterbeats.rename('quarterbeats', inplace=True) + transformed_df = pd.concat([quarterbeats, labels.duration_qb.rename('duration'), labels[label_column].rename('label')], axis=1) return transformed_df.to_dict(orient='records') def make_dezrann_label( @@ -232,7 +246,7 @@ def 
generate_dez(path_measures: str, """ harmonies_df = pd.read_csv( path_labels, sep='\t', - usecols=['mc', 'mc_onset', 'duration_qb', 'label'], #'chord' + usecols=['mc', 'mc_onset', 'duration_qb', 'quarterbeats', 'label', 'chord', 'cadence', 'phraseend'], converters={'mc_onset': safe_frac} ) try: @@ -241,10 +255,13 @@ def generate_dez(path_measures: str, usecols=['mc', 'quarterbeats_all_endings'], converters={'quarterbeats_all_endings': safe_frac} ) - except ValueError as e: - raise ValueError(f"{path_measures} could not be loaded as a measure map because of the following error:\n'{e}'") - - dcml_labels = transform_df(labels=harmonies_df, measures=measures_df) + except (ValueError, AssertionError) as e: + measures_df = None + # raise ValueError(f"{path_measures} could not be loaded as a measure map because of the following error:\n'{e}'") + try: + dcml_labels = transform_df(labels=harmonies_df, measures=measures_df) + except Exception as e: + raise ValueError(f"Converting {path_labels} failed with the exception '{e}'.") dezrann_content = convert_dcml_list_to_dezrann_list( dcml_labels, cadences=cadences, @@ -317,9 +334,12 @@ def main(input_dir: str, else: print(f"No measure map found for {tsv_name}. 
Skipping.") continue + if len(harmony_measure_matches) == 0: + print(f"No matching measure maps found for any of these files: {input_files}") + return for input_file, measure_file in harmony_measure_matches: if output_dir == input_dir: - output_file_path = measure_file.replace(".tsv", ".dez") + output_file_path = input_file.replace(".tsv", ".dez") else: dez_file = os.path.basename(measure_file).replace(".tsv", ".dez") output_file_path = os.path.join(output_dir, dez_file) @@ -398,6 +418,7 @@ def process_arguments(args: argparse.Namespace) -> dict: output_dir=output_dir ) line_args = ('harmonies', 'keys', 'phrases', 'raw') + transformed_line_args = {} for arg in line_args: arg_val = getattr(args, arg) if arg_val is None: @@ -405,10 +426,11 @@ def process_arguments(args: argparse.Namespace) -> dict: line_arg = transform_line_argument(arg_val) if line_arg is None: continue - kwargs[arg] = line_arg - if len(set(kwargs.values())) < len(kwargs.values()): - selected_args = {arg: f"'{getattr(args, arg)}' => {kwargs[arg]}" for arg in line_args if arg in kwargs} + transformed_line_args[arg] = line_arg + if len(set(transformed_line_args.values())) < len(transformed_line_args.values()): + selected_args = {arg: f"'{getattr(args, arg)}' => {arg_val}" for arg, arg_val in transformed_line_args.items()} raise ValueError(f"You selected the same annotation layer more than once: {selected_args}.") + kwargs.update(transformed_line_args) if args.cadences: kwargs['cadences'] = True print(kwargs) @@ -484,11 +506,11 @@ def run(): run() + # import ms3 + # measures = ms3.load_tsv('K283-2_measures.tsv') + # harmonies = ms3.load_tsv('K283-2_harmonies.tsv') + # transformed = transform_df(labels=harmonies, measures=measures) + # print(transformed) - #measures = ms3.load_tsv('src/ms3/K283-2_measures.tsv') - #harmonies = ms3.load_tsv('src/ms3/K283-2_harmonies.tsv') - #transformed = transform_df(labels=harmonies, measures=measures) - #print(transformed) - #dez = generate_dez('K283-2_measures.tsv', 
'K283-2_harmonies.tsv') #generate_all_dez() \ No newline at end of file From d14f9caec7c221b3da4fa3c671d5eb4f8adda02c Mon Sep 17 00:00:00 2001 From: johentsch Date: Fri, 24 Feb 2023 14:50:45 +0100 Subject: [PATCH 12/44] adds to transform_df() the algorithm that copies the preceding label onto beat 1 of each alternative ending (volta); also prepares the function for creating labels pertaining to the various layers --- src/ms3/dezrann.py | 100 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 81 insertions(+), 19 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index a86f80ff..c50ded71 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -133,15 +133,39 @@ class DcmlLabel(TypedDict): quarterbeats: float duration: float label: str - harmony: str - key: str - phrase: str - cadence: str + + +def get_volta_groups(mc2volta: pd.Series) -> List[List[int]]: + """Takes a Series where the index has measure counts and values are NA for 'normal' measures and 1, 2... for + measures belonging to a first, second... ending. Returns for each group a list of MCs each of which pertains + to the first measure of an alternative ending. For example, two alternative two-bar endings in MC [15, 16][17, 18] + would figure as [15, 17] in the result list. 
+ """ + volta_groups = [] + filled_volta_col = mc2volta.fillna(-1).astype(int) + volta_segmentation = (filled_volta_col != filled_volta_col.shift()).fillna(True).cumsum() + current_groups_first_mcs = [] + for i, segment in filled_volta_col.groupby(volta_segmentation): + volta_number = segment.iloc[0] + if volta_number == -1: + # current group ends, if there is one + if i == 1: + continue + elif len(current_groups_first_mcs) == 0: + raise RuntimeError(f"Mistake in the algorithm when processing column {filled_volta_col.volta}") + else: + volta_groups.append(current_groups_first_mcs) + current_groups_first_mcs = [] + else: + first_mc = segment.index[0] + current_groups_first_mcs.append(first_mc) + return volta_groups def transform_df(labels: pd.DataFrame, - measures: Optional[pd.DataFrame], - label_column: str = 'label') -> List[DcmlLabel]: + measures: pd.DataFrame, + label_column: str = 'label', + ) -> List[DcmlLabel]: """ Parameters @@ -154,31 +178,68 @@ def transform_df(labels: pd.DataFrame, 'quarterbeats': fraction.Fraction, 'label': str, 'chord': str, + 'localkey': str, 'cadence': str, 'phraseend': str} and no missing values. measures: (optional) Dataframe as found in the 'measures' folder of a DCML corpus for computing quarterbeats for pieces with voltas. Requires the columns {'mc': int, 'quarterbeats_all_endings': fractions.Fraction} (ms3 >= 1.0.0). - label_column: str, optional + label_column: {'label', 'chord', 'cadence', 'phraseend'} The column that is to be used as label string. Defaults to 'label'. Returns ------- List of dictionaries where each represents one row of the input labels. 
""" - - if measures is None or "quarterbeats_all_endings" not in measures.columns: - assert "quarterbeats" in labels.columns, f"Labels are lacking 'quarterbeats': {labels.columns}" + score_has_voltas = "quarterbeats_all_endings" in measures.columns + if not score_has_voltas: + assert "quarterbeats" in labels.columns, f"Labels are lacking 'quarterbeats' column: {labels.columns}" quarterbeats = labels["quarterbeats"] + last_mc = measures.iloc[-1] + end_of_score = last_mc.quarterbeats + last_mc.act_dur * 4.0 else: - offset_dict = measures.set_index("mc")["quarterbeats_all_endings"] + # the column 'quarterbeats_all_endings' is present, meaning the piece has first and second endings and the + # quarterbeats, which normally leave out first endings, need to be recomputed + last_mc = measures.iloc[-1] + end_of_score = last_mc.quarterbeats_all_endings + last_mc.act_dur * 4.0 + M = measures.set_index("mc") + offset_dict = M["quarterbeats_all_endings"] quarterbeats = labels['mc'].map(offset_dict) - quarterbeats = quarterbeats.astype('float') + (labels.mc_onset * 4.0) + quarterbeats = quarterbeats + (labels.mc_onset * 4.0) quarterbeats.rename('quarterbeats', inplace=True) - transformed_df = pd.concat([quarterbeats, labels.duration_qb.rename('duration'), labels[label_column].rename('label')], axis=1) + # also, the first beat of each volta needs to have a label for computing correct durations + volta_groups = get_volta_groups(M.volta) + label_and_qb = pd.concat([labels[label_column].rename('label'), quarterbeats.astype(float)], axis=1) + n_before = len(labels.index) + if label_column == 'phraseend': + label_and_qb = label_and_qb[label_and_qb.label == '{'] + if label_column == 'localkey': + label_and_qb = label_and_qb[label_and_qb.label != label_and_qb.label.shift().fillna(True)] + else: # {'chord', 'cadence', 'label'} + label_and_qb = label_and_qb[label_and_qb.label.notna()] + n_after = len(label_and_qb.index) + print(f"Creating labels for {n_after} {label_column} labels out of 
{n_before} rows.") + if label_column == 'cadence': + duration = pd.Series(0.0, dtype=float, index=label_and_qb.index, name='duration') + else: + if score_has_voltas: + for group in volta_groups: + volta_beginnings_quarterbeats = [M.loc[mc, 'quarterbeats_all_endings'] for mc in group] + labels_before_group = label_and_qb.loc[label_and_qb.quarterbeats < volta_beginnings_quarterbeats[0], 'label'] + for volta_beginning_qb in volta_beginnings_quarterbeats: + if volta_beginning_qb in label_and_qb.quarterbeats.values: + continue + repeated_label = pd.DataFrame([[labels_before_group.iloc[-1], float(volta_beginning_qb)]], + columns=['label', 'quarterbeats']) + label_and_qb = pd.concat([label_and_qb, repeated_label], ignore_index=True) + label_and_qb = label_and_qb.sort_values('quarterbeats') + qb_column = label_and_qb.quarterbeats + duration = qb_column.shift(-1).fillna(end_of_score) - qb_column + duration = duration.rename('duration').astype(float) + transformed_df = pd.concat([label_and_qb, duration], axis=1) return transformed_df.to_dict(orient='records') - + def make_dezrann_label( quarterbeats: float, duration: float, label: str, origin: Union[str, Tuple[str]]) -> DezrannLabel: if isinstance(origin, str): @@ -246,18 +307,19 @@ def generate_dez(path_measures: str, """ harmonies_df = pd.read_csv( path_labels, sep='\t', - usecols=['mc', 'mc_onset', 'duration_qb', 'quarterbeats', 'label', 'chord', 'cadence', 'phraseend'], - converters={'mc_onset': safe_frac} + converters={'mc': int, + 'mc_onset': safe_frac, + 'quarterbeats': safe_frac, + } ) try: measures_df = pd.read_csv( path_measures, sep='\t', - usecols=['mc', 'quarterbeats_all_endings'], + dtype={'mc': int, 'volta': 'Int64'}, converters={'quarterbeats_all_endings': safe_frac} ) except (ValueError, AssertionError) as e: - measures_df = None - # raise ValueError(f"{path_measures} could not be loaded as a measure map because of the following error:\n'{e}'") + raise ValueError(f"{path_measures} could not be loaded as a 
measure map because of the following error:\n'{e}'") try: dcml_labels = transform_df(labels=harmonies_df, measures=measures_df) except Exception as e: From aefeb9b33d11579c61107536676a1ff983821c4a Mon Sep 17 00:00:00 2001 From: johentsch Date: Fri, 24 Feb 2023 15:11:42 +0100 Subject: [PATCH 13/44] integrates calls to transform_df with the current logic within generate_dez() --- src/ms3/dezrann.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index c50ded71..bc9acb4e 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -193,16 +193,16 @@ def transform_df(labels: pd.DataFrame, List of dictionaries where each represents one row of the input labels. """ score_has_voltas = "quarterbeats_all_endings" in measures.columns + last_mc_row = measures.iloc[-1] + end_of_score = float(last_mc_row.act_dur) * 4.0 if not score_has_voltas: assert "quarterbeats" in labels.columns, f"Labels are lacking 'quarterbeats' column: {labels.columns}" quarterbeats = labels["quarterbeats"] - last_mc = measures.iloc[-1] - end_of_score = last_mc.quarterbeats + last_mc.act_dur * 4.0 + end_of_score += float(last_mc_row.quarterbeats) else: # the column 'quarterbeats_all_endings' is present, meaning the piece has first and second endings and the # quarterbeats, which normally leave out first endings, need to be recomputed - last_mc = measures.iloc[-1] - end_of_score = last_mc.quarterbeats_all_endings + last_mc.act_dur * 4.0 + end_of_score += float(last_mc_row.quarterbeats_all_endings) M = measures.set_index("mc") offset_dict = M["quarterbeats_all_endings"] quarterbeats = labels['mc'].map(offset_dict) @@ -316,14 +316,24 @@ def generate_dez(path_measures: str, measures_df = pd.read_csv( path_measures, sep='\t', dtype={'mc': int, 'volta': 'Int64'}, - converters={'quarterbeats_all_endings': safe_frac} + converters={'quarterbeats_all_endings': safe_frac, + 'act_dur': safe_frac} ) except (ValueError, 
AssertionError) as e: raise ValueError(f"{path_measures} could not be loaded as a measure map because of the following error:\n'{e}'") - try: - dcml_labels = transform_df(labels=harmonies_df, measures=measures_df) - except Exception as e: - raise ValueError(f"Converting {path_labels} failed with the exception '{e}'.") + converted_labels = {} + if cadences: + converted_labels['cadences'] = transform_df(labels=harmonies_df, measures=measures_df, label_column='cadence') + for arg, label_column in ((harmonies, "chord"), + (keys, "localkey"), + (phrases, "phraseend"), + (raw, "label")): + if arg is not None: + converted_labels[arg] = transform_df(labels=harmonies_df, measures=measures_df, label_column=label_column) + # from pprint import pprint + # for line, converted in converted_labels.items(): + # print(line) + # pprint(converted) dezrann_content = convert_dcml_list_to_dezrann_list( dcml_labels, cadences=cadences, @@ -572,7 +582,6 @@ def run(): # measures = ms3.load_tsv('K283-2_measures.tsv') # harmonies = ms3.load_tsv('K283-2_harmonies.tsv') # transformed = transform_df(labels=harmonies, measures=measures) - # print(transformed) - #dez = generate_dez('K283-2_measures.tsv', 'K283-2_harmonies.tsv') + #dez = generate_dez('K283-2_measures.tsv', 'K283-2_harmonies.tsv', cadences=True, harmonies="bot.4", keys="bot.5", phrases="bot.6", raw="top.3") #generate_all_dez() \ No newline at end of file From 0ffd23a4a2ef09d068b0d8599acb085eb608b462 Mon Sep 17 00:00:00 2001 From: Louis Couturier Date: Fri, 24 Feb 2023 16:33:38 +0100 Subject: [PATCH 14/44] link DCML labels to final Dezrann labels and layout --- src/ms3/dezrann.py | 95 +++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 40 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index bc9acb4e..1f257471 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -116,7 +116,7 @@ def safe_frac(s: str) -> Union[Fraction, str]: class DezrannLabel(TypedDict): """Represents one label 
in a .dez file.""" - type: str + label_type: str start: float duration: float #line: str # Determined by the meta-layout @@ -161,7 +161,6 @@ def get_volta_groups(mc2volta: pd.Series) -> List[List[int]]: current_groups_first_mcs.append(first_mc) return volta_groups - def transform_df(labels: pd.DataFrame, measures: pd.DataFrame, label_column: str = 'label', @@ -241,13 +240,17 @@ def transform_df(labels: pd.DataFrame, return transformed_df.to_dict(orient='records') def make_dezrann_label( - quarterbeats: float, duration: float, label: str, origin: Union[str, Tuple[str]]) -> DezrannLabel: + label_type: str, + quarterbeats: float, + duration: float, + label: str, + origin: Union[str, Tuple[str]]) -> DezrannLabel: if isinstance(origin, str): layers = [origin] else: layers = list(origin) return DezrannLabel( - type="Harmony", #TODO: adapt type to current label + label_type=label_type, start=quarterbeats, duration=duration, tag=label, @@ -255,37 +258,46 @@ def make_dezrann_label( ) def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel], - cadences: bool = False, - harmony_line: Optional[str] = None, - keys_line: Optional[str] = None, - phrases_line: Optional[str] = None, - raw_line: Optional[str] = None, + label_type: str, origin: Union[str, Tuple[str]] = "DCML") -> DezrannDict: - label_list = [] + dezrann_label_list = [] for e in values_dict: - label_list.append( + dezrann_label_list.append( make_dezrann_label( + label_type=label_type, quarterbeats=e["quarterbeats"], duration=e["duration"], label=e["label"], origin=origin ) ) + + return dezrann_label_list + #return DezrannDict(labels=label_list, meta={"layout": layout}) + +def make_layout( + cadences: bool = False, + harmonies: Optional[str] = None, + keys: Optional[str] = None, + phrases: Optional[str] = None, + raw: Optional[str] = None): + """ + Compile the line positions for target labels into Dezrann layout parameter. 
+ """ layout = [] if cadences: layout.append({"filter": {"type": "Cadence"}, "style": {"line": "all"}}) - if harmony_line: - layout.append({"filter": {"type": "Harmony"}, "style": {"line": harmony_line}}) - if keys_line: - layout.append({"filter": {"type": "Localkey"}, "style": {"line": keys_line}}) - if phrases_line: - layout.append({"filter": {"type": "Phrase"}, "style": {"line": phrases_line}}) - if raw_line: - layout.append({"filter": {"type": "Harmony"}, "style": {"line": raw_line}}) - - return DezrannDict(labels=label_list, meta={"layout": layout}) + if harmonies: + layout.append({"filter": {"type": "Harmony"}, "style": {"line": harmonies}}) + if keys: + layout.append({"filter": {"type": "Local Key"}, "style": {"line": keys}}) + if phrases: + layout.append({"filter": {"type": "Phrase"}, "style": {"line": phrases}}) + if raw: + layout.append({"filter": {"type": "Harmony"}, "style": {"line": raw}}) + + return layout - def generate_dez(path_measures: str, path_labels: str, output_path: str = "labels.dez", @@ -321,28 +333,31 @@ def generate_dez(path_measures: str, ) except (ValueError, AssertionError) as e: raise ValueError(f"{path_measures} could not be loaded as a measure map because of the following error:\n'{e}'") - converted_labels = {} + + dezrann_labels = [] if cadences: - converted_labels['cadences'] = transform_df(labels=harmonies_df, measures=measures_df, label_column='cadence') - for arg, label_column in ((harmonies, "chord"), - (keys, "localkey"), - (phrases, "phraseend"), - (raw, "label")): + dcml_labels = transform_df(labels=harmonies_df, measures=measures_df, label_column='cadence') + dezrann_labels += convert_dcml_list_to_dezrann_list(dcml_labels, label_type="Cadence", origin=origin) + for arg, label_column, label_type in ((harmonies, "chord", "Harmony"), #Third argument + (keys, "localkey", "Local Key"), + (phrases, "phraseend", "Phrase"), + (raw, "label", "Harmony")): if arg is not None: - converted_labels[arg] = 
transform_df(labels=harmonies_df, measures=measures_df, label_column=label_column) - # from pprint import pprint - # for line, converted in converted_labels.items(): - # print(line) - # pprint(converted) - dezrann_content = convert_dcml_list_to_dezrann_list( - dcml_labels, + dcml_labels = transform_df(labels=harmonies_df, measures=measures_df, label_column=label_column) + dezrann_labels += convert_dcml_list_to_dezrann_list( + dcml_labels, + label_type=label_type, + origin=origin + ) + + layout = make_layout( cadences=cadences, - harmony_line=harmonies, - keys_line=keys, - phrases_line=phrases, - raw_line=raw, - origin=origin + harmonies=harmonies, + keys=keys, + phrases=phrases, + raw=raw ) + dezrann_content = DezrannDict(labels=dezrann_labels, meta={"layout": layout}) # Manual post-processing #TODO: improve these cases # 1) Avoid NaN values in "duration" (happens in second endings) From 6a9d2e479fb45a3e515d7e0db76cd881fd488082 Mon Sep 17 00:00:00 2001 From: Louis Couturier Date: Fri, 24 Feb 2023 16:42:15 +0100 Subject: [PATCH 15/44] fix label_type naming typo --- src/ms3/dezrann.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index 1f257471..3168fab8 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -116,7 +116,7 @@ def safe_frac(s: str) -> Union[Fraction, str]: class DezrannLabel(TypedDict): """Represents one label in a .dez file.""" - label_type: str + type: str start: float duration: float #line: str # Determined by the meta-layout @@ -250,7 +250,7 @@ def make_dezrann_label( else: layers = list(origin) return DezrannLabel( - label_type=label_type, + type=label_type, start=quarterbeats, duration=duration, tag=label, @@ -273,7 +273,6 @@ def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel], ) return dezrann_label_list - #return DezrannDict(labels=label_list, meta={"layout": layout}) def make_layout( cadences: bool = False, @@ -411,7 +410,7 @@ def main(input_dir: str, # 
measures_files = glob.glob(f"{measures_dir}/*.tsv") harmony_measure_matches = [] for tsv_name in input_files: - measures_file_path = os.path.join(measures_dir, tsv_name) + measures_file_path = os.path.join(measures_dir, tsv_name).replace("harmonies", "measures") if not os.path.isfile(measures_file_path): # could be a directory continue From c12c4cf3ecd28b41ee20c82522799cdab3a703be Mon Sep 17 00:00:00 2001 From: Louis Couturier Date: Fri, 24 Feb 2023 16:44:20 +0100 Subject: [PATCH 16/44] remove fixed manual post-processing steps (NaN durations and handling start=0.) --- src/ms3/dezrann.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index 3168fab8..27f9ca99 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -357,17 +357,6 @@ def generate_dez(path_measures: str, raw=raw ) dezrann_content = DezrannDict(labels=dezrann_labels, meta={"layout": layout}) - - # Manual post-processing #TODO: improve these cases - # 1) Avoid NaN values in "duration" (happens in second endings) - # optional : in the transform_df : transformed_df = transformed_df.replace('NaN', 0) ? - for label in dezrann_content['labels']: - if pd.isnull(label['duration']): - print(f"WARNING: NaN duration detected in label {label}.") - label['duration'] = 0 - # 2) Remove "start" value in the first label ? 
- if dezrann_content['labels'][0]['start'] == 0.: - del dezrann_content['labels'][0]['start'] with open(output_path, 'w', encoding='utf-8') as f: json.dump(dezrann_content, f, indent=2) @@ -410,7 +399,7 @@ def main(input_dir: str, # measures_files = glob.glob(f"{measures_dir}/*.tsv") harmony_measure_matches = [] for tsv_name in input_files: - measures_file_path = os.path.join(measures_dir, tsv_name).replace("harmonies", "measures") + measures_file_path = os.path.join(measures_dir, tsv_name) if not os.path.isfile(measures_file_path): # could be a directory continue From d9c6d0c3fe0c0276878d99ab3df65f657617984e Mon Sep 17 00:00:00 2001 From: Louis Couturier Date: Fri, 24 Feb 2023 18:26:08 +0100 Subject: [PATCH 17/44] add safe conversion of quarterbeats --- src/ms3/dezrann.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index 27f9ca99..ff5aee61 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -328,6 +328,7 @@ def generate_dez(path_measures: str, path_measures, sep='\t', dtype={'mc': int, 'volta': 'Int64'}, converters={'quarterbeats_all_endings': safe_frac, + 'quarterbeats': safe_frac, 'act_dur': safe_frac} ) except (ValueError, AssertionError) as e: @@ -420,14 +421,14 @@ def main(input_dir: str, output_file_path = os.path.join(output_dir, dez_file) try: generate_dez( - path_labels=input_file, - path_measures=measure_file, - output_path=output_file_path, - cadences=cadences, - harmonies=harmonies, - keys=keys, - phrases=phrases, - raw=raw + path_labels=input_file, + path_measures=measure_file, + output_path=output_file_path, + cadences=cadences, + harmonies=harmonies, + keys=keys, + phrases=phrases, + raw=raw ) print(f"{output_file_path} successfully written.") except Exception as e: From 3c0c6d1a311d6a45a88aa054eb82f51094430774 Mon Sep 17 00:00:00 2001 From: johentsch Date: Mon, 1 May 2023 16:35:29 +0200 Subject: [PATCH 18/44] more error handling --- src/ms3/dezrann.py | 19 
+++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index ff5aee61..67cd4643 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -316,13 +316,16 @@ def generate_dez(path_measures: str, origin : :obj:`tuple` Tuple of source(s) from which the labels originate. Defaults to "DCML". """ - harmonies_df = pd.read_csv( - path_labels, sep='\t', - converters={'mc': int, - 'mc_onset': safe_frac, - 'quarterbeats': safe_frac, - } - ) + try: + harmonies_df = pd.read_csv( + path_labels, sep='\t', + converters={'mc': int, + 'mc_onset': safe_frac, + 'quarterbeats': safe_frac, + } + ) + except (ValueError, AssertionError, FileNotFoundError) as e: + raise ValueError(f"{path_labels} could not be loaded as a measure map because of the following error:\n'{e}'") try: measures_df = pd.read_csv( path_measures, sep='\t', @@ -331,7 +334,7 @@ def generate_dez(path_measures: str, 'quarterbeats': safe_frac, 'act_dur': safe_frac} ) - except (ValueError, AssertionError) as e: + except (ValueError, AssertionError, FileNotFoundError) as e: raise ValueError(f"{path_measures} could not be loaded as a measure map because of the following error:\n'{e}'") dezrann_labels = [] From b8b0578361ba944a480423bcadb042afce97df6c Mon Sep 17 00:00:00 2001 From: johentsch Date: Mon, 1 May 2023 16:36:21 +0200 Subject: [PATCH 19/44] renames transform_df() => dcml_labels2dicts() --- src/ms3/dezrann.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index 67cd4643..096c614f 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -161,10 +161,10 @@ def get_volta_groups(mc2volta: pd.Series) -> List[List[int]]: current_groups_first_mcs.append(first_mc) return volta_groups -def transform_df(labels: pd.DataFrame, - measures: pd.DataFrame, - label_column: str = 'label', - ) -> List[DcmlLabel]: +def dcml_labels2dicts(labels: pd.DataFrame, + measures: pd.DataFrame, + 
label_column: str = 'label', + ) -> List[DcmlLabel]: """ Parameters @@ -339,14 +339,14 @@ def generate_dez(path_measures: str, dezrann_labels = [] if cadences: - dcml_labels = transform_df(labels=harmonies_df, measures=measures_df, label_column='cadence') + dcml_labels = dcml_labels2dicts(labels=harmonies_df, measures=measures_df, label_column='cadence') dezrann_labels += convert_dcml_list_to_dezrann_list(dcml_labels, label_type="Cadence", origin=origin) for arg, label_column, label_type in ((harmonies, "chord", "Harmony"), #Third argument (keys, "localkey", "Local Key"), (phrases, "phraseend", "Phrase"), (raw, "label", "Harmony")): if arg is not None: - dcml_labels = transform_df(labels=harmonies_df, measures=measures_df, label_column=label_column) + dcml_labels = dcml_labels2dicts(labels=harmonies_df, measures=measures_df, label_column=label_column) dezrann_labels += convert_dcml_list_to_dezrann_list( dcml_labels, label_type=label_type, From 455f1066b297b16fcfeb357e26fbd67924f1b724 Mon Sep 17 00:00:00 2001 From: johentsch Date: Mon, 1 May 2023 17:04:36 +0200 Subject: [PATCH 20/44] improves type annotations & docstrings --- src/ms3/dezrann.py | 93 +++++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 35 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index 096c614f..f25ccfca 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -101,12 +101,23 @@ import argparse import json import os -from typing import Dict, List, TypedDict, Union, Tuple, Optional +from typing import Dict, List, TypedDict, Union, Tuple, Optional, TypeAlias, Literal from fractions import Fraction import pandas as pd +LINE_VALUES = { + 1: "top.1", + 2: "top.2", + 3: "top.3", + 4: "bot.1", + 5: "bot.2", + 6: "bot.3" +} +"""The six annotation layers of the Dezrann app, three above ('top') and three below ('bot') the score.""" +DezrannLayer: TypeAlias = Literal["top.1", "top.2", "top.3", "bot.1", "bot.2", "bot.3"] +"""More expressive than simply 
annotating with 'str'.""" def safe_frac(s: str) -> Union[Fraction, str]: try: @@ -276,10 +287,10 @@ def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel], def make_layout( cadences: bool = False, - harmonies: Optional[str] = None, - keys: Optional[str] = None, - phrases: Optional[str] = None, - raw: Optional[str] = None): + harmonies: Optional[DezrannLayer] = None, + keys: Optional[DezrannLayer] = None, + phrases: Optional[DezrannLayer] = None, + raw: Optional[DezrannLayer] = None): """ Compile the line positions for target labels into Dezrann layout parameter. """ @@ -299,22 +310,25 @@ def make_layout( def generate_dez(path_measures: str, path_labels: str, - output_path: str = "labels.dez", + output_path: str, cadences: bool = False, - harmonies: Optional[str] = None, - keys: Optional[str] = None, - phrases: Optional[str] = None, - raw: Optional[str] = None, - origin: Union[str, Tuple[str]] = "DCML"): - """ - path_measures : :obj:`str` - Path to a TSV file as output by format_data(). - path_labels : :obj:`str` - Path to a TSV file as output by format_data(). - output_labels : :obj:`str` - Path to a TSV file as output by format_data(). - origin : :obj:`tuple` - Tuple of source(s) from which the labels originate. Defaults to "DCML". + harmonies: Optional[DezrannLayer] = None, + keys: Optional[DezrannLayer] = None, + phrases: Optional[DezrannLayer] = None, + raw: Optional[DezrannLayer] = None, + origin: Union[str, Tuple[str]] = "DCML") -> None: + """ Create a .dez file from a path to a measures TSV file and a path to a labels/expanded TSV file. + + Args: + path_measures: Path to a DCML measures TSV file. + path_labels: Path to a DCML labels/expanded TSV file. + output_path: Path to the .dez file to be (over)written. + cadences: If True (default), labels in the 'cadence' column (if present) are included as vertical lines. + harmonies: Specify a DezrannLayer to include the labels from the 'chord' column. 
+ keys: Specify a DezrannLayer to include the labels from the 'localkey' column. + phrases: Specify a DezrannLayer to include the labels from the 'phraseend' column. + raw: Specify a DezrannLayer to include the labels from the 'label' column. + origin: Value to show in Dezrann's "Layer" field. Defaults to 'DCML'. """ try: harmonies_df = pd.read_csv( @@ -338,7 +352,7 @@ def generate_dez(path_measures: str, raise ValueError(f"{path_measures} could not be loaded as a measure map because of the following error:\n'{e}'") dezrann_labels = [] - if cadences: + if cadences and 'cadence' in harmonies_df.columns: dcml_labels = dcml_labels2dicts(labels=harmonies_df, measures=measures_df, label_column='cadence') dezrann_labels += convert_dcml_list_to_dezrann_list(dcml_labels, label_type="Cadence", origin=origin) for arg, label_column, label_type in ((harmonies, "chord", "Harmony"), #Third argument @@ -392,10 +406,27 @@ def main(input_dir: str, measures_dir: str, output_dir: str, cadences: bool = False, - harmonies: Optional[str] = None, - keys: Optional[str] = None, - phrases: Optional[str] = None, - raw: Optional[str] = None): + harmonies: Optional[DezrannLayer] = None, + keys: Optional[DezrannLayer] = None, + phrases: Optional[DezrannLayer] = None, + raw: Optional[DezrannLayer] = None) -> None: + """ Main function for using this module as a script. It gathers file paths and converts the detected DCML-style + labels/expanded TSV file to .dez format. + + Args: + input_dir: Path containing DCML-style labels/expanded TSV files. + measures_dir: Path containing the corresponding DCML-style measures TSV files. + output_dir: Path at which the resulting .dez file are to be (over)written. + cadences: If True (default), labels in the 'cadence' column (if present) are included as vertical lines. + harmonies: Specify a DezrannLayer to include the labels from the 'chord' column. + keys: Specify a DezrannLayer to include the labels from the 'localkey' column. 
+ phrases: Specify a DezrannLayer to include the labels from the 'phraseend' column. + raw: Specify a DezrannLayer to include the labels from the 'label' column. + origin: Value to show in Dezrann's "Layer" field. Defaults to 'DCML'. + + Returns: + + """ if not cadences and all(arg is None for arg in (harmonies, keys, phrases, raw)): print(f"Nothing to do because no features have been selected.") return @@ -411,10 +442,10 @@ def main(input_dir: str, harmonies_file_path = os.path.join(input_dir, tsv_name) harmony_measure_matches.append((harmonies_file_path, measures_file_path)) else: - print(f"No measure map found for {tsv_name}. Skipping.") + print(f"No measures TSV found for {tsv_name}. Skipping.") continue if len(harmony_measure_matches) == 0: - print(f"No matching measure maps found for any of these files: {input_files}") + print(f"No matching measures TSVs found for any of these files: {input_files}") return for input_file, measure_file in harmony_measure_matches: if output_dir == input_dir: @@ -437,14 +468,6 @@ def main(input_dir: str, except Exception as e: print(f"Converting {input_file} failed with '{e}'") -LINE_VALUES = { - 1: "top.1", - 2: "top.2", - 3: "top.3", - 4: "bot.1", - 5: "bot.2", - 6: "bot.3" -} def transform_line_argument(line: Optional[Union[int, str]]) -> Optional[str]: """Takes a number bet""" From c567df68eb89176e514102291eea6b0547854e19 Mon Sep 17 00:00:00 2001 From: johentsch Date: Mon, 1 May 2023 17:10:26 +0200 Subject: [PATCH 21/44] replaces simple tests by new_tests/test_dezrann.py --- src/ms3/dezrann.py | 33 +---------------------- tests/conftest.py | 13 ++++++++- tests/test_metarepo_files/test_dezrann.py | 18 +++++++++++++ 3 files changed, 31 insertions(+), 33 deletions(-) create mode 100644 tests/test_metarepo_files/test_dezrann.py diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index f25ccfca..193e3c50 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -380,28 +380,6 @@ def generate_dez(path_measures: str, 
json.dump(dezrann_content, f, indent=2) -# Test -MOZART_SONATAS = [ - 'K279-1', 'K279-2', 'K279-3', - 'K280-1', 'K280-2', 'K280-3', - 'K283-1', 'K283-2', 'K283-3', -] -MEASURE_DIR = os.path.join("src", "ms3") #to be updated -HARMONY_DIR = os.path.join("src", "ms3") #to be updated -MEASURE_PATHS = [ - os.path.join(MEASURE_DIR, f"{movement}_measures.tsv") - for movement in MOZART_SONATAS -] -HARMONY_PATHS = [ - os.path.join(HARMONY_DIR, f"{movement}_harmonies.tsv") - for movement in MOZART_SONATAS -] - -OUTPUT_DIR = "." #to be updated -def generate_all_dez(output_dir=OUTPUT_DIR): - for i_piece, piece in enumerate(MOZART_SONATAS): - generate_dez(MEASURE_PATHS[i_piece], HARMONY_PATHS[i_piece]) - def main(input_dir: str, measures_dir: str, output_dir: str, @@ -605,13 +583,4 @@ def run(): main(**kwargs) if __name__ == "__main__": - run() - - - # import ms3 - # measures = ms3.load_tsv('K283-2_measures.tsv') - # harmonies = ms3.load_tsv('K283-2_harmonies.tsv') - # transformed = transform_df(labels=harmonies, measures=measures) - - #dez = generate_dez('K283-2_measures.tsv', 'K283-2_harmonies.tsv', cadences=True, harmonies="bot.4", keys="bot.5", phrases="bot.6", raw="top.3") - #generate_all_dez() \ No newline at end of file + run() \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 9d8c0fcf..e5084ae3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,7 +10,7 @@ CORPUS_DIR = "~" # Directory holding your clone of DCMLab/unittest_metacorpus TEST_COMMIT = "5899afe" # commit of DCMLab/unittest_metacorpus for which the tests should pass -MS3_DIR = os.path.normpath(os.path.join(os.path.realpath(__file__), '..', '..')) +MS3_DIR = os.path.abspath(os.path.join(os.path.realpath(__file__), '..', '..')) DOCS_DIR = os.path.join(MS3_DIR, 'docs') DOCS_EXAMPLES_DIR = os.path.join(DOCS_DIR, 'examples') @@ -34,6 +34,16 @@ def small_directory(directory): path = os.path.join(directory, "ravel_piano") return path +@pytest.fixture(scope="session") +def 
mozart_piano_sonatas() -> str: + """Get the path to local clone of DCMLab/mozart_piano_sonatas""" + path = os.path.join(os.path.expanduser(CORPUS_DIR), "mozart_piano_sonatas") + if not os.path.isdir(path): + print(f"Directory does not exist: {path} Clone DCMLab/mozart_piano_sonatas into the CORPUS_DIR specified above.") + assert os.path.isdir(path) + assert Repo(path) # this makes sure it's a Git + return path + @pytest.fixture( scope="session", params=[ @@ -106,6 +116,7 @@ def parse_obj(directory, request) -> Parse: p.add_dir(os.path.join(directory, 'outputs')) for path in files: p.add_files(path, corpus_name='sweelinck_keyboard') + return p @pytest.fixture( diff --git a/tests/test_metarepo_files/test_dezrann.py b/tests/test_metarepo_files/test_dezrann.py new file mode 100644 index 00000000..3e33d1e9 --- /dev/null +++ b/tests/test_metarepo_files/test_dezrann.py @@ -0,0 +1,18 @@ +import os + +from ms3.dezrann import generate_dez + +MOZART_MOVEMENTS = [ + 'K279-1', 'K279-2', 'K279-3', + 'K280-1', 'K280-2', 'K280-3', + 'K283-1', 'K283-2', 'K283-3', + ] + +def test_dcml2dez(mozart_piano_sonatas): + for fname in MOZART_MOVEMENTS: + measures_path = os.path.join(mozart_piano_sonatas, 'measures', f"{fname}.tsv") + harmonies_path = os.path.join(mozart_piano_sonatas, 'harmonies', f"{fname}.tsv") + out_path = os.path.join(mozart_piano_sonatas, f"{fname}.dez") + generate_dez(path_measures=measures_path, + path_labels=harmonies_path, + output_path=out_path) \ No newline at end of file From 3fafa71c357438202ed25f51471534ea52d54c93 Mon Sep 17 00:00:00 2001 From: johentsch Date: Mon, 1 May 2023 17:21:54 +0200 Subject: [PATCH 22/44] factors out generate_dez_from_dfs() --- src/ms3/dezrann.py | 80 ++++++++++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 27 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index 193e3c50..3245bc7a 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -307,6 +307,58 @@ def make_layout( 
layout.append({"filter": {"type": "Harmony"}, "style": {"line": raw}}) return layout + + + +def generate_dez_from_dfs(measures_df: pd.DataFrame, + harmonies_df: pd.DataFrame, + output_path: str, + cadences: bool = False, + harmonies: Optional[DezrannLayer] = None, + keys: Optional[DezrannLayer] = None, + phrases: Optional[DezrannLayer] = None, + raw: Optional[DezrannLayer] = None, + origin: Union[str, Tuple[str]] = "DCML") -> None: + """ Create a .dez file from a measures and a labels/expanded dataframe. + + Args: + measures_df: + harmonies_df: + output_path: Path to the .dez file to be (over)written. + cadences: If True (default), labels in the 'cadence' column (if present) are included as vertical lines. + harmonies: Specify a DezrannLayer to include the labels from the 'chord' column. + keys: Specify a DezrannLayer to include the labels from the 'localkey' column. + phrases: Specify a DezrannLayer to include the labels from the 'phraseend' column. + raw: Specify a DezrannLayer to include the labels from the 'label' column. + origin: Value to show in Dezrann's "Layer" field. Defaults to 'DCML'. 
+ """ + dezrann_labels = [] + if cadences and 'cadence' in harmonies_df.columns: + dcml_labels = dcml_labels2dicts(labels=harmonies_df, measures=measures_df, label_column='cadence') + dezrann_labels += convert_dcml_list_to_dezrann_list(dcml_labels, label_type="Cadence", origin=origin) + for arg, label_column, label_type in ((harmonies, "chord", "Harmony"), # Third argument + (keys, "localkey", "Local Key"), + (phrases, "phraseend", "Phrase"), + (raw, "label", "Harmony")): + if arg is not None: + dcml_labels = dcml_labels2dicts(labels=harmonies_df, measures=measures_df, label_column=label_column) + dezrann_labels += convert_dcml_list_to_dezrann_list( + dcml_labels, + label_type=label_type, + origin=origin + ) + layout = make_layout( + cadences=cadences, + harmonies=harmonies, + keys=keys, + phrases=phrases, + raw=raw + ) + dezrann_content = DezrannDict(labels=dezrann_labels, meta={"layout": layout}) + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(dezrann_content, f, indent=2) + + def generate_dez(path_measures: str, path_labels: str, @@ -351,33 +403,7 @@ def generate_dez(path_measures: str, except (ValueError, AssertionError, FileNotFoundError) as e: raise ValueError(f"{path_measures} could not be loaded as a measure map because of the following error:\n'{e}'") - dezrann_labels = [] - if cadences and 'cadence' in harmonies_df.columns: - dcml_labels = dcml_labels2dicts(labels=harmonies_df, measures=measures_df, label_column='cadence') - dezrann_labels += convert_dcml_list_to_dezrann_list(dcml_labels, label_type="Cadence", origin=origin) - for arg, label_column, label_type in ((harmonies, "chord", "Harmony"), #Third argument - (keys, "localkey", "Local Key"), - (phrases, "phraseend", "Phrase"), - (raw, "label", "Harmony")): - if arg is not None: - dcml_labels = dcml_labels2dicts(labels=harmonies_df, measures=measures_df, label_column=label_column) - dezrann_labels += convert_dcml_list_to_dezrann_list( - dcml_labels, - label_type=label_type, - 
origin=origin - ) - - layout = make_layout( - cadences=cadences, - harmonies=harmonies, - keys=keys, - phrases=phrases, - raw=raw - ) - dezrann_content = DezrannDict(labels=dezrann_labels, meta={"layout": layout}) - - with open(output_path, 'w', encoding='utf-8') as f: - json.dump(dezrann_content, f, indent=2) + generate_dez_from_dfs(measures_df, harmonies_df, output_path, cadences, harmonies, keys, phrases, raw, origin) def main(input_dir: str, From 683908bcacdab0616f80e801e885abc123b1b76b Mon Sep 17 00:00:00 2001 From: johentsch Date: Mon, 1 May 2023 18:46:10 +0200 Subject: [PATCH 23/44] generate_dez() and generate_dez_from_dfs() return boolean and also accept numerical arguments for Dezrann layers --- src/ms3/dezrann.py | 67 +++++++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 21 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index 3245bc7a..8886e15a 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -116,9 +116,27 @@ } """The six annotation layers of the Dezrann app, three above ('top') and three below ('bot') the score.""" -DezrannLayer: TypeAlias = Literal["top.1", "top.2", "top.3", "bot.1", "bot.2", "bot.3"] +DEZ_LINE_ARGS = (0, 1, 2, 3, 4, 5, 6, -1, -2, -3) + +DezrannLayer: TypeAlias = Literal["top.1", "top.2", "top.3", "bot.1", "bot.2", "bot.3", 1, 2, 3, 4, 5, 6, -1, -2, -3] """More expressive than simply annotating with 'str'.""" +def transform_line_argument(line: Optional[Union[int, str]]) -> Optional[str]: + """Takes a number between -3 and 6 and turns it into one of the possible Dezrann line values. + 0 is interpreted as None. 
-1, -2, -3 correspond to 4, 5, 6""" + if line is None: + return + try: + line = int(line) + assert line in DEZ_LINE_ARGS + except (TypeError, ValueError, AssertionError): + raise ValueError(f"{line} is not a valid argument, should be within [-3, 6].") + if line == 0: + return None + if line < 0: + line = abs(line) + 3 + return LINE_VALUES[line] + def safe_frac(s: str) -> Union[Fraction, str]: try: return Fraction(s) @@ -223,7 +241,7 @@ def dcml_labels2dicts(labels: pd.DataFrame, label_and_qb = pd.concat([labels[label_column].rename('label'), quarterbeats.astype(float)], axis=1) n_before = len(labels.index) if label_column == 'phraseend': - label_and_qb = label_and_qb[label_and_qb.label == '{'] + label_and_qb = label_and_qb[label_and_qb.label.fillna('').str.contains('{')] if label_column == 'localkey': label_and_qb = label_and_qb[label_and_qb.label != label_and_qb.label.shift().fillna(True)] else: # {'chord', 'cadence', 'label'} @@ -318,7 +336,7 @@ def generate_dez_from_dfs(measures_df: pd.DataFrame, keys: Optional[DezrannLayer] = None, phrases: Optional[DezrannLayer] = None, raw: Optional[DezrannLayer] = None, - origin: Union[str, Tuple[str]] = "DCML") -> None: + origin: Union[str, Tuple[str]] = "DCML") -> bool: """ Create a .dez file from a measures and a labels/expanded dataframe. Args: @@ -331,7 +349,16 @@ def generate_dez_from_dfs(measures_df: pd.DataFrame, phrases: Specify a DezrannLayer to include the labels from the 'phraseend' column. raw: Specify a DezrannLayer to include the labels from the 'label' column. origin: Value to show in Dezrann's "Layer" field. Defaults to 'DCML'. + + Returns: + True if a .dez file was written. 
""" + annotation_layer_arguments = {arg: transform_line_argument(arg_val) for arg, arg_val in zip(("harmonies", "keys", "phrases", "raw"), (harmonies, keys, phrases, raw))} + parameters = {arg: arg_val is not None for arg, arg_val in annotation_layer_arguments.items()} + parameters['cadences'] = cadences + if not any(parameters.values()): + print(f"Nothing to do because no features have been selected.") + return False dezrann_labels = [] if cadences and 'cadence' in harmonies_df.columns: dcml_labels = dcml_labels2dicts(labels=harmonies_df, measures=measures_df, label_column='cadence') @@ -347,6 +374,9 @@ def generate_dez_from_dfs(measures_df: pd.DataFrame, label_type=label_type, origin=origin ) + if len(dezrann_labels) == 0: + print(f"{output_path} not written because no labels correspond to the parameters: {parameters}") + return False layout = make_layout( cadences=cadences, harmonies=harmonies, @@ -357,6 +387,7 @@ def generate_dez_from_dfs(measures_df: pd.DataFrame, dezrann_content = DezrannDict(labels=dezrann_labels, meta={"layout": layout}) with open(output_path, 'w', encoding='utf-8') as f: json.dump(dezrann_content, f, indent=2) + return True @@ -368,7 +399,7 @@ def generate_dez(path_measures: str, keys: Optional[DezrannLayer] = None, phrases: Optional[DezrannLayer] = None, raw: Optional[DezrannLayer] = None, - origin: Union[str, Tuple[str]] = "DCML") -> None: + origin: Union[str, Tuple[str]] = "DCML") -> bool: """ Create a .dez file from a path to a measures TSV file and a path to a labels/expanded TSV file. Args: @@ -381,6 +412,12 @@ def generate_dez(path_measures: str, phrases: Specify a DezrannLayer to include the labels from the 'phraseend' column. raw: Specify a DezrannLayer to include the labels from the 'label' column. origin: Value to show in Dezrann's "Layer" field. Defaults to 'DCML'. + + Returns: + True if a .dez file was written. + + Raises: + ValueError if reading from one of the specified file paths fails. 
""" try: harmonies_df = pd.read_csv( @@ -403,7 +440,7 @@ def generate_dez(path_measures: str, except (ValueError, AssertionError, FileNotFoundError) as e: raise ValueError(f"{path_measures} could not be loaded as a measure map because of the following error:\n'{e}'") - generate_dez_from_dfs(measures_df, harmonies_df, output_path, cadences, harmonies, keys, phrases, raw, origin) + return generate_dez_from_dfs(measures_df, harmonies_df, output_path, cadences, harmonies, keys, phrases, raw, origin) def main(input_dir: str, @@ -451,6 +488,7 @@ def main(input_dir: str, if len(harmony_measure_matches) == 0: print(f"No matching measures TSVs found for any of these files: {input_files}") return + created_files = 0 for input_file, measure_file in harmony_measure_matches: if output_dir == input_dir: output_file_path = input_file.replace(".tsv", ".dez") @@ -458,7 +496,7 @@ def main(input_dir: str, dez_file = os.path.basename(measure_file).replace(".tsv", ".dez") output_file_path = os.path.join(output_dir, dez_file) try: - generate_dez( + created_files += generate_dez( path_labels=input_file, path_measures=measure_file, output_path=output_file_path, @@ -471,22 +509,9 @@ def main(input_dir: str, print(f"{output_file_path} successfully written.") except Exception as e: print(f"Converting {input_file} failed with '{e}'") + print(f"Done. Created {created_files} .dez files.") -def transform_line_argument(line: Optional[Union[int, str]]) -> Optional[str]: - """Takes a number bet""" - if line is None: - return - try: - line = int(line) - assert line in [1,2,3,4,5,6, 0 -1, -2, -3] - except (TypeError, ValueError, AssertionError): - raise ValueError(f"{line} is not a valid argument, should be within [0, 6].") - if line == 0: - return None - if line < 0: - line = abs(line) + 3 - return LINE_VALUES[line] def resolve_dir(d): """ Resolves '~' to HOME directory and turns ``d`` into an absolute path. 
@@ -575,7 +600,7 @@ def run(): action="store_true", help="Pass this flag if you want to add time-point cadence labels to the .dez files." ) - possible_line_arguments = ("0", "1", "2", "3", "4", "5", "6", "-1", "-2", "-3") + possible_line_arguments = tuple(str(i) for i in DEZ_LINE_ARGS) parser.add_argument('-H', '--harmonies', metavar="{0-6}, default: 4", From 10e1ef07fa1483e18a15e2801b24bbd7960149a4 Mon Sep 17 00:00:00 2001 From: johentsch Date: Mon, 1 May 2023 18:52:32 +0200 Subject: [PATCH 24/44] extends test_dcml2dez() so that the number of written .dez labels correspond to the original annotation file --- src/ms3/utils/functions.py | 76 +++++++++++++++++++++++ tests/test_metarepo_files/test_dezrann.py | 52 +++++++++++++--- 2 files changed, 118 insertions(+), 10 deletions(-) diff --git a/src/ms3/utils/functions.py b/src/ms3/utils/functions.py index 14597b8e..d68ba99d 100644 --- a/src/ms3/utils/functions.py +++ b/src/ms3/utils/functions.py @@ -6477,3 +6477,79 @@ def replace_extension(filepath: str, new_extension: str) -> str: if new_extension[0] != ".": new_extension = "." + new_extension return os.path.splitext(filepath)[0] + new_extension + + + +def get_value_profile_mask(series: pd.Series, + na_values: str = "group", + prevent_merge: bool = False, + logger: Optional[logging.Logger | str] = None + ) -> pd.Series: + """Turns a Series into a boolean mask indicating those values that are distinct from their predecessors. + There are several ways of dealing with NA values. + + NB: This is a duplicate of a DiMCAT function. + + Args: + S: Series in which to group identical adjacent values with each other. + na_values: + | 'group' creates individual groups for NA values (default). + | 'backfill' or 'bfill' groups NA values with the subsequent group + | 'pad', 'ffill' groups NA values with the preceding group + | Any other string works like 'group', with the difference that the groups will be named with this value. 
+ | Passing None means NA values & ranges are being ignored, i.e. they will also be present in the output and + the subsequent value will be based on the preceding value. + prevent_merge: + By default, if you use the `na_values` argument to fill NA values, they might lead to two ranges merging. + Pass True to prevent this. For example, take the sequence ['a', NA, 'a'] with ``na_values='ffill'``: + By default, the outcome would be a single range ``[True, False, False]``. However, + passing ``prevent_merge=True`` will result in ``[True, False, True]``. + + Returns: + A boolean mask where False marks values that are not identical with their preceding values. + Using that mask on the input series yields the value profile. + """ + if logger is None: + logger = module_logger + elif isinstance(logger, str): + logger = get_logger(logger) + reindex_flag = False + if prevent_merge: + forced_beginnings = series.notna() & ~series.notna().shift().fillna(False) + if na_values is None: + if series.isna().any(): + s = series.dropna() + reindex_flag = True + else: + s = series + elif na_values == "group": + s = series + elif na_values in ("backfill", "bfill", "pad", "ffill"): + s = series.fillna(method=na_values) + else: + s = series.fillna(value=na_values) + + if s.isna().any(): + if na_values == "group": + shifted = s.shift() + if pd.isnull(series.iloc[0]): + shifted.iloc[0] = True + beginnings = ~nan_eq(s, shifted) + else: + logger.warning( + f"After treating the Series '{series.name}' with na_values='{na_values}', " + f"there were still {s.isna().sum()} NA values left." 
+ ) + s = s.dropna() + beginnings = (s != s.shift()).fillna(False) + beginnings.iloc[0] = True + reindex_flag = True + else: + beginnings = s != s.shift() + beginnings.iloc[0] = True + if prevent_merge: + beginnings |= forced_beginnings + beginnings = beginnings.astype("boolean") + if reindex_flag: + return beginnings.reindex(series.index) + return beginnings \ No newline at end of file diff --git a/tests/test_metarepo_files/test_dezrann.py b/tests/test_metarepo_files/test_dezrann.py index 3e33d1e9..4f44d6b3 100644 --- a/tests/test_metarepo_files/test_dezrann.py +++ b/tests/test_metarepo_files/test_dezrann.py @@ -1,18 +1,50 @@ +import json import os +from collections import Counter +import pytest +from ms3.utils import get_value_profile_mask, load_tsv from ms3.dezrann import generate_dez -MOZART_MOVEMENTS = [ +@pytest.fixture(params=[ 'K279-1', 'K279-2', 'K279-3', 'K280-1', 'K280-2', 'K280-3', 'K283-1', 'K283-2', 'K283-3', - ] + ]) +def movement(request) -> str: + return request.param -def test_dcml2dez(mozart_piano_sonatas): - for fname in MOZART_MOVEMENTS: - measures_path = os.path.join(mozart_piano_sonatas, 'measures', f"{fname}.tsv") - harmonies_path = os.path.join(mozart_piano_sonatas, 'harmonies', f"{fname}.tsv") - out_path = os.path.join(mozart_piano_sonatas, f"{fname}.dez") - generate_dez(path_measures=measures_path, - path_labels=harmonies_path, - output_path=out_path) \ No newline at end of file +def test_dcml2dez(mozart_piano_sonatas, movement): + # first, create .dez file + measures_path = os.path.join(mozart_piano_sonatas, 'measures', f"{movement}.tsv") + harmonies_path = os.path.join(mozart_piano_sonatas, 'harmonies', f"{movement}.tsv") + out_path = os.path.join(mozart_piano_sonatas, f"{movement}.dez") + generate_dez(path_measures=measures_path, + path_labels=harmonies_path, + output_path=out_path, + cadences=True, + harmonies=4, + keys=5, + phrases=6 + ) + # then, count the contained labels and compare with the target number (except if score contains 
voltas because then, + # the .dez file might contain additional, repeated labels at the beginning of each ending). + expanded = load_tsv(harmonies_path) + if 'volta' in expanded and expanded.volta.notna().any(): + return + with open(out_path, 'r', encoding='utf-8') as f: + dezrann_file = json.load(f) + type2column = { + 'Harmony': 'chord', + 'Cadence': 'cadence', + 'Phrase': 'phraseend', + 'Local Key': 'localkey', + } + written_labels = dict(Counter(type2column[label['type']] for label in dezrann_file['labels'])) + expected_counts = dict( + chord = expanded['chord'].notna().sum(), + cadence = expanded['cadence'].notna().sum(), + phraseend = expanded['phraseend'].str.contains('{').sum(), + localkey = get_value_profile_mask(expanded['localkey']).sum(), + ) + assert written_labels == expected_counts \ No newline at end of file From 487242a9ff3d67931001e35e3587ce846210578c Mon Sep 17 00:00:00 2001 From: johentsch Date: Mon, 1 May 2023 19:21:31 +0200 Subject: [PATCH 25/44] adds additional unittest --- tests/conftest.py | 5 ++- tests/test_metarepo_files/test_dezrann.py | 53 +++++++++++++++++++---- 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e5084ae3..1187f162 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -41,8 +41,9 @@ def mozart_piano_sonatas() -> str: if not os.path.isdir(path): print(f"Directory does not exist: {path} Clone DCMLab/mozart_piano_sonatas into the CORPUS_DIR specified above.") assert os.path.isdir(path) - assert Repo(path) # this makes sure it's a Git - return path + repo = Repo(path) + yield path + repo.git.clean('-fdx') # removes new files potentially generated during test @pytest.fixture( scope="session", diff --git a/tests/test_metarepo_files/test_dezrann.py b/tests/test_metarepo_files/test_dezrann.py index 4f44d6b3..a7ddfdc3 100644 --- a/tests/test_metarepo_files/test_dezrann.py +++ b/tests/test_metarepo_files/test_dezrann.py @@ -3,18 +3,29 @@ from collections import Counter 
import pytest -from ms3.utils import get_value_profile_mask, load_tsv -from ms3.dezrann import generate_dez +from ms3 import Parse +from ms3.utils import get_value_profile_mask, load_tsv, assert_all_lines_equal +from ms3.dezrann import generate_dez, generate_dez_from_dfs -@pytest.fixture(params=[ +MOZART_MOVEMENTS = [ 'K279-1', 'K279-2', 'K279-3', 'K280-1', 'K280-2', 'K280-3', 'K283-1', 'K283-2', 'K283-3', - ]) + ] + +SETTINGS = dict( + cadences=True, + harmonies=4, + keys=5, + phrases=6 +) + +@pytest.fixture(params=MOZART_MOVEMENTS) def movement(request) -> str: return request.param def test_dcml2dez(mozart_piano_sonatas, movement): + """This test creates Dezrann files from DCML annotations and compares the number of written labels with the target.""" # first, create .dez file measures_path = os.path.join(mozart_piano_sonatas, 'measures', f"{movement}.tsv") harmonies_path = os.path.join(mozart_piano_sonatas, 'harmonies', f"{movement}.tsv") @@ -22,10 +33,7 @@ def test_dcml2dez(mozart_piano_sonatas, movement): generate_dez(path_measures=measures_path, path_labels=harmonies_path, output_path=out_path, - cadences=True, - harmonies=4, - keys=5, - phrases=6 + **SETTINGS ) # then, count the contained labels and compare with the target number (except if score contains voltas because then, # the .dez file might contain additional, repeated labels at the beginning of each ending). @@ -47,4 +55,31 @@ def test_dcml2dez(mozart_piano_sonatas, movement): phraseend = expanded['phraseend'].str.contains('{').sum(), localkey = get_value_profile_mask(expanded['localkey']).sum(), ) - assert written_labels == expected_counts \ No newline at end of file + assert written_labels == expected_counts + +def test_parse2dez(mozart_piano_sonatas): + """This test creates two .dez files per piece and checks if they are identical. One is created from the DataFrames + as parsed by the ms3.Parse() object, and the other is created directly from the TSV files. 
+ """ + file_re = "|".join(MOZART_MOVEMENTS) + p = Parse(mozart_piano_sonatas,file_re=file_re) + p.view.include('facets', 'measures', 'expanded') + p.view.fnames_with_incomplete_facets = False + p.parse_tsv() + facet_dataframes = p.get_facets(['expanded', 'measures'], concatenate=False, choose='auto') + for (corpus, fname), facet2file_df_pair in facet_dataframes.items(): + measures_file, measures_df = facet2file_df_pair['measures'][0] + harmonies_file, harmonies_df = facet2file_df_pair['expanded'][0] + output_from_tsv = os.path.join(mozart_piano_sonatas, f"{fname}_from_tsv.dez") + output_from_dfs = os.path.join(mozart_piano_sonatas, f"{fname}_from_df.dez") + generate_dez(path_measures=measures_file.full_path, + path_labels=harmonies_file.full_path, + output_path=output_from_tsv, + **SETTINGS) + generate_dez_from_dfs(measures_df=measures_df, + harmonies_df=harmonies_df, + output_path=output_from_dfs, + **SETTINGS) + dez_from_tsv = open(output_from_tsv, 'r', encoding='utf-8').read() + dez_from_dfs = open(output_from_dfs, 'r', encoding='utf-8').read() + assert_all_lines_equal(dez_from_tsv, dez_from_dfs, output_from_tsv, output_from_dfs) From f95799e75e47dc5e9b9c170a0e7a1c5a10fb68a4 Mon Sep 17 00:00:00 2001 From: Johannes Hentschel Date: Mon, 11 Sep 2023 17:54:46 +0200 Subject: [PATCH 26/44] pre-commit run --all-files --- codemeta.json | 2 +- src/ms3/annotations.py | 472 +++++++++----- src/ms3/dezrann.py | 512 +++++++++------ src/ms3/utils/functions.py | 14 +- src/ms3/view.py | 608 +++++++++++------- tests/conftest.py | 158 +++-- tests/test_local_files/IGNORED_WARNINGS | 2 +- tests/test_local_files/MS3/05_symph_fant.mscx | 10 +- .../MS3/Did03M-Son_regina-1762-Sarti.mscx | 4 +- tests/test_local_files/test_repeats.py | 22 +- tests/test_metarepo_files/test_dezrann.py | 111 ++-- .../test_metarepo_files/test_docs_examples.py | 8 +- .../test_transformations.py | 31 +- 13 files changed, 1207 insertions(+), 747 deletions(-) diff --git a/codemeta.json b/codemeta.json index 
cb38a5d9..3981b22c 100644 --- a/codemeta.json +++ b/codemeta.json @@ -63,4 +63,4 @@ } } ] -} \ No newline at end of file +} diff --git a/src/ms3/annotations.py b/src/ms3/annotations.py index e193d313..86bcda2f 100644 --- a/src/ms3/annotations.py +++ b/src/ms3/annotations.py @@ -5,22 +5,64 @@ import pandas as pd -from .utils import decode_harmonies, is_any_row_equal, html2format, load_tsv, \ - name2format, resolve_dir, rgb2format, column_order, update_cfg -from .utils.constants import DCML_REGEX, DCML_DOUBLE_REGEX, FORM_DETECTION_REGEX -from .logger import LoggedClass from .expand_dcml import expand_labels +from .logger import LoggedClass +from .utils import ( + column_order, + decode_harmonies, + html2format, + is_any_row_equal, + load_tsv, + name2format, + resolve_dir, + rgb2format, + update_cfg, +) +from .utils.constants import DCML_DOUBLE_REGEX, DCML_REGEX, FORM_DETECTION_REGEX + class Annotations(LoggedClass): """ Class for storing, converting and manipulating annotation labels. """ - main_cols = ['label', 'mc', 'mc_onset', 'staff', 'voice'] - additional_cols = ['harmony_layer', 'regex_match', 'absolute_root', 'rootCase', 'absolute_base', 'leftParen', 'rightParen', 'offset_x', 'offset_y', - 'nashville', 'decoded', 'color_name', 'color_html', 'color_r', 'color_g', 'color_b', 'color_a', 'placement', 'minDistance', 'style', 'z'] - - def __init__(self, tsv_path=None, df=None, cols={}, index_col=None, sep='\t', mscx_obj=None, infer_types=None, read_only=False, **logger_cfg): + main_cols = ["label", "mc", "mc_onset", "staff", "voice"] + additional_cols = [ + "harmony_layer", + "regex_match", + "absolute_root", + "rootCase", + "absolute_base", + "leftParen", + "rightParen", + "offset_x", + "offset_y", + "nashville", + "decoded", + "color_name", + "color_html", + "color_r", + "color_g", + "color_b", + "color_a", + "placement", + "minDistance", + "style", + "z", + ] + + def __init__( + self, + tsv_path=None, + df=None, + cols={}, + index_col=None, + sep="\t", + 
mscx_obj=None, + infer_types=None, + read_only=False, + **logger_cfg, + ): """ Parameters @@ -33,14 +75,17 @@ def __init__(self, tsv_path=None, df=None, cols={}, index_col=None, sep='\t', ms cols : :obj:`dict` If your columns don't have standard names, pass a {NAME -> ACTUAL_NAME} dictionary. Required columns: label, mc, mc_onset, staff, voice - Additional columns: harmony_layer, regex_match, absolute_root, rootCase, absolute_base, leftParen, rightParen, offset_x, offset_y, nashville, decoded, color_name, + Additional columns: harmony_layer, regex_match, absolute_root, rootCase, absolute_base, leftParen, + rightParen, offset_x, offset_y, nashville, decoded, color_name, color_html, color_r, color_g, color_b, color_a, placement, minDistance, style, z index_col sep mscx_obj infer_types : :obj:`dict`, optional - If you want to check all labels against one or several regular expressions, pass them as a {label_type -> regEx} dictionary. - The column regex_match will display the label_type of the last matched regEx. If you pass None, the default behaviour + If you want to check all labels against one or several regular expressions, pass them as a {label_type -> + regEx} dictionary. + The column regex_match will display the label_type of the last matched regEx. If you pass None, + the default behaviour is detecting labels of the DCML harmony annotation standard's current version. read_only logger_cfg : :obj:`dict`, optional @@ -50,11 +95,12 @@ def __init__(self, tsv_path=None, df=None, cols={}, index_col=None, sep='\t', ms 'file': PATH_TO_LOGFILE to store all log messages under the given path. 
kwargs : """ - super().__init__(subclass='Annotations', logger_cfg=logger_cfg) + super().__init__(subclass="Annotations", logger_cfg=logger_cfg) if infer_types is None: - self.regex_dict = {'dcml': DCML_DOUBLE_REGEX, - 'form_labels': FORM_DETECTION_REGEX, - } + self.regex_dict = { + "dcml": DCML_DOUBLE_REGEX, + "form_labels": FORM_DETECTION_REGEX, + } else: self.regex_dict = infer_types self._expanded = None @@ -67,18 +113,21 @@ def __init__(self, tsv_path=None, df=None, cols={}, index_col=None, sep='\t', ms self.cols = {c: c for c in columns} cols_update, incorrect = update_cfg(cols, self.cols.keys()) if len(incorrect) > 0: - last_5 = ', '.join(f"-{i}: {stack()[i].function}()" for i in range(1, 6)) - plural = 'These mappings do' if len(incorrect) > 1 else 'This mapping does' - self.logger.warning(f"{plural} not pertain to standard columns: {incorrect}\nLast 5 function calls leading here: {last_5}") + last_5 = ", ".join(f"-{i}: {stack()[i].function}()" for i in range(1, 6)) + plural = "These mappings do" if len(incorrect) > 1 else "This mapping does" + self.logger.warning( + f"{plural} not pertain to standard columns: {incorrect}\nLast 5 function calls leading here: {last_5}" + ) self.cols.update(cols_update) - if df is not None: self.df = df.copy() else: - assert tsv_path is not None, "Name a TSV file to be loaded or pass a DataFrame." + assert ( + tsv_path is not None + ), "Name a TSV file to be loaded or pass a DataFrame." 
self.df = load_tsv(tsv_path, index_col=index_col, sep=sep) - sorting_cols = ['mc', 'mn', 'mc_onset', 'staff'] + sorting_cols = ["mc", "mn", "mc_onset", "staff"] sorting_cols = [self.cols[c] if c in self.cols else c for c in sorting_cols] sorting_cols = [c for c in sorting_cols if c in self.df.columns] self.df.sort_values(sorting_cols, inplace=True) @@ -86,94 +135,123 @@ def __init__(self, tsv_path=None, df=None, cols={}, index_col=None, sep='\t', ms def add_initial_dots(self): if self.read_only: - self.logger.warning(f"Cannot change labels attached to a score. Detach them first.") + self.logger.warning( + "Cannot change labels attached to a score. Detach them first." + ) return - label_col = self.cols['label'] - notes = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'} - add_dots = lambda s: '.' + s if s[0].lower() in notes else s - self.df[label_col] = self.df[label_col].map(add_dots) + label_col = self.cols["label"] + notes = {"a", "b", "c", "d", "e", "f", "g", "h"} + def add_dots(s): + return "." + s if s[0].lower() in notes else s + + self.df[label_col] = self.df[label_col].map(add_dots) - def prepare_for_attaching(self, staff=None, voice=None, harmony_layer=1, check_for_clashes=True): + def prepare_for_attaching( + self, staff=None, voice=None, harmony_layer=1, check_for_clashes=True + ): if self.mscx_obj is None: - self.logger.warning(f"Annotations object not aware to which MSCX object it is attached.") + self.logger.warning( + "Annotations object not aware to which MSCX object it is attached." + ) return pd.DataFrame() df = self.df.copy() cols = list(df.columns) - staff_col = self.cols['staff'] + staff_col = self.cols["staff"] if staff_col not in cols: if staff is None: - self.logger.info("Annotations don't have staff information. Using the default -1 (lowest staff).") + self.logger.info( + "Annotations don't have staff information. Using the default -1 (lowest staff)." 
+ ) staff = -1 df[staff_col] = staff else: if staff is None: if df[staff_col].isna().any(): staff = -1 - self.logger.info(f"Some labels don't have staff information. Assigned staff {staff}.") + self.logger.info( + f"Some labels don't have staff information. Assigned staff {staff}." + ) df[staff_col].fillna(staff, inplace=True) else: df[staff_col] = staff - - voice_col = self.cols['voice'] + voice_col = self.cols["voice"] if voice_col not in cols: if voice is None: - self.logger.info("Annotations don't have voice information. Attaching to the default, voice 1.") + self.logger.info( + "Annotations don't have voice information. Attaching to the default, voice 1." + ) voice = 1 df[voice_col] = voice else: if voice is None: if df[voice_col].isna().any(): voice = 1 - self.logger.info("Some labels don't have voice information. Attaching to the default, voice 1.") + self.logger.info( + "Some labels don't have voice information. Attaching to the default, voice 1." + ) df[voice_col].fillna(voice, inplace=True) else: df[voice_col] = voice - layer_col = self.cols['harmony_layer'] + layer_col = self.cols["harmony_layer"] if layer_col not in cols: if harmony_layer is None: - self.logger.info("Annotations don't have harmony_layer information. Using the default, 1 (Roman numerals).") + self.logger.info( + "Annotations don't have harmony_layer information. Using the default, 1 (Roman numerals)." + ) harmony_layer = 1 df[layer_col] = harmony_layer else: if harmony_layer is None: if df[layer_col].isna().any(): harmony_layer = 1 - self.logger.info("Some labels don't have harmony_layer information. Using the default, 1 (Roman numerals).") + self.logger.info( + "Some labels don't have harmony_layer information. Using the default, 1 (Roman numerals)." 
+ ) df[layer_col].fillna(harmony_layer, inplace=True) else: df[layer_col] = harmony_layer error = False - if self.cols['mc'] not in cols: - mn_col = self.cols['mn'] if 'mn' in self.cols else 'mn' + if self.cols["mc"] not in cols: + mn_col = self.cols["mn"] if "mn" in self.cols else "mn" if mn_col not in cols: - self.logger.error(f"Annotations need to have at least one column named 'mn' or 'mc'.") + self.logger.error( + "Annotations need to have at least one column named 'mn' or 'mc'." + ) error = True else: inferred_positions = self.infer_mc_from_mn() if inferred_positions.isna().any().any(): - self.logger.error(f"Measure counts and corresponding mc_onsets could not be successfully inferred.") + self.logger.error( + "Measure counts and corresponding mc_onsets could not be successfully inferred." + ) error = True else: - if 'mn_onset' not in self.cols: - self.logger.info(f"Measure counts successfully inferred. Since there is no 'mn_onset' column, all " - f"mc_onsets have been set to 0.") + if "mn_onset" not in self.cols: + self.logger.info( + "Measure counts successfully inferred. Since there is no 'mn_onset' column, all " + "mc_onsets have been set to 0." + ) else: - self.logger.info(f"Measure counts and corresponding mc_onsets successfully inferred.") - df.insert(df.columns.get_loc('mn'), 'mc', inferred_positions['mc']) - df.loc[:, 'mc_onset'] = inferred_positions['mc_onset'] - cols.extend(['mc', 'mc_onset']) - - mc_onset_col = self.cols['mc_onset'] + self.logger.info( + "Measure counts and corresponding mc_onsets successfully inferred." + ) + df.insert(df.columns.get_loc("mn"), "mc", inferred_positions["mc"]) + df.loc[:, "mc_onset"] = inferred_positions["mc_onset"] + cols.extend(["mc", "mc_onset"]) + + mc_onset_col = self.cols["mc_onset"] if mc_onset_col not in cols: - self.logger.info("No 'mc_onset' column found. 
All labels will be inserted at mc_onset 0.") - new_col = pd.Series([0]*len(df), index=df.index, name='mc_onset') + self.logger.info( + "No 'mc_onset' column found. All labels will be inserted at mc_onset 0." + ) + new_col = pd.Series([0] * len(df), index=df.index, name="mc_onset") df = pd.concat([new_col, df], axis=1) - position_cols = ['mc', 'mc_onset', 'staff', 'voice'] + position_cols = ["mc", "mc_onset", "staff", "voice"] new_pos_cols = [self.cols[c] for c in position_cols] if all(c in df.columns for c in new_pos_cols): if check_for_clashes and self.mscx_obj.has_annotations: @@ -182,25 +260,26 @@ def prepare_for_attaching(self, staff=None, voice=None, harmony_layer=1, check_f clashes = is_any_row_equal(existing, to_be_attached) has_clashes = len(clashes) > 0 if has_clashes: - self.logger.error(f"The following positions already have labels:\n{pd.DataFrame(clashes, columns=position_cols)}") + self.logger.error( + f"The following positions already have labels:\n{pd.DataFrame(clashes, columns=position_cols)}" + ) error = True elif check_for_clashes: - self.logger.error(f"Check for clashes could not be performed because there are columns missing.") + self.logger.error( + "Check for clashes could not be performed because there are columns missing." + ) if error: return pd.DataFrame() return df - def count(self): return len(self.df) - @property def harmony_layer_counts(self): - """ Returns the counts of the harmony_layers as dict. 
- """ - if 'harmony_layer' in self.df.columns: + """Returns the counts of the harmony_layers as dict.""" + if "harmony_layer" in self.df.columns: return self.df.harmony_layer.value_counts(dropna=False).to_dict() else: return {None: len(self.df)} @@ -208,34 +287,48 @@ def harmony_layer_counts(self): @property def annotation_layers(self): df = self.df.copy() - layers = ['staff', 'voice', 'harmony_layer'] + layers = ["staff", "voice", "harmony_layer"] for c in layers: if self.cols[c] not in df.columns: df[c] = None - color_cols = ['color_name', 'color_html', 'color_r'] + color_cols = ["color_name", "color_html", "color_r"] if any(True for c in color_cols if self.cols[c] in df): - color_name = self.cols['color_name'] + color_name = self.cols["color_name"] if color_name in df.columns: pass - elif self.cols['color_html'] in df.columns: - df[color_name] = html2format(df, 'name') - elif self.cols['color_r'] in df.columns: - df[color_name] = rgb2format(df, 'name') - df[color_name] = df[color_name].fillna('default') + elif self.cols["color_html"] in df.columns: + df[color_name] = html2format(df, "name") + elif self.cols["color_r"] in df.columns: + df[color_name] = rgb2format(df, "name") + df[color_name] = df[color_name].fillna("default") layers.append(color_name) else: - df['color_name'] = 'default' - layers.append('color_name') - if 'regex_match' in df.columns: - df.harmony_layer = df.harmony_layer.astype(str) + (' (' + df.regex_match + ')').fillna('') + df["color_name"] = "default" + layers.append("color_name") + if "regex_match" in df.columns: + df.harmony_layer = df.harmony_layer.astype(str) + ( + " (" + df.regex_match + ")" + ).fillna("") return self.count(), df.groupby(layers, dropna=False).size() def __repr__(self): n, layers = self.annotation_layers return f"{n} labels:\n{layers.to_string()}" - def get_labels(self, staff=None, voice=None, harmony_layer=None, positioning=False, decode=True, drop=False, inverse=False, column_name=None, color_format=None, regex=None): - 
""" Returns a DataFrame of annotation labels. + def get_labels( + self, + staff=None, + voice=None, + harmony_layer=None, + positioning=False, + decode=True, + drop=False, + inverse=False, + column_name=None, + color_format=None, + regex=None, + ): + """Returns a DataFrame of annotation labels. Parameters ---------- @@ -268,53 +361,64 @@ def get_labels(self, staff=None, voice=None, harmony_layer=None, positioning=Fal sel = pd.Series(True, index=self.df.index) if staff is not None: - sel = sel & (self.df[self.cols['staff']] == staff) + sel = sel & (self.df[self.cols["staff"]] == staff) if voice is not None: - sel = sel & (self.df[self.cols['voice']] == voice) - if harmony_layer is not None and 'harmony_layer' in self.df.columns: + sel = sel & (self.df[self.cols["voice"]] == voice) + if harmony_layer is not None and "harmony_layer" in self.df.columns: # TODO: account for the split into harmony_layer and regex_match - #harmony_layer = self._treat_harmony_layer_param(harmony_layer, warnings=warnings) + # harmony_layer = self._treat_harmony_layer_param(harmony_layer, warnings=warnings) sel = sel & (self.df.harmony_layer == str(harmony_layer)) if regex is not None: - sel = sel & self.df[self.cols['label']].str.match(regex).fillna(False) + sel = sel & self.df[self.cols["label"]].str.match(regex).fillna(False) if inverse: sel = ~sel res = self.df[sel].copy() if positioning: - pos_cols = [c for c in ('offset',) if c in res.columns] + pos_cols = [c for c in ("offset",) if c in res.columns] else: - pos_cols = [c for c in ('minDistance', 'offset', 'offset_x', 'offset_y') if c in res.columns] + pos_cols = [ + c + for c in ("minDistance", "offset", "offset_x", "offset_y") + if c in res.columns + ] res.drop(columns=pos_cols, inplace=True) if drop: self.df = self.df[~sel] - label_col = self.cols['label'] + label_col = self.cols["label"] if decode: res = decode_harmonies(res, label_col=label_col, logger=self.logger) if column_name is not None and column_name != label_col: res = 
res.rename(columns={label_col: column_name}) - color_cols = ['color_html', 'color_r', 'color_g', 'color_b', 'color_a', 'color_name'] - rgb_cols = ['color_r', 'color_g', 'color_b'] + color_cols = [ + "color_html", + "color_r", + "color_g", + "color_b", + "color_a", + "color_name", + ] + rgb_cols = ["color_r", "color_g", "color_b"] present_cols = [c for c in color_cols if c in res.columns] if color_format is not None and len(present_cols) > 0: - res['color'] = pd.NA - has_html = 'color_html' in res.columns - has_name = 'color_name' in res.columns - has_rgb = all(col in res.columns for col in rgb_cols) - has_rgba = has_rgb and 'color_a' in res.columns + res["color"] = pd.NA + has_html = "color_html" in res.columns + has_name = "color_name" in res.columns + has_rgb = all(col in res.columns for col in rgb_cols) + has_rgba = has_rgb and "color_a" in res.columns def tuple_or_na(row): if row.isna().all(): return pd.NA return tuple(row) - if color_format == 'html' and has_html: + if color_format == "html" and has_html: res.color = res.color_html - elif color_format == 'name' and has_name: + elif color_format == "name" and has_name: res.color = res.color_name - elif color_format == 'rgb' and has_rgb: + elif color_format == "rgb" and has_rgb: res.color = res[rgb_cols].apply(tuple_or_na, axis=1) - elif color_format == 'rgba' and has_rgba: - res.color = res[rgb_cols + ['color_a']].apply(tuple_or_na, axis=1) + elif color_format == "rgba" and has_rgba: + res.color = res[rgb_cols + ["color_a"]].apply(tuple_or_na, axis=1) elif has_html: res.color = html2format(res, color_format) elif has_name: @@ -322,7 +426,9 @@ def tuple_or_na(row): elif has_rgb: res.color = rgb2format(res, color_format) else: - logger.warning(f"Color format '{color_format}' could not be computed from columns {present_cols}.") + self.logger.warning( + f"Color format '{color_format}' could not be computed from columns {present_cols}." 
+ ) res.drop(columns=present_cols, inplace=True) if self.mscx_obj is not None: @@ -330,8 +436,19 @@ def tuple_or_na(row): return res @cache - def expand_dcml(self, drop_others=True, warn_about_others=True, drop_empty_cols=False, chord_tones=True, relative_to_global=False, absolute=False, all_in_c=False, **kwargs): - """ Expands all labels where the regex_match has been inferred as 'dcml' and stores the DataFrame in self._expanded. + def expand_dcml( + self, + drop_others=True, + warn_about_others=True, + drop_empty_cols=False, + chord_tones=True, + relative_to_global=False, + absolute=False, + all_in_c=False, + **kwargs, + ): + """Expands all labels where the regex_match has been inferred as 'dcml' and stores the DataFrame in + self._expanded. Parameters ---------- @@ -369,27 +486,48 @@ def expand_dcml(self, drop_others=True, warn_about_others=True, drop_empty_cols= :obj:`pandas.DataFrame` Expanded DCML labels """ - if 'dcml' not in self.regex_dict: + if "dcml" not in self.regex_dict: self.regex_dict = dict(dcml=DCML_DOUBLE_REGEX, **self.regex_dict) self.infer_types() df = self.get_labels(**kwargs) - select_dcml = (df.regex_match == 'dcml').fillna(False) + select_dcml = (df.regex_match == "dcml").fillna(False) if not select_dcml.any(): - self.logger.info(f"Score does not contain any DCML harmonic annotations.") + self.logger.info("Score does not contain any DCML harmonic annotations.") return if not drop_others: warn_about_others = False if warn_about_others and (~select_dcml).any(): - self.logger.warning(f"Score contains {(~select_dcml).sum()} labels that don't (and {select_dcml.sum()} that do) match the DCML standard:\n{decode_harmonies(df[~select_dcml], keep_layer=True, logger=self.logger)[['mc', 'mn', 'label', 'harmony_layer']].to_string()}", - extra={"message_id": (15, )}) + show_labels = decode_harmonies( + df[~select_dcml], keep_layer=True, logger=self.logger + )[["mc", "mn", "label", "harmony_layer"]].to_string() + self.logger.warning( + f"Score contains 
{(~select_dcml).sum()} labels that don't (and {select_dcml.sum()} that do) match the " + f"DCML standard:\n{show_labels}", + extra={"message_id": (15,)}, + ) df = df[select_dcml] try: - exp = expand_labels(df, column='label', regex=DCML_REGEX, volta_structure=self.volta_structure, chord_tones=chord_tones, relative_to_global=relative_to_global, absolute=absolute, all_in_c=all_in_c, logger=self.logger) + exp = expand_labels( + df, + column="label", + regex=DCML_REGEX, + volta_structure=self.volta_structure, + chord_tones=chord_tones, + relative_to_global=relative_to_global, + absolute=absolute, + all_in_c=all_in_c, + logger=self.logger, + ) if drop_others: self._expanded = exp else: df = self.df.copy() - key_cols = ['globalkey', 'localkey', 'globalkey_is_minor', 'localkey_is_minor'] + key_cols = [ + "globalkey", + "localkey", + "globalkey_is_minor", + "localkey_is_minor", + ] with warnings.catch_warnings(): # Setting values in-place is fine, ignore the warning in Pandas >= 1.5.0 # This can be removed, if Pandas 1.5.0 does not need to be supported any longer. 
@@ -403,84 +541,118 @@ def expand_dcml(self, drop_others=True, warn_about_others=True, drop_empty_cols= ), ) df.loc[select_dcml, exp.columns] = exp - df.loc[:, key_cols] = df[key_cols].fillna(method='ffill') + df.loc[:, key_cols] = df[key_cols].fillna(method="ffill") self._expanded = df - drop_cols = [col for col in ('harmony_layer', 'regex_match') if col in df.columns] + drop_cols = [ + col for col in ("harmony_layer", "regex_match") if col in df.columns + ] if len(drop_cols) > 0: self._expanded.drop(columns=drop_cols, inplace=True) except Exception: - self.logger.error(f"Expanding labels failed with the following error:\n{sys.exc_info()[1]}") + self.logger.error( + f"Expanding labels failed with the following error:\n{sys.exc_info()[1]}" + ) if drop_empty_cols: - return self._expanded.dropna(axis=1, how='all') + return self._expanded.dropna(axis=1, how="all") return self._expanded - - def infer_mc_from_mn(self, mscx_obj=None): if mscx_obj is None and self.mscx_obj is None: - self.logger.error(f"Either pass an MSCX object or load this Annotations object to a score using load_annotations().") + self.logger.error( + "Either pass an MSCX object or load this Annotations object to a score using load_annotations()." 
+ ) return False mscx = mscx_obj if mscx_obj is not None else self.mscx_obj - column_names = [self.cols[c] if c in self.cols else c for c in ['mn', 'mn_onset', 'volta']] + column_names = [ + self.cols[c] if c in self.cols else c for c in ["mn", "mn_onset", "volta"] + ] cols = [c for c in column_names if c in self.df.columns] - inferred_positions = [mscx.infer_mc(**dict(zip(cols, t))) for t in self.df[cols].values] - return pd.DataFrame(inferred_positions, index=self.df.index, columns=['mc', 'mc_onset']) - - - + inferred_positions = [ + mscx.infer_mc(**dict(zip(cols, t))) for t in self.df[cols].values + ] + return pd.DataFrame( + inferred_positions, index=self.df.index, columns=["mc", "mc_onset"] + ) def infer_types(self, regex_dict=None): - if 'harmony_layer' not in self.df.columns: - harmony_layer_col = pd.Series(0, index=self.df.index, dtype='object', name='harmony_layer') + if "harmony_layer" not in self.df.columns: + harmony_layer_col = pd.Series( + 0, index=self.df.index, dtype="object", name="harmony_layer" + ) self.df = pd.concat([self.df, harmony_layer_col], axis=1) - if 'nashville' in self.df.columns: - self.df.loc[self.df.nashville.notna(), 'harmony_layer'] = 2 - if 'absolute_root' in self.df.columns: - self.df.loc[self.df.absolute_root.notna(), 'harmony_layer'] = 3 - + if "nashville" in self.df.columns: + self.df.loc[self.df.nashville.notna(), "harmony_layer"] = 2 + if "absolute_root" in self.df.columns: + self.df.loc[self.df.absolute_root.notna(), "harmony_layer"] = 3 if regex_dict is None: regex_dict = self.regex_dict if len(regex_dict) > 0: - decoded = decode_harmonies(self.df, label_col=self.cols['label'], return_series=True, logger=self.logger) + decoded = decode_harmonies( + self.df, + label_col=self.cols["label"], + return_series=True, + logger=self.logger, + ) sel = decoded.notna() if not sel.any(): self.logger.info(f"No labels present: {self.df}") return - if 'regex_match' not in self.df.columns and sel.any(): - regex_col = 
pd.Series(index=self.df.index, dtype='object') - column_position = self.df.columns.get_loc('harmony_layer') + 1 - self.df.insert(column_position, 'regex_match', regex_col) + if "regex_match" not in self.df.columns and sel.any(): + regex_col = pd.Series(index=self.df.index, dtype="object") + column_position = self.df.columns.get_loc("harmony_layer") + 1 + self.df.insert(column_position, "regex_match", regex_col) for name, regex in regex_dict.items(): - # TODO: Check if in the loop, previously matched regex names are being overwritten by those matched after + # TODO: Check if in the loop, previously matched regex names are being overwritten by those matched + # after try: mtch = decoded[sel].str.match(regex) except AttributeError: - self.logger.warning(f"Couldn't match regex against these labels: {decoded[sel]}") + self.logger.warning( + f"Couldn't match regex against these labels: {decoded[sel]}" + ) raise - self.df.loc[sel & mtch, 'regex_match'] = name - + self.df.loc[sel & mtch, "regex_match"] = name def remove_initial_dots(self): if self.read_only: - self.logger.warning(f"Cannot change labels attached to a score. Detach them first.") + self.logger.warning( + "Cannot change labels attached to a score. Detach them first." + ) return - label_col = self.cols['label'] - starts_with_dot = self.df[label_col].str[0] == '.' - self.df.loc[starts_with_dot, label_col] = self.df.loc[starts_with_dot, label_col].str[1:] - - - def store_tsv(self, tsv_path, staff=None, voice=None, harmony_layer=None, positioning=True, decode=False, sep='\t', index=False, **kwargs): - df = self.get_labels(staff=staff, voice=voice, harmony_layer=harmony_layer, positioning=positioning, decode=decode) - if decode and 'harmony_layer' in df.columns: - df.drop(columns='harmony_layer', inplace=True) + label_col = self.cols["label"] + starts_with_dot = self.df[label_col].str[0] == "." 
+ self.df.loc[starts_with_dot, label_col] = self.df.loc[ + starts_with_dot, label_col + ].str[1:] + + def store_tsv( + self, + tsv_path, + staff=None, + voice=None, + harmony_layer=None, + positioning=True, + decode=False, + sep="\t", + index=False, + **kwargs, + ): + df = self.get_labels( + staff=staff, + voice=voice, + harmony_layer=harmony_layer, + positioning=positioning, + decode=decode, + ) + if decode and "harmony_layer" in df.columns: + df.drop(columns="harmony_layer", inplace=True) df.to_csv(resolve_dir(tsv_path), sep=sep, index=index, **kwargs) self.logger.info(f"{len(df)} labels written to {tsv_path}.") return True - def _treat_harmony_layer_param(self, harmony_layer, warnings=True): if harmony_layer is None: return None @@ -492,7 +664,9 @@ def _treat_harmony_layer_param(self, harmony_layer, warnings=True): not_found = [t for t in lt if t not in all_types] if len(not_found) > 0: plural = len(not_found) > 1 - plural_s = 's' if plural else '' + plural_s = "s" if plural else "" self.logger.warning( - f"No labels found with {'these' if plural else 'this'} label{plural_s} harmony_layer{plural_s}: {', '.join(not_found)}") - return [all_types[t] for t in lt if t in all_types] \ No newline at end of file + f"No labels found with {'these' if plural else 'this'} label{plural_s} harmony_layer{plural_s}: " + f"{', '.join(not_found)}" + ) + return [all_types[t] for t in lt if t in all_types] diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index 8886e15a..5b8449fc 100644 --- a/src/ms3/dezrann.py +++ b/src/ms3/dezrann.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -""" +""" DCML to Dezrann =============== @@ -10,7 +10,7 @@ # Intro The script presents a first application of what is to become a formal standard of a "measure map"; -see first discussion points at +see first discussion points at * https://gitlab.com/algomus.fr/dezrann/dezrann/-/issues/1030#note_1122509147) * https://github.com/MarkGotham/bar-measure/ @@ -18,10 +18,10 @@ As an early 
proxy of a measure map, the current version uses the measure tables that each DCML corpus provides in its `measures` folder. This is beneficial in the current context because: -1. The files are required for correct, actionable quarter-note positions without having to re-parse +1. The files are required for correct, actionable quarter-note positions without having to re-parse the entire score. 2. The files play an essential role for validating the conversion output. -3. They help avoiding the confusion that necessarily arises when several addressing schemes are +3. They help avoiding the confusion that necessarily arises when several addressing schemes are at play. In detail: @@ -30,22 +30,22 @@ From a technical perspective, offsets in the sense of "distance from the origin" represent the primary mechanism of referencing positions in a text (character counts being the default in NLP). -Music scores are typically aligned with a time line of "musical time", an alignment which is -frequently expressed as float values representing an event's distance from the score's beginning, +Music scores are typically aligned with a time line of "musical time", an alignment which is +frequently expressed as float values representing an event's distance from the score's beginning, measured in quarter notes, here referred to as quarterbeats. The fundamental problem, however, is ensuring that quarterbeat positions refer to the same time line. The commonplace score encoding formats do not indicate quarterbeat positions. Instead, they structure -musical time in a sequence of containers, generally called "measures", each of which represents -a time line starting from 0. Counting measure units (of some kind) therefore represents the second +musical time in a sequence of containers, generally called "measures", each of which represents +a time line starting from 0. 
Counting measure units (of some kind) therefore represents the second prevalent way of indicating positions in a score, together with an event onset indicating an -event's distance from the container's beginning. To avoid terminological confusion, we call +event's distance from the container's beginning. To avoid terminological confusion, we call the distance from the beginning of a measure container "onset". Looking at a single score, there is an unambiguous mapping between the two types of positions: `event_offset = measure_offset + event_onset`. Problems arise, however when information from one score is to be set into relation with timed information from another source. This is a wide-spread problem in the context of music research and musical corpus studies where data from different -sources with different ways of expressing timestamps frequently needs to be aligned, often in +sources with different ways of expressing timestamps frequently needs to be aligned, often in absence of the original score that one of the source is aligned to. Currently, there is no standardized way of storing such alignments for later re-use. Hence the idea of a central mapping file for storing alignments between positions given as quarterbeats, measure+onset, @@ -54,17 +54,17 @@ **Different types of quarterbeats** All TSV files issued by the DCML come with the column `quarterbeats` indicating every event's -offset from the score's beginning (position 0). With the caveat that, in the case of first/second endings +offset from the score's beginning (position 0). With the caveat that, in the case of first/second endings ("voltas"), the indicated values do not take into account any but the second ending, with the rationale that they should represent the temporal proportion of a single playthrough without any repetitions. For correct conversion, therefore, using a strict, measuring-stick-based variant -of `quarterbeats` will probably be useful. 
This means that the default `quarterbeats` should be +of `quarterbeats` will probably be useful. This means that the default `quarterbeats` should be ignored (unless first endings are to be categorically excluded) in favour of a -`quarterbeats_all_endings` column. Since the DCML measure maps already come with columns of both -names, the simple formula mentioned above `quarterbeats = quarterbeats(measure) + event_onset` +`quarterbeats_all_endings` column. Since the DCML measure maps already come with columns of both +names, the simple formula mentioned above `quarterbeats = quarterbeats(measure) + event_onset` has its analogue `quarterbeats_all_measures = quarterbeats_all_measures(measure) + event_onset`. -Input: DataFrame containing DCML harmony labels as output via the command `ms3 extract -X` +Input: DataFrame containing DCML harmony labels as output via the command `ms3 extract -X` (X for 'expanded'), stored by default in a folder called 'harmonies'. Using these TSV files ensures using only valid DCML labels but in principle this script can be used for converting labels of all kinds as long as they come in the specified tabular format. @@ -73,20 +73,20 @@ Going from a `DcmlLabel` dictionary to a `DezrannLabel` dictionary is straightforward because they exchange positions as quarterbeats. Validation, on the other hand, requires relating -the output .dez format with the converted score which it is layed over in Dezrann. In the +the output .dez format with the converted score which it is layed over in Dezrann. In the interface, positions are shown to the user in terms of `measure_count + event_onset`. Extracting -this information and comparing it to the one in the original TSVs will +this information and comparing it to the one in the original TSVs will Columns: * `mc`: measure count (XML measures, always starting from 1) -* +* Output: -JSON Dezrann file (.dez) containing all the harmony labels, aligned with the score. 
+JSON Dezrann file (.dez) containing all the harmony labels, aligned with the score. Here is an example of Dezrann file structure: ''' { @@ -101,26 +101,22 @@ import argparse import json import os -from typing import Dict, List, TypedDict, Union, Tuple, Optional, TypeAlias, Literal - from fractions import Fraction +from typing import Dict, List, Literal, Optional, Tuple, TypeAlias, TypedDict, Union + import pandas as pd -LINE_VALUES = { - 1: "top.1", - 2: "top.2", - 3: "top.3", - 4: "bot.1", - 5: "bot.2", - 6: "bot.3" -} +LINE_VALUES = {1: "top.1", 2: "top.2", 3: "top.3", 4: "bot.1", 5: "bot.2", 6: "bot.3"} """The six annotation layers of the Dezrann app, three above ('top') and three below ('bot') the score.""" DEZ_LINE_ARGS = (0, 1, 2, 3, 4, 5, 6, -1, -2, -3) -DezrannLayer: TypeAlias = Literal["top.1", "top.2", "top.3", "bot.1", "bot.2", "bot.3", 1, 2, 3, 4, 5, 6, -1, -2, -3] +DezrannLayer: TypeAlias = Literal[ + "top.1", "top.2", "top.3", "bot.1", "bot.2", "bot.3", 1, 2, 3, 4, 5, 6, -1, -2, -3 +] """More expressive than simply annotating with 'str'.""" + def transform_line_argument(line: Optional[Union[int, str]]) -> Optional[str]: """Takes a number between -3 and 6 and turns it into one of the possible Dezrann line values. 0 is interpreted as None. 
-1, -2, -3 correspond to 4, 5, 6""" @@ -137,28 +133,35 @@ def transform_line_argument(line: Optional[Union[int, str]]) -> Optional[str]: line = abs(line) + 3 return LINE_VALUES[line] + def safe_frac(s: str) -> Union[Fraction, str]: try: return Fraction(s) except Exception: return s + class DezrannLabel(TypedDict): """Represents one label in a .dez file.""" + type: str start: float duration: float - #line: str # Determined by the meta-layout + # line: str # Determined by the meta-layout tag: str layers: List[str] + class DezrannDict(TypedDict): """Represents one .dez file.""" + labels: List[DezrannLabel] meta: Dict + class DcmlLabel(TypedDict): """Represents one label from a TSV annotation file""" + quarterbeats: float duration: float label: str @@ -172,7 +175,9 @@ def get_volta_groups(mc2volta: pd.Series) -> List[List[int]]: """ volta_groups = [] filled_volta_col = mc2volta.fillna(-1).astype(int) - volta_segmentation = (filled_volta_col != filled_volta_col.shift()).fillna(True).cumsum() + volta_segmentation = ( + (filled_volta_col != filled_volta_col.shift()).fillna(True).cumsum() + ) current_groups_first_mcs = [] for i, segment in filled_volta_col.groupby(volta_segmentation): volta_number = segment.iloc[0] @@ -181,7 +186,9 @@ def get_volta_groups(mc2volta: pd.Series) -> List[List[int]]: if i == 1: continue elif len(current_groups_first_mcs) == 0: - raise RuntimeError(f"Mistake in the algorithm when processing column {filled_volta_col.volta}") + raise RuntimeError( + f"Mistake in the algorithm when processing column {filled_volta_col.volta}" + ) else: volta_groups.append(current_groups_first_mcs) current_groups_first_mcs = [] @@ -190,10 +197,12 @@ def get_volta_groups(mc2volta: pd.Series) -> List[List[int]]: current_groups_first_mcs.append(first_mc) return volta_groups -def dcml_labels2dicts(labels: pd.DataFrame, - measures: pd.DataFrame, - label_column: str = 'label', - ) -> List[DcmlLabel]: + +def dcml_labels2dicts( + labels: pd.DataFrame, + measures: 
pd.DataFrame, + label_column: str = "label", +) -> List[DcmlLabel]: """ Parameters @@ -211,8 +220,8 @@ def dcml_labels2dicts(labels: pd.DataFrame, 'phraseend': str} and no missing values. measures: - (optional) Dataframe as found in the 'measures' folder of a DCML corpus for computing quarterbeats for pieces with - voltas. Requires the columns {'mc': int, 'quarterbeats_all_endings': fractions.Fraction} (ms3 >= 1.0.0). + (optional) Dataframe as found in the 'measures' folder of a DCML corpus for computing quarterbeats for pieces + with voltas. Requires the columns {'mc': int, 'quarterbeats_all_endings': fractions.Fraction} (ms3 >= 1.0.0). label_column: {'label', 'chord', 'cadence', 'phraseend'} The column that is to be used as label string. Defaults to 'label'. @@ -224,7 +233,9 @@ def dcml_labels2dicts(labels: pd.DataFrame, last_mc_row = measures.iloc[-1] end_of_score = float(last_mc_row.act_dur) * 4.0 if not score_has_voltas: - assert "quarterbeats" in labels.columns, f"Labels are lacking 'quarterbeats' column: {labels.columns}" + assert ( + "quarterbeats" in labels.columns + ), f"Labels are lacking 'quarterbeats' column: {labels.columns}" quarterbeats = labels["quarterbeats"] end_of_score += float(last_mc_row.quarterbeats) else: @@ -233,62 +244,80 @@ def dcml_labels2dicts(labels: pd.DataFrame, end_of_score += float(last_mc_row.quarterbeats_all_endings) M = measures.set_index("mc") offset_dict = M["quarterbeats_all_endings"] - quarterbeats = labels['mc'].map(offset_dict) + quarterbeats = labels["mc"].map(offset_dict) quarterbeats = quarterbeats + (labels.mc_onset * 4.0) - quarterbeats.rename('quarterbeats', inplace=True) + quarterbeats.rename("quarterbeats", inplace=True) # also, the first beat of each volta needs to have a label for computing correct durations volta_groups = get_volta_groups(M.volta) - label_and_qb = pd.concat([labels[label_column].rename('label'), quarterbeats.astype(float)], axis=1) + label_and_qb = pd.concat( + 
[labels[label_column].rename("label"), quarterbeats.astype(float)], axis=1 + ) n_before = len(labels.index) - if label_column == 'phraseend': - label_and_qb = label_and_qb[label_and_qb.label.fillna('').str.contains('{')] - if label_column == 'localkey': - label_and_qb = label_and_qb[label_and_qb.label != label_and_qb.label.shift().fillna(True)] - else: # {'chord', 'cadence', 'label'} + if label_column == "phraseend": + label_and_qb = label_and_qb[label_and_qb.label.fillna("").str.contains("{")] + if label_column == "localkey": + label_and_qb = label_and_qb[ + label_and_qb.label != label_and_qb.label.shift().fillna(True) + ] + else: # {'chord', 'cadence', 'label'} label_and_qb = label_and_qb[label_and_qb.label.notna()] n_after = len(label_and_qb.index) - print(f"Creating labels for {n_after} {label_column} labels out of {n_before} rows.") - if label_column == 'cadence': - duration = pd.Series(0.0, dtype=float, index=label_and_qb.index, name='duration') + print( + f"Creating labels for {n_after} {label_column} labels out of {n_before} rows." 
+ ) + if label_column == "cadence": + duration = pd.Series( + 0.0, dtype=float, index=label_and_qb.index, name="duration" + ) else: if score_has_voltas: for group in volta_groups: - volta_beginnings_quarterbeats = [M.loc[mc, 'quarterbeats_all_endings'] for mc in group] - labels_before_group = label_and_qb.loc[label_and_qb.quarterbeats < volta_beginnings_quarterbeats[0], 'label'] + volta_beginnings_quarterbeats = [ + M.loc[mc, "quarterbeats_all_endings"] for mc in group + ] + labels_before_group = label_and_qb.loc[ + label_and_qb.quarterbeats < volta_beginnings_quarterbeats[0], + "label", + ] for volta_beginning_qb in volta_beginnings_quarterbeats: if volta_beginning_qb in label_and_qb.quarterbeats.values: continue - repeated_label = pd.DataFrame([[labels_before_group.iloc[-1], float(volta_beginning_qb)]], - columns=['label', 'quarterbeats']) - label_and_qb = pd.concat([label_and_qb, repeated_label], ignore_index=True) - label_and_qb = label_and_qb.sort_values('quarterbeats') + repeated_label = pd.DataFrame( + [[labels_before_group.iloc[-1], float(volta_beginning_qb)]], + columns=["label", "quarterbeats"], + ) + label_and_qb = pd.concat( + [label_and_qb, repeated_label], ignore_index=True + ) + label_and_qb = label_and_qb.sort_values("quarterbeats") qb_column = label_and_qb.quarterbeats duration = qb_column.shift(-1).fillna(end_of_score) - qb_column - duration = duration.rename('duration').astype(float) + duration = duration.rename("duration").astype(float) transformed_df = pd.concat([label_and_qb, duration], axis=1) - return transformed_df.to_dict(orient='records') + return transformed_df.to_dict(orient="records") + def make_dezrann_label( - label_type: str, - quarterbeats: float, - duration: float, - label: str, - origin: Union[str, Tuple[str]]) -> DezrannLabel: + label_type: str, + quarterbeats: float, + duration: float, + label: str, + origin: Union[str, Tuple[str]], +) -> DezrannLabel: if isinstance(origin, str): layers = [origin] else: layers = list(origin) 
return DezrannLabel( - type=label_type, - start=quarterbeats, - duration=duration, - tag=label, - layers=layers + type=label_type, start=quarterbeats, duration=duration, tag=label, layers=layers ) -def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel], - label_type: str, - origin: Union[str, Tuple[str]] = "DCML") -> DezrannDict: + +def convert_dcml_list_to_dezrann_list( + values_dict: List[DcmlLabel], + label_type: str, + origin: Union[str, Tuple[str]] = "DCML", +) -> DezrannDict: dezrann_label_list = [] for e in values_dict: dezrann_label_list.append( @@ -297,18 +326,20 @@ def convert_dcml_list_to_dezrann_list(values_dict: List[DcmlLabel], quarterbeats=e["quarterbeats"], duration=e["duration"], label=e["label"], - origin=origin + origin=origin, ) ) return dezrann_label_list + def make_layout( - cadences: bool = False, - harmonies: Optional[DezrannLayer] = None, - keys: Optional[DezrannLayer] = None, - phrases: Optional[DezrannLayer] = None, - raw: Optional[DezrannLayer] = None): + cadences: bool = False, + harmonies: Optional[DezrannLayer] = None, + keys: Optional[DezrannLayer] = None, + phrases: Optional[DezrannLayer] = None, + raw: Optional[DezrannLayer] = None, +): """ Compile the line positions for target labels into Dezrann layout parameter. """ @@ -327,17 +358,18 @@ def make_layout( return layout - -def generate_dez_from_dfs(measures_df: pd.DataFrame, - harmonies_df: pd.DataFrame, - output_path: str, - cadences: bool = False, - harmonies: Optional[DezrannLayer] = None, - keys: Optional[DezrannLayer] = None, - phrases: Optional[DezrannLayer] = None, - raw: Optional[DezrannLayer] = None, - origin: Union[str, Tuple[str]] = "DCML") -> bool: - """ Create a .dez file from a measures and a labels/expanded dataframe. 
+def generate_dez_from_dfs( + measures_df: pd.DataFrame, + harmonies_df: pd.DataFrame, + output_path: str, + cadences: bool = False, + harmonies: Optional[DezrannLayer] = None, + keys: Optional[DezrannLayer] = None, + phrases: Optional[DezrannLayer] = None, + raw: Optional[DezrannLayer] = None, + origin: Union[str, Tuple[str]] = "DCML", +) -> bool: + """Create a .dez file from a measures and a labels/expanded dataframe. Args: measures_df: @@ -353,54 +385,66 @@ def generate_dez_from_dfs(measures_df: pd.DataFrame, Returns: True if a .dez file was written. """ - annotation_layer_arguments = {arg: transform_line_argument(arg_val) for arg, arg_val in zip(("harmonies", "keys", "phrases", "raw"), (harmonies, keys, phrases, raw))} - parameters = {arg: arg_val is not None for arg, arg_val in annotation_layer_arguments.items()} - parameters['cadences'] = cadences + annotation_layer_arguments = { + arg: transform_line_argument(arg_val) + for arg, arg_val in zip( + ("harmonies", "keys", "phrases", "raw"), (harmonies, keys, phrases, raw) + ) + } + parameters = { + arg: arg_val is not None for arg, arg_val in annotation_layer_arguments.items() + } + parameters["cadences"] = cadences if not any(parameters.values()): - print(f"Nothing to do because no features have been selected.") + print("Nothing to do because no features have been selected.") return False dezrann_labels = [] - if cadences and 'cadence' in harmonies_df.columns: - dcml_labels = dcml_labels2dicts(labels=harmonies_df, measures=measures_df, label_column='cadence') - dezrann_labels += convert_dcml_list_to_dezrann_list(dcml_labels, label_type="Cadence", origin=origin) - for arg, label_column, label_type in ((harmonies, "chord", "Harmony"), # Third argument - (keys, "localkey", "Local Key"), - (phrases, "phraseend", "Phrase"), - (raw, "label", "Harmony")): + if cadences and "cadence" in harmonies_df.columns: + dcml_labels = dcml_labels2dicts( + labels=harmonies_df, measures=measures_df, label_column="cadence" + ) + 
dezrann_labels += convert_dcml_list_to_dezrann_list( + dcml_labels, label_type="Cadence", origin=origin + ) + for arg, label_column, label_type in ( + (harmonies, "chord", "Harmony"), # Third argument + (keys, "localkey", "Local Key"), + (phrases, "phraseend", "Phrase"), + (raw, "label", "Harmony"), + ): if arg is not None: - dcml_labels = dcml_labels2dicts(labels=harmonies_df, measures=measures_df, label_column=label_column) + dcml_labels = dcml_labels2dicts( + labels=harmonies_df, measures=measures_df, label_column=label_column + ) dezrann_labels += convert_dcml_list_to_dezrann_list( - dcml_labels, - label_type=label_type, - origin=origin + dcml_labels, label_type=label_type, origin=origin ) if len(dezrann_labels) == 0: - print(f"{output_path} not written because no labels correspond to the parameters: {parameters}") + print( + f"{output_path} not written because no labels correspond to the parameters: {parameters}" + ) return False layout = make_layout( - cadences=cadences, - harmonies=harmonies, - keys=keys, - phrases=phrases, - raw=raw + cadences=cadences, harmonies=harmonies, keys=keys, phrases=phrases, raw=raw ) dezrann_content = DezrannDict(labels=dezrann_labels, meta={"layout": layout}) - with open(output_path, 'w', encoding='utf-8') as f: + with open(output_path, "w", encoding="utf-8") as f: json.dump(dezrann_content, f, indent=2) return True - -def generate_dez(path_measures: str, - path_labels: str, - output_path: str, - cadences: bool = False, - harmonies: Optional[DezrannLayer] = None, - keys: Optional[DezrannLayer] = None, - phrases: Optional[DezrannLayer] = None, - raw: Optional[DezrannLayer] = None, - origin: Union[str, Tuple[str]] = "DCML") -> bool: - """ Create a .dez file from a path to a measures TSV file and a path to a labels/expanded TSV file. 
+def generate_dez( + path_measures: str, + path_labels: str, + output_path: str, + cadences: bool = False, + harmonies: Optional[DezrannLayer] = None, + keys: Optional[DezrannLayer] = None, + phrases: Optional[DezrannLayer] = None, + raw: Optional[DezrannLayer] = None, + origin: Union[str, Tuple[str]] = "DCML", +) -> bool: + """Create a .dez file from a path to a measures TSV file and a path to a labels/expanded TSV file. Args: path_measures: Path to a DCML measures TSV file. @@ -421,37 +465,58 @@ def generate_dez(path_measures: str, """ try: harmonies_df = pd.read_csv( - path_labels, sep='\t', - converters={'mc': int, - 'mc_onset': safe_frac, - 'quarterbeats': safe_frac, - } + path_labels, + sep="\t", + converters={ + "mc": int, + "mc_onset": safe_frac, + "quarterbeats": safe_frac, + }, ) except (ValueError, AssertionError, FileNotFoundError) as e: - raise ValueError(f"{path_labels} could not be loaded as a measure map because of the following error:\n'{e}'") + raise ValueError( + f"{path_labels} could not be loaded as a measure map because of the following error:\n'{e}'" + ) try: measures_df = pd.read_csv( - path_measures, sep='\t', - dtype={'mc': int, 'volta': 'Int64'}, - converters={'quarterbeats_all_endings': safe_frac, - 'quarterbeats': safe_frac, - 'act_dur': safe_frac} + path_measures, + sep="\t", + dtype={"mc": int, "volta": "Int64"}, + converters={ + "quarterbeats_all_endings": safe_frac, + "quarterbeats": safe_frac, + "act_dur": safe_frac, + }, ) except (ValueError, AssertionError, FileNotFoundError) as e: - raise ValueError(f"{path_measures} could not be loaded as a measure map because of the following error:\n'{e}'") + raise ValueError( + f"{path_measures} could not be loaded as a measure map because of the following error:\n'{e}'" + ) - return generate_dez_from_dfs(measures_df, harmonies_df, output_path, cadences, harmonies, keys, phrases, raw, origin) + return generate_dez_from_dfs( + measures_df, + harmonies_df, + output_path, + cadences, + 
harmonies, + keys, + phrases, + raw, + origin, + ) -def main(input_dir: str, - measures_dir: str, - output_dir: str, - cadences: bool = False, - harmonies: Optional[DezrannLayer] = None, - keys: Optional[DezrannLayer] = None, - phrases: Optional[DezrannLayer] = None, - raw: Optional[DezrannLayer] = None) -> None: - """ Main function for using this module as a script. It gathers file paths and converts the detected DCML-style +def main( + input_dir: str, + measures_dir: str, + output_dir: str, + cadences: bool = False, + harmonies: Optional[DezrannLayer] = None, + keys: Optional[DezrannLayer] = None, + phrases: Optional[DezrannLayer] = None, + raw: Optional[DezrannLayer] = None, +) -> None: + """Main function for using this module as a script. It gathers file paths and converts the detected DCML-style labels/expanded TSV file to .dez format. Args: @@ -469,9 +534,9 @@ def main(input_dir: str, """ if not cadences and all(arg is None for arg in (harmonies, keys, phrases, raw)): - print(f"Nothing to do because no features have been selected.") + print("Nothing to do because no features have been selected.") return - input_files = [f for f in os.listdir(input_dir) if f.endswith('.tsv')] + input_files = [f for f in os.listdir(input_dir) if f.endswith(".tsv")] # measures_files = glob.glob(f"{measures_dir}/*.tsv") harmony_measure_matches = [] for tsv_name in input_files: @@ -504,7 +569,7 @@ def main(input_dir: str, harmonies=harmonies, keys=keys, phrases=phrases, - raw=raw + raw=raw, ) print(f"{output_file_path} successfully written.") except Exception as e: @@ -512,27 +577,27 @@ def main(input_dir: str, print(f"Done. Created {created_files} .dez files.") - def resolve_dir(d): - """ Resolves '~' to HOME directory and turns ``d`` into an absolute path. 
- """ + """Resolves '~' to HOME directory and turns ``d`` into an absolute path.""" if d is None: return None d = str(d) - if '~' in d: + if "~" in d: return os.path.expanduser(d) return os.path.abspath(d) - + def process_arguments(args: argparse.Namespace) -> dict: """Transforms the user's input arguments into keyword arguments for :func:`main` or raises a ValueError.""" input_dir = resolve_dir(args.dir) assert os.path.isdir(input_dir), f"{args.dir} is not an existing directory." if args.measures is None: - measures_dir = os.path.abspath(os.path.join(input_dir, '..', 'measures')) + measures_dir = os.path.abspath(os.path.join(input_dir, "..", "measures")) if not os.path.isdir(measures_dir): - raise ValueError(f"No directory with measure maps was specified and the default path " - f"{measures_dir} does not exist.") + raise ValueError( + f"No directory with measure maps was specified and the default path " + f"{measures_dir} does not exist." + ) else: measures_dir = resolve_dir(args.measures) if not os.path.isdir(measures_dir): @@ -543,12 +608,8 @@ def process_arguments(args: argparse.Namespace) -> dict: output_dir = resolve_dir(args.out) if not os.path.isdir(output_dir): raise ValueError(f"{output_dir} is not an existing directory.") - kwargs = dict( - input_dir=input_dir, - measures_dir=measures_dir, - output_dir=output_dir - ) - line_args = ('harmonies', 'keys', 'phrases', 'raw') + kwargs = dict(input_dir=input_dir, measures_dir=measures_dir, output_dir=output_dir) + line_args = ("harmonies", "keys", "phrases", "raw") transformed_line_args = {} for arg in line_args: arg_val = getattr(args, arg) @@ -559,79 +620,106 @@ def process_arguments(args: argparse.Namespace) -> dict: continue transformed_line_args[arg] = line_arg if len(set(transformed_line_args.values())) < len(transformed_line_args.values()): - selected_args = {arg: f"'{getattr(args, arg)}' => {arg_val}" for arg, arg_val in transformed_line_args.items()} - raise ValueError(f"You selected the same 
annotation layer more than once: {selected_args}.") + selected_args = { + arg: f"'{getattr(args, arg)}' => {arg_val}" + for arg, arg_val in transformed_line_args.items() + } + raise ValueError( + f"You selected the same annotation layer more than once: {selected_args}." + ) kwargs.update(transformed_line_args) if args.cadences: - kwargs['cadences'] = True + kwargs["cadences"] = True print(kwargs) return kwargs def run(): - parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, - description='''\ + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="""\ ----------------------------- | DCML => Dezrann converter | ----------------------------- -This script converts DCML harmony annotations into the .dez JSON format used by the dezrann.net app. It is +This script converts DCML harmony annotations into the .dez JSON format used by the dezrann.net app. It is standalone and does not require ms3 to be installed. Its only requirement is pandas. -Apart from that, the script requires that you have previously extracted both harmonies and measures from the -annotated scores or that you are converting a DCML corpus (https://github.com/DCMLab/dcml_corpora), +Apart from that, the script requires that you have previously extracted both harmonies and measures from the +annotated scores or that you are converting a DCML corpus (https://github.com/DCMLab/dcml_corpora), where both facets are provided by default. In order to (re-) extract the labels, use the command: ms3 extract -X -M Or, if you want to convert other harmony or chord labels from your MuseScore files, use -L for labels. ms3 extract -h will show you all options. -''') - parser.add_argument("dir", metavar='IN_DIR', - help='Folder that will be scanned for TSV files to convert. Defaults to current working directory. 
' - 'Sub-directories are not taken into account.') - parser.add_argument('-m', '--measures', metavar='MEASURES_DIR', - help="Folder in which to look for the corrsponding measure maps. By default, the script will try " - "to find a sibling to the source dir called 'measures'.") - parser.add_argument('-o', '--out', metavar='OUT_DIR', - help='Output directory for .dez files. Defaults to the input directory.') - parser.add_argument('-C', - '--cadences', - action="store_true", - help="Pass this flag if you want to add time-point cadence labels to the .dez files." - ) +""", + ) + parser.add_argument( + "dir", + metavar="IN_DIR", + help="Folder that will be scanned for TSV files to convert. Defaults to current working directory. " + "Sub-directories are not taken into account.", + ) + parser.add_argument( + "-m", + "--measures", + metavar="MEASURES_DIR", + help="Folder in which to look for the corrsponding measure maps. By default, the script will try " + "to find a sibling to the source dir called 'measures'.", + ) + parser.add_argument( + "-o", + "--out", + metavar="OUT_DIR", + help="Output directory for .dez files. Defaults to the input directory.", + ) + parser.add_argument( + "-C", + "--cadences", + action="store_true", + help="Pass this flag if you want to add time-point cadence labels to the .dez files.", + ) possible_line_arguments = tuple(str(i) for i in DEZ_LINE_ARGS) - parser.add_argument('-H', - '--harmonies', - metavar="{0-6}, default: 4", - default="4", - choices=possible_line_arguments, - help="By default, harmony annotations will be set on the first line under the system (layer " - "4 out of 6). Pick another layer or pass 0 to not add harmonies." - ) - parser.add_argument('-K', - '--keys', - metavar="{0-6}, default: 5", - default="5", - choices=possible_line_arguments, - help="By default, local key segments will be set on the second line under the system (layer " - "5 out of 6). Pick another layer or pass 0 to not add key segments. 
Note, however, " - "that harmonies are underdetermined without their local key.") - parser.add_argument('-P', - '--phrases', - metavar="{0-6}, default: 6", - default="6", - choices=possible_line_arguments, - help="By default, phrase annotations will be set on the third line under the system (layer " - "6 out of 6). Pick another layer or pass 0 to not add phrases.") - parser.add_argument('--raw', - metavar="{1-6}", - choices=possible_line_arguments, - help="Pass this argument to add a layer with the 'raw' labels, i.e. including local key, " - "cadence and phrase annotations.") + parser.add_argument( + "-H", + "--harmonies", + metavar="{0-6}, default: 4", + default="4", + choices=possible_line_arguments, + help="By default, harmony annotations will be set on the first line under the system (layer " + "4 out of 6). Pick another layer or pass 0 to not add harmonies.", + ) + parser.add_argument( + "-K", + "--keys", + metavar="{0-6}, default: 5", + default="5", + choices=possible_line_arguments, + help="By default, local key segments will be set on the second line under the system (layer " + "5 out of 6). Pick another layer or pass 0 to not add key segments. Note, however, " + "that harmonies are underdetermined without their local key.", + ) + parser.add_argument( + "-P", + "--phrases", + metavar="{0-6}, default: 6", + default="6", + choices=possible_line_arguments, + help="By default, phrase annotations will be set on the third line under the system (layer " + "6 out of 6). Pick another layer or pass 0 to not add phrases.", + ) + parser.add_argument( + "--raw", + metavar="{1-6}", + choices=possible_line_arguments, + help="Pass this argument to add a layer with the 'raw' labels, i.e. 
including local key, " + "cadence and phrase annotations.", + ) args = parser.parse_args() kwargs = process_arguments(args) main(**kwargs) + if __name__ == "__main__": - run() \ No newline at end of file + run() diff --git a/src/ms3/utils/functions.py b/src/ms3/utils/functions.py index d68ba99d..70d2551b 100644 --- a/src/ms3/utils/functions.py +++ b/src/ms3/utils/functions.py @@ -6479,12 +6479,12 @@ def replace_extension(filepath: str, new_extension: str) -> str: return os.path.splitext(filepath)[0] + new_extension - -def get_value_profile_mask(series: pd.Series, - na_values: str = "group", - prevent_merge: bool = False, - logger: Optional[logging.Logger | str] = None - ) -> pd.Series: +def get_value_profile_mask( + series: pd.Series, + na_values: str = "group", + prevent_merge: bool = False, + logger: Optional[logging.Logger | str] = None, +) -> pd.Series: """Turns a Series into a boolean mask indicating those values that are distinct from their predecessors. There are several ways of dealing with NA values. 
@@ -6552,4 +6552,4 @@ def get_value_profile_mask(series: pd.Series, beginnings = beginnings.astype("boolean") if reindex_flag: return beginnings.reindex(series.index) - return beginnings \ No newline at end of file + return beginnings diff --git a/src/ms3/view.py b/src/ms3/view.py index 9b6958b0..d6a1201f 100644 --- a/src/ms3/view.py +++ b/src/ms3/view.py @@ -1,68 +1,92 @@ import os import random import re -from collections import defaultdict, Counter +from collections import Counter, defaultdict from copy import deepcopy -from typing import Collection, Union, Iterable, List, Dict, Iterator, Optional, Tuple, Set, Literal +from typing import ( + Collection, + Dict, + Iterable, + Iterator, + List, + Literal, + Optional, + Set, + Tuple, + Union, +) import numpy as np import numpy.typing as npt -from .score import Score -from ._typing import FileList, Category, Categories -from .utils import File, unpack_json_paths, resolve_paths_argument, resolve_dir +from ._typing import Categories, Category, FileList from .logger import LoggedClass +from .score import Score +from .utils import File, resolve_dir, resolve_paths_argument, unpack_json_paths def empty_counts(): """Array for counting kept items, discarded items, and their sum.""" return np.zeros(3, dtype=int) + class View(LoggedClass): """ Object storing regular expressions and filter lists, storing and keeping track of things filtered out. 
""" + review_regex = "review" categories = ( - 'corpora', - 'folders', - 'pieces', - 'files', - 'suffixes', - 'facets', - 'paths', + "corpora", + "folders", + "pieces", + "files", + "suffixes", + "facets", + "paths", + ) + available_facets = ("scores",) + Score.dataframe_types + ("unknown",) + singular2category: Dict[str, Category] = dict( + zip( + ("corpus", "folder", "piece", "file", "suffix", "facet", "path"), categories + ) ) - available_facets = ('scores',) + Score.dataframe_types + ('unknown',) - singular2category: Dict[str, Category] = dict(zip(('corpus', 'folder', 'piece', 'file', 'suffix', 'facet', 'path'), - categories)) tsv_regex = re.compile(r"\.tsv$", re.IGNORECASE) - convertible_regex = Score.make_extension_regex(native=False, convertible=True, tsv=False) + convertible_regex = Score.make_extension_regex( + native=False, convertible=True, tsv=False + ) registered_regexes = (convertible_regex, review_regex, tsv_regex) - def __init__(self, - view_name: Optional[str] = 'all', - only_metadata_pieces: bool = False, - include_convertible: bool = True, - include_tsv: bool = True, - exclude_review: bool = False, - **logger_cfg - ): - super().__init__(subclass='View', logger_cfg=logger_cfg) + def __init__( + self, + view_name: Optional[str] = "all", + only_metadata_pieces: bool = False, + include_convertible: bool = True, + include_tsv: bool = True, + exclude_review: bool = False, + **logger_cfg, + ): + super().__init__(subclass="View", logger_cfg=logger_cfg) # fields - self._name: str = '' + self._name: str = "" # the two main dicts self.including: dict = {c: [] for c in self.categories} self.excluding: dict = {c: [] for c in self.categories} self.excluded_file_paths: List[str] = [] self.selected_facets = self.available_facets - self._last_filtering_counts: Dict[str, npt.NDArray[int, int, int]] = defaultdict(empty_counts) + self._last_filtering_counts: Dict[ + str, npt.NDArray[int, int, int] + ] = defaultdict(empty_counts) """For each filter method, store the 
counts of the last run as [n_kept, n_discarded, N (the sum)]. Keys are "category" for :meth:`filter_by_token` and 'files' or 'parsed' for :meth:`filtered_file_list`. To inspect, you can use the method :meth:`filtering_report` """ self._discarded_items: Dict[str, Set[str]] = defaultdict(set) - self._discarded_file_criteria: dict[Literal['subdir', 'file', 'suffix', 'path'], Counter] = defaultdict(Counter) - """{criterion -> {excluded_name -> n_excluded}} dict for keeping track of which file was discarded based on which criterion. + self._discarded_file_criteria: dict[ + Literal["subdir", "file", "suffix", "path"], Counter + ] = defaultdict(Counter) + """{criterion -> {excluded_name -> n_excluded}} dict for keeping track of which file was discarded based on + which criterion. """ # booleans self.pieces_in_metadata: bool = True @@ -79,10 +103,13 @@ def __init__(self, @staticmethod def check_name(view_name) -> Tuple[bool, str]: if not isinstance(view_name, str): - return False, f"Name of the view should be a string, not '{type(view_name)}'" + return ( + False, + f"Name of the view should be a string, not '{type(view_name)}'", + ) if not view_name.isidentifier(): return False, f"The string '{view_name}' cannot be used as attribute name." - return True, '' + return True, "" @property def name(self) -> str: @@ -104,49 +131,54 @@ def only_metadata_pieces(self, value): self.pieces_in_metadata = True self.pieces_not_in_metadata = False - def is_default(self, - relax_for_cli: bool = False) -> bool: + def is_default(self, relax_for_cli: bool = False) -> bool: """Checks includes and excludes that may influence the selection of pieces. Returns True if the settings do not filter out any pieces. 
Only if ``relax_for_cli`` is set to True, the filters :attr:`include_convertible` and :attr:`exclude_review` are permitted, too.""" # define the expected number of filter regexes per category (ignore 'corpora' and 'facets') default_excluding_lengths = { - 'suffixes': 0, - 'folders': 0, - 'pieces': 0, - 'files': 0, - 'paths': 0 + "suffixes": 0, + "folders": 0, + "pieces": 0, + "files": 0, + "paths": 0, } if relax_for_cli: if self.exclude_review: - default_excluding_lengths.update({ - 'folders': 1, - 'pieces': 1, - 'files': 1 - }) - default_excluding_lengths['files'] += not self.include_convertible - ## debugging: -# print(f"""no includes: {all(len(self.including[category]) == 0 for category in default_excluding_lengths.keys())} -# default_excludes: {all(len(self.excluding[category]) == expected for category, expected in default_excluding_lengths.items())} -# exclude_review: {not self.exclude_review or relax_for_cli} -# include_convertible: {self.include_convertible or relax_for_cli} -# no paths excluded: {len(self.excluded_file_paths) == 0} -# pieces in metadata: {self.pieces_in_metadata} -# not in metadata excluded: {self.pieces_not_in_metadata or relax_for_cli} -# incomplete facets: {self.pieces_with_incomplete_facets}""") + default_excluding_lengths.update( + {"folders": 1, "pieces": 1, "files": 1} + ) + default_excluding_lengths["files"] += not self.include_convertible + # # debugging: + # print(f"""no includes: {all(len(self.including[category]) == 0 for category in + # default_excluding_lengths.keys())} + # default_excludes: {all(len(self.excluding[category]) == expected for category, expected in + # default_excluding_lengths.items())} + # exclude_review: {not self.exclude_review or relax_for_cli} + # include_convertible: {self.include_convertible or relax_for_cli} + # no paths excluded: {len(self.excluded_file_paths) == 0} + # pieces in metadata: {self.pieces_in_metadata} + # not in metadata excluded: {self.pieces_not_in_metadata or relax_for_cli} + # 
incomplete facets: {self.pieces_with_incomplete_facets}""") return ( - all(len(self.including[category]) == 0 for category in default_excluding_lengths.keys()) and - all(len(self.excluding[category]) == expected for category, expected in default_excluding_lengths.items()) and - len(self.excluded_file_paths) == 0 and - self.pieces_in_metadata and - self.pieces_with_incomplete_facets and - (relax_for_cli or ( - self.include_convertible and - self.pieces_not_in_metadata - )) - ) - - def copy(self, new_name: Optional[str] = None) -> 'View': + all( + len(self.including[category]) == 0 + for category in default_excluding_lengths.keys() + ) + and all( + len(self.excluding[category]) == expected + for category, expected in default_excluding_lengths.items() + ) + and len(self.excluded_file_paths) == 0 + and self.pieces_in_metadata + and self.pieces_with_incomplete_facets + and ( + relax_for_cli + or (self.include_convertible and self.pieces_not_in_metadata) + ) + ) + + def copy(self, new_name: Optional[str] = None) -> "View": """Returns a copy of this view, i.e., a new View object.""" if new_name is None: new_name = get_ferocious_name() @@ -160,30 +192,34 @@ def copy(self, new_name: Optional[str] = None) -> 'View': new_view.pieces_with_incomplete_facets = self.pieces_with_incomplete_facets return new_view - def update_config(self, - view_name: Optional[str] = None, - only_metadata_pieces: Optional[bool] = None, - include_convertible: Optional[bool] = None, - include_tsv: Optional[bool] = None, - exclude_review: Optional[bool] = None, - file_paths: Optional[Union[str, Collection[str]]] = None, - file_re: Optional[str] = None, - folder_re: Optional[str] = None, - exclude_re: Optional[str] = None, - folder_paths: Optional[Union[str, Collection[str]]] = None, - **logger_cfg): + def update_config( + self, + view_name: Optional[str] = None, + only_metadata_pieces: Optional[bool] = None, + include_convertible: Optional[bool] = None, + include_tsv: Optional[bool] = None, + 
exclude_review: Optional[bool] = None, + file_paths: Optional[Union[str, Collection[str]]] = None, + file_re: Optional[str] = None, + folder_re: Optional[str] = None, + exclude_re: Optional[str] = None, + folder_paths: Optional[Union[str, Collection[str]]] = None, + **logger_cfg, + ): """Update the configuration of the View. This is a shorthand for issuing several calls to :meth:`include` and :meth:`exclude` at once. Args: view_name: New name of the view. only_metadata_pieces: Whether or not pieces that are not included in a metadata.tsv should be excluded. - include_convertible: Whether or not scores that need conversion via MuseScore before parsing should be included. + include_convertible: Whether or not scores that need conversion via MuseScore before parsing should be + included. include_tsv: Whether or not TSV files should be included. exclude_review: Whether or not files and folder that include 'review' should be excluded. file_paths: The exact file names will be extracted and used as exclusive filter, that is, all files that do not have - one of these file names will be excluded. This is regardless of eventual relative or absolute paths included + one of these file names will be excluded. This is regardless of eventual relative or absolute paths + included in the argument. file_re: Include only files whose file name includes this regular expression. folder_re: Include only files from folders whose name includes this regular expression. 
@@ -194,91 +230,111 @@ def update_config(self, Returns: """ - for param, value in zip(('view_name', 'only_metadata_pieces', 'include_convertible', 'include_tsv', 'exclude_review'), - (view_name, only_metadata_pieces, include_convertible, include_tsv, exclude_review) - ): + for param, value in zip( + ( + "view_name", + "only_metadata_pieces", + "include_convertible", + "include_tsv", + "exclude_review", + ), + ( + view_name, + only_metadata_pieces, + include_convertible, + include_tsv, + exclude_review, + ), + ): if value is None: continue old_value = getattr(self, param) if value != old_value: setattr(self, param, value) self.logger.debug(f"Set '{param}' (previously {old_value}) to {value}.") - if file_re is not None and file_re != '.*': - self.include('files', file_re) - if folder_re is not None and folder_re != '.*': - self.include('folders', folder_re) + if file_re is not None and file_re != ".*": + self.include("files", file_re) + if folder_re is not None and folder_re != ".*": + self.include("folders", folder_re) if exclude_re is not None: - self.exclude(('files', 'folders'), exclude_re) + self.exclude(("files", "folders"), exclude_re) if file_paths is not None: resolved_paths = resolve_paths_argument(file_paths) if len(resolved_paths) > 0: unpack_json_paths(resolved_paths) regexes = [re.escape(os.path.basename(p)) for p in resolved_paths] - self.include('files', *regexes) + self.include("files", *regexes) if folder_paths is not None: resolved_paths = resolve_paths_argument(folder_paths, files=False) if len(resolved_paths) > 0: - self.include('paths', *resolved_paths) + self.include("paths", *resolved_paths) if len(logger_cfg) > 0: self.change_logger_cfg(**logger_cfg) @property def include_convertible(self): - return self.convertible_regex not in self.excluding['files'] + return self.convertible_regex not in self.excluding["files"] @include_convertible.setter def include_convertible(self, yes: bool): if yes: - self.unexclude('files', self.convertible_regex) + 
self.unexclude("files", self.convertible_regex) else: - self.exclude('files', self.convertible_regex) - + self.exclude("files", self.convertible_regex) @property def include_tsv(self): - return self.tsv_regex not in self.excluding['files'] + return self.tsv_regex not in self.excluding["files"] @include_tsv.setter def include_tsv(self, yes: bool): if yes: - self.unexclude('files', self.tsv_regex) + self.unexclude("files", self.tsv_regex) else: - self.exclude('files', self.tsv_regex) - + self.exclude("files", self.tsv_regex) @property def exclude_review(self): - return all(self.review_regex in self.excluding[what_to_exclude] - for what_to_exclude in ('files', 'pieces', 'folders')) + return all( + self.review_regex in self.excluding[what_to_exclude] + for what_to_exclude in ("files", "pieces", "folders") + ) @exclude_review.setter def exclude_review(self, yes: bool): if yes: - self.exclude(('files', 'pieces', 'folders'), self.review_regex) + self.exclude(("files", "pieces", "folders"), self.review_regex) else: - self.unexclude(('files', 'pieces', 'folders'), self.review_regex) + self.unexclude(("files", "pieces", "folders"), self.review_regex) def check_token(self, category: Category, token: str) -> bool: """Checks if a string pertaining to a certain category should be included in the view or not.""" category = self.resolve_category(category) - if category == 'paths': + if category == "paths": path = resolve_dir(token) if os.path.isfile(path): path = os.path.dirname(path) - if any(path.startswith(excluded_path) for excluded_path in self.excluding['paths']): + if any( + path.startswith(excluded_path) + for excluded_path in self.excluding["paths"] + ): return False - if len(self.including['paths']) == 0: + if len(self.including["paths"]) == 0: return True - return any(path.startswith(included_path) for included_path in self.including['paths']) + return any( + path.startswith(included_path) + for included_path in self.including["paths"] + ) if any(re.search(rgx, token) 
is not None for rgx in self.excluding[category]): return False if len(self.including[category]) == 0: return True - return any(re.search(rgx, token) is not None for rgx in self.including[category]) - + return any( + re.search(rgx, token) is not None for rgx in self.including[category] + ) def check_file(self, file: File) -> Tuple[bool, str]: - """ Check if an individual File passes all filters w.r.t. its subdirectories, file name and suffix. + """Check if an individual File passes all filters w.r.t. its subdirectories, file name and suffix. Args: file: @@ -288,21 +344,30 @@ def check_file(self, file: File) -> Tuple[bool, str]: The criterion based on which the file is being excluded. """ if file.full_path in self.excluded_file_paths: - return False, 'file' - if not self.check_token('paths', file.directory): - return False, 'directory' - category2file_component = dict(zip((('folders', 'subdir'), ('files', 'file'), ('suffixes', 'suffix')), - (file.subdir, file.file, file.suffix) - )) + return False, "file" + if not self.check_token("paths", file.directory): + return False, "directory" + category2file_component = dict( + zip( + (("folders", "subdir"), ("files", "file"), ("suffixes", "suffix")), + (file.subdir, file.file, file.suffix), + ) + ) for (category, criterion), component in category2file_component.items(): - if any(re.search(rgx, component) is not None for rgx in self.excluding[category]): + if any( + re.search(rgx, component) is not None + for rgx in self.excluding[category] + ): return False, criterion for (category, criterion), component in category2file_component.items(): if len(self.including[category]) == 0: continue - if not any(re.search(rgx, component) is not None for rgx in self.including[category]): + if not any( + re.search(rgx, component) is not None + for rgx in self.including[category] + ): return False, criterion - return True, 'files' + return True, "files" def reset_filtering_data(self, categories: Categories = None): if categories is None: @@ 
-314,18 +379,19 @@ def reset_filtering_data(self, categories: Categories = None): categories = self.resolve_categories(categories) for ctgr in categories: if ctgr in self._last_filtering_counts: - del(self._last_filtering_counts[ctgr]) + del self._last_filtering_counts[ctgr] if ctgr in self._discarded_items: - del(self._discarded_items[ctgr]) - if 'files' in categories: + del self._discarded_items[ctgr] + if "files" in categories: self._discarded_file_criteria = defaultdict(Counter) self.update_facet_selection() def reset_view(self): self.__init__() - - def filter_by_token(self, category: Category, tuples: Iterable[tuple]) -> Iterator[tuple]: + def filter_by_token( + self, category: Category, tuples: Iterable[tuple] + ) -> Iterator[tuple]: """Filters out those tuples where the token (first element) does not pass _.check_token(category, token).""" category = self.resolve_category(category) n_kept, n_discarded, N = 0, 0, 0 @@ -340,16 +406,19 @@ def filter_by_token(self, category: Category, tuples: Iterable[tuple]) -> Iterat n_discarded += 1 discarded_items.append(token) key = category - self._last_filtering_counts[key] += np.array([n_kept, n_discarded, N], dtype='int') + self._last_filtering_counts[key] += np.array( + [n_kept, n_discarded, N], dtype="int" + ) self._discarded_items[key].update(discarded_items) def filtered_tokens(self, category: Category, tokens: Collection[str]) -> List[str]: """Applies :meth:`filter_by_token` to a collection of tokens.""" - return [token[0] for token in self.filter_by_token(category, ((t,) for t in tokens))] - + return [ + token[0] for token in self.filter_by_token(category, ((t,) for t in tokens)) + ] def filtered_file_list(self, files: Collection[File], key: str = None) -> FileList: - """ Keep only the files that pass _.check_file(). + """Keep only the files that pass _.check_file(). Args: files: :obj:`File` objects to be filtered. 
@@ -369,15 +438,21 @@ def filtered_file_list(self, files: Collection[File], key: str = None) -> FileLi discarded_items.append(file.rel_path) if key is None: # do not track discarding criteria for special keys such as 'parsed', used by View.iter_facet2parsed - self._discarded_file_criteria[criterion][getattr(file, criterion)] += 1 + self._discarded_file_criteria[criterion][ + getattr(file, criterion) + ] += 1 n_kept, n_discarded, N = len(result), len(discarded_items), len(files) if key is None: - key = 'files' - self._last_filtering_counts[key] += np.array([n_kept, n_discarded, N], dtype='int') + key = "files" + self._last_filtering_counts[key] += np.array( + [n_kept, n_discarded, N], dtype="int" + ) self._discarded_items[key].update(discarded_items) return result - def filtering_report(self, drop_zero=True, show_discarded=True, return_str=False) -> Optional[str]: + def filtering_report( + self, drop_zero=True, show_discarded=True, return_str=False + ) -> Optional[str]: aggregated_counts = defaultdict(empty_counts) for key, counts in self._last_filtering_counts.items(): aggregated_counts[key] += counts @@ -385,7 +460,7 @@ def filtering_report(self, drop_zero=True, show_discarded=True, return_str=False discarded = defaultdict(list) for key, items in self._discarded_items.items(): discarded[key].extend(items) - msg = '' + msg = "" for key, (_, n_discarded, N) in aggregated_counts.items(): if not drop_zero or n_discarded > 0: msg += f"{n_discarded}/{N} {key} are excluded from this view" @@ -395,18 +470,18 @@ def filtering_report(self, drop_zero=True, show_discarded=True, return_str=False else: msg += ", but unfortunately I don't know which ones.\n" else: - msg += '.\n' + msg += ".\n" if len(self._discarded_file_criteria) > 0: - msg += '\n' + msg += "\n" for criterion, cntr in self._discarded_file_criteria.items(): - crit = 'file name' if criterion == 'file' else criterion + crit = "file name" if criterion == "file" else criterion msg += f"{sum(cntr.values())} files 
have been excluded based on their {crit}" if show_discarded: - msg += ':\n' + msg += ":\n" for excluded_name, n in cntr.items(): msg += f"\t- '{excluded_name}': {n}\n" else: - msg += '.\n' + msg += ".\n" if return_str: return msg print(msg) @@ -414,8 +489,10 @@ def filtering_report(self, drop_zero=True, show_discarded=True, return_str=False def info(self, return_str=False): msg_components = [] if self.pieces_in_metadata + self.pieces_not_in_metadata == 0: - msg = f"This view is called '{self.name}'. It excludes everything because both its attributes " \ - f"pieces_in_metadata and pieces_not_in_metadata are set to False." + msg = ( + f"This view is called '{self.name}'. It excludes everything because both its attributes " + f"pieces_in_metadata and pieces_not_in_metadata are set to False." + ) if return_str: return msg print(msg) @@ -423,17 +500,29 @@ def info(self, return_str=False): if not self.pieces_in_metadata: msg_components.append("excludes pieces that are contained in the metadata") if not self.pieces_not_in_metadata: - msg_components.append("excludes pieces that are not contained in the metadata") + msg_components.append( + "excludes pieces that are not contained in the metadata" + ) if not self.include_convertible: - msg_components.append("filters out file extensions requiring conversion (such as .xml)") + msg_components.append( + "filters out file extensions requiring conversion (such as .xml)" + ) if not self.include_tsv: msg_components.append("disregards all TSV files") if self.exclude_review: msg_components.append("excludes review files and folders") - included_re = {what_to_include: [rgx for rgx in regexes if rgx not in self.registered_regexes] - for what_to_include, regexes in self.including.items()} - excluded_re = {what_to_exclude: [rgx for rgx in regexes if rgx not in self.registered_regexes] - for what_to_exclude, regexes in self.excluding.items()} + included_re = { + what_to_include: [ + rgx for rgx in regexes if rgx not in 
self.registered_regexes + ] + for what_to_include, regexes in self.including.items() + } + excluded_re = { + what_to_exclude: [ + rgx for rgx in regexes if rgx not in self.registered_regexes + ] + for what_to_exclude, regexes in self.excluding.items() + } for what_to_include, re_strings in included_re.items(): n_included = len(re_strings) if n_included == 0: @@ -441,11 +530,17 @@ def info(self, return_str=False): if n_included == 1: included = f"'{re_strings[0]}'" elif n_included < 11: - included = 'one of ' + str(re_strings) + included = "one of " + str(re_strings) else: - included = 'one of [' + ', '.join(f"'{regex}'" for regex in re_strings[:10]) + '... ' + included = ( + "one of [" + + ", ".join(f"'{regex}'" for regex in re_strings[:10]) + + "... " + ) included += f" ({n_included - 10} more, see filtering_report()))" - msg_components.append(f"includes only {what_to_include} containing {included}") + msg_components.append( + f"includes only {what_to_include} containing {included}" + ) for what_to_exclude, re_strings in excluded_re.items(): n_excluded = len(re_strings) if n_excluded == 0: @@ -453,15 +548,26 @@ def info(self, return_str=False): if n_excluded == 1: excluded = f"'{re_strings[0]}'" elif n_excluded < 11: - excluded = 'one of ' + str(re_strings) + excluded = "one of " + str(re_strings) else: - excluded = 'one of [' + ', '.join(f"'{regex}'" for regex in re_strings[:10]) + '... ' + excluded = ( + "one of [" + + ", ".join(f"'{regex}'" for regex in re_strings[:10]) + + "... 
" + ) excluded += f" ({n_excluded - 10} more, see filtering_report())" - msg_components.append(f"excludes any {what_to_exclude} containing {excluded}") + msg_components.append( + f"excludes any {what_to_exclude} containing {excluded}" + ) if not self.pieces_with_incomplete_facets: - msg_components.append(f"excludes pieces that do not have at least one file per selected facet ({', '.join(self.selected_facets)})") + msg_components.append( + f"excludes pieces that do not have at least one file per selected facet (" + f"{', '.join(self.selected_facets)})" + ) if len(self.excluded_file_paths) > 0: - msg_components.append(f"excludes {len(self.excluded_file_paths)} files based on user input") + msg_components.append( + f"excludes {len(self.excluded_file_paths)} files based on user input" + ) msg = f"This view is called '{self.name}'. It " n_components = len(msg_components) if n_components == 0: @@ -469,8 +575,8 @@ def info(self, return_str=False): elif n_components == 1: msg += msg_components[0] + "." else: - separator = '\n\t- ' - msg += separator + (',' + separator).join(msg_components[:-1]) + separator = "\n\t- " + msg += separator + ("," + separator).join(msg_components[:-1]) msg += f", and{separator}{msg_components[-1]}." 
if return_str: return msg @@ -482,10 +588,14 @@ def resolve_category(self, category: Category) -> Category: if category in self.singular2category: return self.singular2category[category] else: - raise ValueError(f"'{category}' is not one of the known categories {self.categories}") + raise ValueError( + f"'{category}' is not one of the known categories {self.categories}" + ) return category else: - raise ValueError(f"Pass a single category string ∈ {self.categories}, not a '{type(category)}'") + raise ValueError( + f"Pass a single category string ∈ {self.categories}, not a '{type(category)}'" + ) def resolve_categories(self, categories: Categories) -> List[str]: if isinstance(categories, str): @@ -495,67 +605,65 @@ def resolve_categories(self, categories: Categories) -> List[str]: def update_facet_selection(self): selected, discarded = [], [] for facet in self.available_facets: - if self.check_token('facet', facet): + if self.check_token("facet", facet): selected.append(facet) else: discarded.append(facet) self.selected_facets = selected - key = 'facets' + key = "facets" if len(discarded) == 0: if key in self._last_filtering_counts: - del(self._last_filtering_counts[key]) + del self._last_filtering_counts[key] if key in self._discarded_items: - del(self._discarded_items[key]) + del self._discarded_items[key] return n_kept, n_discarded = len(selected), len(discarded) - counts = np.array([n_kept, n_discarded, n_kept+n_discarded]) + counts = np.array([n_kept, n_discarded, n_kept + n_discarded]) self._last_filtering_counts[key] = counts self._discarded_items[key] = set(discarded) def include(self, categories: Categories, *regex: Union[str, re.Pattern]): categories = self.resolve_categories(categories) - if 'paths' in categories: + if "paths" in categories: paths = [resolve_dir(rgx) for rgx in regex] for what_to_include in categories: - regex_or_paths = paths if what_to_include == 'paths' else regex + regex_or_paths = paths if what_to_include == "paths" else regex for 
rgx in regex_or_paths: if rgx not in self.including[what_to_include]: self.including[what_to_include].append(rgx) - if what_to_include == 'facets': + if what_to_include == "facets": self.update_facet_selection() - def exclude(self, categories: Categories, *regex: Union[str, re.Pattern]): categories = self.resolve_categories(categories) - if 'paths' in categories: + if "paths" in categories: paths = [resolve_dir(rgx) for rgx in regex] for what_to_exclude in categories: - regex_or_paths = paths if what_to_exclude == 'paths' else regex + regex_or_paths = paths if what_to_exclude == "paths" else regex for rgx in regex_or_paths: if rgx not in self.excluding[what_to_exclude]: self.excluding[what_to_exclude].append(rgx) - if what_to_exclude == 'facets': + if what_to_exclude == "facets": self.update_facet_selection() def uninclude(self, categories: Categories, *regex: Union[str, re.Pattern]): categories = self.resolve_categories(categories) - if 'paths' in categories: + if "paths" in categories: paths = [resolve_dir(rgx) for rgx in regex] for what_to_uninclude in categories: - regex_or_paths = paths if what_to_uninclude == 'paths' else regex + regex_or_paths = paths if what_to_uninclude == "paths" else regex for rgx in regex_or_paths: try: self.including[what_to_uninclude].remove(rgx) except ValueError: pass - def unexclude(self, categories: Categories, *regex: Union[str, re.Pattern]): categories = self.resolve_categories(categories) - if 'paths' in categories: + if "paths" in categories: paths = [resolve_dir(rgx) for rgx in regex] for what_to_unexclude in categories: - regex_or_paths = paths if what_to_unexclude == 'paths' else regex + regex_or_paths = paths if what_to_unexclude == "paths" else regex for rgx in regex_or_paths: try: self.excluding[what_to_unexclude].remove(rgx) @@ -565,86 +673,108 @@ def unexclude(self, categories: Categories, *regex: Union[str, re.Pattern]): def __repr__(self): return self.info(return_str=True) + class DefaultView(View): + def __init__( + 
self, + view_name: Optional[str] = "default", + only_metadata_pieces: bool = True, + include_convertible: bool = False, + include_tsv: bool = True, + exclude_review: bool = True, + **logger_cfg, + ): + super().__init__( + view_name=view_name, + only_metadata_pieces=only_metadata_pieces, + include_convertible=include_convertible, + include_tsv=include_tsv, + exclude_review=exclude_review, + **logger_cfg, + ) - def __init__(self, - view_name: Optional[str] = 'default', - only_metadata_pieces: bool = True, - include_convertible: bool = False, - include_tsv: bool = True, - exclude_review: bool = True, - **logger_cfg - ): - super().__init__(view_name=view_name, - only_metadata_pieces=only_metadata_pieces, - include_convertible=include_convertible, - include_tsv=include_tsv, - exclude_review=exclude_review, - **logger_cfg - ) - - def is_default(self, - relax_for_cli: bool = False) -> bool: + def is_default(self, relax_for_cli: bool = False) -> bool: default_excluding_lengths = { - 'folders': 1, - 'pieces': 1, - 'files': 2, - 'suffixes': 0, + "folders": 1, + "pieces": 1, + "files": 2, + "suffixes": 0, } if relax_for_cli: - default_excluding_lengths['files'] -= self.include_convertible - ## debugging: -# print(f"""no includes: {all(len(self.including[category]) == 0 for category in default_excluding_lengths.keys())} -# default_excludes: {all(len(self.excluding[category]) == expected for category, expected in default_excluding_lengths.items())} -# exclude_review: {self.exclude_review} -# include_convertible: {not self.include_convertible or relax_for_cli} -# no paths excluded: {len(self.excluded_file_paths) == 0} -# pieces in metadata: {self.pieces_in_metadata} -# not in metadata excluded: {not self.pieces_not_in_metadata or relax_for_cli} -# incomplete facets: {self.pieces_with_incomplete_facets}""") + default_excluding_lengths["files"] -= self.include_convertible + # # debugging: + # print(f"""no includes: {all(len(self.including[category]) == 0 for category in + # 
default_excluding_lengths.keys())} + # default_excludes: {all(len(self.excluding[category]) == expected for category, expected in + # default_excluding_lengths.items())} + # exclude_review: {self.exclude_review} + # include_convertible: {not self.include_convertible or relax_for_cli} + # no paths excluded: {len(self.excluded_file_paths) == 0} + # pieces in metadata: {self.pieces_in_metadata} + # not in metadata excluded: {not self.pieces_not_in_metadata or relax_for_cli} + # incomplete facets: {self.pieces_with_incomplete_facets}""") return ( - all(len(self.including[category]) == 0 for category in default_excluding_lengths.keys()) and - all(len(self.excluding[category]) == expected for category, expected in default_excluding_lengths.items()) and - len(self.excluded_file_paths) == 0 and - self.pieces_in_metadata and - self.pieces_with_incomplete_facets and - (relax_for_cli or ( - not self.include_convertible and - not self.pieces_not_in_metadata - )) + all( + len(self.including[category]) == 0 + for category in default_excluding_lengths.keys() + ) + and all( + len(self.excluding[category]) == expected + for category, expected in default_excluding_lengths.items() + ) + and len(self.excluded_file_paths) == 0 + and self.pieces_in_metadata + and self.pieces_with_incomplete_facets + and ( + relax_for_cli + or (not self.include_convertible and not self.pieces_not_in_metadata) + ) ) -def create_view_from_parameters(only_metadata_pieces: bool = True, - include_convertible: bool = False, - include_tsv: bool = True, - exclude_review: bool = True, - file_paths=None, - file_re=None, - folder_re=None, - exclude_re=None, - level=None - ) -> View: +def create_view_from_parameters( + only_metadata_pieces: bool = True, + include_convertible: bool = False, + include_tsv: bool = True, + exclude_review: bool = True, + file_paths=None, + file_re=None, + folder_re=None, + exclude_re=None, + level=None, +) -> View: """From the arguments of an __init__ method, create either a DefaultView 
or a custom view.""" - no_legacy_params = all(param is None for param in (file_paths, file_re, folder_re, exclude_re)) - all_default = only_metadata_pieces and include_tsv and exclude_review and not include_convertible + no_legacy_params = all( + param is None for param in (file_paths, file_re, folder_re, exclude_re) + ) + all_default = ( + only_metadata_pieces + and include_tsv + and exclude_review + and not include_convertible + ) if no_legacy_params and all_default: return DefaultView(level=level) ferocious_name = get_ferocious_name() - view = View(ferocious_name, - only_metadata_pieces=only_metadata_pieces, - include_convertible=include_convertible, - include_tsv=include_tsv, - exclude_review=exclude_review, - level=level - ) - view.update_config(file_paths=file_paths, - file_re=file_re, - folder_re=folder_re, - exclude_re=exclude_re) + view = View( + ferocious_name, + only_metadata_pieces=only_metadata_pieces, + include_convertible=include_convertible, + include_tsv=include_tsv, + exclude_review=exclude_review, + level=level, + ) + view.update_config( + file_paths=file_paths, + file_re=file_re, + folder_re=folder_re, + exclude_re=exclude_re, + ) return view def get_ferocious_name(): - path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'ferocious_names.txt') - return random.choice(open(path, 'r', encoding='utf-8').readlines()).strip('\n') \ No newline at end of file + path = os.path.join( + os.path.abspath(os.path.dirname(__file__)), "ferocious_names.txt" + ) + return random.choice(open(path, "r", encoding="utf-8").readlines()).strip("\n") diff --git a/tests/conftest.py b/tests/conftest.py index 1187f162..5f66a105 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,49 +1,58 @@ import os from copy import deepcopy + import pytest from git import Repo - from ms3 import Parse, Score from ms3.logger import get_logger -from ms3.utils import scan_directory, capture_parse_logs, ignored_warnings2dict +from ms3.utils import capture_parse_logs, 
ignored_warnings2dict, scan_directory +CORPUS_DIR = "~" # Directory holding your clone of DCMLab/unittest_metacorpus +TEST_COMMIT = ( + "5899afe" # commit of DCMLab/unittest_metacorpus for which the tests should pass +) +MS3_DIR = os.path.abspath(os.path.join(os.path.realpath(__file__), "..", "..")) +DOCS_DIR = os.path.join(MS3_DIR, "docs") +DOCS_EXAMPLES_DIR = os.path.join(DOCS_DIR, "examples") -CORPUS_DIR = "~" # Directory holding your clone of DCMLab/unittest_metacorpus -TEST_COMMIT = "5899afe" # commit of DCMLab/unittest_metacorpus for which the tests should pass -MS3_DIR = os.path.abspath(os.path.join(os.path.realpath(__file__), '..', '..')) -DOCS_DIR = os.path.join(MS3_DIR, 'docs') -DOCS_EXAMPLES_DIR = os.path.join(DOCS_DIR, 'examples') @pytest.fixture(scope="session") def directory(): """Compose the path for the test corpus.""" path = os.path.join(os.path.expanduser(CORPUS_DIR), "unittest_metacorpus") if not os.path.isdir(path): - print(f"Directory does not exist: {path} Clone DCMLab/unittest_metacorpus, checkout ms3_tests branch, " - f"and specify CORPUS_DIR above.") + print( + f"Directory does not exist: {path} Clone DCMLab/unittest_metacorpus, checkout ms3_tests branch, " + f"and specify CORPUS_DIR above." 
+ ) assert os.path.isdir(path) repo = Repo(path) - commit = repo.commit('HEAD') - sha = commit.hexsha[:len(TEST_COMMIT)] + commit = repo.commit("HEAD") + sha = commit.hexsha[: len(TEST_COMMIT)] assert sha == TEST_COMMIT - assert repo.git.diff() == '' + assert repo.git.diff() == "" return path + @pytest.fixture(scope="session") def small_directory(directory): path = os.path.join(directory, "ravel_piano") return path + @pytest.fixture(scope="session") def mozart_piano_sonatas() -> str: """Get the path to local clone of DCMLab/mozart_piano_sonatas""" path = os.path.join(os.path.expanduser(CORPUS_DIR), "mozart_piano_sonatas") if not os.path.isdir(path): - print(f"Directory does not exist: {path} Clone DCMLab/mozart_piano_sonatas into the CORPUS_DIR specified above.") + print( + f"Directory does not exist: {path} Clone DCMLab/mozart_piano_sonatas into the CORPUS_DIR specified above." + ) assert os.path.isdir(path) repo = Repo(path) yield path - repo.git.clean('-fdx') # removes new files potentially generated during test + repo.git.clean("-fdx") # removes new files potentially generated during test + @pytest.fixture( scope="session", @@ -62,15 +71,15 @@ def mozart_piano_sonatas() -> str: # "files_with_wrong_key", # "files_correct_without_metadata", # "files_with_correct_key", - ] + ], ) def parse_obj(directory, request) -> Parse: - logger = get_logger('ms3.tests') - if request.param == 'regex': - return Parse(directory=directory, file_re='WWV', folder_re='MS3') - if request.param == 'everything': + logger = get_logger("ms3.tests") + if request.param == "regex": + return Parse(directory=directory, file_re="WWV", folder_re="MS3") + if request.param == "everything": return Parse(directory=directory).all - if request.param == 'file_re_without_key': + if request.param == "file_re_without_key": p = Parse(directory=directory, file_re="SwWV") return p if request.param == "without_metadata": @@ -81,26 +90,32 @@ def parse_obj(directory, request) -> Parse: return Parse(add_path) if 
request.param == "regular_dirs_at_once": os.chdir(directory) - regular_dirs = ['ravel_piano', 'sweelinck_keyboard', 'wagner_overtures'] + regular_dirs = ["ravel_piano", "sweelinck_keyboard", "wagner_overtures"] return Parse(regular_dirs) p = Parse() if request.param == "regular_dirs": - for subdir in ['ravel_piano', 'sweelinck_keyboard', 'wagner_overtures']: + for subdir in ["ravel_piano", "sweelinck_keyboard", "wagner_overtures"]: add_path = os.path.join(directory, subdir) p.add_dir(add_path) if request.param == "chaotic_dirs": - for subdir in ['mixed_files', 'outputs']: + for subdir in ["mixed_files", "outputs"]: add_path = os.path.join(directory, subdir) p.add_dir(add_path) if request.param == "hidden_dirs": - for subdir in ['.git', '.github']: + for subdir in [".git", ".github"]: add_path = os.path.join(directory, subdir) p.add_dir(add_path) - if request.param.startswith('files_'): - add_path = os.path.join(directory, 'sweelinck_keyboard') + if request.param.startswith("files_"): + add_path = os.path.join(directory, "sweelinck_keyboard") files = list(scan_directory(add_path, logger=logger)) - files_with_inferrable_metadata = [f for f in files if os.path.basename(f) != 'metadata.tsv'] - files_without_inferrable_metadata = list(scan_directory(os.path.join(directory, 'mixed_files', 'orchestral'), logger=logger)) + files_with_inferrable_metadata = [ + f for f in files if os.path.basename(f) != "metadata.tsv" + ] + files_without_inferrable_metadata = list( + scan_directory( + os.path.join(directory, "mixed_files", "orchestral"), logger=logger + ) + ) if request.param == "files_without_key": p.add_files(files_without_inferrable_metadata) if request.param == "files_with_inferred_key": @@ -111,15 +126,18 @@ def parse_obj(directory, request) -> Parse: if request.param == "files_correct_without_metadata": key = "frankenstein" p.add_files(files_with_inferrable_metadata, corpus_name=key) - for path in scan_directory(os.path.join(directory, 'outputs'), logger=logger): + for 
path in scan_directory( + os.path.join(directory, "outputs"), logger=logger + ): p.add_files(path, corpus_name=key) if request.param == "files_with_correct_key": - p.add_dir(os.path.join(directory, 'outputs')) + p.add_dir(os.path.join(directory, "outputs")) for path in files: - p.add_files(path, corpus_name='sweelinck_keyboard') + p.add_files(path, corpus_name="sweelinck_keyboard") return p + @pytest.fixture( scope="session", params=[ @@ -145,16 +163,18 @@ def parsed_parse_obj(parse_obj, request) -> Parse: assert False return p + @pytest.fixture(scope="class") def parse_objects(parse_obj: Parse, request): request.cls.parse_obj = parse_obj + @pytest.fixture(scope="class") def parsed_parse_objects(parsed_parse_obj, request): request.cls.parsed_parse_obj = parsed_parse_obj -### Creating path tuples for score_object(): +# ## Creating path tuples for score_object(): # for folder, subdirs, files in os.walk('.'): # subdirs[:] = [s for s in subdirs if not s.startswith('.')] # fldrs = tuple(['mixed_files'] + folder.split('/')[1:]) @@ -162,55 +182,69 @@ def parsed_parse_objects(parsed_parse_obj, request): # if f.endswith('.mscx'): # print(f"{fldrs + (f,)},") + @pytest.fixture( - params = [ - ('mixed_files', '76CASM34A33UM.mscx'), - ('mixed_files', 'stabat_03_coloured.mscx'), - ('mixed_files', 'orchestral', '05_symph_fant.mscx'), - ('mixed_files', 'orchestral', 'Did03M-Son_regina-1762-Sarti.mscx'), - ('mixed_files', 'orchestral', 'caldara_form.mscx'), - ('mixed_files', 'keyboard', 'baroque', 'BWV_0815.mscx'), - ('mixed_files', 'keyboard', 'ancient', '12.16_Toccata_cromaticha_per_l’elevatione_phrygian.mscx'), - ('mixed_files', 'keyboard', 'nineteenth', 'D973deutscher01.mscx'), - ('mixed_files', 'keyboard', 'classic', 'K281-3.mscx'), - ], - ids = [ - 'monty[tremolo]', - 'pergolesi[form]', - 'berlioz[tremolo]', - 'sarti[endings]', - 'caldara[form]', - 'bach[endings]', - 'frescobaldi', - 'schubert[endings][tremolo]', - 'mozart' - ]) + params=[ + ("mixed_files", 
"76CASM34A33UM.mscx"), + ("mixed_files", "stabat_03_coloured.mscx"), + ("mixed_files", "orchestral", "05_symph_fant.mscx"), + ("mixed_files", "orchestral", "Did03M-Son_regina-1762-Sarti.mscx"), + ("mixed_files", "orchestral", "caldara_form.mscx"), + ("mixed_files", "keyboard", "baroque", "BWV_0815.mscx"), + ( + "mixed_files", + "keyboard", + "ancient", + "12.16_Toccata_cromaticha_per_l’elevatione_phrygian.mscx", + ), + ("mixed_files", "keyboard", "nineteenth", "D973deutscher01.mscx"), + ("mixed_files", "keyboard", "classic", "K281-3.mscx"), + ], + ids=[ + "monty[tremolo]", + "pergolesi[form]", + "berlioz[tremolo]", + "sarti[endings]", + "caldara[form]", + "bach[endings]", + "frescobaldi", + "schubert[endings][tremolo]", + "mozart", + ], +) def score_object(directory, request): mscx_path = os.path.join(directory, *request.param) s = Score(mscx_path) return s -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def get_all_warnings(directory): p = Parse(directory) with capture_parse_logs(p.logger) as captured_warnings: p.parse() - _ = p.extract_facets('expanded') + _ = p.extract_facets("expanded") return captured_warnings.content_list -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def get_all_warnings_parsed(get_all_warnings): return ignored_warnings2dict(get_all_warnings) -@pytest.fixture(scope='session') + +@pytest.fixture(scope="session") def get_all_supressed_warnings(directory): - ignored_warnings_file = os.path.join(directory, 'mixed_files', 'ALL_WARNINGS_IGNORED') - p = Parse(directory, level='d') + ignored_warnings_file = os.path.join( + directory, "mixed_files", "ALL_WARNINGS_IGNORED" + ) + p = Parse(directory, level="d") p.load_ignored_warnings(ignored_warnings_file) - with capture_parse_logs(p.logger, level='d') as captured_msgs: + with capture_parse_logs(p.logger, level="d") as captured_msgs: p.parse() _ = p.get_dataframes(expanded=True) all_msgs = captured_msgs.content_list - return ['\n'.join(msg.split("\n\t")[1:]) 
for msg in all_msgs if msg.startswith('IGNORED')] \ No newline at end of file + return [ + "\n".join(msg.split("\n\t")[1:]) + for msg in all_msgs + if msg.startswith("IGNORED") + ] diff --git a/tests/test_local_files/IGNORED_WARNINGS b/tests/test_local_files/IGNORED_WARNINGS index e7719984..d2f92328 100644 --- a/tests/test_local_files/IGNORED_WARNINGS +++ b/tests/test_local_files/IGNORED_WARNINGS @@ -2,4 +2,4 @@ DCML_HARMONY_SYNTAX_WARNING (15,) ms3.Parse.old_tests.stabat_03_coloured -- /hom Score contains 2 labels that don't (and 83 that do) match the DCML standard: mc mn label harmony_layer 1 1 1 1m 2 - 2 1 1 B/F 3 \ No newline at end of file + 2 1 1 B/F 3 diff --git a/tests/test_local_files/MS3/05_symph_fant.mscx b/tests/test_local_files/MS3/05_symph_fant.mscx index f807b58f..afba9fab 100644 --- a/tests/test_local_files/MS3/05_symph_fant.mscx +++ b/tests/test_local_files/MS3/05_symph_fant.mscx @@ -271,7 +271,7 @@ I in Es (Mi♭). -2 Clarinetti. +2 Clarinetti. II in C (Ut). Clar. I @@ -387,7 +387,7 @@ II in C (Ut). I e II in E (Mi♭) -4 Corni. +4 Corni. III e IV in C (Ut). Cor. I.II. @@ -508,7 +508,7 @@ III e IV in C (Ut). I.II. -4 Fagotti +4 Fagotti III.IV. Fag. I.II. @@ -918,7 +918,7 @@ III.IV. Timpani - Timpani I + Timpani I in H (Si) E (Mi) Timp. I Timpani @@ -973,7 +973,7 @@ in H (Si) E (Mi) Timpani - Timpani II + Timpani II in Gis (Sol#) Cis (Ut#) Timp. 
II Timpani diff --git a/tests/test_local_files/MS3/Did03M-Son_regina-1762-Sarti.mscx b/tests/test_local_files/MS3/Did03M-Son_regina-1762-Sarti.mscx index f21cfd2e..b7025efa 100644 --- a/tests/test_local_files/MS3/Did03M-Son_regina-1762-Sarti.mscx +++ b/tests/test_local_files/MS3/Did03M-Son_regina-1762-Sarti.mscx @@ -749,7 +749,7 @@ Text: Pietro Metastasio (1689-1782) - 2i: solo 1|sent - ii: Vocal section|pd, 3i: pres - 3ii: ant|sent, + 2i: solo 1|sent - ii: Vocal section|pd, 3i: pres - 3ii: ant|sent, 4i: bi - ii: pres, 5i: } - ii: bi @@ -5669,7 +5669,7 @@ Text: Pietro Metastasio (1689-1782) - 2i: rit|sent - ii: /, 3i: pres - ii: cons|Ritornello phrase, + 2i: rit|sent - ii: /, 3i: pres - ii: cons|Ritornello phrase, 4i: si - ii: pres, 5i: mod - ii: si, 6i: } - ii: mod diff --git a/tests/test_local_files/test_repeats.py b/tests/test_local_files/test_repeats.py index 6836f31d..a1c3d7c9 100644 --- a/tests/test_local_files/test_repeats.py +++ b/tests/test_local_files/test_repeats.py @@ -1,22 +1,24 @@ #!/usr/bin/env python """Tests for `ms3` package.""" -import pytest import os - +import pytest from ms3 import Score from ms3.utils import next2sequence - -@pytest.mark.parametrize("mscx_file, expected_mc_sequence", [ - ('repeats0.mscx', [1, 2, 3, 6, 2, 4, 1, 2, 5]), - ('repeats1.mscx', [1, 2, 3, 1, 2, 4, 2, 5, 6]), - ('repeats2.mscx', [1, 2, 3, 1, 2, 4, 2, 5, 1, 2, 6]),]) +@pytest.mark.parametrize( + "mscx_file, expected_mc_sequence", + [ + ("repeats0.mscx", [1, 2, 3, 6, 2, 4, 1, 2, 5]), + ("repeats1.mscx", [1, 2, 3, 1, 2, 4, 2, 5, 6]), + ("repeats2.mscx", [1, 2, 3, 1, 2, 4, 2, 5, 1, 2, 6]), + ], +) def test_repeats(mscx_file, expected_mc_sequence): test_folder, _ = os.path.split(os.path.realpath(__file__)) - mscx_path = os.path.realpath(os.path.join(test_folder, 'repeat_dummies', mscx_file)) - s = Score(mscx_path, parser='bs4') - res = next2sequence(s.mscx.measures().set_index('mc').next) + mscx_path = os.path.realpath(os.path.join(test_folder, "repeat_dummies", 
mscx_file)) + s = Score(mscx_path, parser="bs4") + res = next2sequence(s.mscx.measures().set_index("mc").next) assert res == expected_mc_sequence diff --git a/tests/test_metarepo_files/test_dezrann.py b/tests/test_metarepo_files/test_dezrann.py index a7ddfdc3..a5b5a1c7 100644 --- a/tests/test_metarepo_files/test_dezrann.py +++ b/tests/test_metarepo_files/test_dezrann.py @@ -1,85 +1,102 @@ import json import os from collections import Counter -import pytest +import pytest from ms3 import Parse -from ms3.utils import get_value_profile_mask, load_tsv, assert_all_lines_equal from ms3.dezrann import generate_dez, generate_dez_from_dfs +from ms3.utils import assert_all_lines_equal, get_value_profile_mask, load_tsv MOZART_MOVEMENTS = [ - 'K279-1', 'K279-2', 'K279-3', - 'K280-1', 'K280-2', 'K280-3', - 'K283-1', 'K283-2', 'K283-3', - ] + "K279-1", + "K279-2", + "K279-3", + "K280-1", + "K280-2", + "K280-3", + "K283-1", + "K283-2", + "K283-3", +] + +SETTINGS = dict(cadences=True, harmonies=4, keys=5, phrases=6) -SETTINGS = dict( - cadences=True, - harmonies=4, - keys=5, - phrases=6 -) @pytest.fixture(params=MOZART_MOVEMENTS) def movement(request) -> str: return request.param + def test_dcml2dez(mozart_piano_sonatas, movement): - """This test creates Dezrann files from DCML annotations and compares the number of written labels with the target.""" + """ + This test creates Dezrann files from DCML annotations and compares the number of written labels with the target. 
+ """ # first, create .dez file - measures_path = os.path.join(mozart_piano_sonatas, 'measures', f"{movement}.tsv") - harmonies_path = os.path.join(mozart_piano_sonatas, 'harmonies', f"{movement}.tsv") + measures_path = os.path.join(mozart_piano_sonatas, "measures", f"{movement}.tsv") + harmonies_path = os.path.join(mozart_piano_sonatas, "harmonies", f"{movement}.tsv") out_path = os.path.join(mozart_piano_sonatas, f"{movement}.dez") - generate_dez(path_measures=measures_path, - path_labels=harmonies_path, - output_path=out_path, - **SETTINGS - ) + generate_dez( + path_measures=measures_path, + path_labels=harmonies_path, + output_path=out_path, + **SETTINGS, + ) # then, count the contained labels and compare with the target number (except if score contains voltas because then, # the .dez file might contain additional, repeated labels at the beginning of each ending). expanded = load_tsv(harmonies_path) - if 'volta' in expanded and expanded.volta.notna().any(): + if "volta" in expanded and expanded.volta.notna().any(): return - with open(out_path, 'r', encoding='utf-8') as f: + with open(out_path, "r", encoding="utf-8") as f: dezrann_file = json.load(f) type2column = { - 'Harmony': 'chord', - 'Cadence': 'cadence', - 'Phrase': 'phraseend', - 'Local Key': 'localkey', + "Harmony": "chord", + "Cadence": "cadence", + "Phrase": "phraseend", + "Local Key": "localkey", } - written_labels = dict(Counter(type2column[label['type']] for label in dezrann_file['labels'])) + written_labels = dict( + Counter(type2column[label["type"]] for label in dezrann_file["labels"]) + ) expected_counts = dict( - chord = expanded['chord'].notna().sum(), - cadence = expanded['cadence'].notna().sum(), - phraseend = expanded['phraseend'].str.contains('{').sum(), - localkey = get_value_profile_mask(expanded['localkey']).sum(), + chord=expanded["chord"].notna().sum(), + cadence=expanded["cadence"].notna().sum(), + phraseend=expanded["phraseend"].str.contains("{").sum(), + 
localkey=get_value_profile_mask(expanded["localkey"]).sum(), ) assert written_labels == expected_counts + def test_parse2dez(mozart_piano_sonatas): """This test creates two .dez files per piece and checks if they are identical. One is created from the DataFrames as parsed by the ms3.Parse() object, and the other is created directly from the TSV files. """ file_re = "|".join(MOZART_MOVEMENTS) - p = Parse(mozart_piano_sonatas,file_re=file_re) - p.view.include('facets', 'measures', 'expanded') + p = Parse(mozart_piano_sonatas, file_re=file_re) + p.view.include("facets", "measures", "expanded") p.view.fnames_with_incomplete_facets = False p.parse_tsv() - facet_dataframes = p.get_facets(['expanded', 'measures'], concatenate=False, choose='auto') + facet_dataframes = p.get_facets( + ["expanded", "measures"], concatenate=False, choose="auto" + ) for (corpus, fname), facet2file_df_pair in facet_dataframes.items(): - measures_file, measures_df = facet2file_df_pair['measures'][0] - harmonies_file, harmonies_df = facet2file_df_pair['expanded'][0] + measures_file, measures_df = facet2file_df_pair["measures"][0] + harmonies_file, harmonies_df = facet2file_df_pair["expanded"][0] output_from_tsv = os.path.join(mozart_piano_sonatas, f"{fname}_from_tsv.dez") output_from_dfs = os.path.join(mozart_piano_sonatas, f"{fname}_from_df.dez") - generate_dez(path_measures=measures_file.full_path, - path_labels=harmonies_file.full_path, - output_path=output_from_tsv, - **SETTINGS) - generate_dez_from_dfs(measures_df=measures_df, - harmonies_df=harmonies_df, - output_path=output_from_dfs, - **SETTINGS) - dez_from_tsv = open(output_from_tsv, 'r', encoding='utf-8').read() - dez_from_dfs = open(output_from_dfs, 'r', encoding='utf-8').read() - assert_all_lines_equal(dez_from_tsv, dez_from_dfs, output_from_tsv, output_from_dfs) + generate_dez( + path_measures=measures_file.full_path, + path_labels=harmonies_file.full_path, + output_path=output_from_tsv, + **SETTINGS, + ) + generate_dez_from_dfs( + 
measures_df=measures_df, + harmonies_df=harmonies_df, + output_path=output_from_dfs, + **SETTINGS, + ) + dez_from_tsv = open(output_from_tsv, "r", encoding="utf-8").read() + dez_from_dfs = open(output_from_dfs, "r", encoding="utf-8").read() + assert_all_lines_equal( + dez_from_tsv, dez_from_dfs, output_from_tsv, output_from_dfs + ) diff --git a/tests/test_metarepo_files/test_docs_examples.py b/tests/test_metarepo_files/test_docs_examples.py index 7a4fc72b..50fcf972 100644 --- a/tests/test_metarepo_files/test_docs_examples.py +++ b/tests/test_metarepo_files/test_docs_examples.py @@ -2,11 +2,11 @@ import subprocess import pytest + from tests.conftest import DOCS_EXAMPLES_DIR -@pytest.fixture( - params=os.listdir(DOCS_EXAMPLES_DIR) -) + +@pytest.fixture(params=os.listdir(DOCS_EXAMPLES_DIR)) def example_script(request): return os.path.join(DOCS_EXAMPLES_DIR, request.param) @@ -14,4 +14,4 @@ def example_script(request): def test_examples(example_script): print(f"Running {example_script} ...") exit_value = subprocess.run(["python", example_script]) - exit_value.check_returncode() \ No newline at end of file + exit_value.check_returncode() diff --git a/tests/test_metarepo_files/test_transformations.py b/tests/test_metarepo_files/test_transformations.py index d6d67d81..5903636b 100644 --- a/tests/test_metarepo_files/test_transformations.py +++ b/tests/test_metarepo_files/test_transformations.py @@ -1,16 +1,31 @@ -from ms3 import fifths2sd, fifths2rn, fifths2name, fifths2iv +from ms3 import fifths2name, fifths2rn, fifths2sd + def test_fifths2sd(): - expected_major = [acc + sd for acc in ('bb', 'b', '', '#', '##') for sd in ('4', '1', '5', '2', '6', '3', '7')] - for fifths, exp_maj in zip(range(-15, 16), expected_major): + expected_major = [ + acc + sd + for acc in ("bb", "b", "", "#", "##") + for sd in ("4", "1", "5", "2", "6", "3", "7") + ] + for fifths, exp_maj in zip(range(-15, 16), expected_major): assert fifths2sd(fifths) == exp_maj + def test_fifths2rn(): - 
expected_major = [acc + rn for acc in ('bb', 'b', '', '#', '##') for rn in ('IV', 'I', 'V', 'II', 'VI', 'III', 'VII')] - for fifths, exp_maj in zip(range(-15, 16), expected_major): + expected_major = [ + acc + rn + for acc in ("bb", "b", "", "#", "##") + for rn in ("IV", "I", "V", "II", "VI", "III", "VII") + ] + for fifths, exp_maj in zip(range(-15, 16), expected_major): assert fifths2rn(fifths) == exp_maj -def test_fifths2rn(): - expected_major = [name+acc for acc in ('bb', 'b', '', '#', '##') for name in ('F', 'C', 'G', 'D', 'A', 'E', 'B')] - for fifths, exp_maj in zip(range(-15, 16), expected_major): + +def test_fifths2name(): + expected_major = [ + name + acc + for acc in ("bb", "b", "", "#", "##") + for name in ("F", "C", "G", "D", "A", "E", "B") + ] + for fifths, exp_maj in zip(range(-15, 16), expected_major): assert fifths2name(fifths=fifths) == exp_maj From 7fc77339a4e03435806561c4bd96979002773110 Mon Sep 17 00:00:00 2001 From: Johannes Hentschel Date: Mon, 11 Sep 2023 19:36:08 +0200 Subject: [PATCH 27/44] factors out test_dezrann.py into folder dedicated to tests on Mozart --- tests/conftest.py | 14 ---------- tests/tests_on_mozart_repo/__init__.py | 0 .../test_dezrann.py | 28 +++++++++++++++++-- 3 files changed, 25 insertions(+), 17 deletions(-) create mode 100644 tests/tests_on_mozart_repo/__init__.py rename tests/{test_metarepo_files => tests_on_mozart_repo}/test_dezrann.py (83%) diff --git a/tests/conftest.py b/tests/conftest.py index 5f66a105..3b5e51b2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -40,20 +40,6 @@ def small_directory(directory): return path -@pytest.fixture(scope="session") -def mozart_piano_sonatas() -> str: - """Get the path to local clone of DCMLab/mozart_piano_sonatas""" - path = os.path.join(os.path.expanduser(CORPUS_DIR), "mozart_piano_sonatas") - if not os.path.isdir(path): - print( - f"Directory does not exist: {path} Clone DCMLab/mozart_piano_sonatas into the CORPUS_DIR specified above." 
- ) - assert os.path.isdir(path) - repo = Repo(path) - yield path - repo.git.clean("-fdx") # removes new files potentially generated during test - - @pytest.fixture( scope="session", params=[ diff --git a/tests/tests_on_mozart_repo/__init__.py b/tests/tests_on_mozart_repo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_metarepo_files/test_dezrann.py b/tests/tests_on_mozart_repo/test_dezrann.py similarity index 83% rename from tests/test_metarepo_files/test_dezrann.py rename to tests/tests_on_mozart_repo/test_dezrann.py index a5b5a1c7..1a6915a6 100644 --- a/tests/test_metarepo_files/test_dezrann.py +++ b/tests/tests_on_mozart_repo/test_dezrann.py @@ -3,9 +3,32 @@ from collections import Counter import pytest -from ms3 import Parse +from git import Repo +from ms3 import ( + Parse, + assert_all_lines_equal, + get_value_profile_mask, + load_tsv, + resolve_dir, +) from ms3.dezrann import generate_dez, generate_dez_from_dfs -from ms3.utils import assert_all_lines_equal, get_value_profile_mask, load_tsv + +MOZART_PIANO_SONATAS = "~/all_subcorpora/mozart_piano_sonatas" + + +@pytest.fixture(scope="session") +def mozart_piano_sonatas() -> str: + """Get the path to local clone of DCMLab/mozart_piano_sonatas""" + path = resolve_dir(MOZART_PIANO_SONATAS) + if not os.path.isdir(path): + print( + f"Directory does not exist: {path} Clone DCMLab/mozart_piano_sonatas into the CORPUS_DIR specified above." 
+ ) + assert os.path.isdir(path) + repo = Repo(path) + yield path + repo.git.clean("-fdx") # removes new files potentially generated during test + MOZART_MOVEMENTS = [ "K279-1", @@ -18,7 +41,6 @@ "K283-2", "K283-3", ] - SETTINGS = dict(cadences=True, harmonies=4, keys=5, phrases=6) From 45cdf718f2b01de0b00c603bc0ea5f4b41e8adc8 Mon Sep 17 00:00:00 2001 From: Louis Couturier Date: Wed, 13 Sep 2023 11:40:20 +0200 Subject: [PATCH 28/44] add measure map file with docstring --- src/ms3/measure_map.py | 64 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 src/ms3/measure_map.py diff --git a/src/ms3/measure_map.py b/src/ms3/measure_map.py new file mode 100644 index 00000000..a336020e --- /dev/null +++ b/src/ms3/measure_map.py @@ -0,0 +1,64 @@ +""" +DCML to Measure Map +=================== + +Script to generate a 'measure map' in JSON format from DCML measure +descriptions in TSV format. + +Output: +JSON file (.json) containing the measure-map labels, aligned with the score. +Here is an example of measure map file structure, in compressed form: +''' +{ + "meter": [ + { + "qstamp": 0, + "name": "0", + "count": 1, + "time_signature": "4/4", + "actual_duration": 0.5, + }, + {"qstamp": 24.5, "name": "7a", "count": 8}, + {"qstamp": 28.5, "name": "7b", "count": 9}, + {"qstamp": 32.5, "number": "8", "count": 10, "time-signature": "3/4"} + ] +} +''' + +Keywords in measure-map labels: +* `qstamp`: float, required. Value indicating the duration, in quarter length, between +the score's beginning (position 0), and the beginning of current measure. Required in all labels. +* `name`: str, required. Nominal numbering for the measure, including repeated measure +(e.g. "6", "7a", "7b"). Required in each label. +* `number`: int. A number assigned to the measure - usually included in the "name". +* `count`: int. 
Ordinal position of the 'printed' measure in the piece, so that the +first has count 1 and the last has the value of the total number of printed measures +in the piece. This value is unique for each measure of the piece. +* `id`: str. Unique identifier of the measure in the piece. It defaults to a string +of the `count` value. +* `time_signature`: str. Fraction corresponding to the time signature in the +measure. By default, the time signature is equal to the last previously labelled time +signature. Indicating a new time signature overrides this value. +* `nominal_length`: float. Standard length, in quarter values, for the current time signature. +* `actual_length`: float. Actual length, in quarter values, of the measure. By default, +the actual length is equal to the nominal length. + * For example, an anacrusis with time signature "4/4" which actually lasts only +one quarter note will have an `actual_length` of 1.0. +* `start-repeat`: bool. If True, indicates a start repeat at the beginning of the measure. +Defaults to False. +* `end-repeat`: bool. If True, indicates an end repeat at the end of the measure. Defaults to False. +* `next`: list of int. By default, the list contains only one value: the following measure, +referred to by its `count` value. Other values can be added in the case of repeats, +or second endings, for example. + + +Used columns in TSV (Tab-Separated Values) DCML measures files: +* `mc`: measure count (XML measures, always starting from 1) +* ...
+ +See also: +* https://gitlab.com/algomus.fr/dezrann/dezrann/-/issues/1030#note_1122509147 +* https://github.com/MarkGotham/bar-measure/ + +""" +pass From 48ac790ad3d4d486354d103b8090eb96dff061e5 Mon Sep 17 00:00:00 2001 From: Louis Couturier Date: Wed, 13 Sep 2023 13:31:37 +0200 Subject: [PATCH 29/44] add main measure map generation function --- src/ms3/measure_map.py | 175 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 174 insertions(+), 1 deletion(-) diff --git a/src/ms3/measure_map.py b/src/ms3/measure_map.py index a336020e..ae262d43 100644 --- a/src/ms3/measure_map.py +++ b/src/ms3/measure_map.py @@ -61,4 +61,177 @@ * https://github.com/MarkGotham/bar-measure/ """ -pass +import json +import os +import re +from fractions import Fraction +from pprint import pprint +from typing import List, Union + +import pandas as pd + +#### +# Utils +#### + + +def safe_frac(s: str) -> Union[Fraction, str]: + try: + return Fraction(s) + except Exception: + return + + +def get_int(mn: Union[str, int]) -> int: + """ + Return the integer part of a string measure name. + For example, `get_int("7a")` returns 7. + """ + if isinstance(mn, int): + return mn + regex_mn = r"(\d+)([a-z]?)" + regex_match = re.match(regex_mn, str(mn)) + if not regex_match: + raise ValueError(f"'{mn}' is not a valid measure number.") + return int(regex_match.group(1)) + + +#### +# Main function +#### + + +def generate_measure_map(file: str, output_file=None, compressed=True) -> List[dict]: + """ + Generate a measure map in JSON from + + Args: + file: Path to a '_measures.tsv' file in the 'measure' folder of a DCML corpus. + Requires the columns {'mc': int, 'mn': int, 'quarterbeats_all_endings': fractions.Fraction} (ms3 >= 1.0.0). + output_file: TODO + compressed: TODO + + Returns: + Measure map: list of measure description. 
+ """ + try: + measures_df = pd.read_csv( + file, + sep="\t", + dtype={"mc": int, "volta": "Int64"}, + converters={ + "quarterbeats_all_endings": safe_frac, + "quarterbeats": safe_frac, + "act_dur": safe_frac, + }, + ) + except (ValueError, AssertionError) as e: + raise ValueError( + f"{file} could not be loaded because of the following error:\n'{e}'" + ) + score_has_voltas = "quarterbeats_all_endings" in measures_df.columns + + measure_map = [] # Measure map, list + + previous_measure_dict = {"name": "0"} + current_time_sig = None + for i_measure, measure in measures_df.iterrows(): + measure_mn = int(measure.mn) + measure_mc = int(measure.mc) + measure_dict = {} + display_measure = not compressed # default value + + # Time signature, time signature upbeat + if Fraction(measure.timesig) != Fraction(measure.act_dur): + # Partial measure + display_measure = True + measure_dict["actual_duration"] = float( + measure.duration_qb + ) # str(measure.act_dur) + if measure.timesig != current_time_sig: + # New time signature + # (always the case for first measures: always displayed) + display_measure = True + measure_dict["time_signature"] = measure.timesig + current_time_sig = measure.timesig + # TODO: nominal_duration + + # Measure number + have_same_number = get_int(previous_measure_dict["name"]) == measure_mn + if i_measure > 0 and have_same_number: + # Not the next numbered measure + if measure_map[-1]["name"] != str(measure_mn): + # Add previous measure, which is needed (because not displayed yet) + measure_map.append(previous_measure_dict) + measure_map[-1]["name"] += "a" # Add letter to previous measure + measure_dict["name"] = str(measure_mn) + "b" # And to current measure + else: + measure_dict["name"] = str(measure_mn) + # measure_dict["number"] = measure_mn # if needed + measure_dict["count"] = measure_mc + + have_consecutive_number = ( + get_int(previous_measure_dict["name"]) + 1 == measure_mn + ) + if not have_consecutive_number: + # Display the new numbering 
(e.g. if measure numbers were skipped or reinitialized) + display_measure = True + + # Onsets / qstamp + if not score_has_voltas: + measure_dict["qstamp"] = float(measure.quarterbeats) + else: + measure_dict["qstamp"] = float(measure.quarterbeats_all_endings) + + if display_measure: + # i.e. the measure need to be in the compressed version + measure_map.append(measure_dict) + previous_measure_dict = measure_dict + # TODO: always add last measure + + json_str = {"meter": measure_map} + if output_file: + with open(output_file, "w", encoding="utf-8") as f: + json.dump(json_str, f, indent=2) + + return measure_map + + +#### +# Main and tests +#### + + +def main_generate_mm( + pieces, output_dir=".", verbose=False, compressed=True, stops=False +): + for i, piece in enumerate(pieces): + output_path = os.path.join( + output_dir, os.path.basename(piece).replace(".tsv", "_mm.json") + ) + measure_map = generate_measure_map(piece, output_path, compressed=compressed) + if verbose: + print(os.path.basename(piece)) + pprint(measure_map) + if stops and i < len(pieces) - 1: + input("Press enter to continue...") + + +if __name__ == "__main__": + # Path to local Annotated Mozart Sonatas repository for testing + INPUT_DIR = os.path.join( # from ~ms3/src/ms3 as working directory + "..", "..", "..", "mozart_piano_sonatas", "measures" + ) + TEST_PIECES = ["K284-3"] + TEST_INPUT_PATHS = [ + os.path.join(INPUT_DIR, f"{piece}.tsv") for piece in TEST_PIECES + ] + OUTPUT_DIR = "." 
+ + main_generate_mm( + pieces=TEST_INPUT_PATHS, + output_dir=OUTPUT_DIR, + verbose=True, + stops=True, + compressed=True, + ) From 80a0b8939e7e147a8013948f70ea1bcb477a0927 Mon Sep 17 00:00:00 2001 From: Louis Couturier Date: Wed, 13 Sep 2023 14:48:27 +0200 Subject: [PATCH 30/44] add full_info mode, with all keywords --- src/ms3/measure_map.py | 98 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 80 insertions(+), 18 deletions(-) diff --git a/src/ms3/measure_map.py b/src/ms3/measure_map.py index ae262d43..92646e1a 100644 --- a/src/ms3/measure_map.py +++ b/src/ms3/measure_map.py @@ -101,7 +101,9 @@ def get_int(mn: Union[str, int]) -> int: #### -def generate_measure_map(file: str, output_file=None, compressed=True) -> List[dict]: +def generate_measure_map( + file: str, output_file=None, compressed=True, full_info=False +) -> List[dict]: """ Generate a measure map in JSON from @@ -109,10 +111,12 @@ def generate_measure_map(file: str, output_file=None, compressed=True) -> List[d file: Path to a '_measures.tsv' file in the 'measure' folder of a DCML corpus. Requires the columns {'mc': int, 'mn': int, 'quarterbeats_all_endings': fractions.Fraction} (ms3 >= 1.0.0). output_file: TODO - compressed: TODO + compressed: if True, add only necessary measures (first, last and notable changes). + full_info: if True, add all information in the measures' description. + Otherwise, only necessary information is displayed. Returns: - Measure map: list of measure description. + Measure map: list of measure descriptions. 
""" try: measures_df = pd.read_csv( @@ -133,30 +137,38 @@ def generate_measure_map(file: str, output_file=None, compressed=True) -> List[d measure_map = [] # Measure map, list + start_repeat_list = [] previous_measure_dict = {"name": "0"} current_time_sig = None for i_measure, measure in measures_df.iterrows(): measure_mn = int(measure.mn) measure_mc = int(measure.mc) measure_dict = {} - display_measure = not compressed # default value + """Description of current measure.""" + display_measure = False + """True if current measure needs to be added in compressed version.""" # Time signature, time signature upbeat - if Fraction(measure.timesig) != Fraction(measure.act_dur): - # Partial measure + is_partial_measure = Fraction(measure.timesig) != Fraction(measure.act_dur) + if is_partial_measure: display_measure = True + has_new_timesig = measure.timesig != current_time_sig + if has_new_timesig: + # (always the case for first measures: always displayed) + display_measure = True + + if full_info or is_partial_measure: measure_dict["actual_duration"] = float( measure.duration_qb ) # str(measure.act_dur) - if measure.timesig != current_time_sig: - # New time signature - # (always the case for first measures: always displayed) - display_measure = True + if full_info or has_new_timesig: measure_dict["time_signature"] = measure.timesig current_time_sig = measure.timesig - # TODO: nominal_duration + if full_info: + measure_dict["nominal_duration"] = Fraction(measure.timesig) * 4.0 # Measure number + # TODO: handle first/second endings which last more than 1 measure. 
have_same_number = get_int(previous_measure_dict["name"]) == measure_mn if i_measure > 0 and have_same_number: # Not the next numbered measure @@ -167,7 +179,8 @@ def generate_measure_map(file: str, output_file=None, compressed=True) -> List[d measure_dict["name"] = str(measure_mn) + "b" # And to current measure else: measure_dict["name"] = str(measure_mn) - # measure_dict["number"] = measure_mn # if needed + if full_info: + measure_dict["number"] = measure_mn measure_dict["count"] = measure_mc have_consecutive_number = ( @@ -183,14 +196,60 @@ def generate_measure_map(file: str, output_file=None, compressed=True) -> List[d else: measure_dict["qstamp"] = float(measure.quarterbeats_all_endings) - if display_measure: - # i.e. the measure need to be in the compressed version + # Identifier + if full_info: + # TODO: allow personalised id + # By default, from the 'count' + measure_dict["id"] = str(measure_dict["count"]) + + # Repeats, following measures + if full_info: + # TODO: cleaner conversion from tsv column? + measure_dict["next"] = [ + int(next_count) for next_count in measure.next.split(", ") + ] + + # Start/end repeats + # Warning: `start` and `end` columns in TSV does not always contain the information + measure_dict["start_repeat"] = False + measure_dict["end_repeat"] = False + for next_count in measure_dict["next"]: + if next_count == -1: + # Last measure + continue + if next_count < measure_dict["count"] + 1: + # Link to previous measure: there should be a repeat + measure_dict["end_repeat"] = True + + # Then, the target measure starts this repeat + # TODO: optimise this; avoid a second loop. 
+ start_repeat_list.append(next_count) + + # Other cases: + # next_count == current_count+1: expected link to next printed measure (no repeat) + # next_count > current_count+1: possible link to second ending (no repeat) + + # Display the measure + if not compressed or display_measure: measure_map.append(measure_dict) previous_measure_dict = measure_dict - # TODO: always add last measure + # Always add last measure + if previous_measure_dict != measure_map[-1]: + if compressed: + # Only need the last 'count' + measure_map.append({"count": previous_measure_dict["count"]}) + else: + measure_map.append(previous_measure_dict) - json_str = {"meter": measure_map} + # Update start repeats + if full_info: + for m in measure_map: + if m["count"] in start_repeat_list: + m["start_repeat"] = True + + # Save output if output_file: + json_str = {"meter": measure_map} with open(output_file, "w", encoding="utf-8") as f: json.dump(json_str, f, indent=2) @@ -203,13 +262,15 @@ def generate_measure_map(file: str, output_file=None, compressed=True) -> List[d def main_generate_mm( - pieces, output_dir=".", verbose=False, compressed=True, stops=False + pieces, output_dir=".", verbose=False, compressed=True, full_info=False, stops=False ): for i, piece in enumerate(pieces): output_path = os.path.join( output_dir, os.path.basename(piece).replace(".tsv", "_mm.json") ) - measure_map = generate_measure_map(piece, output_path, compressed=compressed) + measure_map = generate_measure_map( + piece, output_path, compressed=compressed, full_info=full_info + ) if verbose: print(os.path.basename(piece)) pprint(measure_map) @@ -234,4 +295,5 @@ def main_generate_mm( verbose=True, stops=True, compressed=True, + full_info=True, ) From 5ae3288a4cbf0c5da72c902b95a4f467c0a5d9a9 Mon Sep 17 00:00:00 2001 From: Louis Couturier Date: Wed, 13 Sep 2023 15:13:58 +0200 Subject: [PATCH 31/44] improve compressed mode --- src/ms3/measure_map.py | 48 +++++++++++++++++++++++++----------------- 1 file changed, 29 
insertions(+), 19 deletions(-) diff --git a/src/ms3/measure_map.py b/src/ms3/measure_map.py index 92646e1a..4eb0798a 100644 --- a/src/ms3/measure_map.py +++ b/src/ms3/measure_map.py @@ -138,8 +138,8 @@ def generate_measure_map( measure_map = [] # Measure map, list start_repeat_list = [] - previous_measure_dict = {"name": "0"} - current_time_sig = None + previous_measure_dict = {"name": "0", "count": 0} + current_time_sig = "None" for i_measure, measure in measures_df.iterrows(): measure_mn = int(measure.mn) measure_mc = int(measure.mc) @@ -150,24 +150,30 @@ def generate_measure_map( # Time signature, time signature upbeat is_partial_measure = Fraction(measure.timesig) != Fraction(measure.act_dur) - if is_partial_measure: - display_measure = True + """If True, indicates if the printed measure's duration does not match the + time signature nominal duration; for example, in anacruses.""" has_new_timesig = measure.timesig != current_time_sig - if has_new_timesig: - # (always the case for first measures: always displayed) + """If True, indicates a change of time signature. This is always the case + for the first measure.""" + if is_partial_measure or has_new_timesig: display_measure = True if full_info or is_partial_measure: - measure_dict["actual_duration"] = float( - measure.duration_qb - ) # str(measure.act_dur) + measure_dict["actual_duration"] = float(measure.duration_qb) if full_info or has_new_timesig: measure_dict["time_signature"] = measure.timesig current_time_sig = measure.timesig if full_info: measure_dict["nominal_duration"] = Fraction(measure.timesig) * 4.0 - # Measure number + # Measure count, identifier + measure_dict["count"] = measure_mc + if full_info: + # TODO: allow personalised id + # By default, also the measure counter + measure_dict["id"] = str(measure_mc) + + # Measure number, name # TODO: handle first/second endings which last more than 1 measure. 
have_same_number = get_int(previous_measure_dict["name"]) == measure_mn if i_measure > 0 and have_same_number: @@ -181,7 +187,6 @@ def generate_measure_map( measure_dict["name"] = str(measure_mn) if full_info: measure_dict["number"] = measure_mn - measure_dict["count"] = measure_mc have_consecutive_number = ( get_int(previous_measure_dict["name"]) + 1 == measure_mn @@ -196,12 +201,6 @@ def generate_measure_map( else: measure_dict["qstamp"] = float(measure.quarterbeats_all_endings) - # Identifier - if full_info: - # TODO: allow personalised id - # By default, from the 'count' - measure_dict["id"] = str(measure_dict["count"]) - # Repeats, following measures if full_info: # TODO: cleaner conversion from tsv column? @@ -241,6 +240,17 @@ def generate_measure_map( else: measure_map.append(previous_measure_dict) + # Simplify first measure + if ( + compressed + and (measure_map[0]["count"] == 1) + and (measure_map[0]["name"] == "1") + and (measure_map[0]["qstamp"] < 1e-10) + ): + del measure_map[0]["count"] + del measure_map[0]["name"] + del measure_map[0]["qstamp"] + # Update start repeats if full_info: for m in measure_map: @@ -283,7 +293,7 @@ def main_generate_mm( INPUT_DIR = os.path.join( # from ~ms3/src/ms3 as working directory "..", "..", "..", "mozart_piano_sonatas", "measures" ) - TEST_PIECES = ["K284-3"] + TEST_PIECES = ["K284-1", "K284-3"] TEST_INPUT_PATHS = [ os.path.join(INPUT_DIR, f"{piece}.tsv") for piece in TEST_PIECES ] @@ -295,5 +305,5 @@ def main_generate_mm( verbose=True, stops=True, compressed=True, - full_info=True, + full_info=False, ) From 341c61067868a7204081c974aa8e3d73f38c8f47 Mon Sep 17 00:00:00 2001 From: Johannes Hentschel Date: Mon, 18 Sep 2023 23:42:27 +0200 Subject: [PATCH 32/44] computes quarterbeats_all_endings only if not present already --- src/ms3/dezrann.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/ms3/dezrann.py b/src/ms3/dezrann.py index 5b8449fc..47003f30 100644 --- a/src/ms3/dezrann.py 
+++ b/src/ms3/dezrann.py @@ -240,13 +240,16 @@ def dcml_labels2dicts( end_of_score += float(last_mc_row.quarterbeats) else: # the column 'quarterbeats_all_endings' is present, meaning the piece has first and second endings and the - # quarterbeats, which normally leave out first endings, need to be recomputed - end_of_score += float(last_mc_row.quarterbeats_all_endings) - M = measures.set_index("mc") - offset_dict = M["quarterbeats_all_endings"] - quarterbeats = labels["mc"].map(offset_dict) - quarterbeats = quarterbeats + (labels.mc_onset * 4.0) - quarterbeats.rename("quarterbeats", inplace=True) + # quarterbeats_all_endings (aka qstamp) need to be computed if not present + if "quarterbeats_all_endings" in labels.columns: + quarterbeats = labels["quarterbeats_all_endings"] + else: + end_of_score += float(last_mc_row.quarterbeats_all_endings) + M = measures.set_index("mc") + offset_dict = M["quarterbeats_all_endings"] + quarterbeats = labels["mc"].map(offset_dict) + quarterbeats = quarterbeats + (labels.mc_onset * 4.0) + quarterbeats = quarterbeats.rename("quarterbeats") # also, the first beat of each volta needs to have a label for computing correct durations volta_groups = get_volta_groups(M.volta) label_and_qb = pd.concat( From 41e48b99c5cc82367ed191825015b11a656b9095 Mon Sep 17 00:00:00 2001 From: Johannes Hentschel Date: Tue, 19 Sep 2023 15:33:35 +0200 Subject: [PATCH 33/44] adds review_cmd to debugging.py --- tests/test_metarepo_files/debugging.py | 34 +++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/tests/test_metarepo_files/debugging.py b/tests/test_metarepo_files/debugging.py index ef23014d..b3621cf8 100644 --- a/tests/test_metarepo_files/debugging.py +++ b/tests/test_metarepo_files/debugging.py @@ -9,10 +9,12 @@ import os.path from ms3 import Parse +from ms3.cli import get_arg_parser, review_cmd from ms3.logger import get_logger from ms3.operations import transform_to_resources +from ms3.utils import resolve_dir 
-CORPUS_PATH = r"C:\Users\hentsche\baroque_keyboard_corpus\bach_en_fr_suites" +CORPUS_PATH = "~/all_subcorpora/wagner_overtures" def ignoring_warning(): @@ -27,6 +29,7 @@ def ignoring_warning(): def parse_object() -> Parse: + """Created by executing an ms3 command and coping the object initializing from the output.""" p = Parse( CORPUS_PATH, recursive=True, @@ -46,9 +49,34 @@ def parse_object() -> Parse: def extraction(): - """Created by executing an ms3 command and coping the object initializing from the output.""" p = parse_object() p.parse_scores() + _ = p.get_facet("expanded") + # p.store_extracted_facets( + # root_dir=root_dir, + # notes_folder=notes_folder, + # rests_folder=rests_folder, + # notes_and_rests_folder=notes_and_rests_folder, + # measures_folder=measures_folder, + # events_folder=events_folder, + # labels_folder=labels_folder, + # chords_folder=chords_folder, + # expanded_folder=expanded_folder, + # cadences_folder=cadences_folder, + # form_labels_folder=form_labels_folder, + # metadata_suffix=metadata_suffix, + # markdown=markdown, + # simulate=simulate, + # unfold=unfold, + # interval_index=interval_index, + # ) + return + + +def execute_review_cmd(): + parser = get_arg_parser() + args = parser.parse_args(["review", "-d", resolve_dir(CORPUS_PATH)]) + review_cmd(args) def transform_cmd(): @@ -64,4 +92,4 @@ def transform_cmd(): if __name__ == "__main__": - transform_cmd() + execute_review_cmd() From 2ae5d72c5bed4603a69e92e4bcdb48fbe2c816e9 Mon Sep 17 00:00:00 2001 From: Johannes Hentschel Date: Tue, 19 Sep 2023 15:35:23 +0200 Subject: [PATCH 34/44] resolves #99 --- src/ms3/annotations.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ms3/annotations.py b/src/ms3/annotations.py index 1647c55c..6c25ea51 100644 --- a/src/ms3/annotations.py +++ b/src/ms3/annotations.py @@ -540,7 +540,10 @@ def expand_dcml( "To retain the old behavior, use either.*" ), ) - df.loc[select_dcml, exp.columns] = exp + exp_shared_cols = 
exp.columns.isin(df.columns.values) + df_shared_cols = df.columns.isin(exp.columns.values) + df.loc[select_dcml, df_shared_cols] = exp.loc[:, exp_shared_cols] + df = pd.concat([df, exp.loc[:, ~exp_shared_cols]], axis=1) df.loc[:, key_cols] = df[key_cols].ffill() self._expanded = df drop_cols = [ From e31c2fc419c33a8f02e44657f70da009e8d21224 Mon Sep 17 00:00:00 2001 From: Johannes Hentschel Date: Tue, 19 Sep 2023 17:45:59 +0200 Subject: [PATCH 35/44] enables extracting measure maps using ms3 extract -MM --- src/ms3/cli.py | 12 ++++++ src/ms3/corpus.py | 43 ++++++++++++++++++--- src/ms3/operations.py | 3 ++ src/ms3/parse.py | 2 + src/ms3/utils/frictionless_helpers.py | 21 +++++++++-- src/ms3/utils/functions.py | 54 +++++++++++++++++++++++++++ 6 files changed, 125 insertions(+), 10 deletions(-) diff --git a/src/ms3/cli.py b/src/ms3/cli.py index ee64fd0c..f44b568a 100644 --- a/src/ms3/cli.py +++ b/src/ms3/cli.py @@ -43,6 +43,7 @@ def gather_extract_params(args) -> List[str]: for name, arg in zip( ( "measures", + "measure_maps", "notes", "rests", "labels", @@ -53,6 +54,7 @@ def gather_extract_params(args) -> List[str]: ), ( args.measures, + args.measure_maps, args.notes, args.rests, args.labels, @@ -213,6 +215,7 @@ def extract_cmd(args, parse_obj: Optional[Parse] = None): notes_folder=args.notes, labels_folder=args.labels, measures_folder=args.measures, + measure_maps_folder=args.measure_maps, rests_folder=args.rests, events_folder=args.events, chords_folder=args.chords, @@ -685,6 +688,15 @@ def get_arg_parser(): const="../measures", help="Folder where to store TSV files with measure information needed for tasks such as unfolding repetitions.", ) + extract_args.add_argument( + "-MM", + "--measure_maps", + metavar="folder", + nargs="?", + const="../measures", + help="Folder where to store measuremap.json files. 
They are a variant of the 'normal' --measures with renamed " + "columns, satisfying the MeasureMap specification.", + ) extract_args.add_argument( "-N", "--notes", diff --git a/src/ms3/corpus.py b/src/ms3/corpus.py index 35dee635..91264d76 100644 --- a/src/ms3/corpus.py +++ b/src/ms3/corpus.py @@ -19,8 +19,11 @@ import numpy as np import pandas as pd import pathos.multiprocessing as mp -from ms3.utils.frictionless_helpers import store_dataframe_resource -from ms3.utils.functions import compute_path_from_file +from ms3.utils.frictionless_helpers import ( + store_as_json_or_yaml, + store_dataframe_resource, +) +from ms3.utils.functions import compute_path_from_file, measures2measure_map from ._typing import ( AnnotationsFacet, @@ -3046,6 +3049,7 @@ def store_extracted_facets( view_name: Optional[str] = None, root_dir: Optional[str] = None, measures_folder: Optional[str] = None, + measure_maps_folder: Optional[str] = None, notes_folder: Optional[str] = None, rests_folder: Optional[str] = None, notes_and_rests_folder: Optional[str] = None, @@ -3097,11 +3101,19 @@ def store_extracted_facets( folder_params = { t: lcls[p] for t, p in zip(df_types, folder_vars) if lcls[p] is not None } - output_metadata = metadata_suffix is not None - if len(folder_params) == 0 and not output_metadata: + do_store_metadata = metadata_suffix is not None + do_store_measure_maps = measure_maps_folder is not None + if ( + len(folder_params) == 0 + and not do_store_metadata + and not do_store_measure_maps + ): self.logger.warning("Pass at least one parameter to store files.") return [] facets = list(folder_params.keys()) + do_store_measures_tsv = "measures" in facets + if do_store_measure_maps and not do_store_measures_tsv: + facets.append("measures") df_params = {p: True for p in folder_params.keys()} n_scores = len(self._get_parsed_score_files(view_name=view_name, flat=True)) paths = [] @@ -3124,11 +3136,30 @@ def store_extracted_facets( for facet, df in facet2dataframe.items(): if df is None: 
continue + piece_name = file.piece + if facet == "measures" and do_store_measure_maps: + directory = compute_path_from_file( + file, root_dir=root_dir, folder=measure_maps_folder + ) + file_path = os.path.join( + directory, f"{piece}.measures.json" + ) + if simulate: + self.logger.info( + f"Would have stored the MeasureMap from {file.rel_path} as {file_path}." + ) + else: + measure_map = measures2measure_map(df) + measure_map_json = measure_map.to_json(orient="records") + store_as_json_or_yaml( + measure_map_json, file_path, logger=self.logger + ) + if not do_store_measures_tsv: + continue folder = folder_params[facet] directory = compute_path_from_file( file, root_dir=root_dir, folder=folder ) - piece_name = file.piece if unfold: piece_name += "_unfolded" facet_param = "harmonies" if facet == "expanded" else facet @@ -3153,7 +3184,7 @@ def store_extracted_facets( logger=self.logger, ) paths.append(descriptor_or_resource_path) - if output_metadata: + if do_store_metadata: if not markdown: metadata_paths = self.update_metadata_tsv_from_parsed_scores( root_dir=root_dir, suffix=metadata_suffix, markdown_file=None diff --git a/src/ms3/operations.py b/src/ms3/operations.py index 5671d59d..c8e334a8 100644 --- a/src/ms3/operations.py +++ b/src/ms3/operations.py @@ -117,6 +117,7 @@ def extract( rests_folder: Optional[str] = None, notes_and_rests_folder: Optional[str] = None, measures_folder: Optional[str] = None, + measure_maps_folder: Optional[str] = None, events_folder: Optional[str] = None, labels_folder: Optional[str] = None, chords_folder: Optional[str] = None, @@ -141,6 +142,7 @@ def extract( rests_folder=rests_folder, notes_and_rests_folder=notes_and_rests_folder, measures_folder=measures_folder, + measure_maps_folder=measure_maps_folder, events_folder=events_folder, labels_folder=labels_folder, chords_folder=chords_folder, @@ -166,6 +168,7 @@ def extract( rests_folder=rests_folder, notes_and_rests_folder=notes_and_rests_folder, measures_folder=measures_folder, + 
measure_maps_folder=measure_maps_folder, events_folder=events_folder, labels_folder=labels_folder, chords_folder=chords_folder, diff --git a/src/ms3/parse.py b/src/ms3/parse.py index 2fa65bf9..aaf36e31 100644 --- a/src/ms3/parse.py +++ b/src/ms3/parse.py @@ -1678,6 +1678,7 @@ def store_extracted_facets( view_name: Optional[str] = None, root_dir: Optional[str] = None, measures_folder: Optional[str] = None, + measure_maps_folder: Optional[str] = None, notes_folder: Optional[str] = None, rests_folder: Optional[str] = None, notes_and_rests_folder: Optional[str] = None, @@ -1724,6 +1725,7 @@ def store_extracted_facets( view_name=view_name, root_dir=root_dir, measures_folder=measures_folder, + measure_maps_folder=measure_maps_folder, notes_folder=notes_folder, rests_folder=rests_folder, notes_and_rests_folder=notes_and_rests_folder, diff --git a/src/ms3/utils/frictionless_helpers.py b/src/ms3/utils/frictionless_helpers.py index 28535383..2d2c8ce7 100644 --- a/src/ms3/utils/frictionless_helpers.py +++ b/src/ms3/utils/frictionless_helpers.py @@ -1,5 +1,6 @@ import hashlib import json +import logging import os import re from ast import literal_eval @@ -330,20 +331,32 @@ def get_schema( return result -def store_as_json_or_yaml(descriptor_dict: dict, descriptor_path: str, logger=None): +def store_as_json_or_yaml( + descriptor_dict: dict | str, + descriptor_path: str, + logger: Optional[logging.Logger | str] = None, +): if logger is None: logger = module_logger elif isinstance(logger, str): logger = get_logger(logger) + directory = os.path.dirname(descriptor_path) + os.makedirs(directory, exist_ok=True) if descriptor_path.endswith(".yaml"): with open(descriptor_path, "w") as f: - yaml.dump(descriptor_dict, f) + if isinstance(descriptor_dict, str): + f.write(descriptor_dict) + else: + yaml.dump(descriptor_dict, f) elif descriptor_path.endswith(".json"): with open(descriptor_path, "w") as f: - json.dump(descriptor_dict, f, indent=2) + if isinstance(descriptor_dict, str): + 
f.write(descriptor_dict) + else: + json.dump(descriptor_dict, f, indent=2) else: raise ValueError( - f"Descriptor path must end with .yaml or .json: {descriptor_path}" + f"descriptor_path must end with .yaml or .json: {descriptor_path}" ) logger.info(f"Stored descriptor at {descriptor_path}.") diff --git a/src/ms3/utils/functions.py b/src/ms3/utils/functions.py index e21f05e6..4185bd88 100644 --- a/src/ms3/utils/functions.py +++ b/src/ms3/utils/functions.py @@ -6564,3 +6564,57 @@ def get_value_profile_mask( if reindex_flag: return beginnings.reindex(series.index) return beginnings + + +def measures2measure_map(df: pd.DataFrame) -> pd.DataFrame: + """Turns the given measures table into a table corresponding to the MeasureMap specification. This includes + renaming columns and applying mild transformations to some of them. The resulting measure map can be converted + to JSON format via ``df.to_json(orient='records')``. + + Args: + df: Measures table for a single piece. + + Returns: + The measure map where each row corresponds to one entry. + + Raises: + ValueError: + If there are NA values in the quarterbeats column. If a quarterbeats_all_endings column is present, + which should never include NA values, it will be used for the "qstamp" column. 
+ """ + # renaming columns + renaming_dict = { + "mc": "count", + "mn": "number", + "timesig": "time-signature", + } + mm_columns = list(renaming_dict.values()) + measure_map = df.rename(columns=renaming_dict)[mm_columns] + + # quarterbeats_all_endings only present in measures if score has voltas + if "quarterbeats_all_endings" in df: + qstamp_col = df["quarterbeats_all_endings"] + else: + qstamp_col = df["quarterbeats"] + if qstamp_col.isna().any(): + raise ValueError(f"There are NA values in the column {qstamp_col.name!r}.") + qstamp_col = qstamp_col.astype(float).rename("qstamp") + nominal_col = (df.timesig.map(Fraction) * 4.0).rename("nominal_length") + actual_col = (df.act_dur * 4.0).rename("actual_length") + start_repeat_col = df.repeats.str.contains("start").rename("start_repeat") + end_repeat_col = df.repeats.str.contains("end").rename("end_repeat") + next_col = df.next.map(list) + + measure_map = pd.concat( + [ + measure_map, + qstamp_col, + nominal_col, + actual_col, + start_repeat_col, + end_repeat_col, + next_col, + ], + axis=1, + ) + return measure_map From 632364dbf8cae8c907802212971862fffa8d56a5 Mon Sep 17 00:00:00 2001 From: Johannes Hentschel Date: Thu, 21 Sep 2023 01:03:30 +0200 Subject: [PATCH 36/44] changes MeasureMap suffix to measuremap.json --- src/ms3/corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ms3/corpus.py b/src/ms3/corpus.py index 91264d76..e0cdf1c1 100644 --- a/src/ms3/corpus.py +++ b/src/ms3/corpus.py @@ -3142,7 +3142,7 @@ def store_extracted_facets( file, root_dir=root_dir, folder=measure_maps_folder ) file_path = os.path.join( - directory, f"{piece}.measures.json" + directory, f"{piece}.measuremap.json" ) if simulate: self.logger.info( From a179bb24ae035ac0a6b325f246f3685ac2d451e3 Mon Sep 17 00:00:00 2001 From: Johannes Hentschel Date: Thu, 21 Sep 2023 09:37:45 +0200 Subject: [PATCH 37/44] moves measures2measure_map() to ms3.transformations --- src/ms3/corpus.py | 3 ++- src/ms3/transformations.py | 
55 ++++++++++++++++++++++++++++++++++++++ src/ms3/utils/functions.py | 54 ------------------------------------- 3 files changed, 57 insertions(+), 55 deletions(-) diff --git a/src/ms3/corpus.py b/src/ms3/corpus.py index e0cdf1c1..5be46515 100644 --- a/src/ms3/corpus.py +++ b/src/ms3/corpus.py @@ -23,7 +23,7 @@ store_as_json_or_yaml, store_dataframe_resource, ) -from ms3.utils.functions import compute_path_from_file, measures2measure_map +from ms3.utils.functions import compute_path_from_file from ._typing import ( AnnotationsFacet, @@ -46,6 +46,7 @@ ) from .piece import Piece from .score import Score, compare_two_score_objects +from .transformations import measures2measure_map from .utils import ( File, ask_user_to_choose, diff --git a/src/ms3/transformations.py b/src/ms3/transformations.py index d5ffc2d3..01b1e316 100644 --- a/src/ms3/transformations.py +++ b/src/ms3/transformations.py @@ -1,6 +1,7 @@ """Functions for transforming DataFrames as output by ms3.""" import sys import warnings +from fractions import Fraction from fractions import Fraction as frac from functools import reduce from typing import Dict, List, Optional, Tuple, Union @@ -1226,6 +1227,60 @@ def gantt_data(df): return res +def measures2measure_map(df: pd.DataFrame) -> pd.DataFrame: + """Turns the given measures table into a table corresponding to the MeasureMap specification. This includes + renaming columns and applying mild transformations to some of them. The resulting measure map can be converted + to JSON format via ``df.to_json(orient='records')``. + + Args: + df: Measures table for a single piece. + + Returns: + The measure map where each row corresponds to one entry. + + Raises: + ValueError: + If there are NA values in the quarterbeats column. If a quarterbeats_all_endings column is present, + which should never include NA values, it will be used for the "qstamp" column. 
+ """ + # renaming columns + renaming_dict = { + "mc": "count", + "mn": "number", + "timesig": "time-signature", + } + mm_columns = list(renaming_dict.values()) + measure_map = df.rename(columns=renaming_dict)[mm_columns] + + # quarterbeats_all_endings only present in measures if score has voltas + if "quarterbeats_all_endings" in df: + qstamp_col = df["quarterbeats_all_endings"] + else: + qstamp_col = df["quarterbeats"] + if qstamp_col.isna().any(): + raise ValueError(f"There are NA values in the column {qstamp_col.name!r}.") + qstamp_col = qstamp_col.astype(float).rename("qstamp") + nominal_col = (df.timesig.map(Fraction) * 4.0).rename("nominal_length") + actual_col = (df.act_dur * 4.0).rename("actual_length") + start_repeat_col = df.repeats.str.contains("start").rename("start_repeat") + end_repeat_col = df.repeats.str.contains("end").rename("end_repeat") + next_col = df.next.map(list) + + measure_map = pd.concat( + [ + measure_map, + qstamp_col, + nominal_col, + actual_col, + start_repeat_col, + end_repeat_col, + next_col, + ], + axis=1, + ) + return measure_map + + def notes2pcvs( notes, pitch_class_format="tpc", diff --git a/src/ms3/utils/functions.py b/src/ms3/utils/functions.py index 4185bd88..e21f05e6 100644 --- a/src/ms3/utils/functions.py +++ b/src/ms3/utils/functions.py @@ -6564,57 +6564,3 @@ def get_value_profile_mask( if reindex_flag: return beginnings.reindex(series.index) return beginnings - - -def measures2measure_map(df: pd.DataFrame) -> pd.DataFrame: - """Turns the given measures table into a table corresponding to the MeasureMap specification. This includes - renaming columns and applying mild transformations to some of them. The resulting measure map can be converted - to JSON format via ``df.to_json(orient='records')``. - - Args: - df: Measures table for a single piece. - - Returns: - The measure map where each row corresponds to one entry. - - Raises: - ValueError: - If there are NA values in the quarterbeats column. 
If a quarterbeats_all_endings column is present, - which should never include NA values, it will be used for the "qstamp" column. - """ - # renaming columns - renaming_dict = { - "mc": "count", - "mn": "number", - "timesig": "time-signature", - } - mm_columns = list(renaming_dict.values()) - measure_map = df.rename(columns=renaming_dict)[mm_columns] - - # quarterbeats_all_endings only present in measures if score has voltas - if "quarterbeats_all_endings" in df: - qstamp_col = df["quarterbeats_all_endings"] - else: - qstamp_col = df["quarterbeats"] - if qstamp_col.isna().any(): - raise ValueError(f"There are NA values in the column {qstamp_col.name!r}.") - qstamp_col = qstamp_col.astype(float).rename("qstamp") - nominal_col = (df.timesig.map(Fraction) * 4.0).rename("nominal_length") - actual_col = (df.act_dur * 4.0).rename("actual_length") - start_repeat_col = df.repeats.str.contains("start").rename("start_repeat") - end_repeat_col = df.repeats.str.contains("end").rename("end_repeat") - next_col = df.next.map(list) - - measure_map = pd.concat( - [ - measure_map, - qstamp_col, - nominal_col, - actual_col, - start_repeat_col, - end_repeat_col, - next_col, - ], - axis=1, - ) - return measure_map From 5c7293283fd153cc8fe2edaed39bc116c3f75019 Mon Sep 17 00:00:00 2001 From: Johannes Hentschel Date: Thu, 21 Sep 2023 09:40:39 +0200 Subject: [PATCH 38/44] has MeasureMap fields start/end_repeat default to False --- src/ms3/transformations.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ms3/transformations.py b/src/ms3/transformations.py index 01b1e316..9b74d87b 100644 --- a/src/ms3/transformations.py +++ b/src/ms3/transformations.py @@ -1262,8 +1262,10 @@ def measures2measure_map(df: pd.DataFrame) -> pd.DataFrame: qstamp_col = qstamp_col.astype(float).rename("qstamp") nominal_col = (df.timesig.map(Fraction) * 4.0).rename("nominal_length") actual_col = (df.act_dur * 4.0).rename("actual_length") - start_repeat_col = 
df.repeats.str.contains("start").rename("start_repeat") - end_repeat_col = df.repeats.str.contains("end").rename("end_repeat") + start_repeat_col = ( + df.repeats.str.contains("start").fillna(False).rename("start_repeat") + ) + end_repeat_col = df.repeats.str.contains("end").fillna(False).rename("end_repeat") next_col = df.next.map(list) measure_map = pd.concat( From ead673f3b70ca968d232c9abad2df82f554ea36c Mon Sep 17 00:00:00 2001 From: Johannes Hentschel Date: Thu, 21 Sep 2023 15:09:34 +0200 Subject: [PATCH 39/44] writes measuremap.json with indent=2 --- src/ms3/corpus.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ms3/corpus.py b/src/ms3/corpus.py index 5be46515..9177af0c 100644 --- a/src/ms3/corpus.py +++ b/src/ms3/corpus.py @@ -3151,7 +3151,9 @@ def store_extracted_facets( ) else: measure_map = measures2measure_map(df) - measure_map_json = measure_map.to_json(orient="records") + measure_map_json = measure_map.to_json( + orient="records", indent=2 + ) store_as_json_or_yaml( measure_map_json, file_path, logger=self.logger ) From cf1c16f4abe6f9f9a98d78557fa401d39b57e6e3 Mon Sep 17 00:00:00 2001 From: Johannes Hentschel Date: Thu, 21 Sep 2023 15:27:14 +0200 Subject: [PATCH 40/44] corrects time-signature => time_signature --- src/ms3/transformations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ms3/transformations.py b/src/ms3/transformations.py index 9b74d87b..92a604a4 100644 --- a/src/ms3/transformations.py +++ b/src/ms3/transformations.py @@ -1247,7 +1247,7 @@ def measures2measure_map(df: pd.DataFrame) -> pd.DataFrame: renaming_dict = { "mc": "count", "mn": "number", - "timesig": "time-signature", + "timesig": "time_signature", } mm_columns = list(renaming_dict.values()) measure_map = df.rename(columns=renaming_dict)[mm_columns] From b0d50f72dd43188fe061796af64a976f98075fe8 Mon Sep 17 00:00:00 2001 From: Johannes Hentschel Date: Thu, 21 Sep 2023 15:41:29 +0200 Subject: [PATCH 41/44] prevents panda.to_json's 
strange escaping of characters --- src/ms3/corpus.py | 4 +--- src/ms3/utils/frictionless_helpers.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/ms3/corpus.py b/src/ms3/corpus.py index 9177af0c..4caf59ec 100644 --- a/src/ms3/corpus.py +++ b/src/ms3/corpus.py @@ -3151,9 +3151,7 @@ def store_extracted_facets( ) else: measure_map = measures2measure_map(df) - measure_map_json = measure_map.to_json( - orient="records", indent=2 - ) + measure_map_json = measure_map.to_dict(orient="records") store_as_json_or_yaml( measure_map_json, file_path, logger=self.logger ) diff --git a/src/ms3/utils/frictionless_helpers.py b/src/ms3/utils/frictionless_helpers.py index 2d2c8ce7..ff26445b 100644 --- a/src/ms3/utils/frictionless_helpers.py +++ b/src/ms3/utils/frictionless_helpers.py @@ -332,7 +332,7 @@ def get_schema( def store_as_json_or_yaml( - descriptor_dict: dict | str, + descriptor_dict: dict | list | str, descriptor_path: str, logger: Optional[logging.Logger | str] = None, ): From 2e9f53c6a9e96710fd0dc4464d377a382d782fd5 Mon Sep 17 00:00:00 2001 From: Louis Couturier Date: Fri, 22 Sep 2023 13:20:04 +0200 Subject: [PATCH 42/44] fix file formatting and keywords names --- src/ms3/measure_map.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/ms3/measure_map.py b/src/ms3/measure_map.py index 4eb0798a..53541339 100644 --- a/src/ms3/measure_map.py +++ b/src/ms3/measure_map.py @@ -34,7 +34,7 @@ * `count`: int. Ordinal position of the 'printed' measure in the piece, so that the first has counter 1 and the last has the value of the total number of printed measures in the piece. This value is unique for each measure of the piece. -* `id`: str. Unique identifier of the measure in the piece. It defaults to a string +* `ID`: str. Unique identifier of the measure in the piece. It defaults to a string a the `count` value. * `time_signature`: str. Fraction corresponding to the time signature in the measure. 
By default, the time signature is equal to the last previously labelled time @@ -151,7 +151,7 @@ def generate_measure_map( # Time signature, time signature upbeat is_partial_measure = Fraction(measure.timesig) != Fraction(measure.act_dur) """If True, indicates if the printed measure's duration does not match the - time signature nominal duration; for example, in anacruses.""" + time signature nominal length; for example, in anacruses.""" has_new_timesig = measure.timesig != current_time_sig """If True, indicates a change of time signature. This is always the case for the first measure.""" @@ -159,19 +159,19 @@ def generate_measure_map( display_measure = True if full_info or is_partial_measure: - measure_dict["actual_duration"] = float(measure.duration_qb) + measure_dict["actual_length"] = float(measure.duration_qb) if full_info or has_new_timesig: measure_dict["time_signature"] = measure.timesig current_time_sig = measure.timesig if full_info: - measure_dict["nominal_duration"] = Fraction(measure.timesig) * 4.0 + measure_dict["nominal_length"] = Fraction(measure.timesig) * 4.0 # Measure count, identifier measure_dict["count"] = measure_mc if full_info: # TODO: allow personalised id # By default, also the measure counter - measure_dict["id"] = str(measure_mc) + measure_dict["ID"] = str(measure_mc) # Measure number, name # TODO: handle first/second endings which last more than 1 measure. 
@@ -259,7 +259,7 @@ def generate_measure_map( # Save output if output_file: - json_str = {"meter": measure_map} + json_str = measure_map # {"meter": measure_map} with open(output_file, "w", encoding="utf-8") as f: json.dump(json_str, f, indent=2) @@ -293,7 +293,7 @@ def main_generate_mm( INPUT_DIR = os.path.join( # from ~ms3/src/ms3 as working directory "..", "..", "..", "mozart_piano_sonatas", "measures" ) - TEST_PIECES = ["K284-1", "K284-3"] + TEST_PIECES = ["K279-1", "K283-1", "K284-3"] TEST_INPUT_PATHS = [ os.path.join(INPUT_DIR, f"{piece}.tsv") for piece in TEST_PIECES ] @@ -304,6 +304,6 @@ def main_generate_mm( output_dir=OUTPUT_DIR, verbose=True, stops=True, - compressed=True, - full_info=False, + compressed=False, + full_info=True, ) From 9ddaf22a016b6326c6b6f0162f5f3947dcb965e0 Mon Sep 17 00:00:00 2001 From: Johannes Hentschel Date: Mon, 25 Sep 2023 16:45:19 +0200 Subject: [PATCH 43/44] changes suffix .measuremap.json => .mm.json --- src/ms3/cli.py | 2 +- src/ms3/corpus.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/ms3/cli.py b/src/ms3/cli.py index f44b568a..20cbb188 100644 --- a/src/ms3/cli.py +++ b/src/ms3/cli.py @@ -694,7 +694,7 @@ def get_arg_parser(): metavar="folder", nargs="?", const="../measures", - help="Folder where to store measuremap.json files. They are a variant of the 'normal' --measures with renamed " + help="Folder where to store .mm.json files. 
They are a variant of the 'normal' --measures with renamed " "columns, satisfying the MeasureMap specification.", ) extract_args.add_argument( diff --git a/src/ms3/corpus.py b/src/ms3/corpus.py index 4caf59ec..be0d29d8 100644 --- a/src/ms3/corpus.py +++ b/src/ms3/corpus.py @@ -3142,9 +3142,7 @@ def store_extracted_facets( directory = compute_path_from_file( file, root_dir=root_dir, folder=measure_maps_folder ) - file_path = os.path.join( - directory, f"{piece}.measuremap.json" - ) + file_path = os.path.join(directory, f"{piece}.mm.json") if simulate: self.logger.info( f"Would have stored the MeasureMap from {file.rel_path} as {file_path}." From dbec9628e0e05a86dd351b729cb0b3f4fd812166 Mon Sep 17 00:00:00 2001 From: Johannes Hentschel Date: Mon, 25 Sep 2023 21:10:42 +0200 Subject: [PATCH 44/44] makes MeasureMaps fully verbose and corrects the order of fields --- src/ms3/transformations.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/ms3/transformations.py b/src/ms3/transformations.py index 92a604a4..b76b7821 100644 --- a/src/ms3/transformations.py +++ b/src/ms3/transformations.py @@ -1244,13 +1244,23 @@ def measures2measure_map(df: pd.DataFrame) -> pd.DataFrame: which should never include NA values, it will be used for the "qstamp" column. """ # renaming columns - renaming_dict = { - "mc": "count", - "mn": "number", - "timesig": "time_signature", - } - mm_columns = list(renaming_dict.values()) - measure_map = df.rename(columns=renaming_dict)[mm_columns] + count_col = df.mc.rename("count") + id_col = count_col.astype(str).rename("ID") + number_col = df.mn.rename("number") + name_col = number_col.astype(str).rename("name") + if "volta" in df: + + def make_repeat_char(val: int) -> str: + """Turns a volta number into the corresponding repeat character.""" + if pd.isnull(val): + return "" + try: + return chr(int(val) + 96) # 1 -> 'a', 2 -> 'b', etc. 
+ except Exception: + return "" + + name_col += df.volta.map(make_repeat_char) + timesig_col = df["timesig"].rename("time_signature") # quarterbeats_all_endings only present in measures if score has voltas if "quarterbeats_all_endings" in df: @@ -1270,8 +1280,12 @@ def measures2measure_map(df: pd.DataFrame) -> pd.DataFrame: measure_map = pd.concat( [ - measure_map, + id_col, + count_col, qstamp_col, + number_col, + name_col, + timesig_col, nominal_col, actual_col, start_repeat_col,