diff --git a/example/name_to_properties/example_name_to_properties.py b/example/name_to_properties/example_name_to_properties.py index a55f707..2c1426e 100644 --- a/example/name_to_properties/example_name_to_properties.py +++ b/example/name_to_properties/example_name_to_properties.py @@ -8,11 +8,11 @@ import pubchempy as pcp from gcms_data_analysis.fragmenter import Fragmenter -from gcms_data_analysis import name_to_properties +from gcms_data_analysis.gcms import name_to_properties folder_path = plib.Path( - r"C:\Users\mp933\OneDrive - Cornell University\Python\gcms_data_analysis\tests\data_name_to_properties" + r"/Users/matteo/Projects/gcms_data_analysis/example/name_to_properties/data_name_to_properties" ) # %% classifications_codes_fractions = pd.read_excel( diff --git a/src/gcms_data_analysis/fragmenter.py b/src/gcms_data_analysis/fragmenter.py index c89e20c..a97541b 100644 --- a/src/gcms_data_analysis/fragmenter.py +++ b/src/gcms_data_analysis/fragmenter.py @@ -2,9 +2,9 @@ from rdkit import Chem from rdkit.Chem import DataStructs from rdkit.Chem import rdmolops -from rdkit.Chem.AllChem import ( +from rdkit.Chem.AllChem import ( # pylint: disable=no-name-in-module GetMorganFingerprintAsBitVect, -) # pylint: disable=no-name-in-module +) class Fragmenter: diff --git a/src/gcms_data_analysis/gcms.py b/src/gcms_data_analysis/gcms.py index 5640ee8..a00dfd2 100644 --- a/src/gcms_data_analysis/gcms.py +++ b/src/gcms_data_analysis/gcms.py @@ -266,7 +266,7 @@ def load_all_files(self): for filename in self.files_info.index: file = self.load_single_file(filename) self.files[filename] = file - print("Info: load_all_files: files loaded") + print(f"Info: load_all_files: {len(self.files)} files loaded") return self.files def load_single_file(self, filename) -> pd.DataFrame: @@ -464,7 +464,10 @@ def create_tanimoto_and_molecular_weight_similarity_dfs( self.load_compounds_properties() if self.dict_names_to_iupacs is None: self.create_dict_names_to_iupacs() - if "iupac_name" not in list(self.files.values())[0].columns: + if ( + "iupac_name" not in list(self.files.values())[0].columns + or "iupac_name" not in list(self.calibrations.values())[0].columns + ): self.add_iupac_to_files_and_calibrations() prop_index_iupac = self.compounds_properties.set_index("iupac_name") prop_index_iupac = prop_index_iupac[ @@ -542,12 +545,23 @@ def apply_calibration_to_files(self): in the loaded files, adjusting concentrations based on calibration data, and updates the 'files' attribute with calibrated data.""" print("Info: apply_calibration_to_files: loop started") - if "iupac_name" not in list(self.files.values())[0].columns: + if not self.files: + self.load_all_files() + if not self.calibrations: + self.load_calibrations() + if self.compounds_properties is None: + self.load_compounds_properties() + if self.dict_names_to_iupacs is None: + self.create_dict_names_to_iupacs() + if ( + "iupac_name" not in list(self.files.values())[0].columns + or "iupac_name" not in list(self.calibrations.values())[0].columns + ): self.add_iupac_to_files_and_calibrations() if self.use_semi_calibration and not self.semi_calibration_dict: self.create_semi_calibration_dict() - for filename in self.files.keys(): + for filename in self.files: self.files[filename] = self.apply_calib_to_single_file(filename) return self.files @@ -630,7 +644,8 @@ def add_stats_to_files_info(self) -> pd.DataFrame: DataFrame, such as maximum height, area, and concentrations, updating the 'files_info' with these statistics.""" print("Info: add_stats_to_files_info: started") - + if not self.files: + self.load_all_files() numeric_columns = [ col for col in self.acceptable_params @@ -658,8 +673,8 @@ def create_samples_info(self): """Creates a summary 'samples_info' DataFrame from 'files_info', aggregating data for each sample, and updates the 'samples_info' attribute with this summarized data.""" - if self.files_info is None: - self.load_files_info() + if not self.files: + self.load_all_files() numeric_columns = [ col for col in self.acceptable_params @@ -801,6 +816,12 @@ def create_files_param_report(self, param="conc_vial_mg_L"): self.load_all_files() if param not in self.acceptable_params: raise ValueError(f"{param = } is not an acceptable param") + self.load_calibrations() + if self.calibrations: + self.apply_calibration_to_files() + for filename in self.files_info.index: + if param not in self.files[filename].columns: + raise ValueError(f"{param = } not found in {filename = }") # Create a dictionary of Series, each Series named after the file and containing the 'param' values series_dict = { filename: self.files[filename][param].rename(filename) @@ -829,6 +850,8 @@ def create_files_param_aggrrep(self, param="conc_vial_mg_L"): raise ValueError(f"{param = } is not an acceptable param") if param not in self.files_reports: self.create_files_param_report(param) + if self.compounds_properties is None: + self.load_compounds_properties() # create a df with iupac name index and fg_mf columns (underiv and deriv) comps_df = self.compounds_properties # .set_index("iupac_name") # comps_df = comps_df[~comps_df.index.duplicated(keep="first")] @@ -872,6 +895,9 @@ def create_samples_param_report(self, param: str = "conc_vial_mg_L"): print(f"Info: create_samples_param_report: {param = }") if param not in self.acceptable_params: raise ValueError(f"{param = } is not an acceptable param") + self.load_calibrations() + if self.calibrations: + self.apply_calibration_to_files() if param not in self.files_reports: self.create_files_param_report(param) file_to_sample_rename = dict( diff --git a/tests/data_minimal_case/compounds_properties.xlsx b/tests/data_minimal_case/compounds_properties.xlsx index 7136ef0..5e84c02 100644 Binary files a/tests/data_minimal_case/compounds_properties.xlsx and b/tests/data_minimal_case/compounds_properties.xlsx differ diff --git a/tests/data_minimal_case/files_info.xlsx b/tests/data_minimal_case/files_info.xlsx index e20503a..96c8319 100644 Binary files a/tests/data_minimal_case/files_info.xlsx and b/tests/data_minimal_case/files_info.xlsx differ diff --git a/tests/test_project_class.py b/tests/test_project_class.py index 06faf74..ab597ae 100644 --- a/tests/test_project_class.py +++ b/tests/test_project_class.py @@ -10,13 +10,15 @@ folder_path: plib.Path = plib.Path(__file__).parent folder_path = r"/Users/matteo/Projects/gcms_data_analysis/tests/data_minimal_case" -# %% + proj = Project( folder_path=folder_path, auto_save_to_excel=False, - compounds_to_rename_in_files={"almost oleic acid": "oleic acid"}, + compounds_to_rename_in_files={ + "almost oleic acid": "oleic acid", + "dichlorobenzene": "p-dichlorobenzene", + }, ) - # %% files_info_created = proj.create_files_info(update_saved_files_info=False) print(files_info_created.T) @@ -79,38 +81,55 @@ samples, samples_std = proj.create_samples_from_files() # %% reph = proj.create_files_param_report(param="height") -repc = proj.create_files_param_report(param="conc_vial_mg_L") print(reph) + +repc = proj.create_files_param_report(param="conc_vial_mg_L") print(repc) # %% repsh, repsh_d = proj.create_samples_param_report(param="height") -repsc, repsc_d = proj.create_samples_param_report(param="conc_vial_mg_L") print(repsh) +repsc, repsc_d = proj.create_samples_param_report(param="conc_vial_mg_L") print(repsc) # %% aggh = proj.create_files_param_aggrrep(param="height") -aggc = proj.create_files_param_aggrrep(param="conc_vial_mg_L") print(aggh) +# %% +aggc = proj.create_files_param_aggrrep(param="conc_vial_mg_L") + print(aggc) # %% aggsh, aggsh_d = proj.create_samples_param_aggrrep(param="height") -aggsc, aggsc_d = proj.create_samples_param_aggrrep(param="conc_vial_mg_L") print(aggsh) print(aggsh_d) +# %% +aggsc, aggsc_d = proj.create_samples_param_aggrrep(param="conc_vial_mg_L") + print(aggsc) print(aggsc_d) # %% proj.save_files_samples_reports() # %% -from __future__ import annotations -from typing import Literal -from myfigure.myfigure import MyFigure, colors, hatches +proj.plot_report() -def plot_ave_std( - project: Project, - files_or_samples: Literal["files", "samples"] = "samples", - parameter: Literal[ +# %% + + +@pytest.fixture +def project(): + test_project = Project( + folder_path=folder_path, + auto_save_to_excel=False, + compounds_to_rename_in_files={"almost oleic acid": "oleic acid"}, + ) + return test_project + + +# Test default parameters +def test_default_parameters(project): + assert proj.column_to_sort_values_in_samples == "retention_time" + assert proj.delta_mol_weight_threshold == 100 + assert proj.acceptable_params == [ "height", "area", "area_if_undiluted", @@ -118,415 +137,50 @@ def plot_ave_std( "conc_vial_if_undiluted_mg_L", "fraction_of_sample_fr", "fraction_of_feedstock_fr", - ] = "conc_vial_mg_L", - aggregate: bool = False, - show_total_in_twinx: bool = False, - min_y_thresh: float | None = None, - only_samples_to_plot: list[str] | None = None, - rename_samples: list[str] | None = None, - reorder_samples: list[str] | None = None, - item_to_color_to_hatch: pd.DataFrame | None = None, - yt_sum_label: str = "total\n(right axis)", - **kwargs, -) -> MyFigure: - """ """ - if show_total_in_twinx: - plot_twinx: bool = True - else: - plot_twinx: bool = None - default_kwargs = { - "filename": "plot" + parameter, - "out_path": proj.out_path, - "height": 4, - "width": 4, - "grid": proj.plot_grid, - "text_font": proj.plot_font, - "y_lab": project.parameter_to_axis_label[parameter], - "yt_lab": project.parameter_to_axis_label[parameter], - "twinx": plot_twinx, - "masked_unsignificant_data": True, - # "legend": False, - } - # Update kwargs with the default key-value pairs if the key is not present in kwargs - kwargs = {**default_kwargs, **kwargs} - # create folder where Plots are stored - out_path = plib.Path(project.out_path, "plots", files_or_samples) - out_path.mkdir(parents=True, exist_ok=True) - if not aggregate: # then use compounds reports - if files_or_samples == "files": - df_ave = proj.files_reports[parameter].T - df_std = pd.DataFrame() - elif files_or_samples == "samples": - df_ave = proj.samples_reports[parameter].T - df_std = proj.samples_reports_std[parameter].T - else: # use aggregated reports - if files_or_samples == "files": - df_ave = proj.files_aggrreps[parameter].T - df_std = pd.DataFrame() - elif files_or_samples == "samples": - df_ave = proj.samples_aggrreps[parameter].T - df_std = proj.samples_aggrreps_std[parameter].T + ] + assert proj.compounds_to_rename_in_files == {"almost oleic acid": "oleic acid"} - if only_samples_to_plot is not None: - df_ave = df_ave.loc[only_samples_to_plot, :].copy() - if files_or_samples == "samples": - df_std = df_std.loc[only_samples_to_plot, :].copy() - if rename_samples is not None: - df_ave.index = rename_samples - if files_or_samples == "samples": - df_std.index = rename_samples +# Test the `load_files_info` method +def test_load_files_info(project): + files_info = proj.load_files_info() + assert isinstance(files_info, pd.DataFrame) + assert len(files_info) > 0 - if reorder_samples is not None: - filtered_reorder_samples = [ - idx for idx in reorder_samples if idx in df_ave.index - ] - df_ave = df_ave.reindex(filtered_reorder_samples) - if files_or_samples == "samples": - df_std = df_std.reindex(filtered_reorder_samples) - if min_y_thresh is not None: - df_ave = df_ave.loc[:, (df_ave > min_y_thresh).any(axis=0)].copy() - if files_or_samples == "samples": - df_std = df_std.loc[:, df_ave.columns].copy() +# Test the `load_all_files` method +def test_load_all_files(project): + files = proj.load_all_files() + assert isinstance(files, dict) + assert len(files) > 0 - if item_to_color_to_hatch is not None: # specific color and hatches to each fg - plot_colors = [ - item_to_color_to_hatch.loc[item, "clr"] for item in df_ave.columns - ] - plot_hatches = [ - item_to_color_to_hatch.loc[item, "htch"] for item in df_ave.columns - ] - else: # no specific colors and hatches specified - plot_colors = colors - plot_hatches = hatches - myfig = MyFigure( - rows=1, - cols=1, - **kwargs, - ) - if df_std.isna().all().all() or df_std.empty: # means that no std is provided - df_ave.plot( - ax=myfig.axs[0], - kind="bar", - width=0.9, - edgecolor="k", - legend=False, - capsize=3, - color=colors, - ) - else: # no legend is represented but non-significant values are shaded - mask = (df_ave.abs() > df_std.abs()) | df_std.isna() - df_ave[mask].plot( - ax=myfig.axs[0], - kind="bar", - width=0.9, - edgecolor="k", - legend=False, - yerr=df_std[mask], - capsize=3, - color=colors, - label="_nolegend_", - ) +# Test the `load_class_code_frac` method +def test_load_class_code_frac(project): + class_code_frac = proj.load_class_code_frac() + assert isinstance(class_code_frac, pd.DataFrame) + assert len(class_code_frac) > 0 - df_ave[~mask].plot( - ax=myfig.axs[0], - kind="bar", - width=0.9, - legend=False, - edgecolor="grey", - color=colors, - alpha=0.5, - label="_nolegend_", - ) - if show_total_in_twinx: - myfig.axts[0].scatter( - df_ave.index, - df_ave.sum(axis=1).values, - color="k", - linestyle="None", - edgecolor="k", - facecolor="grey", - s=100, - label=yt_sum_label, - alpha=0.5, - ) - if not df_std.empty: - myfig.axts[0].errorbar( - df_ave.index, - df_ave.sum(axis=1).values, - df_std.sum(axis=1).values, - capsize=3, - linestyle="None", - color="grey", - ecolor="k", - label="_nolegend_", - ) - myfig.save_figure() - return myfig +# Test the `load_calibrations` method +def test_load_calibrations(project): + calibrations = proj.load_calibrations() + assert isinstance(calibrations, dict) + assert len(calibrations) > 0 -def plot_df_ave_std( - proj: Project, - df_ave: pd.DataFrame, - df_std: pd.DataFrame = pd.DataFrame(), - filename: str = "plot", - show_total_in_twinx: bool = False, - annotate_outliers: bool = True, - min_y_thresh: float | None = None, - only_samples_to_plot: list[str] | None = None, - rename_samples: list[str] | None = None, - reorder_samples: list[str] | None = None, - item_to_color_to_hatch: pd.DataFrame | None = None, - yt_sum_label: str = "total\n(right axis)", - **kwargs, -) -> MyFigure: +# Test the `create_list_of_all_compounds` method +def test_create_list_of_all_compounds(project): + compounds = proj.create_list_of_all_compounds() + assert isinstance(compounds, list) + assert len(compounds) > 0 - # create folder where Plots are stored - out_path = plib.Path(Project.out_path, "df_plots") - out_path.mkdir(parents=True, exist_ok=True) - if only_samples_to_plot is not None: - df_ave = df_ave.loc[only_samples_to_plot, :].copy() - if not df_std.empty: - df_std = df_std.loc[only_samples_to_plot, :].copy() - if rename_samples is not None: - df_ave.index = rename_samples - if not df_std.empty: - df_std.index = rename_samples - - if reorder_samples is not None: - filtered_reorder_samples = [ - idx for idx in reorder_samples if idx in df_ave.index - ] - df_ave = df_ave.reindex(filtered_reorder_samples) - if not df_std.empty: - df_std = df_std.reindex(filtered_reorder_samples) - if reorder_samples is not None: - filtered_reorder_samples = [ - idx for idx in reorder_samples if idx in df_ave.index - ] - df_ave = df_ave.reindex(filtered_reorder_samples) - if not df_std.empty: - df_std = df_std.reindex(filtered_reorder_samples) - - if min_y_thresh is not None: - df_ave = df_ave.loc[:, (df_ave > min_y_thresh).any(axis=0)].copy() - if not df_std.empty: - df_std = df_std.loc[:, df_ave.columns].copy() - - if item_to_color_to_hatch is not None: # specific color and hatches to each fg - colors = [item_to_color_to_hatch.loc[item, "clr"] for item in df_ave.columns] - hatches = [item_to_color_to_hatch.loc[item, "htch"] for item in df_ave.columns] - else: # no specific colors and hatches specified - colors = sns.color_palette(color_palette, df_ave.shape[1]) - hatches = htchs - - if show_total_in_twinx: - plot_twinx: bool = True - else: - plot_twinx: bool = False - - if show_total_in_twinx: - legend_x_anchor += 0.14 - yt_lab = y_lab - - myfig = MyFigure( - rows=1, - cols=1, - twinx=plot_twinx, - text_font=Project.plot_font, - y_lab=y_lab, - yt_lab=yt_lab, - y_lim=y_lim, - legend=False, - grid=Project.plot_grid, - **kwargs, - ) - if df_std.isna().all().all() or df_std.empty: # means that no std is provided - df_ave.plot( - ax=myfig.axs[0], - kind="bar", - rot=x_label_rotation, - width=0.9, - edgecolor="k", - legend=False, - capsize=3, - color=colors, - ) - bars = myfig.axs[0].patches # needed to add patches to the bars - n_different_hatches = int(len(bars) / df_ave.shape[0]) - else: # no legend is represented but non-significant values are shaded - mask = (df_ave.abs() > df_std.abs()) | df_std.isna() - - df_ave[mask].plot( - ax=myfig.axs[0], - kind="bar", - rot=x_label_rotation, - width=0.9, - edgecolor="k", - legend=False, - yerr=df_std[mask], - capsize=3, - color=colors, - label="_nolegend", - ) - df_ave[~mask].plot( - ax=myfig.axs[0], - kind="bar", - rot=x_label_rotation, - width=0.9, - legend=False, - edgecolor="grey", - color=colors, - alpha=0.5, - label="_nolegend", - ) - bars = myfig.axs[0].patches # needed to add patches to the bars - n_different_hatches = int(len(bars) / df_ave.shape[0] / 2) - if show_total_in_twinx: - myfig.axts[0].scatter( - df_ave.index, - df_ave.sum(axis=1).values, - color="k", - linestyle="None", - edgecolor="k", - facecolor="grey", - s=100, - label=yt_sum_label, - alpha=0.5, - ) - if not df_std.empty: - myfig.axts[0].errorbar( - df_ave.index, - df_ave.sum(axis=1).values, - df_std.sum(axis=1).values, - capsize=3, - linestyle="None", - color="grey", - ecolor="k", - ) - bar_hatches = [] - # get a list with the hatches - for h in hatches[:n_different_hatches] + hatches[:n_different_hatches]: - for n in range(df_ave.shape[0]): # htcs repeated for samples - bar_hatches.append(h) # append based on samples number - for bar, hatch in zip(bars, bar_hatches): # assign hatches to each bar - bar.set_hatch(hatch) - myfig.axs[0].set(xlabel=None) - if x_label_rotation != 0: - myfig.axs[0].set_xticklabels( - df_ave.index, rotation=x_label_rotation, ha="right", rotation_mode="anchor" - ) - if legend_location is not None: - hnd_ax, lab_ax = myfig.axs[0].get_legend_handles_labels() - if not df_std.empty: - hnd_ax = hnd_ax[: len(hnd_ax) // 2] - lab_ax = lab_ax[: len(lab_ax) // 2] - if legend_labelspacing > 0.5: # large legend spacing for molecules - myfig.axs[0].plot(np.nan, np.nan, "-", color="None", label=" ") - hhhh, aaaa = myfig.axs[0].get_legend_handles_labels() - hnd_ax.append(hhhh[0]) - lab_ax.append(aaaa[0]) - if show_total_in_twinx: - hnd_axt, lab_axt = myfig.axts[0].get_legend_handles_labels() - else: - hnd_axt, lab_axt = [], [] - if legend_location == "outside": # legend goes outside of plot area - myfig.axs[0].legend( - hnd_ax + hnd_axt, - lab_ax + lab_axt, - loc="upper left", - ncol=legend_columns, - bbox_to_anchor=(legend_x_anchor, legend_y_anchor), - labelspacing=legend_labelspacing, - ) - else: # legend is inside of plot area - myfig.axs[0].legend( - hnd_ax + hnd_axt, - lab_ax + lab_axt, - loc=legend_location, - ncol=legend_columns, - labelspacing=legend_labelspacing, - ) - # annotate ave+-std at the top of outliers bar (exceeding y_lim) - if annotate_outliers and (y_lim is not None): # and (not df_std.empty): - _annotate_outliers_in_plot(myfig.axs[0], df_ave, df_std, y_lim) - myfig.save_figure(filename, out_path) - return myfig - - -# %% - - -@pytest.fixture -def project(): - test_project = Project( - folder_path=folder_path, - auto_save_to_excel=False, - compounds_to_rename_in_files={"almost oleic acid": "oleic acid"}, - ) - return test_project - - -# Test default parameters -def test_default_parameters(project): - assert proj.column_to_sort_values_in_samples == "retention_time" - assert proj.delta_mol_weight_threshold == 100 - assert proj.acceptable_params == [ - "height", - "area", - "area_if_undiluted", - "conc_vial_mg_L", - "conc_vial_if_undiluted_mg_L", - "fraction_of_sample_fr", - "fraction_of_feedstock_fr", - ] - assert proj.compounds_to_rename_in_files == {"almost oleic acid": "oleic acid"} - - -# Test the `load_files_info` method -def test_load_files_info(project): - files_info = proj.load_files_info() - assert isinstance(files_info, pd.DataFrame) - assert len(files_info) > 0 - - -# Test the `load_all_files` method -def test_load_all_files(project): - files = proj.load_all_files() - assert isinstance(files, dict) - assert len(files) > 0 - - -# Test the `load_class_code_frac` method -def test_load_class_code_frac(project): - class_code_frac = proj.load_class_code_frac() - assert isinstance(class_code_frac, pd.DataFrame) - assert len(class_code_frac) > 0 - - -# Test the `load_calibrations` method -def test_load_calibrations(project): - calibrations = proj.load_calibrations() - assert isinstance(calibrations, dict) - assert len(calibrations) > 0 - - -# Test the `create_list_of_all_compounds` method -def test_create_list_of_all_compounds(project): - compounds = proj.create_list_of_all_compounds() - assert isinstance(compounds, list) - assert len(compounds) > 0 - - -# Test the `create_compounds_properties` method -def test_create_compounds_properties(project): - compounds_properties = proj.create_compounds_properties() - assert isinstance(compounds_properties, pd.DataFrame) - assert len(compounds_properties) > 0 +# Test the `create_compounds_properties` method +def test_create_compounds_properties(project): + compounds_properties = proj.create_compounds_properties() + assert isinstance(compounds_properties, pd.DataFrame) + assert len(compounds_properties) > 0 assert_frame_equal( @@ -742,3 +396,358 @@ def test_save_files_samples_reports(project): # %% +# %% +from __future__ import annotations +from typing import Literal +from myfigure.myfigure import MyFigure, colors, hatches + + +def plot_ave_std( + project: Project, + files_or_samples: Literal["files", "samples"] = "samples", + parameter: Literal[ + "height", + "area", + "area_if_undiluted", + "conc_vial_mg_L", + "conc_vial_if_undiluted_mg_L", + "fraction_of_sample_fr", + "fraction_of_feedstock_fr", + ] = "conc_vial_mg_L", + aggregate: bool = False, + show_total_in_twinx: bool = False, + min_y_thresh: float | None = None, + only_samples_to_plot: list[str] | None = None, + rename_samples: list[str] | None = None, + reorder_samples: list[str] | None = None, + item_to_color_to_hatch: pd.DataFrame | None = None, + yt_sum_label: str = "total\n(right axis)", + **kwargs, +) -> MyFigure: + """ """ + if show_total_in_twinx: + plot_twinx: bool = True + else: + plot_twinx: bool = None + default_kwargs = { + "filename": "plot" + parameter, + "out_path": proj.out_path, + "height": 4, + "width": 4, + "grid": proj.plot_grid, + "text_font": proj.plot_font, + "y_lab": project.parameter_to_axis_label[parameter], + "yt_lab": project.parameter_to_axis_label[parameter], + "twinx": plot_twinx, + "masked_unsignificant_data": True, + # "legend": False, + } + # Update kwargs with the default key-value pairs if the key is not present in kwargs + kwargs = {**default_kwargs, **kwargs} + # create folder where Plots are stored + out_path = plib.Path(project.out_path, "plots", files_or_samples) + out_path.mkdir(parents=True, exist_ok=True) + if not aggregate: # then use compounds reports + if files_or_samples == "files": + df_ave = proj.files_reports[parameter].T + df_std = pd.DataFrame() + elif files_or_samples == "samples": + df_ave = proj.samples_reports[parameter].T + df_std = proj.samples_reports_std[parameter].T + else: # use aggregated reports + if files_or_samples == "files": + df_ave = proj.files_aggrreps[parameter].T + df_std = pd.DataFrame() + elif files_or_samples == "samples": + df_ave = proj.samples_aggrreps[parameter].T + df_std = proj.samples_aggrreps_std[parameter].T + + if only_samples_to_plot is not None: + df_ave = df_ave.loc[only_samples_to_plot, :].copy() + if files_or_samples == "samples": + df_std = df_std.loc[only_samples_to_plot, :].copy() + + if rename_samples is not None: + df_ave.index = rename_samples + if files_or_samples == "samples": + df_std.index = rename_samples + + if reorder_samples is not None: + filtered_reorder_samples = [ + idx for idx in reorder_samples if idx in df_ave.index + ] + df_ave = df_ave.reindex(filtered_reorder_samples) + if files_or_samples == "samples": + df_std = df_std.reindex(filtered_reorder_samples) + + if min_y_thresh is not None: + df_ave = df_ave.loc[:, (df_ave > min_y_thresh).any(axis=0)].copy() + if files_or_samples == "samples": + df_std = df_std.loc[:, df_ave.columns].copy() + + if item_to_color_to_hatch is not None: # specific color and hatches to each fg + plot_colors = [ + item_to_color_to_hatch.loc[item, "clr"] for item in df_ave.columns + ] + plot_hatches = [ + item_to_color_to_hatch.loc[item, "htch"] for item in df_ave.columns + ] + else: # no specific colors and hatches specified + plot_colors = colors + plot_hatches = hatches + + myfig = MyFigure( + rows=1, + cols=1, + **kwargs, + ) + if df_std.isna().all().all() or df_std.empty: # means that no std is provided + df_ave.plot( + ax=myfig.axs[0], + kind="bar", + width=0.9, + edgecolor="k", + legend=False, + capsize=3, + color=colors, + ) + else: # no legend is represented but non-significant values are shaded + mask = (df_ave.abs() > df_std.abs()) | df_std.isna() + df_ave[mask].plot( + ax=myfig.axs[0], + kind="bar", + width=0.9, + edgecolor="k", + legend=False, + yerr=df_std[mask], + capsize=3, + color=colors, + label="_nolegend_", + ) + + df_ave[~mask].plot( + ax=myfig.axs[0], + kind="bar", + width=0.9, + legend=False, + edgecolor="grey", + color=colors, + alpha=0.5, + label="_nolegend_", + ) + if show_total_in_twinx: + myfig.axts[0].scatter( + df_ave.index, + df_ave.sum(axis=1).values, + color="k", + linestyle="None", + edgecolor="k", + facecolor="grey", + s=100, + label=yt_sum_label, + alpha=0.5, + ) + if not df_std.empty: + myfig.axts[0].errorbar( + df_ave.index, + df_ave.sum(axis=1).values, + df_std.sum(axis=1).values, + capsize=3, + linestyle="None", + color="grey", + ecolor="k", + label="_nolegend_", + ) + + myfig.save_figure() + return myfig + + +def plot_df_ave_std( + proj: Project, + df_ave: pd.DataFrame, + df_std: pd.DataFrame = pd.DataFrame(), + filename: str = "plot", + show_total_in_twinx: bool = False, + annotate_outliers: bool = True, + min_y_thresh: float | None = None, + only_samples_to_plot: list[str] | None = None, + rename_samples: list[str] | None = None, + reorder_samples: list[str] | None = None, + item_to_color_to_hatch: pd.DataFrame | None = None, + yt_sum_label: str = "total\n(right axis)", + **kwargs, +) -> MyFigure: + + # create folder where Plots are stored + out_path = plib.Path(Project.out_path, "df_plots") + out_path.mkdir(parents=True, exist_ok=True) + if only_samples_to_plot is not None: + df_ave = df_ave.loc[only_samples_to_plot, :].copy() + if not df_std.empty: + df_std = df_std.loc[only_samples_to_plot, :].copy() + + if rename_samples is not None: + df_ave.index = rename_samples + if not df_std.empty: + df_std.index = rename_samples + + if reorder_samples is not None: + filtered_reorder_samples = [ + idx for idx in reorder_samples if idx in df_ave.index + ] + df_ave = df_ave.reindex(filtered_reorder_samples) + if not df_std.empty: + df_std = df_std.reindex(filtered_reorder_samples) + if reorder_samples is not None: + filtered_reorder_samples = [ + idx for idx in reorder_samples if idx in df_ave.index + ] + df_ave = df_ave.reindex(filtered_reorder_samples) + if not df_std.empty: + df_std = df_std.reindex(filtered_reorder_samples) + + if min_y_thresh is not None: + df_ave = df_ave.loc[:, (df_ave > min_y_thresh).any(axis=0)].copy() + if not df_std.empty: + df_std = df_std.loc[:, df_ave.columns].copy() + + if item_to_color_to_hatch is not None: # specific color and hatches to each fg + colors = [item_to_color_to_hatch.loc[item, "clr"] for item in df_ave.columns] + hatches = [item_to_color_to_hatch.loc[item, "htch"] for item in df_ave.columns] + else: # no specific colors and hatches specified + colors = sns.color_palette(color_palette, df_ave.shape[1]) + hatches = htchs + + if show_total_in_twinx: + plot_twinx: bool = True + else: + plot_twinx: bool = False + + if show_total_in_twinx: + legend_x_anchor += 0.14 + yt_lab = y_lab + + myfig = MyFigure( + rows=1, + cols=1, + twinx=plot_twinx, + text_font=Project.plot_font, + y_lab=y_lab, + yt_lab=yt_lab, + y_lim=y_lim, + legend=False, + grid=Project.plot_grid, + **kwargs, + ) + if df_std.isna().all().all() or df_std.empty: # means that no std is provided + df_ave.plot( + ax=myfig.axs[0], + kind="bar", + rot=x_label_rotation, + width=0.9, + edgecolor="k", + legend=False, + capsize=3, + color=colors, + ) + bars = myfig.axs[0].patches # needed to add patches to the bars + n_different_hatches = int(len(bars) / df_ave.shape[0]) + else: # no legend is represented but non-significant values are shaded + mask = (df_ave.abs() > df_std.abs()) | df_std.isna() + + df_ave[mask].plot( + ax=myfig.axs[0], + kind="bar", + rot=x_label_rotation, + width=0.9, + edgecolor="k", + legend=False, + yerr=df_std[mask], + capsize=3, + color=colors, + label="_nolegend", + ) + df_ave[~mask].plot( + ax=myfig.axs[0], + kind="bar", + rot=x_label_rotation, + width=0.9, + legend=False, + edgecolor="grey", + color=colors, + alpha=0.5, + label="_nolegend", + ) + bars = myfig.axs[0].patches # needed to add patches to the bars + n_different_hatches = int(len(bars) / df_ave.shape[0] / 2) + if show_total_in_twinx: + myfig.axts[0].scatter( + df_ave.index, + df_ave.sum(axis=1).values, + color="k", + linestyle="None", + edgecolor="k", + facecolor="grey", + s=100, + label=yt_sum_label, + alpha=0.5, + ) + if not df_std.empty: + myfig.axts[0].errorbar( + df_ave.index, + df_ave.sum(axis=1).values, + df_std.sum(axis=1).values, + capsize=3, + linestyle="None", + color="grey", + ecolor="k", + ) + bar_hatches = [] + # get a list with the hatches + for h in hatches[:n_different_hatches] + hatches[:n_different_hatches]: + for n in range(df_ave.shape[0]): # htcs repeated for samples + bar_hatches.append(h) # append based on samples number + for bar, hatch in zip(bars, bar_hatches): # assign hatches to each bar + bar.set_hatch(hatch) + myfig.axs[0].set(xlabel=None) + if x_label_rotation != 0: + myfig.axs[0].set_xticklabels( + df_ave.index, rotation=x_label_rotation, ha="right", rotation_mode="anchor" + ) + if legend_location is not None: + hnd_ax, lab_ax = myfig.axs[0].get_legend_handles_labels() + if not df_std.empty: + hnd_ax = hnd_ax[: len(hnd_ax) // 2] + lab_ax = lab_ax[: len(lab_ax) // 2] + if legend_labelspacing > 0.5: # large legend spacing for molecules + myfig.axs[0].plot(np.nan, np.nan, "-", color="None", label=" ") + hhhh, aaaa = myfig.axs[0].get_legend_handles_labels() + hnd_ax.append(hhhh[0]) + lab_ax.append(aaaa[0]) + if show_total_in_twinx: + hnd_axt, lab_axt = myfig.axts[0].get_legend_handles_labels() + else: + hnd_axt, lab_axt = [], [] + if legend_location == "outside": # legend goes outside of plot area + myfig.axs[0].legend( + hnd_ax + hnd_axt, + lab_ax + lab_axt, + loc="upper left", + ncol=legend_columns, + bbox_to_anchor=(legend_x_anchor, legend_y_anchor), + labelspacing=legend_labelspacing, + ) + else: # legend is inside of plot area + myfig.axs[0].legend( + hnd_ax + hnd_axt, + lab_ax + lab_axt, + loc=legend_location, + ncol=legend_columns, + labelspacing=legend_labelspacing, + ) + # annotate ave+-std at the top of outliers bar (exceeding y_lim) + if annotate_outliers and (y_lim is not None): # and (not df_std.empty): + _annotate_outliers_in_plot(myfig.axs[0], df_ave, df_std, y_lim) + myfig.save_figure(filename, out_path) + return myfig