From 12db02ef6620b314c4e0c13d194dbb389852c360 Mon Sep 17 00:00:00 2001
From: gwaygenomics
Date: Thu, 28 May 2020 17:01:31 -0400
Subject: [PATCH 1/2] add normalize

---
Review notes (text in this section is not part of the commit message):
- The line structure of this series was rebuilt from a copy whose newlines
  and indentation had been collapsed.
- 2.normalize.py now honors the `perform` key of the normalize config
  section, creates the output directory before writing, and no longer binds
  the unused return value of normalize().
- The blob id on the `index` line below predates these edits.

 1.generate-profiles/2.normalize.py | 75 ++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 1.generate-profiles/2.normalize.py

diff --git a/1.generate-profiles/2.normalize.py b/1.generate-profiles/2.normalize.py
new file mode 100644
index 0000000..eba7649
--- /dev/null
+++ b/1.generate-profiles/2.normalize.py
@@ -0,0 +1,75 @@
+"""Normalize the aggregated profiles of one batch with pycytominer.
+
+For every level listed in the normalize section of the config, the matching
+aggregate output file is normalized and written to its normalized output file.
+"""
+
+import os
+import sys
+import pathlib
+import argparse
+import warnings
+import pandas as pd
+
+from pycytominer import normalize
+
+sys.path.append("../scripts")
+from config_utils import process_config_file
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--config_file",
+    help="configuration yaml file for the profiling pipeline",
+    default="profiling_config.yaml",
+)
+args = parser.parse_args()
+config_file = args.config_file
+
+config = process_config_file(config_file)
+
+# Extract config arguments
+core_args = config["core"]
+batch = core_args["batch"]
+aggregate_args = config["aggregate"]
+normalize_args = config["normalize"]
+
+ignore_files = core_args["ignore_files"]
+float_format = core_args["float_format"]
+compression = core_args["compression"]
+
+normalize_singlecell_from_single_file = core_args["output_one_single_cell_file_only"]
+normalize_levels = normalize_args["levels"]
+normalize_by_samples = normalize_args["by_samples"]
+normalize_these_features = normalize_args["features"]
+normalize_method = normalize_args["method"]
+normalize_input_files = aggregate_args["aggregate_output_files"]
+normalize_output_files = normalize_args["normalize_output_files"]
+
+# Honor the perform switch of the normalize section; a missing key keeps the
+# previous behavior of always normalizing
+if not normalize_args.get("perform", True):
+    print("Config file set to perform=False, not normalizing")
+    sys.exit(0)
+
+for data_level in normalize_levels:
+    # Single cells are normalized only when written to one single file
+    if data_level == "single_cell":
+        if not normalize_singlecell_from_single_file:
+            continue
+
+    file_to_normalize = normalize_input_files[data_level]
+    output_file = normalize_output_files[data_level]
+
+    # Make sure the output directory exists before the profiles are written
+    pathlib.Path(output_file).parent.mkdir(parents=True, exist_ok=True)
+
+    print(f"Now normalizing {data_level}...with operation: {normalize_method}")
+
+    normalize(
+        profiles=file_to_normalize,
+        features=normalize_these_features,
+        samples=normalize_by_samples,
+        method=normalize_method,
+        output_file=output_file,
+        compression=compression,
+        float_format=float_format,
+    )

From 1bfb8fe18b36f5d452494a4ce2bbd7d859064713 Mon Sep 17 00:00:00 2001
From: gwaygenomics
Date: Thu, 28 May 2020 17:03:51 -0400
Subject: [PATCH 2/2] add normalization section to config

---
Review notes (text in this section is not part of the commit message):
- Whitespace on the context lines of both files is a best-effort
  reconstruction (TODO confirm against the target tree); if a hunk is
  rejected, retry with `git apply --ignore-whitespace`.
- The aggregate loop now iterates the level names directly because the
  column lists were never used. The blob ids on the `index` line of
  scripts/config_utils.py predate this edit.

 1.generate-profiles/profiling_config.yaml | 12 +++++++++++
 scripts/config_utils.py                   | 25 +++++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/1.generate-profiles/profiling_config.yaml b/1.generate-profiles/profiling_config.yaml
index 6b931c9..45dae03 100644
--- a/1.generate-profiles/profiling_config.yaml
+++ b/1.generate-profiles/profiling_config.yaml
@@ -10,6 +10,7 @@ core:
   site_dir: ../0.preprocess-sites/data/
   output_single_cell_dir: single_cell/
   output_profile_dir: profiles/
+  output_one_single_cell_file_only: true
   categorize_cell_quality: simple
   compression: gzip
   float_format: "%.5g"
@@ -64,3 +65,14 @@ aggregate:
     guide:
       - Metadata_Foci_Barcode_MatchedTo_GeneCode
       - Metadata_Foci_Barcode_MatchedTo_Barcode
+---
+normalize:
+  perform: true
+  output_basedir: data/profiles
+  method: standardize
+  levels:
+    - gene
+    - guide
+    - single_cell
+  by_samples: all
+  features: infer
diff --git a/scripts/config_utils.py b/scripts/config_utils.py
index 2964f85..0d68ba0 100644
--- a/scripts/config_utils.py
+++ b/scripts/config_utils.py
@@ -83,6 +83,31 @@ def generate_profiles_config(config):
         config["aggregate"]["output_basedir"], batch
     )
 
+    # Build aggregated output files
+    config["aggregate"]["aggregate_output_files"] = {}
+    for aggregate_level in config["aggregate"]["levels"]:
+        config["aggregate"]["aggregate_output_files"][aggregate_level] = pathlib.Path(
+            config["aggregate"]["aggregate_output_dir"],
+            f"{batch}_{aggregate_level}.csv.gz",
+        )
+
+    config["aggregate"]["aggregate_output_files"]["single_cell"] = config[
+        "single_cell"
+    ]["single_file_only_output_file"]
+
+    # Build paths to normalize yaml document
+    config["normalize"]["normalize_output_dir"] = pathlib.Path(
+        config["normalize"]["output_basedir"], batch
+    )
+
+    # Build normalized output files
+    config["normalize"]["normalize_output_files"] = {}
+    for normalize_level in config["normalize"]["levels"]:
+        config["normalize"]["normalize_output_files"][normalize_level] = pathlib.Path(
+            config["normalize"]["normalize_output_dir"],
+            f"{batch}_{normalize_level}_normalized.csv.gz",
+        )
+
     return config
 
 