Skip to content

Commit

Permalink
Merge pull request #16 from gwaygenomics/add-normalize
Browse files Browse the repository at this point in the history
Adding normalize step to recipe
  • Loading branch information
gwaybio authored May 29, 2020
2 parents 67dc1a8 + 1bfb8fe commit 1608f20
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 0 deletions.
60 changes: 60 additions & 0 deletions 1.generate-profiles/2.normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import os
import sys
import pathlib
import argparse
import warnings
import pandas as pd

from pycytominer import normalize

sys.path.append("../scripts")
from config_utils import process_config_file

# Command line interface: the only option is the location of the pipeline config
parser = argparse.ArgumentParser()
parser.add_argument(
    "--config_file",
    default="profiling_config.yaml",
    help="configuration yaml file for the profiling pipeline",
)
args = parser.parse_args()
config_file = args.config_file

# Load the processed configuration (helper lives in ../scripts/config_utils)
config = process_config_file(config_file)

# Config sections read by the normalize step
core_args = config["core"]
batch = core_args["batch"]
aggregate_args, normalize_args = config["aggregate"], config["normalize"]

# Core options shared across pipeline steps
ignore_files, float_format, compression = (
    core_args["ignore_files"],
    core_args["float_format"],
    core_args["compression"],
)
normalize_singlecell_from_single_file = core_args["output_one_single_cell_file_only"]

# Normalize options, read in the same order as they are used below
(
    normalize_levels,
    normalize_by_samples,
    normalize_these_features,
    normalize_method,
) = (normalize_args[key] for key in ("levels", "by_samples", "features", "method"))

# Inputs are the aggregate step's outputs; outputs are keyed by the same data level
normalize_input_files = aggregate_args["aggregate_output_files"]
normalize_output_files = normalize_args["normalize_output_files"]

# Normalize every requested data level and write each result to its own file
for data_level in normalize_levels:
    # The single cell input is the combined single-cell file, so this level is
    # skipped unless output_one_single_cell_file_only was enabled in the config
    if data_level == "single_cell" and not normalize_singlecell_from_single_file:
        continue

    file_to_normalize = normalize_input_files[data_level]
    output_file = normalize_output_files[data_level]

    # The output directory is not created anywhere in this script, so make sure
    # it exists before the normalized profiles are written
    pathlib.Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    print(f"Now normalizing {data_level}...with operation: {normalize_method}")

    # Results are requested via output_file; the return value is not used here
    normalize(
        profiles=file_to_normalize,
        features=normalize_these_features,
        samples=normalize_by_samples,
        method=normalize_method,
        output_file=output_file,
        compression=compression,
        float_format=float_format,
    )
12 changes: 12 additions & 0 deletions 1.generate-profiles/profiling_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ core:
site_dir: ../0.preprocess-sites/data/
output_single_cell_dir: single_cell/
output_profile_dir: profiles/
output_one_single_cell_file_only: true
categorize_cell_quality: simple
compression: gzip
float_format: "%.5g"
Expand Down Expand Up @@ -64,3 +65,14 @@ aggregate:
guide:
- Metadata_Foci_Barcode_MatchedTo_GeneCode
- Metadata_Foci_Barcode_MatchedTo_Barcode
---
# Settings for the normalize step (read by 1.generate-profiles/2.normalize.py)
normalize:
perform: true
# Base directory for normalized profiles; the batch name is appended to it
output_basedir: data/profiles
# Normalization operation, passed as `method` to pycytominer's normalize()
method: standardize
# Data levels to normalize; one "<batch>_<level>_normalized.csv.gz" file per level.
# single_cell is skipped unless core.output_one_single_cell_file_only is true.
levels:
- gene
- guide
- single_cell
# Passed as `samples` to pycytominer's normalize()
by_samples: all
# Passed as `features` to pycytominer's normalize()
features: infer
25 changes: 25 additions & 0 deletions scripts/config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,31 @@ def generate_profiles_config(config):
config["aggregate"]["output_basedir"], batch
)

# Build aggregated output files
config["aggregate"]["aggregate_output_files"] = {}
for aggregate_level, aggregate_columns in config["aggregate"]["levels"].items():
config["aggregate"]["aggregate_output_files"][aggregate_level] = pathlib.Path(
config["aggregate"]["aggregate_output_dir"],
f"{batch}_{aggregate_level}.csv.gz",
)

config["aggregate"]["aggregate_output_files"]["single_cell"] = config[
"single_cell"
]["single_file_only_output_file"]

# Build paths to normalize yaml document
config["normalize"]["normalize_output_dir"] = pathlib.Path(
config["normalize"]["output_basedir"], batch
)

# Build normalized output files
config["normalize"]["normalize_output_files"] = {}
for normalize_level in config["normalize"]["levels"]:
config["normalize"]["normalize_output_files"][normalize_level] = pathlib.Path(
config["normalize"]["normalize_output_dir"],
f"{batch}_{normalize_level}_normalized.csv.gz",
)

return config


Expand Down

0 comments on commit 1608f20

Please sign in to comment.