Skip to content

Commit

Permalink
Merge pull request #17 from gwaygenomics/add-feature-select
Browse files Browse the repository at this point in the history
Add feature select step to recipe
  • Loading branch information
gwaybio authored May 29, 2020
2 parents 1608f20 + a0a43dc commit 8c5e99f
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 0 deletions.
68 changes: 68 additions & 0 deletions 1.generate-profiles/3.feature-select.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import os
import sys
import pathlib
import argparse
import warnings
import pandas as pd

from pycytominer import feature_select

sys.path.append("../scripts")
from config_utils import process_config_file

# Command line interface: the only option is the pipeline configuration file
parser = argparse.ArgumentParser()
parser.add_argument(
    "--config_file",
    default="profiling_config.yaml",
    help="configuration yaml file for the profiling pipeline",
)
args = parser.parse_args()
config_file = args.config_file

# Load the configuration through the shared project helper
config = process_config_file(config_file)

# Sections of the processed config used by this step
core_args = config["core"]
normalize_args = config["normalize"]
feature_select_args = config["feature_select"]

# Core options
batch = core_args["batch"]
float_format = core_args["float_format"]
compression = core_args["compression"]
singlecell_from_single_file = core_args["output_one_single_cell_file_only"]

# Feature select options
feature_select_operations = feature_select_args["operations"]
feature_select_levels = feature_select_args["levels"]
feature_select_drop_samples = feature_select_args["drop_samples"]
feature_select_features = feature_select_args["features"]
feature_select_nacutoff = feature_select_args["na_cutoff"]
feature_select_corr_threshold = feature_select_args["corr_threshold"]

# The normalize step's outputs are this step's inputs, keyed by data level
feature_select_input_files = normalize_args["normalize_output_files"]
feature_select_output_files = feature_select_args["feature_select_output_files"]

# Keyword arguments that are identical for every data level
shared_kwargs = dict(
    features=feature_select_features,
    samples=feature_select_drop_samples,
    operation=feature_select_operations,
    na_cutoff=feature_select_nacutoff,
    corr_threshold=feature_select_corr_threshold,
    compression=compression,
    float_format=float_format,
)

for data_level in feature_select_levels:
    # Single cell profiles are only processed when they live in one file
    if data_level == "single_cell" and not singlecell_from_single_file:
        warnings.warn(
            "Feature select operation is not enabled for site-specific single cell files. Skipping."
        )
        continue

    input_file = feature_select_input_files[data_level]
    output_file = feature_select_output_files[data_level]

    print(
        f"Now performing feature selection for {data_level}...with operations: {feature_select_operations}"
    )

    # Read the normalized profiles and write the feature-selected profiles
    feature_select(profiles=input_file, output_file=output_file, **shared_kwargs)
17 changes: 17 additions & 0 deletions 1.generate-profiles/profiling_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,20 @@ normalize:
- single_cell
by_samples: all
features: infer
---
# Settings for the feature select step; 3.feature-select.py reads this
# document as config["feature_select"].
feature_select:
# NOTE(review): not read by the feature select script shown in this commit;
# presumably consumed by a pipeline driver - confirm.
perform: true
# Base directory for outputs; config_utils joins it with the batch name to
# build the feature select output directory.
output_basedir: data/profiles
# Passed as the `operation` argument of pycytominer's feature_select.
operations:
- variance_threshold
- drop_na_columns
- blacklist
- drop_outliers
# Data levels to process. single_cell is skipped with a warning unless
# core.output_one_single_cell_file_only is truthy.
levels:
- gene
- guide
- single_cell
# Passed as the `samples` argument of feature_select.
drop_samples: none
# Passed as the `features` argument of feature_select.
features: infer
# Passed as the `na_cutoff` argument of feature_select.
na_cutoff: 0
# Passed as the `corr_threshold` argument of feature_select.
# NOTE(review): likely only matters for a correlation_threshold operation,
# which is not in the operations list above - confirm.
corr_threshold: 0.9
13 changes: 13 additions & 0 deletions scripts/config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,19 @@ def generate_profiles_config(config):
f"{batch}_{normalize_level}_normalized.csv.gz",
)

# Build path to the feature select output directory
config["feature_select"]["feature_select_output_dir"] = pathlib.Path(
config["feature_select"]["output_basedir"], batch
)

# Build feature select output files
config["feature_select"]["feature_select_output_files"] = {}
for feature_select_level in config["feature_select"]["levels"]:
config["feature_select"]["feature_select_output_files"][feature_select_level] = pathlib.Path(
config["feature_select"]["feature_select_output_dir"],
f"{batch}_{feature_select_level}_normalized_feature_select.csv.gz",
)

return config


Expand Down

0 comments on commit 8c5e99f

Please sign in to comment.