Skip to content

Commit

Permalink
Merge pull request #998 from biorack/untargeted_dev
Browse files Browse the repository at this point in the history
Untargeted dev
  • Loading branch information
bkieft-usa authored Dec 10, 2024
2 parents 192f8e5 + 12e0df1 commit 8739411
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 30 deletions.
11 changes: 10 additions & 1 deletion metatlas/untargeted/run_untargeted_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def main():
##### Step 1/7: Syncing LIMS and NERSC to identify new projects with raw data that are not yet in the untargeted task list
new_projects = mzm.update_new_untargeted_tasks(validate_names=args.validate_names, mzmine_batch_params=args.mzmine_batch_params, \
output_dir=args.output_dir, raw_data_dir=args.raw_data_dir, raw_data_subdir=args.raw_data_subdir, \
background_designator=args.background_designator,skip_sync=step_bools[0])
skip_blank_filter=args.skip_blank_filter, background_designator=args.background_designator, \
fps_files_only=args.fps_files_only, skip_sync=step_bools[0])

##### Step 1.5/7: Mirror raw data to GNPS2
if args.hard_raw_data_mirror:
Expand Down Expand Up @@ -83,6 +84,8 @@ def add_arguments(parser):
## Step 1 only
parser.add_argument('--validate_names', action='store_true', help='Validate filenames and project names')
parser.add_argument('--mzmine_batch_params', type=str, default=None, help='Add custom mzmine batch parameters xml')
parser.add_argument('--skip_blank_filter', action='store_true', help='Do not filter out files with "Blank" in the name from the untargeted task list')
parser.add_argument('--fps_files_only', action='store_true', help='Only FPS files will be input, so do not check for polarity in file name and use custom mzmine batch parameters')
## Step 1.5 only
parser.add_argument('--hard_raw_data_mirror', action='store_true', help='Run the raw data mirror to GNPS2 before proceeding with pipeline')
## Step 2 only
Expand Down Expand Up @@ -118,6 +121,12 @@ def add_arguments(parser):

def check_args(args):
##### Check if the input arguments are valid
if args.mzmine_batch_params is not None and not os.path.exists(args.mzmine_batch_params):
logging.error('Custom mzmine batch parameters file does not exist. Please check flag and path.')
sys.exit(1)
if args.fps_files_only and args.mzmine_batch_params is None:
logging.error('FPS files only flag requires custom mzmine batch parameters. Please check flags.')
sys.exit(1)
if args.direct_input:
args.direct_input = args.direct_input.split(',')
if args.background_designator:
Expand Down
109 changes: 80 additions & 29 deletions metatlas/untargeted/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1828,12 +1828,80 @@ def write_mzmine_sbatch_and_runner(
with open(runner_filename,'w') as fid:
fid.write('sbatch %s'%sbatch_filename)

def metadata_file_filter(data, polarity, skip_blank_filter=False, fps_files_only=False, nonstandard_filename=False):
    """Return the basenames in ``data['basename']`` that should get metadata.

    The three flags independently toggle parts of a single filter, so they
    are evaluated in one predicate instead of one copy-pasted branch per
    flag combination.

    Args:
        data: Table-like object whose ``'basename'`` column holds file
            basenames (strings).
        polarity: Polarity token to look for (the caller passes ``"POS"``
            or ``"NEG"``). Ignored when ``fps_files_only`` is true.
        skip_blank_filter: When false, any basename containing ``'Blank'``
            is dropped. When true, that check is skipped.
        fps_files_only: When true, no polarity check is applied.
        nonstandard_filename: When false, a basename must split on ``'_'``
            into more than 12 fields; polarity is looked up in field 9 and
            the QC/InjBl/ISTD exclusions are applied to field 12. When
            true, no field structure is required: polarity must appear as
            ``_<polarity>_`` anywhere in the name and the exclusions are
            applied to the whole name.

    Returns:
        List of matching basenames, in their original order.
    """
    excluded_tags = ('QC', 'InjBl', 'ISTD')

    def _keep(basename):
        # The 'Blank' check always looks at the whole name, in both modes.
        if not skip_blank_filter and 'Blank' in basename:
            return False

        if nonstandard_filename:
            if not fps_files_only and "_" + polarity + "_" not in basename:
                return False
            tag_source = basename
        else:
            # Split once; requiring more than 12 fields also covers the
            # field-9 lookup, so indexing below can never go out of range.
            fields = basename.split('_')
            if len(fields) <= 12:
                return False
            if not fps_files_only and polarity not in fields[9]:
                return False
            tag_source = fields[12]

        return not any(tag in tag_source for tag in excluded_tags)

    # Plain iteration keeps column order and handles an empty column
    # without relying on boolean-mask indexing semantics.
    return [basename for basename in data['basename'] if _keep(basename)]

def write_metadata_per_new_project(
df: pd.DataFrame,
background_designator: List[str],
validate_names: bool,
raw_data_dir: str,
raw_data_subdir: Optional[str] = None
raw_data_subdir: Optional[str] = None,
skip_blank_filter: Optional[bool] = False,
fps_files_only: Optional[bool] = False
) -> List:
"""
Takes a LIMS table (usually raw data from lcmsruns_plus) and creates
Expand Down Expand Up @@ -1883,19 +1951,9 @@ def write_metadata_per_new_project(

# Determine which polarities need metadata
try:
positive_file_subset = df_filtered['basename'][df_filtered['basename'].apply(
lambda x: len(x.split('_')) > 9 and 'POS' in x.split('_')[9] and
'Blank' not in x and
len(x.split('_')) > 12 and 'QC' not in x.split('_')[12] and
'InjBl' not in x.split('_')[12] and
'ISTD' not in x.split('_')[12])].to_list()
positive_file_subset = metadata_file_filter(df_filtered, polarity="POS", skip_blank_filter=skip_blank_filter, fps_files_only=fps_files_only, nonstandard_filename=False)
except IndexError:
positive_file_subset = df_filtered['basename'][df_filtered['basename'].apply(
lambda x: '_POS_' in x and
'Blank' not in x and
'QC' not in x and
'InjBl' not in x and
'ISTD' not in x)].to_list()
positive_file_subset = metadata_file_filter(df_filtered, polarity="POS", skip_blank_filter=skip_blank_filter, fps_files_only=fps_files_only, nonstandard_filename=True)
if positive_file_subset:
positive_file_list = [os.path.join(full_mzml_path, file) for file in positive_file_subset]
positive_file_count = len(positive_file_list)
Expand All @@ -1915,19 +1973,9 @@ def write_metadata_per_new_project(
positive_file_count = 0

try:
negative_file_subset = df_filtered['basename'][df_filtered['basename'].apply(
lambda x: len(x.split('_')) > 9 and 'NEG' in x.split('_')[9] and
'Blank' not in x and
len(x.split('_')) > 12 and 'QC' not in x.split('_')[12] and
'InjBl' not in x.split('_')[12] and
'ISTD' not in x.split('_')[12])].to_list()
negative_file_subset = metadata_file_filter(df_filtered, polarity="NEG", skip_blank_filter=skip_blank_filter, fps_files_only=fps_files_only, nonstandard_filename=False)
except IndexError:
negative_file_subset = df_filtered['basename'][df_filtered['basename'].apply(
lambda x: '_NEG_' in x and
'Blank' not in x and
'QC' not in x and
'InjBl' not in x and
'ISTD' not in x)].to_list()
negative_file_subset = metadata_file_filter(df_filtered, polarity="NEG", skip_blank_filter=skip_blank_filter, fps_files_only=fps_files_only, nonstandard_filename=True)
if negative_file_subset:
negative_file_list = [os.path.join(full_mzml_path, file) for file in negative_file_subset]
negative_file_count = len(negative_file_list)
Expand Down Expand Up @@ -1994,11 +2042,13 @@ def write_metadata_per_new_project(
def update_new_untargeted_tasks(
background_designator: List[str],
validate_names: bool,
mzmine_batch_params: str,
skip_sync: bool,
output_dir: str,
raw_data_dir: str,
raw_data_subdir: Optional[str] = None
mzmine_batch_params: Optional[str] = None,
raw_data_subdir: Optional[str] = None,
skip_blank_filter: Optional[bool] = False,
fps_files_only: Optional[bool] = False
) -> None:
"""
This script is called by run_mzmine.py before the untargeted pipeline kicks off
Expand Down Expand Up @@ -2065,8 +2115,8 @@ def update_new_untargeted_tasks(
# Check for polarities by looking for positive and negative mzml files
df_new = df[df['parent_dir'].isin(new_folders)]
logging.info(tab_print("Checking for polarities in new projects and validating mzml file names...", 1))
new_project_info_list = write_metadata_per_new_project(df=df_new,background_designator=background_designator,raw_data_dir=raw_data_dir, \
raw_data_subdir=raw_data_subdir, validate_names=validate_names)
new_project_info_list = write_metadata_per_new_project(df=df_new,background_designator=background_designator,raw_data_dir=raw_data_dir, fps_files_only=fps_files_only, \
skip_blank_filter=skip_blank_filter, raw_data_subdir=raw_data_subdir, validate_names=validate_names)
new_project_info_list_subset = [d for d in new_project_info_list if d.get('polarities') is not None]

# Create metadata for new projects with relevant polarities
Expand All @@ -2086,6 +2136,7 @@ def update_new_untargeted_tasks(
logging.info(tab_print("Inferred machine name: %s"%(validate_machine_name), 2))
if mzmine_batch_params is None:
if validate_machine_name is None: # Assume more lenient parameters if machine name cannot be validated
logging.warning(tab_print("Warning! Could not validate machine name. Using lenient (IQX) MZmine parameters...", 2))
mzmine_running_parameters = mzine_batch_params_file_iqx
mzmine_parameter = 5
elif any(substring in validate_machine_name.lower() for substring in ("iqx", "idx")):
Expand Down

0 comments on commit 8739411

Please sign in to comment.