biorack · bkieft-usa · Dec 10, 2024 · Dec 10, 2024 · Dec 10, 2024 · Dec 10, 2024
diff --git a/metatlas/untargeted/run_untargeted_pipeline.py b/metatlas/untargeted/run_untargeted_pipeline.py
@@ -29,7 +29,8 @@ def main():
     ##### Step 1/7: Syncing LIMS and NERSC to identify new projects with raw data that are not yet in the untargeted task list
     new_projects = mzm.update_new_untargeted_tasks(validate_names=args.validate_names, mzmine_batch_params=args.mzmine_batch_params, \
                                                    output_dir=args.output_dir, raw_data_dir=args.raw_data_dir, raw_data_subdir=args.raw_data_subdir, \
-                                                   background_designator=args.background_designator,skip_sync=step_bools[0])
+                                                   skip_blank_filter=args.skip_blank_filter, background_designator=args.background_designator, \
+                                                   fps_files_only=args.fps_files_only, skip_sync=step_bools[0])
 
     ##### Step 1.5/7: Mirror raw data to GNPS2
     if args.hard_raw_data_mirror:
@@ -83,6 +84,8 @@ def add_arguments(parser):
     ## Step 1 only
     parser.add_argument('--validate_names', action='store_true', help='Validate filenames and project names')
     parser.add_argument('--mzmine_batch_params', type=str, default=None, help='Add custom mzmine batch parameters xml')
+    parser.add_argument('--skip_blank_filter', action='store_true', help='Do not filter out files with "Blank" in the name from the untargeted task list')
+    parser.add_argument('--fps_files_only', action='store_true', help='Only FPS files will be input, so do not check for polarity in file name and use custom mzmine batch parameters')
     ## Step 1.5 only
     parser.add_argument('--hard_raw_data_mirror', action='store_true', help='Run the raw data mirror to GNPS2 before proceeding with pipeline')
     ## Step 2 only
@@ -118,6 +121,12 @@ def add_arguments(parser):
 
 def check_args(args):
     ##### Check if the input arguments are valid
+    if args.mzmine_batch_params is not None and not os.path.exists(args.mzmine_batch_params):
+        logging.error('Custom mzmine batch parameters file does not exist. Please check flag and path.')
+        sys.exit(1)
+    if args.fps_files_only and args.mzmine_batch_params is None:
+        logging.error('FPS files only flag requires custom mzmine batch parameters. Please check flags.')
+        sys.exit(1)
     if args.direct_input:
         args.direct_input = args.direct_input.split(',')
     if args.background_designator:

diff --git a/metatlas/untargeted/tools.py b/metatlas/untargeted/tools.py
@@ -1828,12 +1828,80 @@ def write_mzmine_sbatch_and_runner(
     with open(runner_filename,'w') as fid:
         fid.write('sbatch %s'%sbatch_filename)
 
+def metadata_file_filter(data, polarity, skip_blank_filter=False, fps_files_only=False, nonstandard_filename=False):
+    """
+    Select the basenames in a table that should get metadata for one polarity.
+
+    Names carrying a QC, InjBl or ISTD marker are always dropped; where the
+    marker is looked for depends on the filename layout. Every combination
+    of the three flags is handled by the single predicate below.
+
+    Args:
+        data: table with a 'basename' column holding the file names to screen.
+        polarity: polarity token to require in the name; the callers in this
+            file pass "POS" or "NEG".
+        skip_blank_filter: if true, keep names containing 'Blank'; otherwise
+            any name containing 'Blank' is dropped.
+        fps_files_only: if true, do not require the polarity token at all.
+        nonstandard_filename: selects how the name is read.
+            False: the name is split on '_' and must have more than 12
+            fields; polarity is looked for in field 9 and the QC/InjBl/ISTD
+            markers in field 12 (both zero-based).
+            True: every check is a substring test on the whole name, and the
+            polarity must appear delimited as '_<polarity>_'.
+
+    Returns:
+        List of the basenames that pass, in the order they appear in data.
+
+    Note:
+        The standard-layout path rejects short names through its length
+        guard, so it does not raise IndexError for names with too few fields.
+    """
+    # Name fragments that are excluded under every flag combination.
+    excluded_markers = ('QC', 'InjBl', 'ISTD')
+
+    def has_excluded_marker(text):
+        return any(marker in text for marker in excluded_markers)
+
+    def keep(name):
+        """Return True if this basename passes every active check."""
+        # The 'Blank' screen always looks at the whole name, whichever
+        # filename layout is in use.
+        if not skip_blank_filter and 'Blank' in name:
+            return False
+
+        if nonstandard_filename:
+            # No fixed field layout to rely on: use substring tests.
+            if not fps_files_only and "_" + polarity + "_" not in name:
+                return False
+            return not has_excluded_marker(name)
+
+        # Standard layout: split once and index into the fields.
+        fields = name.split('_')
+
+        # Field 12 is read below, so names with 12 or fewer fields cannot
+        # match; this guard also covers the field 9 lookup.
+        if len(fields) <= 12:
+            return False
+
+        # FPS-only input is not checked for polarity in the file name.
+        if not fps_files_only and polarity not in fields[9]:
+            return False
+
+        return not has_excluded_marker(fields[12])
+
+    # Build a boolean mask over the column and keep the rows that pass.
+    basenames = data['basename']
+    return basenames[basenames.apply(keep)].to_list()
+
 def write_metadata_per_new_project(
     df: pd.DataFrame,
     background_designator: List[str],
     validate_names: bool,
     raw_data_dir: str,
-    raw_data_subdir: Optional[str] = None
+    raw_data_subdir: Optional[str] = None,
+    skip_blank_filter: Optional[bool] = False,
+    fps_files_only: Optional[bool] = False
 ) -> List:
     """
     Takes a LIMS table (usually raw data from lcmsruns_plus) and creates
@@ -1883,19 +1951,9 @@ def write_metadata_per_new_project(
 
         # Determine which polarities need metadata
         try:
-            positive_file_subset = df_filtered['basename'][df_filtered['basename'].apply(
-                lambda x: len(x.split('_')) > 9 and 'POS' in x.split('_')[9] and
-                'Blank' not in x and
-                len(x.split('_')) > 12 and 'QC' not in x.split('_')[12] and
-                'InjBl' not in x.split('_')[12] and
-                'ISTD' not in x.split('_')[12])].to_list()
+            positive_file_subset = metadata_file_filter(df_filtered, polarity="POS", skip_blank_filter=skip_blank_filter, fps_files_only=fps_files_only, nonstandard_filename=False)
         except IndexError:
-            positive_file_subset = df_filtered['basename'][df_filtered['basename'].apply(
-                lambda x: '_POS_' in x and
-                'Blank' not in x and
-                'QC' not in x and
-                'InjBl' not in x and
-                'ISTD' not in x)].to_list()
+            positive_file_subset = metadata_file_filter(df_filtered, polarity="POS", skip_blank_filter=skip_blank_filter, fps_files_only=fps_files_only, nonstandard_filename=True)
         if positive_file_subset:
             positive_file_list = [os.path.join(full_mzml_path, file) for file in positive_file_subset]
             positive_file_count = len(positive_file_list)
@@ -1915,19 +1973,9 @@ def write_metadata_per_new_project(
             positive_file_count = 0
 
         try:
-            negative_file_subset = df_filtered['basename'][df_filtered['basename'].apply(
-                lambda x: len(x.split('_')) > 9 and 'NEG' in x.split('_')[9] and
-                'Blank' not in x and
-                len(x.split('_')) > 12 and 'QC' not in x.split('_')[12] and
-                'InjBl' not in x.split('_')[12] and
-                'ISTD' not in x.split('_')[12])].to_list()
+            negative_file_subset = metadata_file_filter(df_filtered, polarity="NEG", skip_blank_filter=skip_blank_filter, fps_files_only=fps_files_only, nonstandard_filename=False)
         except IndexError:
-            negative_file_subset = df_filtered['basename'][df_filtered['basename'].apply(
-                lambda x: '_NEG_' in x and
-                'Blank' not in x and
-                'QC' not in x and
-                'InjBl' not in x and
-                'ISTD' not in x)].to_list()
+            negative_file_subset = metadata_file_filter(df_filtered, polarity="NEG", skip_blank_filter=skip_blank_filter, fps_files_only=fps_files_only, nonstandard_filename=True)
         if negative_file_subset:
             negative_file_list = [os.path.join(full_mzml_path, file) for file in negative_file_subset]
             negative_file_count = len(negative_file_list)
@@ -1994,11 +2042,13 @@ def write_metadata_per_new_project(
 def update_new_untargeted_tasks(
     background_designator: List[str],
     validate_names: bool,
-    mzmine_batch_params: str,
     skip_sync: bool,
     output_dir: str,
     raw_data_dir: str,
-    raw_data_subdir: Optional[str] = None
+    mzmine_batch_params: Optional[str] = None,
+    raw_data_subdir: Optional[str] = None,
+    skip_blank_filter: Optional[bool] = False,
+    fps_files_only: Optional[bool] = False
 ) -> None:
     """
     This script is called by run_mzmine.py before the untargeted pipeline kicks off
@@ -2065,8 +2115,8 @@ def update_new_untargeted_tasks(
     # Check for polarities by looking for positive and negative mzml files
     df_new = df[df['parent_dir'].isin(new_folders)]
     logging.info(tab_print("Checking for polarities in new projects and validating mzml file names...", 1))
-    new_project_info_list = write_metadata_per_new_project(df=df_new,background_designator=background_designator,raw_data_dir=raw_data_dir, \
-                                                           raw_data_subdir=raw_data_subdir, validate_names=validate_names)
+    new_project_info_list = write_metadata_per_new_project(df=df_new,background_designator=background_designator,raw_data_dir=raw_data_dir, fps_files_only=fps_files_only, \
+                                                           skip_blank_filter=skip_blank_filter, raw_data_subdir=raw_data_subdir, validate_names=validate_names)
     new_project_info_list_subset = [d for d in new_project_info_list if d.get('polarities') is not None]
 
     # Create metadata for new projects with relevant polarities
@@ -2086,6 +2136,7 @@ def update_new_untargeted_tasks(
             logging.info(tab_print("Inferred machine name: %s"%(validate_machine_name), 2))
             if mzmine_batch_params is None:
                 if validate_machine_name is None:  # Assume more lenient parameters if machine name cannot be validated
+                    logging.warning(tab_print("Warning! Could not validate machine name. Using lenient (IQX) MZmine parameters...", 2))
                     mzmine_running_parameters = mzine_batch_params_file_iqx
                     mzmine_parameter = 5
                 elif any(substring in validate_machine_name.lower() for substring in ("iqx", "idx")):