Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Untargeted dev #998

Merged
merged 3 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion metatlas/untargeted/run_untargeted_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def main():
##### Step 1/7: Syncing LIMS and NERSC to identify new projects with raw data that are not yet in the untargeted task list
new_projects = mzm.update_new_untargeted_tasks(validate_names=args.validate_names, mzmine_batch_params=args.mzmine_batch_params, \
output_dir=args.output_dir, raw_data_dir=args.raw_data_dir, raw_data_subdir=args.raw_data_subdir, \
background_designator=args.background_designator,skip_sync=step_bools[0])
skip_blank_filter=args.skip_blank_filter, background_designator=args.background_designator, \
fps_files_only=args.fps_files_only, skip_sync=step_bools[0])

##### Step 1.5/7: Mirror raw data to GNPS2
if args.hard_raw_data_mirror:
Expand Down Expand Up @@ -83,6 +84,8 @@ def add_arguments(parser):
## Step 1 only
parser.add_argument('--validate_names', action='store_true', help='Validate filenames and project names')
parser.add_argument('--mzmine_batch_params', type=str, default=None, help='Add custom mzmine batch parameters xml')
parser.add_argument('--skip_blank_filter', action='store_true', help='Do not filter out files with "Blank" in the name from the untargeted task list')
parser.add_argument('--fps_files_only', action='store_true', help='Only FPS files will be input, so do not check for polarity in file name and use custom mzmine batch parameters')
## Step 1.5 only
parser.add_argument('--hard_raw_data_mirror', action='store_true', help='Run the raw data mirror to GNPS2 before proceeding with pipeline')
## Step 2 only
Expand Down Expand Up @@ -118,6 +121,12 @@ def add_arguments(parser):

def check_args(args):
##### Check if the input arguments are valid
if args.mzmine_batch_params is not None and not os.path.exists(args.mzmine_batch_params):
logging.error('Custom mzmine batch parameters file does not exist. Please check flag and path.')
sys.exit(1)
if args.fps_files_only and args.mzmine_batch_params is None:
logging.error('FPS files only flag requires custom mzmine batch parameters. Please check flags.')
sys.exit(1)
if args.direct_input:
args.direct_input = args.direct_input.split(',')
if args.background_designator:
Expand Down
109 changes: 80 additions & 29 deletions metatlas/untargeted/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1828,12 +1828,80 @@ def write_mzmine_sbatch_and_runner(
with open(runner_filename,'w') as fid:
fid.write('sbatch %s'%sbatch_filename)

def metadata_file_filter(data, polarity, skip_blank_filter=False, fps_files_only=False, nonstandard_filename=False):
    """
    Select the basenames in ``data['basename']`` that should be used when
    building metadata for one polarity of a project.

    A basename is always rejected when it is flagged as a QC, injection blank
    or internal standard run (markers 'QC', 'InjBl', 'ISTD'). The three flags
    adjust the remaining checks:

    Parameters
    ----------
    data
        Table-like object whose 'basename' column holds file basenames (str).
    polarity
        Polarity token to look for, e.g. "POS" or "NEG". Ignored when
        ``fps_files_only`` is True.
    skip_blank_filter
        When False (default), any basename containing 'Blank' is rejected.
        When True, that check is skipped.
    fps_files_only
        When True, no polarity check is applied to the basename.
    nonstandard_filename
        When False (default), the basename is split on '_' and must have more
        than 12 fields; polarity is looked up in field 9 and the exclusion
        markers are looked up in field 12 only. When True, no field layout is
        assumed: polarity must appear as ``_<polarity>_`` anywhere in the name
        and the exclusion markers are searched in the whole name.

    Returns
    -------
    list
        Matching basenames, in their original order.
    """
    excluded_markers = ('QC', 'InjBl', 'ISTD')

    def keep(basename):
        # The 'Blank' check always applies to the whole name, whatever the layout.
        if not skip_blank_filter and 'Blank' in basename:
            return False

        if nonstandard_filename:
            if not fps_files_only and "_" + polarity + "_" not in basename:
                return False
            return not any(marker in basename for marker in excluded_markers)

        # Standard layout: split once and require field 12 to exist. A name
        # long enough to have field 12 necessarily has field 9 as well, so a
        # single length check covers both lookups.
        fields = basename.split('_')
        if len(fields) <= 12:
            return False
        if not fps_files_only and polarity not in fields[9]:
            return False
        return not any(marker in fields[12] for marker in excluded_markers)

    return [basename for basename in data['basename'] if keep(basename)]

def write_metadata_per_new_project(
df: pd.DataFrame,
background_designator: List[str],
validate_names: bool,
raw_data_dir: str,
raw_data_subdir: Optional[str] = None
raw_data_subdir: Optional[str] = None,
skip_blank_filter: Optional[bool] = False,
fps_files_only: Optional[bool] = False
) -> List:
"""
Takes a LIMS table (usually raw data from lcmsruns_plus) and creates
Expand Down Expand Up @@ -1883,19 +1951,9 @@ def write_metadata_per_new_project(

# Determine which polarities need metadata
try:
positive_file_subset = df_filtered['basename'][df_filtered['basename'].apply(
lambda x: len(x.split('_')) > 9 and 'POS' in x.split('_')[9] and
'Blank' not in x and
len(x.split('_')) > 12 and 'QC' not in x.split('_')[12] and
'InjBl' not in x.split('_')[12] and
'ISTD' not in x.split('_')[12])].to_list()
positive_file_subset = metadata_file_filter(df_filtered, polarity="POS", skip_blank_filter=skip_blank_filter, fps_files_only=fps_files_only, nonstandard_filename=False)
except IndexError:
positive_file_subset = df_filtered['basename'][df_filtered['basename'].apply(
lambda x: '_POS_' in x and
'Blank' not in x and
'QC' not in x and
'InjBl' not in x and
'ISTD' not in x)].to_list()
positive_file_subset = metadata_file_filter(df_filtered, polarity="POS", skip_blank_filter=skip_blank_filter, fps_files_only=fps_files_only, nonstandard_filename=True)
if positive_file_subset:
positive_file_list = [os.path.join(full_mzml_path, file) for file in positive_file_subset]
positive_file_count = len(positive_file_list)
Expand All @@ -1915,19 +1973,9 @@ def write_metadata_per_new_project(
positive_file_count = 0

try:
negative_file_subset = df_filtered['basename'][df_filtered['basename'].apply(
lambda x: len(x.split('_')) > 9 and 'NEG' in x.split('_')[9] and
'Blank' not in x and
len(x.split('_')) > 12 and 'QC' not in x.split('_')[12] and
'InjBl' not in x.split('_')[12] and
'ISTD' not in x.split('_')[12])].to_list()
negative_file_subset = metadata_file_filter(df_filtered, polarity="NEG", skip_blank_filter=skip_blank_filter, fps_files_only=fps_files_only, nonstandard_filename=False)
except IndexError:
negative_file_subset = df_filtered['basename'][df_filtered['basename'].apply(
lambda x: '_NEG_' in x and
'Blank' not in x and
'QC' not in x and
'InjBl' not in x and
'ISTD' not in x)].to_list()
negative_file_subset = metadata_file_filter(df_filtered, polarity="NEG", skip_blank_filter=skip_blank_filter, fps_files_only=fps_files_only, nonstandard_filename=True)
if negative_file_subset:
negative_file_list = [os.path.join(full_mzml_path, file) for file in negative_file_subset]
negative_file_count = len(negative_file_list)
Expand Down Expand Up @@ -1994,11 +2042,13 @@ def write_metadata_per_new_project(
def update_new_untargeted_tasks(
background_designator: List[str],
validate_names: bool,
mzmine_batch_params: str,
skip_sync: bool,
output_dir: str,
raw_data_dir: str,
raw_data_subdir: Optional[str] = None
mzmine_batch_params: Optional[str] = None,
raw_data_subdir: Optional[str] = None,
skip_blank_filter: Optional[bool] = False,
fps_files_only: Optional[bool] = False
) -> None:
"""
This script is called by run_mzmine.py before the untargeted pipeline kicks off
Expand Down Expand Up @@ -2065,8 +2115,8 @@ def update_new_untargeted_tasks(
# Check for polarities by looking for positive and negative mzml files
df_new = df[df['parent_dir'].isin(new_folders)]
logging.info(tab_print("Checking for polarities in new projects and validating mzml file names...", 1))
new_project_info_list = write_metadata_per_new_project(df=df_new,background_designator=background_designator,raw_data_dir=raw_data_dir, \
raw_data_subdir=raw_data_subdir, validate_names=validate_names)
new_project_info_list = write_metadata_per_new_project(df=df_new,background_designator=background_designator,raw_data_dir=raw_data_dir, fps_files_only=fps_files_only, \
skip_blank_filter=skip_blank_filter, raw_data_subdir=raw_data_subdir, validate_names=validate_names)
new_project_info_list_subset = [d for d in new_project_info_list if d.get('polarities') is not None]

# Create metadata for new projects with relevant polarities
Expand All @@ -2086,6 +2136,7 @@ def update_new_untargeted_tasks(
logging.info(tab_print("Inferred machine name: %s"%(validate_machine_name), 2))
if mzmine_batch_params is None:
if validate_machine_name is None: # Assume more lenient parameters if machine name cannot be validated
logging.warning(tab_print("Warning! Could not validate machine name. Using lenient (IQX) MZmine parameters...", 2))
mzmine_running_parameters = mzine_batch_params_file_iqx
mzmine_parameter = 5
elif any(substring in validate_machine_name.lower() for substring in ("iqx", "idx")):
Expand Down