Merge pull request #29 from ajmaurais/pdc_client_update
PDC client update
mriffle authored Feb 26, 2025
2 parents 66a41ed + 5554bff commit ef243f7
Showing 10 changed files with 54 additions and 24 deletions.
2 changes: 1 addition & 1 deletion container_images.config
@@ -4,7 +4,7 @@ params {
     diann: 'quay.io/protio/diann:1.8.1',
     bibliospec: 'quay.io/protio/bibliospec-linux:3.0',
     panorama_client: 'quay.io/protio/panorama-client:1.1.0',
-    pdc_client: 'quay.io/mauraisa/pdc_client:0.15',
+    pdc_client: 'quay.io/mauraisa/pdc_client:2.0.1',
     encyclopedia: 'quay.io/protio/encyclopedia:2.12.30-2',
     encyclopedia3_mriffle: 'quay.io/protio/encyclopedia:3.0.0-MRIFFLE',
     qc_pipeline: 'quay.io/mauraisa/dia_qc_report:2.3.1',
5 changes: 4 additions & 1 deletion docs/source/workflow_parameters.rst
@@ -103,7 +103,7 @@ The ``params`` Section
      - If starting with raw files, this is the value used by ``msconvert`` for the ``do_simasspectra`` parameter. Default: ``true``.
    * -
      - ``msconvert.mz_shift_ppm``
-     - If starting with raw files, ``msconvert`` will shift all mz values by ``n`` ppm when converting to ``mzML``. If ``null`` the mz values are not shifed. Default: ``null``.
+     - If starting with raw files, ``msconvert`` will shift all mz values by ``n`` ppm when converting to ``mzML``. If ``null`` the mz values are not shifted. Default: ``null``.
    * -
      - ``encyclopedia.chromatogram.params``
      - If you are generating a chromatogram library for quantification, this is the command line options passed to EncyclopeDIA during the chromatogram generation step. Default: ``'-enableAdvancedOptions -v2scoring'`` If you do not wish to pass any options to EncyclopeDIA, this must be set to ``''``.
@@ -142,6 +142,9 @@ The ``params`` Section
    * -
      - ``skyline.protein_parsimony``
      - If ``true``, protein parsimony is performed in Skyline. If ``false`` the protein assignments given by the search engine are used as protein groups. Default is ``false``.
+   * -
+     - ``skyline.fasta``
+     - The fasta file to use as a background proteome in Skyline. If ``null`` the same fasta file (``params.fasta``) used for the DIA search is used. Default is ``null``.
    * -
      - ``skyline.group_by_gene``
      - If ``true``, when protein parsimony is performed in Skyline protein groups are formed by gene instead of by protein. Default is ``false``.
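Note: the new ``skyline.fasta`` parameter is optional. For illustration only (not part of this commit), a user config overriding it might look like the sketch below; the file paths are hypothetical.

    params {
        // FASTA used for the DIA search
        fasta = '/data/human_sp.fasta'                        // hypothetical path

        // Optional separate FASTA for the Skyline background proteome.
        // If left as null (the default), params.fasta is reused.
        skyline.fasta = '/data/background_proteome.fasta'     // hypothetical path
    }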
15 changes: 11 additions & 4 deletions main.nf
@@ -54,7 +54,7 @@ workflow {
     all_elib_ch = null // hold all elibs generated by encyclopedia
     all_diann_file_ch = null // all files generated by diann to upload
 
-    // version file channles
+    // version file channels
     encyclopedia_version = null
     diann_version = null
     proteowizard_version = null
@@ -65,6 +65,7 @@
     // check for old param variable names
     params.skyline.document_name = check_old_param_name('skyline_document_name',
                                                         'skyline.document_name')
+    skyline_document_name = params.skyline.document_name
     params.skyline.skip = check_old_param_name('skip_skyline',
                                                'skyline.skip')
     params.skyline.template_file = check_old_param_name('skyline_template_file',
@@ -100,6 +101,7 @@
         get_pdc_files()
         wide_mzml_ch = get_pdc_files.out.wide_mzml_ch
         pdc_study_name = get_pdc_files.out.study_name
+        skyline_document_name = skyline_document_name == 'final' ? pdc_study_name : skyline_document_name
     } else{
         get_wide_mzmls(params.quant_spectra_dir, params.quant_spectra_glob, aws_secret_id)
         wide_mzml_ch = get_wide_mzmls.out.mzml_ch
@@ -156,6 +158,7 @@
         replicate_metadata = get_input_files.out.replicate_metadata
     }
     fasta = get_input_files.out.fasta
+    skyline_fasta = get_input_files.out.skyline_fasta
     skyline_template_zipfile = get_input_files.out.skyline_template_zipfile
     skyr_file_ch = get_input_files.out.skyr_files
 
@@ -366,10 +369,11 @@ workflow {
     if(skyline_template_zipfile != null) {
         skyline_import(
             skyline_template_zipfile,
-            fasta,
+            skyline_fasta,
             final_elib,
             wide_mzml_ch,
-            replicate_metadata
+            replicate_metadata,
+            skyline_document_name
         )
         proteowizard_version = skyline_import.out.proteowizard_version
     }
@@ -433,13 +437,15 @@
                                 dia_qc_version).splitText()
 
     input_files = fasta.map{ it -> ['Fasta file', it.name] }.concat(
+        fasta.map{ it -> ['Skyline fasta file', it.name] },
         spectral_library.map{ it -> ['Spectra library', it.baseName] },
         all_mzml_ch.map{ it -> ['Spectra file', it.baseName] })
 
     save_run_details(input_files.collect(), version_files.collect())
     run_details_file = save_run_details.out.run_details
 
-    combine_file_hashes(fasta, spectral_library,
+    fasta_files = fasta.concat(skyline_fasta).unique()
+    combine_file_hashes(fasta_files, spectral_library,
                         search_file_stats,
                         final_skyline_file,
                         final_skyline_hash,
@@ -486,6 +492,7 @@ def is_panorama_authentication_required() {
 
     return params.panorama.upload ||
        (params.fasta && panorama_auth_required_for_url(params.fasta)) ||
+       (params.skyline.fasta && panorama_auth_required_for_url(params.skyline.fasta)) ||
       (params.spectral_library && panorama_auth_required_for_url(params.spectral_library)) ||
       (params.replicate_metadata && panorama_auth_required_for_url(params.replicate_metadata)) ||
       (params.skyline.template_file && panorama_auth_required_for_url(params.skyline.template_file)) ||
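For clarity (not part of the diff), the document-name fallback added above behaves as in this minimal Groovy sketch: when ``skyline.document_name`` is left at its default (``'final'`` in this pipeline) and spectra come from a PDC study, the Skyline document is named after the study. The study name below is hypothetical.

    def skyline_document_name = 'final'               // pipeline default for skyline.document_name
    def pdc_study_name        = 'Example_PDC_Study'   // hypothetical normalized study name

    skyline_document_name = skyline_document_name == 'final' ? pdc_study_name : skyline_document_name
    assert skyline_document_name == 'Example_PDC_Study'   // final document: Example_PDC_Study.sky.zip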
19 changes: 11 additions & 8 deletions modules/pdc.nf
@@ -15,8 +15,8 @@ process GET_STUDY_METADATA {
         val pdc_study_id
 
     output:
-        path('study_metadata.tsv'), emit: metadata
-        path('study_metadata_annotations.csv'), emit: skyline_annotations
+        path('*_flat.json'), emit: metadata
+        path('*_skyline_annotations.csv'), emit: skyline_annotations
         env(study_id), emit: study_id
         env(study_name), emit: study_name
         path('pdc_client_version.txt'), emit: version
@@ -28,7 +28,7 @@
         '''
         study_id=$(PDC_client studyID !{pdc_client_args} !{pdc_study_id} | tee study_id.txt)
         study_name=$(PDC_client studyName --normalize !{pdc_client_args} ${study_id} | tee study_name.txt)
-        PDC_client metadata !{pdc_client_args} -f tsv !{n_files_arg} --skylineAnnotations ${study_id}
+        PDC_client metadata !{pdc_client_args} --flatten -f json !{n_files_arg} --skylineAnnotations ${study_id}
         echo "pdc_client_git_repo='$GIT_REPO - $GIT_BRANCH [$GIT_SHORT_HASH]'" > pdc_client_version.txt
         '''
@@ -52,20 +52,23 @@ process METADATA_TO_SKY_ANNOTATIONS {
 
 process GET_FILE {
     storeDir "${params.panorama_cache_directory}"
-    label 'process_low_constant'
+    cpus 1
+    memory 8.GB
+    time 2.h
+    maxForks 10
+    errorStrategy { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' }
+    maxRetries 3
     container params.images.pdc_client
-    errorStrategy 'retry'
-    maxRetries 1
 
     input:
-        tuple val(url), val(file_name), val(md5)
+        tuple val(url), val(file_name), val(md5), val(file_size)
 
     output:
        path(file_name), emit: downloaded_file
 
     shell:
        '''
-       PDC_client file -o '!{file_name}' -m '!{md5}' '!{url}'
+       PDC_client file -o '!{file_name}' --size '!{file_size}' --md5sum '!{md5}' --url '!{url}'
       '''
 
    stub:
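A side note on the new retry directive (not part of the diff): the ``errorStrategy`` closure sleeps for ``2^attempt * 200`` ms before each retry, so with ``maxRetries 3`` the delays are roughly 400 ms, 800 ms and 1600 ms, as this small Groovy sketch shows.

    // Same backoff arithmetic as the errorStrategy closure above
    (1..3).each { attempt ->
        long delayMs = Math.pow(2, attempt) * 200 as long
        println "retry ${attempt}: wait ${delayMs} ms"   // prints 400, 800, 1600
    }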
9 changes: 5 additions & 4 deletions modules/skyline.nf
@@ -134,9 +134,10 @@ process SKYLINE_MERGE_RESULTS {
         path skyd_files
         val mzml_files
         path fasta
+        val skyline_document_name
 
     output:
-        path("${params.skyline.document_name}.sky.zip"), emit: final_skyline_zipfile
+        path("*.sky.zip"), emit: final_skyline_zipfile
         path("skyline-merge.stdout"), emit: stdout
         path("skyline-merge.stderr"), emit: stderr
         path('output_file_hashes.txt'), emit: output_file_hashes
@@ -158,13 +159,13 @@
             --in="${skyline_zipfile.baseName}" --memstamp \
             ${import_files_params} \
             ${params.skyline.protein_parsimony ? protein_parsimony_args : ''} \
-            --out="${params.skyline.document_name}.sky" \
+            --out="${skyline_document_name}.sky" \
             --save \
-            --share-zip="${params.skyline.document_name}.sky.zip" \
+            --share-zip="${skyline_document_name}.sky.zip" \
             --share-type="complete" \
             > >(tee 'skyline-merge.stdout') 2> >(tee 'skyline-merge.stderr' >&2)
-        md5sum ${params.skyline.document_name}.sky.zip | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\1\\t\\2/' > output_file_hashes.txt
+        md5sum ${skyline_document_name}.sky.zip | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\1\\t\\2/' > output_file_hashes.txt
         """
 
     stub:
Expand Down
4 changes: 3 additions & 1 deletion nextflow.config
@@ -87,8 +87,10 @@ params {
 
     // Minimize Skyline document?
     skyline.minimize = false
+
     skyline.group_by_gene = false
     skyline.protein_parsimony = false
+    skyline.fasta = null
 
     // Whether or not to use hardlinks with Skyline
     skyline.use_hardlinks = false
@@ -215,7 +217,7 @@ manifest {
     homePage = 'https://github.com/mriffle/nf-skyline-dia-ms'
     description = 'DIA workflows for TEI-REX project'
     mainScript = 'main.nf'
-    nextflowVersion = '!>=21.10.3'
+    nextflowVersion = '!>=23.04.2'
 }
 
 // Capture exit codes from upstream processes when piping
4 changes: 2 additions & 2 deletions workflows/combine_file_hashes.nf
@@ -15,7 +15,7 @@ def get_search_file_dir() {
 
 workflow combine_file_hashes {
     take:
-        fasta
+        fasta_files
         spectral_library
 
         search_file_stats
@@ -55,7 +55,7 @@ workflow combine_file_hashes {
     ).map{ it -> tuple(it[0], it[1], it[3], it[2])}
 
     // Combine files we need to calculate the hash of into a single channel
-    file_stat_files = fasta.concat(spectral_library).map{
+    file_stat_files = fasta_files.concat(spectral_library).map{
         it -> tuple(it.name, it, params.result_dir, it.size())
     }.concat(
         skyline_reports.map{ tuple(it.name, it, params.output_directories.skyline.reports, it.size()) },
12 changes: 12 additions & 0 deletions workflows/get_input_files.nf
@@ -32,6 +32,7 @@ workflow get_input_files {
 
     emit:
         fasta
+        skyline_fasta
         spectral_library
         skyline_template_zipfile
         skyr_files
@@ -51,6 +52,17 @@
         fasta = Channel.empty()
     }
 
+    if(params.skyline.fasta){
+        if(panorama_auth_required_for_url(params.fasta)) {
+            PANORAMA_GET_FASTA(params.skyline.fasta, aws_secret_id)
+            skyline_fasta = PANORAMA_GET_FASTA.out.panorama_file
+        } else {
+            skyline_fasta = Channel.value(file(params.skyline.fasta, checkIfExists: true))
+        }
+    } else {
+        skyline_fasta = fasta
+    }
+
     if(params.spectral_library) {
         if(panorama_auth_required_for_url(params.spectral_library)) {
             PANORAMA_GET_SPECTRAL_LIBRARY(params.spectral_library, aws_secret_id)
4 changes: 2 additions & 2 deletions workflows/get_pdc_files.nf
@@ -38,8 +38,8 @@ workflow get_pdc_files {
     study_name = get_pdc_study_metadata.out.study_name
 
     metadata \
-        | splitCsv(header:true, sep:'\t') \
-        | map{row -> tuple(row.url, row.file_name, row.md5sum)} \
+        | splitJson() \
+        | map{row -> tuple(row['url'], row['file_name'], row['md5sum'], row['file_size'])} \
         | GET_FILE
 
     MSCONVERT(GET_FILE.out.downloaded_file)
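For illustration (not part of the diff): ``splitJson()`` emits one map per record of the flattened metadata file, and the ``map{}`` closure above turns it into the four-element tuple ``GET_FILE`` consumes. A hypothetical record might look like this Groovy sketch; the field values are invented.

    // Hypothetical flattened metadata record
    def row = [
        url      : 'https://example.org/raw/sample_01.raw',
        file_name: 'sample_01.raw',
        md5sum   : 'd41d8cd98f00b204e9800998ecf8427e',
        file_size: 1048576
    ]
    // Field order matches the tuple passed to GET_FILE: (url, file_name, md5sum, file_size)
    def record = [row['url'], row['file_name'], row['md5sum'], row['file_size']]
    assert record.size() == 4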
4 changes: 3 additions & 1 deletion workflows/skyline_import.nf
@@ -14,6 +14,7 @@ workflow skyline_import {
         elib
         wide_mzml_file_ch
         replicate_metadata
+        skyline_document_name
 
     emit:
         skyline_results
@@ -36,7 +37,8 @@
         skyline_zipfile,
         SKYLINE_IMPORT_MZML.out.skyd_file.collect(),
         wide_mzml_file_ch.collect(),
-        fasta
+        fasta,
+        skyline_document_name
     )
 
     if(params.replicate_metadata != null || params.pdc.study_id != null) {
