scipion-em · jamesmkrieger · Jun 18, 2024 · Jun 18, 2024
diff --git a/prody2/constants.py b/prody2/constants.py
@@ -59,3 +59,7 @@ def getProDyEnvName(version):
 PWALIGN = 1 # biopython pwalign local pairwise sequence alignment after trivial mapping
 CEALIGN = 2 # combinatorial extension (CE) as in PyMOL
 DEFAULT = 3 # try pwalign then CE
+
+UNITE_CHAINS_LABEL = "Unite chains in mmCIF segments"
+UNITE_CHAINS_HELP = ('Elect whether to unite chains in mmCIF segments for each structure like ChimeraX. '
+                     'Default is **False**, which means the smaller unit IDs are used for chains like PyMOL.')
diff --git a/prody2/objects/data.py b/prody2/objects/data.py
@@ -502,3 +502,68 @@ def loadAndWriteEnsemble(cls):
                               objLabel=cls.ens.getLabels()[j],
                               weight=weights[j])
             cls.npz.append(frame)
+
+
+hitEntryTypes = {
+    'num': String,
+    'bit-score': Float,
+    'score': Float,
+    'evalue': Float,
+    'query-from': Integer,
+    'query-to': Integer,
+    'hit-from': Integer,
+    'hit-to': Integer,
+    'query-frame': Integer,
+    'hit-frame': Integer,
+    'identity': Integer,
+    'positive': Integer,
+    'gaps': Integer,
+    'align-len': Integer,
+    'qseq': String,
+    'hseq': String,
+    'midline': String,
+    'query-len': Integer,
+    'percent_identity': Float,
+    'percent_coverage': Float,
+    'pdb_id': String,
+    'chain_id': String,
+    'title': String
+}
+
+class BlastHit(EMObject):
+    """Represents each hit from blastPDB"""
+
+    def __init__(self, key=None, hit=None, **kwargs):
+        """
+        Params:
+        :param hit: a hit from the BlastRecord
+        """
+        EMObject.__init__(self, **kwargs)
+
+        self._key = String(key)
+        for key, value in hit.items():
+            persistType = hitEntryTypes[key]
+            setattr(self, key.replace('-', '_'), 
+                    persistType(value))
+
+class SetOfBlastHits(EMSet):
+    """ Represents a set of Blast hits"""
+    ITEM_TYPE = BlastHit
+
+def createSetOfBlastResults(blastRec, protocol, percentId=0.0, 
+                            percentOverlap=0.0, chain=False):
+    """Create Scipion SetOfBlastResults from ProDy BlastRecord *blastRec*
+
+    The hits can be filtered by providing *percent_identity* and *percent_overlap* 
+    to getHits. The argument *chain* specifies whether to return individual chains 
+    as separate hits (default False).
+    """
+    hitsSet = SetOfBlastHits().create(protocol._getExtraPath())
+    for key, value in blastRec.getHits(percent_identity=percentId, 
+                                       percent_overlap=percentOverlap, 
+                                       chain=chain).items():
+        hit = BlastHit(key, value)
+        hitsSet.append(hit)
+
+    return hitsSet
+
diff --git a/prody2/protocols/__init__.py b/prody2/protocols/__init__.py
@@ -32,3 +32,5 @@
 from .protocol_pdbfixer import ProDyPDBFixer
 from .protocol_clustenm import ProDyClustENM
 from .protocol_bioexcel import ProDyBioExcelCV19
+
+from .protocol_blastpdb import ProDyBlastPDB
diff --git a/prody2/protocols/protocol_atoms.py b/prody2/protocols/protocol_atoms.py
@@ -51,9 +51,7 @@
 def notFoundException(inputFn):
     return Exception("Atomic structure not found at *%s*" % inputFn)
 
-UNITE_CHAINS_LABEL = "Unite chains in mmCIF segments"
-UNITE_CHAINS_HELP = ('Elect whether to unite chains in mmCIF segments for each structure like ChimeraX. '
-                     'Default is **False**, which means the smaller unit IDs are used for chains like PyMOL.')
+from prody2.constants import UNITE_CHAINS_LABEL, UNITE_CHAINS_HELP
 
 IMPORT_FROM_ID_CONDITION = 'inputPdbData == IMPORT_FROM_ID'
 SUMMARY_NO_OUTPUT = 'Output structure not ready yet'

diff --git a/prody2/protocols/protocol_blastpdb.py b/prody2/protocols/protocol_blastpdb.py
@@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+# **************************************************************************
+# *
+# * Authors:     James Krieger (jmkrieger@cnb.csic.es)
+# *
+# * Centro Nacional de Biotecnologia, CSIC
+# *
+# * This program is free software; you can redistribute it and/or modify
+# * it under the terms of the GNU General Public License as published by
+# * the Free Software Foundation; either version 2 of the License, or
+# * (at your option) any later version.
+# *
+# * This program is distributed in the hope that it will be useful,
+# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# * GNU General Public License for more details.
+# *
+# * You should have received a copy of the GNU General Public License
+# * along with this program; if not, write to the Free Software
+# * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+# * 02111-1307  USA
+# *
+# *  All comments concerning this program package may be sent to the
+# *  e-mail address 'scipion@cnb.csic.es'
+# *
+# **************************************************************************
+
+
+"""
+This module will provide the ProDy interface for running an NCBI BLAST
+search against the PDB
+"""
+from pwem.protocols import EMProtocol
+from pyworkflow.protocol import params
+
+import prody
+
+from prody2.objects import SetOfBlastHits, createSetOfBlastResults
+from prody2.constants import UNITE_CHAINS_LABEL, UNITE_CHAINS_HELP
+from prody2 import fixVerbositySecondary, restoreVerbositySecondary
+
+
+class ProDyBlastPDB(EMProtocol):
+    """
+    This module will provide the ProDy interface for running an 
+    NCBI BLAST search against the PDB
+    """
+    _label = 'BlastPDB'
+    _possibleOutputs = {'outputResults': SetOfBlastHits}
+
+    IMPORT_FROM_STRUCT = 0
+    IMPORT_FROM_SEQ = 1
+    USE_TEXT = 2
+
+    # -------------------------- DEFINE param functions ----------------------
+    def _defineParams(self, form):
+        """ Define the input parameters that will be used.
+        Params:
+            form: this is the form to be populated with sections and params.
+        """
+        form.addSection(label='ProDy BlastPDB parsing')
+
+        form.addParam('inputSeqData', params.EnumParam, choices=['AtomStruct', 'Sequence', 'text'],
+                      label="Import sequence from",
+                      default=self.USE_TEXT,
+                      display=params.EnumParam.DISPLAY_HLIST,
+                      help='Extract sequence from AtomStruct, Sequence or text')
+
+        form.addParam('inputStructure', params.PointerParam, label="Input structure",
+                      condition='inputSeqData == IMPORT_FROM_STRUCT',
+                      pointerClass='AtomStruct',
+                      help='Provide any atomic structure with a sequence')
+
+        form.addParam('uniteChains', params.BooleanParam, default=False,
+                      condition='inputSeqData == IMPORT_FROM_STRUCT',
+                      label=UNITE_CHAINS_LABEL,
+                      help=UNITE_CHAINS_HELP)
+
+        form.addParam('inputSequence', params.PointerParam, label="Input sequence",
+                      condition='inputSeqData == IMPORT_FROM_SEQ',
+                      pointerClass='Sequence',
+                      help='Provide any Sequence object')
+
+        form.addParam('inputSeqText', params.TextParam, width=50,
+                       condition='inputSeqData == USE_TEXT', default="",
+                       label='Input sequence',
+                       help='Defined order of chains from custom matching.')       
+
+        form.addParam('seqid', params.FloatParam, default=0.,
+                      label="Sequence identity percentage cutoff",
+                      help='Hits with lower percent sequence identity will not be included.\n'
+                           'This can be a number between 0 and 100')
+
+        form.addParam('overlap', params.FloatParam, default=0.,
+                      label="Overlap percentage cutoff",
+                      help='Hits with lower percent sequence coverage will not be included.\n'
+                           'This can be a number between 0 and 100')
+
+        form.addParam('separateChains', params.BooleanParam, default=False,
+                      label="Whether to separate chains",
+                      help="If true, a separate entry will be provided for each chain")
+
+    # --------------------------- STEPS functions ------------------------------
+    def _insertAllSteps(self):
+        # Insert processing steps
+        self._insertFunctionStep('computeStep')
+        self._insertFunctionStep('createOutputStep')
+
+    def computeStep(self):
+        """Run blastPDB and save the results to an xml file"""
+
+        if self.inputSeqData.get() == self.IMPORT_FROM_STRUCT:
+
+            fixVerbositySecondary(self)
+            ag = prody.parsePDB(self.inputStructure.get().getFileName(),
+                                unite_chains=self.uniteChains.get())
+            restoreVerbositySecondary(self)
+
+            sequence = ag.ca.getSequence()
+
+        elif self.inputSeqData.get() == self.IMPORT_FROM_SEQ:
+            sequence = self.inputSequence.get().getSequence()
+
+        else:
+            sequence = self.inputSeqText.get()
+
+        self.blastRec = prody.blastPDB(sequence=sequence, 
+                                       filename=self._getExtraPath('results.xml'), 
+                                       timeout=1e10) # ensure completion
+
+    def createOutputStep(self):
+        outputHitsSet = createSetOfBlastResults(self.blastRec, self,
+                                                self.seqid.get(),
+                                                self.overlap.get(),
+                                                self.separateChains.get())
+        self._defineOutputs(outputResults=outputHitsSet)
+
+    def _summary(self):
+        if not hasattr(self, 'outputResults'):
+            summ = ['Output results are not ready yet']
+        else:
+            summ = ['blastPDB retrieved *{0}* hits with the input criteria'.format(
+                len(self.outputResults))]
+        return summ
diff --git a/prody2/viewers/__init__.py b/prody2/viewers/__init__.py
@@ -14,9 +14,9 @@
 from pwem.viewers import DataViewer, showj
 from pwem.viewers.viewers_data import RegistryViewerConfig
 from prody2.objects import (SetOfTrajFrames, SetOfAtoms,
-                            SetOfClassesTraj)
+                            SetOfClassesTraj, SetOfBlastHits)
 DataViewer._targets.extend([SetOfTrajFrames, SetOfAtoms,
-                            SetOfClassesTraj])
+                            SetOfClassesTraj, SetOfBlastHits])
 
 RegistryViewerConfig.registerConfig(SetOfTrajFrames,
                                     {showj.ORDER: 'id enabled label _filename ',