Skip to content

Commit

Permalink
Merge pull request #5 from hgb-bin-proteomics/develop
Browse files Browse the repository at this point in the history
streamlit gui
  • Loading branch information
michabirklbauer authored Dec 7, 2023
2 parents 8182826 + 641a022 commit 51613f1
Show file tree
Hide file tree
Showing 6 changed files with 382 additions and 46 deletions.
21 changes: 21 additions & 0 deletions .github/workflows/docker-image.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
name: Docker Image CI

on:
workflow_dispatch:

jobs:

build:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- name: Build the Docker image
run: |
echo ${{ secrets.DOCKER_PASSWORD }} | docker login --username michabirklbauer --password-stdin
docker build . --file Dockerfile --tag michabirklbauer/spectrallibraryexporter:latest
docker push michabirklbauer/spectrallibraryexporter:latest
GIT_SHA="$(git rev-parse --short HEAD)"
docker tag michabirklbauer/spectrallibraryexporter:latest michabirklbauer/spectrallibraryexporter:$GIT_SHA
docker push michabirklbauer/spectrallibraryexporter:$GIT_SHA
2 changes: 1 addition & 1 deletion .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.7', '3.8', '3.9', '3.10']
python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']

steps:
- uses: actions/checkout@v3
Expand Down
19 changes: 19 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Dockerfile with MS Annika Spectral Library exporter
# author: Micha Birklbauer
# version: 1.0.0

FROM python:3.12.0

LABEL maintainer="micha.birklbauer@gmail.com"

RUN mkdir app
COPY requirements.txt app
COPY config.py app
COPY create_spectral_library.py app
COPY gui/streamlit_util.py app
COPY gui/streamlit_app.py app
WORKDIR app
RUN pip install -r requirements.txt
RUN pip install streamlit

CMD ["streamlit", "run", "streamlit_app.py", "--server.maxUploadSize", "5000"]
154 changes: 109 additions & 45 deletions create_spectral_library.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
# micha.birklbauer@gmail.com

# version tracking
__version = "1.1.0"
__date = "2023-11-28"
__version = "1.1.1"
__date = "2023-12-06"

# REQUIREMENTS
# pip install pandas
Expand All @@ -28,10 +28,14 @@
from typing import List
from typing import Tuple
from typing import Set
from typing import Union
from typing import BinaryIO
import warnings

# reading spectra
def read_spectra(filename: str) -> Dict[int, Dict]:
# def read_spectra(filename: str | BinaryIO) -> Dict[int, Dict]:
# for backward compatibility >>
def read_spectra(filename: Union[str, BinaryIO]) -> Dict[int, Dict]:
"""
Returns a dictionary that maps scan numbers to spectra:
Dict[int -> Dict["precursor" -> float
Expand Down Expand Up @@ -76,8 +80,28 @@ def read_multiple_spectra(filenames: List[str]) -> Dict[str, Dict[int, Dict]]:

return result_dict

# read multiple spectra files - streamlit version
def read_multiple_spectra_streamlit(st_files) -> Dict[str, Dict[int, Dict]]:
"""
Returns a dictionary that maps filenames to scan numbers to spectra:
Dict[str -> Dict[int -> Dict["precursor" -> float
"charge" -> int
"max_intensity" -> float
"peaks" -> Dict[m/z -> intensity]]
"""

result_dict = dict()

for st_file in st_files:
current_spectra_file = ".".join(st_file.name.split(".")[:-1]).strip()
result_dict[current_spectra_file] = read_spectra(st_file)

return result_dict

# generate a position to modification mass mapping
def generate_modifications_dict(peptide: str, modification_str: str) -> Dict[int, List[float]]:
def generate_modifications_dict(peptide: str,
modification_str: str,
possible_modifications: Dict[str, List[float]] = MODIFICATIONS) -> Dict[int, List[float]]:
"""
Returns a mapping of peptide positions (0 based) to possible modification masses.
modification_str is the modification string as returned by MS Annika e.g. K5(DSSO);M7(Oxidation)
Expand All @@ -101,16 +125,19 @@ def generate_modifications_dict(peptide: str, modification_str: str) -> Dict[int
pos = len(peptide)
else:
pos = int(aa_and_pos[1:]) - 1
if mod in MODIFICATIONS:
modifications_dict[pos] = MODIFICATIONS[mod]
if mod in possible_modifications:
modifications_dict[pos] = possible_modifications[mod]
else:
warnings.warn("Modification '" + mod + "' not found!")

return modifications_dict

# generate all theoretical fragments
# adapted from https://pyteomics.readthedocs.io/en/latest/examples/example_msms.html
def generate_theoretical_fragments(peptide: str, modifications: Dict[int, List[float]], ion_types: Tuple[str] = ("b", "y"), max_charge: int = 1) -> Dict[float, str]:
def generate_theoretical_fragments(peptide: str,
modifications: Dict[int, List[float]],
ion_types: Tuple[str] = ("b", "y"),
max_charge: int = 1) -> Dict[float, str]:
"""
Generates a set of theoretical fragment ion masses of the specified peptide with the modifications.
"""
Expand Down Expand Up @@ -164,13 +191,20 @@ def generate_theoretical_fragments(peptide: str, modifications: Dict[int, List[f
return fragments

# get all fragments and their annotations
def get_fragments(row: pd.Series, alpha: bool, spectra: Dict[str, Dict[int, Dict]]) -> List[Dict]:
def get_fragments(row: pd.Series,
alpha: bool,
spectra: Dict[str, Dict[int, Dict]],
crosslinker: str = CROSSLINKER,
possible_modifications: Dict[str, List[float]] = MODIFICATIONS,
ion_types: Tuple[str] = ION_TYPES,
max_charge: int = MAX_CHARGE,
match_tolerance: float = MATCH_TOLERANCE) -> List[Dict]:
"""
Generates all fragments with the necessary spectral library annotations for a given CSM peptide.
"""

# function to check if the fragment contains the crosslinker
def check_if_xl_in_frag(row, alpha, ion_type, fragment):
def check_if_xl_in_frag(row, alpha, ion_type, fragment, crosslinker):

if alpha:
peptide = row["Sequence A"]
Expand All @@ -185,7 +219,7 @@ def check_if_xl_in_frag(row, alpha, ion_type, fragment):
aa_and_pos = mod_in_list.strip().split("(")[0]
mod = mod_in_list.strip().split("(")[1].rstrip(")")

if mod == CROSSLINKER:
if mod == crosslinker:
if aa_and_pos == "Nterm":
pos = -1
elif aa_and_pos == "Cterm":
Expand Down Expand Up @@ -218,15 +252,15 @@ def check_if_xl_in_frag(row, alpha, ion_type, fragment):
current_spectra_file = ".".join(row["Spectrum File"].split(".")[:-1]).strip()
spectrum = spectra[current_spectra_file][scan_nr]

modifications_processed = generate_modifications_dict(sequence, modifications)
theoretical_fragments = generate_theoretical_fragments(sequence, modifications_processed, ion_types = ION_TYPES, max_charge = MAX_CHARGE)
modifications_processed = generate_modifications_dict(sequence, modifications, possible_modifications)
theoretical_fragments = generate_theoretical_fragments(sequence, modifications_processed, ion_types, max_charge)

matched_fragments = dict()

# match fragments
for peak_mz in spectrum["peaks"].keys():
for fragment in theoretical_fragments.keys():
if round(peak_mz, 4) < round(fragment + MATCH_TOLERANCE, 4) and round(peak_mz, 4) > round(fragment - MATCH_TOLERANCE, 4):
if round(peak_mz, 4) < round(fragment + match_tolerance, 4) and round(peak_mz, 4) > round(fragment - match_tolerance, 4):
matched_fragments[peak_mz] = theoretical_fragments[fragment]
break

Expand All @@ -239,7 +273,7 @@ def check_if_xl_in_frag(row, alpha, ion_type, fragment):
fragment_mz = match
fragment_rel_intensity = float(spectrum["peaks"][match] / spectrum["max_intensity"])
fragment_loss_type = ""
fragment_contains_xl = check_if_xl_in_frag(row, alpha, fragment_type, matched_fragments[match].split(":")[1].strip())
fragment_contains_xl = check_if_xl_in_frag(row, alpha, fragment_type, matched_fragments[match].split(":")[1].strip(), crosslinker)
fragment_lossy = False
fragments.append({"FragmentCharge": fragment_charge,
"FragmentType": fragment_type,
Expand Down Expand Up @@ -325,20 +359,21 @@ def get_PrecursorMz(row: pd.Series) -> float:
return float(row["m/z [Da]"])

# get the ModifiedPeptide value
def get_ModifiedPeptide(row: pd.Series) -> str:
def get_ModifiedPeptide(row: pd.Series,
crosslinker: str = CROSSLINKER) -> str:
"""
Returns SequenceA with modification annotations (without crosslinker) _ SequenceB with modification annotations (without crosslinker).
"""

# helper function to parse MS Annika modification string
def parse_mod_str(mod_str):
def parse_mod_str(mod_str, crosslinker):
modifications_dict = dict()
modifications = mod_str.split(";")
for modification in modifications:
aa_and_pos = modification.strip().split("(")[0]
mod = modification.strip().split("(")[1].rstrip(")")

if mod == CROSSLINKER:
if mod == crosslinker:
continue

if aa_and_pos == "Nterm":
Expand All @@ -361,8 +396,8 @@ def str_insert(string, index, character):
return string[:index] + character + string[:index]
# end function

mods_A = parse_mod_str(str(row["Modifications A"]))
mods_B = parse_mod_str(str(row["Modifications B"]))
mods_A = parse_mod_str(str(row["Modifications A"]), crosslinker)
mods_B = parse_mod_str(str(row["Modifications B"]), crosslinker)

# generate annotation for sequence A
shift = 0
Expand Down Expand Up @@ -406,12 +441,12 @@ def get_scanID(row: pd.Series) -> int:
return int(row["First Scan"])

# get the run value
def get_run() -> str:
def get_run(run_name: str = RUN_NAME) -> str:
"""
Returns the run name specified in config.py
"""

return RUN_NAME
return run_name

# get the searchID value
def get_searchID(row: pd.Series) -> str:
Expand All @@ -432,20 +467,23 @@ def get_crosslinkedResidues(row: pd.Series) -> str:
return str(positions["A"]) + "_" + str(positions["B"])

# get the LabeledSequence value
def get_LabeledSequence(row: pd.Series) -> str:
def get_LabeledSequence(row: pd.Series,
crosslinker: str = CROSSLINKER) -> str:
"""
Returns SequenceA with modification annotations (without crosslinker) _ SequenceB with modification annotations (without crosslinker).
"""

return get_ModifiedPeptide(row)
return get_ModifiedPeptide(row, crosslinker)

# get the iRT value
def get_iRT(row: pd.Series) -> float:
def get_iRT(row: pd.Series,
iRT_t: float = iRT_PARAMS["iRT_t"],
iRT_m: float = iRT_PARAMS["iRT_m"]) -> float:
"""
Returns the calculated iRT using the values specified in config.py.
"""

return (float(row["RT [min]"]) - iRT_PARAMS["iRT_t"]) / iRT_PARAMS["iRT_m"]
return (float(row["RT [min]"]) - iRT_t) / iRT_m

# get the RT value
def get_RT(row: pd.Series) -> float:
Expand All @@ -470,37 +508,62 @@ def get_IonMobility() -> float:
return 0.0

# get the values for all fragments of a CSM
def get_fragment_values(csm: pd.Series, spectra: Dict) -> Dict[str, List]:
def get_fragment_values(csm: pd.Series,
spectra: Dict,
crosslinker: str = CROSSLINKER,
possible_modifications: Dict[str, List[float]] = MODIFICATIONS,
ion_types: Tuple[str] = ION_TYPES,
max_charge: int = MAX_CHARGE,
match_tolerance: float = MATCH_TOLERANCE) -> Dict[str, List]:
"""
Returns the annotated fragments of both cross-linked peptides.
"""

fragments_A = get_fragments(csm, True, spectra)
fragments_B = get_fragments(csm, False, spectra)
fragments_A = get_fragments(csm, True, spectra, crosslinker, possible_modifications, ion_types, max_charge, match_tolerance)
fragments_B = get_fragments(csm, False, spectra, crosslinker, possible_modifications, ion_types, max_charge, match_tolerance)

return {"Fragments_A": fragments_A, "Fragments_B": fragments_B}

##### MAIN FUNCTION #####

# generates the spectral library
def main() -> pd.DataFrame:

print("INFO: Creating spectral library with input files:\nSpectra: " + "\n".join(SPECTRA_FILE) + "\nCSMs: " + CSMS_FILE)
# def main(spectra_file: List[str] | List[BinaryIO] = SPECTRA_FILE,
# csms_file: str | BinaryIO = CSMS_FILE,
# for backward compatibility >>
def main(spectra_file: Union[List[str], List[BinaryIO]] = SPECTRA_FILE,
csms_file: Union[str, BinaryIO] = CSMS_FILE,
run_name: str = RUN_NAME,
crosslinker: str = CROSSLINKER,
modifications: Dict[str, List[float]] = MODIFICATIONS,
ion_types: Tuple[str] = ION_TYPES,
max_charge: int = MAX_CHARGE,
match_tolerance: float = MATCH_TOLERANCE,
iRT_m: float = iRT_PARAMS["iRT_m"],
iRT_t: float = iRT_PARAMS["iRT_t"],
is_streamlit: bool = False,
save_output: bool = True) -> pd.DataFrame:

if is_streamlit:
print("INFO: Creating spectral library with input files:\nSpectra: " +
"\n".join([spectrum_file.name for spectrum_file in spectra_file]) +
"\nCSMs: " + str(csms_file.name))
else:
print("INFO: Creating spectral library with input files:\nSpectra: " + "\n".join(spectra_file) + "\nCSMs: " + csms_file)
print("INFO: Using the following modifications:")
print(MODIFICATIONS)
print(modifications)
print("INFO: Using the following ion types:")
print(ION_TYPES)
print(ion_types)
print("INFO: Using the following charge states:")
print([i for i in range(1, MAX_CHARGE + 1)])
print("INFO: Using a match tolerance of: " + str(MATCH_TOLERANCE) + " Da")
print([i for i in range(1, max_charge + 1)])
print("INFO: Using a match tolerance of: " + str(match_tolerance) + " Da")
print("INFO: Starting annotation process...")

print("INFO: Reading spectra...")
spectra = read_multiple_spectra(SPECTRA_FILE)
spectra = read_multiple_spectra(spectra_file) if not is_streamlit else read_multiple_spectra_streamlit(spectra_file)
print("INFO: Done reading spectra!")

print("INFO: Reading CSMs...")
csms = pd.read_excel(CSMS_FILE)
csms = pd.read_excel(csms_file)
print("INFO: Done reading CSMs! Starting spectral library creation...")

# columns
Expand Down Expand Up @@ -540,19 +603,19 @@ def main() -> pd.DataFrame:
FragmentGroupId = get_FragmentGroupId(row)
PrecursorCharge = get_PrecursorCharge(row)
PrecursorMz = get_PrecursorMz(row)
ModifiedPeptide = get_ModifiedPeptide(row)
ModifiedPeptide = get_ModifiedPeptide(row, crosslinker)
IsotopeLabel = get_IsotopeLabel()
cfile = get_filename(row)
scanID = get_scanID(row)
run = get_run()
run = get_run(run_name)
searchID = get_searchID(row)
crosslinkedResidues = get_crosslinkedResidues(row)
LabeledSequence = get_LabeledSequence(row)
iRT = get_iRT(row)
iRT = get_iRT(row, iRT_t, iRT_m)
RT = get_RT(row)
CCS = get_CCS()
IonMobility = get_IonMobility()
fragments = get_fragment_values(row, spectra)
fragments = get_fragment_values(row, spectra, crosslinker, modifications, ion_types, max_charge, match_tolerance)

for k in fragments.keys():
pep = fragments[k]
Expand Down Expand Up @@ -619,11 +682,12 @@ def main() -> pd.DataFrame:

spectral_library = pd.DataFrame(df_dict)

# save spectral library
spectral_library.to_csv(".".join(CSMS_FILE.split(".")[:-1]) + "_spectralLibrary.csv", index = True)
if save_output:
# save spectral library
spectral_library.to_csv(".".join(csms_file.split(".")[:-1]) + "_spectralLibrary.csv", index = True)

print("SUCCESS: Spectral library created with filename:")
print(".".join(CSMS_FILE.split(".")[:-1]) + "_spectralLibrary.csv")
print("SUCCESS: Spectral library created with filename:")
print(".".join(csms_file.split(".")[:-1]) + "_spectralLibrary.csv")

return spectral_library

Expand Down
Loading

0 comments on commit 51613f1

Please sign in to comment.