generated from dataprofessor/qsar
-
Notifications
You must be signed in to change notification settings - Fork 2
/
streamlit_app.py
142 lines (112 loc) · 5.41 KB
/
streamlit_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import streamlit as st
import os
import pickle
import pandas as pd
from PIL import Image
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from padelpy import padeldescriptor
# Page configuration: browser-tab title and icon, sidebar expanded on load.
st.set_page_config(
    initial_sidebar_state='expanded',
    page_icon='💊',
    page_title='PARP1pred',
)

# Session state: seed the SMILES slot once so later reads always find the key.
if 'smiles_input' not in st.session_state:
    st.session_state['smiles_input'] = ''

# Utilities: delete the SMILES file if an earlier script run left one behind.
if os.path.isfile('molecule.smi'):
    os.remove('molecule.smi')
# The App: page heading and a one-line description of what the tool does.
st.title('💊 PARP1pred app')
st.info(
    'PARP1pred allow users to predict whether a query molecule is '
    'active/inactive towards the PARP1 target protein.'
)

# One container per tab, in display order; the blocks below fill each one.
tab1, tab2, tab3, tab4, tab5, tab6, tab7 = st.tabs([
    'Main',
    'About',
    'What is PARP1?',
    'Dataset',
    'Model performance',
    'Python libraries',
    'Citing us',
])
with tab1:
    # Main tab: collect a SMILES string, draw the molecule, compute PubChem
    # fingerprints with PaDEL and classify the molecule with the saved model.
    # NOTE(review): the nesting below (form shown while no SMILES is stored,
    # descriptor/prediction pipeline at tab level) is the reviewer's reading of
    # the intended flow — confirm against the deployed layout.
    if st.session_state.smiles_input == '':
        with st.form('my_form'):
            st.subheader('Predict PARP1 inhibitory activity')

            smiles_txt = st.text_input('Enter SMILES notation', st.session_state.smiles_input)
            st.session_state.smiles_input = smiles_txt

            with st.expander('Example SMILES'):
                st.code('O=C(c1cc(Cc2n[nH]c(=O)c3ccccc23)ccc1F)N1CCN(C(=O)C2CC2)CC1')

            submit_button = st.form_submit_button('Submit')

            if submit_button:
                st.subheader('⚛️ Input molecule:')
                with st.expander('Show SMILES', expanded=True):
                    st.text(st.session_state.smiles_input)

                with st.expander('Show chemical structures', expanded=True):
                    smi = Chem.MolFromSmiles(st.session_state.smiles_input)
                    if smi is None:
                        # RDKit signals an unparsable SMILES by returning None;
                        # drawing it would raise. Report the problem and clear
                        # the stored value so the pipeline below is skipped and
                        # the form is offered again on the next run.
                        st.error('Invalid SMILES notation. Please check the input and try again.')
                        st.session_state.smiles_input = ''
                    else:
                        Draw.MolToFile(smi, 'molecule.png', width=900)
                        with Image.open('molecule.png') as mol_image:
                            st.image(mol_image)

    # Everything below needs a query molecule; a single guard replaces the
    # three identical checks that used to wrap each stage separately.
    if st.session_state.smiles_input != '':
        # Input SMILES saved to file (tab-separated "<smiles>\t<name>" record
        # that PaDEL reads); the context manager closes it even on error.
        with open('molecule.smi', 'w') as smiles_file:
            smiles_file.write(f'{st.session_state.smiles_input}\tmol_001')

        # Compute PADEL descriptors
        st.subheader('🔢 Descriptors')
        padeldescriptor(mol_dir='molecule.smi',
                        d_file='descriptors.csv',
                        descriptortypes='data/PubchemFingerprinter.xml',
                        detectaromaticity=True,
                        standardizenitro=True,
                        standardizetautomers=True,
                        threads=2,
                        removesalt=True,
                        log=True,
                        fingerprints=True)

        descriptors = pd.read_csv('descriptors.csv')
        # 'Name' holds the molecule label, not a feature.
        descriptors.drop('Name', axis=1, inplace=True)

        with st.expander('Show full set of descriptors as calculated for query molecule'):
            st.write(descriptors)
            st.write(descriptors.shape)

        # Load descriptor subset used in trained model.
        # SECURITY: pickle.load executes code embedded in the file. This is
        # acceptable only because the path is a fixed file shipped with the
        # app; never point it at user-supplied data.
        with open('data/oversampling_PubChem_RandomForestClassifier.pkl', 'rb') as model_file:
            model = pickle.load(model_file)
        pubchem_subset = model.feature_names_in_

        # Drop every computed column the model was not fitted on.
        query_desc_1 = descriptors.columns.difference(pubchem_subset)
        query_desc_2 = descriptors.drop(query_desc_1, axis=1)

        with st.expander('Show subset of descriptors as used in trained model'):
            st.write(query_desc_2)
            st.write(query_desc_2.shape)

        # Apply the classification model to the query molecule.
        st.subheader('🤖 Predictions')
        # Read the first predicted label explicitly: converting a whole
        # 1-D array to int is deprecated in recent NumPy releases.
        # (presumably exactly one row, for the single query molecule — confirm)
        pred = int(model.predict(query_desc_2)[0])
        if pred == 0:
            st.error('Inactive')
        elif pred == 1:
            st.success('Active')
# Static, informational tabs: no user input is read in any of them.

with tab2:
    # "About" tab: just the cover image.
    st.image(Image.open('PARP1pred.jpg'))

with tab3:
    st.header('What is PARP1?')
    st.write(
        'Poly (ADP-ribose) polymerase-1 (PARP-1) is an enzyme that catalyzes '
        'the ADP-ribosylation of a specific protein and plays a vital role in '
        'DNA repair. It has become an attractive target as inhibition of '
        'PARP-1 causes a toxic accumulation of DNA double strand breaks in '
        'cancer cells, particularly those with BRCA1/2 deficiency, which are '
        'found in breast, ovarian, prostate, and pancreatic cancers.'
    )

with tab4:
    st.header('Dataset')
    # Leading/trailing newlines are part of the rendered text; keep them.
    st.write(
        '\n'
        'In our work, we retrieved a human PARP-1 biological dataset from '
        'the ChEMBL database. The data was curated and resulted in a '
        'non-redundant set of 2,018 PARP-1 inhibitors, which can be divided '
        'into:\n'
        '- 1,720 active compounds\n'
        '- 298 inactive compounds\n'
    )

with tab5:
    st.header('Model performance')
    st.write(
        'We selected PubChem as a molecular fingerprint and used a random '
        'forest with an oversampling approach to construct the best model. '
        'The Matthews correlation coefficients for training, '
        'cross-validation, and test sets were 1.00, 0.96, and 0.74, '
        'respectively.'
    )

with tab6:
    st.header('Python libraries')
    # Markdown bullet list of the libraries named by the app.
    st.markdown(
        '\n'
        'This app is based on the following Python libraries:\n'
        '- `streamlit`\n'
        '- `pandas`\n'
        '- `rdkit`\n'
        '- `padelpy`\n'
    )

with tab7:
    # "Citing us" tab: the reference, rendered as markdown.
    st.markdown(
        'T. Lerksuthirat, S. Chitphuk, W. Stitchantrakul, D. Dejsuphong, '
        'A.A. Malik, C. Nantasenamat, PARP1PRED: A web server for screening '
        'the bioactivity of inhibitors against DNA repair enzyme PARP-1, '
        '***EXCLI Journal*** (2023) '
        'DOI: https://doi.org/10.17179/excli2022-5602.'
    )