-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstreamlit.py
96 lines (75 loc) · 3.09 KB
/
streamlit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import streamlit as st
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from utils.tokenizer import load_tokenizer
from utils.model import DecoderOnlyTransformer
from utils.generate import generate_molecule_streaming
from utils.properties import calculate_properties, draw_molecule
# Page-level Streamlit configuration (title and icon).
st.set_page_config(
    page_title="MolDecod App",
    page_icon="🧪",
)

# Pick the compute device: CUDA when torch reports it as available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Load tokenizer
@st.cache_resource
def load_sp_model():
    """Return the tokenizer loaded from ``models/moldecod_tokenizer.model``.

    The path is relative to the process working directory. The function is
    wrapped in ``st.cache_resource`` so Streamlit can reuse the returned
    object instead of reloading it on every script rerun.
    """
    return load_tokenizer('models/moldecod_tokenizer.model')


sp = load_sp_model()
# Define the model architecture and load MolDecod
@st.cache_resource
def load_model():
    """Build the MolDecod transformer and restore it from the saved checkpoint.

    Reads the module-level ``sp`` (for the vocabulary size) and ``device``.
    The architecture hyperparameters are hard-coded here; they are expected
    to match the ones used to produce ``models/moldecod_transformer.pth``,
    since the checkpoint's ``model_state_dict`` is loaded into this model.

    Returns:
        tuple: ``(model, optimizer)`` where ``model`` lives on ``device`` and
        is in evaluation mode, and ``optimizer`` is an ``AdamW`` instance
        restored from the checkpoint's ``optimizer_state_dict``.
    """
    vocab_size = sp.get_piece_size()
    d_model = 256
    nhead = 4
    num_encoder_layers = 4
    dropout = 0.25

    model = DecoderOnlyTransformer(
        vocab_size,
        d_model,
        nhead,
        num_layers=num_encoder_layers,
        dropout=dropout,
    )
    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

    checkpoint_path = 'models/moldecod_transformer.pth'
    # Always remap storages onto the selected device. Without map_location a
    # checkpoint saved on a GPU index that is missing on this host fails to
    # load even though CUDA is available; this also covers the CPU-only case.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # The app only runs inference, so disable dropout. Calling eval() again
    # later (e.g. inside the generation helper) is harmless.
    model.eval()
    return model, optimizer


model, optimizer = load_model()
# Streamlit app
st.title('MolDecod - Molecule Generation')
st.markdown('Generate molecules with a decoder-only transformer model')
st.markdown('###')

# User inputs: the text to start generation from, plus the sampling controls.
start_tokens = st.text_input('Start tokens (e.g., C, CCN)', value='C')
col1, col2 = st.columns(2)
with col1:
    max_length = st.slider('Max length', min_value=10, max_value=1000, value=150)
with col2:
    temperature = st.slider('Temperature', min_value=0.01, max_value=1.0, value=0.5, step=0.1)

if st.button('Generate'):
    token_ids = sp.encode(start_tokens)
    if len(token_ids) == 0:
        # An empty encoding (e.g. blank input) would produce an empty tensor
        # with nothing to condition generation on, so ask for input instead
        # of passing it to the generator.
        st.warning('Please enter at least one start token.')
    else:
        # Token ids are integer indices; make the dtype explicit.
        start_seq = torch.tensor(token_ids, dtype=torch.long, device=device)
        st.markdown('###')

        # Placeholder that is handed to the generation helper as its
        # output_placeholder argument.
        st.markdown('#### Generated molecule')
        output_placeholder = st.empty()
        with st.spinner('Generating...'):
            generated_molecule = generate_molecule_streaming(
                model,
                start_seq,
                sp,
                device,
                max_length=max_length,
                temperature=temperature,
                streamlit_context=True,
                output_placeholder=output_placeholder,
            )
        st.markdown('')

        # Draw molecule (skipped when draw_molecule returns a falsy value)
        img = draw_molecule(generated_molecule)
        if img:
            st.image(img, caption='Generated Molecule Structure')

        # Calculate properties; a falsy result is reported as an invalid molecule
        properties = calculate_properties(generated_molecule)
        if properties:
            st.markdown('#### Molecular Properties')
            property_df = pd.DataFrame(
                list(properties.items()),
                columns=['Property', 'Value'],
            )
            st.table(property_df)
        else:
            st.write("Invalid molecule")

# Add information about the other page
st.sidebar.info("Go to 'Visualize Tokenization' to see how SMILES strings are tokenized.")