-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathZincProteome.lyx
5755 lines (3859 loc) · 88.9 KB
/
ZincProteome.lyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#LyX 2.3 created this file. For more info see http://www.lyx.org/
\lyxformat 544
\begin_document
\begin_header
\save_transient_properties true
\origin unavailable
\textclass classicthesis
\use_default_options true
\maintain_unincluded_children false
\language american
\language_package default
\inputencoding utf8
\fontencoding global
\font_roman "default" "default"
\font_sans "default" "default"
\font_typewriter "default" "default"
\font_math "auto" "auto"
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100 100
\font_tt_scale 100 100
\use_microtype false
\use_dash_ligatures true
\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\paperfontsize default
\spacing single
\use_hyperref false
\papersize default
\use_geometry false
\use_package amsmath 1
\use_package amssymb 1
\use_package cancel 1
\use_package esint 0
\use_package mathdots 1
\use_package mathtools 1
\use_package mhchem 1
\use_package stackrel 1
\use_package stmaryrd 1
\use_package undertilde 1
\cite_engine biblatex
\cite_engine_type authoryear
\biblio_style plainnat
\biblatex_bibstyle numeric-comp
\biblatex_citestyle numeric-comp
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\justification true
\use_refstyle 0
\use_minted 0
\index Index
\shortcut idx
\color #008000
\end_index
\secnumdepth 2
\tocdepth 2
\paragraph_separation indent
\paragraph_indentation default
\is_math_indent 0
\math_numbering_side default
\quotes_style english
\dynamic_quotes 0
\papercolumns 1
\papersides 1
\paperpagestyle default
\tracking_changes false
\output_changes false
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header
\begin_body
\begin_layout Chapter
Investigation of intermolecular metal-binding sites
\end_layout
\begin_layout Standard
Zn(II) ions were found to be essential for the growth of the toxic mold
–
\emph on
Aspergillus niger
\emph default
in 1869; nearly one hundred years later, in 1961, it was postulated that
zinc is also essential for humans.
\begin_inset Note Note
status collapsed
\begin_layout Plain Layout
(1) Raulin J.
Etudes cliniques sur la vegetation.
Ann.
Sci.
Nat.
Bot.
Biol.
Veg.
Ser.
1869, 11, 293– 299.
(2) Prasad, A.
S.
et al.
Syndrome of iron deficiency anemia, hepatosplenomegaly, hypogonadism, dwarfism
and geophagia.
Am.
J.
Med.
1961, 31, 532–546.
\end_layout
\end_inset
Almost 20 years later the first structures of proteins containing Zn(II)
started to appear.
It will not be an exaggeration if I say that the number of scientific questions
is proportional to the amount of new data—with the advent of the first
Zn(II)-containing macromolecular structures, questions about the factors
that govern the formation of the zinc-binding sites started to appear.
With more Zn(II)-containing structures deposited in the
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{rcsb}
\end_layout
\end_inset
some tendencies became clear (see
\begin_inset Flex CT - auto cross-reference
status open
\begin_layout Plain Layout
\begin_inset CommandInset ref
LatexCommand labelonly
reference "sec:Zinc-Binding-Sites"
plural "false"
caps "false"
noprefix "false"
\end_inset
\end_layout
\end_inset
); however, the fact that the Zn(II) ions (and other metal ions) can be
bound at the
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{interface}
\end_layout
\end_inset
of two or more macromolecules was rather ignored – one of the first reviews
of structural knowledge of metal-binding sites in proteins appeared in
\begin_inset CommandInset citation
LatexCommand citebyear
key "Tainer1992"
literal "true"
\end_inset
,
\begin_inset CommandInset citation
LatexCommand citep
key "Tainer1992"
literal "true"
\end_inset
whereas the first review regarding the binding of metals at
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
glspl{ppi}
\end_layout
\end_inset
appeared in
\begin_inset CommandInset citation
LatexCommand citebyear
key "Song2014"
literal "true"
\end_inset
.
\begin_inset CommandInset citation
LatexCommand citep
key "Song2014"
literal "true"
\end_inset
This Chapter will focus on the methodology of how one can investigate the
metal-binding sites at
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
glsdescplural{ppi}
\end_layout
\end_inset
.
\begin_inset Note Note
status open
\begin_layout Plain Layout
podać jakieś charakterystyki miejsc cynkowych, powiedzieć dlaczego szukanie
miejsc białkowych na poddstawie tych charakterystyk jest głupie
\end_layout
\end_inset
\end_layout
\begin_layout Section
\emph on
In silico
\emph default
investigation of intermolecular metal-binding sites
\begin_inset Note Note
status open
\begin_layout Plain Layout
gdzieś tu powinno pojawić się
\begin_inset space \space{}
\end_inset
informacje o tym, że możliwe, że coś się zmieniło wraz z typem annotacji
zrobionej przez włochów, ale nwm.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
The problems of
\shape italic
in silico
\shape default
investigation of intermolecular metal-binding sites
\end_layout
\begin_layout Standard
To date, there are no computational algorithms or other methods that are
able to identify or classify intermolecular metal-binding sites based on
protein sequence or structure.
The problem of metal-binding site prediction has been tackled several times
in the past.
Apart from the availability of such tools, which is often unacceptable
\begin_inset CommandInset citation
LatexCommand citep
key "Ye2022"
literal "true"
\end_inset
, the quality of the proposed classifiers is often poor.
\begin_inset Marginal
status open
\begin_layout Plain Layout
Classifier is a tool that assigns an element to a certain class.
An algorithm that tells whether a protein is a
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{mprotein}
\end_layout
\end_inset
based on the protein attributes is a classifier.
\end_layout
\end_inset
Usually, the authors of such algorithms present the quality and efficiency
of their classifier in a reliable way, e.g., by showing various kinds of
statistics such as the receiver operating characteristic curve in the case
of binary classifiers.
At first glance, the presented tools seem to work very well; however, to
date, the use of such tools is limited.
For sure one cannot accuse the authors of such articles of scientific misconduc
t or ill-faith – usually, the problem lies deeper.
There are two main problems with the classifiers: the first one is the problem
with the initial data set, which eventually is a source for creating the
training and the test data sets, and the second problem is the choice of
explanatory variables.
\end_layout
\begin_layout Standard
The availability of well-annotated (due to manual curation) protein sequence
data is quite good.
The primary source of such information is scientific articles.
A publication that addresses whether a protein binds a metal or not typically
employs a number of methods that show and describe this binding.
Such a publication does not need to include information about the structure
of the protein, as this information is not necessary to prove the interaction
of the metal with the protein.
Typically, this type of data can be aggregated in
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{UniProt}
\end_layout
\end_inset
, where metal-interacting residues are often assigned based on similarity
and sequence.
The problem with this type of data is the sparsity of annotations – not
every known
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{mprotein}
\end_layout
\end_inset
is annotated in
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{UniProt}
\end_layout
\end_inset
to interact with metal.
This leads to the attempts to create one's own data set, which in turn
involves the use of the other type of input – the structural data deposited
in the
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{rcsb}
\end_layout
\end_inset
, however, the metal-binding sites in the
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{rcsb}
\end_layout
\end_inset
are not annotated as true physiological or adventitious metal-binding
sites.
This, in turn, leads to the use of simple algorithms that sort
\begin_inset Quotes eld
\end_inset
true
\begin_inset Quotes erd
\end_inset
from
\begin_inset Quotes eld
\end_inset
false
\begin_inset Quotes erd
\end_inset
metal-binding sites in the data set based on simple rules, i.e., the distance
between the metal and
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
glspl{Lg}
\end_layout
\end_inset
, the number of
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
glspl{Lg}
\end_layout
\end_inset
, type of
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
glspl{Lg}
\end_layout
\end_inset
, etc., however, this approach is far from being accurate.
\end_layout
\begin_layout Standard
The problem with the selection of explanatory variables is most acute with
classifiers that are based on the sequence.
Whereas structure-based classifiers may involve an abundance of meaningful
variables for the classifier (e.g., hydrophobicity, the charge of the amino
acids, geometry, position in space, etc.), the effort of predicting a
metal-binding site based solely on the sequence might be similar to
predicting sales on a stock exchange using only the names of the
stocks.
Nevertheless, in some specific cases, where there is an existing pattern
in the sequence it is possible to build sensitive and specific classifiers,
e.g., the prediction of zinc fingers based on sequence is possible due to
the existence of well-defined input data and the existence of patterns
in the sequences
\begin_inset CommandInset citation
LatexCommand citep
key "Sathyaseelan2023"
literal "true"
\end_inset
.
\end_layout
\begin_layout Standard
The above problems with the prediction of metal-binding sites also apply
to intermolecular metal-binding sites; moreover, the involvement of
two or more interacting macromolecules increases the complexity of the
problem.
So how to search for metals bound at
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
glspl{ppi}
\end_layout
\end_inset
if there are no viable
\begin_inset Note Note
status open
\begin_layout Plain Layout
nwm czy tu dobrze viiable
\end_layout
\end_inset
\emph on
in silico
\emph default
tools? The question is addressed below in
\begin_inset Flex CT - auto cross-reference
status open
\begin_layout Plain Layout
\begin_inset CommandInset ref
LatexCommand labelonly
reference "subsec:Obtaining-knowledge-about-InterMBS"
plural "false"
caps "false"
noprefix "false"
\end_inset
\end_layout
\end_inset
.
\end_layout
\begin_layout Subsection
Obtaining knowledge about intermolecular metal-binding sites
\begin_inset CommandInset label
LatexCommand label
name "subsec:Obtaining-knowledge-about-InterMBS"
\end_inset
\end_layout
\begin_layout Standard
As in the case of information about the interaction of the metal with the
protein, publications may also be the primary source of information
about metal-binding at
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{ppi}
\end_layout
\end_inset
.
However, the lack of recognition of inter-protein metal ion binding has
partly contributed to the fact that even if a publication is accompanied
by the deposition of a structure in the
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{rcsb}
\end_layout
\end_inset
, the fact of inter-protein metal ion binding is not necessarily commented
on in the publication in any way.
An additional problem with extracting knowledge directly from publications
is the fact that the search for this type of information can be time-consuming.
Moreover, problems with the availability of articles (not every article
is published as an open access article) and the lack of a
tool that aggregates articles based on whether the
protein described in the article has an inter-protein metal-binding
site make this approach impractical.
Nevertheless, it is not out of the question that this type of information
aggregation will change (not necessarily regarding the
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{mppi}
\end_layout
\end_inset
) with the increasing number of open access articles published each year
and the development of technologies related to natural language processing.
\end_layout
\begin_layout Standard
To date, the best sources of information on proteins that bind metal ions
in an intermolecular manner are structural databases like
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{rcsb}
\end_layout
\end_inset
.
Since
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{rcsb}
\end_layout
\end_inset
contains all deposited structures (whether they contain metal or not) searching
for information on metal-binding sites in
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{rcsb}
\end_layout
\end_inset
is not necessarily an easy task.
For this reason, various secondary databases have been created that rely
on the information contained in the
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{rcsb}
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset Flex CT - Description Label
status open
\begin_layout Plain Layout
\begin_inset Flex CT - Spaced Low Small Caps
status open
\begin_layout Plain Layout
MESPEUS
\end_layout
\end_inset
\end_layout
\end_inset
was one of the first databases to include information on zinc sites.
MESPEUS allowed filtering of data based on metal,
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{Lg}
\end_layout
\end_inset
type, and resolution; unfortunately, MESPEUS is no longer available
\begin_inset CommandInset citation
LatexCommand citep
key "Hsin2008,Harding2014"
literal "true"
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset Flex CT - Description Label
status open
\begin_layout Plain Layout
\begin_inset Flex CT - Spaced Low Small Caps
status open
\begin_layout Plain Layout
ZifBASE
\end_layout
\end_inset
\end_layout
\end_inset
is an even more specific database – it is dedicated only to zinc fingers,
plus it contains far more information than just structural information
– it contains sequence information, potential DNA interaction sequences,
and more.
However, as of today, despite the existence of the site, ZifBASE appears
to be severely broken or entirely non-functional: search and other functions
do not work.
Additionally, it is not known how often the database is updated
or whether it is still updated at all
\begin_inset CommandInset citation
LatexCommand citep
key "Jayakanthan2009"
literal "true"
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset Flex CT - Description Label
status open
\begin_layout Plain Layout
\begin_inset Flex CT - Spaced Low Small Caps
status open
\begin_layout Plain Layout
MetalPDB
\end_layout
\end_inset
\end_layout
\end_inset
is the best-known derivative database based on
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{rcsb}
\end_layout
\end_inset
regarding metal ions.
MetalPDB has more advanced features compared to a simple database like
MESPEUS; for example, it offers the ability to search for binding sites by
geometric similarity.
The big advantage of MetalPDB is the use of the
\begin_inset Quotes eld
\end_inset
Minimal Functional Site
\begin_inset Quotes erd
\end_inset
concept which allows the implementation of the above function.
The disadvantage of MetalPDB is that MetalPDB is based on
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
glspl{asunit}
\end_layout
\end_inset
, not
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
glspl{bassembly}
\end_layout
\end_inset
.
This approach to building a database is incorrect because some of the binding
sites (and especially the intermolecular binding) can only be seen after
the symmetry operations are applied and the
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{bassembly}
\end_layout
\end_inset
is created.
Like the rest of these tools, MetalPDB does not allow searching for inter-prote
in binding sites of metal ions.
Although it is possible to automate this type of tool, MetalPDB seems to
be rarely updated: as of February 13, 2023, MetalPDB gives an update date
of April 5, 2022, which does not seem to be true.
Metal-containing structures published close to the supposed MetalPDB
update date are not present in the database – for example, the structure
\begin_inset CommandInset href
LatexCommand href
name "7V8G"
target "https://www.rcsb.org/structure/7V8G"
literal "false"
\end_inset
(published March 30, 2022) or
\begin_inset CommandInset href
LatexCommand href
name "7TIH"
target "https://www.rcsb.org/structure/7TIH"
literal "false"
\end_inset
(published March 30, 2022), which, despite being strewn with metals,
is also absent.
The fact that derivative databases do not have all the records that are
present in the primary database is understandable – for various reasons,
over the years
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{pdb}
\end_layout
\end_inset
files have accumulated all sorts of errors, inaccuracies, or format
incompatibilities that cause problems in processing.
Nevertheless, the number of missing files in MetalPDB is puzzling
\begin_inset CommandInset citation
LatexCommand citep
key "Andreini2013,Putignano2018"
literal "true"
\end_inset
.
\end_layout
\begin_layout Standard
\begin_inset Flex CT - Description Label
status open
\begin_layout Plain Layout
\begin_inset Flex CT - Spaced Low Small Caps
status open
\begin_layout Plain Layout
ZincBind
\end_layout
\end_inset
\end_layout
\end_inset
is one of the newest databases, which includes information on
\begin_inset Quotes eld
\end_inset
physiological
\begin_inset Quotes erd
\end_inset
binding sites for Zn(II) ions; however, it does not have information on
intermolecular zinc-binding.
Physiological sites in ZincBind are understood to be those with at least
two binding residues and at least three binding atoms.
Additionally, ZincBind overcomes the biggest issues of MetalPDB: ZincBind is
automatically updated, the presented structures are already in the
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
glspl{bassembly}
\end_layout
\end_inset
, and most of all the
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{code}
\end_layout
\end_inset
of ZincBind is freely available under the MIT License
\begin_inset CommandInset citation
LatexCommand citep
key "Ireland2019"
literal "true"
\end_inset
.
\begin_inset Marginal
status open
\begin_layout Plain Layout
The
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
gls{code}
\end_layout
\end_inset
availability makes the science more transparent.
\end_layout
\end_inset
\end_layout