forked from ProteomicsML/ProteomicsML
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreferences.bib
1193 lines (1156 loc) · 52.4 KB
/
references.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% HUPO Human Proteome Project ten-year report: high-stringency human proteome blueprint.
@article{Adhikari2020-vu,
  author   = {
    Adhikari, Subash and Nice, Edouard C and Deutsch, Eric W and Lane, Lydie
    and Omenn, Gilbert S and Pennington, Stephen R and Paik, Young-Ki and
    Overall, Christopher M and Corrales, Fernando J and Cristea, Ileana M and
    Van Eyk, Jennifer E and Uhl{\'e}n, Mathias and Lindskog, Cecilia and Chan,
    Daniel W and Bairoch, Amos and Waddington, James C and Justice, Joshua L
    and LaBaer, Joshua and Rodriguez, Henry and He, Fuchu and Kostrzewa, Markus
    and Ping, Peipei and Gundry, Rebekah L and Stewart, Peter and Srivastava,
    Sanjeeva and Srivastava, Sudhir and Nogueira, Fabio C S and Domont,
    Gilberto B and Vandenbrouck, Yves and Lam, Maggie P Y and Wennersten, Sara
    and Vizcaino, Juan Antonio and Wilkins, Marc and Schwenk, Jochen M and
    Lundberg, Emma and Bandeira, Nuno and Marko-Varga, Gyorgy and Weintraub,
    Susan T and Pineau, Charles and Kusebauch, Ulrike and Moritz, Robert L and
    Ahn, Seong Beom and Palmblad, Magnus and Snyder, Michael P and Aebersold,
    Ruedi and Baker, Mark S
  },
  title    = {A high-stringency blueprint of the human proteome},
  journal  = {Nat. Commun.},
  year     = {2020},
  month    = oct,
  volume   = {11},
  number   = {1},
  pages    = {5301},
  doi      = {10.1038/s41467-020-19045-9},
  language = {en},
  abstract = {
    The Human Proteome Organization (HUPO) launched the Human Proteome Project
    (HPP) in 2010, creating an international framework for global
    collaboration, data sharing, quality assurance and enhancing accurate
    annotation of the genome-encoded proteome. During the subsequent decade,
    the HPP established collaborations, developed guidelines and metrics, and
    undertook reanalysis of previously deposited community data, continuously
    increasing the coverage of the human proteome. On the occasion of the HPP's
    tenth anniversary, we here report a 90.4\% complete high-stringency human
    proteome blueprint. This knowledge is essential for discerning molecular
    processes in health and disease, as we demonstrate by highlighting
    potential roles the human proteome plays in our understanding, diagnosis
    and treatment of cancers, cardiovascular and infectious diseases.
  }
}
% spectrum_utils: Python package for mass spectrometry data processing and visualization.
% The underscore in the title is escaped and the package/language names are brace-protected
% so the entry typesets under LaTeX and survives sentence-casing styles.
@article{Bittremieux2019,
  author    = {Bittremieux, Wout},
  title     = {
    {spectrum\_utils}: A {Python} Package for Mass Spectrometry Data
    Processing and Visualization
  },
  journal   = {Analytical Chemistry},
  publisher = {American Chemical Society ({ACS})},
  year      = {2019},
  month     = dec,
  volume    = {92},
  number    = {1},
  pages     = {659--661},
  doi       = {10.1021/acs.analchem.9b04884},
  url       = {https://doi.org/10.1021/acs.analchem.9b04884}
}
% Viewpoint on machine-learning predictive models in proteomics workflows.
% doi holds the bare DOI (no resolver prefix); the issue range uses an en-dash.
@article{bouwmeester-gabriels2020,
  author   = {
    Bouwmeester, Robbin and Gabriels, Ralf and Van Den Bossche, Tim and
    Martens, Lennart and Degroeve, Sven
  },
  title    = {
    The Age of Data-Driven Proteomics: How Machine Learning Enables Novel
    Workflows
  },
  journal  = {PROTEOMICS},
  year     = {2020},
  volume   = {20},
  number   = {21--22},
  pages    = {1900351},
  doi      = {10.1002/pmic.201900351},
  url      = {
    https://analyticalsciencejournals.onlinelibrary.wiley.com/doi/abs/10.1002/pmic.201900351
  },
  keywords = {data driven modeling, deep learning, machine learning},
  abstract = {
    A lot of energy in the field of proteomics is dedicated to the
    application of challenging experimental workflows, which include
    metaproteomics, proteogenomics, data independent acquisition (DIA),
    non-specific proteolysis, immunopeptidomics, and open modification
    searches. These workflows are all challenging because of ambiguity in the
    identification stage; they either expand the search space and thus increase
    the ambiguity of identifications, or, in the case of DIA, they generate
    data that is inherently more ambiguous. In this context, machine
    learning-based predictive models are now generating considerable excitement
    in the field of proteomics because these predictive models hold great
    potential to drastically reduce the ambiguity in the identification process
    of the above-mentioned workflows. Indeed, the field has already produced
    classical machine learning and deep learning models to predict almost every
    aspect of a liquid chromatography-mass spectrometry (LC-MS) experiment. Yet
    despite all the excitement, thorough integration of predictive models in
    these challenging LC-MS workflows is still limited, and further
    improvements to the modeling and validation procedures can still be made.
    Therefore, highly promising recent machine learning developments in
    proteomics are pointed out in this viewpoint, alongside some of the
    remaining challenges.
  }
}
% DeepLC: deep-learning retention time predictor that also handles unseen peptide modifications.
@article{Bouwmeester2021-cf,
  author   = {
    Bouwmeester, Robbin and Gabriels, Ralf and Hulstaert, Niels and Martens,
    Lennart and Degroeve, Sven
  },
  title    = {
    {DeepLC} can predict retention times for peptides that carry as-yet unseen
    modifications
  },
  journal  = {Nat. Methods},
  year     = {2021},
  month    = nov,
  volume   = {18},
  number   = {11},
  pages    = {1363--1369},
  doi      = {10.1038/s41592-021-01301-5},
  language = {en},
  abstract = {
    The inclusion of peptide retention time prediction promises to remove
    peptide identification ambiguity in complex liquid chromatography-mass
    spectrometry identification workflows. However, due to the way peptides are
    encoded in current prediction models, accurate retention times cannot be
    predicted for modified peptides. This is especially problematic for
    fledgling open searches, which will benefit from accurate retention time
    prediction for modified peptides to reduce identification ambiguity. We
    present DeepLC, a deep learning peptide retention time predictor using
    peptide encoding based on atomic composition that allows the retention time
    of (previously unseen) modified peptides to be predicted accurately. We
    show that DeepLC performs similarly to current state-of-the-art approaches
    for unmodified peptides and, more importantly, accurately predicts
    retention times for modifications not seen during training. Moreover, we
    show that DeepLC's ability to predict retention times for any modification
    enables potentially incorrect identifications to be flagged in an open
    search of a wide variety of proteome data.
  }
}
% Probabilistic assessment of ion mobility-MS for metabolomics using predicted CCS values.
@article{Broeckling2021-ks,
  author   = {
    Broeckling, Corey D and Yao, Linxing and Isaac, Giorgis and Gioioso, Marisa
    and Ianchis, Valentin and Vissers, Johannes P C
  },
  title    = {
    Application of Predicted Collisional Cross Section to Metabolome Databases
    to Probabilistically Describe the Current and Future Ion Mobility Mass
    Spectrometry
  },
  journal  = {J. Am. Soc. Mass Spectrom.},
  year     = {2021},
  month    = mar,
  volume   = {32},
  number   = {3},
  pages    = {661--669},
  doi      = {10.1021/jasms.0c00375},
  language = {en},
  abstract = {
    Metabolomics is a powerful phenotyping platform with potential for
    high-throughput analyses. The primary technology for metabolite profiling
    is mass spectrometry. In recent years, the coupling of mass spectrometry
    with ion mobility spectrometry (IMS) has offered the promise of faster
    analysis time and greater resolving power. Our understanding of the
    potential impact of IMS on the field of metabolomics is limited by
    availability of comprehensive experimental data. In this analysis, we use a
    probabilistic approach to enumerate the strengths and limitations, the
    present and future, of this technology. This is accomplished through use of
    ``model'' metabolomes, predicted physicochemical properties, and
    probabilistic descriptions of resolving power. This analysis advances our
    understanding of the importance of orthogonality in resolving (separation)
    dimensions, describes the impact of the metabolome composition on
    resolution demands, and offers a system resolution landscape that may serve
    to guide practitioners in the coming years.
  }
}
% MS2PIP-predicted intensities as Percolator features for PSM rescoring (ms2rescore).
% The abstract's em-dash, flattened to a hyphen by the export, is restored as `---`.
@article{C_Silva2019-yy,
  author   = {
    C Silva, Ana S and Bouwmeester, Robbin and Martens, Lennart and Degroeve,
    Sven
  },
  title    = {
    Accurate peptide fragmentation predictions allow data driven approaches to
    replace and improve upon proteomics search engine scoring functions
  },
  journal  = {Bioinformatics},
  year     = {2019},
  month    = dec,
  volume   = {35},
  number   = {24},
  pages    = {5243--5248},
  doi      = {10.1093/bioinformatics/btz383},
  language = {en},
  abstract = {
    MOTIVATION: The use of post-processing tools to maximize the information
    gained from a proteomics search engine is widely accepted and used by the
    community, with the most notable example being Percolator---a
    semi-supervised machine learning model which learns a new scoring function
    for a given dataset. The usage of such tools is however bound to the search
    engine's scoring scheme, which doesn't always make full use of the
    intensity information present in a spectrum. We aim to show how this tool
    can be applied in such a way that maximizes the use of spectrum intensity
    information by leveraging another machine learning-based tool, MS2PIP.
    MS2PIP predicts fragment ion peak intensities. RESULTS: We show how
    comparing predicted intensities to annotated experimental spectra by
    calculating direct similarity metrics provides enough information for a
    tool such as Percolator to accurately separate two classes of
    peptide-to-spectrum matches. This approach allows using more information
    out of the data (compared with simpler intensity based metrics, like peak
    counting or explained intensities summing) while maintaining control of
    statistics such as the false discovery rate. AVAILABILITY AND
    IMPLEMENTATION: All of the code is available online at
    https://github.com/compomics/ms2rescore. SUPPLEMENTARY INFORMATION:
    Supplementary data are available at Bioinformatics online.
  }
}
% XGBoost paper (KDD 2016). The export truncated the title to "XGBoost";
% the full title and page range are taken from the publisher record for this DOI.
@inproceedings{Chen2016,
  author    = {Chen, Tianqi and Guestrin, Carlos},
  title     = {{XGBoost}: A Scalable Tree Boosting System},
  booktitle = {
    Proceedings of the 22nd {ACM} {SIGKDD} International Conference on
    Knowledge Discovery and Data Mining
  },
  publisher = {{ACM}},
  year      = {2016},
  month     = aug,
  pages     = {785--794},
  doi       = {10.1145/2939672.2939785},
  url       = {https://doi.org/10.1145/2939672.2939785}
}
% MNIST handwritten digit database. The acronym is brace-protected so
% sentence-casing bibliography styles do not lowercase it.
@article{deng2012,
  author   = {Deng, Li},
  title    = {
    The {MNIST} Database of Handwritten Digit Images for Machine Learning
    Research [Best of the Web]
  },
  journal  = {IEEE Signal Processing Magazine},
  year     = {2012},
  volume   = {29},
  number   = {6},
  pages    = {141--142},
  doi      = {10.1109/MSP.2012.2211477},
  keywords = {Machine learning}
}
% ProteomeXchange consortium 2020 update (submission guidelines, members, statistics).
% Straight quotes around "big data" are replaced by LaTeX opening/closing quotes,
% since a leading ' typesets as a closing quote.
@article{Deutsch2020-og,
  author   = {
    Deutsch, Eric W and Bandeira, Nuno and Sharma, Vagisha and Perez-Riverol,
    Yasset and Carver, Jeremy J and Kundu, Deepti J and Garc{\'\i}a-Seisdedos,
    David and Jarnuczak, Andrew F and Hewapathirana, Suresh and Pullman,
    Benjamin S and Wertz, Julie and Sun, Zhi and Kawano, Shin and Okuda,
    Shujiro and Watanabe, Yu and Hermjakob, Henning and MacLean, Brendan and
    MacCoss, Michael J and Zhu, Yunping and Ishihama, Yasushi and
    Vizca{\'\i}no, Juan A
  },
  title    = {
    The {ProteomeXchange} consortium in 2020: enabling `big data' approaches in
    proteomics
  },
  journal  = {Nucleic Acids Res.},
  year     = {2020},
  month    = jan,
  volume   = {48},
  number   = {D1},
  pages    = {D1145--D1152},
  doi      = {10.1093/nar/gkz984},
  language = {en},
  abstract = {
    The ProteomeXchange (PX) consortium of proteomics resources
    (http://www.proteomexchange.org) has standardized data submission and
    dissemination of mass spectrometry proteomics data worldwide since 2012. In
    this paper, we describe the main developments since the previous update
    manuscript was published in Nucleic Acids Research in 2017. Since then, in
    addition to the four PX existing members at the time (PRIDE, PeptideAtlas
    including the PASSEL resource, MassIVE and jPOST), two new resources have
    joined PX: iProX (China) and Panorama Public (USA). We first describe the
    updated submission guidelines, now expanded to include six members. Next,
    with current data submission statistics, we demonstrate that the proteomics
    field is now actively embracing public open data policies. At the end of
    June 2019, more than 14 100 datasets had been submitted to PX resources
    since 2012, and from those, more than 9 500 in just the last three years.
    In parallel, an unprecedented increase of data re-use activities in the
    field, including `big data' approaches, is enabling novel research and new
    data resources. At last, we also outline some of our future plans for the
    coming years.
  }
}
% Machine learning approach to reduce peptide sequence bias in quantitative MS data.
@article{Dincer2022-re,
  author    = {
    Dincer, Ayse B. and Lu, Yang and Schweppe, Devin K. and Oh, Sewoong and
    Noble, William Stafford
  },
  title     = {
    Reducing Peptide Sequence Bias in Quantitative Mass Spectrometry Data with
    Machine Learning
  },
  journal   = {Journal of Proteome Research},
  publisher = {American Chemical Society ({ACS})},
  year      = {2022},
  month     = jun,
  volume    = {21},
  number    = {7},
  pages     = {1771--1782},
  doi       = {10.1021/acs.jproteome.2c00211},
  url       = {https://doi.org/10.1021/acs.jproteome.2c00211}
}
% Critical insight/review of ion mobility spectrometry platforms and IMS-MS applications.
@article{Dodds2019-oi,
  author   = {Dodds, James N and Baker, Erin S},
  title    = {
    Ion Mobility Spectrometry: Fundamental Concepts, Instrumentation,
    Applications, and the Road Ahead
  },
  journal  = {J. Am. Soc. Mass Spectrom.},
  year     = {2019},
  month    = nov,
  volume   = {30},
  number   = {11},
  pages    = {2185--2195},
  doi      = {10.1007/s13361-019-02288-2},
  keywords = {IMS; Ion mobility spectrometry; Mass spectrometry; Untargeted metabolomics},
  language = {en},
  abstract = {
    Ion mobility spectrometry (IMS) is a rapid separation technique that has
    experienced exponential growth as a field of study. Interfacing IMS with
    mass spectrometry (IMS-MS) provides additional analytical power as
    complementary separations from each technique enable multidimensional
    characterization of detected analytes. IMS separations occur on a
    millisecond timescale, and therefore can be readily nested into traditional
    GC and LC/MS workflows. However, the continual development of novel IMS
    methods has generated some level of confusion regarding the advantages and
    disadvantages of each. In this critical insight, we aim to clarify some
    common misconceptions for new users in the community pertaining to the
    fundamental concepts of the various IMS instrumental platforms (i.e.,
    DTIMS, TWIMS, TIMS, FAIMS, and DMA), while addressing the strengths and
    shortcomings associated with each. Common IMS-MS applications are also
    discussed in this review, such as separating isomeric species, performing
    signal filtering for MS, and incorporating collision cross-section (CCS)
    values into both targeted and untargeted omics-based workflows as
    additional ion descriptors for chemical annotation. Although many
    challenges must be addressed by the IMS community before mobility
    information is collected in a routine fashion, the future is bright with
    possibilities.
  }
}
% ppx: Python package for programmatic access to ProteomeXchange repositories.
@article{Fondrie2021-nb,
  author   = {Fondrie, William E and Bittremieux, Wout and Noble, William S},
  title    = {{ppx}: Programmatic Access to Proteomics Data Repositories},
  journal  = {J. Proteome Res.},
  year     = {2021},
  month    = sep,
  volume   = {20},
  number   = {9},
  pages    = {4621--4624},
  doi      = {10.1021/acs.jproteome.1c00454},
  keywords = {
    FAIR; Python; bioinformatics; data access; data dissemination; data
    sharing; mass spectrometry; proteomics; repository; reproducibility
  },
  language = {en},
  abstract = {
    The volume of proteomics and mass spectrometry data available in public
    repositories continues to grow at a rapid pace as more researchers embrace
    open science practices. Open access to the data behind scientific
    discoveries has become critical to validate published findings and develop
    new computational tools. Here, we present ppx, a Python package that
    provides easy, programmatic access to the data stored in ProteomeXchange
    repositories, such as PRIDE and MassIVE. The ppx package can be used as
    either a command line tool or a Python package to retrieve the files and
    metadata associated with a project when provided its identifier. To
    demonstrate how ppx enhances reproducible research, we used ppx within a
    Snakemake workflow to reanalyze a published data set with the open
    modification search tool ANN-SoLo and compared our reanalysis to the
    original results. We show that ppx readily integrates into workflows, and
    our reanalysis produced results consistent with the original analysis. We
    envision that ppx will be a valuable tool for creating reproducible
    analyses, providing tool developers easy access to data for development,
    testing, and benchmarking, and enabling the use of mass spectrometry data
    in data-intensive analyses. The ppx package is freely available and open
    source under the MIT license at https://github.com/wfondrie/ppx.
  }
}
% For tutorials and other
% Friedman's stochastic gradient boosting paper.
@article{Friedman2002,
  author    = {Friedman, Jerome H.},
  title     = {Stochastic gradient boosting},
  journal   = {Computational Statistics and Data Analysis},
  publisher = {Elsevier {BV}},
  year      = {2002},
  month     = feb,
  volume    = {38},
  number    = {4},
  pages     = {367--378},
  doi       = {10.1016/s0167-9473(01)00065-2},
  url       = {https://doi.org/10.1016/s0167-9473(01)00065-2}
}
% MS2PIP web server update (2019).
% NOTE(review): same work and DOI as the entry keyed Gabriels2019-gx; both keys are
% kept so existing citations of either one keep resolving. Consider consolidating.
% Tool/acronym names are brace-protected; journal, month and language follow the
% conventions used by the other Nucleic Acids Res. entries in this file.
@article{Gabriels2019,
  author   = {Gabriels, Ralf and Martens, Lennart and Degroeve, Sven},
  title    = {
    Updated {MS²PIP} web server delivers fast and accurate {MS²} peak intensity
    prediction for multiple fragmentation methods, instruments and labeling
    techniques
  },
  journal  = {Nucleic Acids Res.},
  year     = {2019},
  month    = jul,
  volume   = {47},
  number   = {W1},
  pages    = {W295--W299},
  doi      = {10.1093/nar/gkz299},
  issn     = {0305-1048},
  language = {en},
  abstract = {
    MS²PIP is a data-driven tool that accurately predicts peak intensities for
    a given peptide's fragmentation mass spectrum. Since the release of the
    MS²PIP web server in 2015, we have brought significant updates to both
    the tool and the web server. In addition to the original models for CID and
    HCD fragmentation, we have added specialized models for the TripleTOF 5600+
    mass spectrometer, for TMT-labeled peptides, for iTRAQ-labeled peptides,
    and for iTRAQ-labeled phosphopeptides. Because the fragmentation pattern is
    heavily altered in each of these cases, these additional models greatly
    improve the prediction accuracy for their corresponding data types. We have
    also substantially reduced the computational resources required to run
    MS²PIP, and have completely rebuilt the web server, which now allows
    predictions of up to 100 000 peptide sequences in a single request. The
    MS²PIP web server is freely available at https://iomics.ugent.be/ms2pip/.
  }
}
% MS2PIP web server update (2019).
% NOTE(review): same work and DOI as the entry keyed Gabriels2019; both keys are
% kept so existing citations of either one keep resolving. Consider consolidating.
% Tool/acronym names in the title are brace-protected against sentence-casing styles.
@article{Gabriels2019-gx,
  author   = {Gabriels, Ralf and Martens, Lennart and Degroeve, Sven},
  title    = {
    Updated {MS²PIP} web server delivers fast and accurate {MS²} peak intensity
    prediction for multiple fragmentation methods, instruments and labeling
    techniques
  },
  journal  = {Nucleic Acids Res.},
  year     = {2019},
  month    = jul,
  volume   = {47},
  number   = {W1},
  pages    = {W295--W299},
  doi      = {10.1093/nar/gkz299},
  language = {en},
  abstract = {
    MS²PIP is a data-driven tool that accurately predicts peak intensities for
    a given peptide's fragmentation mass spectrum. Since the release of the
    MS²PIP web server in 2015, we have brought significant updates to both the
    tool and the web server. In addition to the original models for CID and HCD
    fragmentation, we have added specialized models for the TripleTOF 5600+
    mass spectrometer, for TMT-labeled peptides, for iTRAQ-labeled peptides,
    and for iTRAQ-labeled phosphopeptides. Because the fragmentation pattern is
    heavily altered in each of these cases, these additional models greatly
    improve the prediction accuracy for their corresponding data types. We have
    also substantially reduced the computational resources required to run
    MS²PIP, and have completely rebuilt the web server, which now allows
    predictions of up to 100 000 peptide sequences in a single request. The
    MS²PIP web server is freely available at https://iomics.ugent.be/ms2pip/.
  }
}
% Prosit: deep neural network predicting peptide retention time and fragment ion intensities.
@article{Gessulat2019-rt,
  author   = {
    Gessulat, Siegfried and Schmidt, Tobias and Zolg, Daniel Paul and Samaras,
    Patroklos and Schnatbaum, Karsten and Zerweck, Johannes and Knaute, Tobias
    and Rechenberger, Julia and Delanghe, Bernard and Huhmer, Andreas and
    Reimer, Ulf and Ehrlich, Hans-Christian and Aiche, Stephan and Kuster,
    Bernhard and Wilhelm, Mathias
  },
  title    = {
    Prosit: proteome-wide prediction of peptide tandem mass spectra by deep
    learning
  },
  journal  = {Nat. Methods},
  year     = {2019},
  month    = jun,
  volume   = {16},
  number   = {6},
  pages    = {509--518},
  doi      = {10.1038/s41592-019-0426-7},
  language = {en},
  abstract = {
    In mass-spectrometry-based proteomics, the identification and
    quantification of peptides and proteins heavily rely on sequence database
    searching or spectral library matching. The lack of accurate predictive
    models for fragment ion intensities impairs the realization of the full
    potential of these approaches. Here, we extended the ProteomeTools
    synthetic peptide library to 550,000 tryptic peptides and 21 million
    high-quality tandem mass spectra. We trained a deep neural network, termed
    Prosit, resulting in chromatographic retention time and fragment ion
    intensity predictions that exceed the quality of the experimental data.
    Integrating Prosit into database search pipelines led to more
    identifications at >10$\times$ lower false discovery rates. We show the
    general applicability of Prosit by predicting spectra for proteases other
    than trypsin, generating spectral libraries for data-independent
    acquisition and improving the analysis of metaproteomes. Prosit is
    integrated into ProteomicsDB, allowing search result re-scoring and custom
    spectral library generation for any organism on the basis of peptide
    sequence alone.
  }
}
% Yeast proteome analysis in about one hour on a Q-OT-qIT (Orbitrap Fusion) instrument.
@article{Hebert2014-tc,
  author   = {
    Hebert, Alexander S and Richards, Alicia L and Bailey, Derek J and Ulbrich,
    Arne and Coughlin, Emma E and Westphall, Michael S and Coon, Joshua J
  },
  title    = {The one hour yeast proteome},
  journal  = {Mol. Cell. Proteomics},
  year     = {2014},
  month    = jan,
  volume   = {13},
  number   = {1},
  pages    = {339--347},
  doi      = {10.1074/mcp.M113.034769},
  language = {en},
  abstract = {
    We describe the comprehensive analysis of the yeast proteome in just over
    one hour of optimized analysis. We achieve this expedited proteome
    characterization with improved sample preparation, chromatographic
    separations, and by using a new Orbitrap hybrid mass spectrometer equipped
    with a mass filter, a collision cell, a high-field Orbitrap analyzer, and,
    finally, a dual cell linear ion trap analyzer (Q-OT-qIT, Orbitrap Fusion).
    This system offers high MS(2) acquisition speed of 20 Hz and detects up to
    19 peptide sequences within a single second of operation. Over a 1.3 h
    chromatographic method, the Q-OT-qIT hybrid collected an average of 13,447
    MS(1) and 80,460 MS(2) scans (per run) to produce 43,400 (x) peptide
    spectral matches and 34,255 (x) peptides with unique amino acid sequences
    (1\% false discovery rate (FDR)). On average, each one hour analysis
    achieved detection of 3,977 proteins (1\% FDR). We conclude that further
    improvements in mass spectrometer scan rate could render comprehensive
    analysis of the human proteome within a few hours.
  }
}
% Web resource: Kaggle dataset search for "proteomics".
% The corporate author is double-braced so it is treated as one indivisible name.
% `journal` is not a misc field, so the site name moves into a title.
% NOTE(review): no access date is recorded in the original entry; add one if known.
@misc{Kaggle,
  author = {{Kaggle.com}},
  title  = {{Kaggle} Datasets},
  url    = {https://www.kaggle.com/datasets?search=proteomics}
}
% Building consensus spectral libraries for peptide identification.
@article{Lam2008,
  author    = {
    Lam, Henry and Deutsch, Eric W and Eddes, James S and Eng, Jimmy K and
    Stein, Stephen E and Aebersold, Ruedi
  },
  title     = {
    Building consensus spectral libraries for peptide identification in
    proteomics
  },
  journal   = {Nature Methods},
  publisher = {Springer Science and Business Media {LLC}},
  year      = {2008},
  month     = sep,
  volume    = {5},
  number    = {10},
  pages     = {873--875},
  doi       = {10.1038/nmeth.1254},
  url       = {https://doi.org/10.1038/nmeth.1254}
}
% Review of ion mobility theory and calculations in the free molecular regime.
% Mis-encoded characters in the abstract ("?0.1%", "?free molecular?") are repaired,
% and the title word after the full stop is brace-protected against sentence-casing.
% NOTE(review): "?0.1%" is restored as an approximately sign; confirm against the publisher abstract.
@article{Larriba-Andaluz2020-kc,
  author    = {Larriba-Andaluz, Carlos and Prell, James S},
  title     = {
    Fundamentals of ion mobility in the free molecular regime. {Interlacing}
    the past, present and future of ion mobility calculations
  },
  journal   = {Int. Rev. Phys. Chem.},
  publisher = {Taylor \& Francis},
  year      = {2020},
  month     = oct,
  volume    = {39},
  number    = {4},
  pages     = {569--623},
  doi       = {10.1080/0144235X.2020.1826708},
  abstract  = {
    While existing ion mobility calculators are capable of feats as impressive
    as calculating collision cross sections (CCS) within a few per cent and
    within a very reasonable time, the simplifications assumed in their
    estimations precludes them from being more precise, potentially
    overreaching with respect to the interpretation of existing calculations.
    With ion mobility instrumentation progressively reaching resolutions of
    several hundreds to thousands (accuracy in the range of $\sim$0.1\%), a
    more accurate theoretical description of gas-phase ion mobility becomes
    necessary to correctly interpret experimental state-of-the-art separations.
    This manuscript entails an effort to consolidate the most relevant
    theoretical work pertaining to ion mobility within the ``free molecular''
    regime, describing in detail the rationale for approximations up to the
    two-temperature theory, using both a momentum transfer approach as well as
    the solution to the moments of the Boltzmann equation for the ion. With
    knowledge of the existing deficiencies in the numerical methods, the
    manuscript provides a series of necessary additions in order to better
    simulate some of the separations observed experimentally due to
    second-order effects, namely, high field effects, dipole alignment, angular
    velocities and moments of inertia, potential interactions and inelastic
    collisions among others.
  }
}
% Pyteomics 4.0 framework paper. Proper nouns in the title are brace-protected
% so sentence-casing bibliography styles keep their capitalisation.
@article{Levitsky2018,
  author    = {
    Levitsky, Lev I. and Klein, Joshua A. and Ivanov, Mark V. and Gorshkov,
    Mikhail V.
  },
  title     = {
    {Pyteomics} 4.0: Five Years of Development of a {Python} Proteomics
    Framework
  },
  journal   = {Journal of Proteome Research},
  publisher = {American Chemical Society ({ACS})},
  year      = {2018},
  month     = dec,
  volume    = {18},
  number    = {2},
  pages     = {709--714},
  doi       = {10.1021/acs.jproteome.8b00717},
  url       = {https://doi.org/10.1021/acs.jproteome.8b00717}
}
% Deep learning prediction of peptide collisional cross sections from TIMS-PASEF data.
% The abstract had lost the text between a "<" and a ">" (it read "(CV 0.99)");
% the missing sentences are restored from the published abstract.
% NOTE(review): verify the restored wording against the publisher page for this DOI.
@article{Meier2021-ig,
  author   = {
    Meier, Florian and K{\"o}hler, Niklas D and Brunner, Andreas-David and
    Wanka, Jean-Marc H and Voytik, Eugenia and Strauss, Maximilian T and Theis,
    Fabian J and Mann, Matthias
  },
  title    = {
    Deep learning the collisional cross sections of the peptide universe from a
    million experimental values
  },
  journal  = {Nat. Commun.},
  year     = {2021},
  month    = feb,
  volume   = {12},
  number   = {1},
  pages    = {1185},
  doi      = {10.1038/s41467-021-21352-8},
  language = {en},
  abstract = {
    The size and shape of peptide ions in the gas phase are an under-explored
    dimension for mass spectrometry-based proteomics. To investigate the nature
    and utility of the peptide collisional cross section (CCS) space, we
    measure more than a million data points from whole-proteome digests of five
    organisms with trapped ion mobility spectrometry (TIMS) and parallel
    accumulation-serial fragmentation (PASEF). The scale and precision (CV
    < 1\%) of our data is sufficient to train a deep recurrent neural network
    that accurately predicts CCS values solely based on the peptide sequence.
    Cross section predictions for the synthetic ProteomeTools peptides validate
    the model within a 1.4\% median relative error (R > 0.99). Hydrophobicity,
    proportion of prolines and position of histidines are main determinants of
    the cross sections in addition to sequence-specific interactions. CCS
    values can now be predicted for any peptide and organism, forming a basis
    for advanced proteomics workflows that make full use of the additional
    information.
  }
}
% Review of deep learning neural network tools across the proteomics workflow.
@article{Meyer2021-jm,
  author   = {Meyer, Jesse G},
  title    = {Deep learning neural network tools for proteomics},
  journal  = {Cell Rep Methods},
  year     = {2021},
  month    = jun,
  volume   = {1},
  number   = {2},
  pages    = {100003},
  doi      = {10.1016/j.crmeth.2021.100003},
  keywords = {
    MS/MS; bioinformatics; deep learning; mass spectrometry; neural networks;
    peptides; proteomics; retention time
  },
  language = {en},
  abstract = {
    Mass-spectrometry-based proteomics enables quantitative analysis of
    thousands of human proteins. However, experimental and computational
    challenges restrict progress in the field. This review summarizes the
    recent flurry of machine-learning strategies using artificial deep neural
    networks (or ``deep learning'') that have started to break barriers and
    accelerate progress in the field of shotgun proteomics. Deep learning now
    accurately predicts physicochemical properties of peptides from their
    sequence, including tandem mass spectra and retention time. Furthermore,
    deep learning methods exist for nearly every aspect of the modern
    proteomics workflow, enabling improved feature selection, peptide
    identification, and protein inference.
  }
}
@article{Michelmann2015-nu,
  author   = {
    Michelmann, Karsten and
    Silveira, Joshua A and
    Ridgeway, Mark E and
    Park, Melvin A
  },
  title    = {Fundamentals of trapped ion mobility spectrometry},
  journal  = {J. Am. Soc. Mass Spectrom.},
  year     = {2015},
  month    = jan,
  volume   = {26},
  number   = {1},
  pages    = {14--24},
  doi      = {10.1007/s13361-014-0999-4},
  language = {en},
  abstract = {
    Trapped ion mobility spectrometry (TIMS) is a relatively new gas-phase
    separation method that has been coupled to quadrupole orthogonal
    acceleration time-of-flight mass spectrometry. The TIMS analyzer is a
    segmented rf ion guide wherein ions are mobility-analyzed using an electric
    field that holds ions stationary against a moving gas, unlike conventional
    drift tube ion mobility spectrometry where the gas is stationary. Ions are
    initially trapped, and subsequently eluted from the TIMS analyzer over time
    according to their mobility (K). Though TIMS has achieved a high level of
    performance (R > 250) in a small device (<5 cm) using modest operating
    potentials (<300 V), a proper theory has yet to be produced. Here, we
    develop a quantitative theory for TIMS via mathematical derivation and
    simulations. A one-dimensional analytical model, used to predict the
    transit time and theoretical resolving power, is described. Theoretical
    trends are in agreement with experimental measurements performed as a
    function of K, pressure, and the axial electric field scan rate. The linear
    dependence of the transit time with 1/K provides a fundamental basis for
    determination of reduced mobility or collision cross section values by
    calibration. The quantitative description of TIMS provides an operational
    understanding of the analyzer, outlines the current performance
    capabilities, and provides insight into future avenues for improvement.
  },
}
@article{neely2023,
  title   = {Toward an Integrated Machine Learning Model of a Proteomics Experiment},
  author  = {
    Neely, Benjamin A. and Dorfer, Viktoria and Martens, Lennart and Bludau,
    Isabell and Bouwmeester, Robbin and Degroeve, Sven and Deutsch, Eric W. and
    Gessulat, Siegfried and K{\"a}ll, Lukas and Palczynski, Pawel and Payne,
    Samuel H. and Rehfeldt, Tobias Greisager and Schmidt, Tobias and
    Schw{\"a}mmle, Veit and Uszkoreit, Julian and Vizca{\'\i}no, Juan Antonio
    and Wilhelm, Mathias and Palmblad, Magnus
  },
  year    = 2023,
  journal = {Journal of Proteome Research},
  volume  = 22,
  number  = 3,
  pages   = {681--696},
  doi     = {10.1021/acs.jproteome.2c00711},
  note    = {PMID: 36744821}
}
@article{Nielsen1999-ej,
  author   = {
    Nielsen, H and
    Brunak, S and
    von Heijne, G
  },
  title    = {
    Machine learning approaches for the prediction of signal peptides and other
    protein sorting signals
  },
  journal  = {Protein Eng.},
  year     = {1999},
  month    = jan,
  volume   = {12},
  number   = {1},
  pages    = {3--9},
  doi      = {10.1093/protein/12.1.3},
  language = {en},
  abstract = {
    Prediction of protein sorting signals from the sequence of amino acids has
    great importance in the field of proteomics today. Recently, the growth of
    protein databases, combined with machine learning approaches, such as
    neural networks and hidden Markov models, have made it possible to achieve
    a level of reliability where practical use in, for example automatic
    database annotation is feasible. In this review, we concentrate on the
    present status and future perspectives of SignalP, our neural network-based
    method for prediction of the most well-known sorting signal: the secretory
    signal peptide. We discuss the problems associated with the use of SignalP
    on genomic sequences, showing that signal peptide prediction will improve
    further if integrated with predictions of start codons and transmembrane
    helices. As a step towards this goal, a hidden Markov model version of
    SignalP has been developed, making it possible to discriminate between
    cleaved signal peptides and uncleaved signal anchors. Furthermore, we show
    how SignalP can be used to characterize putative signal peptides from an
    archaeon, Methanococcus jannaschii. Finally, we briefly review a few
    methods for predicting other protein sorting signals and discuss the future
    of protein sorting prediction in general.
  },
}
@article{Omenn2021-qc,
  author   = {
    Omenn, Gilbert S and
    Lane, Lydie and
    Overall, Christopher M and
    Paik, Young-Ki and
    Cristea, Ileana M and
    Corrales, Fernando J and
    Lindskog, Cecilia and
    Weintraub, Susan and
    Roehrl, Michael H A and
    Liu, Siqi and
    Bandeira, Nuno and
    Srivastava, Sudhir and
    Chen, Yu-Ju and
    Aebersold, Ruedi and
    Moritz, Robert L and
    Deutsch, Eric W
  },
  title    = {
    Progress Identifying and Analyzing the Human Proteome: 2021 Metrics from
    the {HUPO} Human Proteome Project
  },
  journal  = {J. Proteome Res.},
  year     = {2021},
  month    = dec,
  volume   = {20},
  number   = {12},
  pages    = {5227--5240},
  doi      = {10.1021/acs.jproteome.1c00590},
  language = {en},
  keywords = {
    Biology and Disease-HPP (B/D-HPP); Chromosome-centric HPP (C-HPP); Human
    Protein Atlas; Human Proteome Project (HPP); Mass Spectrometry Interactive
    Virtual Environment (MassIVE); PeptideAtlas; missing proteins (MP);
    neXtProt protein existence (PE) metrics; non-MS PE1 proteins;
    uncharacterized protein existence 1 (uPE1)
  },
  abstract = {
    The 2021 Metrics of the HUPO Human Proteome Project (HPP) show that protein
    expression has now been credibly detected (neXtProt PE1 level) for 18 357
    (92.8\%) of the 19 778 predicted proteins coded in the human genome, a gain
    of 483 since 2020 from reports throughout the world reanalyzed by the HPP.
    Conversely, the number of neXtProt PE2, PE3, and PE4 missing proteins has
    been reduced by 478 to 1421. This represents remarkable progress on the
    proteome parts list. The utilization of proteomics in a broad array of
    biological and clinical studies likewise continues to expand with many
    important findings and effective integration with other omics platforms. We
    present highlights from the Immunopeptidomics, Glycoproteomics, Infectious
    Disease, Cardiovascular, Musculo-Skeletal, Liver, and Cancers B/D-HPP teams
    and from the Knowledgebase, Mass Spectrometry, Antibody Profiling, and
    Pathology resource pillars, as well as ethical considerations important to
    the clinical utilization of proteomics and protein biomarkers.
  },
}
@article{Perez-Riverol2022-ak,
  author   = {
    Perez-Riverol, Yasset and
    Bai, Jingwen and
    Bandla, Chakradhar and
    Garc{\'\i}a-Seisdedos, David and
    Hewapathirana, Suresh and
    Kamatchinathan, Selvakumar and
    Kundu, Deepti J and
    Prakash, Ananth and
    Frericks-Zipper, Anika and
    Eisenacher, Martin and
    Walzer, Mathias and
    Wang, Shengbo and
    Brazma, Alvis and
    Vizca{\'\i}no, Juan Antonio
  },
  title    = {
    The {PRIDE} database resources in 2022: a hub for mass spectrometry-based
    proteomics evidences
  },
  journal  = {Nucleic Acids Res.},
  year     = {2022},
  month    = jan,
  volume   = {50},
  number   = {D1},
  pages    = {D543--D552},
  doi      = {10.1093/nar/gkab1038},
  language = {en},
  abstract = {
    The PRoteomics IDEntifications (PRIDE) database
    (https://www.ebi.ac.uk/pride/) is the world's largest data repository of
    mass spectrometry-based proteomics data. PRIDE is one of the founding
    members of the global ProteomeXchange (PX) consortium and an ELIXIR core
    data resource. In this manuscript, we summarize the developments in PRIDE
    resources and related tools since the previous update manuscript was
    published in Nucleic Acids Research in 2019. The number of submitted
    datasets to PRIDE Archive (the archival component of PRIDE) has reached on
    average around 500 datasets per month during 2021. In addition to
    continuous improvements in PRIDE Archive data pipelines and infrastructure,
    the PRIDE Spectra Archive has been developed to provide direct access to
    the submitted mass spectra using Universal Spectrum Identifiers. As a key
    point, the file format MAGE-TAB for proteomics has been developed to enable
    the improvement of sample metadata annotation. Additionally, the resource
    PRIDE Peptidome provides access to aggregated peptide/protein evidences
    across PRIDE Archive. Furthermore, we will describe how PRIDE has increased
    its efforts to reuse and disseminate high-quality proteomics data into
    other added-value resources such as UniProt, Ensembl and Expression Atlas.
  },
}
@article{ProteomicsML2022,
  title     = {
    {ProteomicsML}: An Online Platform for Community-Curated Datasets and
    Tutorials for Machine Learning in Proteomics
  },
  author    = {
    Rehfeldt, Tobias Greisager and Gabriels, Ralf and Bouwmeester, Robbin and
    Gessulat, Siegfried and Neely, Benjamin and Palmblad, Magnus and
    Perez-Riverol, Yasset and Schmidt, Tobias and Vizca{\'\i}no, Juan Antonio
    and Deutsch, Eric W.
  },
  year      = 2022,
  month     = oct,
  journal   = {ChemRxiv},
  publisher = {American Chemical Society ({ACS})},
  doi       = {10.26434/chemrxiv-2022-2s6kx},
  url       = {https://doi.org/10.26434/chemrxiv-2022-2s6kx}
}
@article{Rehfeldt2021-iw,
  title    = {
    {MS2AI}: Automated repurposing of public peptide {LC-MS} data for machine
    learning applications
  },
  author   = {
    Rehfeldt, Tobias Greisager and Krawczyk, Konrad and B{\o}gebjerg, Mathias
    and Schw{\"a}mmle, Veit and R{\"o}ttger, Richard
  },
  year     = 2021,
  month    = oct,
  journal  = {Bioinformatics},
  doi      = {10.1093/bioinformatics/btab701},
  abstract = {
    MOTIVATION: Liquid-chromatography mass-spectrometry (LC-MS) is the
    established standard for analyzing the proteome in biological samples by
    identification and quantification of thousands of proteins. Machine
    learning (ML) promises to considerably improve the analysis of the
    resulting data, however, there is yet to be any tool that mediates the path
    from raw data to modern ML applications. More specifically, ML applications
    are currently hampered by three major limitations: (1) absence of balanced
    training data with large sample size; (2) unclear definition of
    sufficiently information-rich data representations for e.g., peptide
    identification; (3) lack of benchmarking of ML methods on specific LC-MS
    problems. RESULTS: We created the MS2AI pipeline that automates the process
    of gathering vast quantities of mass spectrometry (MS) data for large scale
    ML applications. The software retrieves raw data from either in-house
    sources or from the proteomics identifications database, PRIDE.
    Subsequently, the raw data is stored in a standardized format amenable for
    ML, encompassing MS1/MS2 spectra and peptide identifications. This tool
    bridges the gap between MS and AI, and to this effect we also present an ML
    application in the form of a convolutional neural network for the
    identification of oxidized peptides. AVAILABILITY: An open-source
    implementation of the software can be found at
    https://gitlab.com/roettgerlab/ms2ai. SUPPLEMENTARY INFORMATION:
    Supplementary data are available at Bioinformatics online.
  },
  language = {en}
}
@article{Shvartsburg2008-ir,
  title    = {Fundamentals of traveling wave ion mobility spectrometry},
  author   = {Shvartsburg, Alexandre A and Smith, Richard D},
  year     = 2008,
  month    = dec,
  journal  = {Anal. Chem.},
  volume   = 80,
  number   = 24,
  pages    = {9689--9699},
  doi      = {10.1021/ac8016295},
  abstract = {
    Traveling wave ion mobility spectrometry (TW IMS) is a new IMS method
    implemented in the Synapt IMS/mass spectrometry system (Waters). Despite
    its wide adoption, the foundations of TW IMS were only qualitatively
    understood and factors governing the ion transit time (the separation
    parameter) and resolution remained murky. Here we develop the theory of TW
    IMS using derivations and ion dynamics simulations. The key parameter is
    the ratio (c) of ion drift velocity at the steepest wave slope to wave
    speed. At low c, the ion transit velocity is proportional to the squares of
    mobility (K) and electric field intensity (E), as opposed to linear scaling
    in drift tube (DT) IMS and differential mobility analyzers. At higher c,
    the scaling deviates from quadratic in a way controlled by the waveform
    profile, becoming more gradual with the ideal triangular profile but first
    steeper and then more gradual for realistic profiles with variable E. At
    highest c, the transit velocity asymptotically approaches the wave speed.
    Unlike with DT IMS, the resolving power of TW IMS depends on mobility,
    scaling as $K^{1/2}$ in the low-c limit and less at higher c. A nonlinear
    dependence of the transit time on mobility means that the true resolving
    power of TW IMS differs from that indicated by the spectrum. A near-optimum
    resolution is achievable over an approximately 300--400\% range of
    mobilities. The major predicted trends are in agreement with TW IMS
    measurements for peptide ions as a function of mobility, wave amplitude,
    and gas pressure. The issues of proper TW IMS calibration and ion
    distortion by field heating are also discussed. The new quantitative
    understanding of TW IMS separations allows rational optimization of
    instrument design and operation and improved spectral calibration.
  },
  language = {en}
}
@article{tyanova2016-ma,
  title     = {
    The {MaxQuant} computational platform for mass spectrometry-based shotgun
    proteomics
  },
  author    = {Tyanova, Stefka and Temu, Tikira and Cox, Juergen},
  year      = 2016,
  journal   = {Nature Protocols},
  publisher = {Nature Publishing Group},
  volume    = 11,
  number    = 12,
  pages     = {2301--2319},
  doi       = {10.1038/nprot.2016.136}
}
@article{Van_Puyvelde2022-nv,
title = {
A comprehensive {LFQ} benchmark dataset on modern day acquisition
strategies in proteomics
},
author = {
Van Puyvelde, Bart and Daled, Simon and Willems, Sander and Gabriels, Ralf
and Gonzalez de Peredo, Anne and Chaoui, Karima and Mouton-Barbosa,
Emmanuelle and Bouyssi{\'e}, David and Boonen, Kurt and Hughes, Christopher
J and Gethings, Lee A and Perez-Riverol, Yasset and Bloomfield, Nic and
Tate, Stephen and Schiltz, Odile and Martens, Lennart and Deforce, Dieter
and Dhaenens, Maarten
},
year = 2022,
month = mar,
journal = {Sci Data},
volume = 9,
number = 1,
pages = 126,
doi = {10.1038/s41597-022-01216-6},
abstract = {