Introduction

This Vignette describes how the prozor::greedy function can be used to infer proteins form peptide identifications using the Occam’s razor principle. The method tries to find a minimal set of porteins which can explain all the peptides identified.

library(prozor)

rm(list = ls())

file = system.file("extdata/IDResults.txt.gz" , package = "prozor")
specMeta <- readr::read_tsv(file)
head(specMeta)

## # A tibble: 6 x 12
##   RefSpectraId numPeaks peptideSeq     precursorCharge precursorMZ retentionTime
##          <dbl>    <dbl> <chr>                    <dbl>       <dbl>         <dbl>
## 1         9908       57 GPDVLTATVSGK                 2        573.          73.6
## 2        36028       70 VPQVSTPTLVEVSR               3        505.          93.3
## 3        74786      177 EVQLVETGGGLIQ~               3        637.         121. 
## 4        53362      184 AQPVQVAEGSEPD~               3        758.         129. 
## 5        48668      157 KYLYEIAR                     2        528.          69.8
## 6        90153      158 AQLVPLPPSTYVE~               3        860.         137. 
## # ... with 6 more variables: copies <dbl>, peptideModSeq <chr>, score <dbl>,
## #   lengthPepSeq <dbl>, fileName <dbl>, SpecIDinFile <dbl>

Annotate peptide sequences with protein sequences from two fasta files on with reviewed entries only (sp) and the other with reviewed and Trembl entries (sp/tr).

length(unique(specMeta$peptideSeq))

## [1] 1520

upeptide <- unique(specMeta$peptideSeq)

resAll <-
    prozor::readPeptideFasta(
        system.file("p1000_db1_example/Annotation_allSeq.fasta.gz" , package = "prozor"))

resCan <-
    prozor::readPeptideFasta(
        system.file("p1000_db1_example/Annotation_canSeq.fasta.gz" , package = "prozor"))

annotAll <- prozor::annotatePeptides(upeptide, resAll)

## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

head(subset(annotAll,select =  -proteinSequence))

##             peptideSeq                      proteinID Offset matched
## 1 AACAQLNDFLQEYGTQGCQV         sp|P0C0L4-2|CO4A_HUMAN   1679    TRUE
## 2 AACAQLNDFLQEYGTQGCQV tr|A0A140TA49|A0A140TA49_HUMAN   1679    TRUE
## 3 AACAQLNDFLQEYGTQGCQV tr|A0A140TA29|A0A140TA29_HUMAN   1679    TRUE
## 4 AACAQLNDFLQEYGTQGCQV         tr|F5GXS0|F5GXS0_HUMAN   1679    TRUE
## 5 AACAQLNDFLQEYGTQGCQV tr|A0A140TA44|A0A140TA44_HUMAN   1679    TRUE
## 6 AACAQLNDFLQEYGTQGCQV tr|A0A0G2JPR0|A0A0G2JPR0_HUMAN   1725    TRUE
##   lengthPeptide
## 1            20
## 2            20
## 3            20
## 4            20
## 5            20
## 6            20

annotCan <- prozor::annotatePeptides(upeptide, resCan)
barplot(c(All = length(resAll),Canonical = length(resCan)))

Number of proteins in the All and Canonical database.

barplot(c(All = nrow(annotAll), Canonical = nrow(annotCan)), ylab = "# peptide protein matches.")

Number of unique peptide protein pairs for the All and Canonical database.

We can see that using the larger fasta database reduces the proportion of proteotypic peptides.

PCProteotypic_all <-
    sum(table(annotAll$peptideSeq) == 1) / length(table(annotAll$peptideSeq)) * 100
PCProteotypic_canonical <-
    sum(table(annotCan$peptideSeq) == 1) / length(table(annotCan$peptideSeq)) * 100

barplot(
    c(All = PCProteotypic_all, Canonical =  PCProteotypic_canonical),
    las = 2,
    ylab = "% proteotypic"
)

Protein Inference

We can now identify a minmal set of proteins explaining all the peptides observed for both databases

library(Matrix)
precursors <-
    unique(subset(specMeta, select = c(
        peptideModSeq, precursorCharge, peptideSeq
    )))

Protein Inference for database with Trembl identifiers

library(Matrix)
annotatedPrecursors <- merge(precursors ,
                             subset(annotAll, select = c(peptideSeq, proteinID)),
                             by.x = "peptideSeq",
                             by.y = "peptideSeq")


xx <-
    prepareMatrix(annotatedPrecursors,
                  proteinID = "proteinID",
                  peptideID = "peptideSeq")

image(xx)

Peptide protein machtes for the All database. Rows - peptides, Columns - proteins, black - peptide protein match.

xxAll <- greedy(xx)

Protein Inference for Reviewed/Canonical database

annotatedPrecursors <-
    merge(precursors ,
          subset(annotCan, select = c(peptideSeq, proteinID)),
          by.x = "peptideSeq",
          by.y = "peptideSeq")


xx <-
    prepareMatrix(annotatedPrecursors ,
                  proteinID = "proteinID",
                  peptideID = "peptideSeq")
image(xx)

Peptide protein machtes for the Canonical database. Rows - peptides, Columns - proteins, black - peptide protein match.

xxCAN <- greedy(xx)

Conclusion

We see that the number of proteins needed to explain all the peptides is practically identical for both databases. Also in practice using a database with more entries does not lead to more identified proteins. On the contrary, it might even reduce the number of porteins identified.

barplot(c(All_before = length(unique(annotAll$proteinID)), All_after = length(unique(unlist(
    xxAll
))) , Canonical_before =  length(unique(annotCan$proteinID)), Canonical_after = length(unique(unlist(
    xxCAN
)))))

Number of proteins before and after protein inference.

Session Info

sessionInfo()

## R version 4.1.1 (2021-08-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=C                          
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] Matrix_1.3-4 prozor_0.3.1
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.7            highr_0.9             bslib_0.3.1          
##  [4] compiler_4.1.1        pillar_1.6.4          jquerylib_0.1.4      
##  [7] tools_4.1.1           bit_4.0.4             digest_0.6.28        
## [10] docopt_0.7.1          jsonlite_1.7.2        evaluate_0.14        
## [13] lifecycle_1.0.1       tibble_3.1.4          lattice_0.20-44      
## [16] AhoCorasickTrie_0.1.2 pkgconfig_2.0.3       rlang_0.4.11         
## [19] DBI_1.1.1             cli_3.1.0             parallel_4.1.1       
## [22] yaml_2.2.1            xfun_0.26             fastmap_1.1.0        
## [25] dplyr_1.0.7           stringr_1.4.0         knitr_1.36           
## [28] generics_0.1.1        sass_0.4.0            vctrs_0.3.8          
## [31] hms_1.1.1             tidyselect_1.1.1      bit64_4.0.5          
## [34] ade4_1.7-18           grid_4.1.1            glue_1.4.2           
## [37] R6_2.5.1              fansi_0.5.0           vroom_1.5.6          
## [40] rmarkdown_2.11        tzdb_0.1.2            purrr_0.3.4          
## [43] readr_2.0.1           seqinr_4.2-8          magrittr_2.0.1       
## [46] htmltools_0.5.2       ellipsis_0.3.2        MASS_7.3-54          
## [49] assertthat_0.2.1      utf8_1.2.2            stringi_1.7.4        
## [52] crayon_1.4.2

Peptide Annotation and Protein Inference

Witold Wolski

March 16, 2017