diff --git a/episodes/06-biological-sequences.Rmd b/episodes/06-biological-sequences.Rmd index d8b3429c..e2625d82 100644 --- a/episodes/06-biological-sequences.Rmd +++ b/episodes/06-biological-sequences.Rmd @@ -390,6 +390,132 @@ sequence that was produced by the `translate()` method. :::::::::::::::::::::::::::::::::::::::::::::::::: +## The BSgenome package + +### Overview + +In the Bioconductor project, the `r BiocStyle::Biocpkg("BSgenome")` package +provides software infrastructure for efficient representation of full genome +and their single-nucleotide polymorphisms. + +The `r BiocStyle::Biocpkg("BSgenome")` package itself does not contain any +genome sequence itself, but provides functionality to access genome sequences +available in other Bioconductor packages, as we demonstrate in the next section. + +### First steps + +To get started, we load the package. + +```{r, message=FALSE} +library(BSgenome) +``` + +With the package loaded and attached to the session, we have access to the +package functions. + +In particular, the function `BSgenome::available.genomes()` can be used to +display the names of Bioconductor packages that contain genome sequences. + +```{r} +available.genomes() +``` + +### Installing BSgenome packages + +To use one of the available genomes, the corresponding package must be installed +first. +For instance, the example below demonstrates how the data package +`r BiocStyle::Biocpkg("BSgenome.Hsapiens.UCSC.hg38.masked")` can be installed +using the function `BiocManager::install()` that we have seen before. + +```{r, eval=FALSE} +BiocManager::install("BSgenome.Hsapiens.UCSC.hg38.masked") +``` + +### Using BSgenome packages + +Once installed, BSgenome packages can be loaded like any other R package, +using the `library()` function. + +```{r, message=FALSE} +library(BSgenome.Hsapiens.UCSC.hg38.masked) +``` + +Each BSgenome package contains an object that is named identically to the +package and contains the genome sequence. + +Having loaded the package +`r BiocStyle::Biocpkg("BSgenome.Hsapiens.UCSC.hg38.masked")` above, we can +display the BSgenome object as follows. + +```{r} +BSgenome.Hsapiens.UCSC.hg38.masked +``` + +Given the length and the complexity of the object name, it is common practice +to assign a copy of BSgenome objects to a new object simply called `genome`. + +```{r} +genome <- BSgenome.Hsapiens.UCSC.hg38.masked +``` + +### Using BSgenome objects + +When printing BSgenome objects in the console (see above), some helpful tips +are displayed under the object itself, hinting at functions commonly used to +access information in the object. + +For instance, the function `seqnames()` can be used get the list of sequence +names (i.e., chromosomes and contigs) present in the object. + +```{r} +seqnames(genome) +``` + +Similarly, the function `seqinfo()` can be used to get the full sequence +information stored in the object. + +```{r} +seqinfo(genome) +``` + +Finally, the nature of BSgenome objects being akin to a list of sequences, +the operators `$` and `[[]]` can both be used to extract individual sequences +from the BSgenome object. + +```{r} +genome$chr1 +``` + +For instance, we can extract the sequence of the Y chromosome and assign it +to a new object `chrY`. + +```{r} +chrY <- genome[["chrY"]] +``` + +### Using genome sequences + +From this point, genome sequences can be treated very much like biological +strings (e.g. `DNAString`) described earlier, in the +`r BiocStyle::Biocpkg("Biostrings")` package. + +For instance, the function `countPattern()` can be used to count the number of +occurences of a given pattern in a given genome sequence. + +```{r} +countPattern(pattern = "CANNTG", subject = chrY, fixed = FALSE) +``` + +::::::::::::::::::::::::::::::::::::::::: callout + +### Note + +In the example above, the argument `fixed = FALSE` is used to indicate that the +pattern contain [IUPAC ambiguity codes][external-iupac]. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + [glossary-s4-class]: reference.html#s4-class [crossref-s4]: 05-s4.html [external-iupac]: https://en.wikipedia.org/wiki/Nucleic_acid_notation#IUPAC_notation diff --git a/renv/profiles/lesson-requirements/renv.lock b/renv/profiles/lesson-requirements/renv.lock index 3de4d78b..d4fd3c51 100644 --- a/renv/profiles/lesson-requirements/renv.lock +++ b/renv/profiles/lesson-requirements/renv.lock @@ -47,6 +47,51 @@ "Repository": "CRAN", "Hash": "a8235afbcd6316e6e91433ea47661013" }, + "BSgenome": { + "Package": "BSgenome", + "Version": "1.68.0", + "Source": "Bioconductor", + "Requirements": [ + "BiocGenerics", + "Biostrings", + "GenomeInfoDb", + "GenomicRanges", + "IRanges", + "R", + "Rsamtools", + "S4Vectors", + "XVector", + "matrixStats", + "methods", + "rtracklayer", + "stats", + "utils" + ], + "Hash": "9d7c3c9b904c28bc971ed29d3f3b445e" + }, + "BSgenome.Hsapiens.UCSC.hg38": { + "Package": "BSgenome.Hsapiens.UCSC.hg38", + "Version": "1.4.5", + "Source": "Bioconductor", + "Requirements": [ + "BSgenome", + "GenomeInfoDb", + "R" + ], + "Hash": "0ba28cc20a4f8629fbb30d0bf1a133ac" + }, + "BSgenome.Hsapiens.UCSC.hg38.masked": { + "Package": "BSgenome.Hsapiens.UCSC.hg38.masked", + "Version": "1.4.5", + "Source": "Bioconductor", + "Requirements": [ + "BSgenome", + "BSgenome.Hsapiens.UCSC.hg38", + "GenomeInfoDb", + "R" + ], + "Hash": "f165a75d7b8ad3c36692070e50d43a09" + }, "Biobase": { "Package": "Biobase", "Version": "2.60.0", @@ -1117,7 +1162,7 @@ }, "xfun": { "Package": "xfun", - "Version": "0.43", + "Version": "0.44", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1125,7 +1170,7 @@ "stats", "tools" ], - "Hash": "ab6371d8653ce5f2f9290f4ec7b42a8e" + "Hash": "317a0538d32f4a009658bcedb7923f4b" }, "xml2": { "Package": "xml2",