suppressPackageStartupMessages({
library(pasillaBamSubset)
})
# Path to a bam file with paired-end reads
(un3 <- untreated3_chr4())
#> [1] "/Users/runner/work/_temp/Library/pasillaBamSubset/extdata/untreated3_chr4.bam"How to read paired-end reads from a BAM file
2025-06-19
Source:vignettes/how-to-read-paired-end-reads-from-bam-file.qmd
This HowTo has been adapted from the list of HowTos provided in the vignette for the GenomicRanges Bioconductor package.
Bioconductor packages used in this document
How to read paired-end reads from a BAM file
For illustration, we use data from the pasillaBamSubset data package.
Paired-end reads can be loaded with the GenomicAlignments::readGAlignmentPairs or GenomicAlignments::readGAlignmentsList functions from the GenomicAlignments package. These functions use the same mate pairing algorithm but output different objects.
Let’s start with GenomicAlignments::readGAlignmentPairs:
suppressPackageStartupMessages({
library(GenomicAlignments)
})
gapairs <- readGAlignmentPairs(un3)
class(gapairs)
#> [1] "GAlignmentPairs"
#> attr(,"package")
#> [1] "GenomicAlignments"
gapairs
#> GAlignmentPairs object with 75409 pairs, strandMode=1, and 0 metadata columns:
#> seqnames strand : ranges -- ranges
#> <Rle> <Rle> : <IRanges> -- <IRanges>
#> [1] chr4 + : 169-205 -- 326-362
#> [2] chr4 + : 943-979 -- 1086-1122
#> [3] chr4 + : 944-980 -- 1119-1155
#> [4] chr4 + : 946-982 -- 986-1022
#> [5] chr4 + : 966-1002 -- 1108-1144
#> ... ... ... ... ... ... ...
#> [75405] chr4 - : 1348209-1348245 -- 1348063-1348099
#> [75406] chr4 - : 1348211-1348247 -- 1348077-1348113
#> [75407] chr4 + : 1348217-1348253 -- 1348215-1348251
#> [75408] chr4 + : 1349196-1349232 -- 1349326-1349362
#> [75409] chr4 + : 1349708-1349744 -- 1349838-1349874
#> -------
#> seqinfo: 8 sequences from an unspecified genomeThe GAlignmentPairs class holds only pairs; reads with no mate or with ambiguous pairing are discarded. Each list element holds exactly 2 records (a mated pair). Records can be accessed as the first and last segments in a template or as left and right alignments. See ?GAlignmentPairs in the GenomicAlignments package for more information.
For readGAlignmentsList, mate pairing is performed when asMates is set to TRUE on the BamFile object, otherwise records are treated as single-end.
galist <- readGAlignmentsList(BamFile(un3, asMates = TRUE))
galist
#> GAlignmentsList object of length 96636:
#> [[1]]
#> GAlignments object with 2 alignments and 0 metadata columns:
#> seqnames strand cigar qwidth start end width
#> <Rle> <Rle> <character> <integer> <integer> <integer> <integer>
#> [1] chr4 + 37M 37 169 205 37
#> [2] chr4 - 37M 37 326 362 37
#> njunc
#> <integer>
#> [1] 0
#> [2] 0
#> -------
#> seqinfo: 8 sequences from an unspecified genome
#>
#> [[2]]
#> GAlignments object with 2 alignments and 0 metadata columns:
#> seqnames strand cigar qwidth start end width
#> <Rle> <Rle> <character> <integer> <integer> <integer> <integer>
#> [1] chr4 + 37M 37 946 982 37
#> [2] chr4 - 37M 37 986 1022 37
#> njunc
#> <integer>
#> [1] 0
#> [2] 0
#> -------
#> seqinfo: 8 sequences from an unspecified genome
#>
#> [[3]]
#> GAlignments object with 2 alignments and 0 metadata columns:
#> seqnames strand cigar qwidth start end width
#> <Rle> <Rle> <character> <integer> <integer> <integer> <integer>
#> [1] chr4 + 37M 37 943 979 37
#> [2] chr4 - 37M 37 1086 1122 37
#> njunc
#> <integer>
#> [1] 0
#> [2] 0
#> -------
#> seqinfo: 8 sequences from an unspecified genome
#>
#> ...
#> <96633 more elements>GAlignmentsList is a more general ‘list-like’ structure that holds mate pairs as well as nonmates (i.e., singletons, records with unmapped mates etc). A mates_status metadata column (accessed with mcols) indicates which records were paired.
Non-mated reads are returned as groups by QNAME and contain any number of records. Here the non-mate groups range in size from 1 to 9.
non_mates <- galist[unlist(mcols(galist)$mate_status) == "unmated"]
table(elementNROWS(non_mates))
#>
#> 1 2 3 4 5 6 7 8 9
#> 18191 2888 69 60 7 8 2 1 1Session info
Click to display session info
sessionInfo()
#> R version 4.5.1 (2025-06-13)
#> Platform: x86_64-apple-darwin20
#> Running under: macOS Ventura 13.7.6
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> time zone: UTC
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] GenomicAlignments_1.45.0 Rsamtools_2.25.0
#> [3] Biostrings_2.77.1 XVector_0.49.0
#> [5] SummarizedExperiment_1.39.0 Biobase_2.69.0
#> [7] MatrixGenerics_1.21.0 matrixStats_1.5.0
#> [9] GenomicRanges_1.61.0 GenomeInfoDb_1.45.3
#> [11] IRanges_2.43.0 S4Vectors_0.47.0
#> [13] BiocGenerics_0.55.0 generics_0.1.4
#> [15] pasillaBamSubset_0.47.0 BiocStyle_2.37.0
#>
#> loaded via a namespace (and not attached):
#> [1] Matrix_1.7-3 jsonlite_2.0.0 compiler_4.5.1
#> [4] BiocManager_1.30.25 crayon_1.5.3 bitops_1.0-9
#> [7] parallel_4.5.1 BiocParallel_1.43.2 yaml_2.3.10
#> [10] fastmap_1.2.0 lattice_0.22-7 R6_2.6.1
#> [13] S4Arrays_1.9.0 knitr_1.50 DelayedArray_0.35.1
#> [16] rlang_1.1.6 xfun_0.52 SparseArray_1.9.0
#> [19] cli_3.6.5 digest_0.6.37 grid_4.5.1
#> [22] evaluate_1.0.3 codetools_0.2-20 abind_1.4-8
#> [25] rmarkdown_2.29 httr_1.4.7 tools_4.5.1
#> [28] htmltools_0.5.8.1 UCSC.utils_1.5.0