These are the solutions to the exercises found in Common Genomic Data Wrangling Tasks.
Data
We need to read in three files from HelloRangesData
- exons.bed (RefSeq hg19 annotated exons)
- cpg.bed (RefSeq hg19 annotated CpG islands)
- hesc.chromHmm.bed (predicted functional elements from chromHMM program)
suppressPackageStartupMessages(library(plyranges))
# our genome build
build <- system.file("extdata", "hg19.genome", package="HelloRangesData") %>%
read.delim(header = FALSE, col.names = c("seqnames", "width")) %>%
as_granges(start = 1) %>%
set_genome_info(genome = "hg19",
seqlengths = width(.))
exons <- system.file("extdata", "exons.bed", package="HelloRangesData") %>%
read_bed(genome_info = build)
cpg <- system.file("extdata", "cpg.bed", package="HelloRangesData") %>%
read_bed(genome_info = build)
hesc <- system.file("extdata", "hesc.chromHmm.bed", package="HelloRangesData") %>%
read_bed(genome_info = build)
Solutions
Filtering and mutating
- Modify the above example so only CpG islands that are completely within exons are included in the result.
cpg %>%
mutate(is_contained = . %within% exons) %>%
filter(is_contained)
## GRanges object with 2235 ranges and 2 metadata columns:
## seqnames ranges strand | name is_contained
## <Rle> <IRanges> <Rle> | <character> <logical>
## [1] chr1 135125-135563 * | CpG:_30 TRUE
## [2] chr1 327791-328229 * | CpG:_29 TRUE
## [3] chr1 788864-789211 * | CpG:_28 TRUE
## [4] chr1 990275-990694 * | CpG:_29 TRUE
## [5] chr1 1132828-1133218 * | CpG:_40 TRUE
## ... ... ... ... . ... ...
## [2231] chrY 171535-171828 * | CpG:_23 TRUE
## [2232] chrY 1496664-1497002 * | CpG:_32 TRUE
## [2233] chrY 2356771-2358703 * | CpG:_133 TRUE
## [2234] chrY 15815489-15815779 * | CpG:_26 TRUE
## [2235] chrY 16941823-16942188 * | CpG:_32 TRUE
## -------
## seqinfo: 93 sequences from hg19 genome
# or just
# cpg %>% filter(. %within% exons)
- Create a new column in exons called tx_id (the transcript id is everything before _exon in name).
# we can use the sub function in base R
exons <- exons %>%
mutate(tx_id = sub("_exon.*", "", name))
Modifying genomic regions
- Create a new GRanges object from CpG islands that stretches the intervals on either side by their width while leaving the centre of the interval fixed.
cpg_stretch <- cpg %>%
anchor_centre() %>%
mutate(width = 2*width)
# alternative is to use `stretch()`
- Create a new GRanges object from exons that has only non-exonic regions.
no_exon <- exons %>% complement_ranges()
- Create a new GRanges object that represent 2bp canonical splice sites on either side of exon.
# you can do this directly with mutate and anchors
left <- exons %>%
anchor_start() %>%
mutate(start = start - 2L, width = 2L)
right <- exons %>%
anchor_end() %>%
mutate(end = end + 2L,
width = 2L)
sites <- bind_ranges(list(left = left, right = right), .id = "side")
# or with flank_* family of functions
identical(exons %>% flank_left(2), left)
## [1] TRUE
identical(exons %>% flank_right(2), right)
## [1] TRUE
Summarising GRanges objects
- How many ranges are there for each predicted state?
- How many base pairs are represented in the genome for each predicted state? Which state has the maximum number of bases?
state_summary <- hesc %>%
group_by(name) %>%
summarise(
n_ranges = n(),
n_bases = sum(width)
)
state_summary %>%
as.data.frame() %>%
filter(n_bases == max(n_bases))
## name n_ranges n_bases
## 1 13_Heterochrom/lo 91228 1992618522
Overlaps
- Create a new GRanges object, that has exons that are completely within Enhancers elements of hesc. How many exons are there?
enhancers <- hesc %>%
filter(grepl("Enhancer", name))
exon_within <- join_overlap_inner_within(exons, enhancers)
exon_within
## GRanges object with 13746 ranges and 4 metadata columns:
## seqnames ranges strand |
## <Rle> <IRanges> <Rle> |
## [1] chr1 948847-948956 + |
## [2] chr1 1051440-1051736 - |
## [3] chr1 1109286-1109306 + |
## [4] chr1 1109804-1109869 + |
## [5] chr1 1219358-1219470 + |
## ... ... ... ... .
## [13742] chrX 153927569-153927788 - |
## [13743] chrX 153927569-153927788 - |
## [13744] chrX 153927569-153927788 - |
## [13745] chrX 153927569-153927788 - |
## [13746] chrX 154114409-154114577 - |
## name.x score tx_id
## <character> <numeric> <character>
## [1] NM_005101_exon_0_0_chr1_948847_f 0 NM_005101
## [2] NM_017891_exon_9_0_chr1_1051440_r 0 NM_017891
## [3] NM_001130045_exon_0_0_chr1_1109286_f 0 NM_001130045
## [4] NM_001130045_exon_2_0_chr1_1109804_f 0 NM_001130045
## [5] NM_001130413_exon_4_0_chr1_1219358_f 0 NM_001130413
## ... ... ... ...
## [13742] NM_001081573_exon_4_0_chrX_153927569_r 0 NM_001081573
## [13743] NM_001282283_exon_3_0_chrX_153927569_r 0 NM_001282283
## [13744] NM_080612_exon_4_0_chrX_153927569_r 0 NM_080612
## [13745] NR_104114_exon_4_0_chrX_153927569_r 0 NR_104114
## [13746] NM_019863_exon_4_0_chrX_154114409_r 0 NM_019863
## name.y
## <character>
## [1] 4_Strong_Enhancer
## [2] 6_Weak_Enhancer
## [3] 6_Weak_Enhancer
## [4] 6_Weak_Enhancer
## [5] 7_Weak_Enhancer
## ... ...
## [13742] 6_Weak_Enhancer
## [13743] 6_Weak_Enhancer
## [13744] 6_Weak_Enhancer
## [13745] 6_Weak_Enhancer
## [13746] 6_Weak_Enhancer
## -------
## seqinfo: 93 sequences from hg19 genome
- Use
join_overlap_intersect()
to filter exons if at least 50 per cent of their bases are overlapped by enhancer elements.
exons %>%
mutate(total_length = width) %>%
join_overlap_intersect(enhancers) %>%
filter(width / total_length > 0.5)
## GRanges object with 21151 ranges and 5 metadata columns:
## seqnames ranges strand |
## <Rle> <IRanges> <Rle> |
## [1] chr1 35738-35937 - |
## [2] chr1 35738-35937 - |
## [3] chr1 910938-911537 - |
## [4] chr1 911938-912004 - |
## [5] chr1 948847-948956 + |
## ... ... ... ... .
## [21147] chrX 153927569-153927788 - |
## [21148] chrX 154114409-154114577 - |
## [21149] chrX 154300607-154300618 + |
## [21150] chrX 154300607-154300618 + |
## [21151] chrX 154300607-154300618 + |
## name.x score tx_id
## <character> <numeric> <character>
## [1] NR_026818_exon_2_0_chr1_35721_r 0 NR_026818
## [2] NR_026820_exon_2_0_chr1_35721_r 0 NR_026820
## [3] NR_027693_exon_0_0_chr1_910579_r 0 NR_027693
## [4] NR_027693_exon_1_0_chr1_911879_r 0 NR_027693
## [5] NM_005101_exon_0_0_chr1_948847_f 0 NM_005101
## ... ... ... ...
## [21147] NR_104114_exon_4_0_chrX_153927569_r 0 NR_104114
## [21148] NM_019863_exon_4_0_chrX_154114409_r 0 NM_019863
## [21149] NM_001018055_exon_1_0_chrX_154300602_f 0 NM_001018055
## [21150] NM_001242640_exon_1_0_chrX_154300602_f 0 NM_001242640
## [21151] NM_024332_exon_1_0_chrX_154300602_f 0 NM_024332
## total_length name.y
## <integer> <character>
## [1] 361 7_Weak_Enhancer
## [2] 361 7_Weak_Enhancer
## [3] 1071 6_Weak_Enhancer
## [4] 126 6_Weak_Enhancer
## [5] 110 4_Strong_Enhancer
## ... ... ...
## [21147] 220 6_Weak_Enhancer
## [21148] 169 6_Weak_Enhancer
## [21149] 17 6_Weak_Enhancer
## [21150] 17 6_Weak_Enhancer
## [21151] 17 6_Weak_Enhancer
## -------
## seqinfo: 93 sequences from hg19 genome
- Count the number of each enhancer element type that are exonic.
There are several ways of doing this, but see if you can come up with a
solution using
join_overlap_left()
+disjoin_ranges()
.
enhancers %>%
join_overlap_left(exons) %>%
group_by(name.x) %>%
disjoin_ranges(olap_any = all(!is.na(tx_id))) %>%
group_by(name.x) %>%
summarise(prop = sum(olap_any) / n())
## DataFrame with 4 rows and 2 columns
## name.x prop
## <character> <numeric>
## 1 4_Strong_Enhancer 0.129369300911854
## 2 5_Strong_Enhancer 0.0830471433031533
## 3 6_Weak_Enhancer 0.0921656261823685
## 4 7_Weak_Enhancer 0.0614762445084519