These are the solutions to the exercises found in Common Genomic Data Wrangling Tasks.

Data

We need to read in three files from HelloRangesData

exons.bed (RefSeq hg19 annotated exons)
cpg.bed (RefSeq hg19 annotated CpG islands)
hesc.chromHmm.bed (predicted functional elements from chromHMM program)

suppressPackageStartupMessages(library(plyranges))
# our genome build
build <- system.file("extdata", "hg19.genome", package="HelloRangesData") %>% 
  read.delim(header = FALSE, col.names = c("seqnames", "width")) %>% 
  as_granges(start = 1) %>% 
  set_genome_info(genome = "hg19", 
                  seqlengths = width(.))

exons <- system.file("extdata", "exons.bed", package="HelloRangesData") %>% 
  read_bed(genome_info = build)

cpg <- system.file("extdata", "cpg.bed", package="HelloRangesData") %>% 
  read_bed(genome_info = build)

hesc <- system.file("extdata", "hesc.chromHmm.bed", package="HelloRangesData") %>% 
  read_bed(genome_info = build)

Solutions

Filtering and mutating

Modify the above example so only CpG islands that are completely within exons are included in the result.

cpg %>% 
  mutate(is_contained = . %within% exons) %>% 
  filter(is_contained)

## GRanges object with 2235 ranges and 2 metadata columns:
##          seqnames            ranges strand |        name is_contained
##             <Rle>         <IRanges>  <Rle> | <character>    <logical>
##      [1]     chr1     135125-135563      * |     CpG:_30         TRUE
##      [2]     chr1     327791-328229      * |     CpG:_29         TRUE
##      [3]     chr1     788864-789211      * |     CpG:_28         TRUE
##      [4]     chr1     990275-990694      * |     CpG:_29         TRUE
##      [5]     chr1   1132828-1133218      * |     CpG:_40         TRUE
##      ...      ...               ...    ... .         ...          ...
##   [2231]     chrY     171535-171828      * |     CpG:_23         TRUE
##   [2232]     chrY   1496664-1497002      * |     CpG:_32         TRUE
##   [2233]     chrY   2356771-2358703      * |    CpG:_133         TRUE
##   [2234]     chrY 15815489-15815779      * |     CpG:_26         TRUE
##   [2235]     chrY 16941823-16942188      * |     CpG:_32         TRUE
##   -------
##   seqinfo: 93 sequences from hg19 genome

# or just 
# cpg %>% filter(. %within% exons)

Create a new column in exons called tx_id (the transcript id is everything before _exon in name).

# we can use the sub function in base R
exons <- exons %>% 
  mutate(tx_id = sub("_exon.*", "", name))

Modifying genomic regions

Create a new GRanges object from CpG islands that stretches the intervals on either side by their width while leaving the centre of the interval fixed.

cpg_stretch <- cpg %>% 
  anchor_centre() %>% 
  mutate(width = 2*width)

# alternative is to use `stretch()`

Create a new GRanges object from exons that has only non-exonic regions.

no_exon <- exons %>% complement_ranges()

Create a new GRanges object that represent 2bp canonical splice sites on either side of exon.

# you can do this directly with mutate and anchors
left <- exons %>% 
  anchor_start() %>% 
  mutate(start = start - 2L, width = 2L)

right <- exons %>% 
  anchor_end() %>% 
  mutate(end = end + 2L,
         width = 2L)

sites <- bind_ranges(list(left = left, right = right), .id = "side")

# or with flank_* family of functions
identical(exons %>% flank_left(2), left)

## [1] TRUE

identical(exons %>% flank_right(2), right)

## [1] TRUE

Summarising GRanges objects

How many ranges are there for each predicted state?
How many base pairs are represented in the genome for each predicted state? Which state has the maximum number of bases?

state_summary <- hesc %>% 
  group_by(name) %>% 
  summarise(
    n_ranges = n(), 
    n_bases = sum(width)
  )

state_summary %>% 
  as.data.frame() %>% 
  filter(n_bases == max(n_bases))

##                name n_ranges    n_bases
## 1 13_Heterochrom/lo    91228 1992618522

Overlaps

Create a new GRanges object, that has exons that are completely within Enhancers elements of hesc. How many exons are there?

enhancers <- hesc %>% 
  filter(grepl("Enhancer", name))

exon_within <- join_overlap_inner_within(exons, enhancers)

exon_within

## GRanges object with 13746 ranges and 4 metadata columns:
##           seqnames              ranges strand |
##              <Rle>           <IRanges>  <Rle> |
##       [1]     chr1       948847-948956      + |
##       [2]     chr1     1051440-1051736      - |
##       [3]     chr1     1109286-1109306      + |
##       [4]     chr1     1109804-1109869      + |
##       [5]     chr1     1219358-1219470      + |
##       ...      ...                 ...    ... .
##   [13742]     chrX 153927569-153927788      - |
##   [13743]     chrX 153927569-153927788      - |
##   [13744]     chrX 153927569-153927788      - |
##   [13745]     chrX 153927569-153927788      - |
##   [13746]     chrX 154114409-154114577      - |
##                                           name.x     score        tx_id
##                                      <character> <numeric>  <character>
##       [1]       NM_005101_exon_0_0_chr1_948847_f         0    NM_005101
##       [2]      NM_017891_exon_9_0_chr1_1051440_r         0    NM_017891
##       [3]   NM_001130045_exon_0_0_chr1_1109286_f         0 NM_001130045
##       [4]   NM_001130045_exon_2_0_chr1_1109804_f         0 NM_001130045
##       [5]   NM_001130413_exon_4_0_chr1_1219358_f         0 NM_001130413
##       ...                                    ...       ...          ...
##   [13742] NM_001081573_exon_4_0_chrX_153927569_r         0 NM_001081573
##   [13743] NM_001282283_exon_3_0_chrX_153927569_r         0 NM_001282283
##   [13744]    NM_080612_exon_4_0_chrX_153927569_r         0    NM_080612
##   [13745]    NR_104114_exon_4_0_chrX_153927569_r         0    NR_104114
##   [13746]    NM_019863_exon_4_0_chrX_154114409_r         0    NM_019863
##                      name.y
##                 <character>
##       [1] 4_Strong_Enhancer
##       [2]   6_Weak_Enhancer
##       [3]   6_Weak_Enhancer
##       [4]   6_Weak_Enhancer
##       [5]   7_Weak_Enhancer
##       ...               ...
##   [13742]   6_Weak_Enhancer
##   [13743]   6_Weak_Enhancer
##   [13744]   6_Weak_Enhancer
##   [13745]   6_Weak_Enhancer
##   [13746]   6_Weak_Enhancer
##   -------
##   seqinfo: 93 sequences from hg19 genome

Use join_overlap_intersect() to filter exons if at least 50 per cent of their bases are overlapped by enhancer elements.

exons %>% 
  mutate(total_length = width) %>% 
  join_overlap_intersect(enhancers) %>% 
  filter(width / total_length > 0.5)

## GRanges object with 21151 ranges and 5 metadata columns:
##           seqnames              ranges strand |
##              <Rle>           <IRanges>  <Rle> |
##       [1]     chr1         35738-35937      - |
##       [2]     chr1         35738-35937      - |
##       [3]     chr1       910938-911537      - |
##       [4]     chr1       911938-912004      - |
##       [5]     chr1       948847-948956      + |
##       ...      ...                 ...    ... .
##   [21147]     chrX 153927569-153927788      - |
##   [21148]     chrX 154114409-154114577      - |
##   [21149]     chrX 154300607-154300618      + |
##   [21150]     chrX 154300607-154300618      + |
##   [21151]     chrX 154300607-154300618      + |
##                                           name.x     score        tx_id
##                                      <character> <numeric>  <character>
##       [1]        NR_026818_exon_2_0_chr1_35721_r         0    NR_026818
##       [2]        NR_026820_exon_2_0_chr1_35721_r         0    NR_026820
##       [3]       NR_027693_exon_0_0_chr1_910579_r         0    NR_027693
##       [4]       NR_027693_exon_1_0_chr1_911879_r         0    NR_027693
##       [5]       NM_005101_exon_0_0_chr1_948847_f         0    NM_005101
##       ...                                    ...       ...          ...
##   [21147]    NR_104114_exon_4_0_chrX_153927569_r         0    NR_104114
##   [21148]    NM_019863_exon_4_0_chrX_154114409_r         0    NM_019863
##   [21149] NM_001018055_exon_1_0_chrX_154300602_f         0 NM_001018055
##   [21150] NM_001242640_exon_1_0_chrX_154300602_f         0 NM_001242640
##   [21151]    NM_024332_exon_1_0_chrX_154300602_f         0    NM_024332
##           total_length            name.y
##              <integer>       <character>
##       [1]          361   7_Weak_Enhancer
##       [2]          361   7_Weak_Enhancer
##       [3]         1071   6_Weak_Enhancer
##       [4]          126   6_Weak_Enhancer
##       [5]          110 4_Strong_Enhancer
##       ...          ...               ...
##   [21147]          220   6_Weak_Enhancer
##   [21148]          169   6_Weak_Enhancer
##   [21149]           17   6_Weak_Enhancer
##   [21150]           17   6_Weak_Enhancer
##   [21151]           17   6_Weak_Enhancer
##   -------
##   seqinfo: 93 sequences from hg19 genome

Count the number of each enhancer element type that are exonic. There are several ways of doing this, but see if you can come up with a solution using join_overlap_left() + disjoin_ranges().

enhancers %>% 
  join_overlap_left(exons) %>% 
  group_by(name.x) %>% 
  disjoin_ranges(olap_any = all(!is.na(tx_id))) %>% 
  group_by(name.x) %>% 
  summarise(prop = sum(olap_any) / n())

## DataFrame with 4 rows and 2 columns
##              name.x               prop
##         <character>          <numeric>
## 1 4_Strong_Enhancer  0.129369300911854
## 2 5_Strong_Enhancer 0.0830471433031533
## 3   6_Weak_Enhancer 0.0921656261823685
## 4   7_Weak_Enhancer 0.0614762445084519