1. Data

H3K27ac ChIP-seq data of human neuroepithelial stem cells (hNESCs) have been used to detect differential peak calling using THOR (Allhoff et al. (2016)).

2. Objectives

THOR will be used to detect differential peaks between K7-WT and IM5-MUT and between T46-GC and T46-MUT.

3. Analysis

3.1 Install THOR

Get Regulatory Genome toolbox (RGT) version 0.9.9

pip install --user cython numpy scipy
pip install --user RGT

# Download hg38 dataset
python setupGenomicData.py --hg38

The chromosome file provided by THOR have chromosome names as chr1, chr2, chr3, etc. while H3K27ac ChIP-seq data have chromosome names as 1, 2, 3, etc. So change the chromosome name of the hg38 fasta and the chromosome size files.

cat genome_hg38.fa | sed 's/>chr/>/g' > genome_hg38_NOCHR.fa
cat chrom.sizes.hg38 | sed 's/>chr/>/g' > chrom.sizes.NO_CHR.hg38
3.1.1 Run THOR

1st comparison is K7-WT vs. IM5-MUT. THOR uses a configuration file for running. Here is the configuration file:

cat /Volumes/deborah.gerard/Documents/Jonas/Walter_et_al_Github/Fig6_THOR/Data/config_files/THOR.SN.K7WTvsIM5MUT.H3K27ac.config
#rep1
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/K7-WT-H3K27Ac.GRChr38.p1.q30.bam
#rep2
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/IM5-MUT-H3K27Ac.GRChr38.p1.q30.bam
#genome
/Users/deborah.gerard/rgtdata/hg38/genome_hg38_NOCHR.fa
#chrom_sizes
/Users/deborah.gerard/rgtdata/hg38/chrom.sizes.NO_CHR.hg38
#inputs1
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/K7-WT-I.GRChr38.p1.q30.bam
#inputs2
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/IM5-MUT-I.GRChr38.p1.q30.bam

Run THOR.

rgt-THOR THOR.SN.K7WTvsIM5MUT.H3K27ac.config --output-dir ./Results -n 29112017-K7WTvsIM5MUT.H3K27ac-TMM-005 --pvalue 0.05 --save-input

2nd comparison is T46-GC vs. T46-MUT. Here is the configuration file:

cat /Volumes/deborah.gerard/Documents/Jonas/Walter_et_al_Github/Fig6_THOR/Data/config_files/THOR.SN.T46GCvsT46MUT.H3K27ac.config
#rep1
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/T46_GC_H3K27ac.bam
#rep2
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/T4_6-MUT-H3K27Ac.GRChr38.p1.q30.bam
#genome
/Users/deborah.gerard/rgtdata/hg38/genome_hg38_NOCHR.fa
#chrom_sizes
/Users/deborah.gerard/rgtdata/hg38/chrom.sizes.NO_CHR.hg38
#inputs1
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/T46_GC_INPUT.bam
#inputs2
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/T4_6-MUT-I.GRChr38.p1.q30.bam

Run THOR.

rgt-THOR THOR.SN.T46GCvsT46MUT.H3K27ac.config --output-dir ./Results -n 01122017-T46GCvsT46MUT.H3K27ac-TMM-005 --pvalue 0.05 --save-input

3.2 Extract top 1000 peaks

Load libraries

library("tidyverse")

Extract top 1000 differential peaks for K7-WT vs. IM5-MUT.

IM5_MUTvsK7_WT_H3K27ac_DiffPeaks = read_delim(
  paste0(getwd(), "/Fig6_THOR/Data/BED/29112017-K7WTvsIM5MUT.H3K27ac-TMM-005-diffpeaks.bed"),
  delim = "\t", progress = TRUE, col_names = FALSE,
  col_types = cols(X1 = col_character()))

# Extract count per samples
IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean = IM5_MUTvsK7_WT_H3K27ac_DiffPeaks %>%
  separate(X11, into = c("count of K7_WT", "count of IM5_MUT", "-log10(pvalue)"), sep = ";") %>% 
  mutate(`-log10(pvalue)` = as.numeric(`-log10(pvalue)`),
         `count of K7_WT` = as.numeric(`count of K7_WT`),
         `count of IM5_MUT` = as.numeric(`count of IM5_MUT`))

# Unlog pvalue and calculate ratio of counts
IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean_fin = IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean %>%
  mutate(pval = 10^-(`-log10(pvalue)`)) %>%
  mutate(signalFoldChange = log2((`count of IM5_MUT` + 0.001)/(`count of K7_WT` + 0.001))) %>%  # add a pseudocount to avoid to divide by 0
  rename(chr = X1, start = X2, end = X3, PeakName = X4, unknown = X5, gain_lose_Peaks = X6, colorCode = X9) %>%
  select(chr, start, end, PeakName, gain_lose_Peaks, `count of K7_WT`, `count of IM5_MUT`, signalFoldChange,
         pval, `-log10(pvalue)`) %>%
  arrange(desc(signalFoldChange))

# Visualise top 1000 peaks
IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean_fin %>%
  mutate(DiffReadCounts = `count of IM5_MUT` - `count of K7_WT`) %>%
  select(chr:`count of IM5_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>% 
  ggplot(., aes(x = DiffReadCounts)) +
  geom_density() +
  theme_classic() +
  xlim(-2500, 2500) +
  geom_vline(xintercept = -1000) +
  geom_vline(xintercept = 1000) ## Take peaks above 1000 and below 1000 as top

# Extract coordinates for the top1000 peaks
IM5_MUTvsK7_WT_H3K27ac_topMinus1000Peaks = IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean_fin %>%
  mutate(DiffReadCounts = `count of IM5_MUT` - `count of K7_WT`) %>%
  select(chr:`count of IM5_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>%
  filter(DiffReadCounts <= -1000)

IM5_MUTvsK7_WT_H3K27ac_topPlus1000Peaks = IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean_fin %>%
  mutate(DiffReadCounts = `count of IM5_MUT` - `count of K7_WT`) %>%
  select(chr:`count of IM5_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>%
  filter(DiffReadCounts >= 1000)

IM5_MUTvsK7_WT_H3K27ac_top1000Peaks = bind_rows(IM5_MUTvsK7_WT_H3K27ac_topPlus1000Peaks,
                                                IM5_MUTvsK7_WT_H3K27ac_topMinus1000Peaks)

write_delim(IM5_MUTvsK7_WT_H3K27ac_top1000Peaks %>% 
              select(chr, start, end, PeakName),
            paste0(getwd(), 
                   "/Fig6_THOR/Data/BED/30072018-THOR-IM5_MUTvsK7_WT-H3K27ac-Top1000Peaks.bed"), delim = "\t", 
            col_names = FALSE)

Extract top 1000 differential peaks for T46-GC vs. T46-MUT.

T46_GCvsT46_MUT_H3K27ac_DiffPeaks = read_delim(
  paste0(getwd(), "/Fig6_THOR/Data/BED/01122017-T46GCvsT46MUT.H3K27ac-TMM-005-diffpeaks.bed"),
         delim = "\t", progress = TRUE, col_names = FALSE,
         col_types = cols(X1 = col_character()))

# Extract count per samples
T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean = T46_GCvsT46_MUT_H3K27ac_DiffPeaks %>%
  separate(X11, into = c("count of T4.6_GC", "count of T4.6_MUT", "-log10(pvalue)"), sep = ";") %>% 
  mutate(`-log10(pvalue)` = as.numeric(`-log10(pvalue)`),
         `count of T4.6_MUT` = as.numeric(`count of T4.6_MUT`),
         `count of T4.6_GC` = as.numeric(`count of T4.6_GC`))

# Unlog pvalue and calculate ratio of counts
T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean_fin = T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean %>%
  mutate(pval = 10^-(`-log10(pvalue)`)) %>%
  mutate(signalFoldChange = log2((`count of T4.6_GC` + 0.001)/(`count of T4.6_MUT` + 0.001))) %>%  # add a pseudocount to avoid to divide by 0
  rename(chr = X1, start = X2, end = X3, PeakName = X4, unknown = X5, gain_lose_Peaks = X6, colorCode = X9) %>%
  select(chr, start, end, PeakName, gain_lose_Peaks, `count of T4.6_GC`, `count of T4.6_MUT`, signalFoldChange,
         pval, `-log10(pvalue)`) %>%
  arrange(desc(signalFoldChange))

# Visualise top 1000 peaks
T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean_fin %>% 
  mutate(DiffReadCounts = `count of T4.6_GC` - `count of T4.6_MUT`) %>%
  select(chr:`count of T4.6_GC`, `count of T4.6_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>% 
  ggplot(., aes(x = DiffReadCounts)) +
  geom_density() +
  theme_classic() +
  xlim(-2500, 2500) +
  geom_vline(xintercept = -1000) +
  geom_vline(xintercept = 1000) ## Take peaks above 1000 and below 1000 as top)

# Extract coordinates for the top1000 peaks
T46_GCvsT46_MUT_H3K27ac_topMinus1000Peaks = T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean_fin %>%
  mutate(DiffReadCounts = `count of T4.6_GC` - `count of T4.6_MUT`) %>%
  select(chr:`count of T4.6_GC`, `count of T4.6_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>% 
  filter(DiffReadCounts <= -1000)

T46_GCvsT46_MUT_H3K27ac_topPlus1000Peaks = T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean_fin %>%
  mutate(DiffReadCounts = `count of T4.6_GC` - `count of T4.6_MUT`) %>%
  select(chr:`count of T4.6_GC`, `count of T4.6_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>%
  filter(DiffReadCounts >= 1000)

T46_GCvsT46_MUT_H3K27ac_top1000Peaks = bind_rows(T46_GCvsT46_MUT_H3K27ac_topPlus1000Peaks,
                                                T46_GCvsT46_MUT_H3K27ac_topMinus1000Peaks)

write_delim(T46_GCvsT46_MUT_H3K27ac_top1000Peaks %>% 
              select(chr, start, end, PeakName),
            paste0(getwd(), 
                   "/Fig6_THOR/Data/BED27092018-THOR-T46_GCvsT46_MUT_H3K27ac-Top1000Peaks.bed"), delim = "\t", 
            col_names = FALSE)

3.3 PLAC-seq track

Redo a figure of H3K27Ac tracks with PLAQ-seq data from Nott et al. (2019).

3.3.1 Transform the BigBedFile into a BED file
# Download bigBedToBed from UCSC
cd ~/bin/
wget http://hgdownload.cse.ucsc.edu/admin/exe/macOSX.x86_64/bigBedToBed
chmod +x bigBedToBed

mkdir BED
cd /Volumes/deborah.gerard/Documents/Jonas/Walter_et_al_Github/Fig6_THOR/Data/
OUT="./BED/"

# Convert bigbed into bed
bigBedToBed ./Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb \
${OUT}/Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb.bed

PLACseq data is in hg19. Need a lift-over to hg38 to match hNESCs data

3.3.2 Perform a lift-over from hg19 to hg38
# Download liftOver BEDPE from git
git clone https://github.com/dphansti/liftOverBedpe.git ~/bin/liftOver_bedpe

# Download the chain hg19 -> hg38
cd /Volumes/deborah.gerard/Documents/Jonas/Walter_et_al_Github/Fig6_THOR/Data/
wget https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz

# Make an appropriate BEDPE file
cat ./BED/Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb.bed | \
awk 'BEGIN{OFS="\t"} {print $9,$10,$11,$14,$15,$16,"inter_"NR,$5,".","."}' > \
./BED/Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb.bedpe

# LiftOver
python ~/bin/liftOver_bedpe/liftOverBedpe.py \
--lift ~/bin/liftOver \
--chain ./hg19ToHg38.over.chain.gz \
--i ./BED/Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb.bedpe \
--o ./BED/Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb.HG38.bedpe \
--h F

4. References

Allhoff, Manuel, Kristin Seré, Juliana F Pires, Martin Zenke, and Ivan G Costa. 2016. “Differential Peak Calling of ChIP-Seq Signals with Replicates with THOR.” Nucleic Acids Research 44 (20): e153. https://doi.org/10.1093/nar/gkw680.

Nott, Alexi, Inge R. Holtman, Nicole G. Coufal, Johannes C. M. Schlachetzki, Miao Yu, Rong Hu, Claudia Z. Han, et al. 2019. “Brain Cell Type-Specific Enhancer-Promoter Interactome Maps and Disease-Risk Association.” Science (New York, N.Y.) 366 (6469): 1134–9. https://doi.org/10.1126/science.aay0793.