H3K27ac ChIP-seq data of human neuroepithelial stem cells (hNESCs) have been used to detect differential peak calling using THOR (Allhoff et al. (2016)).
THOR will be used to detect differential peaks between K7-WT and IM5-MUT and between T46-GC and T46-MUT.
Get Regulatory Genome toolbox (RGT) version 0.9.9
pip install --user cython numpy scipy
pip install --user RGT
# Download hg38 dataset
python setupGenomicData.py --hg38
The chromosome file provided by THOR have chromosome names as chr1, chr2, chr3, etc. while H3K27ac ChIP-seq data have chromosome names as 1, 2, 3, etc. So change the chromosome name of the hg38 fasta and the chromosome size files.
cat genome_hg38.fa | sed 's/>chr/>/g' > genome_hg38_NOCHR.fa
cat chrom.sizes.hg38 | sed 's/>chr/>/g' > chrom.sizes.NO_CHR.hg38
1st comparison is K7-WT vs. IM5-MUT. THOR uses a configuration file for running. Here is the configuration file:
cat /Volumes/deborah.gerard/Documents/Jonas/Walter_et_al_Github/Fig6_THOR/Data/config_files/THOR.SN.K7WTvsIM5MUT.H3K27ac.config
#rep1
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/K7-WT-H3K27Ac.GRChr38.p1.q30.bam
#rep2
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/IM5-MUT-H3K27Ac.GRChr38.p1.q30.bam
#genome
/Users/deborah.gerard/rgtdata/hg38/genome_hg38_NOCHR.fa
#chrom_sizes
/Users/deborah.gerard/rgtdata/hg38/chrom.sizes.NO_CHR.hg38
#inputs1
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/K7-WT-I.GRChr38.p1.q30.bam
#inputs2
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/IM5-MUT-I.GRChr38.p1.q30.bam
Run THOR.
rgt-THOR THOR.SN.K7WTvsIM5MUT.H3K27ac.config --output-dir ./Results -n 29112017-K7WTvsIM5MUT.H3K27ac-TMM-005 --pvalue 0.05 --save-input
2nd comparison is T46-GC vs. T46-MUT. Here is the configuration file:
cat /Volumes/deborah.gerard/Documents/Jonas/Walter_et_al_Github/Fig6_THOR/Data/config_files/THOR.SN.T46GCvsT46MUT.H3K27ac.config
#rep1
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/T46_GC_H3K27ac.bam
#rep2
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/T4_6-MUT-H3K27Ac.GRChr38.p1.q30.bam
#genome
/Users/deborah.gerard/rgtdata/hg38/genome_hg38_NOCHR.fa
#chrom_sizes
/Users/deborah.gerard/rgtdata/hg38/chrom.sizes.NO_CHR.hg38
#inputs1
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/T46_GC_INPUT.bam
#inputs2
/Users/deborah.gerard/THORAnalysis/THOR_SarahNickels_data/T4_6-MUT-I.GRChr38.p1.q30.bam
Run THOR.
rgt-THOR THOR.SN.T46GCvsT46MUT.H3K27ac.config --output-dir ./Results -n 01122017-T46GCvsT46MUT.H3K27ac-TMM-005 --pvalue 0.05 --save-input
Load libraries
library("tidyverse")
Extract top 1000 differential peaks for K7-WT vs. IM5-MUT.
IM5_MUTvsK7_WT_H3K27ac_DiffPeaks = read_delim(
paste0(getwd(), "/Fig6_THOR/Data/BED/29112017-K7WTvsIM5MUT.H3K27ac-TMM-005-diffpeaks.bed"),
delim = "\t", progress = TRUE, col_names = FALSE,
col_types = cols(X1 = col_character()))
# Extract count per samples
IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean = IM5_MUTvsK7_WT_H3K27ac_DiffPeaks %>%
separate(X11, into = c("count of K7_WT", "count of IM5_MUT", "-log10(pvalue)"), sep = ";") %>%
mutate(`-log10(pvalue)` = as.numeric(`-log10(pvalue)`),
`count of K7_WT` = as.numeric(`count of K7_WT`),
`count of IM5_MUT` = as.numeric(`count of IM5_MUT`))
# Unlog pvalue and calculate ratio of counts
IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean_fin = IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean %>%
mutate(pval = 10^-(`-log10(pvalue)`)) %>%
mutate(signalFoldChange = log2((`count of IM5_MUT` + 0.001)/(`count of K7_WT` + 0.001))) %>% # add a pseudocount to avoid to divide by 0
rename(chr = X1, start = X2, end = X3, PeakName = X4, unknown = X5, gain_lose_Peaks = X6, colorCode = X9) %>%
select(chr, start, end, PeakName, gain_lose_Peaks, `count of K7_WT`, `count of IM5_MUT`, signalFoldChange,
pval, `-log10(pvalue)`) %>%
arrange(desc(signalFoldChange))
# Visualise top 1000 peaks
IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean_fin %>%
mutate(DiffReadCounts = `count of IM5_MUT` - `count of K7_WT`) %>%
select(chr:`count of IM5_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>%
ggplot(., aes(x = DiffReadCounts)) +
geom_density() +
theme_classic() +
xlim(-2500, 2500) +
geom_vline(xintercept = -1000) +
geom_vline(xintercept = 1000) ## Take peaks above 1000 and below 1000 as top
# Extract coordinates for the top1000 peaks
IM5_MUTvsK7_WT_H3K27ac_topMinus1000Peaks = IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean_fin %>%
mutate(DiffReadCounts = `count of IM5_MUT` - `count of K7_WT`) %>%
select(chr:`count of IM5_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>%
filter(DiffReadCounts <= -1000)
IM5_MUTvsK7_WT_H3K27ac_topPlus1000Peaks = IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean_fin %>%
mutate(DiffReadCounts = `count of IM5_MUT` - `count of K7_WT`) %>%
select(chr:`count of IM5_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>%
filter(DiffReadCounts >= 1000)
IM5_MUTvsK7_WT_H3K27ac_top1000Peaks = bind_rows(IM5_MUTvsK7_WT_H3K27ac_topPlus1000Peaks,
IM5_MUTvsK7_WT_H3K27ac_topMinus1000Peaks)
write_delim(IM5_MUTvsK7_WT_H3K27ac_top1000Peaks %>%
select(chr, start, end, PeakName),
paste0(getwd(),
"/Fig6_THOR/Data/BED/30072018-THOR-IM5_MUTvsK7_WT-H3K27ac-Top1000Peaks.bed"), delim = "\t",
col_names = FALSE)
Extract top 1000 differential peaks for T46-GC vs. T46-MUT.
T46_GCvsT46_MUT_H3K27ac_DiffPeaks = read_delim(
paste0(getwd(), "/Fig6_THOR/Data/BED/01122017-T46GCvsT46MUT.H3K27ac-TMM-005-diffpeaks.bed"),
delim = "\t", progress = TRUE, col_names = FALSE,
col_types = cols(X1 = col_character()))
# Extract count per samples
T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean = T46_GCvsT46_MUT_H3K27ac_DiffPeaks %>%
separate(X11, into = c("count of T4.6_GC", "count of T4.6_MUT", "-log10(pvalue)"), sep = ";") %>%
mutate(`-log10(pvalue)` = as.numeric(`-log10(pvalue)`),
`count of T4.6_MUT` = as.numeric(`count of T4.6_MUT`),
`count of T4.6_GC` = as.numeric(`count of T4.6_GC`))
# Unlog pvalue and calculate ratio of counts
T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean_fin = T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean %>%
mutate(pval = 10^-(`-log10(pvalue)`)) %>%
mutate(signalFoldChange = log2((`count of T4.6_GC` + 0.001)/(`count of T4.6_MUT` + 0.001))) %>% # add a pseudocount to avoid to divide by 0
rename(chr = X1, start = X2, end = X3, PeakName = X4, unknown = X5, gain_lose_Peaks = X6, colorCode = X9) %>%
select(chr, start, end, PeakName, gain_lose_Peaks, `count of T4.6_GC`, `count of T4.6_MUT`, signalFoldChange,
pval, `-log10(pvalue)`) %>%
arrange(desc(signalFoldChange))
# Visualise top 1000 peaks
T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean_fin %>%
mutate(DiffReadCounts = `count of T4.6_GC` - `count of T4.6_MUT`) %>%
select(chr:`count of T4.6_GC`, `count of T4.6_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>%
ggplot(., aes(x = DiffReadCounts)) +
geom_density() +
theme_classic() +
xlim(-2500, 2500) +
geom_vline(xintercept = -1000) +
geom_vline(xintercept = 1000) ## Take peaks above 1000 and below 1000 as top)
# Extract coordinates for the top1000 peaks
T46_GCvsT46_MUT_H3K27ac_topMinus1000Peaks = T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean_fin %>%
mutate(DiffReadCounts = `count of T4.6_GC` - `count of T4.6_MUT`) %>%
select(chr:`count of T4.6_GC`, `count of T4.6_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>%
filter(DiffReadCounts <= -1000)
T46_GCvsT46_MUT_H3K27ac_topPlus1000Peaks = T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean_fin %>%
mutate(DiffReadCounts = `count of T4.6_GC` - `count of T4.6_MUT`) %>%
select(chr:`count of T4.6_GC`, `count of T4.6_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>%
filter(DiffReadCounts >= 1000)
T46_GCvsT46_MUT_H3K27ac_top1000Peaks = bind_rows(T46_GCvsT46_MUT_H3K27ac_topPlus1000Peaks,
T46_GCvsT46_MUT_H3K27ac_topMinus1000Peaks)
write_delim(T46_GCvsT46_MUT_H3K27ac_top1000Peaks %>%
select(chr, start, end, PeakName),
paste0(getwd(),
"/Fig6_THOR/Data/BED27092018-THOR-T46_GCvsT46_MUT_H3K27ac-Top1000Peaks.bed"), delim = "\t",
col_names = FALSE)
Redo a figure of H3K27Ac tracks with PLAQ-seq data from Nott et al. (2019).
# Download bigBedToBed from UCSC
cd ~/bin/
wget http://hgdownload.cse.ucsc.edu/admin/exe/macOSX.x86_64/bigBedToBed
chmod +x bigBedToBed
mkdir BED
cd /Volumes/deborah.gerard/Documents/Jonas/Walter_et_al_Github/Fig6_THOR/Data/
OUT="./BED/"
# Convert bigbed into bed
bigBedToBed ./Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb \
${OUT}/Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb.bed
PLACseq data is in hg19. Need a lift-over to hg38 to match hNESCs data
# Download liftOver BEDPE from git
git clone https://github.com/dphansti/liftOverBedpe.git ~/bin/liftOver_bedpe
# Download the chain hg19 -> hg38
cd /Volumes/deborah.gerard/Documents/Jonas/Walter_et_al_Github/Fig6_THOR/Data/
wget https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz
# Make an appropriate BEDPE file
cat ./BED/Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb.bed | \
awk 'BEGIN{OFS="\t"} {print $9,$10,$11,$14,$15,$16,"inter_"NR,$5,".","."}' > \
./BED/Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb.bedpe
# LiftOver
python ~/bin/liftOver_bedpe/liftOverBedpe.py \
--lift ~/bin/liftOver \
--chain ./hg19ToHg38.over.chain.gz \
--i ./BED/Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb.bedpe \
--o ./BED/Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb.HG38.bedpe \
--h F
Allhoff, Manuel, Kristin Seré, Juliana F Pires, Martin Zenke, and Ivan G Costa. 2016. “Differential Peak Calling of ChIP-Seq Signals with Replicates with THOR.” Nucleic Acids Research 44 (20): e153. https://doi.org/10.1093/nar/gkw680.
Nott, Alexi, Inge R. Holtman, Nicole G. Coufal, Johannes C. M. Schlachetzki, Miao Yu, Rong Hu, Claudia Z. Han, et al. 2019. “Brain Cell Type-Specific Enhancer-Promoter Interactome Maps and Disease-Risk Association.” Science (New York, N.Y.) 366 (6469): 1134–9. https://doi.org/10.1126/science.aay0793.