--- title: "Walter_et_al_THOR" author: Deborah GĂ©rard output: html_document: keep_md: true df_print: paged toc: true toc_depth: 6 pdf_document: default editor_options: chunk_output_type: console bibliography: citations.bib --- ```{r setup, include = FALSE} knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, results = "as.is", fig.height = 3, cache = TRUE, tidy = TRUE, fig.pos = "H") ``` ### *1. Data* H3K27ac ChIP-seq data of human neuroepithelial stem cells (hNESCs) have been used to detect differential peak calling using THOR (@allhoff_differential_2016). ### *2. Objectives* THOR will be used to detect differential peaks between K7-WT and IM5-MUT and between T46-GC and T46-MUT. ### *3. Analysis* #### *3.1 Install THOR* Get Regulatory Genome toolbox (RGT) version 0.9.9 ```{bash, eval = FALSE} pip install --user cython numpy scipy pip install --user RGT # Download hg38 dataset python setupGenomicData.py --hg38 ``` The chromosome file provided by THOR have chromosome names as chr1, chr2, chr3, etc. while H3K27ac ChIP-seq data have chromosome names as 1, 2, 3, etc. So change the chromosome name of the hg38 fasta and the chromosome size files. ```{bash, eval = FALSE} cat genome_hg38.fa | sed 's/>chr/>/g' > genome_hg38_NOCHR.fa cat chrom.sizes.hg38 | sed 's/>chr/>/g' > chrom.sizes.NO_CHR.hg38 ``` ##### *3.1.1 Run THOR* 1st comparison is K7-WT vs. IM5-MUT. THOR uses a configuration file for running. Here is the configuration file: ```{bash, eval = TRUE, comment = NA} cat /Volumes/deborah.gerard/Documents/Jonas/Walter_et_al_Github/Fig6_THOR/Data/config_files/THOR.SN.K7WTvsIM5MUT.H3K27ac.config ``` Run THOR. ```{bash, eval = FALSE} rgt-THOR THOR.SN.K7WTvsIM5MUT.H3K27ac.config --output-dir ./Results -n 29112017-K7WTvsIM5MUT.H3K27ac-TMM-005 --pvalue 0.05 --save-input ``` 2nd comparison is T46-GC vs. T46-MUT. Here is the configuration file: ```{bash, eval = TRUE, comment = NA} cat /Volumes/deborah.gerard/Documents/Jonas/Walter_et_al_Github/Fig6_THOR/Data/config_files/THOR.SN.T46GCvsT46MUT.H3K27ac.config ``` Run THOR. ```{bash, eval = FALSE} rgt-THOR THOR.SN.T46GCvsT46MUT.H3K27ac.config --output-dir ./Results -n 01122017-T46GCvsT46MUT.H3K27ac-TMM-005 --pvalue 0.05 --save-input ``` #### *3.2 Extract top 1000 peaks* Load libraries ```{r lib, eval = FALSE} library("tidyverse") ``` Extract top 1000 differential peaks for K7-WT vs. IM5-MUT. ```{r top_100_K7WTvsIM5MUT, eval = FALSE} IM5_MUTvsK7_WT_H3K27ac_DiffPeaks = read_delim( paste0(getwd(), "/Fig6_THOR/Data/BED/29112017-K7WTvsIM5MUT.H3K27ac-TMM-005-diffpeaks.bed"), delim = "\t", progress = TRUE, col_names = FALSE, col_types = cols(X1 = col_character())) # Extract count per samples IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean = IM5_MUTvsK7_WT_H3K27ac_DiffPeaks %>% separate(X11, into = c("count of K7_WT", "count of IM5_MUT", "-log10(pvalue)"), sep = ";") %>% mutate(`-log10(pvalue)` = as.numeric(`-log10(pvalue)`), `count of K7_WT` = as.numeric(`count of K7_WT`), `count of IM5_MUT` = as.numeric(`count of IM5_MUT`)) # Unlog pvalue and calculate ratio of counts IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean_fin = IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean %>% mutate(pval = 10^-(`-log10(pvalue)`)) %>% mutate(signalFoldChange = log2((`count of IM5_MUT` + 0.001)/(`count of K7_WT` + 0.001))) %>% # add a pseudocount to avoid to divide by 0 rename(chr = X1, start = X2, end = X3, PeakName = X4, unknown = X5, gain_lose_Peaks = X6, colorCode = X9) %>% select(chr, start, end, PeakName, gain_lose_Peaks, `count of K7_WT`, `count of IM5_MUT`, signalFoldChange, pval, `-log10(pvalue)`) %>% arrange(desc(signalFoldChange)) # Visualise top 1000 peaks IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean_fin %>% mutate(DiffReadCounts = `count of IM5_MUT` - `count of K7_WT`) %>% select(chr:`count of IM5_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>% ggplot(., aes(x = DiffReadCounts)) + geom_density() + theme_classic() + xlim(-2500, 2500) + geom_vline(xintercept = -1000) + geom_vline(xintercept = 1000) ## Take peaks above 1000 and below 1000 as top # Extract coordinates for the top1000 peaks IM5_MUTvsK7_WT_H3K27ac_topMinus1000Peaks = IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean_fin %>% mutate(DiffReadCounts = `count of IM5_MUT` - `count of K7_WT`) %>% select(chr:`count of IM5_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>% filter(DiffReadCounts <= -1000) IM5_MUTvsK7_WT_H3K27ac_topPlus1000Peaks = IM5_MUTvsK7_WT_H3K27ac_DiffPeaks_clean_fin %>% mutate(DiffReadCounts = `count of IM5_MUT` - `count of K7_WT`) %>% select(chr:`count of IM5_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>% filter(DiffReadCounts >= 1000) IM5_MUTvsK7_WT_H3K27ac_top1000Peaks = bind_rows(IM5_MUTvsK7_WT_H3K27ac_topPlus1000Peaks, IM5_MUTvsK7_WT_H3K27ac_topMinus1000Peaks) write_delim(IM5_MUTvsK7_WT_H3K27ac_top1000Peaks %>% select(chr, start, end, PeakName), paste0(getwd(), "/Fig6_THOR/Data/BED/30072018-THOR-IM5_MUTvsK7_WT-H3K27ac-Top1000Peaks.bed"), delim = "\t", col_names = FALSE) ``` Extract top 1000 differential peaks for T46-GC vs. T46-MUT. ```{r top_100_T46GCvsT46MUT, eval = FALSE} T46_GCvsT46_MUT_H3K27ac_DiffPeaks = read_delim( paste0(getwd(), "/Fig6_THOR/Data/BED/01122017-T46GCvsT46MUT.H3K27ac-TMM-005-diffpeaks.bed"), delim = "\t", progress = TRUE, col_names = FALSE, col_types = cols(X1 = col_character())) # Extract count per samples T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean = T46_GCvsT46_MUT_H3K27ac_DiffPeaks %>% separate(X11, into = c("count of T4.6_GC", "count of T4.6_MUT", "-log10(pvalue)"), sep = ";") %>% mutate(`-log10(pvalue)` = as.numeric(`-log10(pvalue)`), `count of T4.6_MUT` = as.numeric(`count of T4.6_MUT`), `count of T4.6_GC` = as.numeric(`count of T4.6_GC`)) # Unlog pvalue and calculate ratio of counts T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean_fin = T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean %>% mutate(pval = 10^-(`-log10(pvalue)`)) %>% mutate(signalFoldChange = log2((`count of T4.6_GC` + 0.001)/(`count of T4.6_MUT` + 0.001))) %>% # add a pseudocount to avoid to divide by 0 rename(chr = X1, start = X2, end = X3, PeakName = X4, unknown = X5, gain_lose_Peaks = X6, colorCode = X9) %>% select(chr, start, end, PeakName, gain_lose_Peaks, `count of T4.6_GC`, `count of T4.6_MUT`, signalFoldChange, pval, `-log10(pvalue)`) %>% arrange(desc(signalFoldChange)) # Visualise top 1000 peaks T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean_fin %>% mutate(DiffReadCounts = `count of T4.6_GC` - `count of T4.6_MUT`) %>% select(chr:`count of T4.6_GC`, `count of T4.6_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>% ggplot(., aes(x = DiffReadCounts)) + geom_density() + theme_classic() + xlim(-2500, 2500) + geom_vline(xintercept = -1000) + geom_vline(xintercept = 1000) ## Take peaks above 1000 and below 1000 as top) # Extract coordinates for the top1000 peaks T46_GCvsT46_MUT_H3K27ac_topMinus1000Peaks = T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean_fin %>% mutate(DiffReadCounts = `count of T4.6_GC` - `count of T4.6_MUT`) %>% select(chr:`count of T4.6_GC`, `count of T4.6_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>% filter(DiffReadCounts <= -1000) T46_GCvsT46_MUT_H3K27ac_topPlus1000Peaks = T46_GCvsT46_MUT_H3K27ac_DiffPeaks_clean_fin %>% mutate(DiffReadCounts = `count of T4.6_GC` - `count of T4.6_MUT`) %>% select(chr:`count of T4.6_GC`, `count of T4.6_MUT`, DiffReadCounts, signalFoldChange:`-log10(pvalue)`) %>% filter(DiffReadCounts >= 1000) T46_GCvsT46_MUT_H3K27ac_top1000Peaks = bind_rows(T46_GCvsT46_MUT_H3K27ac_topPlus1000Peaks, T46_GCvsT46_MUT_H3K27ac_topMinus1000Peaks) write_delim(T46_GCvsT46_MUT_H3K27ac_top1000Peaks %>% select(chr, start, end, PeakName), paste0(getwd(), "/Fig6_THOR/Data/BED27092018-THOR-T46_GCvsT46_MUT_H3K27ac-Top1000Peaks.bed"), delim = "\t", col_names = FALSE) ``` #### *3.3 PLAC-seq track* Redo a figure of H3K27Ac tracks with PLAQ-seq data from @nott_brain_2019. ##### *3.3.1 Transform the BigBedFile into a BED file* ```{bash, eval = FALSE} # Download bigBedToBed from UCSC cd ~/bin/ wget http://hgdownload.cse.ucsc.edu/admin/exe/macOSX.x86_64/bigBedToBed chmod +x bigBedToBed mkdir BED cd /Volumes/deborah.gerard/Documents/Jonas/Walter_et_al_Github/Fig6_THOR/Data/ OUT="./BED/" # Convert bigbed into bed bigBedToBed ./Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb \ ${OUT}/Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb.bed ``` PLACseq data is in hg19. Need a lift-over to hg38 to match hNESCs data ##### *3.3.2 Perform a lift-over from hg19 to hg38* ```{bash, eval = FALSE} # Download liftOver BEDPE from git git clone https://github.com/dphansti/liftOverBedpe.git ~/bin/liftOver_bedpe # Download the chain hg19 -> hg38 cd /Volumes/deborah.gerard/Documents/Jonas/Walter_et_al_Github/Fig6_THOR/Data/ wget https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz # Make an appropriate BEDPE file cat ./BED/Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb.bed | \ awk 'BEGIN{OFS="\t"} {print $9,$10,$11,$14,$15,$16,"inter_"NR,$5,".","."}' > \ ./BED/Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb.bedpe # LiftOver python ~/bin/liftOver_bedpe/liftOverBedpe.py \ --lift ~/bin/liftOver \ --chain ./hg19ToHg38.over.chain.gz \ --i ./BED/Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb.bedpe \ --o ./BED/Nott_et_al_NeuN.5k_interactions_ucsc_genome_browser.inter.bb.HG38.bedpe \ --h F ``` ### *4. References*