################################################################################
##Genome-wide association study of copy number variations in Parkinson's disease
################################################################################

library(dplyr)
library(ggplot2)

##################################################
##4. Burden analysis
##################################################
# input filtered CNV data
# Columns used from this table in this step: Sample_Name, aff, CNV_status,
# sex, age and PC1-PC5.
cnv_filtered_all <- read.table("cnv_data_filtered.txt", header = TRUE)
# Per-sample table: keep the first row encountered for each Sample_Name.
cnv_filtered <- cnv_filtered_all[!duplicated(cnv_filtered_all$Sample_Name), ]

##Run Regression analysis for each category to generate a final dataframe with all the statistics

#1# ALL CNVs
# Fit the covariate-adjusted logistic model once and reuse it for every
# statistic. CNV_status is the first predictor, so its coefficient is
# element 2 (right after the intercept).
burden_fit <- glm(
  aff ~ CNV_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage <- data.frame()
glm_data_courage[1, 1] <- "All CNVs"
##Number of CNV carriers / all samples in PD (aff == 1)
glm_data_courage[1, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
##Number of CNV carriers / all samples in CTL (aff == 0)
glm_data_courage[1, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
##beta
glm_data_courage[1, 4] <- coef(burden_fit)[2]
##Odds ratio
glm_data_courage[1, 5] <- exp(coef(burden_fit)[2])
##CI95 (exponentiated confint() limits)
glm_data_courage[1, 6:7] <- exp(confint(burden_fit))[2, 1:2]
##P-value
glm_data_courage[1, 8] <- coef(summary(burden_fit))[2, 4]
colnames(glm_data_courage) <- c("CNV_group","Cases","Controls","beta","OR","ci2.5","ci97.5","glm_p.value")
rm(burden_fit)

# Row-level indicator factors derived from Copy_Number:
#   DEL_status is 1 when Copy_Number < 2, DUP_status is 1 when Copy_Number > 2.
# Rows with a missing Copy_Number are coded 0 in both.
cnv_filtered_all$DEL_status <- with(
  cnv_filtered_all,
  factor(as.integer(!is.na(Copy_Number) & Copy_Number < 2))
)
cnv_filtered_all$DUP_status <- with(
  cnv_filtered_all,
  factor(as.integer(!is.na(Copy_Number) & Copy_Number > 2))
)

##combine DEL status in all samples
# A sample is a deletion carrier when at least one of its rows has
# DEL_status == 1.
carrier_ids <- unique(cnv_filtered_all$Sample_Name[which(cnv_filtered_all$DEL_status == 1)])
##update for DEL status
cnv_filtered <- cnv_filtered_all[!duplicated(cnv_filtered_all$Sample_Name), ]
cnv_filtered$DEL_status <- as.factor(ifelse(cnv_filtered$Sample_Name %in% carrier_ids, 1, 0))
rm(carrier_ids)

#2# courage DEL CNVs
# Fit the adjusted model once; DEL_status is the first predictor, so its
# coefficient is element 2 (after the intercept).
burden_fit <- glm(
  aff ~ DEL_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage[2, 1] <- "All Deletions"
# carriers / all samples among cases (aff == 1) and controls (aff == 0)
glm_data_courage[2, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & DEL_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
glm_data_courage[2, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & DEL_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
glm_data_courage[2, 4] <- coef(burden_fit)[2]                    # beta
glm_data_courage[2, 5] <- exp(coef(burden_fit)[2])               # odds ratio
glm_data_courage[2, 6:7] <- exp(confint(burden_fit))[2, 1:2]     # 95% CI
glm_data_courage[2, 8] <- coef(summary(burden_fit))[2, 4]        # p-value
rm(burden_fit)

##combine DUP status in all samples
# A sample is a duplication carrier when at least one of its rows has
# DUP_status == 1.
carrier_ids <- unique(cnv_filtered_all$Sample_Name[which(cnv_filtered_all$DUP_status == 1)])
##update for DUP status
cnv_filtered <- cnv_filtered_all[!duplicated(cnv_filtered_all$Sample_Name), ]
cnv_filtered$DUP_status <- as.factor(ifelse(cnv_filtered$Sample_Name %in% carrier_ids, 1, 0))
rm(carrier_ids)

#3# courage DUP CNVs
# One additive model supplies beta, OR, CI and p-value, so all four
# statistics in this row describe the same fit (no DUP_status:sex
# interaction term).
burden_fit <- glm(
  aff ~ DUP_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage[3, 1] <- "All Duplications"
# carriers / all samples among cases (aff == 1) and controls (aff == 0)
glm_data_courage[3, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & DUP_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
glm_data_courage[3, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & DUP_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
glm_data_courage[3, 4] <- coef(burden_fit)[2]                    # beta
glm_data_courage[3, 5] <- exp(coef(burden_fit)[2])               # odds ratio
glm_data_courage[3, 6:7] <- exp(confint(burden_fit))[2, 1:2]     # 95% CI
glm_data_courage[3, 8] <- coef(summary(burden_fit))[2, 4]        # p-value
rm(burden_fit)

#4# CNV in any genes: exonic regions
PD_genes <- c("LRRK2;","SNCA;","PRKN;","PINK1;","PARK7;","VPS35;")
# Row flag: 0 when Func.refGene is "intergenic" or missing, or when
# Gene.refGene equals one of PD_genes; 1 otherwise.
# NOTE(review): `%in%` is an exact match against entries that end in ";" -
# confirm this is the Gene.refGene format intended to be excluded.
cnv_filtered_all$CNV_status <- ifelse(cnv_filtered_all$Func.refGene == "intergenic" | is.na(cnv_filtered_all$Func.refGene) | cnv_filtered_all$Gene.refGene %in% PD_genes, 0, 1)
cnv_filtered_all$CNV_status <- as.factor(cnv_filtered_all$CNV_status)
##combine coding genes status in all samples
# A sample is a carrier when at least one of its rows is flagged 1.
carrier_ids <- unique(cnv_filtered_all$Sample_Name[which(cnv_filtered_all$CNV_status == 1)])
##update for coding genes status
cnv_filtered <- cnv_filtered_all[!duplicated(cnv_filtered_all$Sample_Name), ]
cnv_filtered$CNV_status <- as.factor(ifelse(cnv_filtered$Sample_Name %in% carrier_ids, 1, 0))
rm(carrier_ids)

# Fit the adjusted model once; CNV_status is the first predictor, so its
# coefficient is element 2 (after the intercept).
burden_fit <- glm(
  aff ~ CNV_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage[4, 1] <- "Protein-coding genes"
# carriers / all samples among cases (aff == 1) and controls (aff == 0)
glm_data_courage[4, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
glm_data_courage[4, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
glm_data_courage[4, 4] <- coef(burden_fit)[2]                    # beta
glm_data_courage[4, 5] <- exp(coef(burden_fit)[2])               # odds ratio
glm_data_courage[4, 6:7] <- exp(confint(burden_fit))[2, 1:2]     # 95% CI
glm_data_courage[4, 8] <- coef(summary(burden_fit))[2, 4]        # p-value
rm(burden_fit)

#5# CNV in PD genes
# Rows whose Gene.refGene contains any PD_genes entry as a literal substring.
cnv_filtered_PDgenes <- cnv_filtered_all[unlist(lapply(PD_genes, function(x) grep(x, cnv_filtered_all$Gene.refGene, fixed = TRUE))),]
# Flag every row of a sample that has at least one such CNV.
cnv_filtered_all$CNV_status <- ifelse( cnv_filtered_all$Sample_Name %in% cnv_filtered_PDgenes$Sample_Name, 1, 0)
cnv_filtered_all$CNV_status <- as.factor(cnv_filtered_all$CNV_status)

##combine PD genes status in all samples
carrier_ids <- unique(cnv_filtered_all$Sample_Name[which(cnv_filtered_all$CNV_status == 1)])

##update for PD genes status
cnv_filtered <- cnv_filtered_all[!duplicated(cnv_filtered_all$Sample_Name), ]
cnv_filtered$CNV_status <- as.factor(ifelse(cnv_filtered$Sample_Name %in% carrier_ids, 1, 0))
rm(carrier_ids)

# Fit the adjusted model once; CNV_status is the first predictor, so its
# coefficient is element 2 (after the intercept).
burden_fit <- glm(
  aff ~ CNV_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage[5, 1] <- "PD genes (exonic+intronic)"
# carriers / all samples among cases (aff == 1) and controls (aff == 0)
glm_data_courage[5, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
glm_data_courage[5, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
glm_data_courage[5, 4] <- coef(burden_fit)[2]                    # beta
glm_data_courage[5, 5] <- exp(coef(burden_fit)[2])               # odds ratio
glm_data_courage[5, 6:7] <- exp(confint(burden_fit))[2, 1:2]     # 95% CI
glm_data_courage[5, 8] <- coef(summary(burden_fit))[2, 4]        # p-value
rm(burden_fit)

#6# exonic CNV in PD genes
# Restrict the PD-gene CNV rows to those annotated Func.refGene == "exonic".
cnv_filtered_PDgenes_exon <- filter(cnv_filtered_PDgenes, Func.refGene == "exonic")
# Flag every row of a sample that has at least one such CNV.
cnv_filtered_all$CNV_status <- ifelse( cnv_filtered_all$Sample_Name %in% cnv_filtered_PDgenes_exon$Sample_Name, 1, 0)
cnv_filtered_all$CNV_status <- as.factor(cnv_filtered_all$CNV_status)

##combine PD genes status in all samples
carrier_ids <- unique(cnv_filtered_all$Sample_Name[which(cnv_filtered_all$CNV_status == 1)])

##update for PD genes status
cnv_filtered <- cnv_filtered_all[!duplicated(cnv_filtered_all$Sample_Name), ]
cnv_filtered$CNV_status <- as.factor(ifelse(cnv_filtered$Sample_Name %in% carrier_ids, 1, 0))
rm(carrier_ids, cnv_filtered_PDgenes_exon)

# Fit the adjusted model once; CNV_status is the first predictor, so its
# coefficient is element 2 (after the intercept).
burden_fit <- glm(
  aff ~ CNV_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage[6, 1] <- "PD genes (exonic)"
# carriers / all samples among cases (aff == 1) and controls (aff == 0)
glm_data_courage[6, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
glm_data_courage[6, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
glm_data_courage[6, 4] <- coef(burden_fit)[2]                    # beta
glm_data_courage[6, 5] <- exp(coef(burden_fit)[2])               # odds ratio
glm_data_courage[6, 6:7] <- exp(confint(burden_fit))[2, 1:2]     # 95% CI
glm_data_courage[6, 8] <- coef(summary(burden_fit))[2, 4]        # p-value
rm(burden_fit)

#7# CNV in PRKN genes
# Select the PRKN rows of the PD-gene CNV table. When the table carries a
# `PD genes` column it is used as-is; otherwise (the table was subset from
# cnv_filtered_all, which has no such column) PRKN is matched in Gene.refGene
# with the same literal "PRKN;" pattern used to build cnv_filtered_PDgenes.
prkn_hit <- if ("PD genes" %in% names(cnv_filtered_PDgenes)) {
  cnv_filtered_PDgenes[["PD genes"]] == "PRKN"
} else {
  grepl("PRKN;", cnv_filtered_PDgenes$Gene.refGene, fixed = TRUE)
}
# which() drops NA comparisons.
cnv_filtered_PRKN <- cnv_filtered_PDgenes[which(prkn_hit), ]
# Flag every row of a sample that has at least one PRKN CNV.
cnv_filtered_all$CNV_status <- ifelse( cnv_filtered_all$Sample_Name %in% cnv_filtered_PRKN$Sample_Name, 1, 0)
cnv_filtered_all$CNV_status <- as.factor(cnv_filtered_all$CNV_status)

##combine PRKN status in all samples
carrier_ids <- unique(cnv_filtered_all$Sample_Name[which(cnv_filtered_all$CNV_status == 1)])

##update for PRKN status
cnv_filtered <- cnv_filtered_all[!duplicated(cnv_filtered_all$Sample_Name), ]
cnv_filtered$CNV_status <- as.factor(ifelse(cnv_filtered$Sample_Name %in% carrier_ids, 1, 0))
rm(carrier_ids, prkn_hit, cnv_filtered_PRKN)

# Fit the adjusted model once; CNV_status is the first predictor, so its
# coefficient is element 2 (after the intercept).
burden_fit <- glm(
  aff ~ CNV_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage[7, 1] <- "PRKN"
# carriers / all samples among cases (aff == 1) and controls (aff == 0)
glm_data_courage[7, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
glm_data_courage[7, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
glm_data_courage[7, 4] <- coef(burden_fit)[2]                    # beta
glm_data_courage[7, 5] <- exp(coef(burden_fit)[2])               # odds ratio
glm_data_courage[7, 6:7] <- exp(confint(burden_fit))[2, 1:2]     # 95% CI
glm_data_courage[7, 8] <- coef(summary(burden_fit))[2, 4]        # p-value
rm(burden_fit)

#8# CNV longer than 1 Mb
# Row flag: 1 when Length_bp > 1,000,000; 0 otherwise, including rows with a
# missing Length_bp. The flag is left numeric on cnv_filtered_all.
cnv_filtered_all$CNV_status <- ifelse(cnv_filtered_all$Length_bp <= 1000000,0,1)
cnv_filtered_all$CNV_status[is.na(cnv_filtered_all$CNV_status)] <- 0
# A sample is a carrier when at least one of its rows is flagged 1.
carrier_ids <- unique(cnv_filtered_all$Sample_Name[which(cnv_filtered_all$CNV_status == 1)])
##update for long CNV status
cnv_filtered <- cnv_filtered_all[!duplicated(cnv_filtered_all$Sample_Name), ]
cnv_filtered$CNV_status <- as.factor(ifelse(cnv_filtered$Sample_Name %in% carrier_ids, 1, 0))
rm(carrier_ids)

# Fit the adjusted model once; CNV_status is the first predictor, so its
# coefficient is element 2 (after the intercept).
burden_fit <- glm(
  aff ~ CNV_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage[8, 1] <- "CNV > 1Mb"
# carriers / all samples among cases (aff == 1) and controls (aff == 0)
glm_data_courage[8, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
glm_data_courage[8, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
glm_data_courage[8, 4] <- coef(burden_fit)[2]                    # beta
glm_data_courage[8, 5] <- exp(coef(burden_fit)[2])               # odds ratio
glm_data_courage[8, 6:7] <- exp(confint(burden_fit))[2, 1:2]     # 95% CI
glm_data_courage[8, 8] <- coef(summary(burden_fit))[2, 4]        # p-value
rm(burden_fit)

# Tag the table with its cohort label and fix the category order (reversed
# row order becomes the factor level order).
glm_data_courage$group <- "ALL"
glm_data_courage$CNV_group <- factor(
  glm_data_courage$CNV_group,
  levels = rev(glm_data_courage$CNV_group)
)
# FDR-adjust the p-values, then round them by formatting in scientific
# notation with digits = 2 and converting the text back to numeric.
glm_data_courage$FDR_pvalue <- as.numeric(
  scales::scientific(
    as.numeric(p.adjust(glm_data_courage$glm_p.value, "fdr")),
    digits = 2
  )
)

##################################################
##stratify by AAO
##################################################
# Young-onset subset: cases (aff == 1) with AAO <= 50 plus all controls
# (aff == 0).
# NOTE(review): with dplyr attached, union() on data frames also drops
# duplicated rows - confirm that is acceptable here.
cnv_filtered_CTL <- filter(cnv_filtered_all, aff == 0)
cnv_filtered_PD_y50 <- filter(cnv_filtered_all, aff == 1 & AAO <= 50)
cnv_filtered_young_onset <- union(cnv_filtered_PD_y50, cnv_filtered_CTL)
rm(cnv_filtered_CTL, cnv_filtered_PD_y50)

# Row flag: 1 when the row has a Copy_Number value, 0 when it is missing.
cnv_filtered_young_onset$CNV_status <- ifelse(is.na(cnv_filtered_young_onset$Copy_Number),0,1)
cnv_filtered_young_onset$CNV_status <- as.factor(cnv_filtered_young_onset$CNV_status)
# Per-sample table: keep the first row encountered for each Sample_Name.
cnv_filtered <- cnv_filtered_young_onset[!duplicated(cnv_filtered_young_onset$Sample_Name), ]

#2.1# ALL CNVs
# Fit the adjusted model once; CNV_status is the first predictor, so its
# coefficient is element 2 (after the intercept).
burden_fit <- glm(
  aff ~ CNV_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage_EOPD <- data.frame()
glm_data_courage_EOPD[1, 1] <- "All CNVs"
# carriers / all samples among cases (aff == 1) and controls (aff == 0)
glm_data_courage_EOPD[1, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
glm_data_courage_EOPD[1, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
glm_data_courage_EOPD[1, 4] <- coef(burden_fit)[2]                    # beta
glm_data_courage_EOPD[1, 5] <- exp(coef(burden_fit)[2])               # odds ratio
glm_data_courage_EOPD[1, 6:7] <- exp(confint(burden_fit))[2, 1:2]     # 95% CI
glm_data_courage_EOPD[1, 8] <- coef(summary(burden_fit))[2, 4]        # p-value
colnames(glm_data_courage_EOPD) <- c("CNV_group","Cases","Controls","beta","OR","ci2.5","ci97.5","glm_p.value")
rm(burden_fit)

##combine DEL status in all samples
# A sample is a deletion carrier when at least one of its rows has
# DEL_status == 1.
carrier_ids <- unique(cnv_filtered_young_onset$Sample_Name[which(cnv_filtered_young_onset$DEL_status == 1)])
##update for DEL status
cnv_filtered <- cnv_filtered_young_onset[!duplicated(cnv_filtered_young_onset$Sample_Name), ]
cnv_filtered$DEL_status <- as.factor(ifelse(cnv_filtered$Sample_Name %in% carrier_ids, 1, 0))
rm(carrier_ids)

#2.2# courage DEL CNVs
# Fit the adjusted model once; DEL_status is the first predictor, so its
# coefficient is element 2 (after the intercept).
burden_fit <- glm(
  aff ~ DEL_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage_EOPD[2, 1] <- "All Deletions"
# carriers / all samples among cases (aff == 1) and controls (aff == 0)
glm_data_courage_EOPD[2, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & DEL_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
glm_data_courage_EOPD[2, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & DEL_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
glm_data_courage_EOPD[2, 4] <- coef(burden_fit)[2]                    # beta
glm_data_courage_EOPD[2, 5] <- exp(coef(burden_fit)[2])               # odds ratio
glm_data_courage_EOPD[2, 6:7] <- exp(confint(burden_fit))[2, 1:2]     # 95% CI
glm_data_courage_EOPD[2, 8] <- coef(summary(burden_fit))[2, 4]        # p-value
rm(burden_fit)

##combine DUP status in all samples
# A sample is a duplication carrier when at least one of its rows has
# DUP_status == 1.
carrier_ids <- unique(cnv_filtered_young_onset$Sample_Name[which(cnv_filtered_young_onset$DUP_status == 1)])
##update for DUP status
cnv_filtered <- cnv_filtered_young_onset[!duplicated(cnv_filtered_young_onset$Sample_Name), ]
cnv_filtered$DUP_status <- as.factor(ifelse(cnv_filtered$Sample_Name %in% carrier_ids, 1, 0))
rm(carrier_ids)

#2.3# courage DUP CNVs
# Fit the adjusted model once; DUP_status is the first predictor, so its
# coefficient is element 2 (after the intercept).
burden_fit <- glm(
  aff ~ DUP_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage_EOPD[3, 1] <- "All Duplications"
# carriers / all samples among cases (aff == 1) and controls (aff == 0)
glm_data_courage_EOPD[3, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & DUP_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
glm_data_courage_EOPD[3, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & DUP_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
glm_data_courage_EOPD[3, 4] <- coef(burden_fit)[2]                    # beta
glm_data_courage_EOPD[3, 5] <- exp(coef(burden_fit)[2])               # odds ratio
glm_data_courage_EOPD[3, 6:7] <- exp(confint(burden_fit))[2, 1:2]     # 95% CI
glm_data_courage_EOPD[3, 8] <- coef(summary(burden_fit))[2, 4]        # p-value
rm(burden_fit)

#2.4# CNV in any genes: exonic regions
PD_genes <- c("LRRK2;","SNCA;","PRKN;","PINK1;","PARK7;","VPS35;")
# Row flag: 0 when Func.refGene is "intergenic" or missing, or when
# Gene.refGene equals one of PD_genes; 1 otherwise.
# NOTE(review): `%in%` is an exact match against entries that end in ";" -
# confirm this is the Gene.refGene format intended to be excluded.
cnv_filtered_young_onset$CNV_status <- ifelse(cnv_filtered_young_onset$Func.refGene == "intergenic" | is.na(cnv_filtered_young_onset$Func.refGene) | cnv_filtered_young_onset$Gene.refGene %in% PD_genes, 0, 1)
cnv_filtered_young_onset$CNV_status <- as.factor(cnv_filtered_young_onset$CNV_status)
##combine coding genes status in all samples
# A sample is a carrier when at least one of its rows is flagged 1.
carrier_ids <- unique(cnv_filtered_young_onset$Sample_Name[which(cnv_filtered_young_onset$CNV_status == 1)])
##update for coding genes status
cnv_filtered <- cnv_filtered_young_onset[!duplicated(cnv_filtered_young_onset$Sample_Name), ]
cnv_filtered$CNV_status <- as.factor(ifelse(cnv_filtered$Sample_Name %in% carrier_ids, 1, 0))
rm(carrier_ids)

# Fit the adjusted model once; CNV_status is the first predictor, so its
# coefficient is element 2 (after the intercept).
burden_fit <- glm(
  aff ~ CNV_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage_EOPD[4, 1] <- "Protein-coding genes"
# carriers / all samples among cases (aff == 1) and controls (aff == 0)
glm_data_courage_EOPD[4, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
glm_data_courage_EOPD[4, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
glm_data_courage_EOPD[4, 4] <- coef(burden_fit)[2]                    # beta
glm_data_courage_EOPD[4, 5] <- exp(coef(burden_fit)[2])               # odds ratio
glm_data_courage_EOPD[4, 6:7] <- exp(confint(burden_fit))[2, 1:2]     # 95% CI
glm_data_courage_EOPD[4, 8] <- coef(summary(burden_fit))[2, 4]        # p-value
rm(burden_fit)

#2.5# CNV in PD genes
# The PD-gene CNV table is read from sheet 2 of a spreadsheet here, replacing
# any cnv_filtered_PDgenes object already in the workspace.
# NOTE(review): this is an absolute, user-specific path - the script only
# runs on a machine that has this file at this location.
cnv_filtered_PDgenes <- readxl::read_xlsx("~/Documents/Projects/COURAGE_PD/CNV/draft/CNV_in_PDgenes.xlsx", sheet = 2)
# Subset restricted to young-onset samples. The flag below is computed only
# for young-onset rows, so matching against the full table marks the same
# samples.
cnv_filtered_PDgenes_EOPD <- filter(cnv_filtered_PDgenes, cnv_filtered_PDgenes$Sample_Name %in% cnv_filtered_young_onset$Sample_Name)
cnv_filtered_young_onset$CNV_status <- ifelse( cnv_filtered_young_onset$Sample_Name %in% cnv_filtered_PDgenes$Sample_Name, 1, 0)
cnv_filtered_young_onset$CNV_status <- as.factor(cnv_filtered_young_onset$CNV_status)

##combine PD genes status in all samples
carrier_ids <- unique(cnv_filtered_young_onset$Sample_Name[which(cnv_filtered_young_onset$CNV_status == 1)])

##update for PD genes status
cnv_filtered <- cnv_filtered_young_onset[!duplicated(cnv_filtered_young_onset$Sample_Name), ]
cnv_filtered$CNV_status <- as.factor(ifelse(cnv_filtered$Sample_Name %in% carrier_ids, 1, 0))
rm(carrier_ids)

# Fit the adjusted model once; CNV_status is the first predictor, so its
# coefficient is element 2 (after the intercept).
burden_fit <- glm(
  aff ~ CNV_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage_EOPD[5, 1] <- "PD genes (exonic+intronic)"
# carriers / all samples among cases (aff == 1) and controls (aff == 0)
glm_data_courage_EOPD[5, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
glm_data_courage_EOPD[5, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
glm_data_courage_EOPD[5, 4] <- coef(burden_fit)[2]                    # beta
glm_data_courage_EOPD[5, 5] <- exp(coef(burden_fit)[2])               # odds ratio
glm_data_courage_EOPD[5, 6:7] <- exp(confint(burden_fit))[2, 1:2]     # 95% CI
glm_data_courage_EOPD[5, 8] <- coef(summary(burden_fit))[2, 4]        # p-value
rm(burden_fit)

#2.6# exonic CNV in PD genes
# Restrict the PD-gene CNV rows to those annotated Func.refGene == "exonic".
cnv_filtered_PDgenes_exon <- filter(cnv_filtered_PDgenes, cnv_filtered_PDgenes$Func.refGene == "exonic")
# Flag every young-onset row of a sample that has at least one such CNV.
cnv_filtered_young_onset$CNV_status <- ifelse( cnv_filtered_young_onset$Sample_Name %in% cnv_filtered_PDgenes_exon$Sample_Name, 1, 0)
cnv_filtered_young_onset$CNV_status <- as.factor(cnv_filtered_young_onset$CNV_status)

##combine PD genes status in all samples
carrier_ids <- unique(cnv_filtered_young_onset$Sample_Name[which(cnv_filtered_young_onset$CNV_status == 1)])

##update for PD genes status
cnv_filtered <- cnv_filtered_young_onset[!duplicated(cnv_filtered_young_onset$Sample_Name), ]
cnv_filtered$CNV_status <- as.factor(ifelse(cnv_filtered$Sample_Name %in% carrier_ids, 1, 0))
rm(carrier_ids, cnv_filtered_PDgenes_exon)

# Fit the adjusted model once; CNV_status is the first predictor, so its
# coefficient is element 2 (after the intercept).
burden_fit <- glm(
  aff ~ CNV_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage_EOPD[6, 1] <- "PD genes (exonic)"
# carriers / all samples among cases (aff == 1) and controls (aff == 0)
glm_data_courage_EOPD[6, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
glm_data_courage_EOPD[6, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
glm_data_courage_EOPD[6, 4] <- coef(burden_fit)[2]                    # beta
glm_data_courage_EOPD[6, 5] <- exp(coef(burden_fit)[2])               # odds ratio
glm_data_courage_EOPD[6, 6:7] <- exp(confint(burden_fit))[2, 1:2]     # 95% CI
glm_data_courage_EOPD[6, 8] <- coef(summary(burden_fit))[2, 4]        # p-value
rm(burden_fit)

#2.7# CNV in PRKN genes
# Keep the PD-gene CNV rows whose `PD genes` column equals "PRKN".
cnv_filtered_PRKN <- filter(cnv_filtered_PDgenes, cnv_filtered_PDgenes$`PD genes` == "PRKN")
# Flag every young-onset row of a sample that has at least one PRKN CNV.
cnv_filtered_young_onset$CNV_status <- ifelse( cnv_filtered_young_onset$Sample_Name %in% cnv_filtered_PRKN$Sample_Name, 1, 0)
cnv_filtered_young_onset$CNV_status <- as.factor(cnv_filtered_young_onset$CNV_status)

##combine PRKN status in all samples
carrier_ids <- unique(cnv_filtered_young_onset$Sample_Name[which(cnv_filtered_young_onset$CNV_status == 1)])

##update for PRKN status
cnv_filtered <- cnv_filtered_young_onset[!duplicated(cnv_filtered_young_onset$Sample_Name), ]
cnv_filtered$CNV_status <- as.factor(ifelse(cnv_filtered$Sample_Name %in% carrier_ids, 1, 0))
rm(carrier_ids, cnv_filtered_PRKN)

# Fit the adjusted model once; CNV_status is the first predictor, so its
# coefficient is element 2 (after the intercept).
burden_fit <- glm(
  aff ~ CNV_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage_EOPD[7, 1] <- "PRKN"
# carriers / all samples among cases (aff == 1) and controls (aff == 0)
glm_data_courage_EOPD[7, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
glm_data_courage_EOPD[7, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
glm_data_courage_EOPD[7, 4] <- coef(burden_fit)[2]                    # beta
glm_data_courage_EOPD[7, 5] <- exp(coef(burden_fit)[2])               # odds ratio
glm_data_courage_EOPD[7, 6:7] <- exp(confint(burden_fit))[2, 1:2]     # 95% CI
glm_data_courage_EOPD[7, 8] <- coef(summary(burden_fit))[2, 4]        # p-value
rm(burden_fit)

#2.8# CNV longer than 1 Mb
# Row flag: 1 when Length_bp > 1,000,000; 0 otherwise, including rows with a
# missing Length_bp. The flag is left numeric on cnv_filtered_young_onset.
cnv_filtered_young_onset$CNV_status <- ifelse(cnv_filtered_young_onset$Length_bp <= 1000000,0,1)
cnv_filtered_young_onset$CNV_status[is.na(cnv_filtered_young_onset$CNV_status)] <- 0
# A sample is a carrier when at least one of its rows is flagged 1.
carrier_ids <- unique(cnv_filtered_young_onset$Sample_Name[which(cnv_filtered_young_onset$CNV_status == 1)])
##update for long CNV status
cnv_filtered <- cnv_filtered_young_onset[!duplicated(cnv_filtered_young_onset$Sample_Name), ]
cnv_filtered$CNV_status <- as.factor(ifelse(cnv_filtered$Sample_Name %in% carrier_ids, 1, 0))
rm(carrier_ids)

# Fit the adjusted model once; CNV_status is the first predictor, so its
# coefficient is element 2 (after the intercept).
burden_fit <- glm(
  aff ~ CNV_status + sex + age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered,
  family = binomial(link = "logit")
)
glm_data_courage_EOPD[8, 1] <- "CNV > 1Mb"
# carriers / all samples among cases (aff == 1) and controls (aff == 0)
glm_data_courage_EOPD[8, 2] <- paste0(
  nrow(filter(cnv_filtered, aff == 1 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 1))
)
glm_data_courage_EOPD[8, 3] <- paste0(
  nrow(filter(cnv_filtered, aff == 0 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered, aff == 0))
)
glm_data_courage_EOPD[8, 4] <- coef(burden_fit)[2]                    # beta
glm_data_courage_EOPD[8, 5] <- exp(coef(burden_fit)[2])               # odds ratio
glm_data_courage_EOPD[8, 6:7] <- exp(confint(burden_fit))[2, 1:2]     # 95% CI
glm_data_courage_EOPD[8, 8] <- coef(summary(burden_fit))[2, 4]        # p-value
rm(burden_fit)

# Tag the table with its cohort label and fix the category order (reversed
# row order becomes the factor level order).
glm_data_courage_EOPD$group <- "EOPD"
glm_data_courage_EOPD$CNV_group <- factor(
  glm_data_courage_EOPD$CNV_group,
  levels = rev(glm_data_courage_EOPD$CNV_group)
)
# FDR-adjust the p-values, then round them by formatting in scientific
# notation with digits = 2 and converting the text back to numeric.
glm_data_courage_EOPD$FDR_pvalue <- as.numeric(
  scales::scientific(
    as.numeric(p.adjust(glm_data_courage_EOPD$glm_p.value, "fdr")),
    digits = 2
  )
)

##Merge ALL and EOPD dataset
# Stack both result tables, order rows by first appearance of each CNV group
# so the two cohorts sit next to each other, then add two label columns:
#   sign  - "*" when the FDR p-value is below 0.05, otherwise ""
#   sign2 - the FDR p-value as text when below 0.05, otherwise " "
glm_data <- rbind(glm_data_courage, glm_data_courage_EOPD) %>%
  arrange(factor(CNV_group, levels = unique(CNV_group))) %>%
  mutate(
    sign = ifelse(as.numeric(FDR_pvalue) < 0.05, "*", ""),
    sign2 = ifelse(as.numeric(FDR_pvalue) < 0.05, paste0(FDR_pvalue), " ")
  )

##FOREST PLOT
# Middle panel: odds ratios with 95% CI bars per CNV group, coloured by
# cohort and dodged vertically, on a log10 x axis with a dashed reference
# line at OR = 1. The y-axis text is blanked.
p_middle <- ggplot(
  data = glm_data,
  aes(x = OR, y = CNV_group, xmin = ci2.5, xmax = ci97.5, color = group)
) +
  theme_classic() +
  geom_point(position = position_dodge(width = 0.5), size = 3) +
  geom_errorbarh(
    aes(xmin = ci2.5, xmax = ci97.5),
    height = 0.2,
    position = position_dodge(width = 0.5)
  ) +
  scale_x_continuous(trans = "log10", breaks = 1:5) +
  geom_vline(xintercept = 1, linetype = "dashed") +
  labs(x = "Odds Ratio (95% Confidence Interval)") +
  coord_cartesian(ylim = c(1, 8), xlim = c(0.5, 4.75)) +
  annotate("text", x = 0.8, y = 8, label = "") +
  scale_color_manual(values = c("#00BFC4", "#F8766D")) +
  theme(
    # y axis is drawn without text, line, ticks or title
    axis.text.y = element_blank(),
    axis.line.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_text(face = "bold", size = 14),
    axis.title.x = element_text(size = 16, face = "bold"),
    # legend anchored in the top-right corner, without a title
    legend.position = c(.95, .95),
    legend.justification = c("right", "top"),
    legend.text = element_text(size = 16, face = "bold"),
    legend.title = element_blank()
  ) +
  # significance star and FDR p-value text next to each point
  geom_text(
    aes(x = OR, label = sign),
    hjust = 0, vjust = -0.5, size = 8, show.legend = FALSE
  ) +
  geom_text(
    aes(x = OR, label = sign2),
    hjust = -0.35, vjust = -1.5, size = 5, show.legend = FALSE
  )
p_middle

##add titles in dataframe
# Build a display copy of the EOPD table with the column names prepended as
# the first row.
glm_data_courage_EOPD_2 <- glm_data_courage_EOPD
glm_data_courage_EOPD_3 <- t(as.data.frame(colnames(glm_data_courage_EOPD)))
rownames(glm_data_courage_EOPD_3) <- NULL
colnames(glm_data_courage_EOPD_3) <- colnames(glm_data_courage_EOPD)
glm_data_courage_EOPD_2 <- rbind(glm_data_courage_EOPD_3, glm_data_courage_EOPD_2)
rm(glm_data_courage_EOPD_3)
# Replace two header cells with display labels. Columns are addressed by name:
# `group` is added to the table before `FDR_pvalue`, so position 9 is `group`
# and the FDR header cell is not in column 9.
glm_data_courage_EOPD_2[1, "CNV_group"] <- "CNV group"
glm_data_courage_EOPD_2[1, "FDR_pvalue"] <- "FDR p-value"
glm_data_courage_EOPD_2$CNV_group <- factor(glm_data_courage_EOPD_2$CNV_group, levels = rev(glm_data_courage_EOPD_2$CNV_group))
# Show an adjusted p-value of exactly 1 as "1.00".
glm_data_courage_EOPD_2$FDR_pvalue <- ifelse(glm_data_courage_EOPD_2$FDR_pvalue == 1,"1.00",glm_data_courage_EOPD_2$FDR_pvalue)

##add titles in dataframe
# Table behind the left text panel: a header row holding the column names,
# followed by the merged results.
glm_data_left <- matrix(
  colnames(glm_data),
  nrow = 1,
  dimnames = list(NULL, colnames(glm_data))
)
glm_data_left <- rbind(glm_data_left, glm_data)
rownames(glm_data_left) <- NULL
glm_data_left[1, 1] <- "CNV group"
glm_data_left$CNV_group <- factor(
  glm_data_left$CNV_group,
  levels = rev(unique(glm_data_left$CNV_group))
)
# Case counts split into one column per cohort; rows of the other cohort are
# left blank and row 1 carries the column title.
glm_data_left$Cases_all <- replace(
  with(glm_data_left, ifelse(group == "ALL", Cases, "")),
  1, "All cases"
)
glm_data_left$Cases_EOPD <- replace(
  with(glm_data_left, ifelse(group == "EOPD", Cases, "")),
  1, "EOPD cases"
)

# Left panel: four left-aligned text columns at x = 0..3 (CNV group in bold,
# then Controls, Cases_all and Cases_EOPD) on an empty theme.
p_left <- ggplot(data = glm_data_left, aes(y = CNV_group)) +
  geom_text(
    aes(x = 0, label = CNV_group),
    hjust = 0, fontface = "bold", size = 6
  ) +
  geom_text(
    aes(x = 1, label = Controls),
    hjust = 0, size = 6, fontface = "plain"
  ) +
  geom_text(
    aes(x = 2, label = Cases_all),
    hjust = 0, size = 6, fontface = "plain"
  ) +
  geom_text(
    aes(x = 3, label = Cases_EOPD),
    hjust = 0, size = 6, fontface = "plain"
  ) +
  theme_void() +
  coord_cartesian(xlim = c(0, 4))

# Combine the text panel and the forest plot side by side and save the result.
# area() and plot_layout() are patchwork functions; they are namespace-
# qualified because patchwork is not attached by a library() call in this
# script. Calling patchwork::area() loads the patchwork namespace, which is
# what lets `+` combine two ggplot objects.
layout <- c(
  patchwork::area(t = 0, l = 0, b = 30, r = 4), # left plot
  patchwork::area(t = 4.5, l = 5, b = 30, r = 8) # middle plot
)

forest_plot <- p_left + p_middle + patchwork::plot_layout(design = layout)
forest_plot
# Pass the plot explicitly so the saved file does not depend on whichever
# plot was displayed last.
ggsave("forest-plot.tiff", plot = forest_plot, width = 20, height = 5)