################################################################################
##Genome-wide burden analysis of copy number variants confirms PRKN as the predominant 
##driver in Parkinson’s disease
################################################################################
library(dplyr)
library(ggplot2)

##################################################
## Burden test for validated CNVs in PD-related genes
##################################################

##upload formatted demographic files for samples with ctl >= 60 yo
##NOTE(review): the file name ends in .xlsx, but read.table() parses delimited text and cannot read an
##Excel workbook; confirm the file really is whitespace-delimited text (or export it to text first)
demo_file <- read.table("ProtectMove_CNV_paper_data.xlsx", header = TRUE)

##MLPA_status in demo_file is a factor of 2 status: 1 (MLPA/qPCR Validated CNV in PD genes) and 0 (No validated CNV)
##PRKN_status in demo_file is a factor of 2 status: 1 (MLPA/qPCR Validated CNV in PRKN) and 0 (No validated CNV)

##print carrier counts per case/control group for both status columns
demo_file %>% group_by(Case_Control) %>% count(MLPA_status)
demo_file %>% group_by(Case_Control) %>% count(PRKN_status)

##Burden test of CNVs in PD genes using glm
##create a dataframe for glm results; columns are filled by position and named after the first row
glm_data_PDgenes <- data.frame()
glm_data_PDgenes[1, 1] <- "PD genes"
##carriers / total, in cases (Case_Control == 1) and in controls (Case_Control == 0)
glm_data_PDgenes[1, 2] <- paste0(
  nrow(filter(demo_file, Case_Control == 1 & MLPA_status == 1)), "/",
  nrow(filter(demo_file, Case_Control == 1))
)
glm_data_PDgenes[1, 3] <- paste0(
  nrow(filter(demo_file, Case_Control == 0 & MLPA_status == 1)), "/",
  nrow(filter(demo_file, Case_Control == 0))
)
##fit the logistic model once and reuse it for the OR, the CI and the p-value
fit_pd_genes <- glm(
  Case_Control ~ MLPA_status + Age + Sex + PC1 + PC2 + PC3 + PC4 + PC5,
  data = demo_file, family = binomial(link = "logit")
)
##OR: the carrier status is the first term of the formula, so it is coefficient 2 (after the intercept)
glm_data_PDgenes[1, 4] <- exp(coef(fit_pd_genes)[2])
##CI95
glm_data_PDgenes[1, 5:6] <- exp(confint(fit_pd_genes))[2, 1:2]
##glm P-value (4th column of the coefficient table)
glm_data_PDgenes[1, 7] <- coef(summary(fit_pd_genes))[2, 4]
colnames(glm_data_PDgenes) <- c("CNV_group", "Cases", "Controls", "OR", "ci2.5", "ci97.5", "glm_p.value")

##Burden test of CNVs in PRKN using glm
glm_data_PDgenes[2, 1] <- "PRKN"
glm_data_PDgenes[2, 2] <- paste0(
  nrow(filter(demo_file, Case_Control == 1 & PRKN_status == 1)), "/",
  nrow(filter(demo_file, Case_Control == 1))
)
glm_data_PDgenes[2, 3] <- paste0(
  nrow(filter(demo_file, Case_Control == 0 & PRKN_status == 1)), "/",
  nrow(filter(demo_file, Case_Control == 0))
)
fit_prkn <- glm(
  Case_Control ~ PRKN_status + Sex + Age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = demo_file, family = binomial(link = "logit")
)
glm_data_PDgenes[2, 4] <- exp(coef(fit_prkn)[2])
glm_data_PDgenes[2, 5:6] <- exp(confint(fit_prkn))[2, 1:2]
glm_data_PDgenes[2, 7] <- coef(summary(fit_prkn))[2, 4]
##drop the temporary model objects
rm(fit_pd_genes, fit_prkn)

##AAO statistics
##restrict to cases (Case_Control == 1), then describe and compare AAO between PRKN_status groups
pd_cases <- demo_file[demo_file$Case_Control == 1, ]
pd_cases %>%
  group_by(PRKN_status) %>%
  summarise(mean(AAO), sd(AAO), plotrix::std.error(AAO))
wilcox.test(AAO ~ PRKN_status, data = pd_cases)
rm(pd_cases)

##extract Early-onset PD (EOPD: cases with AAO <= 50) and CTL
demo_file_EOPD <- filter(demo_file, Case_Control == 1 & AAO <= 50)
demo_file_HC <- filter(demo_file, Case_Control == 0)
demo_file_EOPD <- union(demo_file_EOPD, demo_file_HC)
demo_file_EOPD %>% count(Case_Control)
demo_file_EOPD %>% group_by(Case_Control) %>% count(Sex)
demo_file_EOPD %>%
  filter(!is.na(Age)) %>%
  group_by(Case_Control) %>%
  summarise(mean(Age), sd(Age), plotrix::std.error(Age))
##the standard error here must be computed on AAO (it was previously computed on Age)
demo_file_EOPD %>%
  filter(!is.na(AAO)) %>%
  group_by(Case_Control) %>%
  summarise(mean(AAO), sd(AAO), plotrix::std.error(AAO))

##Burden test of CNVs in PD genes and PRKN using glm for EOPD
demo_file_EOPD %>% count(Case_Control)
demo_file_EOPD %>% group_by(Case_Control) %>% count(MLPA_status)
glm_data_PDgenes[3, 1] <- "PD genes in EOPD"
glm_data_PDgenes[3, 2] <- paste0(
  nrow(filter(demo_file_EOPD, Case_Control == 1 & MLPA_status == 1)), "/",
  nrow(filter(demo_file_EOPD, Case_Control == 1))
)
glm_data_PDgenes[3, 3] <- paste0(
  nrow(filter(demo_file_EOPD, Case_Control == 0 & MLPA_status == 1)), "/",
  nrow(filter(demo_file_EOPD, Case_Control == 0))
)
##fit once; coefficient 2 is the carrier status (first term after the intercept)
fit_eopd_pd_genes <- glm(
  Case_Control ~ MLPA_status + Sex + Age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = demo_file_EOPD, family = binomial(link = "logit")
)
glm_data_PDgenes[3, 4] <- exp(coef(fit_eopd_pd_genes)[2])
glm_data_PDgenes[3, 5:6] <- exp(confint(fit_eopd_pd_genes))[2, 1:2]
glm_data_PDgenes[3, 7] <- coef(summary(fit_eopd_pd_genes))[2, 4]

demo_file_EOPD %>% group_by(Case_Control) %>% count(PRKN_status)
glm_data_PDgenes[4, 1] <- "PRKN in EOPD"
glm_data_PDgenes[4, 2] <- paste0(
  nrow(filter(demo_file_EOPD, Case_Control == 1 & PRKN_status == 1)), "/",
  nrow(filter(demo_file_EOPD, Case_Control == 1))
)
glm_data_PDgenes[4, 3] <- paste0(
  nrow(filter(demo_file_EOPD, Case_Control == 0 & PRKN_status == 1)), "/",
  nrow(filter(demo_file_EOPD, Case_Control == 0))
)
fit_eopd_prkn <- glm(
  Case_Control ~ PRKN_status + Sex + Age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = demo_file_EOPD, family = binomial(link = "logit")
)
glm_data_PDgenes[4, 4] <- exp(coef(fit_eopd_prkn)[2])
glm_data_PDgenes[4, 5:6] <- exp(confint(fit_eopd_prkn))[2, 1:2]
glm_data_PDgenes[4, 7] <- coef(summary(fit_eopd_prkn))[2, 4]
rm(demo_file_EOPD, demo_file_HC, fit_eopd_pd_genes, fit_eopd_prkn)

##extract Late-onset PD (LOPD: cases with AAO > 50) and CTL
demo_file_LOPD <- filter(demo_file, Case_Control == 1 & AAO > 50)
demo_file_HC <- filter(demo_file, Case_Control == 0)
demo_file_LOPD <- union(demo_file_LOPD, demo_file_HC)
demo_file_LOPD %>% count(Case_Control)
demo_file_LOPD %>% group_by(Case_Control) %>% count(Sex)
demo_file_LOPD %>%
  filter(!is.na(Age)) %>%
  group_by(Case_Control) %>%
  summarise(mean(Age), sd(Age), plotrix::std.error(Age))
##the standard error here must be computed on AAO (it was previously computed on Age)
demo_file_LOPD %>%
  filter(!is.na(AAO)) %>%
  group_by(Case_Control) %>%
  summarise(mean(AAO), sd(AAO), plotrix::std.error(AAO))

##Burden test of CNVs in PD genes and PRKN using glm for LOPD
demo_file_LOPD %>% count(Case_Control)
demo_file_LOPD %>% group_by(Case_Control) %>% count(MLPA_status)
glm_data_PDgenes[5, 1] <- "PD genes in LOPD"
glm_data_PDgenes[5, 2] <- paste0(
  nrow(filter(demo_file_LOPD, Case_Control == 1 & MLPA_status == 1)), "/",
  nrow(filter(demo_file_LOPD, Case_Control == 1))
)
glm_data_PDgenes[5, 3] <- paste0(
  nrow(filter(demo_file_LOPD, Case_Control == 0 & MLPA_status == 1)), "/",
  nrow(filter(demo_file_LOPD, Case_Control == 0))
)
##fit once; coefficient 2 is the carrier status (first term after the intercept)
fit_lopd_pd_genes <- glm(
  Case_Control ~ MLPA_status + Sex + Age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = demo_file_LOPD, family = binomial(link = "logit")
)
glm_data_PDgenes[5, 4] <- exp(coef(fit_lopd_pd_genes)[2])
glm_data_PDgenes[5, 5:6] <- exp(confint(fit_lopd_pd_genes))[2, 1:2]
glm_data_PDgenes[5, 7] <- coef(summary(fit_lopd_pd_genes))[2, 4]

demo_file_LOPD %>% group_by(Case_Control) %>% count(PRKN_status)
glm_data_PDgenes[6, 1] <- "PRKN in LOPD"
glm_data_PDgenes[6, 2] <- paste0(
  nrow(filter(demo_file_LOPD, Case_Control == 1 & PRKN_status == 1)), "/",
  nrow(filter(demo_file_LOPD, Case_Control == 1))
)
glm_data_PDgenes[6, 3] <- paste0(
  nrow(filter(demo_file_LOPD, Case_Control == 0 & PRKN_status == 1)), "/",
  nrow(filter(demo_file_LOPD, Case_Control == 0))
)
fit_lopd_prkn <- glm(
  Case_Control ~ PRKN_status + Sex + Age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = demo_file_LOPD, family = binomial(link = "logit")
)
glm_data_PDgenes[6, 4] <- exp(coef(fit_lopd_prkn)[2])
glm_data_PDgenes[6, 5:6] <- exp(confint(fit_lopd_prkn))[2, 1:2]
glm_data_PDgenes[6, 7] <- coef(summary(fit_lopd_prkn))[2, 4]
rm(demo_file_LOPD, demo_file_HC, fit_lopd_pd_genes, fit_lopd_prkn)

##################################################
## PLOT ORs of PDgenes CNVs 
##################################################
##FDR-adjust the six glm p-values and turn them into text labels for the plot
glm_data_PDgenes$FDR <- p.adjust(glm_data_PDgenes$glm_p.value, "fdr")
##format() has no `format` argument (that belongs to formatC), so request scientific notation explicitly
glm_data_PDgenes$FDR <- format(glm_data_PDgenes$FDR, scientific = TRUE, digits = 2)
# glm_data_PDgenes$sign <- "*"
##keep rows in their order of first appearance
glm_data_PDgenes <- arrange(glm_data_PDgenes, factor(glm_data_PDgenes$CNV_group, levels = unique(glm_data_PDgenes$CNV_group)))
##cohort label for each row, in row order (full cohort, EOPD, LOPD); used for the colour legend
glm_data_PDgenes$group <- c("PD", "PD", "EOPD", "EOPD", "LOPD", "LOPD")
glm_data_PDgenes$group <- factor(glm_data_PDgenes$group, levels = rev(unique(glm_data_PDgenes$group)))

##FOREST PLOT
##points are ORs, horizontal bars are the 95% CI, on a log10 x axis with a dashed reference line at OR = 1
burden_plot <- ggplot(data = glm_data_PDgenes, aes(x = OR, y = CNV_group, xmin = ci2.5, xmax = ci97.5, color = group)) + theme_classic() +
  geom_point(position = position_dodge(width = 0.5), size = 3) +
  geom_errorbarh(aes(xmin = ci2.5, xmax = ci97.5), height = 0.2, position = position_dodge(width = 0.5)) +
  scale_x_continuous(trans = "log10", breaks = c(1, 3, 5, 8)) +
  geom_vline(xintercept = 1, linetype = "dashed") +
  labs(x = "Odds Ratio (95% Confidence Interval)") +
  coord_cartesian(ylim = c(1, 6), xlim = c(0.5, 7.5)) +
  annotate("text", x = 0.8, y = 8, label = "") +
  scale_color_manual(values = c("#00BFC4", "#F8766D", "darkgrey")) +
  theme(axis.text.y = element_text(size = 18, face = "bold"),
        #axis.line.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.x = element_text(face = "bold", size = 14),
        axis.title.x = element_text(size = 16, face = "bold"),
        axis.title.y = element_blank(),
        legend.position = c(.95, .95),
        legend.justification = c("right", "top"),
        legend.text = element_text(size = 16, face = "bold"),
        legend.title = element_blank()) +
  ##carrier/total counts for cases and controls, printed at fixed x positions left of the OR = 1 line
  geom_text(aes(label = Cases, y = CNV_group, x = 0.5)) +
  geom_text(aes(label = Controls, y = CNV_group, x = 0.75)) +
  ##FDR label placed just above each point
  geom_text(aes(x = OR, label = FDR), hjust = 0, vjust = -0.5, size = 6, show.legend = FALSE)
burden_plot

##################################################
## Burden tests for non-validated CNVs (5 different categories)
##################################################

##upload raw CNV files
cnv_filtered <- read.table("cnv_filtered.txt", header = TRUE)

##For burden analysis include non_carriers in every conditions
##Reformat CNV status
##keep one row per sample among the CNV calls
cnv_filtered_unique <- cnv_filtered[!duplicated(cnv_filtered$Sample_Name),]
##NOTE(review): `$cnv` relies on a column named (or uniquely prefixed by) "cnv"; confirm against cnv_filtered.txt
cnv_filtered_unique <- cnv_filtered_unique[!is.na(cnv_filtered_unique$cnv),]
##full outer merge with the demographic table so that samples without any CNV call are kept as non-carriers
cnv_filtered_unique <- merge(cnv_filtered_unique, demo_file, by = c("Sample_Name","ID","Study","Sex","Case_Control","Age","AAO","ethnicity",
                                                                    "PC1","PC2","PC3","PC4","PC5","MLPA_status","PRKN_status"), all = TRUE)
##samples with no CNV call have a missing Copy_Number after the merge and are coded 0
cnv_filtered_unique$CNV_status <- ifelse(is.na(cnv_filtered_unique$Copy_Number), 0, 1)
##columns 4 and 5 are Sex and Case_Control (merge() puts the `by` columns first, in the order given)
##NOTE(review): column 26 is presumably CNV_status; the index depends on the column count of cnv_filtered.txt - confirm
cnv_filtered_unique[,c(4,5,26)] <- lapply(cnv_filtered_unique[,c(4,5,26)], as.factor)
cnv_filtered_unique <- cnv_filtered_unique[!duplicated(cnv_filtered_unique$Sample_Name),]
# cnv_filtered_unique$Age <- ifelse(is.na(cnv_filtered_unique$Age),mean(cnv_filtered_unique$Age),cnv_filtered_unique$Age)

#1# ALL CNVs (genome-wide burden)
##results table; columns are filled by position and named after the first row
glm_data_PD <- data.frame()
glm_data_PD[1, 1] <- "All CNVs"
##carriers / total, in cases (Case_Control == 1) and in controls (Case_Control == 0)
glm_data_PD[1, 2] <- paste0(
  nrow(filter(cnv_filtered_unique, Case_Control == 1 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered_unique, Case_Control == 1))
)
glm_data_PD[1, 3] <- paste0(
  nrow(filter(cnv_filtered_unique, Case_Control == 0 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered_unique, Case_Control == 0))
)
##fit the logistic model once and reuse it for the OR, the 95% CI and the p-value
fit_all_cnv <- glm(
  Case_Control ~ CNV_status + Sex + Age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered_unique, family = binomial(link = "logit")
)
##coefficient 2 is the carrier status (first term after the intercept)
glm_data_PD[1, 4] <- exp(coef(fit_all_cnv)[2])
glm_data_PD[1, 5:6] <- exp(confint(fit_all_cnv))[2, 1:2]
glm_data_PD[1, 7] <- coef(summary(fit_all_cnv))[2, 4]
rm(fit_all_cnv)
colnames(glm_data_PD) <- c("CNV_group", "Cases", "Controls", "OR", "ci2.5", "ci97.5", "glm_p.value")

#2# PD DEL CNVs
##flag each CNV call as a deletion (Copy_Number < 2); calls with missing Copy_Number count as non-deletions
cnv_filtered$DEL_status <- ifelse(cnv_filtered$Copy_Number < 2, 1, 0)
cnv_filtered$DEL_status[is.na(cnv_filtered$DEL_status)] <- 0
cnv_filtered$DEL_status <- as.factor(cnv_filtered$DEL_status)
##combine DEL status in all samples: a sample is a carrier if at least one of its calls is a deletion
##(direct lookup, which also works when no sample carries a deletion)
del_carriers <- unique(cnv_filtered$Sample_Name[cnv_filtered$DEL_status == 1])
##update for DEL status
cnv_filtered_unique$DEL_status <- as.factor(
  ifelse(cnv_filtered_unique$Sample_Name %in% del_carriers, 1, 0)
)
rm(del_carriers)
##GLM
glm_data_PD[2, 1] <- "All CNVs - Deletions"
glm_data_PD[2, 2] <- paste0(
  nrow(filter(cnv_filtered_unique, Case_Control == 1 & DEL_status == 1)), "/",
  nrow(filter(cnv_filtered_unique, Case_Control == 1))
)
glm_data_PD[2, 3] <- paste0(
  nrow(filter(cnv_filtered_unique, Case_Control == 0 & DEL_status == 1)), "/",
  nrow(filter(cnv_filtered_unique, Case_Control == 0))
)
##fit once; coefficient 2 is the deletion status (first term after the intercept)
fit_del <- glm(
  Case_Control ~ DEL_status + Sex + Age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered_unique, family = binomial(link = "logit")
)
glm_data_PD[2, 4] <- exp(coef(fit_del)[2])
glm_data_PD[2, 5:6] <- exp(confint(fit_del))[2, 1:2]
glm_data_PD[2, 7] <- coef(summary(fit_del))[2, 4]
rm(fit_del)

#3# PD DUP CNVs
##flag each CNV call as a duplication (Copy_Number > 2); calls with missing Copy_Number count as non-duplications
cnv_filtered$DUP_status <- ifelse(cnv_filtered$Copy_Number > 2, 1, 0)
cnv_filtered$DUP_status[is.na(cnv_filtered$DUP_status)] <- 0
cnv_filtered$DUP_status <- as.factor(cnv_filtered$DUP_status)
##combine DUP status in all samples: a sample is a carrier if at least one of its calls is a duplication
##(direct lookup, which also works when no sample carries a duplication)
dup_carriers <- unique(cnv_filtered$Sample_Name[cnv_filtered$DUP_status == 1])
##update for DUP status
cnv_filtered_unique$DUP_status <- as.factor(
  ifelse(cnv_filtered_unique$Sample_Name %in% dup_carriers, 1, 0)
)
rm(dup_carriers)
##GLM
glm_data_PD[3, 1] <- "All CNVs - Duplications"
glm_data_PD[3, 2] <- paste0(
  nrow(filter(cnv_filtered_unique, Case_Control == 1 & DUP_status == 1)), "/",
  nrow(filter(cnv_filtered_unique, Case_Control == 1))
)
glm_data_PD[3, 3] <- paste0(
  nrow(filter(cnv_filtered_unique, Case_Control == 0 & DUP_status == 1)), "/",
  nrow(filter(cnv_filtered_unique, Case_Control == 0))
)
##fit once; coefficient 2 is the duplication status (first term after the intercept)
fit_dup <- glm(
  Case_Control ~ DUP_status + Sex + Age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered_unique, family = binomial(link = "logit")
)
glm_data_PD[3, 4] <- exp(coef(fit_dup)[2])
glm_data_PD[3, 5:6] <- exp(confint(fit_dup))[2, 1:2]
glm_data_PD[3, 7] <- coef(summary(fit_dup))[2, 4]
rm(fit_dup)

#4# CNV longer than 1Mb
##flag each CNV call longer than 1,000,000 bp; calls with missing Length_bp count as not long
cnv_filtered$CNV_status <- ifelse(cnv_filtered$Length_bp <= 1000000, 0, 1)
cnv_filtered$CNV_status[is.na(cnv_filtered$CNV_status)] <- 0
##a sample is a carrier if at least one of its calls is longer than 1 Mb
##(direct lookup, which also works when no sample carries such a CNV)
long_cnv_carriers <- unique(cnv_filtered$Sample_Name[cnv_filtered$CNV_status == 1])
##update for long CNV status (overwrites the previous per-sample CNV_status)
cnv_filtered_unique$CNV_status <- as.factor(
  ifelse(cnv_filtered_unique$Sample_Name %in% long_cnv_carriers, 1, 0)
)
rm(long_cnv_carriers)
##GLM
glm_data_PD[4, 1] <- "CNV > 1 Mb"
glm_data_PD[4, 2] <- paste0(
  nrow(filter(cnv_filtered_unique, Case_Control == 1 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered_unique, Case_Control == 1))
)
glm_data_PD[4, 3] <- paste0(
  nrow(filter(cnv_filtered_unique, Case_Control == 0 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered_unique, Case_Control == 0))
)
##fit once; coefficient 2 is the long-CNV status (first term after the intercept)
fit_long_cnv <- glm(
  Case_Control ~ CNV_status + Sex + Age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered_unique, family = binomial(link = "logit")
)
glm_data_PD[4, 4] <- exp(coef(fit_long_cnv)[2])
glm_data_PD[4, 5:6] <- exp(confint(fit_long_cnv))[2, 1:2]
glm_data_PD[4, 7] <- coef(summary(fit_long_cnv))[2, 4]
rm(fit_long_cnv)

#5# CNV in any genes, functions are : "intergenic","exonic","ncRNA_exonic","intronic","splicing","UTR5","upstream","downstream","ncRNA_intronic","UTR3","UTR5;UTR3"
##NOTE(review): entries carry a trailing ";" and are matched exactly with %in%; confirm Gene.refGene uses this format
PD_genes <- c("LRRK2;","SNCA;","PRKN;","PINK1;","PARK7;","VPS35;","GBA;","RAB32;","CHCHD2;")
##a call counts as genic (1) unless it is intergenic, has no annotation, or is listed in PD_genes
cnv_filtered$CNV_status <- ifelse(cnv_filtered$Func.refGene == "intergenic" | is.na(cnv_filtered$Func.refGene) | cnv_filtered$Gene.refGene %in% PD_genes, 0, 1)
cnv_filtered$CNV_status <- as.factor(cnv_filtered$CNV_status)
##combine coding genes status in all samples: a sample is a carrier if at least one of its calls is genic
##(direct lookup, which also works when no sample carries a genic CNV)
genic_carriers <- unique(cnv_filtered$Sample_Name[cnv_filtered$CNV_status == 1])
##update for coding genes status (overwrites the previous per-sample CNV_status)
cnv_filtered_unique$CNV_status <- as.factor(
  ifelse(cnv_filtered_unique$Sample_Name %in% genic_carriers, 1, 0)
)
rm(genic_carriers)
##GLM
glm_data_PD[5, 1] <- "CNVs on any genes"
glm_data_PD[5, 2] <- paste0(
  nrow(filter(cnv_filtered_unique, Case_Control == 1 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered_unique, Case_Control == 1))
)
glm_data_PD[5, 3] <- paste0(
  nrow(filter(cnv_filtered_unique, Case_Control == 0 & CNV_status == 1)), "/",
  nrow(filter(cnv_filtered_unique, Case_Control == 0))
)
##fit once; coefficient 2 is the genic-CNV status (first term after the intercept)
fit_genic <- glm(
  Case_Control ~ CNV_status + Sex + Age + PC1 + PC2 + PC3 + PC4 + PC5,
  data = cnv_filtered_unique, family = binomial(link = "logit")
)
glm_data_PD[5, 4] <- exp(coef(fit_genic)[2])
glm_data_PD[5, 5:6] <- exp(confint(fit_genic))[2, 1:2]
glm_data_PD[5, 7] <- coef(summary(fit_genic))[2, 4]
rm(fit_genic)

##################################################
## PLOT ORs of all categories
##################################################
##forest plot of the five genome-wide CNV categories: OR with 95% CI on a log10 axis, flipped so that
##categories run down the page; the dashed line marks OR = 1
fp_PD <- ggplot(data = glm_data_PD[1:5, ], aes(x = CNV_group, y = OR, ymin = ci2.5, ymax = ci97.5)) +
  geom_pointrange() +
  geom_hline(yintercept = 1, linetype = 2) +
  labs(y = "Odds Ratio (95% Confidence Interval)") +
  geom_errorbar(aes(ymin = ci2.5, ymax = ci97.5), width = 0.1, cex = 0.5) +
  scale_y_continuous(trans = "log10", breaks = c(1, 1.5, 2), limits = c(0.5, 3)) +
  theme_bw() +
  theme(axis.text.y = element_text(size = 18, face = "bold"),
        axis.text.x = element_text(face = "bold", size = 18),
        axis.title.x = element_text(size = 12, face = "bold"),
        axis.title.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.title = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  ##carrier/total labels come from the Cases and Controls columns of the plotted data
  ##(the table has no numbers_PD / numbers_CTL columns)
  geom_text(aes(label = Cases, x = CNV_group, y = 2.0)) +
  geom_text(aes(label = Controls, x = CNV_group, y = 2.75)) +
  coord_flip()
fp_PD




