################################################################################
##Genome-wide burden analysis of copy number variants confirms PRKN as the predominant 
##driver in Parkinson’s disease
################################################################################

library(dplyr)
library(ggplot2)

##################################################
##Filter CNVs
##################################################
# Read the annotated CNV calls for ALL samples. The file is expected to have a
# header row; read.table() splits fields on whitespace by default.
cnv_data <- read.table("cnv_data.txt", header = TRUE)

# Rule 1: keep CNVs with density above 1e-04, more than 19 probes (i.e. at
# least 20 for integer counts) and a length above 19999 bp (i.e. at least
# 20 kb for integer lengths).
filter_step_1 <- dplyr::filter(
  cnv_data,
  density > 1e-04 & No_Probes > 19 & Length_bp > 19999
)

# Rule 2: CNVs longer than 1000 kb are kept regardless of density, as long as
# they are supported by more than 19 probes (at least 20).
filter_step_2 <- dplyr::filter(
  cnv_data,
  No_Probes > 19 & Length_bp > 1000000
)

# A CNV passes if it satisfies either rule. dplyr::union() also collapses
# rows that are exact duplicates of each other.
cnv_data_filtered <- dplyr::union(filter_step_1, filter_step_2)

# Remove CNVs whose quality score lies strictly between -0.5 and 0.5; scores
# equal to -0.5 or 0.5 are kept. Rows with a missing Quality_Score are dropped
# as well, because dplyr::filter() discards rows where the condition is NA.
cnv_data_filtered <- dplyr::filter(
  cnv_data_filtered,
  Quality_Score <= -0.5 | Quality_Score >= 0.5
)
