################################################################################
##Genome-wide association study of copy number variations in Parkinson's disease
################################################################################

library(dplyr)

##################################################
##Filter CNVs step 1
##################################################
# Load the annotated CNV calls (ALL samples, or EOPD + CTL only).
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
cnv_data <- read.table("cnv_data.txt", header = TRUE)

# Rule A: keep CNVs with density (No_Probes / Length_bp) > 1e-04,
# at least 20 probes and a length of at least 20 kb.
filter_step_1 <- dplyr::filter(
  cnv_data,
  density > 1e-04 & No_Probes > 19 & Length_bp > 19999
)

# Rule B: CNVs longer than 1000 kb are kept with at least 20 probes even when
# they do not pass the density cut-off of rule A.
filter_step_2 <- dplyr::filter(cnv_data, No_Probes > 19 & Length_bp > 1000000)

# A CNV is retained when it satisfies either rule. dplyr::union() also removes
# duplicated rows, so a CNV passing both rules is listed only once.
cnv_data_filtered <- dplyr::union(filter_step_1, filter_step_2)

# Remove CNVs with a low PennCNV confidence score (Max_Log_BF < 10).
# The column is referenced through the data mask rather than via
# `cnv_data_filtered$...`; rows where Max_Log_BF is NA are dropped as well.
cnv_data_filtered <- dplyr::filter(cnv_data_filtered, Max_Log_BF >= 10)

##################################################
##Filter CNVs step 2
##################################################
# Rare-CNV threshold: 1% of the number of distinct samples present in
# cnv_data_filtered (1% of 11926 individuals ~ 120 events).
# NOTE(review): the sample count is taken from the already filtered table, so
# samples without any remaining CNV are not counted -- confirm that this is the
# intended denominator.
freq_th <- length(unique(cnv_data_filtered$Sample_Name)) / 100

# Keep CNVs that satisfy all of the following:
#   * PLINK frequency at or below the 1% threshold,
#   * DGV.GS <= 50 (DGV overlap 0.5; presumably stored as a percentage),
#   * DECIPHER <= 50 (DECIPHER overlap 0.5; presumably stored as a percentage).
# Comma-separated conditions in filter() are combined with AND, and rows where
# any condition evaluates to NA are dropped. Columns are referenced through the
# data mask instead of `cnv_data_filtered$...`.
cnv_data_filtered <- dplyr::filter(
  cnv_data_filtered,
  plink_freq <= freq_th,
  DGV.GS <= 50,
  DECIPHER <= 50
)
