################################################################################
##Genome-wide burden analysis of copy number variants confirms PRKN as the predominant 
##driver in Parkinson’s disease
################################################################################
library(dplyr)
library(ggplot2)

##################################################
## Single-gene analysis
##################################################

## Count, per gene, how many duplication rows in controls are annotated with it.
## NOTE(review): rows with DEL_status == 0 are treated as duplications here --
## confirm cnv_filtered contains no other CNV classes.
DUP <- filter(cnv_filtered, DEL_status == 0)
# One row per (annotation string, Case_Control) pair with its row count `n`
DUP <- DUP %>% group_by(Gene.refGene, Case_Control) %>% count()
# Keep controls (Case_Control == 0) and drop the Case_Control column (column 2)
DUP_ctl <- filter(as.data.frame(DUP), Case_Control == 0) %>% dplyr::select(-2)
colnames(DUP_ctl) <- c("genes", "count")
DUP_ctl <- filter(DUP_ctl, !is.na(DUP_ctl$genes))
# Strip a trailing ";" so splitting does not leave a dangling separator
DUP_ctl$genes <- gsub(";$", "", DUP_ctl$genes)

# Split every ";"-separated annotation into gene symbols and credit the row's
# count to each symbol. This is vectorised, so an empty DUP_ctl is handled
# (a `1:nrow()` loop would iterate over c(1, 0) in that case).
ctl_symbol_sets <- strsplit(DUP_ctl$genes, ";", fixed = TRUE)
ctl_symbols <- unlist(ctl_symbol_sets, use.names = FALSE)
ctl_weights <- rep(DUP_ctl$count, lengths(ctl_symbol_sets))

# Ignore empty tokens produced by doubled separators such as "A;;B"
ctl_keep <- nzchar(ctl_symbols)
ctl_symbols <- ctl_symbols[ctl_keep]
ctl_weights <- ctl_weights[ctl_keep]

if (length(ctl_symbols) > 0) {
  # reorder = FALSE keeps genes in order of first appearance
  ctl_totals <- rowsum(ctl_weights, ctl_symbols, reorder = FALSE)
  gene_counts_ctl <- data.frame(
    Gene = rownames(ctl_totals),
    Count_CTL = as.vector(ctl_totals),
    row.names = rownames(ctl_totals)
  )
} else {
  gene_counts_ctl <- data.frame(Gene = character(0), Count_CTL = integer(0))
}

##################
## Count, per gene, how many duplication rows in PD cases are annotated with it.
## The per-annotation counts must come from the same aggregation used for the
## control counts: filter cnv_filtered, then group by annotation string and
## case/control status and count rows. Without the group_by()/count() step the
## `select(-2)` and the two-column rename below would operate on raw CNV rows.
## NOTE(review): rows with DEL_status == 0 are treated as duplications here --
## confirm cnv_filtered contains no other CNV classes.
DUP <- filter(cnv_filtered, DEL_status == 0)
DUP <- DUP %>% group_by(Gene.refGene, Case_Control) %>% count()
# Keep cases (Case_Control == 1) and drop the Case_Control column (column 2)
DUP_PD <- filter(as.data.frame(DUP), Case_Control == 1) %>% dplyr::select(-2)
colnames(DUP_PD) <- c("genes", "count")
DUP_PD <- filter(DUP_PD, !is.na(DUP_PD$genes))
# Strip a trailing ";" so splitting does not leave a dangling separator
DUP_PD$genes <- gsub(";$", "", DUP_PD$genes)

# Split every ";"-separated annotation into gene symbols and credit the row's
# count to each symbol. This is vectorised, so an empty DUP_PD is handled
# (a `1:nrow()` loop would iterate over c(1, 0) in that case).
pd_symbol_sets <- strsplit(DUP_PD$genes, ";", fixed = TRUE)
pd_symbols <- unlist(pd_symbol_sets, use.names = FALSE)
pd_weights <- rep(DUP_PD$count, lengths(pd_symbol_sets))

# Ignore empty tokens produced by doubled separators such as "A;;B"
pd_keep <- nzchar(pd_symbols)
pd_symbols <- pd_symbols[pd_keep]
pd_weights <- pd_weights[pd_keep]

if (length(pd_symbols) > 0) {
  # reorder = FALSE keeps genes in order of first appearance
  pd_totals <- rowsum(pd_weights, pd_symbols, reorder = FALSE)
  gene_counts_PD <- data.frame(
    Gene = rownames(pd_totals),
    Count_PD = as.vector(pd_totals),
    row.names = rownames(pd_totals)
  )
} else {
  gene_counts_PD <- data.frame(Gene = character(0), Count_PD = integer(0))
}

## Merge gene counts in cases and controls
# Group totals used to derive the "_nc" columns (total minus count).
# NOTE(review): presumably the number of PD cases and of controls in the
# cohort -- confirm against the sample sheet.
n_total_PD <- 2364
n_total_CTL <- 2909

# Full outer join: a gene seen in only one group gets NA in the other column
gene_counts <- merge(gene_counts_PD, gene_counts_ctl, by = "Gene", all = TRUE)
# Replace the NAs from the outer join with 0 in BOTH count columns
gene_counts$Count_PD <- ifelse(is.na(gene_counts$Count_PD), 0, gene_counts$Count_PD)
gene_counts$Count_CTL <- ifelse(is.na(gene_counts$Count_CTL), 0, gene_counts$Count_CTL)
gene_counts$Count_PD_nc <- n_total_PD - gene_counts$Count_PD
gene_counts$Count_CTL_nc <- n_total_CTL - gene_counts$Count_CTL

# Gene-indexed table restricted to genes observed in both groups
gene_counts_final <- gene_counts
rownames(gene_counts_final) <- gene_counts_final$Gene
gene_counts_final <- gene_counts_final[, -1]
gene_counts_final <- dplyr::filter(gene_counts_final, Count_PD != 0)
gene_counts_final <- dplyr::filter(gene_counts_final, Count_CTL != 0)

################################

# Empty, typed results table: one row per tested gene will hold the odds
# ratio, its confidence bounds, the p-value and the four carrier counts.
gene_burden <- data.frame(
  gene                       = character(0),
  odds_ratio                 = numeric(0),
  conf_lower                 = numeric(0),
  conf_upper                 = numeric(0),
  p_value                    = numeric(0),
  count_control_carriers     = integer(0),
  count_control_non_carriers = integer(0),
  count_case_carriers        = integer(0),
  count_case_non_carriers    = integer(0),
  stringsAsFactors = FALSE
)

# For every gene in the merged count table, fit a logistic regression of
# case/control status on "this duplication row is annotated with the gene"
# and append one result row to gene_burden.
genes <- gene_counts$Gene
cnv_filtered_dup <- filter(cnv_filtered, DUP_status == 1)

# Split the ";"-separated annotations once (loop-invariant). Upper-casing both
# sides keeps the comparison case-insensitive.
dup_symbol_sets <- strsplit(
  toupper(as.character(cnv_filtered_dup$Gene.refGene)), ";", fixed = TRUE
)

# Collect per-gene rows in a list and bind once after the loop, instead of
# growing the data frame with rbind() on every iteration.
burden_rows <- vector("list", length(genes))

for (k in seq_along(genes)) {
  g <- as.character(genes[k])

  # Exact symbol match within each annotation. A regex substring search would
  # also flag rows whose gene name merely contains `g`, and would interpret
  # characters such as "." in `g` as regex metacharacters.
  carries_gene <- vapply(
    dup_symbol_sets,
    function(symbols) toupper(g) %in% symbols,
    logical(1)
  )

  # No row is annotated with this gene: nothing to test
  if (!any(carries_gene)) {
    next
  }

  # gene_status: 1 for rows annotated with the gene, 0 otherwise
  cnv_filtered_dup$gene_status <- ifelse(carries_gene, 1, 0)

  # With a constant predictor glm() drops the coefficient and the
  # "gene_status" row lookups below would fail, so skip such genes.
  if (all(carries_gene)) {
    next
  }

  # Run logistic regression
  model <- glm(Case_Control ~ gene_status, data = cnv_filtered_dup, family = binomial)

  # Extract odds ratio, confidence interval, and p-value for gene_status
  model_coefs <- coef(summary(model))
  odds_ratio <- exp(model_coefs["gene_status", "Estimate"])
  # unname() so the "2.5 %" label does not leak into the result's row names
  conf_int <- unname(exp(confint(model)["gene_status", ]))
  p_value <- model_coefs["gene_status", "Pr(>|z|)"]

  # Row counts by case/control status and gene_status
  is_control <- cnv_filtered_dup$Case_Control == 0
  is_case <- cnv_filtered_dup$Case_Control == 1
  count_control_carriers <- sum(is_control & carries_gene, na.rm = TRUE)
  count_control_non_carriers <- sum(is_control & !carries_gene, na.rm = TRUE)
  count_case_carriers <- sum(is_case & carries_gene, na.rm = TRUE)
  count_case_non_carriers <- sum(is_case & !carries_gene, na.rm = TRUE)

  # Store the results
  burden_rows[[k]] <- data.frame(
    gene = g,
    odds_ratio = odds_ratio,
    conf_lower = conf_int[1],
    conf_upper = conf_int[2],
    p_value = p_value,
    count_control_carriers = count_control_carriers,
    count_control_non_carriers = count_control_non_carriers,
    count_case_carriers = count_case_carriers,
    count_case_non_carriers = count_case_non_carriers,
    stringsAsFactors = FALSE
  )
}

# Drop the slots of skipped genes and append everything to gene_burden at once
burden_rows <- Filter(Negate(is.null), burden_rows)
gene_burden <- do.call(rbind, c(list(gene_burden), burden_rows))

# Keep genes that have at least one carrier row in both controls and cases,
# then rank the remaining genes from highest to lowest odds ratio.
gene_burden_filtered <- gene_burden %>%
  filter(count_control_carriers != 0, count_case_carriers != 0)
gene_burden_filtered <- gene_burden_filtered[
  with(gene_burden_filtered, order(odds_ratio, decreasing = TRUE)),
]

