################################################################################
##Genome-wide association study of copy number variations in Parkinson's disease
################################################################################

library(dplyr)
library(ggplot2)
library(survival)
library(survminer)

##########################
##5. Survival analysis
##########################

# Input: filtered CNV data.
cnv_filtered_all <- read.table("cnv_data_filtered.txt", header = TRUE)

# Keep the sample-level columns (selected by position) and drop exact
# duplicate rows.
# NOTE(review): positions 1:5 and 8:12 are assumed to cover Sample_Name, aff,
# sex, age, AAO and PC1-PC5, which are all used below -- confirm against the
# file header.
data <- cnv_filtered_all[, c(1:5, 8:12)]
data <- data[!duplicated(data), ]

# Event indicator for the survival models: raw aff == 2 becomes 1 (PD, event),
# every other value becomes 0 (censored).
# NOTE(review): presumes the raw column is coded 1 = control / 2 = PD -- confirm.
data$aff <- as.numeric(data$aff)
data$aff <- ifelse(data$aff == 2, 1, 0)

# Time axis: age at onset (AAO) for PD cases, age for controls.
# This is computed AFTER the recode above so that aff == 1 really selects the
# cases; testing aff == 1 on the raw coding would pick the controls instead.
data$time <- ifelse(data$aff == 1, data$AAO, data$age)

# Group label: does the sample appear among the CNVs in PD genes?
# cnv_filtered_PDgenes is not created in this section; it must already exist
# in the session when this code runs.
data$status <- ifelse(
  data$Sample_Name %in% unique(cnv_filtered_PDgenes$Sample_Name),
  "individuals with CNV in PD genes",
  "individuals with other or no CNVs"
)
# The first level is the reference group used by the models below.
data$status <- factor(
  data$status,
  levels = c(
    "individuals with other or no CNVs",
    "individuals with CNV in PD genes"
  )
)

# Rows without a usable time cannot enter the survival analysis.
data <- data[!is.na(data$time), ]

# Sex as a label: 1 -> "M", any other non-missing value -> "F".
data$sex <- ifelse(data$sex == 1, "M", "F")

## Survival object: follow-up time paired with the 0/1 event indicator,
## taken row by row from `data`
surv_object <- with(data, Surv(time = time, event = aff))

## Kaplan-Meier estimate, one curve per level of status
fit <- survfit(surv_object ~ status, data = data)

## Log-rank test comparing the status groups; the result is printed below
surv_diff <- survdiff(surv_object ~ status, data = data)
surv_diff

## Plot the Kaplan-Meier curves

# Quick default plot; its legend shows the strata names taken from the fit.
ggsurvplot(fit, data = data)

# Annotated plot.
# legend.labs replaces the strata names and has to be given in the same order
# as the strata of `fit`, i.e. the level order of data$status (reference level
# first). Taking the labels from the factor keeps the legend and the curves in
# sync; droplevels() skips a level that has no rows and hence no curve.
ggsurvplot(
  fit,
  data = data,
  pval = TRUE,                  # Show p-value
  conf.int = TRUE,              # Show confidence intervals
  risk.table = FALSE,           # No risk table under the plot
  legend.labs = levels(droplevels(data$status)), # Labels for the groups
  #title = "Kaplan-Meier Curve",
  xlab = "Age",
  ylab = "Probability of not having PD symptoms"
)

# Cox proportional hazards model: effect of CNV status on the hazard,
# adjusted for sex and the covariates PC1 to PC5
cox_fit <- coxph(
  surv_object ~ status + sex + PC1 + PC2 + PC3 + PC4 + PC5,
  data = data
)
summary(cox_fit)

##########################
##Survival analysis in EOPD
##########################

# Subset for the analysis below: rows of `data` with aff == 1 (aff is the
# event indicator handed to Surv()), with exact duplicate rows dropped.
data_EOPD <- filter(data, aff == 1)
data_EOPD <- data_EOPD[!duplicated(data_EOPD),]
# Flag AAO <= 50 as a 1/0 factor.
# NOTE(review): the flag is written to `data`, after data_EOPD has been
# created, so data_EOPD has no EOPD column and is not restricted to AAO <= 50
# anywhere in this section. Confirm whether the subset was meant to be
# filtered on this flag.
data$EOPD <- as.factor(ifelse(data$AAO <= 50, 1, 0))
# Rebuild the status factor in the subset, with
# "individuals with other or no CNVs" as the reference (first) level.
data_EOPD$status <- factor(data_EOPD$status, levels = c("individuals with CNV in PD genes", "individuals with other or no CNVs"))
data_EOPD$status <- relevel(data_EOPD$status, ref = "individuals with other or no CNVs")

## Survival object for the subset: time paired with the 0/1 event indicator
surv_object_EOPD <- Surv(time = data_EOPD$time, event = data_EOPD$aff)
## Kaplan-Meier estimate, one curve per level of status
fit_EOPD <- survfit(surv_object_EOPD ~ status, data = data_EOPD)
## Log-rank test comparing the status groups within the subset
surv_diff_EOPD <- survdiff(surv_object_EOPD ~ status, data = data_EOPD)
# Print the subset's test (not surv_diff, which holds the full-cohort result)
surv_diff_EOPD
## Plot the Kaplan-Meier curves
# legend.labs replaces the strata names and has to be given in the same order
# as the strata of `fit_EOPD`, i.e. the level order of data_EOPD$status
# (reference level first). Taking the labels from the factor keeps the legend
# and the curves in sync; droplevels() skips a level that has no rows and
# hence no curve.
ggsurvplot(
  fit_EOPD,
  data = data_EOPD,
  pval = TRUE,                  # Show p-value
  conf.int = TRUE,              # Show confidence intervals
  risk.table = FALSE,           # No risk table under the plot
  legend.labs = levels(droplevels(data_EOPD$status)), # Labels for the groups
  # title = "Kaplan-Meier Curve",
  xlab = "Age",
  ylab = "Probability of not having PD symptoms"
)

# Cox proportional hazards model on the subset: effect of CNV status on the
# hazard, adjusted for sex and the covariates PC1 to PC5
cox_fit_EOPD <- coxph(
  surv_object_EOPD ~ status + sex + PC1 + PC2 + PC3 + PC4 + PC5,
  data = data_EOPD
)
summary(cox_fit_EOPD)
