######################################################################################################################################################## ######################################################################################################################################################## ## Functions used in paper "Random Cancers as Supported by Registry Data" by Janez Stare, Robin Henderson and Nina Ruzic Gorenjec ## ## Programmed by: Robin Henderson and Nina Ruzic Gorenjec ## ## ## ## The goal is to estimate an upper limit to the probability of random cancer ## ## (ie its probability of occurring is not changed by the presence or absence of any factor) using registry data that include ## ## cumulative cause-specific hazards of a particular cancer to some age (say 80), together with their estimated standard deviations. ## ## The idea is to order the cumulative hazards and then search from left to right to find the largest possible left m-tail (ie the smallest m values) ## ## for which it is still feasible that its cumulative hazards have the same true cumulative hazard. ## ## ## ## Function procedure returns this aforementioned largest m. ## ######################################################################################################################################################## ######################################################################################################################################################## ############################################################################################################################################### # procedure gives the largest m (ie Mhat) for which it is still feasible that cumulative hazards of the first m registries # # (already ordered according to cumulative hazards) have the same true cumulative hazard. # # # # First it tests if it is feasible that all registries have a common mean. 
# # If the null hypothesis of common mean is rejected, it searches for the largest feasible m. # # It iterates m (in steps of mStep) and k (in steps of kStep). # # On each step it performs decisionRule to test if the left m-tail of the data is consistent with # # m-tail of a sample of Gaussian random variables with common mean and with variances that match the registry data. # # When it finds the first k for which decisionRule gives a p-value >= alpha, it declares m feasible and moves on to the next m. # # If there is no k for which m is deemed to be feasible, then it stops and declares the last feasible m as Mhat. # # # # dat is a data.frame object that includes variables Ahat (cumulative hazards) and Asd (estimated standard deviations of cumulative hazards). # # dat has to be ordered according to Ahat. # # nSim is the number of simulations used at each step when performing decisionRule. # # # # Output is a list of: # # rejectNull TRUE if the null hypothesis of common mean is rejected. # # pAll p-value of a test with the null hypothesis of common mean. 
# # out          list of p-values from decisionRule for all the performed steps, where there is one element for each m  #
# #              with a matrix of p-values according to all considered k                                                #
# #              (the last m has all its p-values < alpha and every other m has its last p-value >= alpha)              #
# # Mhat         the largest feasible m (presumably -- part of this header was lost, see NOTE below)                    #
#########################################################################################################################
# NOTE(review): this file appears to have been mangled in transit (spans between '<' and a following '>' stripped, as
# by an HTML tag filter). The `procedure <- function(...)` header, the test of the common-mean null hypothesis (which
# computes pAll and sets rejectNull), and the definitions of N, mGrid and the initial Mhat are missing below:
# `rejectNull <- FALSE` is never set to TRUE in the visible code, so the `if(rejectNull)` branch would never run as
# written. Restore this section from the authors' original source before using this function.
rejectNull <- FALSE
kStepLarge <- max(kStep, 20) #if kStep is smaller than 20, first search for k in steps of 20 to make the procedure faster
if(rejectNull){ #start searching for the largest feasible m only if the test of common mean is rejected
  out <- list()
  it <- 1
  m <- 0
  #NOTE(review): `<` (not `<=`) means the last element of mGrid is never tested -- confirm this is intended
  while(it < length(mGrid)){ #iterate m
    Mhat <- m #m of the previous step was feasible; this is the current candidate for the largest feasible m
    m <- mGrid[it] #the m whose feasibility is tested in this iteration
    kGrid <- unique(c(seq(m, N, by = kStepLarge), N)) #coarse grid of k values to search through
    pValues <- matrix(NA, ncol = 2, nrow = length(kGrid)) #column 1: k, column 2: p-value from decisionRule
    for(jt in 1:length(kGrid)){ #iterate k
      pValues[jt, 1] <- kGrid[jt]
      #decision rule for this m and k: test whether the observed left m-tail of the data is consistent with the left
      #m-tail of a sample of k Gaussian random variables with common mean and with variances that match the registry data
      pValues[jt, 2] <- decisionRule(dat, m = m, k = kGrid[jt], nSim = nSim, usePlot = FALSE)
      if (pValues[jt, 2] >= alpha) break #stop iterating k once decisionRule fails to reject
    }
    pValues <- as.data.frame(na.omit(pValues)) #drop rows for the k values that were never evaluated
    if(max(pValues[jt, 2]) < alpha & kStep<20) { #if kStep<20, also search through the k skipped by the coarse grid
      kGridRefinement <- unique(c(seq(m, N, by = kStep), N)) #new kGrid that is a refinement of the previous one
      kGridRefinement <- kGridRefinement[!(kGridRefinement %in% kGrid)] #drop the k already tried on the coarse grid
      pValuesRefinement <- matrix(NA, ncol = 2, nrow = length(kGridRefinement)) #p-values for the new k
      for(jt in 1:length(kGridRefinement)){
        pValuesRefinement[jt, 1] <- kGridRefinement[jt]
        pValuesRefinement[jt, 2] <- decisionRule(dat, m = m, k = kGridRefinement[jt], nSim = nSim, usePlot = FALSE) #as before
        if (pValuesRefinement[jt, 2] >= alpha) break #as before
      }
      pValuesRefinement <- as.data.frame(na.omit(pValuesRefinement))
      pValues <- rbind(pValues, pValuesRefinement) #join p-values from both grids
    }
    colnames(pValues) <- c("k", "pValue")
    out[[it]] <- pValues #save the p-values for this m as a new element of the list out
    names(out)[it] <- paste("m = ", m, sep = "")
    #NOTE(review): suspected bug -- when the refinement branch ran, jt is the refinement loop counter, but
    #pValues[jt, 2] now indexes the COMBINED data frame, so this can read a coarse-grid p-value instead of the
    #passing refinement one. `pValues[nrow(pValues), 2]` (the last computed p-value) is probably what was
    #intended -- confirm with the authors.
    if(max(pValues[jt, 2]) < alpha) it <- length(mGrid) + 1000 #decisionRule rejected for every k: current m is infeasible, stop the whole search
    it <- it + 1 #otherwise continue with the next m
  }
}
list(rejectNull = rejectNull, pAll = pAll, out = out, Mhat = Mhat)
} #NOTE(review): closes the lost `procedure <- function(...){` header
#######################################################################################################################################################
# decisionRule tests if the left m-tail of the data is consistent with the m-tail of a sample of k Gaussian random variables                          #
# with common mean and with variances that match the registry data (its reference distribution is simulated nSim times).                              #
#                                                                                                                                                     #
# dat is a data.frame object that includes variables Ahat (cumulative hazards) and Asd (estimated standard deviations of cumulative hazards).         #
# dat has to be ordered according to Ahat.                                                                                                            #
# If usePlot = TRUE, the function plots the left m-tails of the data (in red) together with the reference distribution (grey lines) on the left       #
# panel, and the bridge obtained from the data (in red) together with bridges from the reference distribution (in grey) on the right panel.           #
# addLab can be used to specify an additional prefix for the titles of the graphs.                                                                    #
#                                                                                                                                                     #
# Output is the p-value of the test.
#######################################################################################################################################################

# decisionRule: test whether the observed left m-tail of dat$Ahat is consistent with the left m-tail of k Gaussian
# variables with common mean, using nSim Monte Carlo replicates. Returns the empirical p-value.
decisionRule <- function(dat, m, k = 423, nSim = 1000, usePlot = TRUE, addLab = NULL){
  Ahat <- dat$Ahat #cumulative hazards (dat must already be ordered by Ahat)
  Asd <- dat$Asd   #estimated standard deviations of the cumulative hazards
  Atail <- Ahat[1:m] #observed left m-tail of the data
  #reference distribution: nSim simulated left m-tails of k Gaussians with common mean, aligned to mean(Atail)
  Zmatrix <- referenceDistribution(m = m, k = k, referenceMean = mean(Atail), Asd = Asd, nSim = nSim)
  means <- colMeans(Zmatrix) #means of the order statistics based on the simulations
  sds <- apply(Zmatrix, 2, sd) #standard deviations of the order statistics based on the simulations
  #standardise the simulated m-tails together with the observed one; the observed tail is the last row
  W <- t((t(rbind(Zmatrix, Atail)) - means)/sds)
  bridges <- apply(W - rowMeans(W), 1, cumsum) #centred cumulative sums ("bridges"); apply returns a plain vector when m == 1
  if (m != 1) bridges <- t(bridges)
  bridges <- cbind(0, bridges) #anchor every bridge at 0
  if(isTRUE(usePlot)){
    plotTitle <- paste0(addLab, "m = ", m, ", k = ", k)
    oldPar <- par(mfrow = c(1, 2))
    on.exit(par(oldPar), add = TRUE) #restore graphics state on exit (the original leaked mfrow to the caller)
    #Left panel: left m-tails of the reference distribution (grey) together with the observed m-tail (red)
    yRange <- range(c(as.vector(Zmatrix), Atail))
    plot(1:m, Zmatrix[1, ], xlab = "Ordered registry", ylab = "A", col = grey(0.7), ylim = yRange, type = "l", main = plotTitle)
    for(it in seq_len(nSim)[-1]){ #guard against nSim == 1, where 2:nSim would be c(2, 1) and index out of bounds
      lines(1:m, Zmatrix[it, ], col = grey(0.7))
    }
    lines(1:m, Atail, col = 2)
    #Right panel: bridges of the reference distribution (grey) together with the bridge of the data (red)
    yRange <- range(as.vector(bridges))
    plot(0:m, bridges[1, ], type = "l", col = grey(0.7), ylim = yRange, xlab = "Ordered registry", ylab = "Bridge", main = plotTitle)
    for(it in seq_len(nSim)[-1]){
      lines(0:m, bridges[it, ], col = grey(0.7))
    }
    lines(0:m, bridges[nSim+1, ], col = 2)
  }
  testStatistic <- apply(abs(bridges), 1, max) #Kolmogorov-type statistic: maximum absolute bridge value
  #empirical p-value: fraction of simulated statistics at least as extreme as the observed one (last entry)
  mean(testStatistic[1:nSim] >= testStatistic[nSim+1])
}
############################################################################################################################
# referenceDistribution function simulates the reference distribution of the left m-tails of the data under the assumption #
# that they are the left m-tail of k independent Gaussian random variables with common mean.                               #
#                                                                                                                          #
# It simulates k independent Gaussian random variables with mean 0 and standard deviations Asd                             #
# (a vector, or a single number if all standard deviations are equal), orders them, takes the smallest m,                  #
# and then aligns their mean to referenceMean.                                                                             #
#                                                                                                                          #
# nSim is the number of simulations.                                                                                       #
#                                                                                                                          #
# Output is an nSim times m matrix with the ordered m-tail samples in rows.                                                #
############################################################################################################################
referenceDistribution <- function(m, k = 423, referenceMean = 0.05, Asd = 0.005, nSim = 1000){
  Zmatrix <- matrix(NA_real_, nSim, m)
  for(it in seq_len(nSim)){
    #k independent Gaussians with mean 0; Asd may be a scalar or a vector (rnorm recycles/truncates sd to length k,
    #so when length(Asd) > k -- as in calls from `procedure` -- only the first k values are used)
    z <- rnorm(k, 0, Asd)
    Zmatrix[it, ] <- sort(z)[1:m] #order them and keep the smallest m (the left m-tail)
  }
  #the ordered minima have a negative overall mean; shift the whole matrix so its mean equals referenceMean
  Zmatrix + referenceMean - mean(as.vector(Zmatrix))
}
##########################################################################################################################
# simData function simulates cumulative hazards for k registries from two groups:                                        #
# - Mtrue registries all have true cumulative hazards equal to A1,                                                       #
# - the rest k-Mtrue registries have true cumulative hazards evenly distributed between A2 and A3 where A3 >= A2 >= A1.
# #                                                                                                                      #
# # Setting A2 > A1 induces a step change in mean; setting A1 = A2 < A3 presumably induces a gradual trend instead --    #
# # the rest of this header comment was lost in transit (see NOTE below).                                                #
##########################################################################################################################
# NOTE(review): this file appears to have been mangled in transit (spans between '<' and a following '>' stripped, as
# by an HTML tag filter). The `simData <- function(...)` header, the simulation of Ahat and Asd, and the initialisation
# of `group` were lost here; the fragment below starts mid-statement. `Mtrue] <- 2` is most likely the tail of
# something like `group[(1:k) > Mtrue] <- 2` -- restore from the authors' original source before using this function.
Mtrue] <- 2 #assign the remaining registries to group 2 (statement truncated -- see NOTE above)
ii <- order(Ahat) #order the data according to cumulative hazards
Ahat <- Ahat[ii]
Asd <- Asd[ii]
group <- group[ii]
if(usePlot == TRUE){ #plot the ordered simulated cumulative hazards
  par(mfrow = c(1, 1))
  pchSymbol <- ifelse(group==1, 4, 1) #crosses (pch = 4) mark registries drawn from the common mean A1, circles (pch = 1) the rest
  plot(1:k, Ahat, pch = pchSymbol, cex = 0.5, xlab = "Registry", ylab = "A", main = "Ordered", col = 2*group) #crosses red (col = 2), circles blue (col = 4)
}
list(Ahat = Ahat, Asd = Asd, group = group) #ordered cumulative hazards, their standard deviations, and group labels
}