######################################################################################################################################################## ######################################################################################################################################################## ## Functions used in paper "Random Cancers as Supported by Registry Data" by Janez Stare, Robin Henderson and Nina Ruzic Gorenjec ## ## Programmed by: Robin Henderson and Nina Ruzic Gorenjec ## ## ## ## The goal is to estimate an upper limit to the probability of random cancer ## ## (ie its probability of occurring is not changed by the presence or absence of any factor) using registry data that include ## ## cumulative cause-specific hazards of a particular cancer to some age (say 80), together with their estimated standard deviations. ## ## The idea is to order the cumulative hazards and then search from left to right to find the largest possible left m-tail (ie the smallest m values) ## ## for which it is still feasible that its cumulative hazards have the same true cumulative hazard. ## ## ## ## Function procedure returns this aforementioned largest m. ## ######################################################################################################################################################## ######################################################################################################################################################## ############################################################################################################################################### # procedure gives the largest m (ie Mhat) for which it is still feasible that cumulative hazards of the first m registries # # (already ordered according to cumulative hazards) have the same true cumulative hazard. # # # # First it tests if it is feasible that all registries have a common mean. 
# # If the null hypothesis of common mean is rejected, it searches for the largest feasible m. # # It iterates m (in steps of mStep) and k (in steps of kStep). # # On each step it performs decisionRule to test if the left m-tail of the data is consistent with # # m-tail of a sample of Gaussian random variables with common mean and with variances that match the registry data. # # When it finds the first k for which decisionRule gives a p-value >= alpha, it declares m feasible and moves on to the next m. # # If there is no k for which m is deemed to be feasible, then it stops and declares the last feasible m as Mhat. # # # # dat is a data.frame object that includes variables Ahat (cumulative hazards) and Asd (estimated standard deviations of cumulative hazards). # # dat has to be ordered according to Ahat. # # nSim is the number of simulations used at each step when performing decisionRule. # # # # Output is a list of: # # rejectNull TRUE if the null hypothesis of common mean is rejected. # # pAll p-value of a test with the null hypothesis of common mean. 
# # out          list of p-values from decisionRule for all the performed steps, where there is one element for each m  #
# #              with a matrix of p-values according to all considered k                                                #
# #              (the last m has all its p-values < alpha and every other m has its last p-value >= alpha)              #
# # Mhat         the largest feasible m (presumably -- part of this header was lost, see NOTE below)                    #
#########################################################################################################################
# NOTE(review): this file appears to have been mangled in transit (spans between '<' and a following '>' stripped, as
# by an HTML tag filter). The `procedure <- function(...)` header, the test of the common-mean null hypothesis (which
# computes pAll and sets rejectNull), and the definitions of N, mGrid and the initial Mhat are missing below:
# `rejectNull <- FALSE` is never set to TRUE in the visible code, so the `if(rejectNull)` branch would never run as
# written. Restore this section from the authors' original source before using this function.
rejectNull <- FALSE
kStepLarge <- max(kStep, 20) #if kStep is smaller than 20, first search for k in steps of 20 to make the procedure faster
if(rejectNull){ #start searching for the largest feasible m only if the test of common mean is rejected
  out <- list()
  it <- 1
  m <- 0
  #NOTE(review): `<` (not `<=`) means the last element of mGrid is never tested -- confirm this is intended
  while(it < length(mGrid)){ #iterate m
    Mhat <- m #m of the previous step was feasible; this is the current candidate for the largest feasible m
    m <- mGrid[it] #the m whose feasibility is tested in this iteration
    kGrid <- unique(c(seq(m, N, by = kStepLarge), N)) #coarse grid of k values to search through
    pValues <- matrix(NA, ncol = 2, nrow = length(kGrid)) #column 1: k, column 2: p-value from decisionRule
    for(jt in 1:length(kGrid)){ #iterate k
      pValues[jt, 1] <- kGrid[jt]
      #decision rule for this m and k: test whether the observed left m-tail of the data is consistent with the left
      #m-tail of a sample of k Gaussian random variables with common mean and with variances that match the registry data
      pValues[jt, 2] <- decisionRule(dat, m = m, k = kGrid[jt], nSim = nSim, usePlot = FALSE)
      if (pValues[jt, 2] >= alpha) break #stop iterating k once decisionRule fails to reject
    }
    pValues <- as.data.frame(na.omit(pValues)) #drop rows for the k values that were never evaluated
    if(max(pValues[jt, 2]) < alpha & kStep<20) { #if kStep<20, also search through the k skipped by the coarse grid
      kGridRefinement <- unique(c(seq(m, N, by = kStep), N)) #new kGrid that is a refinement of the previous one
      kGridRefinement <- kGridRefinement[!(kGridRefinement %in% kGrid)] #drop the k already tried on the coarse grid
      pValuesRefinement <- matrix(NA, ncol = 2, nrow = length(kGridRefinement)) #p-values for the new k
      for(jt in 1:length(kGridRefinement)){
        pValuesRefinement[jt, 1] <- kGridRefinement[jt]
        pValuesRefinement[jt, 2] <- decisionRule(dat, m = m, k = kGridRefinement[jt], nSim = nSim, usePlot = FALSE) #as before
        if (pValuesRefinement[jt, 2] >= alpha) break #as before
      }
      pValuesRefinement <- as.data.frame(na.omit(pValuesRefinement))
      pValues <- rbind(pValues, pValuesRefinement) #join p-values from both grids
    }
    colnames(pValues) <- c("k", "pValue")
    out[[it]] <- pValues #save the p-values for this m as a new element of the list out
    names(out)[it] <- paste("m = ", m, sep = "")
    #NOTE(review): suspected bug -- when the refinement branch ran, jt is the refinement loop counter, but
    #pValues[jt, 2] now indexes the COMBINED data frame, so this can read a coarse-grid p-value instead of the
    #passing refinement one. `pValues[nrow(pValues), 2]` (the last computed p-value) is probably what was
    #intended -- confirm with the authors.
    if(max(pValues[jt, 2]) < alpha) it <- length(mGrid) + 1000 #decisionRule rejected for every k: current m is infeasible, stop the whole search
    it <- it + 1 #otherwise continue with the next m
  }
}
list(rejectNull = rejectNull, pAll = pAll, out = out, Mhat = Mhat)
} #NOTE(review): closes the lost `procedure <- function(...){` header
#######################################################################################################################################################
# decisionRule tests if the left m-tail of the data is consistent with the m-tail of a sample of k Gaussian random variables                          #
# with common mean and with variances that match the registry data (its reference distribution is simulated nSim times).                              #
#                                                                                                                                                     #
# dat is a data.frame object that includes variables Ahat (cumulative hazards) and Asd (estimated standard deviations of cumulative hazards).         #
# dat has to be ordered according to Ahat.                                                                                                            #
# If usePlot = TRUE, the function plots the left m-tails of the data (in red) together with the reference distribution (grey lines) on the left       #
# panel, and the bridge obtained from the data (in red) together with bridges from the reference distribution (in grey) on the right panel.           #
# addLab can be used to specify an additional prefix for the titles of the graphs.                                                                    #
#                                                                                                                                                     #
# Output is the p-value of the test.
#######################################################################################################################################################

# decisionRule: test whether the observed left m-tail of dat$Ahat is consistent with the left m-tail of k Gaussian
# variables with common mean, using nSim Monte Carlo replicates. Returns the empirical p-value.
decisionRule <- function(dat, m, k = 423, nSim = 1000, usePlot = TRUE, addLab = NULL){
  Ahat <- dat$Ahat #cumulative hazards (dat must already be ordered by Ahat)
  Asd <- dat$Asd   #estimated standard deviations of the cumulative hazards
  Atail <- Ahat[1:m] #observed left m-tail of the data
  #reference distribution: nSim simulated left m-tails of k Gaussians with common mean, aligned to mean(Atail)
  Zmatrix <- referenceDistribution(m = m, k = k, referenceMean = mean(Atail), Asd = Asd, nSim = nSim)
  means <- colMeans(Zmatrix) #means of the order statistics based on the simulations
  sds <- apply(Zmatrix, 2, sd) #standard deviations of the order statistics based on the simulations
  #standardise the simulated m-tails together with the observed one; the observed tail is the last row
  W <- t((t(rbind(Zmatrix, Atail)) - means)/sds)
  bridges <- apply(W - rowMeans(W), 1, cumsum) #centred cumulative sums ("bridges"); apply returns a plain vector when m == 1
  if (m != 1) bridges <- t(bridges)
  bridges <- cbind(0, bridges) #anchor every bridge at 0
  if(isTRUE(usePlot)){
    plotTitle <- paste0(addLab, "m = ", m, ", k = ", k)
    oldPar <- par(mfrow = c(1, 2))
    on.exit(par(oldPar), add = TRUE) #restore graphics state on exit (the original leaked mfrow to the caller)
    #Left panel: left m-tails of the reference distribution (grey) together with the observed m-tail (red)
    yRange <- range(c(as.vector(Zmatrix), Atail))
    plot(1:m, Zmatrix[1, ], xlab = "Ordered registry", ylab = "A", col = grey(0.7), ylim = yRange, type = "l", main = plotTitle)
    for(it in seq_len(nSim)[-1]){ #guard against nSim == 1, where 2:nSim would be c(2, 1) and index out of bounds
      lines(1:m, Zmatrix[it, ], col = grey(0.7))
    }
    lines(1:m, Atail, col = 2)
    #Right panel: bridges of the reference distribution (grey) together with the bridge of the data (red)
    yRange <- range(as.vector(bridges))
    plot(0:m, bridges[1, ], type = "l", col = grey(0.7), ylim = yRange, xlab = "Ordered registry", ylab = "Bridge", main = plotTitle)
    for(it in seq_len(nSim)[-1]){
      lines(0:m, bridges[it, ], col = grey(0.7))
    }
    lines(0:m, bridges[nSim+1, ], col = 2)
  }
  testStatistic <- apply(abs(bridges), 1, max) #Kolmogorov-type statistic: maximum absolute bridge value
  #empirical p-value: fraction of simulated statistics at least as extreme as the observed one (last entry)
  mean(testStatistic[1:nSim] >= testStatistic[nSim+1])
}
############################################################################################################################
# referenceDistribution function simulates the reference distribution of the left m-tails of the data under the assumption #
# that they are the left m-tail of k independent Gaussian random variables with common mean.                               #
#                                                                                                                          #
# It simulates k independent Gaussian random variables with mean 0 and standard deviations Asd                             #
# (a vector, or a single number if all standard deviations are equal), orders them, takes the smallest m,                  #
# and then aligns their mean to referenceMean.                                                                             #
#                                                                                                                          #
# nSim is the number of simulations.                                                                                       #
#                                                                                                                          #
# Output is an nSim times m matrix with the ordered m-tail samples in rows.                                                #
############################################################################################################################
referenceDistribution <- function(m, k = 423, referenceMean = 0.05, Asd = 0.005, nSim = 1000){
  Zmatrix <- matrix(NA_real_, nSim, m)
  for(it in seq_len(nSim)){
    #k independent Gaussians with mean 0; Asd may be a scalar or a vector (rnorm recycles/truncates sd to length k,
    #so when length(Asd) > k -- as in calls from `procedure` -- only the first k values are used)
    z <- rnorm(k, 0, Asd)
    Zmatrix[it, ] <- sort(z)[1:m] #order them and keep the smallest m (the left m-tail)
  }
  #the ordered minima have a negative overall mean; shift the whole matrix so its mean equals referenceMean
  Zmatrix + referenceMean - mean(as.vector(Zmatrix))
}
##########################################################################################################################
# simData function simulates cumulative hazards for k registries from two groups:                                        #
# - Mtrue registries all have true cumulative hazards equal to A1,                                                       #
# - the rest k-Mtrue registries have true cumulative hazards evenly distributed between A2 and A3 where A3 >= A2 >= A1.
# #                                                                                                                      #
# # Setting A2 > A1 induces a step change in mean; setting A1 = A2 < A3 presumably induces a gradual trend instead --    #
# # the rest of this header comment was lost in transit (see NOTE below).                                                #
##########################################################################################################################
# NOTE(review): this file appears to have been mangled in transit (spans between '<' and a following '>' stripped, as
# by an HTML tag filter). The `simData <- function(...)` header, the simulation of Ahat and Asd, and the initialisation
# of `group` were lost here; the fragment below starts mid-statement. `Mtrue] <- 2` is most likely the tail of
# something like `group[(1:k) > Mtrue] <- 2` -- restore from the authors' original source before using this function.
Mtrue] <- 2 #assign the remaining registries to group 2 (statement truncated -- see NOTE above)
ii <- order(Ahat) #order the data according to cumulative hazards
Ahat <- Ahat[ii]
Asd <- Asd[ii]
group <- group[ii]
if(usePlot == TRUE){ #plot the ordered simulated cumulative hazards
  par(mfrow = c(1, 1))
  pchSymbol <- ifelse(group==1, 4, 1) #crosses (pch = 4) mark registries drawn from the common mean A1, circles (pch = 1) the rest
  plot(1:k, Ahat, pch = pchSymbol, cex = 0.5, xlab = "Registry", ylab = "A", main = "Ordered", col = 2*group) #crosses red (col = 2), circles blue (col = 4)
}
list(Ahat = Ahat, Asd = Asd, group = group) #ordered cumulative hazards, their standard deviations, and group labels
}