################################################### # # PSEUCHECK # #Functions for checking goodness of fit of various models used in survival analysis: # Cox model, Fine&Gray model, linear regression model of restricted mean (calculated with pseudo-observations) # and models fitted using GEE (single time point survival, competing risks # # usage: # # pseucheck(fit,...) # # arguments: # fit ... model fit. the corresponding function is chosen according to the class of this fit # resid ... the default value (FALSE) plots a scatterplot, resid=TRUE gives a residual plot. (not an option for the linear regression model) # nt the time-points used (relevant for models not fitted using pseudo-observations). If none are specified, 4 quantiles of the event times distribution are chosen. # scaled the default value (TRUE) plots scaled residuals (not raw). Only relevant if resid=TRUE # correction corrects for other covariates in the model (defaults to TRUE in the multivariate case) # addline adds the beta Z line (defaults to TRUE if correction=TRUE) # varplot a vector indicating the covariates to be plotted (c(1,3) indicates the first and the third covariate in the model fit formula) # bins the default value (TRUE) adds the distribution of the covariate on the graph # col the color of the scatterplot curves # lwd the width of the scatterplot curves # title the titles of the plots (defaults to the covariate names) # # additional arguments: # # Fine&Gray model: cov1 the matrix of covariates (see help of the crr function) # time event time # event event indicator (a vector of 0, 1 or 2) # failcode the event of interest (1 or 2) # Note that the censoring must be denoted with 0 and only two causes (1 and 2) are currently implemented # # Cox model and linear regression model: no additional arguments. Note that both models must be fitted using the x=TRUE option # # GEE based models: cov1 if the names of the covariates in the formula are not the same as the names in the dataframe, the covariate matrix should be passed as cov1 # NOTE: the data set must include variables named pseudo (pseudo-observations) and tpseudo (time points) # # NOTE: missing values are not allowed in any of the function # ###################################################################### # # PSEUGEE.MAKE # # An auxiliary function that prepares data in the form needed to fit a GEE model with pseudo-observations # # usage: # pseugee.make(data,time,event,state=1,nt=5) # #incoming data: time, event (0=censoring,1&2 events), data containes variables. One line per case #outgoing data: one line per chosen time-point, new variables: id (identifying cases), pseudo (pseudo-observation), tpseudo (time point) # #nt: either a number of time points - indicates the number of time-points that should be selected at the respective quantiles of the distribution of event times (any cause) # or a vector of time points - the function finds the closest time-points in the data set #use state=0 for Cox model/survival # ###################################################################### # Examples: # if(0){ #don't run library(survival) library(pseudo) library(KMsurv) library(cmprsk) library(geepack) data(bmt) #Cox model: fit <- coxph(Surv(t1,d1)~group+z1,data=bmt,x=T) pseucheck(fit) #more options pseucheck(fit,varplot=2,title="Age",lwd=1) #residuals pseucheck(fit,resid=T) #restricted mean survival (tau=530) bmt$rmean <- pseudomean(bmt$t1, bmt$d1,530)$psumean fit <- lm(rmean~group+z1,data=bmt,x=T,y=T) pseucheck(fit) #survival at a single time point bmts5 <- bmt bmts5$pseudo <- 1-pseudosurv(bmt$t1,bmt$d1,530)[,3] bmts5$tpseudo <- 530 #just one time point fit <- geese(pseudo~group+z1,data=bmts5,family=gaussian,mean.link="cloglog",id=1:nrow(bmts5),jack=TRUE,scale.fix=TRUE,corstr="independence") pseucheck(fit,bmts5) #calculate the pseudo-observations cutoffs <- c(50,105,170,280,530) bmt$icr <- bmt$d1 + bmt$d3 #competing risks for cause 2, with cloglog link function bmtlong <- pseugee.make(bmt,bmt$t2,bmt$icr,nt=cutoffs,state=2) fit <- geese(pseudo ~ as.factor(tpseudo) + group+z1, data =bmtlong, id=id, jack = TRUE, scale.fix=TRUE,family=gaussian,mean.link = "cloglog", corstr="independence") pseucheck(fit,bmtlong) #same thing with the Fine&Gray model: xmat <- data.frame(group=bmt$group,z1=bmt$z1) fit <- crr(bmt$t2,bmt$icr,cov1=xmat,failcode=2,cencode=0) pseucheck(fit,bmt$t2,bmt$icr,cov1=xmat,failcode=2) } ############################################################################# pseucheck <- function(fit, ...) UseMethod("pseucheck") pseucheck.crr <- function(fit,time,event,resid=FALSE,cov1,col,bins=TRUE,scaled=TRUE,correction,addline,lwd=1,title,varplot,nt,failcode=2){ #events must be 1 or 2, censoring 0 #nt is not the number of qantiles but rather a vector of timepoints. default is 4 time points set at the quantiles of event times distribution if(missing(nt)) nt <- quantile(time[event!=0],c(1:4)/5) #4 quantiles of event times betas <- fit$coef #link function link <- "cloglog" #time points, covariates, positions timesall <- sort(unique(fit$uftime)) #all time-points used in the fitted model nvar <- length(betas) #number of covariates pvar <- 1:nvar #position of the covariates wh <- time.points(nt,timesall) times <- timesall[wh] nc <- length(times) #number of timepoints #prediction ci <- t(apply(cov1,1,function(x)predict.crr(fit,x)[,2])) #the predicted cumulative incidence functions #pseudo-observations if(resid) pval <- pseudoci(time,event,times) #the data must have variables time and event else pval <- pseudoci(time,event,timesall) #the data must have variables time and event kir <- paste("r",failcode,sep="") cips <- pval[, grep(kir,names(pval))] #the matrix of relevant pseudo-values (for the cause required) #smooth in time for scatterplots if(resid){ ci <- ci[,wh] } else{ ci <- t(apply(ci,1,smoot.t,timesall))[,wh] #smooth the predicted curves cips<- t(apply(cips,1,smoot.t,timesall))[,wh] #smooth the unsorted matrix with respect to time and pick the nt timepoints (PREVERI TO - KATERE IZBERE) } kt <- 0 #counter of covariates if(missing(varplot))varplot <- 1:nvar if(missing(title)) title <- names(cov1)[varplot] pseucheck.plot(resid,geefit=FALSE,varplot,nc,cov1,cips,ci,scaled,pvar,title,times,correction,betas,col,addline,link,nvar,lwd,bins) } pseucheck.coxph <- function(fit,resid=FALSE,col,bins=TRUE,scaled=TRUE,correction,addline,lwd=1,title,varplot,nt){ #nt is a vector of timepoints. if(missing(nt)) nt <- quantile(fit$y[fit$y[,2]!=0,1],c(1:4)/5) #4 quantiles of event times if(is.null(fit$x))stop("The Cox model must be fitted using the x=TRUE option") betas <- fit$coef #link function link <- "cloglogs" #time points, covariates, positions timesall <- sort(unique(fit$y[fit$y[,2]!=0])) #all time-points used in the fitted model nvar <- length(betas) #number of covariates pvar <- 1:nvar #position of the covariates wh <- time.points(nt,timesall) times <- timesall[wh] nc <- length(times) #number of timepoints n <- length(fit$linear.predictors) #prediction bx <- fit$linear.predictors survfit <- getFromNamespace("survfit", "survival") basefit <- summary(survfit(fit),times=timesall)$surv #baseline hazard for the full model ci <- matrix(rep(basefit, each = n)^exp(bx), byrow = FALSE, nrow = n) #the predicted survival functions #pseudo-observations if(resid) cips <- pseudosurv(fit$y[,1],fit$y[,2],times)[,-(1:2)] else cips <- pseudosurv(fit$y[,1],fit$y[,2],timesall)[,-(1:2)] #smooth in time for scatterplots if(resid){ ci <- ci[,wh] } else{ ci <- t(apply(ci,1,smoot.t,timesall))[,wh] #smooth the predicted curves cips<- t(apply(cips,1,smoot.t,timesall))[,wh] #smooth the unsorted matrix with respect to time and pick the nt timepoints (PREVERI TO - KATERE IZBERE) } kt <- 0 #counter of covariates if(missing(varplot))varplot <- 1:nvar if(missing(title)) title <- names(fit$coef)[varplot] pseucheck.plot(resid,geefit=FALSE,varplot,nc,fit$x,cips,ci,scaled,pvar,title,times,correction,betas,col,addline,link,nvar,lwd,bins) } pseucheck.geese <- function(fit,data,resid=FALSE,cov1,col,bins=TRUE,scaled=TRUE,correction,addline,lwd=1,title,varplot){ #checking a model fitted using pseudo-observations and the geese function #fit - a geese function fit #data - the dataframe that was used for the geese function fit (contains all the variables used) - does not work the names in the formula are changed (factor...) #resid - FALSE=scatterplot, TRUE=residuals plot #title - the title for each graph (if not given - the names of the variables are taken) #col - the colors of the lines for scatterplot #bins - adds an illustration of the covariate distribution #scaled - if FALSE, the residuals are raw #cov1 - the function automatically finds the names of variables used in the data set. If the variables are changed in the formula, the # matrix of covariates should be passed using cov1. betas <- fit$beta #link function link <- fit$call$mean.link if(link=="cloglog") invf <- function(x)1-exp(-exp(x)) else if(link=="logit") invf <- function(x)exp(x)/(1+exp(x)) else stop(paste("link function",link ,"not implemented")) #time points, covariates, positions times <- sort(unique(data$tpseudo)) #the time-points used in the fit inx <- sort(c(grep("tpseudo",fit$xnames) ,grep("Intercept",fit$xnames)) ) pvar <- (1:length(fit$xnames))[-inx] #position of the covariates in the model nc <- length(inx) #number of timepoints #prediction dummy <- matrix(rep(1,nrow(data)),ncol=1) if(nc>1){ for(it in 2:length(times)){ dummy <- cbind(dummy,data$tpseudo==times[it]) } } #a dummy matrix for the timepoints if(missing(cov1)) cov1 <- as.matrix(data[,fit$xnames[pvar]]) bx <- dummy %*% betas[inx] + cov1%*%betas[pvar] ci <- matrix(invf(bx),byrow=TRUE,ncol=nc) #matrix of predicted values of the cumulative incidence function #pseudo observations cips <- matrix(data$pseudo,byrow=TRUE,ncol=nc) #matrix of pseudo-observations nvar <- ncol(cov1) #number of covariates if(missing(varplot))varplot <- 1:nvar if(missing(title)) title <- fit$xnames[pvar[varplot]] pseucheck.plot(resid,geefit=TRUE,varplot,nc,cov1,cips,ci,scaled,pvar,title,times,correction,betas,col,addline,link,nvar,lwd,bins) } pseucheck.lm <- function(fit,correction,title,varplot){ #function for plotting restricted mean gof #if(missing(varname))stop("Choose the covariate of interest with parameter varname") if(is.null(fit$x))stop("The model must be fitted using 'x=TRUE' option") inx <- grep("Intercept",names(fit$coef) ) xi <- data.frame(fit$x)[,-inx] pvar <- 1:ncol(xi) if(missing(correction)){ if(length(pvar>1))correction <- TRUE else correction <- FALSE } if(missing(varplot))varplot <- pvar if(missing(title))title <- (names(fit$coef)[-inx])[varplot] pvar <- pvar[varplot] kt <- 0 for(it in pvar){ kt <- kt+1 x <- xi[,it] if(correction){ y.pred <- fit$fitted y <- fit$y - y.pred if(correction) y <- y + fit$coef[it+length(inx)]*x } else y <- fit$y sm <- loess(y~x,span=.85) plot(x,y,ylab="Pseudo-residual",xlab="z") title(title[kt]) lines(x[order(x)],(fit$coef[it+length(inx)]*x)[order(x)],lwd=2,lty=2) lines(x[order(x)],sm$fitted[order(x)],lwd=2) } } pfungee <- function(fit,d,varmat=fit$vbeta){ b1 <- fit$beta[d] v1 <- varmat[d,d] hi <- try(b1%*%solve(v1)%*%b1) if(!inherits(hi,"try-error"))p <- 1-pchisq(hi,length(d)) else p <- NA list(chi2=hi,df=length(d),p=p) } #functions for pseudo observations fitting and diagnostics pseugee.make <- function(data,time,event,state=1,nt=5){ #function that converts competing risks data into the required form #incoming data: time, event (0=censoring,1&2 events), variables. One line per case #outgoing data: one line per chosen time-point, new variables: id (identifying cases), pseudo (pseudo-observation), tpseudo (time point) #nt: either a number of time points - indicates the number of time-points that should be selected at the respective quantiles of the distribution of event times (any cause) # or a vector of time points - the function finds the closest time-points in the data set #use state=0 for Cox model/survival if(any(names(data)=="pseudo"))warning("a variable named pseudo already exists in the data set") if(any(names(data)=="tpseudo"))warning("a variable named tpseudo already exists in the data set") timeall <- time[event!=0] if(length(nt)==1)times <- quantile(timeall,c(1:nt)/(nt+1)) #quantiles of event times (either event) else{ wh <- time.points(nt,timeall) times <- timeall[wh] } whichstate <- paste("r",state,sep="") # the desired state if(state!=0){ pval <- pseudoci(time,event,times) #pval containts the pseudo-observations pval1 <- pval[, grep(whichstate,names(pval))] #get the pseudo-observations for the required state } else pval1 <- pseudosurv(time,event,times)[,-c(1,2)] data$id <- 1:nrow(data) #add an id variable datalong <- NULL for(jt in 1:ncol(data))datalong <- cbind(datalong,rep(data[,jt],length(times))) #all the existing variables are recycled (each case must be repeated as many times as there are chosen time-points datalong <- cbind(datalong,as.vector(as.matrix(pval1)),rep(times,each=nrow(pval1))) #pseudo-observations and time points are added datalong <- as.data.frame(datalong) names(datalong) <- c(names(data),"pseudo","tpseudo") #names are added datalong <- datalong[order(datalong$id),] #ordered by id (puts same id together as required by the geese function } ################################################################################################################ # Auxiliary functions # ################################################################################################################ smoot.z <- function(column,xi){ sm <- loess(column~xi,span=.85) sm$fitted } smoot.t <- function(row,time){ sm <- loess(row~time)#,span=.25) sm$fitted } time.points <- function(nt,times){ unlist(lapply(nt,function(x)which.min(abs(times-x))[1])) } bins <- function(nrows,xis,ylim){ kbin <- min(51,floor(nrows/2)) x <- xis lim<-range(xis) bins <- seq(lim[1], lim[2], length = kbin) x <- x[x >= lim[1] & x <= lim[2]] f <- table(if (version$major < 5) cut(x, bins) else oldCut(x,bins)) j <- f > 0 bins <- (bins[-kbin])[j] f <- f[j] f <- f/max(f)*diff(ylim)/10 segments(bins, ylim[1], bins, f+ylim[1]) } pseucheck.plot <- function(resid,geefit,varplot,nc,cov1,cips,ci,scaled,pvar,title,times,correction,betas,col,addline,link,nvar,lwd,bins){ if(missing(correction)){ if(nvar>1)correction <- TRUE else correction <- FALSE } if(missing(addline)){ if(correction)addline <- TRUE else addline <- FALSE } if(link=="cloglog"){ invf <- function(x)1-exp(-exp(x)) vf <- function(x)log(-log(1-x)) } else if(link=="cloglogs"){ invf <- function(x)exp(-exp(x)) vf <- function(x)log(-log(x)) } else if(link=="logit"){ invf <- function(x)exp(x)/(1+exp(x)) vf <- function(x)log(x/(1-x)) } else stop(paste("link function",link ,"not implemented")) kt <- 0 pvar <- pvar[varplot] #plot only the required covariates for(it in pvar){ kt <- kt +1 #kt counts the number of plotted covariates if(geefit)xi <- matrix(cov1[,varplot[kt]],ncol=nc,byrow=TRUE)[,1,drop=TRUE] #the chosen covariate #TU JE RAZLIKA else xi <- cov1[,it,drop=TRUE] if(resid){ #the residual plot if(!(nc==1&length(unique(xi))<=2)){ #supress plotting for binary variables if there is only one time point res <- (cips - ci) #raw residual if(scaled) res <- res/sqrt(ci*(1-ci)) #scaled residual res <- res[order(xi),,drop=FALSE] #ordered with respect to covariate for(jt in 1:nc){ #time points xlab <- ylab <- "" #labels are only on the left most graph and in the bottom panel if(jt==1)ylab <- "Pseudo-residual" if(it==pvar[length(pvar)]|nc==1)xlab <- "z" #ylim=range(apply(res[,jt,drop=FALSE],1,mean),na.rm=TRUE) #the same limits for all ylim <- range(res,na.rm=TRUE) plot(sort(xi),res[,jt],col="Gray70",ylim=ylim,xlab=xlab,ylab=ylab,cex=.3,axes=FALSE) axis(2,padj=.7) axis(1,padj=-.7) box() xm <- sort(xi[!is.na(res[,jt])]) #exclude the missing values ym <- res[!is.na(res[,jt]),jt] lfit <- loess(ym~xm)$fitted lines(xm,lfit,col=1,lwd=2) #plot the smoothed curve title(paste(title[kt],", t=",round(times[jt],1))) #title of the graph }#for each time-point }#if not supressed }#if(resid) else{ if(!(nc==1&length(unique(xi))<=2)){ #supress plotting for binary variables if there is only one time point ord <- order(xi) #order everything with respect to the covariate xis <- xi[ord] ci.t <- ci[ord,,drop=FALSE] #keep the old ci&cips as it was cips.t <- cips[ord,,drop=FALSE] s.ci <- apply(ci.t,2,smoot.z,xis) #smoothed with respect to the covariate s.cips <- apply(cips.t,2,smoot.z,xis) out <- vf(s.cips) if(correction){ #correction for more covariates out <- out - vf(s.ci) out <- out + matrix(betas[it]*xis,nrow=nrow(out),ncol=ncol(out)) #beta Z is added } if(nc>1)ylab <- paste(link,"(1-C(t|z))") else ylab <- paste(link,"S(t|z)") if(link=="cloglogs")paste("cloglog S(t|z)") if(missing(col))col <- 1:nc else if (length(col)==1) col <- rep(col,nc) if(addline)ylim <- range(cbind(out,betas[it]*xis),na.rm=TRUE) else ylim <- range(out,na.rm=TRUE) plot(xis,out[,1],ylim=ylim,xlab="z",ylab=ylab,type="n") # prepare the graph if(addline)lines(xis,betas[it]*xis,lty=2,lwd=2) # line added title(title[kt]) for(it in 1:ncol(out)){ lines(xis,out[,it],col=col[it],lwd=lwd) # plot curves } if(bins) bins(nrow(ci),xis,ylim) # add covariate distribution }#if plot not supressed }#else (scatterplot) }#for each covariate }