pseu <- function(formula,data,type=c("surv","cox","add"),resid=FALSE,test=FALSE,nt=3,pl=TRUE,ylim,col,bins=TRUE,linpred=FALSE){ ################################################################################################################ # A function for graphical and formal testing of the Cox and additive model assumptions # # # # help files available at http://www.mf.uni-lj.si/ibmi-english/biostat-center/index.html -> software # ################################################################################################################ options(warn=-1) require(timereg) require(survival) require(Design) require(geepack) if(pl==F & (test==F |type=="surv"))stop("You have chosen neither plotting neither model testing") type <- match.arg(type) call <- match.call() m <- match.call(expand = FALSE) temp <- c("", "formula", "data", "na.action") m <- m[match(temp, names(m), nomatch = 0)] Terms <- if (missing(data)) terms(formula) else terms(formula, data = data) m$formula <- Terms m[[1]] <- as.name("model.frame") m <- eval(m, parent.frame()) if (nrow(m) == 0) stop("No (non-missing) observations") Y <- model.extract(m, "response") if (!inherits(Y, "Surv")) stop("Response must be a survival object") if (attr(Y, "type") != "right") stop(paste("Time-dependent covariates not allowed")) tt <- Y[,1] st <- Y[,2] pod <- data.frame(tt=tt,st=st,m[,-1]) names(pod) <- c("tt","st",names(m)[-1]) pod <- pod[order(pod$tt,-pod$st),] times <- unique(pod$tt[pod$st==1]) a <- pseudosurv(pod$tt,pod$st,times)[,-(1:2)] xi <- pod[,3] n <- length(xi) d <- length(times) xis <- sort(xi) nvar <- ncol(m)-1 out <- NULL if(pl){ if(type=="surv" & resid==TRUE)stop("The resid=T option can be used with type=Cox or type=add") if((nvar>1&type!="surv")|resid){ xmat <- as.matrix(pod[,-(1:2),drop=FALSE]) if(type=="cox"){ fit <- coxph(Surv(tt,st)~xmat,data=pod) bx <- fit$linear.predictors survfit <- getFromNamespace("survfit", "survival") basefit <- summary(survfit(fit),times=times)$surv #baseline hazard for the full model Stx <- matrix(rep(basefit, each = n)^exp(bx), byrow = F, nrow = n) } else{ xx <- function(x) formula(x) formula <- xx(paste("Surv(tt,st)~",paste("const(",names(m)[-1],")",collapse="+"),sep="")) fit <- aalen(formula,data=pod) B <- fit$cum B <- B[2:(d+1),] a0 <- matrix(B[,2],byrow=T,ncol=d,nrow=n) bx <- xmat%*%fit$gamma b <- matrix(rep(bx,d),ncol=d,nrow=n)*matrix(B[,1],byrow=T,ncol=d,nrow=n) Stx <- exp(-(a0+b)) } dp <- ncol(a) - ncol(Stx) if(dp>0) { Stx <- cbind(Stx,matrix(NA,nrow=n,ncol=dp)) } else if(dp<0){ Stx <- Stx[,1:d] } if(!resid){ if(linpred)xi <- bx Stx <- Stx[order(xi),] fs1 <- smoothy(Stx,nt,times,xis)$f1 } }#if((nvar>1&type!="surv")|resid) if(resid){ res <- (a- Stx)/sqrt(Stx*(1-Stx)) wh <- time.points(nt,d) timess <- times[wh] if(linpred) xi <- bx res <- res[order(xi),] if(missing(ylim)) ylim=range(apply(res[,wh],1,mean),na.rm=T) for(it in 1:nt){ plot(sort(xi),res[,wh[it]],col="grey70",ylim=ylim,xlab="",ylab="",cex=.3) xm <- sort(xi[!is.na(res[,wh[it]])]) ym <- res[!is.na(res[,wh[it]]),wh[it]] lfit <- loess(ym~xm)$fitted lines(xm,lfit,col=1,lwd=2) } }#if(resid) else{ #pseu values sedaj urejene po x, ki me zanima if(linpred){ xi <- bx xis <- sort(bx) } as <- a[order(xi),] ftemp <- smoothy(as,nt,times,xis) f1 <- ftemp$f1 timess <- ftemp$timess if(type=="cox"){ out <- log(-log(f1)) if(nvar>1) out <- out - log(-log(fs1)) + matrix(fit$coef[1]*xis,nrow=nrow(out),ncol=ncol(out)) ylab <- "cloglog S(t|z)" } else if(type=="add") { timi <- matrix(timess,byrow=T,ncol=ncol(f1),nrow=nrow(f1)) out <- -log(f1)/timi if(nvar>1) out <- out + log(fs1)/timi + matrix(fit$gamma[1]*xis,nrow=nrow(out),ncol=ncol(out)) ylab="log S(t|z)/t" } else{ out <- f1 ylab="S(t|z)" } if(missing(col))col <- 1:nt else if (length(col)==1) col <- rep(col,nt) if(missing(ylim))ylim <- range(out,na.rm=T) plot(xis,out[,1],type="l",ylim=ylim,xlab="z",ylab=ylab,col=col[1]) for(it in 2:ncol(out)){ lines(xis,out[,it],col=col[it]) } if(bins) bins(pod,xis,ylim) out <- list(ps=out,time=timess,z=xis) }#else }#if(pl) if(test&type!="surv"){ link <- "cloglog" if(type=="add")link <- "log" wh <- time.points(nt,d) atv <- as.vector(a[,wh]) timv <- rep(times[wh],each=n) idv <- rep(1:n,length(wh)) zbv <- rep(pod[,3],nt) if(nvar>1){ zb2 <- NULL for(it in 2:nvar){ zb2 <- cbind(zb2,y=rep(pod[,2+it],nt)) } } d1 <- data.frame(time=timv,id=idv,pseu=atv,x=zbv) if(nvar>1)d1<- cbind(d1,zb2) if(type=="cox")d1$pseu <- 1-d1$pseu d1 <- d1[order(d1$id),] duse <- d1 if(type=="add")duse[,-(1:3)] <- duse[,-(1:3)]*matrix(rep(d1$time,nvar),ncol=nvar) corstr="independence" if(nvar>1)fgee <- geese(pseu~x+as.matrix(duse[,-(1:4)])+as.factor(time)-1,id=id,data=duse,family=gaussian,mean.link=link,corstr=corstr) else fgee <- geese(pseu~x+as.factor(time)-1,id=id,data=duse,family=gaussian,mean.link=link,corstr=corstr) testrez <- list(NULL) testrez$x <- (-1)^(type=="add")* fgee$beta[1] testrez$p <- unlist(pfungee(fgee,1)) testrez$SE <- sqrt(fgee$vbeta[1,1]) d1$x2 <- as.vector(rcs(d1$x,4)[,2]) d1$x3 <- as.vector(rcs(d1$x,4)[,3]) duse <- d1 if(type=="add")duse[,-(1:3)] <- duse[,-(1:3)]*matrix(rep(d1$time,nvar),ncol=nvar) if(nvar>1)ymat <- as.matrix(duse[,5:(3+nvar)]) dmin <- length(unique(d1$time))+2+nvar-1 if(nvar>1)fgee <- geese(pseu~ymat+as.factor(time)*(x+x2+x3)-1,id=id,data=duse,family=gaussian,mean.link=link,corstr=corstr) else fgee <- geese(pseu~as.factor(time)*(x+x2+x3)-1,id=id,data=duse,family=gaussian,mean.link=link,corstr=corstr) #H0: no problems in any direction: testrez$p <- rbind(testrez$p,unlist(pfungee(fgee,dmin:length(fgee$beta)))) if(nvar>1) fgee <- geese(pseu~ymat+as.factor(time)+x+x2+x3-1,id=id,data=duse,family=gaussian,mean.link=link,corstr=corstr) else fgee <- geese(pseu~as.factor(time)+x+x2+x3-1,id=id,data=duse,family=gaussian,mean.link=link,corstr=corstr) testrez$p <- rbind(testrez$p,unlist(pfungee(fgee,dmin:length(fgee$beta)))) #H0: constant in time (linearity assumed) if(nvar>1) fgee <- geese(pseu~ymat+x*as.factor(time) -1,id=id,data=duse,family=gaussian,mean.link=link,corstr=corstr) else fgee <- geese(pseu~x*as.factor(time) -1,id=id,data=duse,family=gaussian,mean.link=link,corstr=corstr) testrez$p <- rbind(testrez$p,unlist(pfungee(fgee,dmin:length(fgee$beta)))) testrez$p <- data.frame(testrez$p[c(1,3,4,2),]) rownames(testrez$p)<-c(names(pod)[3],"linearity (assumed constant in time)","constant in time (assumed linearity)","overall") cat("coef=",format(testrez$x,digits=4),", SE=",format(testrez$SE,4),"\n") print(format(testrez$p)) out$test <- testrez$p } invisible(out) options(warn=0) } ################################################################################################################ # Auxiliary functions # ################################################################################################################ smoothy <- function(as,nt,times,xis){ #as: matrix of values, properly sorted #nt: number of lines #times: event times - sorted #xis: covariate values - sorted d <- length(times) sm.fun <- function(stolpec,xi){ sm <- loess(stolpec~xi,span=.85) sm$fitted } sm.funt <- function(vrstica,time){ sm <- loess(vrstica~time)#,span=.25) sm$fitted } f2 <- t(apply(as,1,sm.funt,times)) #izberem samo dolocene case wh <- time.points(nt,d) timess <- times[wh] asw <- f2[,wh] f1 <- apply(asw,2,sm.fun,xis) list(f1=f1,timess=timess) } bins <- function(pod,xis,ylim){ kbin <- min(51,floor(nrow(pod)/2)) x <- xis lim<-range(xis) bins <- seq(lim[1], lim[2], length = kbin) x <- x[x >= lim[1] & x <= lim[2]] f <- table(if (version$major < 5) cut(x, bins) else oldCut(x,bins)) j <- f > 0 bins <- (bins[-kbin])[j] f <- f[j] f <- f/max(f)*diff(ylim)/10 segments(bins, ylim[1], bins, f+ylim[1]) } time.points <- function(nt,d){ if(nt==0){ ntt <- .25 nt <- 3 } else if(nt>9) ntt <- .1 else ntt <- 1/(nt+1) wh <- round(quantile(1:d,seq(ntt,1-ntt,length=nt))) } pfungee <- function(fit,d,varmat=fit$vbeta){ b1 <- fit$beta[d] v1 <- varmat[d,d] hi <- try(b1%*%solve(v1)%*%b1) if(!inherits(hi,"try-error"))p <- 1-pchisq(hi,length(d)) else p <- NA list(chi2=hi,df=length(d),p=p) } "pseudosurv" <- function(time, cens, tmax){ if(any(is.na(time))) stop("missing values in 'time' vector") if(any(time)<0) stop("'time' must be nonnegative") if(any(is.na(cens))) stop("missing values in 'cens' vector") if(any(cens!=0 & cens!=1)) stop("'cens' must be a 0/1 variable (alive/dead)") if(missing(tmax)) tmax <- unique(time[cens==1]) if(any(is.na(tmax))) stop("missing values in 'tmax' vector") if (sum(tmax > max(time)) > 0) stop ("tmax greater than largest observation time") tmax <- sort(tmax) ltmax <- length(tmax) howmany <- length(time) ## preparing the output pseudo <- as.data.frame(matrix(data = NA, ncol = length(tmax)+3, nrow = howmany)) pseudo[,1] <- 1:howmany pseudo[,2] <- time pseudo[,3] <- cens names(pseudo) <- c("id","time", "cens",paste("tmax =", round(tmax,3), sep="")) # sort in time pseudo <- pseudo[order(pseudo$time,-pseudo$cens),] # time points chosen tu <- unique(pseudo$time[pseudo$cens==1]) ltu <- length(tu) tu <- matrix(tu,byrow=TRUE,ncol=ltu,nrow=ltmax) tgiven <- matrix(tmax,byrow=FALSE,ncol=ltu,nrow=ltmax) inx <- apply(tgiven>=tu,1,sum) # KM, leave one out KM.omit <- surv.omit(pseudo) KM.omit <- KM.omit[,inx,drop=FALSE] # KM, all cases KM.tot <- surv.tot(pseudo)[inx] KM.tot <- matrix(KM.tot,byrow=TRUE,nrow=howmany,ncol=length(tmax)) # pseudo-observations pseu <- howmany*KM.tot - (howmany-1)*KM.omit # back to original order pseudo[,-(1:3)] <- pseu pseudo <- pseudo[order(pseudo$id),] pseudo <- pseudo[,-1] return(pseudo) } "surv.omit" <- function(pseudo,tmax){ # calculate Kaplan - Meier, leave one out howmany <- nrow(pseudo) td <- pseudo$time[pseudo$cens==1] lt.temp <- c(td[-1],td[length(td)]+1) lt <- which(td!=lt.temp) #km - i Y1 <- matrix(howmany:1,byrow=TRUE,ncol=howmany,nrow=howmany) Y2 <- matrix((howmany-1):0,byrow=TRUE,ncol=howmany,nrow=howmany) Y <- upper.tri(Y1,diag=FALSE)*Y1+lower.tri(Y2,diag=TRUE)*Y2 N <- matrix(pseudo$cens,byrow=TRUE,ncol=howmany,nrow=howmany) Ndiag <- diag(diag(N)) N <- N - Ndiag kmji <- (Y-N)/Y km <- t(apply(kmji,1,cumprod)) if(!missing(tmax)){ tt <- matrix(pseudo$time,byrow=TRUE,nrow=nrow(pseudo),ncol=nrow(pseudo)) #diag(tt) <- c(diag(tt[-nrow(pseudo),-1]),tmax) diag(tt) <- c(0,diag(tt[-1,-nrow(pseudo)])) tt <- tt[,pseudo$cens==1,drop=FALSE] tt <- tt[,lt,drop=FALSE] tt <- cbind(rep(0,nrow(pseudo)),tt,rep(tmax,nrow(pseudo))) tt <- t(apply(tt,1,diff)) } #corrected value for the last time - last value carried forward aje <- which(is.na(km[howmany,])) if(length(aje)>0){ kir <- min(aje) km[howmany,kir:ncol(km)] <- km[howmany,kir-1] } #only for deaths, one value per tie km <- km[,pseudo$cens==1,drop=FALSE] km <- km[,lt,drop=FALSE] if(!missing(tmax)){ km <- apply(cbind(rep(1,nrow(pseudo)),km)*tt,1,sum) } km } "surv.tot" <- function(pseudo,tmax){ # calculate Kaplan - Meier, all cases howmany <- nrow(pseudo) td <- pseudo$time[pseudo$cens==1] lt.temp <- c(td[-1],td[length(td)]+1) lt <- which(td!=lt.temp) #km - i Y <- howmany:1 N <- pseudo$cens kmji <- (Y-N)/Y km <- cumprod(kmji) if(!missing(tmax)){ tt <- pseudo$time[pseudo$cens==1] tt <- tt[lt] tt <- c(0,tt,tmax) tt <- diff(tt) } #only for deaths, one value per tie km <- km[pseudo$cens==1] km <- km[lt] if(!missing(tmax)){ km <- sum(c(1,km)*tt) } km }