#functions for calculating the reduced citation distribution and reduced h-index cdis <- function(data,p,conf.level=.95){ #function for calculation of the reduced citation distribution: fixed probability #input: #data: a vector of paper citations (the value of citations for each paper, no need to be sorted) #p= the fixed probability = 1/reducement factor #conf.level=the level for the confidence interval calculation #output: #n= paper rank #cnew= expected values in the reduced citation distribution #sd= standard error #lo= lower bound of the CI #upp= upper bound of the CI n <- length(data) #number of all papers of this group npos <- sum(data!=0) #number of nonzero data <- rev(sort(data)) #sorted by number of citation cn <- sd <- lo <- upp <- rep(0,n) #to bodo novi, expected citati alfa <- 1-conf.level q <- 1-p #1-probability of publishing for(k in 1:npos){ datanz <- data[k:n] jtc <- k:n jtc <- jtc[datanz>0] datanz <- datanz[datanz>0] #only those who are greater than zero npos <- length(datanz) #number of nonzero papers probs <- rep(NA,npos) probs[1] <- p^k if(npos>1){for(jt in 2:npos)probs[jt] <- choose(jtc[jt]-1,k-1)*p^(k-1)*q^(jtc[jt]-k)*p} #cat(sum(probs),"\n") cn[k] <- sum(probs * datanz) sd[k] <- sqrt( sum(probs*(datanz - sum(probs * datanz))^2)) kir <- which (cumsum(probs)>(1-alfa/2))[1] if(!is.na(kir))lo[k] <- datanz[kir] else lo[k] <- 0 kir <- which(cumsum(probs)>alfa/2)[1] if(!is.na(kir))upp[k] <- datanz[kir] else kir[k] <- 0 } #fac <- abs(qnorm((1-conf.level)/2)) #old - calculation of CI using normal distr. ass. #lower <- cn - fac*sd # lower[lower<0] <- 0 # upper <- cn+fac*sd list(n=1:length(cn),cnew=cn,sd=sd,lo=lo,upp=upp) } hvr <- function(data,pji,conf.level=0.95){ #function for calculation of the h-index in the reduced data set (assuming fixed probablity) #input: #data: a vector of paper citations (the value of citations for each paper, no need to be sorted) #p= the fixed probability = 1/reducement factor (can be a vector of values) #conf.level=the level for the confidence interval calculation #output: #n= fixed probability #cnew= expected value in the reduced citation distribution #sd= standard error #lower= lower bound of the CI using normal approximation #upper= upper bound of the CI dmax <- length(pji) #number of different h values rezh <- varja <- rep(NA,dmax) for(d in 1:dmax){ maxh <- h.inx(data) hp <- rep(NA,maxh+1) for(h in 0:maxh){ p <- pji[d] inxa <- which(data>=h+1) inxb <- which(data==h) na <- length(inxa) nb <- length(inxb) verj <- 0 for(it in 1:h){ if(nb >= (h-it))verj <- verj + dbinom(it,na,p)*sum(dbinom((h-it):nb,nb,p)) } hp[h+1] <- verj } rezh[d] <- sum((0:maxh)*hp) varja[d] <- sqrt(sum((0:maxh)^2*hp) - (sum((0:maxh)*hp))^2) } fac <- abs(qnorm((1-conf.level)/2)) lower <- rezh - fac*varja lower[lower<0] <- 0 upper <- rezh+fac*varja list(n=pji,cnew=rezh,sd=varja,lower=lower,upper=upper) } h.inx <- function(x){ #h index of the original data set ven <- which(x==Inf) if(length(ven)>0)x <- x[-ven] which(rev(sort(x,na.last=F)) < (1:length(x)))[1]-1 } cdisn <- function(data,p,conf.level=.95){ #function for calculation of the reduced citation distribution: fixed size #input: #data: a vector of paper citations (the value of citations for each paper, no need to be sorted) #p= sample size of the new, smaller sample #conf.level=the level for the confidence interval calculation #output: #n= paper rank #cnew= expected values in the reduced citation distribution #sd= standard error #lower= lower bound of the CI using normal approximation #upper= upper bound of the CI n <- length(data) #number of all papers for the group npos <- sum(data!=0) #number of nonzero data <- rev(sort(data)) #sorted by number of citation cn <- sd <- rep(NA,p) #to bodo novi, expected citati for(k in 1:p){ datanz <- data[k:n] jtc <- k:n jtc <- jtc[datanz>0] datanz <- datanz[datanz>0] #only those who are greater than zero npos <- length(datanz) #number of nonzero papers probs <- rep(0,npos) probs[1] <- choose(n-k,p-k)/choose(n,p) for(jt in 2:npos){ if((n-jtc[jt]) >= p-k) probs[jt] <- choose(jtc[jt]-1,k-1)*choose(n-jtc[jt],p-k)/choose(n,p) } #if(k==1)probn <<-probs #cat(sum(probs),"\n") cn[k] <- sum(probs * datanz) # sd[k] <- sqrt(sum(probs*datanz^2) - cn[k]^2) sd[k] <- sqrt( sum(probs*(datanz - sum(probs * datanz))^2)) } fac <- abs(qnorm((1-conf.level)/2)) #this should be replaced by percentiles lower <- cn - fac*sd lower[lower<0] <- 0 upper <- cn+fac*sd list(n=1:length(cn),cnew=cn,sd=sd,lower=lower,upper=upper) }