######## Step 0 #########################################################
## Read in dataset0 - needs to be in data frame format
## The data structure:
## Column 1: SubjectID,
## Column 2: Treatment Indicator [called Trt in the function: Trt=1 in treatment,Trt=0 control]
## Column 3: outcome *
## * If it is a survival outcome, the Column 3 is the time to event [called TIME in the function]
## and column 4 is censoring status 1=event occured, 0=censored [called EVENT in the function]
## The rest of the columns (from Column c1 to Column c2) are binary covariate to define cells
## c1 and c2 are the starting and ending Column numbers of the binary covariates in the dataset
## type is the type of endpoint - type="cts" for continuous endpoint
## - type="bi" for binary endpoint
## - type="sv" for survival endpoint
## p is the probability each cell will be selected as in the paper
## k is the number of subpopulations to be drawn as in the paper
## n2 is the number of simulation ran to estimate the null distributions
## a is the required smallest number of cells included in one sub-population, the default number is 2
## i.e. to form one sub-population, at least 2 cells are required
## Two options are given in step 3, choose one of them to use, detault is option = "op2"
## option = "op1" is Parallel Computing, cl can be specified to be the number of worker clusters
## created and used, the default cl = 2
## option = "op2" is simplw for loop
############# Z-statistics Functions ################
# z-statistics function for continuous endpoint
fnz.cts=function(ncell,dataset,p,a){
flip0=rbinom(ncell,1,p)
while (sum(flip0)0,result1,0),na.rm = T)
T_D.2=mean(ifelse(result1<0,result1,0),na.rm = T)
###### Step 3 #######
###### Estimate the Distribution of S(D*) and T(D*) under the null hypothesis, as shown in the paper #######
# A function which runs z-statistics function using k selected sub-populations #
# assign fn.z to be one of the functions: fnz.cts, fnz.bi, fnz.sv
fk=function(k){
# Shuffle the treatment assignment in each cell dataset0$Trt is the Trt variable
dataset$Trt=unlist(tapply(dataset0$Trt,cellname,sample))
res=replicate(k,fn.z(ncell,dataset,p,a))
c(max(res,na.rm = T),min(res,na.rm = T), mean(ifelse(res>0,res,0),na.rm = T), mean(ifelse(res<0,res,0),na.rm = T))
}
## Under null hypothesis, the distribution of S(D*) and T(D*) ##
if(option=="op1"){
#Option 1: Parallel Computing
#cl=40 # the number of worker clusters will be created and used, default is 2
registerDoParallel(cl)
Tstat=foreach(i=1:n2,.combine='rbind', .packages='survival') %dorng% fk(k)
} else if (option=="op2"){
#Option 2: for loop
Tstat=NULL
for (i in 1:n2){
Tstat0=fk(k)
Tstat=rbind(Tstat,Tstat0)
}
} else {
print("error")
}
###### Step 4 #######
###### Calculate under the null distribution ##########
###### the probability of any value exceeding the observed S(D) and T(D) #######
## Extreme Value Test
I.S.1=round(length(which(Tstat[,1]>S_D.1))/n2,digits=4)
I.T.1=round(length(which(Tstat[,2]S_D.2))/n2,digits=4)
I.T.2=round(length(which(Tstat[,4]