# Volně přístupný kód G-testu jsme doplnili o možnost simulované p-hodnoty metodou Monte Carlo G.test <- function(x, y = NULL, correct=c("none", "williams", "yates"), p = rep(1/length(x), length(x)), rescale.p = FALSE, simulate.p.value = FALSE, B = 2000) { # Log-likelihood tests of independence & goodness of fit # Does Williams' and Yates' correction # Does Monte Carlo simulation of p-values, via Patefield's algorithm # G & q calculation from Sokal & Rohlf (1995) Biometry 3rd ed. # TOI Yates' correction taken from Mike Camann's 2x2 G-test fn. # GOF Yates' correction as described in Zar (2000) # Part of code taken from stats' chisq.test() DNAME <- deparse(substitute(x)) if (is.data.frame(x)) x <- as.matrix(x) if (is.matrix(x)) { if (min(dim(x)) == 1) x <- as.vector(x) } if (!is.matrix(x) && !is.null(y)) { if (length(x) != length(y)) stop("x and y must have the same length") DNAME <- paste(DNAME, "and", deparse(substitute(y))) OK <- complete.cases(x, y) x <- as.factor(x[OK]) y <- as.factor(y[OK]) if ((nlevels(x) < 2) || (nlevels(y) < 2)) stop("x and y must have at least 2 levels") x <- table(x, y) } if (any(x < 0) || any(is.na(x))) stop("all entries of x must be nonnegative and finite") if ((n <- sum(x)) == 0) stop("at least one entry of x must be positive") if (simulate.p.value) { setMETH <- function() METHOD <<- paste(METHOD, "with simulated p-value\n\t (based on", B, "replicates)") almost.1 <- 1 - 64 * .Machine$double.eps } correct <- match.arg(correct) if (is.matrix(x)) { # If x is matrix, do test of independence METHOD <- "Log likelihood ratio test (G-test) of independence" nrows<-nrow(x) ncols<-ncol(x) if (correct=="yates"){ # Do Yates' correction? if(dim(x)[1]!=2L || dim(x)[2]!=2L) # Check for 2x2 matrix stop("Yates' correction requires a 2 x 2 matrix") if((x[1,1]*x[2,2])-(x[1,2]*x[2,1]) > 0) { x <- x + 0.5 diag(x) <- diag(x) - 1 } else { x <- x - 0.5 diag(x) <- diag(x) + 1 } } sr <- apply(x,1,sum) sc <- apply(x,2,sum) E <- outer(sr,sc,"*")/n dimnames(E) <- dimnames(x) if (simulate.p.value) { # Monte Carlo p-value setMETH() tbls <- r2dtable(B, sr, sc) tmp <- NULL for (t in tbls) { g <- 0 for (i in 1:nrows){ for (j in 1:ncols){ if (t[i,j] != 0) g <- g + t[i,j] * log(t[i,j]/E[i,j]) } } tmp <- c(tmp, 2*g) } g <- 0 for (i in 1:nrows){ for (j in 1:ncols){ if (x[i,j] != 0) g <- g + x[i,j] * log(x[i,j]/E[i,j]) } } STATISTIC <- G <- 2 * g PARAMETER <- NA PVAL <- (1 + sum(tmp >= almost.1 * STATISTIC)) / (B + 1) } else { # No Monte Carlo p-value g <- 0 for (i in 1:nrows){ for (j in 1:ncols){ if (x[i,j] != 0) g <- g + x[i,j] * log(x[i,j]/E[i,j]) } } q <- 1 if (correct=="williams"){ # Do Williams' correction row.tot <- col.tot <- 0 for (i in 1:nrows){ row.tot <- row.tot + 1/(sum(x[i,])) } for (j in 1:ncols){ col.tot <- col.tot + 1/(sum(x[,j])) } q <- 1 + ((n*row.tot-1)*(n*col.tot-1))/(6*n*(ncols-1)*(nrows-1)) } STATISTIC <- G <- 2 * g / q PARAMETER <- (nrow(x)-1)*(ncol(x)-1) PVAL <- 1-pchisq(STATISTIC,df=PARAMETER) if(correct=="none") METHOD <- "Log likelihood ratio test (G-test) of independence without correction" if(correct=="williams") METHOD <- "Log likelihood ratio test (G-test) of independence with Williams' correction" if(correct=="yates") METHOD <- "Log likelihood ratio test (G-test) of independence with Yates' continuity correction" } } else { # x is not a matrix, so we do Goodness of Fit METHOD <- "Log likelihood ratio goodness of fit test (G-test)" if (length(dim(x)) > 2L) stop("invalid 'x'") if (length(x) == 1L) stop("x must at least have 2 elements") if (length(x) != length(p)) stop("'x' and 'p' must have the same number of elements") if (any(p < 0)) stop("probabilities must be non-negative.") if (abs(sum(p) - 1) > sqrt(.Machine$double.eps)) { if (rescale.p) p <- p/sum(p) else stop("probabilities must sum to 1.") } E <- n * p names(E) <- names(x) g <- 0 for (i in 1:length(x)){ if (x[i] != 0) g <- g + x[i] * log(x[i]/E[i]) } q <- 1 if (correct=="williams"){ # Do Williams' correction q <- 1+(length(x)+1)/(6*n) } STATISTIC <- G <- 2*g/q names(E) <- names(x) if(simulate.p.value) { setMETH() nx <- length(x) sm <- matrix(sample.int(nx, B*n, TRUE, prob = p),nrow = n) ss <- apply(sm, 2L, function(x) { g <- 0 for (i in 1L:nx) { if (x[i] != 0) g <- g + x[i] * log(x[i]/E[i]) } }) PARAMETER <- NA PVAL <- (1 + sum(ss >= almost.1 * STATISTIC))/(B + 1) } else { PARAMETER <- length(x) - 1 PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE) } if (correct=="yates"){ # Do Yates' correction if(length(x)!=2) stop("Yates' correction requires 2 data values") if ( (x[1]-E[1]) > 0.25) { x[1] <- x[1]-0.5 x[2] <- x[2]+0.5 } else if ( (E[1]-x[1]) > 0.25) { x[1] <- x[1]+0.5 x[2] <- x[2]-0.5 } } PARAMETER <- length(x) - 1 PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE) if(correct=="williams") METHOD <- "Log likelihood ratio goodness of fit test (G-test) with Williams' correction" if(correct=="yates") METHOD <- "Log likelihood ratio goodness of fit test (G-test) with Yates' continuity correction" } names(STATISTIC) <- "G" names(PARAMETER) <- "X-squared df" names(PVAL) <- "p.value" structure(list(statistic=STATISTIC,parameter=PARAMETER,p.value=PVAL, method=METHOD,data.name=DNAME, observed=x, expected=E), class="htest") }