LAB 11. STANDARD STATISTICS REVISITED AND COMPLEX ERROR STRUCTURES setwd("C:/Inference Course Spring 2006/Labs/Likelihood course") #library(emdbook) library(bbmle) data<-read.table("crown_rad.txt", header = TRUE) head(data) attach(data) Species<-as.factor(Species) plot(DBH, Radius) ## LINEAR REGRESSION ################################################### lm.reg<-lm(Radius~DBH) summary(lm.reg) ## Now how would you do this using likelihood? linreg.fun<-function(a,b,sigma) { Rad.pred<-a + b*DBH -sum(dnorm(Radius, mean=Rad.pred, sd=sigma, log=TRUE)) } m.lr<-mle(linreg.fun, start= list(a=1, b=1, sigma=1), method="L-BFGS-B") m.lr # Essentially same value as lm. ##ANALYSIS OF VARIANCE################################################ # Predicting Y as afunction of two categorical variables. # Are there species differences in DBH # First we plot the data: xbar<-tapply(DBH, Species, mean) s<-tapply(DBH, Species, sd) n<-tapply(DBH, Species, length) sem<-s/sqrt(n) stripchart(DBH~Species, jitter=0.09, pch=16, vert=TRUE) # using a linear model: lm.aov<-lm(DBH~Species) summary(lm.aov) anova(lm.aov) mean.DBH.by.species<-tapply(DBH, Species, mean) # The same calculation using likelihood: aovfun<-function(a11, a12, sigma) { Y.pred<-c(a11,a12)[Species] -sum(dnorm(DBH, mean=Y.pred, sd=sigma, log=TRUE)) } m.aov<-mle(aovfun, start= list(a11=15, a12=15, sigma=1), method="L-BFGS-B", control=list(maxit=50000)) summary(m.aov) # essentially same value as lm. # However, the output provides the means of the variable for all the values the factor (Species) can take. # Inference using likelihood requires that you compare the likeli for the two Species model to one # in which you only estmate one mean for all species. That is: aovfun2<-function(a, sigma) { Y.pred<-a -sum(dnorm(DBH, mean=Y.pred, sd=sigma, log=TRUE)) } m.aov2<-mle(aovfun2, start= list(a=15, sigma=1), method="L-BFGS-B", control=list(maxit=50000)) summary(m.aov2) # You will then need to use LRT or AIC to determine whether the ANOVA is significant. This is clearly more work # than one would like to do so, if you have normal data, you may want to stick with the ANOVA. ## ANALYSIS OF COVARIANCE ########################################### ## Different slopes model: assume that the relationship between radius and DBH varies by species. # Let's plot it first plot(DBH, Radius, pch=as.numeric(Species)) lm.anc<-lm(Radius~Species*DBH) summary(lm.anc) # So the output can be interpreted as follows: # The first and third coef are the intercept and slope for Species 1 and # the second and fourth are the difference between # intercept and slope for Species 1 and Species 2 # int 1=0.63 slope1=0.07 int2=1.77 slope2=0.0279. # let's see if we get the same results with likelihood ancfun<-function(a11, a12, slope1, slope2, sigma) { Rad.pred<-c(a11,a12)[Species] + c(slope1, slope2)[Species]*DBH -sum(dnorm(Radius, mean=Rad.pred, sd=sigma, log=TRUE)) } m.ancov<-mle(ancfun, start= list(a11=1, a12=1,slope1=1, slope2=1, sigma=5), method="L-BFGS-B", lower=0.01) # Again inference would require that you compare the likeli of the different slopes model with one of # identical slopes. How would you do this? # NON-LINEAR LEAST SQUARES ######################################### # This is very similar to the numerical methods we use to maximize the likelihood # but here we are trying to minimize the sums of squares. nls1<-nls(Radius~a*DBH^b, start=list(a=1,b=1)) summary(nls1) # How would you do this using likelihood? # GENERALIZED LINEAR MODELS ########################################## ## CASE 1: POISSON REGRESSION # First we simulate some data x<-seq(1, 10, length=1000) a=0.5 b=0.5 y=a + b*x E.y<-exp(a + b*x) # The transformation required to linearize the problem data.y<-rpois(length(y), lambda=E.y) # Note that this is count data. plot(x, data.y) ## Use glm lm.pois<-glm(data.y~x ,family=poisson) summary(lm.pois) # Try the likelihood version: poisregfun<-function(a,b, sigma) { Y.pred<-exp(a + b*x) # This is correct because remember that you exponentiated the data to linearize # it and use Poisson errors. For likelihood you have to reverse this. -sum(dpois(data.y, lambda=Y.pred, log=TRUE)) } glm1<-mle2(poisregfun, start=list(a=0.5, b=1), method="L-BFGS-B", lower=list(a=0, b=0), upper=list(a=5, b=5)) # So as you can see the estimates are identical under both methods. ############################################################# # # CASE2: LOGISTIC REGRESSION # ############################################################# # first we generate some data: x<-seq(1, 10, length=1000) a=0.5 b=0.01 Y.pred= a+b*x data.y<-rbinom(size=1, plogis(Y.pred), n=length(Y.pred)) # Note that this is binomial data plot(x, data.y) # Using a GLM glm2<-glm(data.y~x, family=binomial) summary(glm2) #With likelihood: logregfun<-function(a, b) { p.pred<-exp(a+b*x)/(1+exp(a+ b*x)) # OR p.pred<-plogis(a+b*x) -sum(dbinom(data.y, size=1, prob=p.pred, log=TRUE)) } glm3<-mle(logregfun, start=list(a=0.1, b=0.1)) glm3 # And again we get the same values. ################################################ # # COMPLEX ERROR STRUCTURES # ################################################ rho=0.5 m<-matrix(nrow=5, ncol=5) m<-rho^(abs(row(m)-col(m))) # or correlation with neighbors m<-diag(5) m[abs(row(m)-col(m))==1]=rho # An example may be a group of plants for which we have some predictors of growth (say height) # but when we look at the residuls we find that there is a spatial correlation at the scale of the neighbor). # We are trying to estimate the correlation in addition to the parameters of the models. radius<-Radius[1:40] dbh<-DBH[1:40] mvlik<-function(a,b, rho) { pred.rad=a+b*dbh n=length(radius) m=diag(n) #generates diag matrix of n rows, n columns m[abs(row(m)-col(m))==1]=rho # You need to define this upfront and you may not know this so it is # useful to do a variogram to estimate the spatial scale of the correlation. # see R package sgeostat -dmvnorm(radius, pred.rad, Sigma=m, log=TRUE) } model.4<-mle(mvlik, start=list(a=0.5, b=3,rho=0.5), method="L-BFGS-B", lower=0.001) summary(model.4) # The estimate of rho is at the lowest limit because there is # no built in correlation in these data. you could generate a matrix # that is built based on geographic or eucledian distance.