Worksheets/Week9

From Statistics for Engineering
Jump to navigation Jump to search

Part 1

The aim is to find the duration [hours] of the experiments to maximize the yield of a biological product. If you operate the reactor too long some side reactions start to occur which consume your product. If you operate for too short a time, then you miss out on the opportunity of creating more product.


# Copy/paste the code piece by piece. # Or run it piece by piece in RStudio. # Start small: with just 2 experiments center = 36 range = 24 rw.x <- c(24, 48) coded.x <- (rw.x - center)/(0.5*range) y0 <- c(28, 63) # Coded -1 : 24 hours # Coded +1 : 48 hours model.0 <- lm(y0 ~ coded.x) summary(model.0) # What is the interpretation of the # * slope? # * intercept # * why are R2 and SE the values they are? # Basic plot of everything so far: raw_data <- data.frame(coded.x = coded.x, y = y0) library(ggplot2) p <- ggplot(data=raw_data, aes(x=coded.x, y=y)) + geom_point(size=5) + xlab("Coded value for x_A") + scale_x_continuous(breaks=seq(-2, 5, 1)) + ylab("Outcome variable") + scale_y_continuous(breaks=seq(0, 150, 10)) + theme(axis.text=element_text(size=18), legend.position = "none") + theme(axis.title=element_text(face="bold", size=14)) p # Add the linear fit: through the 2 points plot_data <- data.frame(coded.x = seq(-2, +5, 0.1)) plot_data$y <- predict(model.0, newdata=plot_data) p <- p + geom_line(data=plot_data, color="blue", size=1) p # Run experiment at center point: predict it first rw.x <- c(24, 48, 36) coded.x <- (rw.x - center)/(0.5*range) x.test <- data.frame(coded.x=coded.x) predict(model.0, newdata=x.test) # Expect a predicted value of 45.5 in the output. Actual: 55. # Run a second experiment at the center, to get a feeling for spread. 54 # Similar values, and about 10 units difference from linear model. # Try fitting a linear model now through all the available data points: rw.x <- c(24, 48, 36, 36) y1 <- c(28, 63, 55, 54) coded.x <- (rw.x - center)/(0.5*range) model.1 <- lm(y1 ~ coded.x) summary(model.1) # Show the linear fit with the extra data point (center points): green line raw_data <- data.frame(coded.x = coded.x, y = y1) plot_data$y <- predict(model.1, newdata=plot_data) p <- p + geom_point(aes(x=coded.x, y=y, color='darkgreen', size=5), data=raw_data) p <- p + geom_line(data=plot_data, color="darkgreen", size=1) p # Seems like a quadratic model through all the data points could approximate it: rw.x <- c(24, 48, 36, 36) y1 <- c(28, 63, 55, 54) coded.x <- (rw.x - center)/(0.5*range) model.1.quad <- lm(y1 ~ coded.x + I(coded.x^2)) summary(model.1.quad) # Show the quadratic fit between -2 and +3 in coded units: red line # In real-world units this corresonds to _____ and _____ plot_data <- data.frame(coded.x = seq(-2, +3, 0.1)) plot_data$y <- predict(model.1.quad, newdata=plot_data) p <- p + geom_line(data=plot_data, color="red", size=1) p #------------ # Try a new point at +2: that is RW = coded * 0.5 * range + center; x = 60 hours rw.x <- c(24, 48, 36, 36, 60) coded.x <- (rw.x - center)/(0.5*range) x.test <- data.frame(coded.x=coded.x) predict(model.1.quad, newdata=x.test) # predicts 53.5 # Acutal y = 66. Therefore, our model isn't so good. Improve it with the new point rw.x <- c(24, 48, 36, 36, 60) y2 <- c(28, 63, 55, 54, 66) coded.x <- (rw.x - center)/(0.5*range) model.2.quad <- lm(y2 ~ coded.x + I(coded.x^2)) summary(model.2.quad) # Plot it again: purple raw_data <- data.frame(coded.x = coded.x, y = y2) plot_data <- data.frame(coded.x = seq(-2, +3, 0.1)) plot_data$y <- predict(model.2.quad, newdata=plot_data) p <- p + geom_point(aes(x=coded.x, y=y, color='purple', size=5), data=raw_data) p <- p + geom_line(data=plot_data, color="purple", size=1) p #------------ #------------ # Normally at this point I would reset the frame of reference; # keep our range the same (24 hours) # -1: 36 hours [prior coded "0" now becomes "-1] # 0: 48 hours [prior coded "+1" now becomes "0"] # +1: 60 hours [prior coded "+2" now becomes "+1"] center = 48 range = 24 # unchanged, but could also have been chosen to be smaller or bigger # At this point I will also switch the model type back to real-world units. # For 2 reasons: there is only 1 variable that is interesting (duration), so # coded units don't matter; secondly the model types we will use cannot be # built when the coded x-value is negative. rw.x <- c(24, 48, 36, 36, 60) y3 <- c(28, 63, 55, 54, 66) # Rebuild the model, and start the plots again. Try different model type model.3 <- lm(y3 ~ I(1/(rw.x))) summary(model.3) x.min = 20 x.max = 105 # Basic plot of everything so far: raw_data <- data.frame(rw.x = rw.x, y = y3) p <- ggplot(data=raw_data, aes(x=rw.x, y=y)) + geom_point(size=5) + xlab("Real-world values: time") + scale_x_continuous(breaks=seq(x.min, x.max,5)) + ylab("Outcome variable") + scale_y_continuous(breaks=seq(0, 150, 10)) + #theme(axis.text=element_text(size=18), legend.position = "none") + theme(axis.title=element_text(face="bold", size=14)) + expand_limits(x = c(x.min, x.max)) + expand_limits(y = c(50, 80)) p plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.5)) plot_data$y <- predict(model.3, newdata=plot_data) p <- p + geom_line(data=plot_data, color="blue", size=1) p # Run experiment at use 75 hours - just a little more]: predict it first rw.x <- c(24, 48, 36, 36, 60, 75) x.test <- data.frame(rw.x=rw.x) predict(model.3, newdata=x.test) # Expect a predicted value of 74.3 in the output. # Actual result: 79. # So about 5 units difference. Not too bad. rw.x <- c(24, 48, 36, 36, 60, 75) y4 <- c(28, 63, 55, 54, 66, 79) # Rebuild the model, and start the plots again. Different model type: x + x^2 model.4 <- lm(y4 ~ I(1/(rw.x))) summary(model.4) # Plot it again: purple raw_data <- data.frame(rw.x = rw.x, y = y4) plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.5)) plot_data$y <- predict(model.4, newdata=plot_data) p <- p + geom_point(aes(x=rw.x, y=y, color='purple', size=5), data=raw_data) p <- p + geom_line(data=plot_data, color="purple", size=1) p # ------- # Let's try at around 90 hours. Expect an outcome of around 80. # Got a value of 76 instead. Stabilized? rw.x <- c(24, 48, 36, 36, 60, 75, 90) y5 <- c(28, 63, 55, 54, 66, 79, 76) # Rebuild the model, and start the plots again. Different model type: x + x^2 model.5 <- lm(y5 ~ I(1/(rw.x))) summary(model.5) # Plot it again: green raw_data <- data.frame(rw.x = rw.x, y = y5) plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.5)) plot_data$y <- predict(model.5, newdata=plot_data) p <- p + geom_point(aes(x=rw.x, y=y, color='green', size=5), data=raw_data) p <- p + geom_line(data=plot_data, color="green", size=1) p # Try adding few more points: # Point 8. Try 95 hours. Seems to confirm stabilization. Predict 79; actual: 81 # Point 9: Overshoot(?) to 105 hour to see decline. Predict 81; actual: 72 # Build a model with all values now rw.x <- c(48, 36, 36, 60, 75, 90, 95, 105) y6 <- c(63, 55, 54, 66, 79, 76, 81, 72) # Rebuild the model, and start the plots again model.6 <- lm(y6 ~ I(1/(rw.x))) summary(model.6) raw_data <- data.frame(rw.x = rw.x, y = y6) plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.1)) plot_data$y <- predict(model.5, newdata=plot_data) p <- p + geom_point(aes(x=rw.x, y=y, color='red', size=5), data=raw_data) p <- p + geom_line(data=plot_data, color="red", size=1) p # The current model structure does not allow for decrease/stabilization. # Rebuild it with a different structure # Try various ones: x + 1/sqrt(x) SE = 4.11 # Try various ones: x + 1/x SE = 4.24 # Try various ones: x + log(x) SE = 3.98 # Try various ones: x + x^2 SE = 3.41 model.6.revised <- lm(y6 ~ rw.x + I((rw.x)^2)) # summary(model.6.revised) x.min = 35 x.max = 105 raw_data <- data.frame(rw.x = rw.x, y = y6) p <- ggplot(data=raw_data, aes(x=rw.x, y=y)) + geom_point(size=5) + xlab("Real-world values: time") + scale_x_continuous(breaks=seq(x.min, x.max,5)) + ylab("Outcome variable") + scale_y_continuous(breaks=seq(0, 150, 10)) + theme(axis.title=element_text(face="bold", size=14)) + expand_limits(x = c(x.min, x.max)) + expand_limits(y = c(50, 80)) p plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.5)) plot_data$y <- predict(model.6.revised, newdata=plot_data) p <- p + geom_line(data=plot_data, color="orange", size=1) p