Difference between revisions of "Worksheets/Week9"
Jump to navigation
Jump to search
Kevin Dunn (talk | contribs) (→Part 1) |
Kevin Dunn (talk | contribs) (→Part 1) |
||
Line 5: | Line 5: | ||
<html><div data-datacamp-exercise data-lang="r" data-height="500"> | <html><div data-datacamp-exercise data-lang="r" data-height="500"> | ||
<code data-type="sample-code"> | <code data-type="sample-code"> | ||
# Copy/paste the code piece by piece. | |||
# Start small: with just 2 experiments | |||
center = 36 | center = 36 | ||
range = 24 | range = 24 | ||
rw.x <- c(24, 48) | rw.x <- c(24, 48) | ||
coded.x <- (rw.x - center)/(0.5*range) | coded.x <- (rw.x - center)/(0.5*range) | ||
y0 <- c(28, 63) | y0 <- c(28, 63) | ||
# Coded -1 : 24 hours | |||
# Coded +1 : 48 hours | |||
model.0 <- lm(y0 ~ coded.x) | model.0 <- lm(y0 ~ coded.x) | ||
Line 18: | Line 23: | ||
# * slope? | # * slope? | ||
# * intercept | # * intercept | ||
# * why are R2 and SE | # * why are R2 and SE the values they are? | ||
# Basic plot of everything so far: | # Basic plot of everything so far: | ||
Line 29: | Line 34: | ||
ylab("Outcome variable") + | ylab("Outcome variable") + | ||
scale_y_continuous(breaks=seq(0, 150, 10)) + | scale_y_continuous(breaks=seq(0, 150, 10)) + | ||
theme(axis.text=element_text(size=18), legend.position = "none") + | theme(axis.text=element_text(size=18), legend.position = "none") + | ||
theme(axis.title=element_text(face="bold", size=14)) | theme(axis.title=element_text(face="bold", size=14)) | ||
p | p | ||
# Add the linear fit: through the 2 points | |||
plot_data <- data.frame(coded.x = seq(-2, +5, 0.1)) | |||
plot_data$y <- predict(model.0, newdata=plot_data) | |||
p <- p + geom_line(data=plot_data, color="blue", size=1) | |||
p | |||
# Run experiment at center point: predict it first | # Run experiment at center point: predict it first | ||
Line 41: | Line 50: | ||
predict(model.0, newdata=x.test) | predict(model.0, newdata=x.test) | ||
# Expect a predicted value of 45.5 in the output. Actual: | # Expect a predicted value of 45.5 in the output. Actual: 55. | ||
# 10 units difference | # Run a second experiment at the center, to get a feeling for spread. 54 | ||
# Similar values, and about 10 units difference from linear model. | |||
# Try fitting a linear model now through all the data points: | # Try fitting a linear model now through all the available data points: | ||
rw.x <- c(24, 48, 36, 36) | rw.x <- c(24, 48, 36, 36) | ||
y1 <- c(28, 63, 55, 54) | y1 <- c(28, 63, 55, 54) | ||
Line 58: | Line 61: | ||
summary(model.1) | summary(model.1) | ||
# Show the linear fit with the extra data point (center points) | # Show the linear fit with the extra data point (center points): green line | ||
raw_data <- data.frame(coded.x = coded.x, y = y1) | raw_data <- data.frame(coded.x = coded.x, y = y1) | ||
plot_data$y <- predict(model.1, newdata=plot_data) | plot_data$y <- predict(model.1, newdata=plot_data) | ||
Line 65: | Line 68: | ||
p | p | ||
# Seems like a quadratic model through all the data points could approximate it: | |||
# | |||
rw.x <- c(24, 48, 36, 36) | rw.x <- c(24, 48, 36, 36) | ||
y1 <- c(28, 63, 55, 54) | y1 <- c(28, 63, 55, 54) | ||
Line 74: | Line 75: | ||
summary(model.1.quad) | summary(model.1.quad) | ||
# Show the quadratic fit between -2 and + | # Show the quadratic fit between -2 and +3 in coded units: red line | ||
# In real-world units this corresonds to _____ and _____ | # In real-world units this corresonds to _____ and _____ | ||
plot_data <- data.frame(coded.x = seq(-2, +3, 0.1)) | plot_data <- data.frame(coded.x = seq(-2, +3, 0.1)) | ||
Line 83: | Line 84: | ||
#------------ | #------------ | ||
# Try a new point at +2: that is RW = coded * 0.5 * range + center; x = 60 hours | # Try a new point at +2: that is RW = coded * 0.5 * range + center; x = 60 hours | ||
rw.x <- c(24, 48, 36, 36, 60) | rw.x <- c(24, 48, 36, 36, 60) | ||
coded.x <- (rw.x - center)/(0.5*range) | coded.x <- (rw.x - center)/(0.5*range) | ||
predict(model.1.quad, newdata=x.test) # predicts | x.test <- data.frame(coded.x=coded.x) | ||
predict(model.1.quad, newdata=x.test) # predicts 53.5 | |||
# Acutal y = 66. Therefore, our model isn't so good. Improve it with the new point | # Acutal y = 66. Therefore, our model isn't so good. Improve it with the new point | ||
Line 95: | Line 96: | ||
summary(model.2.quad) | summary(model.2.quad) | ||
# Plot it again: | # Plot it again: purple | ||
raw_data <- data.frame(coded.x = coded.x, y = y2) | raw_data <- data.frame(coded.x = coded.x, y = y2) | ||
plot_data <- data.frame(coded.x = seq(-2, +3, 0.1)) | plot_data <- data.frame(coded.x = seq(-2, +3, 0.1)) | ||
Line 105: | Line 106: | ||
#------------ | #------------ | ||
# | # Normally at this point I would reset the frame of reference; | ||
# -1: 36 | # keep our range the same (24 hours) | ||
# +1 | # -1: 36 hours [prior coded "0" now becomes "-1] | ||
# | # 0: 48 hours [prior coded "+1" now becomes "0"] | ||
# +1: 60 hours [prior coded "+2" now becomes "+1"] | |||
center = 48 | center = 48 | ||
range = 24 | range = 24 # unchanged, but could also have been chosen to be smaller or bigger | ||
# At this point I will also switch the model type back to real-world units. | |||
# For 2 reasons: there is only 1 variable that is interesting (duration), so | |||
# coded units don't matter; secondly the model types we will use cannot be | |||
# built when the coded x-value is negative. | |||
rw.x <- c(24, 48, 36, 36, 60) | |||
y3 <- c(28, 63, 55, 54, 66) | |||
# Rebuild the model, and start the plots again | # Rebuild the model, and start the plots again. Try different model type | ||
model.3 <- lm(y3 ~ | model.3 <- lm(y3 ~ I(1/(rw.x))) | ||
summary(model.3) | summary(model.3) | ||
x.min = | x.min = 20 | ||
x.max = 105 | x.max = 105 | ||
# Basic plot of everything so far: | # Basic plot of everything so far: | ||
Line 129: | Line 137: | ||
ylab("Outcome variable") + | ylab("Outcome variable") + | ||
scale_y_continuous(breaks=seq(0, 150, 10)) + | scale_y_continuous(breaks=seq(0, 150, 10)) + | ||
#theme(axis.text=element_text(size=18), legend.position = "none") + | #theme(axis.text=element_text(size=18), legend.position = "none") + | ||
theme(axis.title=element_text(face="bold", size=14)) + | theme(axis.title=element_text(face="bold", size=14)) + | ||
Line 136: | Line 143: | ||
p | p | ||
plot_data <- data.frame(rw.x = seq(x.min, x.max, 0. | plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.5)) | ||
plot_data$y <- predict(model.3, newdata=plot_data) | plot_data$y <- predict(model.3, newdata=plot_data) | ||
p <- p + geom_line(data=plot_data, color="blue", size=1) | p <- p + geom_line(data=plot_data, color="blue", size=1) | ||
p | p | ||
# Run experiment at | # Run experiment at use 75 hours - just a little more]: predict it first | ||
rw.x <- c(48, 36, 36, 60, 75 | rw.x <- c(24, 48, 36, 36, 60, 75) | ||
x.test <- data.frame(rw.x=rw.x) | |||
x.test <- data.frame( | |||
predict(model.3, newdata=x.test) | predict(model.3, newdata=x.test) | ||
# Expect a predicted value of | # Expect a predicted value of 74.3 in the output. # Actual result: 79. | ||
# So about 5 units difference. Not too bad. | |||
rw.x <- c(48, 36, 36, 60, 75) | rw.x <- c(24, 48, 36, 36, 60, 75) | ||
y4 <- c(63, 55, 54, 66, 79) | y4 <- c(28, 63, 55, 54, 66, 79) | ||
# Rebuild the model, and start the plots again | # Rebuild the model, and start the plots again. Different model type: x + x^2 | ||
model.4 <- lm(y4 ~ | model.4 <- lm(y4 ~ I(1/(rw.x))) | ||
summary(model.4) | summary(model.4) | ||
# Plot it again: purple | |||
raw_data <- data.frame(rw.x = rw.x, y = y4) | |||
plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.5)) | |||
plot_data$y <- predict(model.4, newdata=plot_data) | |||
p <- p + geom_point(aes(x=rw.x, y=y, color='purple', size=5), data=raw_data) | |||
p <- p + geom_line(data=plot_data, color="purple", size=1) | |||
p | |||
# ------- | |||
# Let's try at around 90 hours. Expect an outcome of around 80. | |||
# Got a value of 76 instead. Stabilized? | |||
rw.x <- c(24, 48, 36, 36, 60, 75, 90) | |||
y5 <- c(28, 63, 55, 54, 66, 79, 76) | |||
# Rebuild the model, and start the plots again. Different model type: x + x^2 | |||
model.5 <- lm(y5 ~ I(1/(rw.x))) | |||
summary(model.5) | |||
# | # Plot it again: green | ||
raw_data <- data.frame(rw.x = rw.x, y = y5) | |||
plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.5)) | |||
plot_data$y <- predict(model.5, newdata=plot_data) | |||
p <- p + geom_point(aes(x=rw.x, y=y, color='green', size=5), data=raw_data) | |||
p <- p + geom_line(data=plot_data, color="green", size=1) | |||
p | |||
# Try adding few more points: | |||
# Point 8. Try 95 hours. Seems to confirm stabilization. Predict 79; actual: 81 | |||
# Point 9: Overshoot(?) to 105 hour to see decline. Predict 81; actual: 72 | |||
# Build a model with all values now | |||
rw.x <- c(48, 36, 36, 60, 75, 90, 95, 105) | rw.x <- c(48, 36, 36, 60, 75, 90, 95, 105) | ||
y6 <- c(63, 55, 54, 66, 79, 76, 81, 72) | |||
# Rebuild the model, and start the plots again | # Rebuild the model, and start the plots again | ||
model. | model.6 <- lm(y6 ~ I(1/(rw.x))) | ||
summary(model. | summary(model.6) | ||
raw_data <- data.frame(rw.x = rw.x, y = y6) | |||
raw_data <- data.frame(rw.x = rw.x, y = | |||
plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.1)) | plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.1)) | ||
plot_data$y <- predict(model.5, newdata=plot_data) | plot_data$y <- predict(model.5, newdata=plot_data) | ||
Line 180: | Line 209: | ||
# Rebuild | # The current model structure does not allow for decrease/stabilization. | ||
model.6 <- lm( | # Rebuild it with a different structure | ||
summary(model.6) | # Try various ones: x + 1/sqrt(x) SE = 4.11 | ||
# Try various ones: x + 1/x SE = 4.24 | |||
# Try various ones: x + log(x) SE = 3.98 | |||
# Try various ones: x + x^2 SE = 3.41 | |||
model.6.revised <- lm(y6 ~ rw.x + I((rw.x)^2)) # | |||
summary(model.6.revised) | |||
x.min = 35 | |||
x.max = 105 | |||
raw_data <- data.frame(rw.x = rw.x, y = y6) | |||
p <- ggplot(data=raw_data, aes(x=rw.x, y=y)) + | |||
geom_point(size=5) + | |||
xlab("Real-world values: time") + | |||
scale_x_continuous(breaks=seq(x.min, x.max,5)) + | |||
ylab("Outcome variable") + | |||
scale_y_continuous(breaks=seq(0, 150, 10)) + | |||
theme(axis.title=element_text(face="bold", size=14)) + | |||
expand_limits(x = c(x.min, x.max)) + | |||
expand_limits(y = c(50, 80)) | |||
p | p | ||
plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.5)) | |||
plot_data$y <- predict(model.6.revised, newdata=plot_data) | |||
p <- p + geom_line(data=plot_data, color="orange", size=1) | |||
p | |||
</code> | </code> | ||
</div></html> | </div></html> |
Revision as of 04:32, 6 May 2019
Part 1
Description here
# Copy/paste the code piece by piece.
# Start small: with just 2 experiments
center = 36
range = 24
rw.x <- c(24, 48)
coded.x <- (rw.x - center)/(0.5*range)
y0 <- c(28, 63)
# Coded -1 : 24 hours
# Coded +1 : 48 hours
model.0 <- lm(y0 ~ coded.x)
summary(model.0)
# What is the interpretation of the
# * slope?
# * intercept
# * why are R2 and SE the values they are?
# Basic plot of everything so far:
raw_data <- data.frame(coded.x = coded.x, y = y0)
library(ggplot2)
p <- ggplot(data=raw_data, aes(x=coded.x, y=y)) +
geom_point(size=5) +
xlab("Coded value for x_A") +
scale_x_continuous(breaks=seq(-2, 5, 1)) +
ylab("Outcome variable") +
scale_y_continuous(breaks=seq(0, 150, 10)) +
theme(axis.text=element_text(size=18), legend.position = "none") +
theme(axis.title=element_text(face="bold", size=14))
p
# Add the linear fit: through the 2 points
plot_data <- data.frame(coded.x = seq(-2, +5, 0.1))
plot_data$y <- predict(model.0, newdata=plot_data)
p <- p + geom_line(data=plot_data, color="blue", size=1)
p
# Run experiment at center point: predict it first
rw.x <- c(24, 48, 36)
coded.x <- (rw.x - center)/(0.5*range)
x.test <- data.frame(coded.x=coded.x)
predict(model.0, newdata=x.test)
# Expect a predicted value of 45.5 in the output. Actual: 55.
# Run a second experiment at the center, to get a feeling for spread. 54
# Similar values, and about 10 units difference from linear model.
# Try fitting a linear model now through all the available data points:
rw.x <- c(24, 48, 36, 36)
y1 <- c(28, 63, 55, 54)
coded.x <- (rw.x - center)/(0.5*range)
model.1 <- lm(y1 ~ coded.x)
summary(model.1)
# Show the linear fit with the extra data point (center points): green line
raw_data <- data.frame(coded.x = coded.x, y = y1)
plot_data$y <- predict(model.1, newdata=plot_data)
p <- p + geom_point(aes(x=coded.x, y=y, color='darkgreen', size=5), data=raw_data)
p <- p + geom_line(data=plot_data, color="darkgreen", size=1)
p
# Seems like a quadratic model through all the data points could approximate it:
rw.x <- c(24, 48, 36, 36)
y1 <- c(28, 63, 55, 54)
coded.x <- (rw.x - center)/(0.5*range)
model.1.quad <- lm(y1 ~ coded.x + I(coded.x^2))
summary(model.1.quad)
# Show the quadratic fit between -2 and +3 in coded units: red line
# In real-world units this corresonds to _____ and _____
plot_data <- data.frame(coded.x = seq(-2, +3, 0.1))
plot_data$y <- predict(model.1.quad, newdata=plot_data)
p <- p + geom_line(data=plot_data, color="red", size=1)
p
#------------
# Try a new point at +2: that is RW = coded * 0.5 * range + center; x = 60 hours
rw.x <- c(24, 48, 36, 36, 60)
coded.x <- (rw.x - center)/(0.5*range)
x.test <- data.frame(coded.x=coded.x)
predict(model.1.quad, newdata=x.test) # predicts 53.5
# Acutal y = 66. Therefore, our model isn't so good. Improve it with the new point
rw.x <- c(24, 48, 36, 36, 60)
y2 <- c(28, 63, 55, 54, 66)
coded.x <- (rw.x - center)/(0.5*range)
model.2.quad <- lm(y2 ~ coded.x + I(coded.x^2))
summary(model.2.quad)
# Plot it again: purple
raw_data <- data.frame(coded.x = coded.x, y = y2)
plot_data <- data.frame(coded.x = seq(-2, +3, 0.1))
plot_data$y <- predict(model.2.quad, newdata=plot_data)
p <- p + geom_point(aes(x=coded.x, y=y, color='purple', size=5), data=raw_data)
p <- p + geom_line(data=plot_data, color="purple", size=1)
p
#------------
#------------
# Normally at this point I would reset the frame of reference;
# keep our range the same (24 hours)
# -1: 36 hours [prior coded "0" now becomes "-1]
# 0: 48 hours [prior coded "+1" now becomes "0"]
# +1: 60 hours [prior coded "+2" now becomes "+1"]
center = 48
range = 24 # unchanged, but could also have been chosen to be smaller or bigger
# At this point I will also switch the model type back to real-world units.
# For 2 reasons: there is only 1 variable that is interesting (duration), so
# coded units don't matter; secondly the model types we will use cannot be
# built when the coded x-value is negative.
rw.x <- c(24, 48, 36, 36, 60)
y3 <- c(28, 63, 55, 54, 66)
# Rebuild the model, and start the plots again. Try different model type
model.3 <- lm(y3 ~ I(1/(rw.x)))
summary(model.3)
x.min = 20
x.max = 105
# Basic plot of everything so far:
raw_data <- data.frame(rw.x = rw.x, y = y3)
p <- ggplot(data=raw_data, aes(x=rw.x, y=y)) +
geom_point(size=5) +
xlab("Real-world values: time") +
scale_x_continuous(breaks=seq(x.min, x.max,5)) +
ylab("Outcome variable") +
scale_y_continuous(breaks=seq(0, 150, 10)) +
#theme(axis.text=element_text(size=18), legend.position = "none") +
theme(axis.title=element_text(face="bold", size=14)) +
expand_limits(x = c(x.min, x.max)) +
expand_limits(y = c(50, 80))
p
plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.5))
plot_data$y <- predict(model.3, newdata=plot_data)
p <- p + geom_line(data=plot_data, color="blue", size=1)
p
# Run experiment at use 75 hours - just a little more]: predict it first
rw.x <- c(24, 48, 36, 36, 60, 75)
x.test <- data.frame(rw.x=rw.x)
predict(model.3, newdata=x.test)
# Expect a predicted value of 74.3 in the output. # Actual result: 79.
# So about 5 units difference. Not too bad.
rw.x <- c(24, 48, 36, 36, 60, 75)
y4 <- c(28, 63, 55, 54, 66, 79)
# Rebuild the model, and start the plots again. Different model type: x + x^2
model.4 <- lm(y4 ~ I(1/(rw.x)))
summary(model.4)
# Plot it again: purple
raw_data <- data.frame(rw.x = rw.x, y = y4)
plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.5))
plot_data$y <- predict(model.4, newdata=plot_data)
p <- p + geom_point(aes(x=rw.x, y=y, color='purple', size=5), data=raw_data)
p <- p + geom_line(data=plot_data, color="purple", size=1)
p
# -------
# Let's try at around 90 hours. Expect an outcome of around 80.
# Got a value of 76 instead. Stabilized?
rw.x <- c(24, 48, 36, 36, 60, 75, 90)
y5 <- c(28, 63, 55, 54, 66, 79, 76)
# Rebuild the model, and start the plots again. Different model type: x + x^2
model.5 <- lm(y5 ~ I(1/(rw.x)))
summary(model.5)
# Plot it again: green
raw_data <- data.frame(rw.x = rw.x, y = y5)
plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.5))
plot_data$y <- predict(model.5, newdata=plot_data)
p <- p + geom_point(aes(x=rw.x, y=y, color='green', size=5), data=raw_data)
p <- p + geom_line(data=plot_data, color="green", size=1)
p
# Try adding few more points:
# Point 8. Try 95 hours. Seems to confirm stabilization. Predict 79; actual: 81
# Point 9: Overshoot(?) to 105 hour to see decline. Predict 81; actual: 72
# Build a model with all values now
rw.x <- c(48, 36, 36, 60, 75, 90, 95, 105)
y6 <- c(63, 55, 54, 66, 79, 76, 81, 72)
# Rebuild the model, and start the plots again
model.6 <- lm(y6 ~ I(1/(rw.x)))
summary(model.6)
raw_data <- data.frame(rw.x = rw.x, y = y6)
plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.1))
plot_data$y <- predict(model.5, newdata=plot_data)
p <- p + geom_point(aes(x=rw.x, y=y, color='red', size=5), data=raw_data)
p <- p + geom_line(data=plot_data, color="red", size=1)
p
# The current model structure does not allow for decrease/stabilization.
# Rebuild it with a different structure
# Try various ones: x + 1/sqrt(x) SE = 4.11
# Try various ones: x + 1/x SE = 4.24
# Try various ones: x + log(x) SE = 3.98
# Try various ones: x + x^2 SE = 3.41
model.6.revised <- lm(y6 ~ rw.x + I((rw.x)^2)) #
summary(model.6.revised)
x.min = 35
x.max = 105
raw_data <- data.frame(rw.x = rw.x, y = y6)
p <- ggplot(data=raw_data, aes(x=rw.x, y=y)) +
geom_point(size=5) +
xlab("Real-world values: time") +
scale_x_continuous(breaks=seq(x.min, x.max,5)) +
ylab("Outcome variable") +
scale_y_continuous(breaks=seq(0, 150, 10)) +
theme(axis.title=element_text(face="bold", size=14)) +
expand_limits(x = c(x.min, x.max)) +
expand_limits(y = c(50, 80))
p
plot_data <- data.frame(rw.x = seq(x.min, x.max, 0.5))
plot_data$y <- predict(model.6.revised, newdata=plot_data)
p <- p + geom_line(data=plot_data, color="orange", size=1)
p