Difference between revisions of "Univariate data analysis"
Jump to navigation
Jump to search
Kevin Dunn (talk | contribs) |
Kevin Dunn (talk | contribs) |
||
Line 203: | Line 203: | ||
hist((s1+s2+s3+s4+s5+s6+s7+s8)/8, breaks=bins, main="", xlab="Average of 8 throws") | hist((s1+s2+s3+s4+s5+s6+s7+s8)/8, breaks=bins, main="", xlab="Average of 8 throws") | ||
hist((s1+s2+s3+s4+s5+s6+s7+s8+s9+s10)/10, breaks=bins, main="", xlab="Average of 10 throws") | hist((s1+s2+s3+s4+s5+s6+s7+s8+s9+s10)/10, breaks=bins, main="", xlab="Average of 10 throws") | ||
</syntaxhighlight> | |||
=== Code used to illustrate how the q-q plot is constructed === | |||
[http://www.r-fiddle.org/#/fiddle?id=5mdsZDiD Web-based version of the code] | |||
<syntaxhighlight lang="rsplus"> | |||
N <- 10 | |||
# What are the quantiles from the theoretical normal distribution? | |||
index <- seq(1, N) | |||
P <- (index - 0.5) / N | |||
theoretical.quantity <- qnorm(P) | |||
# Our sampled data: | |||
yields <- c(86.2, 85.7, 71.9, 95.3, 77.1, 71.4, 68.9, 78.9, 86.9, 78.4) | |||
mean.yield <- mean(yields) # 80.0 | |||
sd.yield <- sd(yields) # 8.35 | |||
# What are the quantiles for the sampled data? | |||
yields.z <- (yields - mean.yield)/sd.yield | |||
yields.z | |||
yields.z.sorted <- sort(yields.z) | |||
# Compare the values in text: | |||
yields.z.sorted | |||
theoretical.quantity | |||
# Compare them graphically: | |||
plot(theoretical.quantity, yields.z.sorted, asp=1) | |||
abline(a=0, b=1) | |||
# Built-in R function to do all the above for you: | |||
qqnorm(yields) | |||
qqline(yields) | |||
# A better function: see http://learnche.mcmaster.ca/4C3/Software_tutorial/Extending_R_with_packages | |||
library(car) | |||
qqPlot(yields) | |||
</syntaxhighlight> | |||
=== Code to illustrate the central limit theorem's reduction in variance === | |||
[http://www.r-fiddle.org/#/fiddle?id=g75N9Yh5 Web-based version of the code] | |||
<syntaxhighlight lang="rsplus"> | |||
# Show the 3 plots side by side | |||
layout(matrix(c(1,2,3), 1, 3)) | |||
# Sample the population: | |||
N <- 100 | |||
x <- rnorm(N, mean=80, sd=5) | |||
mean(x) | |||
sd(x) | |||
# Plot the raw data | |||
x.range <- range(x) | |||
plot(x, ylim=x.range, main='Raw data') | |||
# Subgroups of 2 | |||
subsize <- 2 | |||
x.2 <- numeric(N/subsize) | |||
for (i in 1:(N/subsize)) | |||
{ | |||
x.2[i] <- mean(x[((i-1)*subsize+1):(i*subsize)]) | |||
} | |||
plot(x.2, ylim=x.range, main='Subgroups of 2') | |||
# Subgroups of 4 | |||
subsize <- 4 | |||
x.4 <- numeric(N/subsize) | |||
for (i in 1:(N/subsize)) | |||
{ | |||
x.4[i] <- mean(x[((i-1)*subsize+1):(i*subsize)]) | |||
} | |||
plot(x.4, ylim=x.range, main='Subgroups of 4') | |||
</syntaxhighlight> | |||
=== Paired test example === | |||
[http://www.r-fiddle.org/#/fiddle?id=SkursT0M Web-based version of the code] | |||
<syntaxhighlight lang="rsplus"> | |||
# Analysis of the data here: | |||
dilution <- c(11, 26, 18, 16, 20, 12, 8, 26, 12, 17, 14) | |||
manometric <- c(25, 3, 27, 30, 33, 16, 28, 27, 12, 32, 16) | |||
N <- length(dilution) | |||
mean(manometric) | |||
mean(dilution) | |||
plot(c(dilution, manometric), ylab="BOD values", xaxt='n') | |||
text(5.5,3, "Dilution") | |||
text(18,3, "Manometric") | |||
abline(v=11.5) | |||
par(mar=c(4.2, 4.2, 0.2, 0.2)) # (bottom, left, top, right); defaults are par(mar=c(5, 4, 4, 2) + 0.1) | |||
plot(dilution, type="p", pch=4, | |||
cex=2, cex.lab=1.5, cex.main=1.8, cex.sub=1.8, cex.axis=1.8, | |||
ylab="BOD values", xlab="Sample number", | |||
ylim=c(0,35), xlim=c(0,11.5), col="darkgreen") | |||
lines(manometric, type="p", pch=16, cex=2, col="blue") | |||
lines(rep(0, N), dilution, type="p", pch=4, cex=2, col="darkgreen") | |||
lines(rep(0, N), manometric, type="p", pch=16, cex=2, col="blue") | |||
abline(v=0.5) | |||
legend(8, 5, pch=c(4, 16), c("Dilution", "Manometric"), col=c("darkgreen", "blue"), pt.cex=2) | |||
par(mar=c(4.2, 4.2, 0.2, 0.2)) # (bottom, left, top, right); defaults are par(mar=c(5, 4, 4, 2) + 0.1) | |||
plot(dilution-manometric, type="p", ylab="Dilution - Manometric", xlab="Sample number", | |||
cex.lab=1.5, cex.main=1.8, cex.sub=1.8, cex.axis=1.8, cex=2) | |||
abline(h=0, col="grey60") | |||
</syntaxhighlight> | </syntaxhighlight> |
Revision as of 19:32, 2 January 2016
Learning outcomes
- The study of variability is important to help answer: "what happened?"
- Univariate tools such as the histogram, median, MAD, standard deviation, quartiles will be reviewed from prior courses (as a refresher)
- The normal and t-distribution will be important in our work: what are they, how to interpret them, and use tables of these distributions
- The central limit theorem will be explained conceptually: you cannot finish a course on stats without knowing the key result from this theorem.
- Using and interpreting confidence intervals will be crucial in all the modules that follow.
Extended readings
- New Boeing planes will generate 0.5 TB of data per flight. Read about this, and other sources of data: "every piece of that plane has an internet connection, from the engines to the flaps to the landing gear".
- All students, but especially the 600-level students, should read the article by Peter J. Rousseeuw, "Tutorial to Robust Statistics": it is easy to read, and contains so much useful content.
Resources
Class notes 2015
Class notes 2014
- Textbook, chapter 2
- Quizzes (with solutions): attempt these after you have watched the videos
Tasks to do first Quiz Solution Complete steps 10, 11, 12 and 13 of the software tutorial (also steps 1 through 9)
Quiz Solution Watch videos 1, 2, 3, 4, and 5 Quiz Solution Watch videos 6, 7, and 8 Quiz Solution Watch videos 9 and 10 Quiz Solution Watch videos 11, 12, and 13 Quiz Solution Watch videos 14, 15, and 16 Quiz Solution
Class videos from prior years
Videos from 2015
- Introduction [05:59]
- Histograms [04:50]
- Basic terminology [06:41]
- Outliers, medians and MAD [04:42]
- The central limit theorem [06:56]
- The normal distribution, and standardizing variables [05:54]
- Normal distribution notation and using tables and R [05:48]
- Checking if data are normally distributed [05:57]
- Introducing the idea of a confidence interval [covered in class]
- Confidence intervals when we don't know the variance [07:59]
- Interpreting the confidence interval [07:52]
- A worked example: calculating and interpreting the CI [03:37]
- A motivating example to see why tests for differences are important [08:29]
- The mathematical derivation for a confidence interval for differences [covered in class]
- Using the confidence interval to test for differences to solve the motivating example [covered in class]
- Confidence intervals for paired tests: theory and an example [11:59]
05:59 | Download video | Download captions | Script |
04:50 | Download video | Download captions | Script |
06:41 | Download video | Download captions | Script |
04:42 | Download video | Download captions | Script |
06:56 | Download video | Download captions | Script |
05:54 | Download video | Download captions | Script |
05:48 | Download video | Download captions | Script |
05:57 | Download video | Download captions | Script |
Covered in class | No video | Script |
07:59 | Download video | Download captions | Script |
07:52 | Download video | Download captions | Script |
03:37 | Download video | Download captions | Script |
08:29 | Download video | Download captions | Script |
Audio only | No video | Script |
Audio only | No video | Script |
11:59 | Download video | Download captions | Script |
Videos from 2014
Videos from 2013
Software codes for this section
Understanding the central limit theorem with the rolling dice example
# Central limit theorem demo with dice: histograms of the average of
# 1, 2, 4, 6, 8 and 10 throws, drawn on a 2 x 3 grid of panels.

N <- 500        # number of repeated experiments (rows)
n.dice <- 10    # largest number of throws averaged in one experiment (columns)

# Panels 1..6 are filled row by row.
layout(matrix(1:6, nrow = 2, byrow = TRUE))

# One row per experiment, one column per throw. sample.int() draws the
# faces 1..6 directly, rather than truncating continuous uniform numbers.
throws <- matrix(sample.int(6, N * n.dice, replace = TRUE), nrow = N)

# Average of the first k throws in every experiment. The row sums are exact
# integers, so this equals (s1 + ... + sk) / k computed throw by throw.
average.of <- function(k) {
  rowSums(throws[, seq_len(k), drop = FALSE]) / k
}

# A single throw: bin edges at 0.5, 1.5, ..., 6.5 give one bar per face.
hist(throws[, 1], main = "", xlab = "One throw", breaks = seq(0, 6) + 0.5)

# A single number passed as `breaks` is a suggested bin count.
bins <- 8
hist(average.of(2), breaks = bins, main = "", xlab = "Average of two throws")
hist(average.of(4), breaks = bins, main = "", xlab = "Average of 4 throws")
hist(average.of(6), breaks = bins, main = "", xlab = "Average of 6 throws")

# Averages of more throws take more distinct values: ask for more bins.
bins <- 12
hist(average.of(8), breaks = bins, main = "", xlab = "Average of 8 throws")
hist(average.of(10), breaks = bins, main = "", xlab = "Average of 10 throws")
Code used to illustrate how the q-q plot is constructed
# Step-by-step construction of a normal q-q plot, followed by the built-in
# functions that produce the same kind of plot.

# Our sampled data:
yields <- c(86.2, 85.7, 71.9, 95.3, 77.1, 71.4, 68.9, 78.9, 86.9, 78.4)

# Take the sample size from the data so the two can never disagree.
N <- length(yields)

# What are the quantiles from the theoretical normal distribution?
# (index - 0.5) / N places one probability at the centre of each of N
# equal-width intervals on (0, 1), which avoids qnorm(0) and qnorm(1).
index <- seq_len(N)
P <- (index - 0.5) / N
theoretical.quantity <- qnorm(P)

mean.yield <- mean(yields)     # 80.07
sd.yield <- sd(yields)         # 8.35

# What are the quantiles for the sampled data?
# Standardize first, so the data are on the same scale as qnorm()'s output.
yields.z <- (yields - mean.yield) / sd.yield
yields.z

yields.z.sorted <- sort(yields.z)

# Compare the values in text:
yields.z.sorted
theoretical.quantity

# Compare them graphically: asp = 1 gives both axes the same scale, and the
# line y = x is where the points would lie if the two sets of quantiles agreed.
plot(theoretical.quantity, yields.z.sorted, asp = 1)
abline(a = 0, b = 1)

# Built-in R function to do all the above for you:
qqnorm(yields)
qqline(yields)

# A better function: see http://learnche.mcmaster.ca/4C3/Software_tutorial/Extending_R_with_packages
# The "car" package is optional: without it the script reports that the last
# plot was skipped instead of stopping with an error.
if (requireNamespace("car", quietly = TRUE)) {
  library(car)
  qqPlot(yields)
} else {
  message("Package 'car' is not installed; skipping qqPlot(yields).")
}
Code to illustrate the central limit theorem's reduction in variance
# Central limit theorem: averaging consecutive subgroups of the raw data
# reduces the spread. All three plots share one y-axis range so that the
# reduction is directly visible.

# Mean of each consecutive, non-overlapping subgroup of `size` values.
# Trailing values that do not fill a complete subgroup are ignored.
subgroup.means <- function(values, size) {
  n.groups <- length(values) %/% size
  # Each matrix column holds one subgroup, so the column means are the
  # subgroup means.
  colMeans(matrix(values[seq_len(n.groups * size)], nrow = size))
}

# Show the 3 plots side by side
layout(matrix(c(1, 2, 3), 1, 3))

# Sample the population:
N <- 100
x <- rnorm(N, mean = 80, sd = 5)
mean(x)
sd(x)

# Plot the raw data
x.range <- range(x)
plot(x, ylim = x.range, main = "Raw data")

# Subgroups of 2
subsize <- 2
x.2 <- subgroup.means(x, subsize)
plot(x.2, ylim = x.range, main = "Subgroups of 2")

# Subgroups of 4
subsize <- 4
x.4 <- subgroup.means(x, subsize)
plot(x.4, ylim = x.range, main = "Subgroups of 4")
Paired test example
# Paired comparison of two sets of BOD values, "dilution" and "manometric",
# with one value of each per sample number.
dilution <- c(11, 26, 18, 16, 20, 12, 8, 26, 12, 17, 14)
manometric <- c(25, 3, 27, 30, 33, 16, 28, 27, 12, 32, 16)
N <- length(dilution)

mean(manometric)
mean(dilution)

# Plot 1: both series end to end on one unlabelled x-axis, with a vertical
# line drawn between the two groups.
both.methods <- c(dilution, manometric)
plot(both.methods, ylab = "BOD values", xaxt = "n")
text(5.5, 3, "Dilution")
text(18, 3, "Manometric")
abline(v = 11.5)

# Margins are (bottom, left, top, right); the defaults are
# par(mar = c(5, 4, 4, 2) + 0.1).
tight.margins <- c(4.2, 4.2, 0.2, 0.2)

# Plot 2: the two series overlaid by sample number. Every value is also
# drawn at x = 0, left of the divider at x = 0.5, which stacks each
# series into a single column.
par(mar = tight.margins)
plot(
  dilution,
  type = "p", pch = 4, col = "darkgreen",
  xlim = c(0, 11.5), ylim = c(0, 35),
  xlab = "Sample number", ylab = "BOD values",
  cex = 2, cex.lab = 1.5, cex.main = 1.8, cex.sub = 1.8, cex.axis = 1.8
)
points(manometric, pch = 16, cex = 2, col = "blue")
at.zero <- rep(0, N)
points(at.zero, dilution, pch = 4, cex = 2, col = "darkgreen")
points(at.zero, manometric, pch = 16, cex = 2, col = "blue")
abline(v = 0.5)
legend(
  8, 5,
  legend = c("Dilution", "Manometric"),
  pch = c(4, 16), col = c("darkgreen", "blue"), pt.cex = 2
)

# Plot 3: the per-sample difference, with a grey reference line at zero.
par(mar = tight.margins)
paired.difference <- dilution - manometric
plot(
  paired.difference,
  type = "p", cex = 2,
  xlab = "Sample number", ylab = "Dilution - Manometric",
  cex.lab = 1.5, cex.main = 1.8, cex.sub = 1.8, cex.axis = 1.8
)
abline(h = 0, col = "grey60")