Difference between revisions of "Univariate data analysis"
Jump to navigation
Jump to search
Kevin Dunn (talk | contribs) |
Kevin Dunn (talk | contribs) |
||
Line 203: | Line 203: | ||
hist((s1+s2+s3+s4+s5+s6+s7+s8)/8, breaks=bins, main="", xlab="Average of 8 throws") | hist((s1+s2+s3+s4+s5+s6+s7+s8)/8, breaks=bins, main="", xlab="Average of 8 throws") | ||
hist((s1+s2+s3+s4+s5+s6+s7+s8+s9+s10)/10, breaks=bins, main="", xlab="Average of 10 throws") | hist((s1+s2+s3+s4+s5+s6+s7+s8+s9+s10)/10, breaks=bins, main="", xlab="Average of 10 throws") | ||
</syntaxhighlight> | |||
=== Code used to illustrate how the q-q plot is constructed === | |||
[http://www.r-fiddle.org/#/fiddle?id=5mdsZDiD Web-based version of the code] | |||
<syntaxhighlight lang="rsplus"> | |||
N <- 10 | |||
# What are the quantiles from the theoretical normal distribution? | |||
index <- seq(1, N) | |||
P <- (index - 0.5) / N | |||
theoretical.quantity <- qnorm(P) | |||
# Our sampled data: | |||
yields <- c(86.2, 85.7, 71.9, 95.3, 77.1, 71.4, 68.9, 78.9, 86.9, 78.4) | |||
mean.yield <- mean(yields) # 80.0 | |||
sd.yield <- sd(yields) # 8.35 | |||
# What are the quantiles for the sampled data? | |||
yields.z <- (yields - mean.yield)/sd.yield | |||
yields.z | |||
yields.z.sorted <- sort(yields.z) | |||
# Compare the values in text: | |||
yields.z.sorted | |||
theoretical.quantity | |||
# Compare them graphically: | |||
plot(theoretical.quantity, yields.z.sorted, asp=1) | |||
abline(a=0, b=1) | |||
# Built-in R function to do all the above for you: | |||
qqnorm(yields) | |||
qqline(yields) | |||
# A better function: see http://learnche.mcmaster.ca/4C3/Software_tutorial/Extending_R_with_packages | |||
library(car) | |||
qqPlot(yields) | |||
</syntaxhighlight> | |||
=== Code to illustrate the central limit theorem's reduction in variance === | |||
[http://www.r-fiddle.org/#/fiddle?id=g75N9Yh5 Web-based version of the code] | |||
<syntaxhighlight lang="rsplus"> | |||
# Show the 3 plots side by side | |||
layout(matrix(c(1,2,3), 1, 3)) | |||
# Sample the population: | |||
N <- 100 | |||
x <- rnorm(N, mean=80, sd=5) | |||
mean(x) | |||
sd(x) | |||
# Plot the raw data | |||
x.range <- range(x) | |||
plot(x, ylim=x.range, main='Raw data') | |||
# Subgroups of 2 | |||
subsize <- 2 | |||
x.2 <- numeric(N/subsize) | |||
for (i in 1:(N/subsize)) | |||
{ | |||
x.2[i] <- mean(x[((i-1)*subsize+1):(i*subsize)]) | |||
} | |||
plot(x.2, ylim=x.range, main='Subgroups of 2') | |||
# Subgroups of 4 | |||
subsize <- 4 | |||
x.4 <- numeric(N/subsize) | |||
for (i in 1:(N/subsize)) | |||
{ | |||
x.4[i] <- mean(x[((i-1)*subsize+1):(i*subsize)]) | |||
} | |||
plot(x.4, ylim=x.range, main='Subgroups of 4') | |||
</syntaxhighlight> | |||
=== Paired test example === | |||
[http://www.r-fiddle.org/#/fiddle?id=SkursT0M Web-based version of the code] | |||
<syntaxhighlight lang="rsplus"> | |||
# Analysis of the data here: | |||
dilution <- c(11, 26, 18, 16, 20, 12, 8, 26, 12, 17, 14) | |||
manometric <- c(25, 3, 27, 30, 33, 16, 28, 27, 12, 32, 16) | |||
N <- length(dilution) | |||
mean(manometric) | |||
mean(dilution) | |||
plot(c(dilution, manometric), ylab="BOD values", xaxt='n') | |||
text(5.5,3, "Dilution") | |||
text(18,3, "Manometric") | |||
abline(v=11.5) | |||
par(mar=c(4.2, 4.2, 0.2, 0.2)) # (bottom, left, top, right); defaults are par(mar=c(5, 4, 4, 2) + 0.1) | |||
plot(dilution, type="p", pch=4, | |||
cex=2, cex.lab=1.5, cex.main=1.8, cex.sub=1.8, cex.axis=1.8, | |||
ylab="BOD values", xlab="Sample number", | |||
ylim=c(0,35), xlim=c(0,11.5), col="darkgreen") | |||
lines(manometric, type="p", pch=16, cex=2, col="blue") | |||
lines(rep(0, N), dilution, type="p", pch=4, cex=2, col="darkgreen") | |||
lines(rep(0, N), manometric, type="p", pch=16, cex=2, col="blue") | |||
abline(v=0.5) | |||
legend(8, 5, pch=c(4, 16), c("Dilution", "Manometric"), col=c("darkgreen", "blue"), pt.cex=2) | |||
par(mar=c(4.2, 4.2, 0.2, 0.2)) # (bottom, left, top, right); defaults are par(mar=c(5, 4, 4, 2) + 0.1) | |||
plot(dilution-manometric, type="p", ylab="Dilution - Manometric", xlab="Sample number", | |||
cex.lab=1.5, cex.main=1.8, cex.sub=1.8, cex.axis=1.8, cex=2) | |||
abline(h=0, col="grey60") | |||
</syntaxhighlight> | </syntaxhighlight> |
Revision as of 19:32, 2 January 2016
Learning outcomes
- The study of variability is important to help answer: "what happened?"
- Univariate tools such as the histogram, median, MAD, standard deviation, quartiles will be reviewed from prior courses (as a refresher)
- The normal and t-distribution will be important in our work: what they are, how to interpret them, and how to use tables of these distributions
- The central limit theorem will be explained conceptually: you cannot finish a course on stats without knowing the key result from this theorem.
- Using and interpreting confidence intervals will be crucial in all the modules that follow.
Extended readings
- New Boeing planes will generate 0.5 TB of data per flight. Read about this, and other sources of data: "every piece of that plane has an internet connection, from the engines to the flaps to the landing gear".
- All students, but especially the 600-level students, should read the article by Peter J. Rousseeuw, "Tutorial to Robust Statistics": it is easy to read and contains so much useful content.
Resources
- Class notes 2015
- Class notes 2014
- Textbook, chapter 2
- Quizzes (with solutions): attempt these after you have watched the videos
Tasks to do first Quiz Solution Complete steps 10, 11, 12 and 13 of the software tutorial (also steps 1 through 9)
Quiz Solution Watch videos 1, 2, 3, 4, and 5 Quiz Solution Watch videos 6, 7, and 8 Quiz Solution Watch videos 9 and 10 Quiz Solution Watch videos 11, 12, and 13 Quiz Solution Watch videos 14, 15, and 16 Quiz Solution
Class videos from prior years
Videos from 2015
- Introduction [05:59]
- Histograms [04:50]
- Basic terminology [06:41]
- Outliers, medians and MAD [04:42]
- The central limit theorem [06:56]
- The normal distribution, and standardizing variables [05:54]
- Normal distribution notation and using tables and R [05:48]
- Checking if data are normally distributed [05:57]
- Introducing the idea of a confidence interval [covered in class]
- Confidence intervals when we don't know the variance [07:59]
- Interpreting the confidence interval [07:52]
- A worked example: calculating and interpreting the CI [03:37]
- A motivating example to see why tests for differences are important [08:29]
- The mathematical derivation for a confidence interval for differences [covered in class]
- Using the confidence interval to test for differences to solve the motivating example [covered in class]
- Confidence intervals for paired tests: theory and an example [11:59]
05:59 | Download video | Download captions | Script |
04:50 | Download video | Download captions | Script |
06:41 | Download video | Download captions | Script |
04:42 | Download video | Download captions | Script |
06:56 | Download video | Download captions | Script |
05:54 | Download video | Download captions | Script |
05:48 | Download video | Download captions | Script |
05:57 | Download video | Download captions | Script |
Covered in class | No video | Script |
07:59 | Download video | Download captions | Script |
07:52 | Download video | Download captions | Script |
03:37 | Download video | Download captions | Script |
08:29 | Download video | Download captions | Script |
Audio only | No video | Script |
Audio only | No video | Script |
11:59 | Download video | Download captions | Script |
Videos from 2014
Videos from 2013
Software codes for this section
Understanding the central limit theorem with the rolling dice example
# Rolling-dice illustration of the central limit theorem: histograms of a
# single throw and of the average of 2, 4, 6, 8 and 10 throws.
N <- 500

# Six panels in a 2-by-3 grid, filled row by row
layout(matrix(1:6, nrow = 2, byrow = TRUE))

# Ten sets of N simulated throws, one set per column. Truncating a uniform
# draw on (1, 7) to an integer gives the faces 1 to 6.
throws <- replicate(10, as.integer(runif(N, 1, 7)))

# Element-wise average of the first k sets of throws
average.of.first <- function(k) {
  rowSums(throws[, seq_len(k), drop = FALSE]) / k
}

# Panel 1: a single throw, with one histogram bar per face
hist(throws[, 1], main = "", xlab = "One throw",
     breaks = seq(0.5, 6.5, by = 1))

# Panels 2 to 6: how many throws are averaged, the suggested number of
# histogram bins, and the x-axis label
panels <- list(
  list(k = 2,  bins = 8,  label = "Average of two throws"),
  list(k = 4,  bins = 8,  label = "Average of 4 throws"),
  list(k = 6,  bins = 8,  label = "Average of 6 throws"),
  list(k = 8,  bins = 12, label = "Average of 8 throws"),
  list(k = 10, bins = 12, label = "Average of 10 throws")
)
for (panel in panels) {
  bins <- panel$bins
  hist(average.of.first(panel$k), breaks = bins, main = "",
       xlab = panel$label)
}
Code used to illustrate how the q-q plot is constructed
# Step-by-step construction of a normal q-q plot for a small sample, followed
# by the built-in functions that produce the same kind of plot.

# Our sampled data:
yields <- c(86.2, 85.7, 71.9, 95.3, 77.1, 71.4, 68.9, 78.9, 86.9, 78.4)

# Sample size is taken from the data so the two can never disagree
N <- length(yields)                  # 10

# What are the quantiles from the theoretical normal distribution?
# The i-th smallest observation is assigned cumulative probability (i - 0.5)/N
index <- seq_len(N)
P <- (index - 0.5) / N
theoretical.quantity <- qnorm(P)

mean.yield <- mean(yields)           # 80.07
sd.yield <- sd(yields)               # 8.35

# What are the quantiles for the sampled data?
# Standardize to z-scores so they are on the same scale as qnorm(P)
yields.z <- (yields - mean.yield) / sd.yield
yields.z
yields.z.sorted <- sort(yields.z)

# Compare the values in text:
yields.z.sorted
theoretical.quantity

# Compare them graphically: points near the 45 degree line indicate that the
# sample quantiles agree with the theoretical ones
plot(theoretical.quantity, yields.z.sorted, asp = 1,
     xlab = "Theoretical quantiles", ylab = "Sample quantiles (z-scores)")
abline(a = 0, b = 1)

# Built-in R function to do all the above for you.
# Note: qqnorm() takes its probabilities from ppoints(), which for N <= 10
# uses (i - 3/8)/(N + 1/4), so its x-values differ slightly from P above.
qqnorm(yields)
qqline(yields)

# A better function: see http://learnche.mcmaster.ca/4C3/Software_tutorial/Extending_R_with_packages
# The car package is an add-on; if it is not installed, say so instead of
# aborting after the plots above have already been drawn.
if (requireNamespace("car", quietly = TRUE)) {
  library(car)
  qqPlot(yields)
} else {
  message("Package 'car' is not installed; skipping qqPlot(yields). ",
          "Install it with install.packages(\"car\").")
}
Code to illustrate the central limit theorem's reduction in variance
# Plots raw data and the means of subgroups of 2 and 4 values on a common
# y-axis, so the spread of the three panels can be compared directly.

# Show the 3 plots side by side
layout(matrix(c(1, 2, 3), 1, 3))

# Sample the population:
N <- 100
x <- rnorm(N, mean = 80, sd = 5)
mean(x)
sd(x)

# Means of consecutive, non-overlapping subgroups of `size` values.
#   values: numeric vector, used in its existing order
#   size:   number of values per subgroup
# Returns one mean per complete subgroup; trailing values that do not fill a
# complete subgroup are ignored.
subgroup.means <- function(values, size) {
  n.groups <- length(values) %/% size
  # One subgroup per column: matrix() fills column by column
  groups <- matrix(values[seq_len(n.groups * size)],
                   nrow = size, ncol = n.groups)
  colMeans(groups)
}

# Plot the raw data; its range is reused as the y-axis limits of every panel
x.range <- range(x)
plot(x, ylim = x.range, main = "Raw data")

# Subgroups of 2
x.2 <- subgroup.means(x, 2)
plot(x.2, ylim = x.range, main = "Subgroups of 2")

# Subgroups of 4
x.4 <- subgroup.means(x, 4)
plot(x.4, ylim = x.range, main = "Subgroups of 4")
Paired test example
# Three views of the BOD values recorded under the two labels "Dilution" and
# "Manometric": all values in one sequence, both series against sample
# number, and the element-wise difference between them.
dilution <- c(11, 26, 18, 16, 20, 12, 8, 26, 12, 17, 14)
manometric <- c(25, 3, 27, 30, 33, 16, 28, 27, 12, 32, 16)
N <- length(dilution)

mean(manometric)
mean(dilution)

# Plot 1: every value in a single sequence, dilution first, with a vertical
# line between the two groups and a text label under each group
plot(c(dilution, manometric), ylab = "BOD values", xaxt = "n")
text(5.5, 3, "Dilution")
text(18, 3, "Manometric")
abline(v = 11.5)

# Plot margins as (bottom, left, top, right); defaults are c(5, 4, 4, 2) + 0.1
tight.margins <- c(4.2, 4.2, 0.2, 0.2)

# Plot 2: both series against sample number. Every value is also drawn in a
# column at x = 0, separated from the per-sample points by a vertical line.
par(mar = tight.margins)
at.zero <- numeric(N)
plot(dilution, type = "p", pch = 4, col = "darkgreen",
     xlim = c(0, 11.5), ylim = c(0, 35),
     xlab = "Sample number", ylab = "BOD values",
     cex = 2, cex.lab = 1.5, cex.main = 1.8, cex.sub = 1.8, cex.axis = 1.8)
points(manometric, pch = 16, cex = 2, col = "blue")
points(at.zero, dilution, pch = 4, cex = 2, col = "darkgreen")
points(at.zero, manometric, pch = 16, cex = 2, col = "blue")
abline(v = 0.5)
legend(8, 5, legend = c("Dilution", "Manometric"), pch = c(4, 16),
       col = c("darkgreen", "blue"), pt.cex = 2)

# Plot 3: element-wise difference, with a grey reference line at zero
par(mar = tight.margins)
plot(dilution - manometric, type = "p",
     xlab = "Sample number", ylab = "Dilution - Manometric",
     cex = 2, cex.lab = 1.5, cex.main = 1.8, cex.sub = 1.8, cex.axis = 1.8)
abline(h = 0, col = "grey60")