Difference between revisions of "Univariate data analysis (2014)"

Revision as of 14:25, 17 January 2014

Class date(s):

13 to 23 January 2014

(PDF)



Download video: Link (plays in Google Chrome) [343 M]



Download video: Link (plays in Google Chrome) [327 M]

Class materials

Date	Class number	Video and audio files		Other materials	Reading (PID)	Slides
13 January	02A	Video (343 M)	Audio (42 M)	R demo file	Chapter 2	Slides for class
15 January	02B	Video (327 M)	Audio (42 M)	See code below
16 January	02C	Video (347 M)	Audio (42 M)	See code below
20 January	03A

Software source code

Please follow the software tutorial to install and run the course software. You should be able to quickly read, understand and use the material in steps 1 to 13.

Class example, 15 Jan

Seeing the Central Limit Theorem in action: rolling dice.

# Central Limit Theorem demo: the average of more and more dice throws
# looks increasingly like a normal distribution, even though a single
# throw is uniformly distributed on 1..6.
N <- 500             # number of repeated experiments (rows)
max.throws <- 10     # largest number of dice averaged in any one panel

# Six panels, filled row by row in a 2 x 3 grid
layout(matrix(1:6, nrow = 2, ncol = 3, byrow = TRUE))

# One column per die, one row per experiment; every entry is a fair roll
throws <- matrix(sample.int(6, N * max.throws, replace = TRUE), nrow = N)

# Panel 1: a single die. Breaks at 0.5, 1.5, ..., 6.5 centre one bar on
# each face value.
hist(throws[, 1], main = "", xlab = "One throw", breaks = seq(0, 6) + 0.5)

# Panels 2-6: average the first k dice of every experiment. `bins` is only
# a suggestion to hist(), which picks "pretty" break points near it.
panels <- data.frame(
    dice  = c(2, 4, 6, 8, 10),
    bins  = c(8, 8, 8, 12, 12),
    label = c("Average of two throws", "Average of 4 throws",
              "Average of 6 throws", "Average of 8 throws",
              "Average of 10 throws"),
    stringsAsFactors = FALSE
)
for (i in seq_len(nrow(panels))) {
    k <- panels$dice[i]
    averages <- rowMeans(throws[, seq_len(k), drop = FALSE])
    hist(averages, breaks = panels$bins[i], main = "", xlab = panels$label[i])
}

Class example, 16 Jan

# Load the batch-yields data set directly from its URL into a data frame
batch.url <- 'http://datasets.connectmv.com/file/batch-yields.csv'
batch <- read.csv(batch.url)

Code used to illustrate how the q-q plot is constructed:

# Build a normal q-q plot by hand for a small sample, then compare it with
# the functions R provides for the same job.
N <- 10

# What are the quantiles from the theoretical normal distribution?
# Cut the standard normal into N equal-probability slices and take the
# z-value at the midpoint probability of each slice.
index <- seq_len(N)
P <- (index - 0.5) / N
theoretical.quantile <- qnorm(P)

# Our sampled data:
yields <- c(86.2, 85.7, 71.9, 95.3, 77.1, 71.4, 68.9, 78.9, 86.9, 78.4)
mean.yield <- mean(yields)       # 80.07
sd.yield <- sd(yields)           # 8.35

# What are the quantiles for the sampled data?
# Standardize to z-scores so they are on the same scale as qnorm()'s output.
yields.z <- (yields - mean.yield) / sd.yield
yields.z

yields.z.sorted <- sort(yields.z)

# Compare the values in text:
yields.z.sorted
theoretical.quantile

# Compare them graphically: points near the 45 degree line indicate that
# the sample is consistent with a normal distribution.
plot(theoretical.quantile, yields.z.sorted, asp = 1)
abline(a = 0, b = 1)

# Built-in R function to do all the above for you.
# Note: qqnorm() plots the raw yields (not z-scores) and takes its
# probabilities from ppoints(N), which for N <= 10 is (i - 3/8) / (N + 1/4)
# rather than (i - 0.5) / N, so the axis values differ slightly from the
# hand-built plot while the shape is the same.
qqnorm(yields)
qqline(yields)

# A better function: see http://learnche.mcmaster.ca/4C3/Software_tutorial/Extending_R_with_packages
# `car` is an add-on package, so only use it when it is installed instead of
# aborting the whole script.
if (requireNamespace("car", quietly = TRUE)) {
    library(car)
    qqPlot(yields)
} else {
    message("Package 'car' is not installed; run install.packages(\"car\") to use qqPlot().")
}

Code used to illustrate the central limit theorem's reduction in variance:

# Average consecutive, non-overlapping subgroups of a numeric vector.
#
# values: numeric vector of raw observations.
# size:   number of consecutive observations per subgroup (>= 1).
#
# Returns one mean per complete subgroup, in order. Trailing observations
# that do not fill a whole subgroup are dropped; numeric(0) is returned when
# there is not even one complete subgroup.
subgroup_means <- function(values, size) {
    stopifnot(is.numeric(values), length(size) == 1, size >= 1)
    n.groups <- length(values) %/% size
    if (n.groups == 0) {
        return(numeric(0))
    }
    # One subgroup per matrix column, so the column means are the answer
    used <- values[seq_len(n.groups * size)]
    colMeans(matrix(used, nrow = size))
}

# Show the 3 plots side by side
layout(matrix(c(1, 2, 3), 1, 3))

# Sample the population:
N <- 100
x <- rnorm(N, mean = 80, sd = 5)
mean(x)
sd(x)

# Plot the raw data. All three plots share the raw data's y-range so the
# shrinking spread of the subgroup averages is directly visible.
x.range <- range(x)
plot(x, ylim = x.range, main = 'Raw data')

# Subgroups of 2
x.2 <- subgroup_means(x, 2)
plot(x.2, ylim = x.range, main = 'Subgroups of 2')

# Subgroups of 4
x.4 <- subgroup_means(x, 4)
plot(x.4, ylim = x.range, main = 'Subgroups of 4')

@@ Line 63: / Line 63: @@
 | [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-02B.mp4 Video] (327 M)
 | [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-02B.mp3 Audio] (42 M)
-| None
+| See code below
 |-
 | 16 January
@@ Line 69: / Line 69: @@
 | [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-02C.mp4 Video] (347 M)
 | [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-02C.mp3 Audio] (42 M)
-| None
+| See code below
+|-
+| 20 January
+| 03A
+| <!-- [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-03A.mp4 Video] (347 M) -->
+| <!-- [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-03A.mp3 Audio] (42 M) -->
+|
 |}