Difference between revisions of "Univariate data analysis (2014)"

Latest revision as of 06:54, 4 January 2017

Class date(s):

13 to 23 January 2014

(PDF)

Class materials

Date	Class number	Video and audio files		Other materials	Reading (PID)	Slides
13 January	02A	Video (343 M)	Audio (42 M)	R demo file	Chapter 2	Slides for class
15 January	02B	Video (327 M)	Audio (42 M)	See code below
16 January	02C	Video (347 M)	Audio (42 M)	See code below
20 January	03A	Video (347 M)	Audio (42 M)	Using tables of the normal distribution
22 January	03B	Video (262 M)	Audio (42 M)	Using tables of the t-distribution
23 January	03C	Video (293 M)	Audio (41 M)	None
27 January	04A	Video from 2013 (357 M)	Audio from 2013 (43 M)	See code below
29 January	04B	Video (180 M)	Audio (24 M)	None

Software source code

Please follow the software tutorial to install and run the course software. You should be able to quickly read, understand and use the material in steps 1 to 13.

Class example, 15 Jan

Seeing the Central Limit Theorem in action: rolling dice. Run this code in your browser (no need to install/run in R)

# Central Limit Theorem demo: histograms of the average of k dice throws,
# for k = 1, 2, 4, 6, 8 and 10, drawn on one page.

# Number of simulated throws per die
N <- 500

# Six panels arranged as 2 rows by 3 columns, filled row by row
layout(matrix(1:6, nrow = 2, byrow = TRUE))

# Column j holds the N throws of die j. runif() draws from [1, 7) and
# as.integer() truncates, so every entry is a whole number from 1 to 6.
throws <- replicate(10, as.integer(runif(N, 1, 7)))

# Average, throw by throw, of the first k dice
average.of <- function(k) {
  rowSums(throws[, seq_len(k), drop = FALSE]) / k
}

# Panel 1: a single die, with one bar per face
hist(throws[, 1], main = "", xlab = "One throw", breaks = seq(0, 6) + 0.5)

# Panels 2 to 6: (number of dice averaged, requested bin count, x-axis label)
panels <- list(
  list(k = 2,  bins = 8,  label = "Average of two throws"),
  list(k = 4,  bins = 8,  label = "Average of 4 throws"),
  list(k = 6,  bins = 8,  label = "Average of 6 throws"),
  list(k = 8,  bins = 12, label = "Average of 8 throws"),
  list(k = 10, bins = 12, label = "Average of 10 throws")
)
for (panel in panels) {
  hist(average.of(panel$k), breaks = panel$bins, main = "", xlab = panel$label)
}

Class example, 16 Jan

# Load the CSV file directly from its web address into `batch`
batch.url <- 'http://datasets.connectmv.com/file/batch-yields.csv'
batch <- read.csv(batch.url)

Code used to illustrate how the q-q plot is constructed:

Run this code in your browser (no need to install/run in R)

# Number of observations; must match the length of `yields` below
N <- 10

# Our sampled data, with its mean and standard deviation:
yields <- c(86.2, 85.7, 71.9, 95.3, 77.1, 71.4, 68.9, 78.9, 86.9, 78.4)
mean.yield <- mean(yields)       # about 80.1
sd.yield <- sd(yields)           # about 8.35

# Theoretical side: cumulative probabilities (i - 0.5)/N for i = 1..N,
# converted to quantiles of the standard normal distribution
P <- (seq_len(N) - 0.5) / N
theoretical.quantity <- qnorm(P)

# Sample side: standardize the data (subtract the mean, divide by the
# standard deviation) ...
yields.z <- (yields - mean.yield) / sd.yield
yields.z

# ... and put the standardized values in increasing order
yields.z.sorted <- yields.z[order(yields.z)]

# Compare the two sets of quantiles as text:
yields.z.sorted
theoretical.quantity

# Compare them graphically; points on the 45 degree line (intercept 0,
# slope 1) are where the sample and theoretical quantiles agree
plot(theoretical.quantity, yields.z.sorted, asp = 1)
abline(0, 1)

# Built-in R functions that construct the q-q plot directly from the raw data:
qqnorm(yields)
qqline(yields)

# A better function, qqPlot(), is provided by the add-on "car" package. The
# package must already be installed for library(car) to succeed: see
# http://learnche.mcmaster.ca/4C3/Software_tutorial/Extending_R_with_packages
# It is applied to the same raw `yields` vector used with qqnorm() above.
library(car)
qqPlot(yields)

Code used to illustrate the central limit theorem's reduction in variance:

Run this code in your browser (no need to install/run in R)

# One row of three panels: raw data, averages of 2, averages of 4
layout(matrix(1:3, nrow = 1))

# Draw N values from a normal population with mean 80 and sd 5
N <- 100
x <- rnorm(N, mean = 80, sd = 5)
mean(x)
sd(x)

# Averages of consecutive, non-overlapping subgroups of `size` values.
# The values are laid out one subgroup per matrix column, then each column
# is averaged. `size` is expected to divide length(values) exactly.
subgroup.means <- function(values, size) {
  apply(matrix(values, nrow = size), 2, mean)
}

# All three panels share the y-axis limits of the raw data, so the shrinking
# spread of the averages can be compared directly
x.range <- range(x)
plot(x, ylim = x.range, main = 'Raw data')

# Subgroups of 2
x.2 <- subgroup.means(x, 2)
plot(x.2, ylim = x.range, main = 'Subgroups of 2')

# Subgroups of 4
x.4 <- subgroup.means(x, 4)
plot(x.4, ylim = x.range, main = 'Subgroups of 4')

Class example, 27 Jan

Test for differences: plotting the raw data.

Run this code in a web browser

# Test for differences: three views of the raw BOD values obtained by the
# dilution method and by the manometric method.
# NOTE(review): the third figure subtracts the two vectors element by
# element, so the values are presumably paired by sample number - confirm.
dilution <-   c(11, 26, 18, 16, 20, 12,  8, 26, 12, 17, 14)
manometric <- c(25,  3, 27, 30, 33, 16, 28, 27, 12, 32, 16)

# Number of samples, taken from the data itself. The original code used an
# undefined `N` here, which only existed if an earlier example had been run
# (and then had the wrong length for these 11 observations).
n.samples <- length(dilution)

mean(manometric)
mean(dilution)

# Figure 1: both methods in one sequence, dilution first and manometric
# second, with a vertical line between the two groups
plot(c(dilution, manometric), ylab="BOD values", xaxt='n')
text(5.5,3, "Dilution")
text(18,3, "Manometric")
abline(v=11.5)

# Figure 2: both methods against sample number, superimposed
par(mar=c(4.2, 4.2, 0.2, 0.2))  # (bottom, left, top, right); defaults are par(mar=c(5, 4, 4, 2) + 0.1)
plot(dilution, type="p", pch=4,
    cex=2, cex.lab=1.5, cex.main=1.8, cex.sub=1.8, cex.axis=1.8,
    ylab="BOD values", xlab="Sample number",
    ylim=c(0,35), xlim=c(0,11.5), col="darkgreen")
points(manometric, pch=16, cex=2, col="blue")

# Repeat every value at x = 0, so the overall spread of each method can be
# seen in a single column to the left of the vertical line at x = 0.5
points(rep(0, n.samples), dilution, pch=4, cex=2, col="darkgreen")
points(rep(0, n.samples), manometric, pch=16, cex=2, col="blue")

abline(v=0.5)

legend(8, 5, pch=c(4, 16), c("Dilution", "Manometric"), col=c("darkgreen", "blue"), pt.cex=2)


# Figure 3: sample-by-sample difference, with a reference line at zero
par(mar=c(4.2, 4.2, 0.2, 0.2))  # (bottom, left, top, right); defaults are par(mar=c(5, 4, 4, 2) + 0.1)
plot(dilution-manometric, type="p", ylab="Dilution - Manometric", xlab="Sample number",
     cex.lab=1.5, cex.main=1.8, cex.sub=1.8, cex.axis=1.8, cex=2)
abline(h=0, col="grey60")

@@ Line 1: / Line 1: @@
-__NOTOC__{{ClassSidebar
+__NOTOC__{{ClassSidebarYouTube
-| date = 13 January 2014
+| date = 13 to 23 January 2014
 | dates_alt_text =
-| vimeoID1 =
+| vimeoID1 = 0a1YeaheSXc
-| vimeoID2 =
+| vimeoID2 = 2ffZAlWUUAE
-| vimeoID3 =
+| vimeoID3 = fWcvYScLSC4
-| vimeoID4 =
+| vimeoID4 = 8OJGWGlP0Ok
-| vimeoID5 =
+| vimeoID5 = q5wzW8k2TIE
-| vimeoID6 =
+| vimeoID6 = aGr4LVOgVhk
-| vimeoID7 =
+| vimeoID7 = 58487266  <!-- replace later -->
-| vimeoID8 =
+| vimeoID8 = f7KkIy9wZco
 | course_notes_PDF = 2014-4C3-6C3-Univariate-data-analysis.pdf
 | course_notes_alt = Course slides
@@ Line 15: / Line 15: @@
 | assignment_instructions =
 | assignment_solutions =
-| video_download_link_MP4 = http://learnche.mcmaster.ca/media/
+| video_download_link_MP4 = http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-02A.mp4
-| video_download_link_MP4_size =M
+| video_download_link_MP4_size = 343 M
 | video_notes1 =
-| video_download_link2_MP4 = http://learnche.mcmaster.ca/media/
+| video_download_link2_MP4 = http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-02B.mp4
-| video_download_link2_MP4_size = M
+| video_download_link2_MP4_size = 327 M
 | video_notes2 =
-| video_download_link3_MP4 = http://learnche.mcmaster.ca/media/
+| video_download_link3_MP4 = http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-02C.mp4
-| video_download_link3_MP4_size = M
+| video_download_link3_MP4_size = 347 M
 | video_notes3 =
-| video_download_link4_MP4 = http://learnche.mcmaster.ca/media/
+| video_download_link4_MP4 = http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-03A.mp4
-| video_download_link4_MP4_size = M
+| video_download_link4_MP4_size = 344 M
 | video_notes4 =
-| video_download_link5_MP4 = http://learnche.mcmaster.ca/media/
+| video_download_link5_MP4 = http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-03B.mp4
-| video_download_link5_MP4_size = M
+| video_download_link5_MP4_size = 262 M
 | video_notes5 =
-| video_download_link6_MP4 = http://learnche.mcmaster.ca/media/
+| video_download_link6_MP4 = http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-03C.mp4
-| video_download_link6_MP4_size = M
+| video_download_link6_MP4_size = 293 M
 | video_notes6 =
-| video_download_link7_MP4 = http://learnche.mcmaster.ca/media/
+| video_download_link7_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-04A.mp4
-| video_download_link7_MP4_size = M
+| video_download_link7_MP4_size = 357 M
 | video_notes7 =
+| video_download_link8_MP4 = http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-04B.mp4
+| video_download_link8_MP4_size = 180 M
+| video_notes8 =
 }}
-== Course notes and slides ==
+== Class materials ==
-* [http://learnche.mcmaster.ca/pid/?source=Univariate Course textbook] (print chapter 2)
+{| class="wikitable"  style="text-align: center;"
-* [[Image:Nuvola_mimetypes_pdf.png|20px|link=Media:2014-4C3-6C3-Univariate-data-analysis.pdf]] [[Media:2014-4C3-6C3-Univariate-data-analysis.pdf|Slides for class]]
+|-
+! Date
+! Class number
+! colspan="2"|Video and audio files
+! Other materials
+! Reading (PID)
+! Slides
+|-
+| 13 January
+| 02A
+| [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-02A.mp4 Video] (343 M)
+| [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-02A.mp3 Audio] (42 M)
+|[[Media:Demo.R|R demo file]]
+| rowspan="8"|[http://learnche.mcmaster.ca/pid/?source=Univariate Chapter 2]
+| rowspan="8"|[[Image:Nuvola_mimetypes_pdf.png|20px|link=Media:2014-4C3-6C3-Univariate-data-analysis.pdf]] [[Media:2014-4C3-6C3-Univariate-data-analysis.pdf|Slides for class]]
+|-
+| 15 January
+| 02B
+| [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-02B.mp4 Video] (327 M)
+| [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-02B.mp3 Audio] (42 M)
+| See code below
+|-
+| 16 January
+| 02C
+| [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-02C.mp4 Video] (347 M)
+| [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-02C.mp3 Audio] (42 M)
+| See code below
+|-
+| 20 January
+| 03A
+| [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-03A.mp4 Video] (347 M)
+| [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-03A.mp3 Audio] (42 M)
+| [[Tables_of_the_normal_and_t-distribution| Using tables of the normal distribution]]
+|-
+| 22 January
+| 03B
+| [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-03B.mp4 Video] (262 M)
+| [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-03B.mp3 Audio] (42 M)
+| [[Tables_of_the_normal_and_t-distribution | Using tables of the t-distribution]]
+|-
+| 23 January
+| 03C
+| [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-03C.mp4 Video] (293 M)
+| [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-03C.mp3 Audio] (41 M)
+| None
+|-
+| 27 January
+| 04A
+| [http://learnche.mcmaster.ca/media/4C3-2013-Class-04A.mp4 Video from 2013] (357M)
+| [http://learnche.mcmaster.ca/media/4C3-2013-Class-04A.mp3 Audio from 2013] (43M)
+| See code below
+|-
+| 29 January
+| 04B
+| [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-04B.mp4 Video] (180 M)
+| [http://learnche.mcmaster.ca/media/2014-4C3-6C3-Class-04B.mp3 Audio] (24 M)
+| None
+|}
 == Software source code ==
-* Please follow the [[Software_tutorial|software tutorial]] to install and run the course software. You should be able to quickly read, understand and use the material in steps 1 to 13.
+Please follow the [[Software_tutorial|software tutorial]] to install and run the course software. You should be able to quickly read, understand and use the material in steps 1 to 13.
+== Class example, 15 Jan ==
+Seeing the Central Limit Theorem in action: ''rolling dice''. [http://www.r-fiddle.org/#/fiddle?id=vGjkcRws&version=1 Run this code in your browser] (no need to install/run in R)
+<syntaxhighlight lang="rsplus">
+N = 500
+m <- t(matrix(seq(1,6), 3, 2))
+layout(m)
+s1 <- as.integer(runif(N, 1, 7))
+s2 <- as.integer(runif(N, 1, 7))
+s3 <- as.integer(runif(N, 1, 7))
+s4 <- as.integer(runif(N, 1, 7))
+s5 <- as.integer(runif(N, 1, 7))
+s6 <- as.integer(runif(N, 1, 7))
+s7 <- as.integer(runif(N, 1, 7))
+s8 <- as.integer(runif(N, 1, 7))
+s9 <- as.integer(runif(N, 1, 7))
+s10 <- as.integer(runif(N, 1, 7))
+hist(s1, main="", xlab="One throw", breaks=seq(0,6)+0.5)
+bins = 8
+hist((s1+s2)/2, breaks=bins, main="", xlab="Average of two throws")
+hist((s1+s2+s3+s4)/4, breaks=bins, main="", xlab="Average of 4 throws")
+hist((s1+s2+s3+s4+s5+s6)/6, breaks=bins, main="", xlab="Average of 6 throws")
+bins=12
+hist((s1+s2+s3+s4+s5+s6+s7+s8)/8,  breaks=bins, main="", xlab="Average of 8 throws")
+hist((s1+s2+s3+s4+s5+s6+s7+s8+s9+s10)/10, breaks=bins, main="", xlab="Average of 10 throws")
+</syntaxhighlight>
-* [[Media:Demo.R|The demo file from class on 13 January]]
+== Class example, 16 Jan ==
-<!-- Here was the example used in class:
 <syntaxhighlight lang="rsplus">
@@ Line 57: / Line 145: @@
 Code used to illustrate how the q-q plot is constructed:
+[http://www.r-fiddle.org/#/fiddle?id=mBl1TMyP&version=1 Run this code in your browser] (no need to install/run in R)
 <syntaxhighlight lang="rsplus">
 N <- 10
@@ Line 94: / Line 184: @@
 Code used to illustrate the central limit theorem's reduction in variance:
+[http://www.r-fiddle.org/#/fiddle?id=MHIU665v&version=1 Run this code in your browser] (no need to install/run in R)
 <syntaxhighlight lang="rsplus">
 # Show the 3 plots side by side
@@ Line 127: / Line 219: @@
 </syntaxhighlight>
-Code to show how to superimpose plots
+== Class example, 27 Jan ==
+Test for differences: plotting the raw data.
+[http://www.r-fiddle.org/#/fiddle?id=deSYk0W1&version=2 Run this code in a web-browser]
 <syntaxhighlight lang="rsplus">
-data <- read.csv('http://datasets.connectmv.com/file/raw-material-properties.csv')
+# Analysis of the data here:
-summary(data)
+dilution <-   c(11, 26, 18, 16, 20, 12,  8, 26, 12, 17, 14)
+manometric <- c(25,  3, 27, 30, 33, 16, 28, 27, 12, 32, 16)
-# Single plot
-plot(data$density1)
-# Connect the dots
+mean(manometric)
-plot(data$density1, type='b')
+mean(dilution)
-# Another variable
+plot(c(dilution, manometric), ylab="BOD values", xaxt='n')
-plot(data$density2, type='b', col="red")
+text(5.5,3, "Dilution")
+text(18,3, "Manometric")
+abline(v=11.5)
-# Superimpose them?
+par(mar=c(4.2, 4.2, 0.2, 0.2))  # (bottom, left, top, right); defaults are par(mar=c(5, 4, 4, 2) + 0.1)
-plot(data$density1, type='b', col="blue")
+plot(dilution, type="p", pch=4,
-lines(data$density2, type='b', col="red")  # where's density2 ?
+    cex=2, cex.lab=1.5, cex.main=1.8, cex.sub=1.8, cex.axis=1.8,
+    ylab="BOD values", xlab="Sample number",
+    ylim=c(0,35), xlim=c(0,11.5), col="darkgreen")
+lines(manometric, type="p", pch=16, cex=2, col="blue")
+lines(rep(0, N), dilution, type="p", pch=4, cex=2, col="darkgreen")
+lines(rep(0, N), manometric, type="p", pch=16, cex=2, col="blue")
-# Superimpose them: limits
+abline(v=0.5)
-plot(data$density1, type='b', col="blue", ylim=c(10, 45))
-lines(data$density2, type='b', col="red")  # now density2 shows up
-</syntaxhighlight>
-Code to show how to deal with missing values:
+legend(8, 5, pch=c(4, 16), c("Dilution", "Manometric"), col=c("darkgreen", "blue"), pt.cex=2)
-<syntaxhighlight lang="rsplus">
-data <- read.csv('http://datasets.connectmv.com/file/raw-material-properties.csv')
-summary(data)  # notice the NAs in the columns: these refer to missing value (Not Available)
-sd(data$density1)  # why NA as the answer?
-help(sd)
-sd(data$density1, na.rm=TRUE)  # no NA answer anymore!
-help(mad)
+par(mar=c(4.2, 4.2, 0.2, 0.2))  # (bottom, left, top, right); defaults are par(mar=c(5, 4, 4, 2) + 0.1)
-help(IQR)  # etc: all these functions accept and na.rm input
+plot(dilution-manometric, type="p", ylab="Dilution - Manometric", xlab="Sample number",
+     cex.lab=1.5, cex.main=1.8, cex.sub=1.8, cex.axis=1.8, cex=2)
+abline(h=0, col="grey60")
 </syntaxhighlight>
- -->