Difference between revisions of "Univariate data analysis (2013)"
Jump to navigation
Jump to search
Kevin Dunn (talk | contribs) m |
Kevin Dunn (talk | contribs) |
||
(12 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
__NOTOC__{{ClassSidebar | __NOTOC__{{ClassSidebar | ||
| date = 15 to | | date = 15 to 29 January 2013 | ||
| dates_alt_text = | | dates_alt_text = | ||
| vimeoID1 = | | vimeoID1 = 57483092 | ||
| vimeoID2 = | | vimeoID2 = 57550116 | ||
| vimeoID3 = | | vimeoID3 = 57708665 | ||
| vimeoID4 = | | vimeoID4 = 57962292 | ||
| vimeoID5 = | | vimeoID5 = 58049398 | ||
| vimeoID6 = | | vimeoID6 = 58214822 | ||
| vimeoID7 = | | vimeoID7 = 58487266 | ||
| vimeoID8 = | | vimeoID8 = | ||
| course_notes_PDF = 2013-4C3-Overheads-Univariate-data-analysis.pdf | | course_notes_PDF = 2013-4C3-Overheads-Univariate-data-analysis.pdf | ||
Line 16: | Line 16: | ||
| assignment_solutions = | | assignment_solutions = | ||
| video_download_link_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-02A.mp4 | | video_download_link_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-02A.mp4 | ||
| video_download_link_MP4_size = M | | video_download_link_MP4_size =360 M | ||
| video_notes1 = | | video_notes1 = | ||
| video_download_link2_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-02B.mp4 | | video_download_link2_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-02B.mp4 | ||
| video_download_link2_MP4_size = | | video_download_link2_MP4_size = 362 M | ||
| video_notes2 = | | video_notes2 = | ||
| video_download_link3_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-02C.mp4 | | video_download_link3_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-02C.mp4 | ||
| video_download_link3_MP4_size = M | | video_download_link3_MP4_size = 353 M | ||
| video_notes3 = | | video_notes3 = | ||
| video_download_link4_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-03A.mp4 | | video_download_link4_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-03A.mp4 | ||
| video_download_link4_MP4_size = M | | video_download_link4_MP4_size = 392 M | ||
| video_notes4 = | | video_notes4 = | ||
| video_download_link5_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-03B.mp4 | | video_download_link5_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-03B.mp4 | ||
| video_download_link5_MP4_size = M | | video_download_link5_MP4_size = 328 M | ||
| video_notes5 = | | video_notes5 = | ||
| video_download_link6_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-03C.mp4 | | video_download_link6_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-03C.mp4 | ||
| video_download_link6_MP4_size = M | | video_download_link6_MP4_size = 361 M | ||
| video_notes6 = | | video_notes6 = | ||
| video_download_link7_MP4 = http://learnche.mcmaster.ca/media/ | | video_download_link7_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-04A.mp4 | ||
| video_download_link7_MP4_size = M | | video_download_link7_MP4_size = 357 M | ||
| video_notes7 = | | video_notes7 = | ||
}} | }} | ||
Line 46: | Line 43: | ||
* [[Image:Nuvola_mimetypes_pdf.png|20px|link=Media:2013-4C3-Overheads-Univariate-data-analysis.pdf]] [[Media:2013-4C3-Overheads-Univariate-data-analysis.pdf|Slides for class]] | * [[Image:Nuvola_mimetypes_pdf.png|20px|link=Media:2013-4C3-Overheads-Univariate-data-analysis.pdf]] [[Media:2013-4C3-Overheads-Univariate-data-analysis.pdf|Slides for class]] | ||
== Software source code == | |||
Please follow the [[Software_tutorial|software tutorial]] to install and run the course software. Here was the example used in class: | |||
<syntaxhighlight lang="rsplus"> | |||
* | # Read data from a web address | ||
batch <- read.csv('http://openmv.net/file/batch-yields.csv') | |||
</syntaxhighlight> | |||
Code used to illustrate how the q-q plot is constructed: | |||
<syntaxhighlight lang="rsplus"> | |||
N <- 10 | |||
# What are the quantiles from the theoretical normal distribution? | |||
index <- seq(1, N) | |||
P <- (index - 0.5) / N | |||
theoretical.quantity <- qnorm(P) | |||
# Our sampled data: | |||
yields <- c(86.2, 85.7, 71.9, 95.3, 77.1, 71.4, 68.9, 78.9, 86.9, 78.4) | |||
mean.yield <- mean(yields) # 80.0 | |||
sd.yield <- sd(yields) # 8.35 | |||
# What are the quantiles for the sampled data? | |||
yields.z <- (yields - mean.yield)/sd.yield | |||
yields.z | |||
yields.z.sorted <- sort(yields.z) | |||
# Compare the values in text: | |||
yields.z.sorted | |||
theoretical.quantity | |||
# Compare them graphically: | |||
plot(theoretical.quantity, yields.z.sorted, asp=1) | |||
abline(a=0, b=1) | |||
# Built-in R function to do all the above for you: | |||
qqnorm(yields) | |||
qqline(yields) | |||
# A better function: see http://learnche.mcmaster.ca/4C3/Software_tutorial/Extending_R_with_packages | |||
library(car) | |||
qqPlot(yields) | |||
</syntaxhighlight> | |||
Code used to illustrate the central limit theorem's reduction in variance: | |||
<syntaxhighlight lang="rsplus"> | |||
# Show the 3 plots side by side | |||
layout(matrix(c(1,2,3), 1, 3)) | |||
# Sample the population: | |||
N <- 100 | |||
x <- rnorm(N, mean=80, sd=5) | |||
mean(x) | |||
sd(x) | |||
# Plot the raw data | |||
x.range <- range(x) | |||
plot(x, ylim=x.range, main='Raw data') | |||
# Subgroups of 2 | |||
subsize <- 2 | |||
x.2 <- numeric(N/subsize) | |||
for (i in 1:(N/subsize)) | |||
{ | |||
x.2[i] <- mean(x[((i-1)*subsize+1):(i*subsize)]) | |||
} | |||
plot(x.2, ylim=x.range, main='Subgroups of 2') | |||
# Subgroups of 4 | |||
subsize <- 4 | |||
x.4 <- numeric(N/subsize) | |||
for (i in 1:(N/subsize)) | |||
{ | |||
x.4[i] <- mean(x[((i-1)*subsize+1):(i*subsize)]) | |||
} | |||
plot(x.4, ylim=x.range, main='Subgroups of 4') | |||
</syntaxhighlight> | |||
Code to show how to superimpose plots: | |||
<syntaxhighlight lang="rsplus"> | |||
data <- read.csv('http://openmv.net/file/raw-material-properties.csv') | |||
summary(data) | |||
# Single plot | |||
plot(data$density1) | |||
# Connect the dots | |||
plot(data$density1, type='b') | |||
# Another variable | |||
plot(data$density2, type='b', col="red") | |||
# Superimpose them? | |||
plot(data$density1, type='b', col="blue") | |||
lines(data$density2, type='b', col="red") # where's density2 ? | |||
# Superimpose them: limits | |||
plot(data$density1, type='b', col="blue", ylim=c(10, 45)) | |||
lines(data$density2, type='b', col="red") # now density2 shows up | |||
</syntaxhighlight> | |||
Code to show how to deal with missing values: | |||
<syntaxhighlight lang="rsplus"> | |||
data <- read.csv('http://openmv.net/file/raw-material-properties.csv') | |||
summary(data) # notice the NAs in the columns: these refer to missing value (Not Available) | |||
sd(data$density1) # why NA as the answer? | |||
help(sd) | |||
sd(data$density1, na.rm=TRUE) # no NA answer anymore! | |||
help(mad) | |||
help(IQR) # etc: all these functions accept and na.rm input | |||
</syntaxhighlight> |
Latest revision as of 09:39, 13 January 2016
Class date(s): | 15 to 29 January 2013 | ||||
(PDF) | Course slides | ||||
| |||||
| |||||
| |||||
| |||||
| |||||
| |||||
| |||||
Course notes and slides
- Course textbook (print chapter 2)
- Slides for class
Software source code
Please follow the software tutorial to install and run the course software. Here is the example used in class:
# Load the batch yields data set straight from its web address
batch <- read.csv(file = 'http://openmv.net/file/batch-yields.csv')
Code used to illustrate how the q-q plot is constructed:
# Build a normal q-q plot by hand, step by step, then compare it with the
# functions that do the same job in one call.

# Our sampled data:
yields <- c(86.2, 85.7, 71.9, 95.3, 77.1, 71.4, 68.9, 78.9, 86.9, 78.4)

# Take the sample size from the data itself, so the number of theoretical
# quantiles can never disagree with the number of observations
N <- length(yields)

# What are the quantiles from the theoretical normal distribution?
# The i-th sorted observation is assigned cumulative probability
# (i - 0.5) / N, which qnorm() converts to a standard normal z-value.
index <- seq_len(N)
P <- (index - 0.5) / N
theoretical.quantity <- qnorm(P)

mean.yield <- mean(yields)  # 80.07
sd.yield <- sd(yields)      # 8.35

# What are the quantiles for the sampled data?
# Standardize to z-scores so they share a scale with the theoretical values
yields.z <- (yields - mean.yield) / sd.yield
yields.z
yields.z.sorted <- sort(yields.z)

# Compare the values in text:
yields.z.sorted
theoretical.quantity

# Compare them graphically: with asp=1 both axes use the same scale, so
# normally distributed data should fall along the 45 degree line y = x
plot(theoretical.quantity, yields.z.sorted, asp=1)
abline(a=0, b=1)

# Built-in R function to do all the above for you.
# Note: qqnorm() plots the raw (unstandardized) yields and chooses its
# plotting positions with ppoints(), so its axes differ slightly from the
# manual version above.
qqnorm(yields)
qqline(yields)

# A better function: see http://learnche.mcmaster.ca/4C3/Software_tutorial/Extending_R_with_packages
# (requires the add-on package "car" to be installed)
library(car)
qqPlot(yields)
Code used to illustrate the central limit theorem's reduction in variance:
# Demonstrate how averaging reduces variance (central limit theorem):
# the raw values are plotted next to the means of subgroups of 2 and of 4,
# all on the same vertical scale.

# Average consecutive, non-overlapping subgroups of `size` values.
#   values: numeric vector to be averaged
#   size:   number of consecutive values in each subgroup
# Returns one mean per complete subgroup; values left over after the last
# complete subgroup are ignored.
subgroup_means <- function(values, size) {
  n_groups <- length(values) %/% size
  vapply(
    seq_len(n_groups),
    function(i) mean(values[((i - 1) * size + 1):(i * size)]),
    numeric(1)
  )
}

# Show the 3 plots side by side
layout(matrix(c(1, 2, 3), 1, 3))

# Sample the population:
N <- 100
x <- rnorm(N, mean = 80, sd = 5)
mean(x)
sd(x)

# Plot the raw data; its range is reused as the y-limits of every panel so
# that the narrowing spread of the subgroup means is directly comparable
x.range <- range(x)
plot(x, ylim = x.range, main = 'Raw data')

# Subgroups of 2
subsize <- 2
x.2 <- subgroup_means(x, subsize)
plot(x.2, ylim = x.range, main = 'Subgroups of 2')

# Subgroups of 4
subsize <- 4
x.4 <- subgroup_means(x, subsize)
plot(x.4, ylim = x.range, main = 'Subgroups of 4')

# Return to a single-panel layout so that later plots are not squeezed
# into one third of the graphics device
layout(1)
Code to show how to superimpose plots:
# Load the raw material properties data set and print a per-column summary
data <- read.csv('http://openmv.net/file/raw-material-properties.csv')
summary(data)
# Single plot: density1 against its row index, points only
plot(data$density1)
# Connect the dots: type='b' draws both the points and the joining lines
plot(data$density1, type='b')
# Another variable, drawn on a fresh plot of its own
plot(data$density2, type='b', col="red")
# Superimpose them? plot() starts a new figure whose y-axis is sized for
# density1 only; lines() then draws onto those existing axes
plot(data$density1, type='b', col="blue")
lines(data$density2, type='b', col="red") # where's density2? presumably outside the y-axis range chosen for density1
# Superimpose them: set the y-axis limits by hand so both series fit
plot(data$density1, type='b', col="blue", ylim=c(10, 45))
lines(data$density2, type='b', col="red") # now density2 shows up
Code to show how to deal with missing values:
# Load the raw material properties data set
data <- read.csv(file = 'http://openmv.net/file/raw-material-properties.csv')

# Notice the NAs in the columns of the summary: these refer to missing
# values (NA = Not Available)
summary(data)

# Why NA as the answer? By default the missing values are not removed
sd(data$density1)

# The help page describes the na.rm argument
?sd

# No NA answer anymore: missing values are dropped before the calculation
sd(data$density1, na.rm = TRUE)

# etc: all these functions accept an na.rm input
?mad
?IQR