Difference between revisions of "Univariate data analysis (2013)"

From Statistics for Engineering
Jump to navigation Jump to search
m
 
(12 intermediate revisions by the same user not shown)
Line 1: Line 1:
__NOTOC__{{ClassSidebar
__NOTOC__{{ClassSidebar
| date = 15 to 24 January 2013
| date = 15 to 29 January 2013
| dates_alt_text =  
| dates_alt_text =  
| vimeoID1 =  
| vimeoID1 = 57483092
| vimeoID2 =  
| vimeoID2 = 57550116
| vimeoID3 =
| vimeoID3 = 57708665
| vimeoID4 =
| vimeoID4 = 57962292
| vimeoID5 =
| vimeoID5 = 58049398
| vimeoID6 =
| vimeoID6 = 58214822
| vimeoID7 =
| vimeoID7 = 58487266
| vimeoID8 =
| vimeoID8 =
| course_notes_PDF = 2013-4C3-Overheads-Univariate-data-analysis.pdf
| course_notes_PDF = 2013-4C3-Overheads-Univariate-data-analysis.pdf
Line 16: Line 16:
| assignment_solutions =  
| assignment_solutions =  
| video_download_link_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-02A.mp4
| video_download_link_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-02A.mp4
| video_download_link_MP4_size = M
| video_download_link_MP4_size =360 M
| video_notes1 =
| video_notes1 =
| video_download_link2_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-02B.mp4
| video_download_link2_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-02B.mp4
| video_download_link2_MP4_size = 355 M
| video_download_link2_MP4_size = 362 M
| video_notes2 =
| video_notes2 =
| video_download_link3_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-02C.mp4
| video_download_link3_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-02C.mp4
| video_download_link3_MP4_size = M
| video_download_link3_MP4_size = 353 M
| video_notes3 =
| video_notes3 =
| video_download_link4_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-03A.mp4
| video_download_link4_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-03A.mp4
| video_download_link4_MP4_size = M
| video_download_link4_MP4_size = 392 M
| video_notes4 =
| video_notes4 =
| video_download_link5_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-03B.mp4
| video_download_link5_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-03B.mp4
| video_download_link5_MP4_size = M
| video_download_link5_MP4_size = 328 M
| video_notes5 =
| video_notes5 =
| video_download_link6_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-03C.mp4
| video_download_link6_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-03C.mp4
| video_download_link6_MP4_size = M
| video_download_link6_MP4_size = 361 M
| video_notes6 =
| video_notes6 =
| video_download_link7_MP4 = http://learnche.mcmaster.ca/media/
| video_download_link7_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-04A.mp4
| video_download_link7_MP4_size = M
| video_download_link7_MP4_size = 357 M
| video_notes7 =
| video_notes7 =
| video_download_link8_MP4 = http://learnche.mcmaster.ca/media/
| video_download_link8_MP4_size = M
| video_notes8 =
}}
}}


Line 46: Line 43:
* [[Image:Nuvola_mimetypes_pdf.png|20px|link=Media:2013-4C3-Overheads-Univariate-data-analysis.pdf]] [[Media:2013-4C3-Overheads-Univariate-data-analysis.pdf|Slides for class]]  
* [[Image:Nuvola_mimetypes_pdf.png|20px|link=Media:2013-4C3-Overheads-Univariate-data-analysis.pdf]] [[Media:2013-4C3-Overheads-Univariate-data-analysis.pdf|Slides for class]]  


<!-- === Class materials ===
== Software source code ==
* 15 Jan 2013 (Class 02A): <!-- [http://learnche.mcmaster.ca/media/4C3-2013-Class-01B.mp3 Audio] and  [http://learnche.mcmaster.ca/media/4C3-2013-Class-01B.mp4 video] -->


-->
Please follow the [[Software_tutorial|software tutorial]] to install and run the course software. Here was the example used in class:
== Other readings ==


* [http://www.investopedia.com/articles/financial-theory/11/lie-with-financial-statistics.asp How To Lie With Financial Statistics], Investopedia, November 2011
<syntaxhighlight lang="rsplus">
* [http://vita.had.co.nz/papers/boxplots.pdf 40 years of boxplots]
# Read data from a web address: read.csv() is given a URL here instead of a
# local file path, and the result is stored in `batch`.
batch <- read.csv('http://openmv.net/file/batch-yields.csv')
</syntaxhighlight>
 
 
Code used to illustrate how the q-q plot is constructed:
<syntaxhighlight lang="rsplus">
# Build a normal q-q plot by hand for a small sample, then compare the result
# with R's built-in q-q plotting functions.
N <- 10
 
# What are the quantiles from the theoretical normal distribution?
# The i-th of N sorted points is assigned the cumulative probability
# (i - 0.5)/N, which qnorm() converts to a z-value.
index <- seq_len(N)
P <- (index - 0.5) / N
theoretical.quantity <- qnorm(P)   # the theoretical quantiles (z-values)
 
# Our sampled data:
yields <- c(86.2, 85.7, 71.9, 95.3, 77.1, 71.4, 68.9, 78.9, 86.9, 78.4)
mean.yield <- mean(yields)      # 80.07
sd.yield <- sd(yields)          # 8.35
 
# What are the quantiles for the sampled data?
# Standardize to z-scores so they are on the same scale as qnorm()'s output.
yields.z <- (yields - mean.yield) / sd.yield
yields.z
yields.z.sorted <- sort(yields.z)
 
# Compare the values in text:
yields.z.sorted
theoretical.quantity
 
# Compare them graphically against the 45 degree line (intercept 0, slope 1);
# asp = 1 gives both axes the same scale.
plot(theoretical.quantity, yields.z.sorted, asp = 1)
abline(a = 0, b = 1)
 
# Built-in R functions to do all the above for you:
qqnorm(yields)
qqline(yields)
 
# A better function: see http://learnche.mcmaster.ca/4C3/Software_tutorial/Extending_R_with_packages
# "car" is an add-on package: only call it when it is installed, so that the
# rest of this script still runs on an R installation without it.
if (requireNamespace("car", quietly = TRUE)) {
  car::qqPlot(yields)
} else {
  message("Package 'car' is not installed; skipping qqPlot(). ",
          "Install it with install.packages('car').")
}
</syntaxhighlight>
 
Code used to illustrate the central limit theorem's reduction in variance:
<syntaxhighlight lang="rsplus">
# Demonstrate that averaging subgroups of raw data reduces the spread: the raw
# values and the subgroup means are plotted side by side on the same y-scale.

# Average consecutive, non-overlapping subgroups of `values`.
#   values:  numeric vector of raw observations
#   subsize: number of consecutive observations in each subgroup
# Returns one mean per complete subgroup; observations left over after the
# last complete subgroup are not used.
subgroup.means <- function(values, subsize) {
  n.groups <- length(values) %/% subsize
  # One column per subgroup, so the column means are the subgroup means
  grouped <- matrix(values[seq_len(n.groups * subsize)],
                    nrow = subsize, ncol = n.groups)
  colMeans(grouped)
}

# Show the 3 plots side by side
layout(matrix(c(1, 2, 3), 1, 3))

# Sample the population:
N <- 100
x <- rnorm(N, mean = 80, sd = 5)
mean(x)
sd(x)

# Plot the raw data; its range is reused as the y-axis limits of every panel
# so that the spread of the three panels can be compared directly
x.range <- range(x)
plot(x, ylim = x.range, main = 'Raw data')

# Subgroups of 2
subsize <- 2
x.2 <- subgroup.means(x, subsize)
plot(x.2, ylim = x.range, main = 'Subgroups of 2')

# Subgroups of 4
subsize <- 4
x.4 <- subgroup.means(x, subsize)
plot(x.4, ylim = x.range, main = 'Subgroups of 4')

# Return to a single-panel layout so later plots are not drawn in thirds
layout(1)
</syntaxhighlight>
 
Code to show how to superimpose plots:
<syntaxhighlight lang="rsplus">
# Load the raw material properties data set from a web address and print a
# summary of it.
data <- read.csv('http://openmv.net/file/raw-material-properties.csv')
summary(data)
 
# Single plot: only one vector is supplied, so it is drawn against its index
plot(data$density1)
 
# Connect the dots: type='b' draws both the points and the lines joining them
plot(data$density1, type='b')
 
# Another variable, drawn as a separate plot in red
plot(data$density2, type='b', col="red")
 
# Superimpose them? plot() draws density1, then lines() adds density2 to that
# same plot without changing the axis limits plot() chose
plot(data$density1, type='b', col="blue")
lines(data$density2, type='b', col="red")  # where's density2 ?
 
# Superimpose them: limits. The y-axis limits are set by hand with ylim
plot(data$density1, type='b', col="blue", ylim=c(10, 45))
lines(data$density2, type='b', col="red")  # now density2 shows up
</syntaxhighlight>
 
Code to show how to deal with missing values:
<syntaxhighlight lang="rsplus">
# Load the raw material properties data set from a web address.
data <- read.csv('http://openmv.net/file/raw-material-properties.csv')
summary(data)  # notice the NAs in the columns: these refer to missing values (Not Available)
 
sd(data$density1)  # why NA as the answer?
help(sd)  # look up the function's arguments, including na.rm
sd(data$density1, na.rm=TRUE)  # no NA answer anymore: missing values are removed first
 
help(mad)
help(IQR)  # etc: all these functions accept an na.rm input
</syntaxhighlight>

Latest revision as of 09:39, 13 January 2016

Class date(s): 15 to 29 January 2013
Nuvola mimetypes pdf.png (PDF) Course slides
Download video: Link (plays in Google Chrome) [360 M]

Download video: Link (plays in Google Chrome) [362 M]

Download video: Link (plays in Google Chrome) [353 M]

Download video: Link (plays in Google Chrome) [392 M]

Download video: Link (plays in Google Chrome) [328 M]

Download video: Link (plays in Google Chrome) [361 M]

Download video: Link (plays in Google Chrome) [357 M]

Course notes and slides

Software source code

Please follow the software tutorial to install and run the course software. Here was the example used in class:

# Read data from a web address: read.csv() is given a URL here instead of a
# local file path, and the result is stored in `batch`.
batch <- read.csv('http://openmv.net/file/batch-yields.csv')


Code used to illustrate how the q-q plot is constructed:

# Build a normal q-q plot by hand for a small sample, then compare the result
# with R's built-in q-q plotting functions.
N <- 10

# What are the quantiles from the theoretical normal distribution?
# The i-th of N sorted points is assigned the cumulative probability
# (i - 0.5)/N, which qnorm() converts to a z-value.
index <- seq_len(N)
P <- (index - 0.5) / N
theoretical.quantity <- qnorm(P)   # the theoretical quantiles (z-values)

# Our sampled data:
yields <- c(86.2, 85.7, 71.9, 95.3, 77.1, 71.4, 68.9, 78.9, 86.9, 78.4)
mean.yield <- mean(yields)       # 80.07
sd.yield <- sd(yields)           # 8.35

# What are the quantiles for the sampled data?
# Standardize to z-scores so they are on the same scale as qnorm()'s output.
yields.z <- (yields - mean.yield) / sd.yield
yields.z

yields.z.sorted <- sort(yields.z)

# Compare the values in text:
yields.z.sorted
theoretical.quantity

# Compare them graphically against the 45 degree line (intercept 0, slope 1);
# asp = 1 gives both axes the same scale.
plot(theoretical.quantity, yields.z.sorted, asp = 1)
abline(a = 0, b = 1)

# Built-in R functions to do all the above for you:
qqnorm(yields)
qqline(yields)

# A better function: see http://learnche.mcmaster.ca/4C3/Software_tutorial/Extending_R_with_packages
# "car" is an add-on package: only call it when it is installed, so that the
# rest of this script still runs on an R installation without it.
if (requireNamespace("car", quietly = TRUE)) {
  car::qqPlot(yields)
} else {
  message("Package 'car' is not installed; skipping qqPlot(). ",
          "Install it with install.packages('car').")
}

Code used to illustrate the central limit theorem's reduction in variance:

# Demonstrate that averaging subgroups of raw data reduces the spread: the raw
# values and the subgroup means are plotted side by side on the same y-scale.

# Average consecutive, non-overlapping subgroups of `values`.
#   values:  numeric vector of raw observations
#   subsize: number of consecutive observations in each subgroup
# Returns one mean per complete subgroup; observations left over after the
# last complete subgroup are not used.
subgroup.means <- function(values, subsize) {
  n.groups <- length(values) %/% subsize
  # One column per subgroup, so the column means are the subgroup means
  grouped <- matrix(values[seq_len(n.groups * subsize)],
                    nrow = subsize, ncol = n.groups)
  colMeans(grouped)
}

# Show the 3 plots side by side
layout(matrix(c(1, 2, 3), 1, 3))

# Sample the population:
N <- 100
x <- rnorm(N, mean = 80, sd = 5)
mean(x)
sd(x)

# Plot the raw data; its range is reused as the y-axis limits of every panel
# so that the spread of the three panels can be compared directly
x.range <- range(x)
plot(x, ylim = x.range, main = 'Raw data')

# Subgroups of 2
subsize <- 2
x.2 <- subgroup.means(x, subsize)
plot(x.2, ylim = x.range, main = 'Subgroups of 2')

# Subgroups of 4
subsize <- 4
x.4 <- subgroup.means(x, subsize)
plot(x.4, ylim = x.range, main = 'Subgroups of 4')

# Return to a single-panel layout so later plots are not drawn in thirds
layout(1)

Code to show how to superimpose plots:

# Load the raw material properties data set from a web address and print a
# summary of it.
data <- read.csv('http://openmv.net/file/raw-material-properties.csv')
summary(data)

# Single plot: only one vector is supplied, so it is drawn against its index
plot(data$density1)

# Connect the dots: type='b' draws both the points and the lines joining them
plot(data$density1, type='b')

# Another variable, drawn as a separate plot in red
plot(data$density2, type='b', col="red")

# Superimpose them? plot() draws density1, then lines() adds density2 to that
# same plot without changing the axis limits plot() chose
plot(data$density1, type='b', col="blue")
lines(data$density2, type='b', col="red")  # where's density2 ?

# Superimpose them: limits. The y-axis limits are set by hand with ylim
plot(data$density1, type='b', col="blue", ylim=c(10, 45))
lines(data$density2, type='b', col="red")  # now density2 shows up

Code to show how to deal with missing values:

# Load the raw material properties data set from a web address.
data <- read.csv('http://openmv.net/file/raw-material-properties.csv')
summary(data)  # notice the NAs in the columns: these refer to missing values (Not Available)

sd(data$density1)  # why NA as the answer?
help(sd)  # look up the function's arguments, including na.rm
sd(data$density1, na.rm=TRUE)  # no NA answer anymore: missing values are removed first

help(mad)
help(IQR)  # etc: all these functions accept an na.rm input