Difference between revisions of "Univariate data analysis (2013)"

Latest revision as of 09:39, 13 January 2016

Class date(s):

15 to 29 January 2013

(PDF)



Download video: Link (plays in Google Chrome) [360 M]



Download video: Link (plays in Google Chrome) [362 M]



Download video: Link (plays in Google Chrome) [353 M]



Download video: Link (plays in Google Chrome) [392 M]



Download video: Link (plays in Google Chrome) [328 M]



Download video: Link (plays in Google Chrome) [361 M]



Download video: Link (plays in Google Chrome) [357 M]

Course notes and slides

Course textbook (print chapter 2)
Slides for class

Software source code

Please follow the software tutorial to install and run the course software. Here is the example used in class:

# Load the batch yields data set straight from its URL
batch <- read.csv(file = "http://openmv.net/file/batch-yields.csv")

Code used to illustrate how the q-q plot is constructed:

# Step-by-step construction of a normal q-q plot for a small sample, followed
# by the built-in functions that do the same job.

# Our sampled data:
yields <- c(86.2, 85.7, 71.9, 95.3, 77.1, 71.4, 68.9, 78.9, 86.9, 78.4)
mean.yield <- mean(yields)       # 80.07
sd.yield <- sd(yields)           # 8.35

# Take N from the data (10 here) instead of hard-coding it, so that the
# theoretical and sample quantiles always have the same length, even if
# `yields` is edited.
N <- length(yields)

# What are the quantiles from the theoretical normal distribution?
# P is the cumulative probability assigned to the i-th smallest value:
# (i - 0.5) / N, for i = 1, ..., N
index <- seq_len(N)
P <- (index - 0.5) / N
theoretical.quantity <- qnorm(P)

# What are the quantiles for the sampled data?
# Standardize (z-scores) so they are on the same scale as qnorm()'s output
yields.z <- (yields - mean.yield) / sd.yield
yields.z

# Sort, so the i-th smallest sample value is paired with the i-th quantile
yields.z.sorted <- sort(yields.z)

# Compare the values in text:
yields.z.sorted
theoretical.quantity

# Compare them graphically: asp=1 gives both axes the same scale, and
# abline() adds the 45 degree reference line y = x
plot(theoretical.quantity, yields.z.sorted, asp = 1)
abline(a = 0, b = 1)

# Built-in R function to do all the above for you:
qqnorm(yields)
qqline(yields)

# A better function: see http://learnche.mcmaster.ca/4C3/Software_tutorial/Extending_R_with_packages
# "car" is an add-on package: only use it when it is installed, so the rest
# of this script still runs on a fresh R installation.
if (requireNamespace("car", quietly = TRUE)) {
    library(car)
    qqPlot(yields)
} else {
    message("Package 'car' is not installed; see the tutorial link above to install it.")
}

Code used to illustrate the central limit theorem's reduction in variance:

# Demonstrates that averaging subgroups of raw data reduces the spread:
# the same y-axis limits are used for all three plots so they can be compared.

# Average consecutive, non-overlapping subgroups of `subsize` values from `x`.
#   x:       numeric vector of raw values
#   subsize: number of consecutive values to average in each subgroup
# Returns one mean per complete subgroup; values left over at the end of `x`
# that do not fill a whole subgroup are dropped.
subgroup.means <- function(x, subsize)
{
    n.groups <- length(x) %/% subsize
    # matrix() fills column by column, so each column is one subgroup
    groups <- matrix(x[seq_len(n.groups * subsize)], nrow = subsize)
    colMeans(groups)
}

# Show the 3 plots side by side
layout(matrix(c(1, 2, 3), 1, 3))

# Sample the population:
N <- 100
x <- rnorm(N, mean = 80, sd = 5)
mean(x)
sd(x)

# Plot the raw data
x.range <- range(x)
plot(x, ylim = x.range, main = 'Raw data')

# Subgroups of 2
x.2 <- subgroup.means(x, subsize = 2)
plot(x.2, ylim = x.range, main = 'Subgroups of 2')

# Subgroups of 4
x.4 <- subgroup.means(x, subsize = 4)
plot(x.4, ylim = x.range, main = 'Subgroups of 4')

# Go back to one plot per page, so later plots are not squeezed into the
# 3-panel layout set up above
layout(1)

Code to show how to superimpose plots:

# Demonstrates how to draw two variables on the same axes.

# Download the raw material properties data set and summarize its columns
data <- read.csv('http://openmv.net/file/raw-material-properties.csv')
summary(data)

# Single plot: with only one vector given, the values are plotted in sequence
plot(data$density1)

# Connect the dots: type='b' draws both the points and the lines between them
plot(data$density1, type='b')

# Another variable: this call starts a new plot, it does not add to the old one
plot(data$density2, type='b', col="red")

# Superimpose them? plot() starts the figure and lines() adds to it
plot(data$density1, type='b', col="blue")
lines(data$density2, type='b', col="red")  # where's density2 ? (presumably outside the y-axis range that plot() picked for density1)

# Superimpose them: limits. Set the y-axis limits explicitly with ylim
# NOTE(review): 10 to 45 is presumably wide enough for both columns; confirm against the data
plot(data$density1, type='b', col="blue", ylim=c(10, 45))
lines(data$density2, type='b', col="red")  # now density2 shows up

Code to show how to deal with missing values:

# Fetch the raw material properties and summarize every column
data <- read.csv(file = "http://openmv.net/file/raw-material-properties.csv")
summary(data)  # notice the NAs in the columns: these refer to missing values (Not Available)

# By default the missing values propagate into the result
sd(x = data$density1)  # why NA as the answer?
help("sd")

# Ask for the missing values to be removed before calculating
sd(x = data$density1, na.rm = TRUE)  # no NA answer anymore!

help("mad")
help("IQR")  # etc: all these functions accept an na.rm input

@@ Line 1: / Line 1: @@
 __NOTOC__{{ClassSidebar
-| date = 15 to 24 January 2013
+| date = 15 to 29 January 2013
 | dates_alt_text =
-| vimeoID1 =
+| vimeoID1 = 57483092
-| vimeoID2 =
+| vimeoID2 = 57550116
-| vimeoID3 =
+| vimeoID3 = 57708665
-| vimeoID4 =
+| vimeoID4 = 57962292
-| vimeoID5 =
+| vimeoID5 = 58049398
-| vimeoID6 =
+| vimeoID6 = 58214822
-| vimeoID7 =
+| vimeoID7 = 58487266
 | vimeoID8 =
 | course_notes_PDF = 2013-4C3-Overheads-Univariate-data-analysis.pdf
@@ Line 16: / Line 16: @@
 | assignment_solutions =
 | video_download_link_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-02A.mp4
-| video_download_link_MP4_size = M
+| video_download_link_MP4_size =360 M
 | video_notes1 =
 | video_download_link2_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-02B.mp4
-| video_download_link2_MP4_size = 355 M
+| video_download_link2_MP4_size = 362 M
 | video_notes2 =
 | video_download_link3_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-02C.mp4
-| video_download_link3_MP4_size = M
+| video_download_link3_MP4_size = 353 M
 | video_notes3 =
 | video_download_link4_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-03A.mp4
-| video_download_link4_MP4_size = M
+| video_download_link4_MP4_size = 392 M
 | video_notes4 =
 | video_download_link5_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-03B.mp4
-| video_download_link5_MP4_size = M
+| video_download_link5_MP4_size = 328 M
 | video_notes5 =
 | video_download_link6_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-03C.mp4
-| video_download_link6_MP4_size = M
+| video_download_link6_MP4_size = 361 M
 | video_notes6 =
-| video_download_link7_MP4 = http://learnche.mcmaster.ca/media/
+| video_download_link7_MP4 = http://learnche.mcmaster.ca/media/4C3-2013-Class-04A.mp4
-| video_download_link7_MP4_size = M
+| video_download_link7_MP4_size = 357 M
 | video_notes7 =
-| video_download_link8_MP4 = http://learnche.mcmaster.ca/media/
-| video_download_link8_MP4_size = M
-| video_notes8 =
 }}
@@ Line 46: / Line 43: @@
 * [[Image:Nuvola_mimetypes_pdf.png|20px|link=Media:2013-4C3-Overheads-Univariate-data-analysis.pdf]] [[Media:2013-4C3-Overheads-Univariate-data-analysis.pdf|Slides for class]]
-<!-- === Class materials ===
+== Software source code ==
-* 15 Jan 2013 (Class 02A): <!-- [http://learnche.mcmaster.ca/media/4C3-2013-Class-01B.mp3 Audio] and  [http://learnche.mcmaster.ca/media/4C3-2013-Class-01B.mp4 video] -->
- -->
+Please follow the [[Software_tutorial|software tutorial]] to install and run the course software. Here was the example used in class:
-== Other readings ==
-* [http://www.investopedia.com/articles/financial-theory/11/lie-with-financial-statistics.asp How To Lie With Financial Statistics], Investopedia, November 2011
+<syntaxhighlight lang="rsplus">
-* [http://vita.had.co.nz/papers/boxplots.pdf 40 years of boxplots]
+# Read data from a web address
+batch <- read.csv('http://openmv.net/file/batch-yields.csv')
+</syntaxhighlight>
+Code used to illustrate how the q-q plot is constructed:
+<syntaxhighlight lang="rsplus">
+N <- 10
+# What are the quantiles from the theoretical normal distribution?
+index <- seq(1, N)
+P <- (index - 0.5) / N
+theoretical.quantity <- qnorm(P)
+# Our sampled data:
+yields <- c(86.2, 85.7, 71.9, 95.3, 77.1, 71.4, 68.9, 78.9, 86.9, 78.4)
+mean.yield <- mean(yields)       # 80.0
+sd.yield <- sd(yields)           # 8.35
+# What are the quantiles for the sampled data?
+yields.z <- (yields - mean.yield)/sd.yield
+yields.z
+yields.z.sorted <- sort(yields.z)
+# Compare the values in text:
+yields.z.sorted
+theoretical.quantity
+# Compare them graphically:
+plot(theoretical.quantity, yields.z.sorted, asp=1)
+abline(a=0, b=1)
+# Built-in R function to do all the above for you:
+qqnorm(yields)
+qqline(yields)
+# A better function: see http://learnche.mcmaster.ca/4C3/Software_tutorial/Extending_R_with_packages
+library(car)
+qqPlot(yields)
+</syntaxhighlight>
+Code used to illustrate the central limit theorem's reduction in variance:
+<syntaxhighlight lang="rsplus">
+# Show the 3 plots side by side
+layout(matrix(c(1,2,3), 1, 3))
+# Sample the population:
+N <- 100
+x <- rnorm(N, mean=80, sd=5)
+mean(x)
+sd(x)
+# Plot the raw data
+x.range <- range(x)
+plot(x, ylim=x.range, main='Raw data')
+# Subgroups of 2
+subsize <- 2
+x.2 <- numeric(N/subsize)
+for (i in 1:(N/subsize))
+{
+    x.2[i] <- mean(x[((i-1)*subsize+1):(i*subsize)])
+}
+plot(x.2, ylim=x.range, main='Subgroups of 2')
+# Subgroups of 4
+subsize <- 4
+x.4 <- numeric(N/subsize)
+for (i in 1:(N/subsize))
+{
+    x.4[i] <- mean(x[((i-1)*subsize+1):(i*subsize)])
+}
+plot(x.4, ylim=x.range, main='Subgroups of 4')
+</syntaxhighlight>
+Code to show how to superimpose plots:
+<syntaxhighlight lang="rsplus">
+data <- read.csv('http://openmv.net/file/raw-material-properties.csv')
+summary(data)
+# Single plot
+plot(data$density1)
+# Connect the dots
+plot(data$density1, type='b')
+# Another variable
+plot(data$density2, type='b', col="red")
+# Superimpose them?
+plot(data$density1, type='b', col="blue")
+lines(data$density2, type='b', col="red")  # where's density2 ?
+# Superimpose them: limits
+plot(data$density1, type='b', col="blue", ylim=c(10, 45))
+lines(data$density2, type='b', col="red")  # now density2 shows up
+</syntaxhighlight>
+Code to show how to deal with missing values:
+<syntaxhighlight lang="rsplus">
+data <- read.csv('http://openmv.net/file/raw-material-properties.csv')
+summary(data)  # notice the NAs in the columns: these refer to missing values (Not Available)
+sd(data$density1)  # why NA as the answer?
+help(sd)
+sd(data$density1, na.rm=TRUE)  # no NA answer anymore!
+help(mad)
+help(IQR)  # etc: all these functions accept an na.rm input
+</syntaxhighlight>

Difference between revisions of "Univariate data analysis (2013)"

Latest revision as of 09:39, 13 January 2016

Course notes and slides

Software source code

Navigation menu

Search