Introduction to Data Analysis and Graphics in R

author: Hellen Gakuruh
date: 2017-03-10
autosize: true

Slide 4: Summarizing Data

Outline

What we shall cover

Introduction

type: section

Introduction cont.

type: section

Measures of central tendency

type: section

=================================================================== type: sub-section

================================================================== type: sub-section

Locating median

type: sub-section

Determining mode

type: sub-section

Standard deviation (SD)

type: sub-section

Skewness

type: sub-section

Kurtosis

type: sub-section

================================================================ type: sub-section

Numerical summaries for discrete variables

type: section

============================================================== type: sub-section

# Data
# Fix the random seed so the simulated scores are the same on every run
set.seed(4)
# 50 draws from a normal distribution (mean 78, SD 1), rounded and stored as integers
scores <- as.integer(round(rnorm(50, 78, 1)))

# Source own function for printing frequency tables 
# NOTE(review): the path is user-specific; freq() is presumably defined in this script - verify
source("~/R/Scripts/desc-statistics.R")

# Frequency table
# Output lists each distinct score with its count and percentage (the counts below sum to 50)
freq(scores)
  Values Freq Perc
1     76    2    4
2     77    8   16
3     78   19   38
4     79   17   34
5     80    4    8

=================================================================== type: sub-section

# Mean
mean(scores)
[1] 78.26
# Median
median(scores)
[1] 78

# Range
# range() gives the minimum and maximum; diff() of that pair is the width of the range.
# The sentence reports the width first, then the (min, max) pair in parentheses.
cat("Range for this distribution is", diff(range(scores)), paste0("(", paste(range(scores), collapse = ", "), ")"))
Range for this distribution is 4 (76, 80)

==================================================================== type: sub-section

# Where 50% of values lie
# The middle 50% is bounded by the 25th and 75th percentiles; all three figures
# (both quartiles and the IQR) are rounded to whole numbers for display.
cat("50% of values lie between score of about", round(quantile(scores, 0.25)), "and", paste0(round(quantile(scores, 0.75)), ":"), "an IQR of about", round(IQR(scores)))
50% of values lie between score of about 78 and 79: an IQR of about 1
# Standard deviation (spread of values around mean)
sd(scores)
[1] 0.964894

====================================================================== type: sub-section

# Functions developed to measure and interpret skewness and kurtosis
# NOTE(review): the path is user-specific; m3_std(), excess_kurt() and the two
# *_interpreter() helpers are presumably defined in this script - verify
source("~/R/Scripts/skewness-kurtosis-fun.R")

# Skewness
# Numeric skewness measure first, then its verbal interpretation
m3_std(scores)
[1] -0.2551918
skewness_interpreter(m3_std(scores))
[1] "approximately symmetric"

# Kurtosis
# Numeric excess-kurtosis measure first, then its verbal interpretation
excess_kurt(scores)
[1] -0.365273
excess_interpreter(excess_kurt(scores))
[1] "approximately mesokurtic"

Conclusion (discrete numerical measures)

type: sub-section

Numerical summaries for continuous variables

type: section

============================================================ type: sub-section

# Example data
# Fix the random seed so the simulated heights are the same on every run
set.seed(4)
# 50 draws from a normal distribution with mean 5.4 (rnorm()'s default SD of 1),
# rounded to two decimal places
height <- round(rnorm(50, 5.4), 2)
# Print the values in ascending order so the extremes and the middle are easy to read off
sort(height)
 [1] 3.60 3.71 3.92 4.12 4.47 4.54 4.58 4.65 4.76 4.86 4.93 5.00 5.02 5.12
[15] 5.12 5.17 5.19 5.30 5.35 5.36 5.42 5.43 5.50 5.55 5.57 5.57 5.58 5.62
[29] 5.78 5.97 5.99 6.00 6.09 6.12 6.26 6.29 6.31 6.33 6.45 6.57 6.64 6.66
[43] 6.69 6.69 6.71 6.74 6.94 7.04 7.18 7.30

=============================================================== type: sub-section

# Averages (mean, then median)
mean(height)
[1] 5.6352
median(height)
[1] 5.57

# Dispersion
# Standard deviation
sd(height)
[1] 0.9184931
# Width of the range, followed by the minimum and maximum themselves
diff(range(height)); range(height)
[1] 3.7
[1] 3.6 7.3

================================================================ type: sub-section

# Interquartile range (distance between the 25th and 75th percentiles)
IQR(height)
[1] 1.28
# Modal Class (interval)
# NOTE(review): freq_continuous() is not defined or sourced in this chunk;
# presumably it comes from one of the scripts sourced earlier - verify
tab <- freq_continuous(height)
# Label (first column) of the row with the largest percentage
as.vector(tab[which.max(tab$Perc), 1])
[1] "(5,5.5]"

# Frequency table of heights grouped into intervals
freq_continuous(height)
   Values Freq Perc
1 (3.5,4]    3    6
2 (4,4.5]    2    4
3 (4.5,5]    7   14
4 (5,5.5]   11   22
5 (5.5,6]    9   18
6 (6,6.5]    7   14
7 (6.5,7]    8   16
8 (7,7.5]    3    6

============================================================ type: sub-section

# Skewness
# NOTE(review): m3_std() and skewness_interpreter() are presumably the helpers from
# the skewness/kurtosis script sourced earlier in the deck - verify
m3_std(height)
[1] -0.2186212
skewness_interpreter(m3_std(height))
[1] "approximately symmetric"

# Kurtosis
# Numeric excess-kurtosis measure first, then its verbal interpretation
excess_kurt(height)
[1] -0.7024805
excess_interpreter(excess_kurt(height))
[1] "moderately platykurtic"

Tables for dichotomous variables

type: section


# Fix the random seed so the simulated responses are the same on every run
set.seed(4)
# 100 "Yes"/"No" responses drawn with replacement (no prob given, so both are equally likely)
dichot <- sample(c("Yes", "No"), 100, replace = TRUE)
# Frequency table; with 100 observations the counts and percentages coincide
freq(dichot)
  Values Freq Perc
1     No   57   57
2    Yes   43   43

Tables for categorical variables

type: section

========================================================== type: sub-section

# Group labels: "a", "b", "c" repeated 200 times (600 labels in total)
groups <- rep(c("a", "b", "c"), 200)
# Fix the random seed so the simulated outcomes are the same on every run
set.seed(4)
# One outcome per group label, drawn with replacement with probabilities
# 0.7 (improved), 0.2 (same) and 0.1 (decreased)
outcome <- sample(c("improved", "same", "decreased"), length(groups), replace = TRUE, prob = c(0.7, 0.2, 0.1))
# One-way frequency table for each variable.
# Percentages are shown rounded, so they need not add up to exactly 100 (3 x 33 = 99 below).
freq(groups)
  Values Freq Perc
1      a  200   33
2      b  200   33
3      c  200   33

freq(outcome)
     Values Freq Perc
1 decreased   66   11
2  improved  418   70
3      same  116   19

Contingency table

type: sub-section

# Load the descriptive-statistics helpers
# NOTE(review): the path is user-specific; contigency_tab() is presumably defined in this script - verify
source("~/R/Scripts/desc-statistics.R")
# Two-way table with groups as rows and outcome as columns; each count is followed by a "perc" column.
# NOTE(review): the perc values look like column percentages
# (e.g. 22 of the 66 "decreased" cases is about 33%) - confirm against the function
contigency_tab(groups, outcome)
      outcome
groups decreased perc improved perc same perc
     a        22   33      136   33   42   36
     b        23   35      140   33   37   32
     c        21   32      142   34   37   32