Hellen Gakuruh
2017-03-31
n
n
edit()
fix()
data.entry()
, and dataentry()
scan()
can be quite handy in addition to calling functions for any of the data structures; c()
for vector, matrix()
, array()
, data.frame()
, and list()
scan()
is also not a good data entry process as it loses reproducibility, because data is entered interactively (at the console). We will look at:
"data.entry()"
"scan()"
c()
, matrix()
, array()
, data.frame()
, and list()
dataset2 <- scan(what = "character")
c()
for atomic vectors
matrix()
for matrices
array()
for arrays of one or more dimensions
data.frame()
for data frames
list()
for lists
# A numeric vector
num <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) # same as 1:10
# A logical vector
logi <- c(TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE)
# A character vector
R_authours <- c("Douglas Bates", "John Chambers", "Peter Dalgaard", "Seth Falcon", "Robert Gentleman", "Kurt Hornik", "Ross Ihaka", "Michael Lawrence", "Friedrich Leisch", "Uwe Ligges", "Thomas Lumley", "Martin Morgan", "Duncan Murdoch", "Paul Murrell", "Martyn Plummer", "Brian Ripley", "Deepayan Sarkar", "Duncan Temple Lang", "Luke Tierney", "Simon Urbanek")
matrix()
but rbind()
, cbind()
and as.matrix()
can be used to convert other vectors to a matrix
matrix()
can be called without any input, thus creating an empty matrix
mat1 <- matrix(data = 1:9, nrow = 3, dimnames = list(NULL, c("a", "b", "c")))
mat1
a b c
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
array()
n
dims <- list(1:3, c("a", "b", "c"), c("Yes", "No"))
arry <- array(data = seq(1, 9*2), dim = c(3, 3, 2), dimnames = dims)
arry
, , Yes
a b c
1 1 4 7
2 2 5 8
3 3 6 9
, , No
a b c
1 10 13 16
2 11 14 17
3 12 15 18
data.frame()
# Example of weight loss data set
dataset3 <- data.frame(ID = 1:5, Exercise = c(TRUE, TRUE, FALSE, TRUE, FALSE), Height = c(5.2, 4.9, 5.1, 5.2, 5.4), Weight = c(69, 72, 75, 67, 77))
dataset3
ID Exercise Height Weight
1 1 TRUE 5.2 69
2 2 TRUE 4.9 72
3 3 FALSE 5.1 75
4 4 TRUE 5.2 67
5 5 FALSE 5.4 77
list()
lst <- list(vect = 5:9, Matrix = mat1, Array = arry, Dataframe = dataset3, List = list("a", 2:3))
str(lst)
List of 5
$ vect : int [1:5] 5 6 7 8 9
$ Matrix : int [1:3, 1:3] 1 2 3 4 5 6 7 8 9
..- attr(*, "dimnames")=List of 2
.. ..$ : NULL
.. ..$ : chr [1:3] "a" "b" "c"
$ Array : int [1:3, 1:3, 1:2] 1 2 3 4 5 6 7 8 9 10 ...
..- attr(*, "dimnames")=List of 3
.. ..$ : chr [1:3] "1" "2" "3"
.. ..$ : chr [1:3] "a" "b" "c"
.. ..$ : chr [1:2] "Yes" "No"
$ Dataframe:'data.frame': 5 obs. of 4 variables:
..$ ID : int [1:5] 1 2 3 4 5
..$ Exercise: logi [1:5] TRUE TRUE FALSE TRUE FALSE
..$ Height : num [1:5] 5.2 4.9 5.1 5.2 5.4
..$ Weight : num [1:5] 69 72 75 67 77
$ List :List of 2
..$ : chr "a"
..$ : int [1:2] 2 3
mode()
and length()
are used to establish the mode and length of an object
mode(num)
[1] "numeric"
mode(mat1)
[1] "numeric"
mode(arry)
[1] "numeric"
n
mode(dataset3)
[1] "list"
mode(lst)
[1] "list"
# Atomic vector
length(num)
[1] 10
# Matrix
length(mat1)
[1] 9
mat1; dim(mat1)
a b c
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
[1] 3 3
# Data frames
length(dataset3) # This shows number of variables not cases/rows
[1] 4
# Lists
length(lst)
[1] 5
mode()
which is an S-compatible function for checking type
storage.mode()
which is used for compatibility when calling functions written in other languages (ensures data is of the expected type)
typeof()
which is basically R's implementation of S's mode()
typeof()
is adequate
mode()
storage.mode()
attributes()
attr()
mode
and length
other often used attributes are:
names()
and set with names(object) <-
colnames()
is used for matrix-like objects
# Creating an unnamed vector
vect1 <- c(12, 54, 98)
names(vect1)
NULL
# Naming vector elements
names(vect1) <- c("a", "b", "c")
names(vect1)
[1] "a" "b" "c"
n
# An unnamed matrix
mat3 <- matrix(1:9, 3)
mat3
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
n
# Naming a matrix
colnames(mat3) <- c("a", "b", "c")
mat3
a b c
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
dim()
is used to query an object's dimensions and dim <-
is used to set the dimensions of an object
# An atomic vector (dimensionless)
vect2 <- 1
vect2
[1] 1
dim(vect2)
NULL
n
# Converting to 1 dimension array
dim(vect2) <- 1
vect2
[1] 1
dim(vect2)
[1] 1
n
# Matrix with no dimnames
vect3 <- matrix(1:9, 3)
vect3
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
n
# Adding dimnames
dimnames(vect3) <- list(1:3, c("a", "b", "c"))
vect3
a b c
1 1 4 7
2 2 5 8
3 3 6 9
class()
and class <-
respectively or attr(obj, "class")
S3
object. This makes it part of R's Object Oriented Programming (OOP)
# Intrinsic class attribute
vect <- 1:5; class(vect)
[1] "integer"
# (Assigned) Class attribute
attr(vect, "class")
NULL
n
# Add class with either
attr(vect, which = "class") <- "myclass"
# OR class(vect) <- "myclass"
# Query class attribute
attr(vect, "class")
[1] "myclass"
n
# Random annual data
set.seed(28)
tms <- round(rnorm(12, 56))
tms
[1] 54 56 55 54 56 57 56 56 56 58 55 58
attributes(tms)
NULL
n
# Adding attribute `tsp`
tsp(tms) <- c(start = 1, end = 12, frequency = 1)
attributes(tms)
$tsp
[1] 1 12 1
n
help(package = datasets)
Data Structure | Number |
---|---|
Array | 1 |
Character (1 dim vector) | 2 |
Data frame | 46 |
Dist (Distance Matrix) | 2 |
Factor (1 dim integer vector) | 2 |
List | 4 |
Matrix | 8 |
Numeric (1 dim vector) | 6 |
Table (Atomic vectors) | 51 |
ts | 28 |
n
if-else
, ifelse
and for
if-else
can only be performed if the condition evaluates to a single logical value, either TRUE or FALSE
x <- c("a", "b", "c")
class(x)
[1] "character"
if (class(x) == "character") {
x <- as.factor(x)
} else {
x
}
class(x)
[1] "factor"
for()
is a looping structure used to perform repetitive tasks
for
iterates from a certain value through a sequence, performing the action defined in its body (the body of any function, including a for loop, is what is in between {})
for(i in 1:5) { # variable "i" is a counter (counting from 1 to 5)
cat("Hello \n") # function "cat" is used to print to console
}
Hello
Hello
Hello
Hello
Hello
n
'data.frame': 71 obs. of 2 variables:
$ weight: num 179 160 136 227 217 168 108 124 143 140 ...
$ feed : Factor w/ 6 levels "casein","horsebean",..: 2 2 2 2 2 2 2 2 2 2 ...
n
# Current categories of variable of interest (feed)
levels(chickwts$feed)
[1] "casein" "horsebean" "linseed" "meatmeal" "soybean" "sunflower"
# Recoding with function "ifelse"
chickwts$feed2 <- ifelse(chickwts$feed == "casein", yes = "casein", no = "other")
# Converting to a factor vector
chickwts$feed2 <- factor(chickwts$feed2)
# New levels
levels(chickwts$feed2)
[1] "casein" "other"
names() <-
function
# Current name
names(chickwts)
[1] "weight" "feed" "feed2"
# Renaming variables (all must be provided)
names(chickwts) <- c("weight", "feed", "casein")
names(chickwts)
[1] "weight" "feed" "casein"
is.na()
used to check for missing values
complete.cases()
might be more appropriate
# Vector with a missing value
vect1 <- c(letters[1:5], NA); vect1
[1] "a" "b" "c" "d" "e" NA
# A logical vector checking for missing values
is.na(vect1)
[1] FALSE FALSE FALSE FALSE FALSE TRUE
n
vect2 <- letters[1:6]
mat3 <- rbind(vect1, vect2)
mat3
[,1] [,2] [,3] [,4] [,5] [,6]
vect1 "a" "b" "c" "d" "e" NA
vect2 "a" "b" "c" "d" "e" "f"
n
complete.cases(mat3)
[1] FALSE TRUE
as.Date()
specifying argument format as detailed by ?strftime
as.Date()
can also be used to convert a numeric vector to a date object, by specifying argument origin; origin in R is “1970-01-01”
# Converting a character vector
date1char <- c("3/6/2017", "3/7/2017", "4/7/2017")
class(date1char)
[1] "character"
date1 <- as.Date(date1char, format = "%m/%e/%Y")
date1
[1] "2017-03-06" "2017-03-07" "2017-04-07"
class(date1)
[1] "Date"
# Converting a numeric vector
date1num <- c(17231, 17232, 17263)
class(date1num)
[1] "numeric"
date2 <- as.Date(date1num, origin = "1970-01-01")
date2
[1] "2017-03-06" "2017-03-07" "2017-04-07"
class(date2)
[1] "Date"
as.data_type
like as.logical()
, as.integer()
, as.double()
, as.character()
, as.raw()
, and as.complex()
sort()
order()
apply
sort
is done in increasing order by default; this can be reversed by setting argument “decreasing” to TRUE
# Unsorted random numbers
set.seed(58)
tosort <- round(rnorm(10, 87, 10))
tosort
[1] 83 91 97 80 81 68 84 92 106 96
# Sorted vector (increasing)
sort(tosort)
[1] 68 80 81 83 84 91 92 96 97 106
# Sorted vector (decreasing)
sort(tosort, TRUE)
[1] 106 97 96 92 91 84 83 81 80 68
n
mat2sort <- matrix(tosort[-1], 3, dimnames = list(1:3, c("a", "b", "c")))
mat2sort
a b c
1 91 81 92
2 97 68 106
3 80 84 96
n
# Sort by columns of a matrix
apply(mat2sort, 2, sort)
a b c
[1,] 80 68 92
[2,] 91 81 96
[3,] 97 84 106
set.seed(3)
v1 <- round(rnorm(9, 50, 10))
set.seed(3)
v2 <- round(rnorm(9, 90))
set.seed(3)
logi <- sample(c(TRUE, FALSE), 9, TRUE, c(0.7, 0.3))
df1 <- data.frame(Logi = logi, V1 = v1, V2 = v2)
# Sorted by first variable "logi"
df1[order(df1$Logi, decreasing = TRUE),]
Logi V1 V2
1 TRUE 40 89
3 TRUE 53 90
4 TRUE 38 89
5 TRUE 52 90
6 TRUE 50 90
7 TRUE 51 90
8 TRUE 61 91
9 TRUE 38 89
2 FALSE 47 90
df1[order(df1$Logi, df1$V1, decreasing = TRUE),]
Logi V1 V2
8 TRUE 61 91
3 TRUE 53 90
5 TRUE 52 90
7 TRUE 51 90
6 TRUE 50 90
1 TRUE 40 89
4 TRUE 38 89
9 TRUE 38 89
2 FALSE 47 90
# Negative sign used to indicate decreasing
df1[order(-df1$Logi, df1$V1), ]
Logi V1 V2
4 TRUE 38 89
9 TRUE 38 89
1 TRUE 40 89
6 TRUE 50 90
7 TRUE 51 90
5 TRUE 52 90
3 TRUE 53 90
8 TRUE 61 91
2 FALSE 47 90
# Additional data set
dataset4 <- data.frame(ID = 6:10, Exercise = c(TRUE, FALSE, TRUE, TRUE, FALSE), Height = c(5.4, 5.4, 5.2, 5.6, 5.4), Weight = c(77, 74, 75, 79, 82))
# Similar columns to be used for merging
intersect(names(dataset3), names(dataset4))
[1] "ID" "Exercise" "Height" "Weight"
# Merging (adding cases)
merge(dataset3, dataset4, all = TRUE)
ID Exercise Height Weight
1 1 TRUE 5.2 69
2 2 TRUE 4.9 72
3 3 FALSE 5.1 75
4 4 TRUE 5.2 67
5 5 FALSE 5.4 77
6 6 TRUE 5.4 77
7 7 FALSE 5.4 74
8 8 TRUE 5.2 75
9 9 TRUE 5.6 79
10 10 FALSE 5.4 82
Look at:
[
[[
$
, and getElement()
"["
can select more than one element and keeps their names if present while "[["
and "$"
can only select one element, without its name
"$"
is only applicable for recursive objects (generic/list data structures), basically data frames and lists
"getElement()"
function is similar to extracting with "[["
n
[
, although [[
can also be used to select a single element without its names attribute
vect1
[1] "a" "b" "c" "d" "e" NA
# Index vector: Elements that are not NA
!is.na(vect1)
[1] TRUE TRUE TRUE TRUE TRUE FALSE
# Subset non-na values
vect1[!is.na(vect1)]
[1] "a" "b" "c" "d" "e"
# Subsetting with an empty index
tms[]
[1] 54 56 55 54 56 57 56 56 56 58 55 58
# Empty index useful for replacement while keeping attributes
set.seed(3)
tms[] <- sample(1:100, 12)
tms
[1] 17 80 38 32 58 96 12 28 54 95 47 45
attr(,"tsp")
[1] 1 12 1
n
# Some of my favourite fruits
fruits <- c(Mangoes = 50, Apples = 35, Pineapples = 20)
n
fruits["Mangoes"]
Mangoes
50
fruits[["Mangoes"]]
[1] 50
[
and [[
# One of R's data set
USPersonalExpenditure
1940 1945 1950 1955 1960
Food and Tobacco 22.200 44.500 59.60 73.2 86.80
Household Operation 10.500 15.500 29.00 36.5 46.20
Medical and Health 3.530 5.760 9.71 14.0 21.10
Personal Care 1.040 1.980 2.45 3.4 5.40
Private Education 0.341 0.974 1.80 2.6 3.64
# Subsetting with an empty index
USPersonalExpenditure[]
1940 1945 1950 1955 1960
Food and Tobacco 22.200 44.500 59.60 73.2 86.80
Household Operation 10.500 15.500 29.00 36.5 46.20
Medical and Health 3.530 5.760 9.71 14.0 21.10
Personal Care 1.040 1.980 2.45 3.4 5.40
Private Education 0.341 0.974 1.80 2.6 3.64
# Subsetting with one index
USPersonalExpenditure[5]
[1] 0.341
# Subsetting with dimensions
USPersonalExpenditure[1, ] # Subset 1st row, all columns
1940 1945 1950 1955 1960
22.2 44.5 59.6 73.2 86.8
USPersonalExpenditure[1, 1] # Subset 1st row, first column
[1] 22.2
USPersonalExpenditure[3, "1950"] # Subset 3rd row, column 3 "1950"
[1] 9.71
USPersonalExpenditure[, "1960"] # Subset an entire column, drops dimension
Food and Tobacco Household Operation Medical and Health
86.80 46.20 21.10
Personal Care Private Education
5.40 3.64
# Maintaining dimension
USPersonalExpenditure[, "1960", drop = FALSE]
1960
Food and Tobacco 86.80
Household Operation 46.20
Medical and Health 21.10
Personal Care 5.40
Private Education 3.64
dim(USPersonalExpenditure[, "1960", drop = FALSE])
[1] 5 1
[
, [[
and $
) can be used
[
can select more than one element
[[
and $
can select one item; the difference is that $
cannot be used with computed values like “i + 1” (index + 1)
subset()
Example data set: USArrests
# Viewing first 6 rows
head(USArrests)
Murder Assault UrbanPop Rape
Alabama 13.2 236 58 21.2
Alaska 10.0 263 48 44.5
Arizona 8.1 294 80 31.0
Arkansas 8.8 190 50 19.5
California 9.0 276 91 40.6
Colorado 7.9 204 78 38.7
# Computing the median (used here as the "average") of assault, murder and rape using "$"
avg_murder <- median(USArrests$Murder)
avg_assault <- median(USArrests$Assault)
avg_rape <- median(USArrests$Rape)
# Using "[" subset states with above average assault, murder and rape
high_crime <- USArrests[USArrests$Murder > avg_murder & USArrests$Assault > avg_assault & USArrests$Rape > avg_rape, ]
# Sort (by decreasing order for Murder) and output names of states
high_crime <- high_crime[order(high_crime$Murder, decreasing = TRUE),]
row.names(high_crime)
[1] "Georgia" "Florida" "Louisiana" "South Carolina"
[5] "Alabama" "Tennessee" "Texas" "Nevada"
[9] "Michigan" "New Mexico" "Maryland" "New York"
[13] "Illinois" "Alaska" "California" "Missouri"
[17] "Arizona" "Colorado"
# Subset a column without name attribute
high_crime[[1]]
[1] 17.4 15.4 15.4 14.4 13.2 13.2 12.7 12.2 12.1 11.4 11.3 11.1 10.4 10.0
[15] 9.0 9.0 8.1 7.9
# Or
USArrests[["Assault"]]
[1] 236 263 294 190 276 204 110 238 335 211 46 120 249 113 56 115 109
[18] 249 83 300 149 255 72 259 178 109 102 252 57 159 285 254 337 45
[35] 120 151 159 106 174 279 86 188 201 120 48 156 145 81 53 161
subset
can be used to subset any vector, but is most suitable for data frames
with()
to access variables without making reference to the data frame name
high_crime2 <- with(USArrests, subset(USArrests, Murder > avg_murder & Assault > avg_assault & Rape > avg_rape, Murder:Rape))
high_crime2 <- high_crime2[order(high_crime2$Murder, decreasing = TRUE), ]
# Check both data sets are identical
identical(high_crime, high_crime2)
[1] TRUE
[
returns a list, subsetting with [[
or $
outputs the same type as element being subset i.e. if list has data frame, subsetting with [[
or $
will output a data frame
# Example data
state.center; class(state.center)
$x
[1] -86.7509 -127.2500 -111.6250 -92.2992 -119.7730 -105.5130 -72.3573
[8] -74.9841 -81.6850 -83.3736
$y
[1] 32.5901 49.2500 34.2192 34.7336 36.5341 38.6777 41.5928 38.6777
[9] 27.8744 32.3329
[1] "list"
# Using `[` outputs a list
state.center[1]
$x
[1] -86.7509 -127.2500 -111.6250 -92.2992 -119.7730 -105.5130 -72.3573
[8] -74.9841 -81.6850 -83.3736
class(state.center[1])
[1] "list"
# Using `[[` outputs elements type
state.center[[1]]
[1] -86.7509 -127.2500 -111.6250 -92.2992 -119.7730 -105.5130 -72.3573
[8] -74.9841 -81.6850 -83.3736
class(state.center[[1]])
[1] "numeric"
# Using "$" outputs elements type
state.center$x
[1] -86.7509 -127.2500 -111.6250 -92.2992 -119.7730 -105.5130 -72.3573
[8] -74.9841 -81.6850 -83.3736
class(state.center$x)
[1] "numeric"
Function | Description |
---|---|
str | A compact display of the internal structure of a data frame |
head | Prints first part, default is first 6 rows |
tail | Prints last part, default is last 6 rows |
attach | Puts data frame on R's search path, hence variables are accessible without reference to data frame name |
detach | Removes data frame from R's search path. Recommended after completion of task |
Function | Description |
---|---|
with | Recommended alternative to attach; makes it possible to run expressions/functions on a data frame's elements |
which | Locates indices of logical value TRUE. Used for indexing data frame elements |