Chapter 21 - Iteration

21.2.1 Exercises

1. Write for loops to:

Compute the mean of every column in mtcars.

# walk over the column names of mtcars and print each name next to the mean
# of that column (c() turns the mean into a string alongside the name)
for (col_name in names(mtcars)) {
  col_mean <- mean(mtcars[[col_name]])
  print(c(col_name, col_mean))
}

## [1] "mpg"       "20.090625"
## [1] "cyl"    "6.1875"
## [1] "disp"       "230.721875"
## [1] "hp"       "146.6875"
## [1] "drat"      "3.5965625"
## [1] "wt"      "3.21725"
## [1] "qsec"     "17.84875"
## [1] "vs"     "0.4375"
## [1] "am"      "0.40625"
## [1] "gear"   "3.6875"
## [1] "carb"   "2.8125"

Determine the type of each column in nycflights13::flights.

# print each column name of flights followed by the class vector of that column
flights_df <- nycflights13::flights
for (col_name in names(flights_df)) {
  col_class <- class(flights_df[[col_name]])
  print(c(col_name, col_class))
}

## [1] "year"    "integer"
## [1] "month"   "integer"
## [1] "day"     "integer"
## [1] "dep_time" "integer" 
## [1] "sched_dep_time" "integer"       
## [1] "dep_delay" "numeric"  
## [1] "arr_time" "integer" 
## [1] "sched_arr_time" "integer"       
## [1] "arr_delay" "numeric"  
## [1] "carrier"   "character"
## [1] "flight"  "integer"
## [1] "tailnum"   "character"
## [1] "origin"    "character"
## [1] "dest"      "character"
## [1] "air_time" "numeric" 
## [1] "distance" "numeric" 
## [1] "hour"    "numeric"
## [1] "minute"  "numeric"
## [1] "time_hour" "POSIXct"   "POSIXt"

Compute the number of unique values in each column of iris.

# print each column name of iris together with its number of distinct values
for (col_name in names(iris)) {
  n_distinct_values <- length(unique(iris[[col_name]]))
  print(c(col_name, n_distinct_values))
}

## [1] "Sepal.Length" "35"          
## [1] "Sepal.Width" "23"         
## [1] "Petal.Length" "43"          
## [1] "Petal.Width" "22"         
## [1] "Species" "3"

Generate 10 random normals for each of μ=−10,0,10, and 100.

# draw and print ten standard-deviation-1 normals around each requested mean
means <- c(-10, 0, 10, 100)
for (mu in means) {
  draws <- rnorm(n = 10, mean = mu)
  print(draws)
}

##  [1]  -9.542133 -10.383358 -10.617098  -9.313501  -7.463658  -9.586247
##  [7]  -9.421090  -9.638472  -9.422668 -10.978171
##  [1] -0.7398129  1.0338034 -0.3695387  0.2969773 -0.2615178  0.4294088
##  [7] -0.5317923 -0.6347465  1.1473329 -0.8340858
##  [1] 11.511716 10.842998 10.932023 10.068180 10.532980 10.034497 11.432468
##  [8]  8.899156  9.640771 10.233389
##  [1]  96.89362 100.77170 100.80290  98.66342  99.94461 100.67008  99.43201
##  [8]  98.76813 101.47166  99.92212

2. Eliminate the for loop in each of the following examples by taking advantage of an existing function that works with vectors:

out <- ""
for (x in letters) {
  out <- stringr::str_c(out, x)
}
out

## [1] "abcdefghijklmnopqrstuvwxyz"

# use str_c() with collapse to put all the letters together.
str_c(letters, collapse = "")

## [1] "abcdefghijklmnopqrstuvwxyz"

x <- sample(100)
sd <- 0
for (i in seq_along(x)) {
  sd <- sd + (x[i] - mean(x)) ^ 2
}
sd <- sqrt(sd / (length(x) - 1))
sd

## [1] 29.01149

# if we only want to replace the for loop, then use: sum( (x-mean(x))^2 )
sqrt( sum( (x-mean(x))^2 ) / (length(x) - 1) )

## [1] 29.01149

# otherwise just use the sd() function instead of the for loop
sd(x)

## [1] 29.01149

x <- runif(100)
out <- vector("numeric", length(x))
out[1] <- x[1]
for (i in 2:length(x)) {
  out[i] <- out[i - 1] + x[i]
}
out

##   [1]  0.6313162  0.6350445  1.1486326  1.2040190  2.1091825  2.5919667
##   [7]  3.5285872  3.5669447  3.8200857  4.2717039  5.2522156  6.2230116
##  [13]  6.2547521  6.8618498  7.6779305  8.0026124  8.5884170  8.9350433
##  [19]  9.4358700  9.6799386  9.8543304  9.9217561 10.6799236 11.2705327
##  [25] 11.3392938 11.5473192 11.7873142 12.5941426 12.6774564 12.7050889
##  [31] 13.2318285 14.1942259 14.8976633 15.6473616 16.1710684 16.6063363
##  [37] 17.5369760 18.0194281 18.8200097 19.2988724 20.0333152 20.9567123
##  [43] 21.8056936 22.6026529 23.1870958 23.9420246 24.1123016 24.6263081
##  [49] 24.8890961 25.8257526 26.3472017 27.3124038 27.3452063 27.8161866
##  [55] 27.8432487 27.8733143 28.0645089 28.5740391 28.6505543 29.1707966
##  [61] 29.4401222 29.8309588 30.0853641 30.7624207 31.5143308 32.0077487
##  [67] 32.5189378 33.1957695 33.3022747 33.9595799 34.4133002 34.9947384
##  [73] 35.3375869 35.5890653 35.8753518 36.1899076 37.1314045 37.6922446
##  [79] 38.3816194 39.1572676 39.4777520 40.3120925 40.4726386 41.1175236
##  [85] 42.0939471 42.7347619 43.1511472 43.4366199 43.7438335 44.4980710
##  [91] 44.5894311 45.4152320 46.1072375 47.0610684 47.3500387 48.2433428
##  [97] 48.4237648 49.4193446 50.2648524 50.3503730

# the same output can be achieved by calculating the cumulative sum (cumsum()):
cumsum(x)

##   [1]  0.6313162  0.6350445  1.1486326  1.2040190  2.1091825  2.5919667
##   [7]  3.5285872  3.5669447  3.8200857  4.2717039  5.2522156  6.2230116
##  [13]  6.2547521  6.8618498  7.6779305  8.0026124  8.5884170  8.9350433
##  [19]  9.4358700  9.6799386  9.8543304  9.9217561 10.6799236 11.2705327
##  [25] 11.3392938 11.5473192 11.7873142 12.5941426 12.6774564 12.7050889
##  [31] 13.2318285 14.1942259 14.8976633 15.6473616 16.1710684 16.6063363
##  [37] 17.5369760 18.0194281 18.8200097 19.2988724 20.0333152 20.9567123
##  [43] 21.8056936 22.6026529 23.1870958 23.9420246 24.1123016 24.6263081
##  [49] 24.8890961 25.8257526 26.3472017 27.3124038 27.3452063 27.8161866
##  [55] 27.8432487 27.8733143 28.0645089 28.5740391 28.6505543 29.1707966
##  [61] 29.4401222 29.8309588 30.0853641 30.7624207 31.5143308 32.0077487
##  [67] 32.5189378 33.1957695 33.3022747 33.9595799 34.4133002 34.9947384
##  [73] 35.3375869 35.5890653 35.8753518 36.1899076 37.1314045 37.6922446
##  [79] 38.3816194 39.1572676 39.4777520 40.3120925 40.4726386 41.1175236
##  [85] 42.0939471 42.7347619 43.1511472 43.4366199 43.7438335 44.4980710
##  [91] 44.5894311 45.4152320 46.1072375 47.0610684 47.3500387 48.2433428
##  [97] 48.4237648 49.4193446 50.2648524 50.3503730

3. Combine your function writing and for loop skills:

Write a for loop that prints() the lyrics to the children’s song “Alice the camel”.

Alice the Camel has one hump.
Alice the Camel has one hump.
Alice the Camel has one hump.
Go Alice go!

The english package has a handy function, as.english(), that converts numbers to their corresponding English words, which we can use to write out this poem. For example, the number 1 can be converted to “one” and written out. This makes looping through numbers and printing them out easy. The code below prints the poem out if Alice started out with 3 humps. This isn’t exactly the same as the real poem, but you can get the idea.

# Print "Alice the Camel", counting down from `num_humps` humps to none.
# A for loop over num_humps:0 replaces the while loop with a manual decrement,
# as the exercise asks for a for loop.
num_humps <- 3
for (humps_left in num_humps:0) {
  if (humps_left == 0) {
    cat("Alice the Camel has no more humps! End of poem.")
  } else {
    # english::as.english() gives the word form of the count ("three", "two", ...)
    eng_num_humps <- english::as.english(humps_left)
    # singular "hump" only when exactly one hump is left
    hump_word <- if (humps_left == 1) "hump.\n" else "humps.\n"
    verse_line <- paste("Alice the Camel has", eng_num_humps, hump_word)
    # the line is sung three times before the refrain
    cat(stringr::str_c(rep(verse_line, 3), collapse = ""))
    cat("Go Alice go!\n\n")
  }
}

## Alice the Camel has three humps.
## Alice the Camel has three humps.
## Alice the Camel has three humps.
## Go Alice go!
## 
## Alice the Camel has two humps.
## Alice the Camel has two humps.
## Alice the Camel has two humps.
## Go Alice go!
## 
## Alice the Camel has one hump.
## Alice the Camel has one hump.
## Alice the Camel has one hump.
## Go Alice go!
## 
## Alice the Camel has no more humps! End of poem.

Convert the nursery rhyme “ten in the bed” to a function. Generalise it to any number of people in any sleeping structure.

There were ten in the bed
And the little one said,
"Roll over! Roll over!"
So they all rolled over and
one fell out

... nine ... eight ... etc.

There were two in the bed
And the little one said,
"Roll over! Roll over!"
So they all rolled over and one fell out

There was one in the bed
And the little one said,

"Alone at last!"

# Print the "ten in the bed" rhyme for `num_people` sleepers in `struct`.
#
# num_people: how many sleepers the rhyme starts with (default 10).
# struct:     name of the sleeping structure (default "bed").
# Called for its printed output; nothing is printed when num_people is not
# greater than zero.
sleeping_people_poem <- function(num_people = 10, struct = "bed") {
  remaining <- num_people
  while (remaining > 0) {
    # word form of the current head count ("ten", "nine", ...)
    count_word <- english::as.english(remaining)

    if (remaining != 1) {
      cat(paste0("\nThere were ", count_word, " in the ", struct))
      cat("\nAnd the little one said,\nRoll over! Roll over!\nSo they all rolled over and one fell out\n")
    } else {
      # final verse: only one sleeper left
      cat(paste0("\nThere was ", count_word, " in the ", struct))
      cat("\nAnd the little one said,\nAlone at last!")
    }

    remaining <- remaining - 1
  }
}
sleeping_people_poem()

## 
## There were ten in the bed
## And the little one said,
## Roll over! Roll over!
## So they all rolled over and one fell out
## 
## There were nine in the bed
## And the little one said,
## Roll over! Roll over!
## So they all rolled over and one fell out
## 
## There were eight in the bed
## And the little one said,
## Roll over! Roll over!
## So they all rolled over and one fell out
## 
## There were seven in the bed
## And the little one said,
## Roll over! Roll over!
## So they all rolled over and one fell out
## 
## There were six in the bed
## And the little one said,
## Roll over! Roll over!
## So they all rolled over and one fell out
## 
## There were five in the bed
## And the little one said,
## Roll over! Roll over!
## So they all rolled over and one fell out
## 
## There were four in the bed
## And the little one said,
## Roll over! Roll over!
## So they all rolled over and one fell out
## 
## There were three in the bed
## And the little one said,
## Roll over! Roll over!
## So they all rolled over and one fell out
## 
## There were two in the bed
## And the little one said,
## Roll over! Roll over!
## So they all rolled over and one fell out
## 
## There was one in the bed
## And the little one said,
## Alone at last!

We use a similar concept as above to generate the poem.

Convert the song “99 bottles of beer on the wall” to a function. Generalise to any number of any vessel containing any liquid on any surface.

99 bottles of beer on the wall, 99 bottles of beer.
Take one down and pass it around, 98 bottles of beer on the wall.

...

No more bottles of beer on the wall, no more bottles of beer. 
Go to the store and buy some more, 99 bottles of beer on the wall.

We use a similar concept as above, but to generalize for any number of any vessel for any liquid, we pass in parameters to a function containing the loop, with the defaults set to bottles of beer. So that this markdown document isn’t insanely long, I’ve cut down the number of bottles to 3.

# Print a "99 bottles of beer" style countdown song.
#
# num_vessels:  number of vessels the song starts with (default 99).
# vessel_type:  plural name of the vessel (default "bottles").
# liquid_type:  what the vessels contain (default "beer").
# surface_type: where the vessels sit (default "wall").
# Called for its printed output.
count_down_poem <- function(num_vessels = 99, vessel_type = "bottles",
                            liquid_type = "beer", surface_type = "wall") {
  contents <- paste0(vessel_type, " of ", liquid_type)
  remaining <- num_vessels

  while (remaining > 0) {
    opening <- paste0(remaining, " ", contents, " on the ", surface_type, ", ", remaining, " ", contents, ".\n")
    cat(opening)

    # one vessel is taken down before the second line of the verse
    remaining <- remaining - 1
    if (remaining != 0) {
      closing <- paste0("Take one down and pass it around, ", remaining, " ", contents, " on the ", surface_type, ".\n\n")
    } else {
      closing <- paste0("Take one down and pass it around, no more ", contents, " on the ", surface_type, ".\n\n")
    }
    cat(closing)
  }

  # final verse restocks to the count the song started with
  cat(paste0("No more ", contents, " on the ", surface_type, ", ", "no more ", contents, ".\n"))
  cat(paste0("Go to the store and buy some more, ", num_vessels, " ", contents, " on the ", surface_type, ".\n\n"))
}
count_down_poem(num_vessels = 3)

## 3 bottles of beer on the wall, 3 bottles of beer.
## Take one down and pass it around, 2 bottles of beer on the wall.
## 
## 2 bottles of beer on the wall, 2 bottles of beer.
## Take one down and pass it around, 1 bottles of beer on the wall.
## 
## 1 bottles of beer on the wall, 1 bottles of beer.
## Take one down and pass it around, no more bottles of beer on the wall.
## 
## No more bottles of beer on the wall, no more bottles of beer.
## Go to the store and buy some more, 3 bottles of beer on the wall.

count_down_poem(num_vessels = 3, vessel_type = "tanks", liquid_type = "water", surface_type = "farm")

## 3 tanks of water on the farm, 3 tanks of water.
## Take one down and pass it around, 2 tanks of water on the farm.
## 
## 2 tanks of water on the farm, 2 tanks of water.
## Take one down and pass it around, 1 tanks of water on the farm.
## 
## 1 tanks of water on the farm, 1 tanks of water.
## Take one down and pass it around, no more tanks of water on the farm.
## 
## No more tanks of water on the farm, no more tanks of water.
## Go to the store and buy some more, 3 tanks of water on the farm.

4. It’s common to see for loops that don’t preallocate the output and instead increase the length of a vector at each step:

# build a list of 100000 numeric vectors, each of a random length from 1 to 100
n_elements <- 100000
x <- vector("list", n_elements)
for (idx in seq_len(n_elements)) {
  draw_count <- sample(100, 1)
  x[[idx]] <- rnorm(draw_count, mean = 10, sd = 1)
}

# time a loop that grows the result vector by one element on every iteration:
output <- integer(0)
system.time(
for (idx in seq_along(x)) {
  output <- c(output, length(x[[idx]]))
}
)

##    user  system elapsed 
##  15.977   7.459  23.675

# time a loop that writes each result into a vector allocated up front:
output <- integer(length(x))
system.time(
for (idx in seq_along(x)) {
  output[idx] <- length(x[[idx]])
}
)

##    user  system elapsed 
##    0.07    0.00    0.07

How does this affect performance? Design and execute an experiment.

Preallocation significantly increases the performance of the loop, especially when there are large numbers of iterations involved. I generate a list of 100,000 numeric vectors of differing length. The loops record the length of each vector in the list. Using system.time() to measure how long each for loop takes, we find that it takes roughly 24 seconds of elapsed time (about 16 seconds of user CPU time) if the vector’s length is increased at each step, whereas the loop takes less than a tenth of a second if preallocation is used. This is quite a significant performance improvement!

21.3.5 Exercises

1. Imagine you have a directory full of CSV files that you want to read in. You have their paths in a vector, files <- dir(“data/”, pattern = “\.csv$”, full.names = TRUE), and now want to read each one with read_csv(). Write the for loop that will load them into a single data frame.

I wrote the diamonds dataset to a csv file twice and stored the files in a folder called test_output (this folder is part of .gitignore so it does not show up in this repo). The loop below should read the csv files, store the tables in a list, then bind the list into a data frame.

#write.csv(diamonds, file = "diamonds.csv")
files <- dir("test_output/", pattern = "\\.csv$", full.names = TRUE)
files

## character(0)

# Read every CSV into a preallocated list, then stack the tables row-wise so
# the result is one data frame holding the rows of all files.
output <- vector("list", length(files))
for (i in seq_along(files)) {
  output[[i]] <- read_csv(files[[i]])
}
# bind_rows() appends the rows of each table; bind_cols() would instead place
# the files side by side as extra columns, which is not a combined data set.
output <- bind_rows(output)

2. What happens if you use for (nm in names(x)) and x has no names? What if only some of the elements are named? What if the names are not unique?

If there are no names, names(x) is NULL and the loop does not execute. If only some of the elements are named, all elements of x are iterated through, but nm will be an empty string ("") for the unnamed elements, or NA when the names were assigned by index as in the example below. If the names are not unique, the loop acts normally–all elements will be iterated through and the non-unique names will still be used. Example below:

# Print the name of every element of `x`, one per line.
# Prints nothing when `x` has no names attribute.
print_names <- function(x) {
  element_names <- names(x)
  for (idx in seq_along(element_names)) {
    print(element_names[[idx]])
  }
}

# if x has no names:
x <- c(1:10)
print_names(x) # nothing happens

names(x)[2:5] <- letters[1:4]
print_names(x) # NA is printed when names do not exist

## [1] NA
## [1] "a"
## [1] "b"
## [1] "c"
## [1] "d"
## [1] NA
## [1] NA
## [1] NA
## [1] NA
## [1] NA

names(x) <- c(letters[1:5], letters[1:5])
print_names(x)

## [1] "a"
## [1] "b"
## [1] "c"
## [1] "d"
## [1] "e"
## [1] "a"
## [1] "b"
## [1] "c"
## [1] "d"
## [1] "e"

3. Write a function that prints the mean of each numeric column in a data frame, along with its name. For example, show_mean(iris) would print:

# Print "<column name>: <mean>" for every numeric column of `df`.
#
# df: a data frame, tibble or named list of columns; non-numeric columns are
#     skipped.
# Called for its printed output.
print_means <- function (df) {
  for (i in seq_along(df)) {
    # `[[` always extracts the column itself; df[, i] stays a one-column tibble
    # when df is a tibble, so is.numeric() would be FALSE for every column
    column <- df[[i]]
    if (is.numeric(column)) {
      print(paste0(names(df)[i], ": ", mean(column)))
    }
  }
}

print_means(iris)

## [1] "Sepal.Length: 5.84333333333333"
## [1] "Sepal.Width: 3.05733333333333"
## [1] "Petal.Length: 3.758"
## [1] "Petal.Width: 1.19933333333333"

# show_mean(iris)
#> Sepal.Length: 5.84
#> Sepal.Width:  3.06
#> Petal.Length: 3.76
#> Petal.Width:  1.20

(Extra challenge: what function did I use to make sure that the numbers lined up nicely, even though the variable names had different lengths?)

If we want to make the numbers line up nicely, we can find the length of the longest column name and add spaces to the other column names to let them match up. The function that could do this (add spaces to strings) would be str_pad, part of stringr. I would first find which column names correspond to numeric columns. Then, I would add the colon using str_c(). I would then determine the maximum str length using str_length() and max(), and then apply this length to the str_pad() function in order to add the appropriate number of spaces.

# Print the mean of every numeric column of `df`, formatted to two decimals and
# padded so that the numbers start in the same column.
#
# df: a data frame or tibble; non-numeric columns are skipped.
# Called for its printed output.
print_means <- function (df) {
  # find out which columns are numeric; vapply() hands each column to
  # is.numeric() as a vector, so this also works for tibbles (where df[, i]
  # would be a one-column tibble and never numeric) and avoids growing a vector
  is_num <- vapply(df, is.numeric, logical(1))
  numeric_cols <- names(df)[is_num]
  # nothing to print, and nothing to take max() of, without numeric columns
  if (length(numeric_cols) == 0) {
    return(invisible(NULL))
  }
  # add the colon to the column name
  padded_cols <- str_c(numeric_cols, ": ")
  # determine the length of the longest label
  max_str_length <- max(str_length(padded_cols))
  
  # print out a padded version of the name with the mean to two decimals;
  # sprintf() keeps the trailing zero ("1.20") that round() drops ("1.2")
  for (i in seq_along(padded_cols)) {
    col_mean <- mean(df[[numeric_cols[i]]])
    print(paste0( str_pad(padded_cols[i], max_str_length, "right"), sprintf("%.2f", col_mean) ))
  }
}
print_means(iris)

## [1] "Sepal.Length: 5.84"
## [1] "Sepal.Width:  3.06"
## [1] "Petal.Length: 3.76"
## [1] "Petal.Width:  1.20"

4. What does this code do? How does it work?

# Named list of transformation functions; the loop further down uses each name
# to pick the mtcars column the function is applied to.
trans <- list( 
  # scale the values by 0.0163871 (presumably cubic inches to litres; TODO confirm)
  disp = function(x) x * 0.0163871,
  # turn the values into a factor whose levels are labelled "auto" and "manual"
  am = function(x) {
    factor(x, labels = c("auto", "manual"))
  }
)
head(mtcars[,c("disp", "am")])

##                   disp am
## Mazda RX4          160  1
## Mazda RX4 Wag      160  1
## Datsun 710         108  1
## Hornet 4 Drive     258  0
## Hornet Sportabout  360  0
## Valiant            225  0

# For each name in `trans`, replace the mtcars column of that name with the
# result of calling the matching function on it.
# NOTE: this assigns into `mtcars`, so every later use of `mtcars` in this
# session sees the rescaled disp column and the factor am column.
for (var in names(trans)) {
  mtcars[[var]] <- trans[[var]](mtcars[[var]])
}
head(mtcars[,c("disp", "am")])

##                       disp     am
## Mazda RX4         2.621936 manual
## Mazda RX4 Wag     2.621936 manual
## Datsun 710        1.769807 manual
## Hornet 4 Drive    4.227872   auto
## Hornet Sportabout 5.899356   auto
## Valiant           3.687098   auto

The first chunk of code defines a list called “trans” which contains two entries: one named “disp”, a function that multiplies a value by 0.0163871, and another named “am”, a function that converts its input into a factor whose two levels are labelled “auto” and “manual”.

Afterwards, the for loop iterates through the names of the items in the list trans (“disp” and “am”), which are also names of columns in the built-in R dataset mtcars. The code overwrites the existing columns with new values according to the function called by trans[[var]]. For example, trans[["disp"]] multiplies the disp column in mtcars by 0.0163871, and the loop then stores the result back in that column. The column “am” will be updated from 1 and 0 values to “manual” and “auto”.

21.4.1 Exercises

1. Read the documentation for apply(). In the 2d case, what two for loops does it generalise?

apply(), as the name suggests, will apply a function of your choosing to either all the rows (MARGIN = 1), all the columns (MARGIN = 2), or every individual cell (MARGIN = c(1,2)). The for loops that it generalizes are those that iterate sequentially through each column of the df, or through each row of the df, or a nested loop that iterates through each row within each column. A short example is below, which calculates the mean of each column of a dataset (MARGIN = 2), or of each row of a dataset (MARGIN = 1).

mtcars_subset <- mtcars[1:5,1:5]


apply(mtcars_subset, 2, mean) # by column

##        mpg        cyl       disp         hp       drat 
##  20.980000   6.000000   3.428181 119.600000   3.576000

apply(mtcars_subset, 1, mean) # by row

##         Mazda RX4     Mazda RX4 Wag        Datsun 710    Hornet 4 Drive 
##          28.70439          28.70439          25.08396          28.94157 
## Hornet Sportabout 
##          42.14987

apply(mtcars_subset, c(1,2), mean) # by both col and row

##                    mpg cyl     disp  hp drat
## Mazda RX4         21.0   6 2.621936 110 3.90
## Mazda RX4 Wag     21.0   6 2.621936 110 3.90
## Datsun 710        22.8   4 1.769807  93 3.85
## Hornet 4 Drive    21.4   6 4.227872 110 3.08
## Hornet Sportabout 18.7   8 5.899356 175 3.15

# theoretical loop that apply (MARGIN = 2) generalizes, similar to col_summary() from this chapter
#
# df:  a data frame.
# fun: summary function returning a single number for one column.
# Returns an unnamed double vector with one entry per column of df.
apply_mean_col <- function(df, fun) {
  col_results <- numeric(ncol(df))
  for (col_idx in seq_along(df)) {
    current_col <- df[[col_idx]]
    col_results[col_idx] <- fun(current_col)
  }
  col_results
}
apply_mean_col(mtcars_subset, mean)

## [1]  20.980000   6.000000   3.428181 119.600000   3.576000

# theoretical loop that apply (MARGIN = 1) generalizes
#
# df:  a data frame.
# fun: summary function returning a single number for one row.
# Returns an unnamed double vector with one entry per row of df.
apply_mean_row <- function(df, fun) {
  out <- vector("double", nrow(df))
  # seq_len() yields an empty sequence for a zero-row data frame, whereas
  # 1:nrow(df) would iterate over 1 and 0 and produce a spurious NA entry
  for (i in seq_len(nrow(df))) {
    # unlist() flattens the one-row data frame into a plain vector for fun
    out[i] <- fun(unlist(df[i, ]))
  }
  out
}
apply_mean_row(mtcars_subset, mean)

## [1] 28.70439 28.70439 25.08396 28.94157 42.14987

2. Adapt col_summary() so that it only applies to numeric columns. You might want to start with an is_numeric() function that returns a logical vector that has a TRUE corresponding to each numeric column.

# Apply a summary function to every numeric column of a data frame.
#
# df:  a data frame or tibble.
# fun: summary function returning a single number for one column.
# Returns a double vector named after the numeric columns of df.
col_summary_numeric <- function(df, fun) {
  # logical vector with a TRUE for each numeric column; vapply() guarantees a
  # logical result even when df has no columns
  numeric_cols <- vapply(df, is.numeric, logical(1))
  # list-style subsetting keeps a data frame even when only one column is
  # numeric; df[, numeric_cols] would collapse that case to a bare vector
  df_numeric <- df[unname(numeric_cols)]
  out <- vector("double", length(df_numeric))
  for (i in seq_along(df_numeric)) {
    out[i] <- fun(df_numeric[[i]])
  }
  names(out) <- names(df_numeric)
  out
}

col_summary_numeric(mtcars, mean)

##        mpg        cyl       disp         hp       drat         wt 
##  20.090625   6.187500   3.780862 146.687500   3.596563   3.217250 
##       qsec         vs       gear       carb 
##  17.848750   0.437500   3.687500   2.812500

21.5.3 Exercises

1. Write code that uses one of the map functions to:

Compute the mean of every column in mtcars.

# Use the pristine data set from the datasets package: the session's `mtcars`
# was overwritten by the trans loop in 21.3.5 (disp rescaled, am turned into a
# factor), which makes mean() warn and return NA for am.
map_dbl(datasets::mtcars, mean)

##        mpg        cyl       disp         hp       drat         wt 
##  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250 
##       qsec         vs         am       gear       carb 
##  17.848750   0.437500   0.406250   3.687500   2.812500

Determine the type of each column in nycflights13::flights.

map_chr(nycflights13::flights, typeof)

##           year          month            day       dep_time sched_dep_time 
##      "integer"      "integer"      "integer"      "integer"      "integer" 
##      dep_delay       arr_time sched_arr_time      arr_delay        carrier 
##       "double"      "integer"      "integer"       "double"    "character" 
##         flight        tailnum         origin           dest       air_time 
##      "integer"    "character"    "character"    "character"       "double" 
##       distance           hour         minute      time_hour 
##       "double"       "double"       "double"       "double"

Compute the number of unique values in each column of iris.

map_int(iris, function(a) length(unique(a)))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           35           23           43           22            3

Generate 10 random normals for each of μ=−10,0,10, and 100.

# The first mean is -10, as the exercise asks. purrr:: is spelled out because
# the error output recorded below came from maps::map() masking purrr::map()
# (its message refers to `regions`, an argument of the maps function).
purrr::map(c(-10, 0, 10, 100), function (b) rnorm(10, b, 1))

## Error in paste("(^", regions, ")", sep = "", collapse = "|"): cannot coerce type 'closure' to vector of type 'character'

# also works
# purrr::map(c(-10, 0, 10, 100), ~ rnorm(10, ., 1))

2. How can you create a single vector that for each column in a data frame indicates whether or not it’s a factor?

Use map() to run is.factor() on each column.

map_lgl(iris, is.factor)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##        FALSE        FALSE        FALSE        FALSE         TRUE

3. What happens when you use the map functions on vectors that aren’t lists? What does map(1:5, runif) do? Why?

The output is the same whether the vector is a list or an atomic vector. map(1:5,runif) calls runif(1), runif(2), runif(3), runif(4), and runif(5), as does map(list(1,2,3,4,5), runif).

# purrr:: is spelled out because maps::map() masked purrr::map() when this
# document was rendered; that masking produced the error output recorded in
# this section instead of the lists of random numbers.
purrr::map(1:5, runif)

## Error in paste("(^", regions, ")", sep = "", collapse = "|"): cannot coerce type 'closure' to vector of type 'character'

purrr::map(list(1,2,3,4,5), runif)

## Error in paste("(^", regions, ")", sep = "", collapse = "|"): cannot coerce type 'closure' to vector of type 'character'

4. What does map(-2:2, rnorm, n = 5) do? Why? What does map_dbl(-2:2, rnorm, n = 5) do? Why?

map(-2:2, rnorm, n = 5) calls rnorm(n=5,-2), rnorm(n=5,-1), rnorm(n=5,0), rnorm(n=5,1), and rnorm(n=5,2), and returns the output as a list of vectors. However, map_dbl(-2:2, rnorm, n = 5) results in an error. This is because map_dbl cannot return a list of vectors, and can only return one vector in which the values are all doubles.

# purrr:: is spelled out because the error recorded below came from
# maps::map() masking purrr::map(); with purrr this returns a list of five
# vectors, each holding five draws.
purrr::map(-2:2, rnorm, n = 5)

## Error in map(-2:2, rnorm, n = 5): argument 3 matches multiple formal arguments

# map_dbl(-2:2, rnorm, n = 5) # Error: Result 1 must be a single double, not a double vector of length 5

5. Rewrite map(x, function(df) lm(mpg ~ wt, data = df)) to eliminate the anonymous function.

This mapping function assumes that the dataset x has multiple entries in which a linear model can be fitted between variables mpg and wt. This looks like it was meant to analyze the mtcars dataset, which we can subset using split(). For example, let’s split by column “am”. Then, the mapping call should calculate a linear model between mpg and wt for “auto” cars as well as “manual” cars.

x <- split(mtcars, mtcars$am)

# purrr:: is spelled out in both calls because the errors recorded in this
# section came from maps::map() masking purrr::map() (note map.poly in the
# second message).

# orig function
purrr::map(x, function(df) lm(mpg ~ wt, data = df))

## Error in paste("(^", regions, ")", sep = "", collapse = "|"): cannot coerce type 'closure' to vector of type 'character'

# rewrite function to eliminate anonymous function
purrr::map(x, ~ lm(mpg ~ wt, data = .))

## Error in map.poly(database, regions, exact, xlim, ylim, boundary, interior, : no recognized region names

21.9.3 Exercises

1. Implement your own version of every() using a for loop. Compare it with purrr::every(). What does purrr’s version do that your version doesn’t?

# purrr::every()
every(mtcars, is.numeric)

## [1] FALSE

every(mtcars, is.atomic)

## [1] TRUE

# my implementation
# Return TRUE when `fun` holds for every element of `x`, FALSE otherwise.
#
# x:   a list or vector whose elements are tested one at a time.
# fun: predicate called as fun(element, ...).
# ...: further arguments passed on to fun.
# An empty x gives TRUE. Every element is checked; there is no early exit.
my_every <- function (x, fun, ...) {
  # assume success until some element is shown to fail
  all_pass <- TRUE
  for (idx in seq_along(x)) {
    passed <- fun(x[[idx]], ...)
    if (passed == FALSE) {
      all_pass <- FALSE
    }
  }
  all_pass
}

my_every(mtcars, is.numeric)

## [1] FALSE

my_every(mtcars, is.atomic)

## [1] TRUE

When looking at the source code behind purrr::every(), I see that they tested whether there are any NA values in the input, and return NA if true. They also use is_false() instead of == F, which seems to be safer. They also return a default value of TRUE unless one of the items in the input does not satisfy the logical function, which then ends the for loop early by returning FALSE. In retrospect, this method is much more efficient compared to my version because it will end the loop early rather than having to finish iterating through what could potentially be a very large loop.

2. Create an enhanced col_summary() that applies a summary function to every numeric column in a data frame.

# Apply a summary function to every numeric column of a data frame.
#
# df:  a data frame or tibble.
# fun: summary function returning a single number for one column.
# Returns a double vector named after the numeric columns of df.
col_summary_numeric <- function(df, fun) {
  # first determine which columns are numeric; vapply() guarantees a logical
  # result even when df has no columns
  numeric_cols <- vapply(df, is.numeric, logical(1))

  # subset the data based on only the numeric columns; list-style subsetting
  # keeps a data frame even when a single column is numeric, whereas
  # df[, numeric_cols] would collapse that case to a bare vector
  df_numeric <- df[unname(numeric_cols)]
  
  # apply the summary function to each of the columns in the subsetted data
  out <- vector("double", length(df_numeric))
  for (i in seq_along(df_numeric)) {
    out[i] <- fun(df_numeric[[i]])
  }
  
  # annotate and return the output
  names(out) <- names(df_numeric)
  out
}

col_summary_numeric(mtcars, mean)

##        mpg        cyl       disp         hp       drat         wt 
##  20.090625   6.187500   3.780862 146.687500   3.596563   3.217250 
##       qsec         vs       gear       carb 
##  17.848750   0.437500   3.687500   2.812500

3. A possible base R equivalent of col_summary() is:

# Base R sketch of col_summary() as given in the exercise. It is reproduced
# unchanged because the exercise is about the bugs it contains.
#
# df: a data frame or tibble.
# f:  function applied to each numeric column.
col_sum3 <- function(df, f) {
  # sapply() chooses its own return shape; the transcript further down shows
  # it returning an empty named list for a data frame with no columns
  is_num <- sapply(df, is.numeric)
  # keep only the columns flagged as numeric
  df_num <- df[, is_num]

  # apply f to each remaining column, again letting sapply() pick the shape
  sapply(df_num, f)
}

But it has a number of bugs as illustrated with the following inputs:

df <- tibble(
  x = 1:3, 
  y = 3:1,
  z = c("a", "b", "c")
)
# OK
col_sum3(df, mean)

## x y 
## 2 2

# Has problems: don't always return numeric vector
col_sum3(df[1:2], mean)

## x y 
## 2 2

col_sum3(df[1], mean)

## x 
## 2

# col_sum3(df[0], mean) # Error: Can't subset with `[` using an object of class list.

What causes the bugs?

To view the errors more in-depth, we can use purrr::safely() to observe what’s going on. For the first two “problematic” entries, there does not seem to be an error, as the $error portion is NULL. Furthermore, using typeof( col_sum3(df[1:2], mean)) returns “double”, suggesting that the function is indeed returning a numeric vector in this instance. We do observe an error for col_sum3(df[0], mean), which tries to call the col_sum3 function on an empty data frame. The error comes from the line, df_num <- df[, is_num]. This is because is_num is an empty list, and trying to subset an empty data frame with an empty list results in the error. The code below will walk through this phenomenon.

safely_col_sum3 <- safely(col_sum3)

safely_col_sum3(df[1:2], mean)

## $result
## x y 
## 2 2 
## 
## $error
## NULL

safely_col_sum3(df[1], mean)

## $result
## x 
## 2 
## 
## $error
## NULL

safely_col_sum3(df[0], mean)

## $result
## NULL
## 
## $error
## <error>
## message: Can't subset with `[` using an object of class list.
## class:   `rlang_error`
## backtrace:
##   1. rmarkdown::render_site(output_format = "bookdown::gitbook", encoding = "UTF-8")
##  30. purrr:::safely_col_sum3(df[0], mean)
##  39. global::.f(...)
##  41. tibble:::`[.tbl_df`(df, , is_num)
##  42. tibble:::check_names_df(j, x)
## Call `rlang::last_trace()` to see the full backtrace

# df[0] creates an empty data frame
df <- df[0]
df

## # A tibble: 3 x 0

# is_num is an empty list
is_num <- sapply(df, is.numeric)
is_num

## named list()

# this throws the error
# df_num <- df[, is_num] # Error: Can't subset with `[` using an object of class list.