In this document we will briefly practice working with different vector types. Follow the instructions in the comments of each code chunk.
# In a recent coding competition, Liz came first, Alex came second
# and Jiena came third.
# Here is a vector with their names (in alphabetical order, not finishing
# order).
name <- c("Alex", "Jiena", "Liz")
# Check that this vector is a character vector of length 3
str(name)
## chr [1:3] "Alex" "Jiena" "Liz"
# Create a corresponding numeric vector with each person's position.
# Element i of 'position' is the finishing place of element i of 'name'.
position <- c(2, 3, 1)
# Check that your vector is a numeric vector of length three.
str(position)
## num [1:3] 2 3 1
# Convert the 'name' character vector to a factor vector called 'name_factor'.
# Check that the conversion succeeded.
# With no explicit levels, the levels are the sorted unique values.
name_factor <- factor(name)
str(name_factor)
## Factor w/ 3 levels "Alex","Jiena",..: 1 2 3
# Convert your 'name' character vector to an ordered factor vector
# in increasing position order. Check that the conversion succeeded.
# Here the lowest level is the last-placed person and the highest level is
# the winner. The levels are derived from 'position' rather than typed out
# by hand, so they cannot drift out of step with the results above.
name_ordered <- factor(
  name,
  levels = name[order(position, decreasing = TRUE)],
  ordered = TRUE
)
str(name_ordered)
## Ord.factor w/ 3 levels "Jiena"<"Alex"<..: 2 1 3
# EXTENSION: Use a function to display the vector in order of position (highest to lowest)?
# sort() on an ordered factor sorts by level, so decreasing = TRUE puts the
# winner first.
sort(name_ordered, decreasing = TRUE)
## [1] Liz Alex Jiena
## Levels: Jiena < Alex < Liz
# Here is a URL for an online CSV of data about charity donations
url <- "https://peopleanalytics-regression-book.org/data/charity_donation.csv"
# Download this into a dataframe with a name of your choice
# (read.csv() reads directly from a URL as well as from a local path)
charity_data <- url |>
  read.csv()
# Inspect the dataframe to find out how many rows of data there are
# and what the column names and data types are
charity_data |>
  str()
## 'data.frame': 354 obs. of 8 variables:
## $ n_donations : int 2 3 3 2 6 8 3 10 5 3 ...
## $ total_donations: int 310 1200 1200 760 2540 3440 1200 4330 2100 1200 ...
## $ time_donating : int 27 22 37 19 37 35 36 42 27 34 ...
## $ recent_donation: int 0 0 1 1 0 1 0 0 1 0 ...
## $ last_donation : int 4 6 11 1 11 1 4 1 6 4 ...
## $ gender : chr "M" "M" "M" "F" ...
## $ reside : chr "Urban Domestic" "Rural Domestic" "Urban Domestic" "Overseas" ...
## $ age : int 29 41 77 66 26 65 60 50 82 41 ...
# Load the dplyr package
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Seek help on the functions filter() and pull() from dplyr
# What do these functions do and what are their arguments?
# Naming the package avoids landing on the stats::filter() help page,
# which shares the same topic name.
help("filter", package = "dplyr")
help("pull", package = "dplyr")
# Using the pipe operator, write code to calculate the average amount
# donated by Rural Domestic individuals, rounded to the nearest dollar.
# Remember that == (double equals) is used for precise equality
# Steps: keep the matching rows, extract the donation column as a plain
# vector, average it, then round to zero decimal places.
charity_data |>
  dplyr::filter(reside == "Rural Domestic") |>
  dplyr::pull(var = total_donations) |>
  mean() |>
  round(digits = 0)
## [1] 2565
# EXTENSION: Write similar code to calculate the average amount donated
# by males in the age range of 30-39 inclusive, rounded to the nearest dollar
# Both conditions must hold for a row to be kept.
charity_data |>
  dplyr::filter(gender == "M" & age %in% 30:39) |>
  dplyr::pull(var = total_donations) |>
  mean() |>
  round(digits = 0)
## [1] 2187
# Using your charity dataset from the previous exercises, plot
# total_donations (y-axis) against age (x-axis)
plot(x = charity_data$age, y = charity_data$total_donations)
# Using an appropriate function, plot a histogram of total donations.
# Use Help if you need to.
hist(x = charity_data$total_donations)
# EXTENSION: If you know ggplot2, use it to create a boxplot of
# total donations grouped by gender.
library(ggplot2)
# One box per gender on the x-axis, donation totals on the y-axis.
ggplot(charity_data, aes(gender, total_donations)) +
  geom_boxplot() +
  theme_minimal() +
  labs(x = "Gender", y = "Total donations")
# EXTENSION: Run a pairplot to see if you can identify interesting patterns
# in the charity_data dataset. Remember to consider the data types.
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Convert the categorical columns to factors before plotting.
# recent_donation is read in as integer (0/1) and gender/reside as character.
charity_data <- charity_data |>
  dplyr::mutate(
    dplyr::across(c(recent_donation, gender, reside), as.factor)
  )
GGally::ggpairs(data = charity_data)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Knit this document into an HTML file using the 'Knit' button
# View your output