Instructions

Download the *.qmd file from here
Place it in the relevant folder “week2”
Open the *.qmd file

1 Removing Elements from Lists

Question 1

Remove the word “literature” from the following vector and create a new object

# Keep every field except "literature" and store the result in a new vector.
list_fields <- c("politics", "philosophy", "literature", "chemistry")
list_fields_new <- list_fields[!(list_fields == "literature")]
list_fields_new

[1] "politics"   "philosophy" "chemistry"

Question 2

Remove from the following list all the numbers between 13 and 15 (inclusive)

# Drop every value in the closed interval [13, 15]: keep only the values
# strictly below 13 or strictly above 15, so 13, 14 and 15 are all removed.
list_no <- c(11, 12, 13, 14, 15, 16, 17)
new_list_no <- list_no[list_no < 13 | list_no > 15]
new_list_no

[1] 11 12 16 17

Question 3

Remove from the following list, “word”, “sentence”, “books” using the %in% operator.

# Vector to filter, and the set of words that must be dropped from it.
list_words <- c("random", "word", "sentence", "books")
exclusion_list <- c("word", "sentence", "books")

# Membership mask: TRUE where an element of list_words is in exclusion_list.
list_words %in% exclusion_list

[1] FALSE  TRUE  TRUE  TRUE

# Negate the membership mask: TRUE now marks the words that are NOT in
# exclusion_list, i.e. the words to keep.
!(list_words %in% exclusion_list)

[1]  TRUE FALSE FALSE FALSE

# Use the negated mask as a logical index to keep only the allowed words.
list_words[!(list_words %in% exclusion_list)]

[1] "random"

# Two student groups that share exactly one member ("Charlie").
student_group1 <- c("Alex", "Helen", "Charlie")
student_group2 <- c("Ollie", "James", "Ellie", "Charlie")

# For each member of group 2: is that student also a member of group 1?
student_group2 %in% student_group1

[1] FALSE FALSE FALSE  TRUE

# Store the filtered vector (words not in exclusion_list) in a new object.
list_words_new <- list_words[!(list_words %in% exclusion_list)]
list_words_new

[1] "random"

# Comparing two character strings with `>`; the printed result is TRUE.
"four" > "five"

[1] TRUE

2 Counting string elements

Question 4

Explain in your own words: why does “four” > “five” return TRUE?

# Compare the two strings; the printed result is TRUE.
"four" > "five"

[1] TRUE

# Strings are compared character by character: the first letters ("f") tie,
# and the second letter "o" sorts after "i", so "four" > "five" is TRUE.

# Count the number of characters in the string "four".
nchar("four")

[1] 4

3 Dealing with Missing Observations

Question 5

Remove the missing data from the following list

# Input vector containing one missing value.
list_no <- c(1, 2, 3, 4, NA, 5, 6)

# ANSWER
# is.na(list_no) flags the missing entries; negating it with `!` flags the
# observed entries, which are then used as a logical index.
list_no_updated <- list_no[!is.na(list_no)]
list_no_updated

[1] 1 2 3 4 5 6

4 Mean and Median

Question 6

Calculate the mean and median from the following list of numbers

# Vector with two missing values.
list_no <- c(30, 12, NA, 14, NA)

# Mean of the observed values only: na.rm = TRUE drops the NAs first.
mean(list_no, na.rm = TRUE)

[1] 18.66667

# Median of the observed values only: na.rm = TRUE drops the NAs first.
median(list_no, na.rm = TRUE)

[1] 14

5 Dataframes

Question 7

Create the following dataframe

# Option 1: build the two columns as separate vectors, then combine them.
student <- c("Alex", "Jane", "Tom", "Lilly", "Turner", "Ruby", "Nick")
grade <- c(77, 81, 89, 83, 99, 92, 97)

df <- data.frame(student, grade)

# Option 2: define the columns inline; this builds the same data frame.
df <- data.frame(
  student = c("Alex", "Jane", "Tom", "Lilly", "Turner", "Ruby", "Nick"),
  grade = c(77, 81, 89, 83, 99, 92, 97)
)
df

Question 8

Create a new dataframe with only Alex, Jane and Turner

# Keep only the rows whose student is one of the three requested names.
new_df <- subset(df, student %in% c("Alex", "Jane", "Turner"))
new_df

Question 9

What is the average grade for the entire class?

# Average of the grade column across all rows of df.
mean(df$grade)

[1] 88.28571

Question 10

What is the mean of Alex, Jane, and Turner’s grades?

# Restrict df to the three requested students, then average their grades.
new_df <- subset(df, student %in% c("Alex", "Jane", "Turner"))
mean(new_df$grade)

[1] 85.66667

6 Max and Min

Question 11

What is the highest grade?

# Largest value in the grade column.
max(df$grade)

[1] 99

# Store the highest grade so it can be reused when subsetting later on.
max_grade <- max(df$grade)
max_grade

[1] 99

Question 12

What is the lowest grade?

# Store the lowest grade so it can be reused when subsetting later on.
min_grade <- min(df$grade)
min_grade

[1] 77

7 Subsetting

Question 13

Who is the student with the highest grade?

# Names of all students whose grade equals the maximum (ties are all returned).
df$student[df$grade == max_grade]

[1] "Turner"

Question 14

Who is the student with the lowest grade?

# Names of all students whose grade equals the minimum (ties are all returned).
df$student[df$grade == min_grade]

[1] "Alex"

8 Removing items from list based on index

Question 15

Extract the second element from the following list

# R indexes from 1, so position 2 is the second element.
list_new <- c("el1", "el2", "el3")
list_new[2]

[1] "el2"

Question 16

Extract the last element from the following list

# Take the final element without hard-coding the vector's length.
list_new <- c("el1", "el2", "el3")
tail(list_new, 1)

[1] "el3"

9 Merging Datasets

Question 17

Merging Datasets

Load the life_expectancy and urbanization datasets

# Load the two datasets used in the merge exercise.
# The paths are relative, so the document runs on any machine without a
# user-specific setwd(); the global environment is also left untouched
# instead of being wiped with rm(list = ls()).
# NOTE(review): assumes the data/ folder sits next to this .qmd file, which is
# the working directory knitr uses by default when rendering - confirm.
data_dir <- "data"
life_expectancy_df <- read.csv(
  file = file.path(data_dir, "life-expectancy.csv")
)
urbanization_df <- read.csv(
  file = file.path(data_dir, "share-of-population-urban.csv")
)

Calculate the mean by country for life_expectancy

library(dplyr)

# One row per country code, holding the mean of the life-expectancy column.
life_expectancy_df2 <- life_expectancy_df %>%
  group_by(Code) %>%
  summarize(life_exp_mean = mean(Life.expectancy.at.birth..historical.))

# Preview the first five rows of the summary.
head(life_expectancy_df2, 5)

Remove the continents, i.e. the rows whose country code (Code) is empty

# Drop the rows whose Code is the empty string, then preview three rows.
life_expectancy_df3 <- subset(life_expectancy_df2, Code != "")
head(life_expectancy_df3, 3)

Calculate the mean by country for urbanization

# One row per country code, holding the mean of the urban-population column.
urbanization_df2 <- urbanization_df %>%
  group_by(Code) %>%
  summarize(urb_mean = mean(Urban.population....of.total.population.))

Remove the continents, i.e. the rows whose country code (Code) is empty

# Removing continents: drop the rows whose Code is the empty string.
urbanization_df3 <- subset(urbanization_df2, Code != "")

Perform a left merge

# Left join: keep every row of the life-expectancy summary and attach the
# urbanization mean wherever the country code matches.
merged_df <- left_join(life_expectancy_df3, urbanization_df3, by = "Code")

Remove NA values

library(tidyr)

# Drop every row that has an NA in any column.
merged_df <- drop_na(merged_df)

10 Scatterplots

Creating a scatterplot

library(ggplot2)

# Scatterplot of mean urbanization (x) against mean life expectancy (y),
# one point per row of merged_df.
ggplot(data = merged_df, mapping = aes(x = urb_mean, y = life_exp_mean)) +
  geom_point()