# Drop "literature" from the vector of fields and store the result in a new
# object; the original vector is left unchanged.
list_fields <- c("politics", "philosophy", "literature", "chemistry")
# Logical subsetting: keep every element that is not "literature".
list_fields_new <- list_fields[list_fields != "literature"]
list_fields_new
#> [1] "politics" "philosophy" "chemistry"
Remove the word “literature” from the following vector and create a new object
# Solution: build the vector, then keep only the entries that differ from
# "literature" and assign them to a new object.
list_fields <- c("politics", "philosophy", "literature", "chemistry")
list_fields_new <- list_fields[list_fields != "literature"]
list_fields_new
#> [1] "politics" "philosophy" "chemistry"
Remove from the following list all the numbers between 13 and 15 (inclusive)
# Remove every number between 13 and 15, inclusive of both endpoints.
list_no <- c(11, 12, 13, 14, 15, 16, 17)
# Strict inequalities are required here: `<= 13 | >= 15` would keep 13 and 15,
# which the exercise asks to remove.
new_list_no <- list_no[list_no < 13 | list_no > 15]
new_list_no
#> [1] 11 12 16 17
Remove “word”, “sentence”, and “books” from the following list using the %in% operator.
# Remove a set of words from a vector with %in%.
list_words <- c("random", "word", "sentence", "books")
exclusion_list <- c("word", "sentence", "books")

# TRUE for each element of list_words that appears in exclusion_list.
(list_words %in% exclusion_list)
#> [1] FALSE TRUE TRUE TRUE

# Negating the mask marks the elements to keep.
!(list_words %in% exclusion_list)
#> [1] TRUE FALSE FALSE FALSE

# Subset with the negated mask to drop the excluded words.
list_words[!(list_words %in% exclusion_list)]
#> [1] "random"
# Second %in% example: which members of student_group2 are also in
# student_group1?
student_group1 <- c("Alex", "Helen", "Charlie")
student_group2 <- c("Ollie", "James", "Ellie", "Charlie")
# The result has one entry per element of the left-hand vector.
student_group2 %in% student_group1
#> [1] FALSE FALSE FALSE TRUE
# Store the filtered vector (everything not in exclusion_list) in a new object.
list_words_new <- list_words[!(list_words %in% exclusion_list)]
list_words_new
#> [1] "random"
# Relational operators also work on character strings.
"four" > "five"
#> [1] TRUE
Explain in your own words: why does the comparison “four” > “five” return TRUE?
"four" > "five"
#> [1] TRUE
# Because "o" comes after "i" alphabetically: both strings start with "f",
# so the second character decides the comparison.
nchar("four")
#> [1] 4
# "five" also has four characters, so length cannot explain the result.
Remove the missing data from the following list
list_no <- c(1, 2, 3, 4, NA, 5, 6)
# ANSWER
# Intermediate steps, left commented out:
# is.na(list_no)
# !is.na(list_no)
# Keep only the elements that are not missing.
list_no_updated <- list_no[!is.na(list_no)]
list_no_updated
#> [1] 1 2 3 4 5 6
Calculate the mean and median from the following list of numbers
list_no <- c(30, 12, NA, 14, NA)
# na.rm = TRUE drops the missing values before aggregating; without it both
# calls would return NA.
mean(list_no, na.rm = TRUE)
#> [1] 18.66667
median(list_no, na.rm = TRUE)
#> [1] 14
Create the following dataframe
# Option 1: build the columns as separate vectors, then combine them.
student <- c("Alex", "Jane", "Tom", "Lilly", "Turner", "Ruby", "Nick")
grade <- c(77, 81, 89, 83, 99, 92, 97)
df <- data.frame(student, grade)

# Option 2: define the columns inline in the data.frame() call.
df <- data.frame(
  student = c("Alex", "Jane", "Tom", "Lilly", "Turner", "Ruby", "Nick"),
  grade = c(77, 81, 89, 83, 99, 92, 97)
)
df
Create a new dataframe with only Alex, Jane and Turner
# Keep only the rows for Alex, Jane and Turner. %in% tests membership in the
# set of names and replaces a chain of `==` comparisons joined with `|`.
new_df <- subset(df, student %in% c("Alex", "Jane", "Turner"))
new_df
What is the average grade for the entire class
# Average grade across all rows of df.
mean(df$grade)
#> [1] 88.28571
What is the mean of Alex, Jane, and Turner’s grades
# Restrict df to the three students, then average their grades.
new_df <- subset(df, student %in% c("Alex", "Jane", "Turner"))
mean(new_df$grade)
#> [1] 85.66667
What is the highest grade?
# Highest grade, first printed directly and then stored for later use.
max(df$grade)
#> [1] 99
max_grade <- max(df$grade)
max_grade
#> [1] 99
What is the lowest grade?
# Lowest grade, stored so it can be used to look up the student.
min_grade <- min(df$grade)
min_grade
#> [1] 77
Who is the student with the highest grade?
# Select the student name(s) on the row(s) whose grade equals max_grade.
df$student[df$grade == max_grade]
#> [1] "Turner"
Who is the student with the lowest grade?
# Select the student name(s) on the row(s) whose grade equals min_grade.
df$student[df$grade == min_grade]
#> [1] "Alex"
Extract the second element from the following list
list_new <- c("el1", "el2", "el3")
# R indexing is 1-based, so position 2 is the second element.
list_new[2]
#> [1] "el2"
Extract the last element from the following list
list_new <- c("el1", "el2", "el3")
# length() gives the index of the last element, whatever the vector's size.
list_new[length(list_new)]
#> [1] "el3"
Merging Datasets
# Removing previous datasets in memory
# NOTE(review): this clears every object in the global environment.
rm(list = ls())
# Setting path
# NOTE(review): absolute, machine-specific path; adjust it to your own lab
# folder before running. The relative paths below resolve against it.
setwd("/Users/bgpopescu/Dropbox/john_cabot/teaching/stats/week2/lab")
life_expectancy_df <- read.csv(file = "./data/life-expectancy.csv")
urbanization_df <- read.csv(file = "./data/share-of-population-urban.csv")
library(dplyr)
# Collapse the life-expectancy data to one row per Code, holding the mean of
# the life-expectancy column for that Code.
life_expectancy_df2 <- life_expectancy_df %>%
  dplyr::group_by(Code) %>%
  dplyr::summarize(life_exp_mean = mean(Life.expectancy.at.birth..historical.))
head(life_expectancy_df2, 5)
# Drop the rows whose Code is an empty string.
life_expectancy_df3 <- subset(life_expectancy_df2, Code != "")
head(life_expectancy_df3, 3)
# Collapse the urbanization data to one row per Code, holding the mean of the
# urban-population-share column for that Code.
urbanization_df2 <- urbanization_df %>%
  dplyr::group_by(Code) %>%
  dplyr::summarize(urb_mean = mean(Urban.population....of.total.population.))
# Removing continents (the rows whose Code is an empty string)
urbanization_df3 <- subset(urbanization_df2, Code != "")
# Left join on Code: every row of life_expectancy_df3 is kept, and urb_mean is
# NA where urbanization_df3 has no matching Code.
merged_df <- left_join(life_expectancy_df3, urbanization_df3, by = "Code")
library(tidyr)
# Drop rows that contain an NA in any column.
merged_df <- merged_df %>%
  drop_na()
library(ggplot2)
# Scatter plot of mean life expectancy against mean urbanization, one point
# per Code.
ggplot(merged_df, aes(x = urb_mean, y = life_exp_mean)) +
  geom_point()