# Clear every object from the current workspace before starting.
rm(list = ls())
# Set the working directory so the relative ./data paths below resolve.
# NOTE: this path is machine-specific; change it to your own lab folder.
setwd("/Users/bgpopescu/Dropbox/john_cabot/teaching/stats/week4/lab")
# Read the two raw datasets from CSV files.
life_expectancy_df <- read.csv(file = './data/life-expectancy.csv')
urbanization_df <- read.csv(file = './data/share-of-population-urban.csv')
To better understand how histograms work, imagine a set of values that are spaced out along a number line.
To construct a histogram, a section of the number line is divided into equal chunks, called bins. Next, count how many data points sit inside each bin, and draw bars, one for each bin, whose heights correspond to the number of data points.
Label the data (in the example below each data point is an SAT score), draw in a y-axis which counts the number of data points in each bin, and finally label your bins.
1 Loading the Data
But let’s go back to our old datasets: life expectancy and urbanization. Let’s open them first.
If you don’t have them anymore, you can download them from the following links:
We should now have in memory the two datasets. Let’s inspect the first 5 entries in both dataframes:
# Print the first five rows of each raw data frame.
head(life_expectancy_df, 5)
head(urbanization_df, 5)
2 Create averages
The next step is to create averages for all the countries
library(dplyr)

# Average life expectancy per country code (one row per Code).
life_expectancy_df2 <- life_expectancy_df %>%
  dplyr::group_by(Code) %>%
  dplyr::summarize(
    life_exp_mean = mean(Life.expectancy.at.birth..historical.)
  )

# Average urban population share per country code (one row per Code).
urbanization_df2 <- urbanization_df %>%
  dplyr::group_by(Code) %>%
  dplyr::summarize(
    urb_mean = mean(Urban.population....of.total.population.)
  )
3 Inspect the first 4 entries of each new dataframe
# Print the first four rows of each per-country summary.
head(life_expectancy_df2, 4)
head(urbanization_df2, 4)
So we have some unusual entries such as "", "OWID_KOS" or "OWID_WRL". What are these countries? We can do a subset to answer this question.
# Codes that do not look like regular country codes.
weird_labels <- c("OWID_KOS", "OWID_WRL", "")
# Keep only the rows whose Code is one of those labels.
weird_countries <- subset(life_expectancy_df2, (Code %in% weird_labels))
Let’s inspect them by verifying unique countries
# Keep the first row for each distinct Code, then print up to ten rows.
weird_countries2 <- subset(
  weird_countries,
  !duplicated(weird_countries$Code)
)
head(weird_countries2, n = 10)
3.1 Cleaning our dataset
# Drop the rows whose Code is one of the unusual labels.
weird_labels <- c("OWID_KOS", "OWID_WRL", "")
clean_life_expectancy_df <- subset(
  life_expectancy_df2,
  !(Code %in% weird_labels)
)
head(clean_life_expectancy_df, n = 5)
# Apply the same label filter to the urbanization averages.
weird_labels <- c("OWID_KOS", "OWID_WRL", "")
clean_urbanization_df <- subset(
  urbanization_df2,
  !(Code %in% weird_labels)
)
head(clean_urbanization_df, n = 5)
3.2 Performing a left merge
We will now perform a left merge whereby we try to merge urbanization data to life expectancy based on Code.
# Left join: keep every life-expectancy row and attach the urbanization
# average that shares the same Code (NA where there is no match).
merged_data <- left_join(
  clean_life_expectancy_df,
  clean_urbanization_df,
  by = c("Code" = "Code")
)
head(merged_data, n = 10)
3.3 Getting rid of NAs for urbanization
# na.omit() drops every row that contains at least one NA; subset() with
# no condition then returns all remaining rows.
merged_data2 <- subset(na.omit(merged_data))
head(merged_data2, n = 10)
4 Creating a bar plot
We need to get to know our data better. What is the distribution of life expectancy? What about urbanization? We can create bar plots and then histograms.
Let’s create a frequency table.
#Step1: Rounding the values
# Round both averages to whole numbers and store them as new columns.
merged_data2$life_exp_mean_rounded <- round(merged_data2$life_exp_mean, 0)
merged_data2$urb_mean_rounded <- round(merged_data2$urb_mean, 0)
head(merged_data2, n = 5)
#Step2: Creating a frequency table
# Count how many countries fall on each rounded life-expectancy value.
freq_table <- table(merged_data2$life_exp_mean_rounded)
freq_table
38 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
1 2 3 4 4 5 5 2 2 4 11 6 5 6 4 5 6 6 10 10 6 6 12 11 5 11
68 69 70 71 72 73 74 75 77
9 11 12 12 4 4 1 6 3
#Step3: Turning the table into a dataframe
freq_table <- data.frame(freq_table)
#Step4: Inspecting the names
#names(freq_table)
names(urbanization_df)
[1] "Entity"
[2] "Code"
[3] "Year"
[4] "Urban.population....of.total.population."
#Step5: Identifying how to extract the variable names
# Print the column names of the frequency data frame.
colnames(freq_table)
[1] "Var1" "Freq"
#Step6: Providing more intuitive names
# Rename the first two columns in a single assignment, then print them.
names(freq_table)[1:2] <- c("life_exp_mean_rounded", "frequency")
names(freq_table)
[1] "life_exp_mean_rounded" "frequency"
#Step7: Inspecting variable types
# Show the structure (column types and first values) of the data frame.
utils::str(freq_table)
'data.frame': 35 obs. of 2 variables:
$ life_exp_mean_rounded: Factor w/ 35 levels "38","43","44",..: 1 2 3 4 5 6 7 8 9 10 ...
$ frequency : int 1 2 3 4 4 5 5 2 2 4 ...
#Step8: Turning factor variables into numeric
# Convert via character first: as.numeric() applied directly to a factor
# returns the internal level codes, not the labels.
freq_table$life_exp_mean_rounded <- as.numeric(
  as.character(freq_table$life_exp_mean_rounded)
)
str(freq_table)
'data.frame': 35 obs. of 2 variables:
$ life_exp_mean_rounded: num 38 43 44 45 46 47 48 49 50 51 ...
$ frequency : int 1 2 3 4 4 5 5 2 2 4 ...
library(ggplot2)
#Step9: Creating the barplot
# stat = "identity" uses the precomputed frequency column as bar height.
fig5 <- ggplot(
  data = freq_table,
  aes(x = life_exp_mean_rounded, y = frequency)
) +
  geom_bar(stat = "identity") +
  theme_bw()
fig5
You can obtain a similar graph by using the geom_histogram function in ggplot. You will, of course, have to set some options.
# Histogram with geom_histogram()'s default binning.
fig5_b <- ggplot(data = merged_data2, aes(x = life_exp_mean_rounded)) +
  geom_histogram() +
  theme_bw()
fig5_b

# Same histogram with 50 bins and white bar outlines.
fig5_b <- ggplot(data = merged_data2, aes(x = life_exp_mean_rounded)) +
  geom_histogram(bins = 50, col = "white") +
  theme_bw()
fig5_b

# Same histogram with 35 bins and white bar outlines.
fig5_b <- ggplot(data = merged_data2, aes(x = life_exp_mean_rounded)) +
  geom_histogram(bins = 35, col = "white") +
  theme_bw()
fig5_b

# Show the bar plot and the last histogram side by side.
library(gridExtra)
grid.arrange(fig5, fig5_b, ncol = 2)
4.1 Calculating the mean
We already know how to calculate the mean for our dataset. This is 61.243 which is obtained by typing mean(merged_data2$life_exp_mean_rounded)
Let us put it in our graph.
# Mean of the rounded life-expectancy values.
mean_life_expectancy <- mean(merged_data2$life_exp_mean_rounded)
# Bar plot with a dashed red vertical line at the mean.
fig6 <- ggplot(
  data = freq_table,
  aes(x = life_exp_mean_rounded, y = frequency)
) +
  geom_bar(stat = "identity") +
  theme_bw() +
  geom_vline(xintercept = mean_life_expectancy, linetype = 'dashed', col = 'red')
fig6
4.2 Adding the mean to the graph
Let us add some text to indicate that the line is the mean life expectancy in our dataset. The way to do this is by using annotate.
Here is how we can do that.
# Label text and its position: x at the mean, y at a frequency of 11.
text_to_add <- "Mean"
y_coord <- 11
x_coord <- mean((merged_data2$life_exp_mean_rounded))

mean_life_expectancy <- mean((merged_data2$life_exp_mean_rounded))
# Bar plot with the dashed mean line plus a red text label.
fig6 <- ggplot(
  data = freq_table,
  aes(x = life_exp_mean_rounded, y = frequency)
) +
  geom_bar(stat = "identity") +
  theme_bw() +
  geom_vline(xintercept = mean_life_expectancy, linetype = 'dashed', col = 'red') +
  annotate(
    geom = "text",
    x = x_coord,
    y = y_coord,
    label = text_to_add,
    color = "red"
  )
fig6
Maybe we want to move the text a bit further to the left. We can subtract 2 from the x_coord.
# Same label as before, moved 2 units to the left of the mean on the x-axis.
text_to_add <- "Mean"
y_coord <- 11
x_coord <- mean((merged_data2$life_exp_mean_rounded)) - 2

mean_life_expectancy <- mean((merged_data2$life_exp_mean_rounded))
# The dashed line stays at the mean; only the text position changes.
fig6 <- ggplot(
  data = freq_table,
  aes(x = life_exp_mean_rounded, y = frequency)
) +
  geom_bar(stat = "identity") +
  theme_bw() +
  geom_vline(xintercept = mean_life_expectancy, linetype = 'dashed', col = 'red') +
  annotate(
    geom = "text",
    x = x_coord,
    y = y_coord,
    label = text_to_add,
    color = "red"
  )
fig6
5 Creating a line plot
An important question is understanding how life expectancy might have evolved over time. Let us go back to the original dataset before calculating the averages.
head(life_expectancy_df, n = 6)
#Renaming variable
# Give the fourth column a shorter name, then confirm the change.
names(life_expectancy_df)[4] <- "life_exp_yearly"
head(life_expectancy_df, n = 6)
# One line per Entity; the colour legend is hidden with guides().
fig7 <- ggplot(data = life_expectancy_df, aes(x = Year, y = life_exp_yearly)) +
  geom_line(aes(color = Entity)) +
  theme_bw() +
  guides(color = "none")
fig7
Unfortunately, we cannot make much out of this graph, as we have way too many countries. It probably makes sense to focus on 1-2 countries. Let’s take the US and the UK as an example.
5.1 Subsetting the sample to fewer countries
# Keep only the rows for the two countries of interest.
countries_of_interest <- c("United States", "United Kingdom")
df_us_uk <- subset(life_expectancy_df, Entity %in% countries_of_interest)
head(df_us_uk, n = 10)
Let us now plot life expectancy only for the US and the UK.
# Yearly life expectancy, one coloured line per Entity.
fig8 <- ggplot(data = df_us_uk, aes(x = Year, y = life_exp_yearly)) +
  geom_line(aes(color = Entity)) +
  theme_bw()
fig8
5.2 Subsetting the sample to fewer years
It is kind of cool that in the UK, there is data that goes so far back. For comparison purposes, it might be a good idea to restrict our sample to the period after 1900.
# Keep only the years strictly after 1900.
df_us_uk_after1900 <- subset(df_us_uk, Year > 1900)
head(df_us_uk_after1900, n = 10)
# Line plot of the restricted sample, one coloured line per Entity.
fig9 <- ggplot(data = df_us_uk_after1900, aes(x = Year, y = life_exp_yearly)) +
  geom_line(aes(color = Entity)) +
  theme_bw()
fig9
Let’s color the two countries in red and blue so that the difference is clearer.
# Same plot, with the line colours set manually to red and blue.
fig9 <- ggplot(data = df_us_uk_after1900, aes(x = Year, y = life_exp_yearly)) +
  geom_line(aes(color = Entity)) +
  theme_bw() +
  scale_color_manual(values = c('Red', 'Blue'))
fig9
What is the year with the lowest life expectancy for the US?
#Creating a new df with the US
df_us_after1900 <- subset(df_us_uk_after1900, Entity == "United States")
# Year(s) in which the US series reaches its minimum value.
df_us_after1900$Year[
  df_us_after1900$life_exp_yearly == min(df_us_after1900$life_exp_yearly)
]
[1] 1918
What is the year with the second lowest life expectancy for the US?
#Arranging and creating a new dataframe
# arrange() sorts ascending, so row 2 holds the second-lowest value.
df <- df_us_after1900 %>% arrange(life_exp_yearly)
#Selecting the second lowest
# NOTE: despite its name, this variable holds the second LOWEST value;
# the name is kept unchanged so any code relying on it still works.
second_highest_life_expectancy <- df$life_exp_yearly[2]
#Selecting the year with the second lowest
df$Year[df$life_exp_yearly == second_highest_life_expectancy]
[1] 1901
What is the year with the lowest life expectancy for the UK?
#Creating a new df with the UK
df_uk_after1900 <- subset(df_us_uk_after1900, Entity == "United Kingdom")
# Year(s) in which the UK series reaches its minimum value.
df_uk_after1900$Year[
  df_uk_after1900$life_exp_yearly == min(df_uk_after1900$life_exp_yearly)
]
[1] 1901
Great! We have gone over a variety of graphs in ggplot including points, barplots, and time lines. You are now in a very good position to do some basic data analysis in R.