#Removing previous datasets in memory
rm(list = ls())
#Loading the relevant libraries
library(ggplot2)
library(gridExtra)
library(dplyr)
library(BSDA)
1 Intro
In this lab, we learn about Z-tests, standardization, and z-scores.
2 Z-Tests
We learned from the lecture that Z-tests evaluate if the averages of two datasets are different from each other when standard deviation or variance is known. The sample size typically should be larger than 30. To illustrate how Z-tests work, let us go back to the sample of Latin American countries and compare them to the world.
2.1 Loading the Data
If you don’t have the Life Expectancy dataset, anymore, you can download it from the following link:
Let us re-examine the distribution of our life expectancy dataset by making a histogram. We need to first load the data:
#Setting path
setwd("/Users/bgpopescu/Dropbox/john_cabot/teaching/stats/week6/lab/")
#Step1: Loading the data
<- read.csv(file = './data/life-expectancy.csv') life_expectancy_df
2.2 Cleaning the Data
In the next few lines we average life expectancy over country (the oiriginal dataset is a panel - with countries and years). This means that we are getting rid of the time component.
#Step1: Calculating the mean
<-life_expectancy_df%>%
life_expectancy_df2::group_by(Entity, Code)%>%
dplyr::summarize(life_exp_mean=mean(Life.expectancy.at.birth..historical.))
dplyr
#Step2: Cleaning the Data
<- c("OWID_KOS", "OWID_WRL", "")
weird_labels <-subset(life_expectancy_df2, !(Code %in% weird_labels)) clean_life_expectancy_df
2.3 Inspecting the data
Let us inspect the first 10 entries.
head(clean_life_expectancy_df, n=10)
2.4 Sorting the dataframe in the order of life_expectancy
Let us now order our dataset based on life expectancy.
<-clean_life_expectancy_df[order(-clean_life_expectancy_df$life_exp_mean),]
clean_life_expectancy_sorted_dfhead(clean_life_expectancy_sorted_df, n=10)
2.5 Make a barplot for the first 10 countries
Let us now make a barplot where life expectancy is ordered from highest to lowest. The first step entails selecting the top 10 countries.
<-head(clean_life_expectancy_sorted_df, n=10)
life_exp_top10 life_exp_top10
Let us know create the barplot for the first 10 entries:
<-ggplot(life_exp_top10, aes(x = reorder(Entity, -life_exp_mean),
figure_1y = life_exp_mean)) +
geom_bar(stat="identity")+
coord_cartesian(ylim = c(60,80))+
geom_text(data=life_exp_top10,
aes(label = round(life_exp_mean,2),
y = life_exp_mean,
vjust = 2),
colour = "white", size=2)+
xlab("Country") + ylab("Life Expectancy")+
theme_bw()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
figure_1
This is how we select the countries in Latin America.
#Step3: Selecting the Latin American countries
<-c("Belize",
latam_countries"Costa Rica",
"El Salvador",
"Guatemala",
"Honduras",
"Mexico",
"Nicaragua",
"Panama",
"Argentina",
"Bolivia",
"Brazil",
"Chile",
"Colombia",
"Ecuador",
"Guyana",
"Paraguay",
"Peru",
"Suriname",
"Uruguay",
"Venezuela",
"Cuba",
"Dominican Republic",
"Haiti")
<-subset(clean_life_expectancy_df, (Entity %in% latam_countries))
latam_countries_dfhead(latam_countries_df, n=10)
#Ordering countries based on the life expectancy values
<-latam_countries_df[order(-latam_countries_df$life_exp_mean),]
life_exp_top10_latam#Selecting only the first 10
<-head(life_exp_top10_latam, n=10)
life_exp_top10_latam life_exp_top10_latam
Let us create a barplot for the top 10 countries in Latin America:
<-ggplot(life_exp_top10_latam, aes(x = reorder(Entity, -life_exp_mean),
figure_2y = life_exp_mean)) +
geom_bar(stat="identity")+
coord_cartesian(ylim = c(60,80))+
geom_text(data=life_exp_top10_latam,
aes(label = round(life_exp_mean,2),
y = life_exp_mean,
vjust = 2),
colour = "white", size=2)+
xlab("Country") + ylab("Life Expectancy")+
theme_bw()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
figure_2
Let us now recreate the two graphs and add them together using grid.arrange.
grid.arrange(figure_1, figure_2, ncol=2)
A few things could help:
- remove the Y-axis label from the second graph
- Add descriptive titles
<-ggplot(life_exp_top10, aes(x = reorder(Entity, -life_exp_mean), y = life_exp_mean)) +
figure_3geom_bar(stat="identity")+
coord_cartesian(ylim = c(60,80))+
geom_text(data=life_exp_top10,
aes(label = round(life_exp_mean,2),
y = life_exp_mean,
vjust = 2),
colour = "white", size=2)+
xlab("Country") + ylab("Life Expectancy")+
theme_bw()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
#adding a title
ggtitle("Top 10 countries in the World")
figure_3
<-ggplot(life_exp_top10_latam, aes(x = reorder(Entity, -life_exp_mean), y = life_exp_mean)) +
figure_4geom_bar(stat="identity")+
coord_cartesian(ylim = c(60,80))+
geom_text(data=life_exp_top10_latam,
aes(label = round(life_exp_mean,2),
y = life_exp_mean,
vjust = 2),
colour = "white", size=2)+
#removing the y label
xlab("Country") + ylab("")+
theme_bw()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
#adding a title
ggtitle("Top 10 countries in Latin America")
figure_4
Let us now put them side by side
grid.arrange(figure_3, figure_4, ncol=2)
This looks much better.
3 Problem
Now we can finally ask the question: Is the mean life expectancy in Latin America equal to the population life expectancy? What kind of test do we perform?
Answer: Two-tailed z-test. This is because we are asking if something is “EQUAL” to something else.
What do we need to perform a z-test?
We need the following:
- Population Mean \(\mu\)
- Population SD \(\sigma\)
- Sample Mean for Latam: \(\bar{X}\)
- Sample size for Latam: n
Population Mean \(\mu\) is: 61.934159
Population SD \(\sigma\) is: 8.68723
Sample Mean for Latam \(\bar{X}\) is 61.9140352
Sample size for Latam n is 23
How do we know this?
<-mean(clean_life_expectancy_df$life_exp_mean)
mean_population<-sd(clean_life_expectancy_df$life_exp_mean)
sd_population<-mean(latam_countries_df$life_exp_mean)
mean_latam<-nrow(latam_countries_df)
sample_size_latam
mean_population
[1] 61.93416
sd_population
[1] 8.68723
mean_latam
[1] 61.91404
sample_size_latam
[1] 23
We can finally perform a z-test following the formula:
\[Z_{obs}=\frac{\bar{X}-\mu}{\frac{\sigma}{\sqrt{n}}}\] We now simply calculate the \(Z_{obs}\).
<-(mean_latam-mean_population)/(sd_population/sqrt(sample_size_latam))
z_obs z_obs
[1] -0.01110945
The same score can be obtained if we do the following:
<-BSDA::z.test(latam_countries_df$life_exp_mean,
z_obs2mu=mean_population, sigma.x=sd_population,
alternative = "two.sided")
z_obs2
One-sample z-Test
data: latam_countries_df$life_exp_mean
z = -0.011109, p-value = 0.9911
alternative hypothesis: true mean is not equal to 61.93416
95 percent confidence interval:
58.36373 65.46434
sample estimates:
mean of x
61.91404
Note that we define whether this is a one-tailed or a two-tailed z-test by using the option two.sided
in alternative
.
What is the Null Hypothesis for a one-tailed z-test?
The Null Hypothesis - \(H_0\): The sample mean (for Latam) is equal to the population mean.
What is the Alternative Hypothesis for one-tailed z-test - \(H_1\)?
The Alternative Hypothesis - \(H_1\): The sample mean (for Latam) is NOT equal to the population mean.
Note that this is also specified in the test itself: “alternative hypothesis: true mean is not equal to 61.93416”
Do we accept or reject the \(H_0\)?
We do not reject the null hypothesis (H0) because p ≥ 0.05 and conclude that there is no significant difference between the means of the two groups: Latam and the population mean.
This is easier to see by doing a boxplot.
#Creating a new dataframe
<-clean_life_expectancy_df
clean_life_expectancy_df_continent#Adding a new variable, called sample
$sample<-"Rest"
clean_life_expectancy_df_continent#Labeling Latin America
$sample[clean_life_expectancy_df_continent$Entity %in% latam_countries]<-"Latam" clean_life_expectancy_df_continent
ggplot(clean_life_expectancy_df_continent, aes(x = sample, y = life_exp_mean, color = sample)) +
geom_boxplot() +
theme_bw()
And this is the interpretation of a box plot.
3.1 Comparing means - greater than a number
Now we can finally ask the question: Is the mean life expectancy in Latin America greater than the population life expectancy? What kind of test do we perform?
Answer: One-tailed z-test. This is because we are asking if something is “GREATER” than something else.
library(BSDA)
<-BSDA::z.test(latam_countries_df$life_exp_mean,
z_obs_greatermu=mean_population, sigma.x=sd_population,
alternative = "greater")
z_obs_greater
One-sample z-Test
data: latam_countries_df$life_exp_mean
z = -0.011109, p-value = 0.5044
alternative hypothesis: true mean is greater than 61.93416
95 percent confidence interval:
58.93453 NA
sample estimates:
mean of x
61.91404
What is the Null Hypothesis for a one-tailed z-test?
The Null Hypothesis - \(H_0\): The sample mean (for Latam) is not greater than the population mean.
What is the Alternative Hypothesis for a one-tailed z-test - \(H_1\)?
The Alternative Hypothesis - \(H_1\): The sample mean (for Latam) is greater than the population mean.
Do we accept or reject the \(H_0\)?
We do not reject the null hypothesis (H0) because p ≥ 0.05 and conclude that the sample mean (for Latam) is not greater than the population mean.
3.2 Comparing means - less than a number
<-BSDA::z.test(latam_countries_df$life_exp_mean,
z_obs_lessmu=mean_population, sigma.x=sd_population,
alternative = "less")
z_obs_less
One-sample z-Test
data: latam_countries_df$life_exp_mean
z = -0.011109, p-value = 0.4956
alternative hypothesis: true mean is less than 61.93416
95 percent confidence interval:
NA 64.89354
sample estimates:
mean of x
61.91404
What is the Null Hypothesis for a one-tailed z-test?
The Null Hypothesis - \(H_0\): The sample mean (for Latam) is NOT less than the population mean.
What is the Alternative Hypothesis for a one-tailed z-test - \(H_1\)?
The Alternative Hypothesis - \(H_1\): The sample mean (for Latam) is less than the population mean.
Do we accept or reject the \(H_0\)?
We do not reject the null hypothesis (H0) because p ≥ 0.05 and conclude that the sample mean (for Latam) is not less than the population mean.
4 Z-Scores
As we discussed previously, the standard normal distribution is a special type of distribution with a mean \(\mu = 0\) and a standard deviation \(\sigma = 1\). For example, a z-score of 2 tells us that we are 2 standard deviations to the right of the mean.
A z-score allows to calculate how much area that specific score is associated with using the z-score table.
#Generating 10000 obs.
<-rnorm(10000, mean=0, sd=1)
generated_data_10000obs#Turning them into a dataframe
<-data.frame(generated_data_10000obs)
generated_data_10000obs#Changing the variable names
names(generated_data_10000obs)<-"value"
#Adding notes to the graph using UTF-8 characters for the Greek letters
<- paste("Mean (\u03BC)", ": 0", "\n", "SD(\u03C3)", ": 1")
lines
#Graphing
<-ggplot(data = generated_data_10000obs, aes(x=value))+
figure_5geom_histogram(bins=50, col="white")+
theme_bw()+
scale_x_continuous(breaks=seq(-4, 4, by = 1), limits = c(-4,4))+
annotate(geom="text",
x=-3,
y=600,
label=lines,
color="red")
figure_5
Each number on the X-axis (i.e. -4…4) corresponds to a z-score. A z-score tells us how many standard deviations an observation is from the mean \(\mu\). A z-score also allows to calculate the area associated with that z-score is, based on a z-score table (standard normal table).
The cool thing about a standard normal distribution is that any normal distribution can be turned into a standard normal distribution with a mean \(\mu\) of 0 and a standard distrubution \(\sigma\) of 1. This process is called standardization.
The standardization formula:
\[ z=\frac{x - \mu}{\sigma} \] where: z - z-score x - observation \(\mu\) - population mean \(\sigma\) - population variance
5 Converting a distribution to a standard normal distribution
#Step1: Loading the data
<- read.csv(file = './data/life-expectancy.csv')
life_exp_df #Step2: Creating more friendly column names
names(life_exp_df)[4]<-"life_exp_yearly"
#Step3: Graphing
<-ggplot(data = life_exp_df, aes(x=life_exp_yearly))+
figure_5geom_histogram(bins=50, col="white")+
theme_bw()
figure_5
Let us examine the first entries in our dataset:
head(life_exp_df, n=10)
Let us now apply standardization to our dataset.
5.1 Step: Calculating the mean
<-mean(life_exp_df$life_exp_yearly, na.rm=T) mean_life_exp
The mean for life expectancy throughout the world for the entire time series between 1543 and 2021 is 61.78. How come we have years that are lower than 1800? Let’s examine what the countries for which we have such long historical data are.
<-subset(life_exp_df, Year<1800)
historical_df_life_expunique(historical_df_life_exp$Entity)
[1] "Denmark" "Europe" "Finland" "Sweden"
[5] "United Kingdom"
Thus, we can clearly see that our dataset contains detailed historical data on life expectancy for: Denmark, Europe, Finland, Sweden, United Kingdom. This is even more visible if we plot the time series for these countries prior to 1800s.
<-ggplot(data = historical_df_life_exp, aes(x = Year, y = life_exp_yearly))+
figure_6geom_line(aes(color = Entity))+
theme_bw()
figure_6
So the graph shows us that we have the longest data for the UK.
5.2 Step: Calculating the variance
The next step entails calculating the variance of our dataset.
<-var(life_exp_df$life_exp_yearly, na.rm=T)
var_life_exp var_life_exp
[1] 167.3311
Life expectancy has a variance of 167.3311413, and the standard deviation is 12.94.
5.3 Step: Applying the formula
We can now apply the formula:
\[ z=\frac{x - \mu}{\sigma} \]
Let’s do exactly that and inspect the first 10 entries:
$z_score<-(life_exp_df$life_exp_yearly- mean_life_exp) / var_life_exp
life_exp_dfhead(life_exp_df, n=10)
6 Plotting the new distribution
We can now plot the new distribution:
#Adding notes to the graph
<- paste("Mean (\u03BC)", ": ", round(mean_life_exp, 2), "\n", "SD(\u03C3)", ": ", round(sd(life_exp_df$life_exp_yearly, na.rm=T), 2))
lines
<-ggplot(data = life_exp_df, aes(x=z_score))+
figure_7geom_histogram(bins=50, col="white")+
theme_bw()+
scale_x_continuous(breaks=seq(-0.3, 0.3, by = 0.1),
limits = c(-0.3, 0.3),
labels = scales::comma)+
annotate(geom="text",
x=-0.2,
y=1000,
label=lines,
color="red")
figure_7
I used scales::comma
option within scale_x_continuous
to avoid scientific notation.
Let us now plot the two histograms side by side
grid.arrange(figure_5, figure_7, ncol=2)
We can see that the distributions are very similar. The logic is that if I plug in an x value of 50, that will get a z-score of -0.0704272. Similarly, if I plug an x value of 80, that will get a z-score of 0.108858.
7 Problem
Let us say that we were interested in the following problem: What is the proportion of countries where the life expectancy is lower than 50?
\(P(X<50) = ?\)
We first calculate the z-score. Remember a z-score is a number representing how many standard deviations above or below the mean population the score derived from a z-test is. Essentially, it is a numerical measurement that describes a value’s relationship to the mean of a group of values.
<-(50-mean_life_exp)/sd(life_expectancy_df2$life_exp_mean, na.rm=T)
z z
[1] -1.363251
Note that is the empirical probability.
<-subset(life_expectancy_df2, life_exp_mean<50)
ctry_50nrow(ctry_50)/nrow(life_expectancy_df2)
[1] 0.1171875
Thus we need to look for the proportion of z less than -1.36.
<-pnorm(q=z, lower.tail=TRUE)
xval xval
[1] 0.08640166
Note that this is based on a theoretical probability.
Thus, the proportion countries where life expectancy is lower than 50 is 0.09.
8 Problem
You scored 72 in your last assignment. The class mean was 60 and standard deviation was 10. The grades follow a normal distribution. What is the probability that a randomly selected person in the class had a grade lower than yours?
The way we do that is the following:
<-(72-60)/10
z <-pnorm(q=z, lower.tail=TRUE)
xval xval
[1] 0.8849303
What is the probability that a randomly selected person in the class had a grade higher than yours?
1-xval
[1] 0.1150697