#Removing previous datasets in memory
rm(list = ls())
#Loading the relevant libraries
#install.packages("gghighlight")
library(ggplot2)
library(gridExtra)
library(gghighlight)
Before we get into coding, let us go back to a short explanation of how frequncies relate to probability mass functions (pmf) and cummulative distribution functions (cdf).
There are four functions in R for calculating normal distributions: rnorm
, pnorm
, qnorm
, dnorm
.
1 Generating Random Values in a Standard Normal Distribution (mean: 0, sd:1)
The rnorm
function generates a vector of normally distributed random numbers. We can specify the mean \(\mu\) and the standard deviation \(\sigma\). Remember that that for a standard normal distribution the mean \(\mu = 0\) and the standard deviation \(\sigma = 1\).
Let us see how it works by creating three samples with 10, 100, and 1000 observations.
1.1 Creating a sample of 10 observations
#Set seed allows us to make sure everyone gets the same results
set.seed(123)
#Step1: Creating 10 obs. with mean 0 and sd of 1
<-rnorm(10, mean=0, sd=1)
generated_data_10obs#Step2: Turning the created data into a data frame
<-data.frame(generated_data_10obs)
generated_data_10obs#Step3: Inspecting the first 10 obs.
head(generated_data_10obs, n=10)
#Step4: Renaming variable inside the datafarme
names(generated_data_10obs)<-"value"
head(generated_data_10obs, n=10)
#Step5: Plotting the data
<-ggplot(data = generated_data_10obs, aes(x=value))+
figure_1geom_histogram()+
theme_bw()+
ggtitle("10 Obs.")
figure_1
1.2 Creating a sample of 100 observations
set.seed(123)
#Step1: Creating 100 obs. with mean 0 and sd of 1
<-rnorm(100, mean=0, sd=1)
generated_data_100obs#Step2: Turning the created data into a data frame
<-data.frame(generated_data_100obs)
generated_data_100obs#Step3: Inspecting the first 10 obs.
head(generated_data_100obs, n=10)
#Step4: Renaming variable inside the datafarme
names(generated_data_100obs)<-"value"
head(generated_data_100obs, n=10)
#Step5: Plotting the data
<-ggplot(data = generated_data_100obs, aes(x=value))+
figure_2geom_histogram()+
theme_bw()+
ggtitle("100 Obs.")
figure_2
1.3 Creating a sample of 1000 observations
set.seed(123)
#Step1: Creating 1000 obs. with mean 0 and sd of 1
<-rnorm(1000, mean=0, sd=1)
generated_data_1000obs#Step2: Turning the created data into a data frame
<-data.frame(generated_data_1000obs)
generated_data_1000obs#Step3: Inspecting the first 10 obs.
head(generated_data_1000obs, n=10)
#Step4: Renaming variable inside the datafarme
names(generated_data_1000obs)<-"value"
head(generated_data_1000obs, n=10)
#Step5: Plotting the data
<-ggplot(data = generated_data_1000obs, aes(x=value))+
figure_3geom_histogram()+
theme_bw()+
ggtitle("1000 Obs.")
figure_3
1.4 Comparing the distribution of the three different samples
<-grid.arrange(figure_1, figure_2, figure_3, ncol=3) new_fig
set.seed(123)
#Step1: Creating one million obs. with mean 0 and sd of 1
<-rnorm(1000000, mean=0, sd=1)
generated_data_milobs#Step2: Turning the created data into a data frame
<-data.frame(generated_data_milobs)
generated_data_milobs#Step3: Inspecting the first 10 obs.
head(generated_data_milobs, n=10)
#Step4: Renaming variable inside the datafarme
names(generated_data_milobs)<-"value"
head(generated_data_milobs, n=10)
#Step5: Plotting the data
<-ggplot(data = generated_data_milobs, aes(x=value))+
figure_4geom_histogram()+
theme_bw()+
ggtitle("One Mil. Obs.")
figure_4
2 Generating the cumulative distribution function (CDF) for a standard normal distribution
The pnorm
function generates a vector for a cumulative distribution function of the normal distribution. We can specify the mean \(\mu\) and the standard deviation \(\sigma\). The pnorm
will return the probability that the randomly generated variable X takes on a value less than or equal to x:
\(F_X(x) = P (X\leq x)\)
2.1 Sequence from -3 to 3
#99% of a normal distribution falls between -3 and 3
<-seq(-3, 3, by =0.01)
list_seq<-data.frame(list_seq) list_seq_df
<-pnorm(list_seq, mean = 0, sd = 1)
dummy_df<-data.frame(dummy_df) dummy_df
names(dummy_df)<-"y"
<-cbind(dummy_df, list_seq) dummy_df2
<-ggplot()+
figure_4geom_step(data=dummy_df2, mapping=aes(x=list_seq, y=y)) +
geom_bar(stat="identity")+
theme_bw()+
ggtitle("Sequence: -3, 3")
figure_4
2.2 Highlighting probabilities
This will return the probability that the randomly generated variable X takes on a value less than or equal to -1.
<-ggplot(data=dummy_df2, aes(list_seq, y)) +
fig4bgeom_area(fill = "green") +
gghighlight(list_seq < -1)+
theme_bw()
fig4b
This will return the probability that the randomly generated variable X takes on a value less than or equal to 2.
<-ggplot(data=dummy_df2, aes(list_seq, y)) +
fig4cgeom_area(fill = "green") +
gghighlight(list_seq < 2)+
theme_bw()
fig4c
2.3 Sequence from -10 to 10
#99% of a normal distribution falls between -3 and 3
<-seq(-10, 10, by =0.01)
list_seq<-pnorm(list_seq, mean = 0, sd = 1)
dummy_df<-data.frame(dummy_df)
dummy_dfnames(dummy_df)<-"y"
<-cbind(dummy_df, list_seq) dummy_df2
<-ggplot()+
figure_5geom_step(data=dummy_df2, mapping=aes(x=list_seq, y=y)) +
geom_bar(stat="identity")+
theme_bw()+
ggtitle("Sequence: -10, 10")
figure_5
Comparing the two CDFs
<-grid.arrange(figure_4, figure_5, ncol=2) new_fig
2.4 Changing size of the randomly generated variable X: increments of 1
<-seq(-3, 3, by =1)
list_seq list_seq
[1] -3 -2 -1 0 1 2 3
<-pnorm(list_seq, mean = 0, sd = 1)
dummy_df<-data.frame(dummy_df)
dummy_dfnames(dummy_df)<-"y"
<-cbind(dummy_df, list_seq) dummy_df2
<-ggplot()+
figure_6geom_step(data=dummy_df2, mapping=aes(x=list_seq, y=y)) +
geom_bar(stat="identity")+
theme_bw()+
ggtitle("Sequence: -3, 3\n by 1")
figure_6
2.5 Changing size of the randomly generated variable X: increments of 0.1
<-seq(-3, 3, by =0.1)
list_seq<-pnorm(list_seq, mean = 0, sd = 1)
dummy_df<-data.frame(dummy_df)
dummy_dfnames(dummy_df)<-"y"
<-cbind(dummy_df, list_seq) dummy_df2
<-ggplot()+
figure_7geom_step(data=dummy_df2, mapping=aes(x=list_seq, y=y)) +
geom_bar(stat="identity")+
theme_bw()+
ggtitle("Sequence: -3, 3\n by 0.1")
figure_7
2.6 Changing size of the randomly generated variable X: increments of 0.01
<-seq(-3, 3, by =0.01)
list_seq<-pnorm(list_seq, mean = 0, sd = 1)
dummy_df<-data.frame(dummy_df)
dummy_dfnames(dummy_df)<-"y"
<-cbind(dummy_df, list_seq) dummy_df2
<-ggplot()+
figure_8geom_step(data=dummy_df2, mapping=aes(x=list_seq, y=y)) +
geom_bar(stat="identity")+
theme_bw()+
ggtitle("Sequence: -3, 3\n by 0.01")
figure_8
2.7 Visualizing the results
grid.arrange(figure_6, figure_7, figure_8, ncol=3)
3 Generating the inverse of cumulative distribution function (CDF) for a standard normal distribution
The qnorm
function generates a vector for the inverse cumulative distribution function of the normal distribution with a mean \(\mu\) and a standard deviation \(\sigma\). It returns the value x such that pnorm
(x, \(\mu\), \(\sigma\)) = p
\(P(X\leq x) = F_X(x)\)
Let’s use the same data. First let us examine what we created previously.
head(dummy_df2, n = 10)
Because qnorm
works in the opposite direction, we will use the probabilities (that the randomly generated variable X takes on a value less than or equal to x) to recreate the values in our sequence
<-dummy_df2$y
list_seq2<-qnorm(list_seq2, mean = 0, sd = 1)
dummy_df_q<-data.frame(dummy_df_q)
dummy_df_qnames(dummy_df_q)<-"q"
<-cbind(dummy_df_q, list_seq2) dummy_df_q2
Let us now inspect the output
head(dummy_df_q2, n=10)
This looks very familiar. Let us now compare it to the previous dataset
head(dummy_df2, n=10)
head(dummy_df_q2, n=10)
4 Generating the probability density function (PDF) for a standard normal distribution
The dnorm
function generates a vector for a probability density function of the normal distribution. We can specify the mean \(\mu\) and the standard deviation \(\sigma\).
dnorm
(x, \(\mu\), \(\sigma\)) - probability density function of the normal distribution with mean \(\mu\) and standard deviation \(\sigma\)
<-seq(-3, 3, by =0.01)
list_seq<-dnorm(list_seq, mean = 0, sd = 1)
dnorm_df<-data.frame(dnorm_df)
dnorm_dfnames(dnorm_df)<-"value"
<-cbind(dnorm_df, list_seq) dnorm_df2
<-ggplot()+
figure_8geom_step(data=dnorm_df, mapping=aes(y=value, x=list_seq)) +
geom_bar(stat="identity")+
theme_bw()
figure_8
Let us see how changing some of these parameters may affect the appearance of our bell curve
4.1 Sequence from -3 to 3
<-seq(-3, 3, by =0.01)
list_seq<-dnorm(list_seq, mean = 0, sd = 1)
dnorm_df_3<-data.frame(dnorm_df_3)
dnorm_df_3names(dnorm_df_3)<-"value"
<-cbind(dnorm_df_3, list_seq) dnorm_df_3_b
<-ggplot()+
figure_8geom_step(data=dnorm_df_3_b, mapping=aes(y=value, x=list_seq)) +
geom_bar(stat="identity")+
theme_bw()+
ggtitle("Sequence from -3 to 3")
figure_8
4.2 Sequence from -10 to 10
<-seq(-10, 10, by =0.01)
list_seq<-dnorm(list_seq, mean = 0, sd = 1)
dnorm_df_10<-data.frame(dnorm_df_10)
dnorm_df_10names(dnorm_df_10)<-"value"
<-cbind(dnorm_df_10, list_seq) dnorm_df_10_b
<-ggplot()+
figure_9geom_step(data=dnorm_df_10_b, mapping=aes(y=value, x=list_seq)) +
geom_bar(stat="identity")+
theme_bw()+
ggtitle("Sequence from -10 to 10")
figure_9
Comparing them
grid.arrange(figure_8, figure_9, ncol=2)
Let us see what happens if we change the increment
<-seq(-3, 3, by =0.01)
list_seq<-dnorm(list_seq, mean = 0, sd = 1)
dnorm_df_inc3<-data.frame(dnorm_df_inc3)
dnorm_df_inc3names(dnorm_df_inc3)<-"value"
<-cbind(dnorm_df_inc3, list_seq)
dnorm_df_inc3_b
<-ggplot()+
figure_10geom_step(data=dnorm_df_inc3_b, mapping=aes(y=value, x=list_seq)) +
geom_bar(stat="identity")+
theme_bw()+
ggtitle("Increments of 0.01")
figure_10
<-seq(-3, 3, by =0.1)
list_seq<-dnorm(list_seq, mean = 0, sd = 1)
dnorm_df_inc3<-data.frame(dnorm_df_inc3)
dnorm_df_inc3names(dnorm_df_inc3)<-"value"
<-cbind(dnorm_df_inc3, list_seq)
dnorm_df_inc3_b
<-ggplot()+
figure_11geom_step(data=dnorm_df_inc3_b, mapping=aes(y=value, x=list_seq)) +
geom_bar(stat="identity")+
theme_bw()+
ggtitle("Increments of 0.1")
figure_11
<-seq(-3, 3, by =1)
list_seq<-dnorm(list_seq, mean = 0, sd = 1)
dnorm_df_inc3<-data.frame(dnorm_df_inc3)
dnorm_df_inc3names(dnorm_df_inc3)<-"value"
<-cbind(dnorm_df_inc3, list_seq)
dnorm_df_inc3_b
<-ggplot()+
figure_12geom_step(data=dnorm_df_inc3_b, mapping=aes(y=value, x=list_seq)) +
geom_bar(stat="identity")+
theme_bw()+
ggtitle("Increments of 1")
figure_12
Comparing them
grid.arrange(figure_10, figure_11, figure_12, ncol=3)
Let’s do some problems now.
5 Normal vs. Non-Normal Distributions
As you might suspect, life expectancy in our dataset is not normally distributed.
#Setting path
setwd("/Users/bgpopescu/Dropbox/john_cabot/teaching/stats/week5/lab/")
#Step1: Loading the data
<- read.csv(file = './data/life-expectancy.csv')
life_expectancy_df library(dplyr)
#Step2: Calculating the mean
<-life_expectancy_df%>%
life_expectancy_df2::group_by(Code)%>%
dplyr::summarize(life_exp_mean=mean(Life.expectancy.at.birth..historical.))
dplyr
#Step3: Getting rid of countries
<- c("OWID_KOS", "OWID_WRL", "")
weird_labels <-subset(life_expectancy_df2, !(Code %in% weird_labels))
clean_life_expectancy_df
#Step4: Calculating the mean and SDin the dataset
<-mean(clean_life_expectancy_df$life_exp_mean, na.rm = T)
mean_empirical<-as.character(round(mean_empirical, 0))
mean_empirical_val<-sd(clean_life_expectancy_df$life_exp_mean, na.rm = T)
sd_empirical<-as.character(round(sd(clean_life_expectancy_df$life_exp_mean, na.rm = T), 2))
sd_empirical_val
#Step5: Creating annotation text
<- paste("Mean (\u03BC)", ": ", mean_empirical_val, "\n", "SD(\u03C3)", ": ", sd_empirical_val)
lines
#Step6: Making the histogram for the dataset
<-17
y_coord<-ggplot(data = clean_life_expectancy_df, aes(x=life_exp_mean))+
fig_hist1geom_histogram()+
#Multiplying by 1.5. Otherwise, density will not be visible
geom_density(aes(y=..count..*1.5))+
theme_bw()+
geom_vline(xintercept=mean_empirical, linetype='dashed', col = 'red')+
ggtitle("Life Expectacy as is\n in the Dataset")+
annotate(geom="text",
x=mean_empirical-20,
y=y_coord,
label=lines,
color="red")
#Step7: Generating 235 observations following a normal distribution with same mean and standard deviation
<-rnorm(235, mean=mean_empirical, sd=sd_empirical)
generated_data<-data.frame(generated_data)
generated_datanames(generated_data)<-"life_exp_mean"
#Step8: Making the histogram for the generated dataset
<-ggplot(data = generated_data, aes(x=life_exp_mean))+
fig_hist2geom_histogram()+
#Multiplying by 1.5. Otherwise, density will not be visible
geom_density(aes(y=..count..*1.5))+
theme_bw()+
geom_vline(xintercept=mean_empirical, linetype='dashed', col = 'red')+
ggtitle("Life Expectacy generated\n on a Normal Distribution")+
annotate(geom="text",
x=mean_empirical-20,
y=y_coord,
label=lines,
color="red")
#Step9: Comparing the two histograms
grid.arrange(fig_hist1, fig_hist2, ncol=2)
How do we do we obtain these graphs? Let us work on the first graph.
5.1 Working with the empirical (actual) data
Let us go back to the life expectancy dataset
Copy and paste the life expectancy dataset from Lab 4 or download it again. You can download the life expectancy dataset below:
5.2 Load the dataset
#Setting path
setwd("/Users/bgpopescu/Dropbox/john_cabot/teaching/stats/week5/lab/")
<- read.csv(file = './data/life-expectancy.csv') life_exp_mean
5.3 Calculating the mean
<-life_exp_mean%>%
life_exp_mean2::group_by(Code, Entity)%>%
dplyr::summarize(life_exp_mean=mean(Life.expectancy.at.birth..historical.)) dplyr
5.4 Clearning the Data
<- c("OWID_KOS", "OWID_WRL", "")
weird_labels <-subset(life_exp_mean2, !(Code %in% weird_labels)) clean_life_exp_mean_df
5.5 Calculating the mean and SD in the dataset
<-mean(clean_life_exp_mean_df$life_exp_mean, na.rm = T)
mean_empirical<-as.character(round(mean_empirical, 0))
mean_empirical_val<-sd(clean_life_exp_mean_df$life_exp_mean, na.rm = T)
sd_empirical<-as.character(round(sd(clean_life_exp_mean_df$life_exp_mean, na.rm = T), 2)) sd_empirical_val
5.6 Creating annotation text
<- paste("Mean (\u03BC)", ": ", mean_empirical_val, "\n", "SD(\u03C3)", ": ", sd_empirical_val) lines
5.7 Making the histogram for the dataset
<-17
y_coord<-ggplot(data = clean_life_exp_mean_df, aes(x=life_exp_mean))+
fig_hist1geom_histogram()+
#Multiplying by 1.5. Otherwise, density will not be visible
geom_density(aes(y=..count..*1.5))+
theme_bw()+
geom_vline(xintercept=mean_empirical, linetype='dashed', col = 'red')+
ggtitle("Life Expectacy as is\n in the Dataset")+
annotate(geom="text",
x=mean_empirical-10,
y=y_coord,
label=lines,
color="red")
fig_hist1
6 Working with the made-up (generated) data
6.1 Generating 235 observations following a normal distribution with same mean and standard deviation[235]
<-rnorm(235, mean=mean_empirical, sd=sd_empirical)
generated_data#We can also do:
<-rnorm(nrow(clean_life_exp_mean_df), mean=mean_empirical, sd=sd_empirical)
generated_data<-data.frame(generated_data)
generated_datanames(generated_data)<-"life_exp_mean"
6.2 Making the histogram for the generated dataset
<-ggplot(data = generated_data, aes(x=life_exp_mean))+
fig_hist2geom_histogram()+
#Multiplying by 1.5. Otherwise, density will not be visible
geom_density(aes(y=..count..*1.5))+
theme_bw()+
geom_vline(xintercept=mean_empirical, linetype='dashed', col = 'red')+
ggtitle("Life Expectacy generated\n on a Normal Distribution")+
annotate(geom="text",
x=mean_empirical-10,
y=y_coord,
label=lines,
color="red")
7 Comparing the two histograms
grid.arrange(fig_hist1, fig_hist2, ncol=2)
7.1 Generating more observations following a normal distribution with same mean and standard deviation
<-rnorm(1000, mean=mean_empirical, sd=sd_empirical)
generated_data_1000obs<-data.frame(generated_data_1000obs)
generated_data_1000obsnames(generated_data_1000obs)<-"life_exp_mean"
7.2 Making the histogram for the generated dataset (1000 obs.)
<-ggplot(data = generated_data_1000obs, aes(x=life_exp_mean))+
fig_hist3geom_histogram()+
#Multiplying by 1.5. Otherwise, density will not be visible
geom_density(aes(y=..count..*1.5))+
theme_bw()+
geom_vline(xintercept=mean_empirical, linetype='dashed', col = 'red')+
ggtitle("Life Expectacy generated\n on a Normal Distribution\n 1000 Obs.")+
annotate(geom="text",
x=mean_empirical-10,
y=y_coord,
label=lines,
color="red")
fig_hist3
So, we can see that by having more observations, our dataset approaches something that looks like a normal distribution. Note that our mean \(\mu\) and standard deviation \(\sigma\) are the same as in the original data.
8 Problem
Let’s assume that the data in our original empirical dataset is normally distributed with the mean of 61.93 years and a standard deviation of 8.69 years.
8.1 Life expectancy of less than 50 years
What is the probability that a randomly selected country will have a life expectancy of less than 50 years?
<-subset(life_exp_mean2, life_exp_mean<50)
ctry_50 ctry_50
nrow(ctry_50)/nrow(life_exp_mean2)
[1] 0.1171875
The probability that a randomly selected country will have a life expectancy of less than 50 years is 0.12.
Let us also visualize this probability. We will proceed by calculating the frequency table.
#Step1: Rounding the values
<-life_exp_mean2
new_data2$life_exp_mean_rounded<-round(new_data2$life_exp_mean, 0)
new_data2head(new_data2, n=5)
#Step2: Creating a frequency table
<-table(new_data2$life_exp_mean_rounded)
freq_table freq_table
38 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
1 2 3 4 4 5 5 3 4 5 12 6 5 6 5 6 7 8 11 10 7 8 14 13 5 15
68 69 70 71 72 73 74 75 77
10 15 14 14 8 6 3 8 4
#Step3: Turning the table into a dataframe
<-data.frame(freq_table) freq_table
#Step4: Providing more intuitive names
names(freq_table)[1]<-"life_exp_mean_rounded"
names(freq_table)[2]<-"frequency"
names(freq_table)
[1] "life_exp_mean_rounded" "frequency"
#Step5: Turning factor variables into numeric
$life_exp_mean_rounded<-as.numeric(as.character(freq_table$life_exp_mean_rounded))
freq_tablestr(freq_table)
'data.frame': 35 obs. of 2 variables:
$ life_exp_mean_rounded: num 38 43 44 45 46 47 48 49 50 51 ...
$ frequency : int 1 2 3 4 4 5 5 3 4 5 ...
#Step6: Calculaing the PMFs
$sum_frequency<-sum(freq_table$frequency)
freq_table$pmf<-freq_table$frequency/freq_table$sum_frequency
freq_tablehead(freq_table, n=5)
The next step is to create the CDF. A CDF tells us probability that the randomly generated variable X takes on a value less than or equal to x. We will be using the function that we created previously.
#Step7: Defining the function
<- function(t, x) {
EvalCdf = 0.0
count for (value in t){
if (value <= x){
<- count+1
count
}
}= count / length(t)
prob return(prob)
}
#Step8: Making list of all rounded ages
<-new_data2$life_exp_mean_rounded list_all_ages_rounded
Evaluating the CDF and saving it to a list
#Step9: Evaluating the CDF
<-cbind(sapply(new_data2$life_exp_mean_rounded, function(x){
xsxEvalCdf(list_all_ages_rounded, x)
}))
Adding the list to the dataframe:
#Step10: Adding the list to the dataframe
$cdf <- xsx[,1] new_data2
Plotting the CDS
#Step11: Plotting the CDS
<-ggplot()+
figure_3geom_step(data=new_data2, mapping=aes(x=life_exp_mean_rounded, y=cdf)) +
geom_bar(stat="identity")+
theme_bw()
figure_3
Merging the PMFs to the dataset
#Step12: Selecting only the relevant variables
<-subset(freq_table, select = c(life_exp_mean_rounded, pmf))
freq_table_xhead(freq_table_x, n=10)
#Step13: Performing the left join
<-left_join(new_data2, freq_table_x, by=c("life_exp_mean_rounded" = "life_exp_mean_rounded")) new_data3
#Probability Density
<-ggplot(data=new_data3, mapping=aes(x=life_exp_mean_rounded, y=pmf)) +
fig4ageom_area(fill = "green") +
gghighlight(new_data2$life_exp_mean_rounded < 50)+
theme_bw()+
ggtitle("Probability Density")
fig4a
#Cumulative Probability
<-ggplot()+
fig4bgeom_step(data=new_data2, mapping=aes(x=life_exp_mean_rounded, y=cdf)) +
geom_area(fill = "green") +
geom_bar(stat="identity")+
theme_bw()+
ggtitle("Cumulative Probability")
#Cumulative Probability < 50
<-ggplot(data=new_data2, aes(x=life_exp_mean_rounded, y=cdf)) +
fig4cgeom_area(fill = "green") +
gghighlight(new_data2$life_exp_mean_rounded < 50)+
theme_bw()+
ggtitle("Cumulative Probability")
grid.arrange(fig4a, fig4b, fig4c, ncol=3)
If our distribution was normal, it would look different:
It would look something like the following:
If we assume a normal distribution, we can simply get this answer this way:
<-pnorm(50, mean = mean_empirical, sd = sd_empirical)
res res
[1] 0.08475828
The probability of getting a country with a life expectancy of less than 50 is 0.12, if we assume a normal distribution.
8.2 Life expectancy of more than 70 years
What is the probability that a randomly selected country will have a life expectancy of more than 70 years?
So remember that the CDF is the probability that the randomly generated variable X takes on a value LESS than or equal to x. We will first subset the data to less than 70 and take the complement.
<-subset(new_data2, life_exp_mean_rounded<70)
prob_more_701-max(prob_more_70$cdf)
[1] 0.2226562
Therefore, probability of getting a country with a life expectancy of greater than 70 is 0.22.
If we assume a normal distribution, we can simply get a similar answer this way:
<-pnorm(70, mean = mean_empirical, sd = sd_empirical)
res1-res
[1] 0.1765817
Here is what it looks like
<-ggplot()+
fig4bgeom_step(data=new_data2, mapping=aes(x=life_exp_mean_rounded, y=cdf)) +
geom_area(fill = "green") +
geom_bar(stat="identity")+
theme_bw()
<-ggplot(data=new_data2, aes(x=life_exp_mean_rounded, y=cdf)) +
fig4cgeom_area(fill = "green") +
gghighlight(new_data2$life_exp_mean_rounded > 70)+
theme_bw()
grid.arrange(fig4b, fig4c, ncol=2)
8.3 Percentile
What is the 90th percentile of life expectancy?
We can obtain this by - ordering the ages and calculate the percentile - subset based on CDF and get the minimum
<-new_data2$life_exp_mean_rounded
list_ages<-list_ages[order(list_ages)]
list_agesquantile(new_data2$life_exp_mean_rounded, c(0.90))
90%
72
<-subset(new_data2, cdf>0.9)
pct_90#pct_90
min(pct_90$life_exp_mean_rounded)
[1] 72
The 90th percentile of life expectancy is 72.
If we assume a normal distribution, we can also use qnorm
. This generates a vector for the inverse cumulative distribution function of the normal distribution with a mean \(\mu\) and a standard deviation \(\sigma\). It is not exactly the same, but it is close.
<-qnorm(0.9, mean = mean_empirical, sd = sd_empirical)
res res
[1] 73.06729