library(readr)
# Load the data set into `data`. The path points at a .zip archive; the recorded
# output below shows it was parsed as a comma-delimited table.
# NOTE(review): this is an absolute, machine-specific Windows path -- consider a
# project-relative path so the script can run on other machines.
data <- read_csv("C:\\Users\\Fakudze\\Desktop\\archive (23).zip")
## Rows: 179 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Country, Region
## dbl (18): Year, Infant_deaths, Under_five_deaths, Adult_mortality, Alcohol_c...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only when a person is driving the session;
# View() is a GUI side effect that is pointless (and can fail) in batch runs.
if (interactive()) {
  View(data)
}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Count the missing values in every column of `data`.
data %>%
  is.na() %>%
  colSums()
## Country Region
## 0 0
## Year Infant_deaths
## 0 0
## Under_five_deaths Adult_mortality
## 0 0
## Alcohol_consumption Hepatitis_B
## 0 0
## Measles BMI
## 0 0
## Polio Diphtheria
## 0 0
## Incidents_HIV GDP_per_capita
## 0 0
## Population_mln Thinness_ten_nineteen_years
## 0 0
## Thinness_five_nine_years Schooling
## 0 0
## Economy_status Life_expectancy
## 0 0
# Print a compact, column-wise overview (name, type, first values) of `data`.
data %>%
  glimpse()
## Rows: 179
## Columns: 20
## $ Country <chr> "Afghanistan", "Albania", "Algeria", "Ango…
## $ Region <chr> "Asia", "Rest of Europe", "Africa", "Afric…
## $ Year <dbl> 2007.5, 2007.5, 2007.5, 2007.5, 2007.5, 20…
## $ Infant_deaths <dbl> 71.08125, 15.25625, 26.75625, 88.76875, 9.…
## $ Under_five_deaths <dbl> 98.61250, 17.14375, 31.19375, 144.16250, 1…
## $ Adult_mortality <dbl> 265.80497, 83.13297, 113.43928, 297.84406,…
## $ Alcohol_consumption <dbl> 0.016125, 4.696875, 0.400625, 4.935625, 7.…
## $ Hepatitis_B <dbl> 64.5625, 98.0000, 88.3125, 68.8125, 98.250…
## $ Measles <dbl> 24.3750, 95.9375, 93.2500, 64.0000, 75.437…
## $ BMI <dbl> 22.46250, 25.85625, 24.86875, 22.51875, 25…
## $ Polio <dbl> 55.3750, 98.1250, 91.7500, 35.7500, 96.937…
## $ Diphtheria <dbl> 55.1250, 98.0625, 91.8750, 55.5625, 98.312…
## $ Incidents_HIV <dbl> 0.022500, 0.025625, 0.021875, 1.303750, 0.…
## $ GDP_per_capita <dbl> 408.5625, 3071.1250, 3745.1250, 2647.8125,…
## $ Population_mln <dbl> 27.450625, 2.969375, 34.820625, 21.623750,…
## $ Thinness_ten_nineteen_years <dbl> 16.58125, 1.61875, 6.09375, 6.19375, 3.425…
## $ Thinness_five_nine_years <dbl> 15.58125, 1.70000, 5.97500, 6.66875, 3.375…
## $ Schooling <dbl> 2.90000, 9.24375, 6.99375, 4.60625, 9.0187…
## $ Economy_status <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, …
## $ Life_expectancy <dbl> 59.65625, 75.95000, 73.78750, 52.82500, 75…
# Replace the underscore-separated column names with the compact names that the
# rest of the script refers to. Each argument reads new_name = old_name.
data <- rename(
  data,
  LifeExpectancy     = Life_expectancy,
  HepatitisB         = Hepatitis_B,
  AdultMortality     = Adult_mortality,
  InfantDeaths       = Infant_deaths,
  Under5deaths       = Under_five_deaths,
  AlcoholConsumption = Alcohol_consumption,
  HIVincidence       = Incidents_HIV
)
# Subset `data` to twelve columns, kept in the order listed here.
sample_data <- select(
  data,
  all_of(c(
    "LifeExpectancy", "Population_mln", "GDP_per_capita",
    "HIVincidence", "Polio", "BMI", "HepatitisB",
    "AlcoholConsumption", "AdultMortality",
    "InfantDeaths", "Region", "Country"
  ))
)
# Open the spreadsheet-style viewer only when a person is driving the session;
# View() is a GUI side effect that is pointless (and can fail) in batch runs.
if (interactive()) {
  View(sample_data)
}
# Descriptive statistics ----
# Summary statistics (min, quartiles, median, mean, max) for alcohol consumption.
sample_data %>%
  pull(AlcoholConsumption) %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000025 1.317813 4.209375 4.820882 7.843438 15.100000
# Summary statistics (min, quartiles, median, mean, max) for life expectancy.
sample_data %>%
  pull(LifeExpectancy) %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 45.61 62.30 71.51 68.86 74.94 82.46
# Scatter plot with a regression line ----
# Life expectancy against alcohol consumption: points coloured by Region, with
# one least-squares line drawn over the points.
sample_data %>%
  ggplot(aes(AlcoholConsumption, LifeExpectancy)) +
  # Colour is mapped inside geom_point() only, so the smoother below is fitted
  # to all points together rather than once per Region.
  geom_point(aes(color = Region)) +
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(
    title = "Alcohol Consumption Vs Life expectancy",
    x = "Alcohol Consumption (Litres per capita)",
    y = "Life expectancy (years)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Correlation between vaccinations and life expectancy ----
# Pairwise correlation matrix of the two vaccination columns and life expectancy.
sample_data %>%
  select(Polio, HepatitisB, LifeExpectancy) %>%
  cor()
## Polio HepatitisB LifeExpectancy
## Polio 1.0000000 0.8106981 0.6823676
## HepatitisB 0.8106981 1.0000000 0.4517228
## LifeExpectancy 0.6823676 0.4517228 1.0000000
# Life expectancy against the Polio column: points coloured by Region, with one
# least-squares line drawn over the points.
sample_data %>%
  ggplot(aes(Polio, LifeExpectancy)) +
  # Colour is mapped inside geom_point() only, so the smoother below is fitted
  # to all points together rather than once per Region.
  geom_point(aes(color = Region)) +
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(
    title = "Polio vaccine Rate Vs Life Expectancy",
    x = "Polio Vaccine (%)",
    y = "Life Expectancy (years)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Life expectancy against the HepatitisB column: points coloured by Region, with
# one least-squares line drawn over the points. Title, axis labels and theme are
# added so this figure matches the other scatter plots in the script.
sample_data %>%
  ggplot(aes(HepatitisB, LifeExpectancy)) +
  # Colour is mapped inside geom_point() only, so the smoother below is fitted
  # to all points together rather than once per Region.
  geom_point(aes(color = Region)) +
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    title = "Hepatitis B Vaccine Rate Vs Life Expectancy",
    x = "Hepatitis B Vaccine Rate",
    y = "Life Expectancy (years)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Summary statistics (min, quartiles, median, mean, max) for adult mortality.
sample_data %>%
  pull(AdultMortality) %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 57.71 107.05 164.43 192.25 247.52 572.97
# Life expectancy against adult mortality: points coloured by Region, with one
# least-squares line and no standard-error band.
sample_data %>%
  ggplot(aes(AdultMortality, LifeExpectancy)) +
  # Colour is mapped inside geom_point() only, so the smoother below is fitted
  # to all points together rather than once per Region.
  geom_point(aes(color = Region)) +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = "Adult Mortality Vs Life expectancy",
    x = "Adult Mortality (per 1000)",
    y = "Life expectancy (years)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Correlation coefficient between adult mortality and life expectancy.
with(sample_data, cor(AdultMortality, LifeExpectancy))
## [1] -0.9474846
# Model illustration ----
# Linear regression of life expectancy on adult mortality plus Region.
# The call is left on one line because summary() echoes it under "Call:".
model <- lm(LifeExpectancy ~ AdultMortality + Region, data = sample_data)
# Print the coefficient table, residual summary and fit statistics.
summary(model)
##
## Call:
## lm(formula = LifeExpectancy ~ AdultMortality + Region, data = sample_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.4112 -1.4057 -0.0171 1.3618 7.8081
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78.722791 0.874579 90.012 < 2e-16 ***
## AdultMortality -0.065425 0.002515 -26.017 < 2e-16 ***
## RegionAsia 1.930700 0.698693 2.763 0.006356 **
## RegionCentral America and Caribbean 4.417062 0.773974 5.707 5.05e-08 ***
## RegionEuropean Union 5.822359 0.800792 7.271 1.27e-11 ***
## RegionMiddle East 2.525263 0.914108 2.763 0.006371 **
## RegionNorth America 5.706804 1.575217 3.623 0.000385 ***
## RegionOceania 1.451747 0.914770 1.587 0.114380
## RegionRest of Europe 4.869456 0.859551 5.665 6.20e-08 ***
## RegionSouth America 4.216879 0.897451 4.699 5.40e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.485 on 169 degrees of freedom
## Multiple R-squared: 0.9307, Adjusted R-squared: 0.927
## F-statistic: 252.1 on 9 and 169 DF, p-value: < 2.2e-16
# Linear regression of life expectancy on adult mortality plus Region.
# NOTE(review): this formula and data are identical to the fit stored as `model`
# earlier in the script, so the summary below repeats that output -- confirm
# whether a different specification was intended for `model1`.
model1 <- lm(LifeExpectancy ~ AdultMortality + Region, data = sample_data)
# Print the coefficient table, residual summary and fit statistics.
summary(model1)
##
## Call:
## lm(formula = LifeExpectancy ~ AdultMortality + Region, data = sample_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.4112 -1.4057 -0.0171 1.3618 7.8081
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78.722791 0.874579 90.012 < 2e-16 ***
## AdultMortality -0.065425 0.002515 -26.017 < 2e-16 ***
## RegionAsia 1.930700 0.698693 2.763 0.006356 **
## RegionCentral America and Caribbean 4.417062 0.773974 5.707 5.05e-08 ***
## RegionEuropean Union 5.822359 0.800792 7.271 1.27e-11 ***
## RegionMiddle East 2.525263 0.914108 2.763 0.006371 **
## RegionNorth America 5.706804 1.575217 3.623 0.000385 ***
## RegionOceania 1.451747 0.914770 1.587 0.114380
## RegionRest of Europe 4.869456 0.859551 5.665 6.20e-08 ***
## RegionSouth America 4.216879 0.897451 4.699 5.40e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.485 on 169 degrees of freedom
## Multiple R-squared: 0.9307, Adjusted R-squared: 0.927
## F-statistic: 252.1 on 9 and 169 DF, p-value: < 2.2e-16
# Correlation matrix ----
# Pairwise correlation matrix of BMI, alcohol consumption and life expectancy.
sample_data %>%
  select(BMI, AlcoholConsumption, LifeExpectancy) %>%
  cor()
## BMI AlcoholConsumption LifeExpectancy
## BMI 1.0000000 0.2926699 0.5944032
## AlcoholConsumption 0.2926699 1.0000000 0.4160609
## LifeExpectancy 0.5944032 0.4160609 1.0000000
# Linear regression of life expectancy on BMI and alcohol consumption.
# The call is left on one line because summary() echoes it under "Call:".
model2 <- lm(LifeExpectancy ~ BMI + AlcoholConsumption, data = sample_data)
# Print the coefficient table, residual summary and fit statistics.
summary(model2)
##
## Call:
## lm(formula = LifeExpectancy ~ BMI + AlcoholConsumption, data = sample_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.5145 -3.9537 0.7335 5.0808 16.9651
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.8964 6.2544 1.742 0.0832 .
## BMI 2.1955 0.2556 8.591 4.50e-15 ***
## AlcoholConsumption 0.6221 0.1414 4.401 1.87e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.06 on 176 degrees of freedom
## Multiple R-squared: 0.4174, Adjusted R-squared: 0.4108
## F-statistic: 63.05 on 2 and 176 DF, p-value: < 2.2e-16
# Selecting health indicators ----
# Keep life expectancy plus six health-indicator columns for the models below.
# InfantDeaths was listed twice in the original call; the repeat is dropped
# because a column can only be selected once, so it was redundant.
healthInc_data <- sample_data %>%
  select(
    LifeExpectancy, BMI, InfantDeaths, AdultMortality,
    AlcoholConsumption, HIVincidence, HepatitisB
  )
# Linear regression of life expectancy on every other column of healthInc_data
# (the `.` on the right-hand side expands to all remaining columns).
model3 <- lm(LifeExpectancy ~ ., data = healthInc_data)
# Print the coefficient table, residual summary and fit statistics.
summary(model3)
##
## Call:
## lm(formula = LifeExpectancy ~ ., data = healthInc_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2061 -0.8190 0.0572 0.9162 3.9875
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.151977 1.942331 44.870 < 2e-16 ***
## BMI -0.111930 0.062895 -1.780 0.0769 .
## InfantDeaths -0.149021 0.009371 -15.902 < 2e-16 ***
## AdultMortality -0.052574 0.002326 -22.605 < 2e-16 ***
## AlcoholConsumption 0.147017 0.030444 4.829 3.02e-06 ***
## HIVincidence 0.162780 0.070948 2.294 0.0230 *
## HepatitisB -0.020357 0.009051 -2.249 0.0258 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.346 on 172 degrees of freedom
## Multiple R-squared: 0.9793, Adjusted R-squared: 0.9786
## F-statistic: 1357 on 6 and 172 DF, p-value: < 2.2e-16
library(caret)
## Warning: package 'caret' was built under R version 4.5.1
## Loading required package: lattice
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.1
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Fix the RNG state before fitting so the result can be reproduced.
set.seed(100)
# train() chooses the learner through `method`, not `model`. The original call
# passed `model = "lm"`, which did not select a linear model: the recorded
# output is headed "rf variable importance", i.e. the default random forest was
# fitted. The learner is now named explicitly and the stray argument removed.
# NOTE(review): if a linear model was actually intended, use method = "lm"
# instead and re-check the later plot(model_imp) call.
model_imp <- train(
  LifeExpectancy ~ .,
  data = healthInc_data,
  method = "rf"
)
# Rank the predictors by their importance in the fitted model.
varImp(model_imp)
## rf variable importance
##
## Overall
## InfantDeaths 100.0000
## AdultMortality 88.8680
## HIVincidence 8.5833
## BMI 4.6386
## AlcoholConsumption 0.6399
## HepatitisB 0.0000
# Request a 2 x 2 grid for base-graphics plots, then plot the caret fit.
# NOTE(review): par(mfrow) only lays out base-graphics output; plot() on a
# caret train object is believed to draw with a grid-based system, in which
# case the 2 x 2 layout has no effect -- confirm. If the usual four lm
# diagnostic panels were intended, the target should be one of the lm fits.
# NOTE(review): the par() change is never restored.
par(mfrow = c(2, 2))
plot(model_imp)