Data Importing

library(readr)
# Load the life-expectancy dataset into `data`; the CSV is read directly from
# the zip archive.
# NOTE(review): absolute, machine-specific path - confirm the location before
# running this on another machine.
data <- read_csv(
  file = "C:\\Users\\Fakudze\\Desktop\\archive (23).zip"
)
## Rows: 179 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Country, Region
## dbl (18): Year, Infant_deaths, Under_five_deaths, Adult_mortality, Alcohol_c...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet viewer only in an interactive session; View() is an
# interactive tool and is not useful (and may fail) when the document is
# knitted or the script is run in batch mode.
if (interactive()) {
  View(data)
}

Data Examination

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Count the missing values in every column of the raw data.
data %>%
  is.na() %>%
  colSums()
##                     Country                      Region 
##                           0                           0 
##                        Year               Infant_deaths 
##                           0                           0 
##           Under_five_deaths             Adult_mortality 
##                           0                           0 
##         Alcohol_consumption                 Hepatitis_B 
##                           0                           0 
##                     Measles                         BMI 
##                           0                           0 
##                       Polio                  Diphtheria 
##                           0                           0 
##               Incidents_HIV              GDP_per_capita 
##                           0                           0 
##              Population_mln Thinness_ten_nineteen_years 
##                           0                           0 
##    Thinness_five_nine_years                   Schooling 
##                           0                           0 
##              Economy_status             Life_expectancy 
##                           0                           0
# Print a compact overview of the columns: name, type and first values.
data %>%
  glimpse()
## Rows: 179
## Columns: 20
## $ Country                     <chr> "Afghanistan", "Albania", "Algeria", "Ango…
## $ Region                      <chr> "Asia", "Rest of Europe", "Africa", "Afric…
## $ Year                        <dbl> 2007.5, 2007.5, 2007.5, 2007.5, 2007.5, 20…
## $ Infant_deaths               <dbl> 71.08125, 15.25625, 26.75625, 88.76875, 9.…
## $ Under_five_deaths           <dbl> 98.61250, 17.14375, 31.19375, 144.16250, 1…
## $ Adult_mortality             <dbl> 265.80497, 83.13297, 113.43928, 297.84406,…
## $ Alcohol_consumption         <dbl> 0.016125, 4.696875, 0.400625, 4.935625, 7.…
## $ Hepatitis_B                 <dbl> 64.5625, 98.0000, 88.3125, 68.8125, 98.250…
## $ Measles                     <dbl> 24.3750, 95.9375, 93.2500, 64.0000, 75.437…
## $ BMI                         <dbl> 22.46250, 25.85625, 24.86875, 22.51875, 25…
## $ Polio                       <dbl> 55.3750, 98.1250, 91.7500, 35.7500, 96.937…
## $ Diphtheria                  <dbl> 55.1250, 98.0625, 91.8750, 55.5625, 98.312…
## $ Incidents_HIV               <dbl> 0.022500, 0.025625, 0.021875, 1.303750, 0.…
## $ GDP_per_capita              <dbl> 408.5625, 3071.1250, 3745.1250, 2647.8125,…
## $ Population_mln              <dbl> 27.450625, 2.969375, 34.820625, 21.623750,…
## $ Thinness_ten_nineteen_years <dbl> 16.58125, 1.61875, 6.09375, 6.19375, 3.425…
## $ Thinness_five_nine_years    <dbl> 15.58125, 1.70000, 5.97500, 6.66875, 3.375…
## $ Schooling                   <dbl> 2.90000, 9.24375, 6.99375, 4.60625, 9.0187…
## $ Economy_status              <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, …
## $ Life_expectancy             <dbl> 59.65625, 75.95000, 73.78750, 52.82500, 75…

Data extraction and cleaning

# Give the columns used in the analysis shorter names.
data <- data %>%
  rename(
    LifeExpectancy = Life_expectancy,
    HepatitisB = Hepatitis_B,
    AdultMortality = Adult_mortality,
    InfantDeaths = Infant_deaths,
    Under5deaths = Under_five_deaths,
    AlcoholConsumption = Alcohol_consumption,
    HIVincidence = Incidents_HIV
  )

# Keep only the variables that the analysis below works with.
sample_data <- data %>%
  select(
    LifeExpectancy, Population_mln, GDP_per_capita,
    HIVincidence, Polio, BMI, HepatitisB,
    AlcoholConsumption, AdultMortality,
    InfantDeaths, Region, Country
  )

# View() is an interactive tool; skip it when knitting or running in batch.
if (interactive()) {
  View(sample_data)
}

Data Analysis

What is the relationship between Alcohol consumption and Life expectancy across countries?

Descriptive statistics

# Five-number summary plus mean of alcohol consumption.
sample_data %>%
  pull(AlcoholConsumption) %>%
  summary()
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##  0.000025  1.317813  4.209375  4.820882  7.843438 15.100000
# Five-number summary plus mean of life expectancy.
sample_data %>%
  pull(LifeExpectancy) %>%
  summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   45.61   62.30   71.51   68.86   74.94   82.46

Scatter plot with a regression line

# Scatter plot of life expectancy against alcohol consumption, points coloured
# by Region, with a single linear fit (and its confidence band) over all
# points.
sample_data %>%
  ggplot(aes(AlcoholConsumption, LifeExpectancy)) +
  geom_point(aes(color = Region)) +
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(
    title = "Alcohol Consumption Vs Life expectancy",
    x = "Alcohol Consumption (Litres per capita)",
    y = "Life expectancy (years)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

How does the prevalence of Polio or Hepatitis B vaccination influence life expectancy?

correlation between vaccinations and Life expectancy

# Pairwise correlations between the two vaccination rates and life expectancy.
sample_data %>%
  select(Polio, HepatitisB, LifeExpectancy) %>%
  cor()
##                    Polio HepatitisB LifeExpectancy
## Polio          1.0000000  0.8106981      0.6823676
## HepatitisB     0.8106981  1.0000000      0.4517228
## LifeExpectancy 0.6823676  0.4517228      1.0000000
# Scatter plot of life expectancy against polio vaccination rate, points
# coloured by Region, with a single linear fit and its confidence band.
sample_data %>%
  ggplot(aes(Polio, LifeExpectancy)) +
  geom_point(aes(color = Region)) +
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(
    title = "Polio vaccine Rate Vs Life Expectancy",
    x = "Polio Vaccine (%)",
    y = "Life Expectancy (years)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Scatter plot of life expectancy against Hepatitis B vaccination rate, points
# coloured by Region, with a single linear fit and its confidence band.
# Title, axis labels and theme added so the figure matches the other plots.
sample_data %>%
  ggplot(aes(HepatitisB, LifeExpectancy)) +
  geom_point(aes(color = Region)) +
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    title = "Hepatitis B Vaccine Rate Vs Life Expectancy",
    x = "Hepatitis B Vaccine (%)",
    y = "Life Expectancy (years)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

To what extent does adult mortality contribute to the variations in life expectancy by Region?

# Five-number summary plus mean of adult mortality.
sample_data %>%
  pull(AdultMortality) %>%
  summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   57.71  107.05  164.43  192.25  247.52  572.97
# Scatter plot of life expectancy against adult mortality, points coloured by
# Region, with a single linear fit drawn without a confidence band.
sample_data %>%
  ggplot(aes(AdultMortality, LifeExpectancy)) +
  geom_point(aes(color = Region)) +
  # FALSE rather than F: F is an ordinary variable that can be reassigned.
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = "Adult Mortality Vs Life expectancy",
    x = "Adult Mortality (per 1000)",
    y = "Life expectancy (years)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Correlation check

# Pearson correlation between adult mortality and life expectancy.
with(sample_data, cor(AdultMortality, LifeExpectancy))
## [1] -0.9474846

Model Illustration

# Regress life expectancy on adult mortality with Region as an additional
# (categorical) predictor, then print coefficients and fit statistics.
model <- lm(
  formula = LifeExpectancy ~ AdultMortality + Region,
  data = sample_data
)
summary(model)
## 
## Call:
## lm(formula = LifeExpectancy ~ AdultMortality + Region, data = sample_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4112 -1.4057 -0.0171  1.3618  7.8081 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         78.722791   0.874579  90.012  < 2e-16 ***
## AdultMortality                      -0.065425   0.002515 -26.017  < 2e-16 ***
## RegionAsia                           1.930700   0.698693   2.763 0.006356 ** 
## RegionCentral America and Caribbean  4.417062   0.773974   5.707 5.05e-08 ***
## RegionEuropean Union                 5.822359   0.800792   7.271 1.27e-11 ***
## RegionMiddle East                    2.525263   0.914108   2.763 0.006371 ** 
## RegionNorth America                  5.706804   1.575217   3.623 0.000385 ***
## RegionOceania                        1.451747   0.914770   1.587 0.114380    
## RegionRest of Europe                 4.869456   0.859551   5.665 6.20e-08 ***
## RegionSouth America                  4.216879   0.897451   4.699 5.40e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.485 on 169 degrees of freedom
## Multiple R-squared:  0.9307, Adjusted R-squared:  0.927 
## F-statistic: 252.1 on 9 and 169 DF,  p-value: < 2.2e-16
# `model1` has exactly the same formula and data as `model` fitted above, so
# reuse that fit instead of estimating the identical regression a second time.
# NOTE(review): if a different specification was intended here (for example an
# AdultMortality-by-Region interaction), replace this with that model.
model1 <- model
summary(model1)
## 
## Call:
## lm(formula = LifeExpectancy ~ AdultMortality + Region, data = sample_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4112 -1.4057 -0.0171  1.3618  7.8081 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         78.722791   0.874579  90.012  < 2e-16 ***
## AdultMortality                      -0.065425   0.002515 -26.017  < 2e-16 ***
## RegionAsia                           1.930700   0.698693   2.763 0.006356 ** 
## RegionCentral America and Caribbean  4.417062   0.773974   5.707 5.05e-08 ***
## RegionEuropean Union                 5.822359   0.800792   7.271 1.27e-11 ***
## RegionMiddle East                    2.525263   0.914108   2.763 0.006371 ** 
## RegionNorth America                  5.706804   1.575217   3.623 0.000385 ***
## RegionOceania                        1.451747   0.914770   1.587 0.114380    
## RegionRest of Europe                 4.869456   0.859551   5.665 6.20e-08 ***
## RegionSouth America                  4.216879   0.897451   4.699 5.40e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.485 on 169 degrees of freedom
## Multiple R-squared:  0.9307, Adjusted R-squared:  0.927 
## F-statistic: 252.1 on 9 and 169 DF,  p-value: < 2.2e-16

How do BMI and Alcohol consumption together affect life expectancy?

correlation matrix

# Pairwise correlations between BMI, alcohol consumption and life expectancy.
sample_data %>%
  select(BMI, AlcoholConsumption, LifeExpectancy) %>%
  cor()
##                          BMI AlcoholConsumption LifeExpectancy
## BMI                1.0000000          0.2926699      0.5944032
## AlcoholConsumption 0.2926699          1.0000000      0.4160609
## LifeExpectancy     0.5944032          0.4160609      1.0000000
# Regress life expectancy on BMI and alcohol consumption (additive terms, no
# interaction), then print coefficients and fit statistics.
model2 <- lm(
  formula = LifeExpectancy ~ BMI + AlcoholConsumption,
  data = sample_data
)
summary(model2)
## 
## Call:
## lm(formula = LifeExpectancy ~ BMI + AlcoholConsumption, data = sample_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.5145  -3.9537   0.7335   5.0808  16.9651 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         10.8964     6.2544   1.742   0.0832 .  
## BMI                  2.1955     0.2556   8.591 4.50e-15 ***
## AlcoholConsumption   0.6221     0.1414   4.401 1.87e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.06 on 176 degrees of freedom
## Multiple R-squared:  0.4174, Adjusted R-squared:  0.4108 
## F-statistic: 63.05 on 2 and 176 DF,  p-value: < 2.2e-16

What combination of health indicators best predicts life expectancy using multiple regression analysis?

selecting health indicators

# Response (LifeExpectancy) plus the health indicators used as predictors.
# InfantDeaths was previously listed twice; select() keeps a column only once,
# so dropping the repeat leaves the resulting columns unchanged.
healthInc_data <- sample_data %>%
  select(
    LifeExpectancy, BMI, InfantDeaths, AdultMortality,
    AlcoholConsumption, HIVincidence, HepatitisB
  )

Fitting a multiple linear regression model

# Regress life expectancy on every other column of healthInc_data (the `.` on
# the right-hand side), then print coefficients and fit statistics.
model3 <- lm(
  formula = LifeExpectancy ~ .,
  data = healthInc_data
)
summary(model3)
## 
## Call:
## lm(formula = LifeExpectancy ~ ., data = healthInc_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2061 -0.8190  0.0572  0.9162  3.9875 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        87.151977   1.942331  44.870  < 2e-16 ***
## BMI                -0.111930   0.062895  -1.780   0.0769 .  
## InfantDeaths       -0.149021   0.009371 -15.902  < 2e-16 ***
## AdultMortality     -0.052574   0.002326 -22.605  < 2e-16 ***
## AlcoholConsumption  0.147017   0.030444   4.829 3.02e-06 ***
## HIVincidence        0.162780   0.070948   2.294   0.0230 *  
## HepatitisB         -0.020357   0.009051  -2.249   0.0258 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.346 on 172 degrees of freedom
## Multiple R-squared:  0.9793, Adjusted R-squared:  0.9786 
## F-statistic:  1357 on 6 and 172 DF,  p-value: < 2.2e-16

Variable importance

library(caret)
## Warning: package 'caret' was built under R version 4.5.1
## Loading required package: lattice
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.1
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Fix the RNG seed so the random parts of the fit below are repeatable.
set.seed(100)

# Fit the model used for the variable-importance ranking.
# The earlier call passed `model = "lm"`, which is not an argument of
# caret::train(); it fell through `...` and train() used its default method,
# "rf" (which is why the importance table is headed "rf variable importance").
# The method is now stated explicitly and the ignored argument is dropped.
# NOTE(review): use method = "lm" instead if a linear-model ranking was
# intended; the importance table and the plot of `model_imp` would then need
# to be regenerated/revisited.
model_imp <- train(
  LifeExpectancy ~ .,
  data = healthInc_data,
  method = "rf"
)

# Print the scaled importance of each predictor in the fitted caret model.
model_imp %>%
  varImp()
## rf variable importance
## 
##                     Overall
## InfantDeaths       100.0000
## AdultMortality      88.8680
## HIVincidence         8.5833
## BMI                  4.6386
## AlcoholConsumption   0.6399
## HepatitisB           0.0000
# Plot the fitted caret model. plot() on a `train` object draws a lattice
# graphic, which does not honour par(mfrow = ...), so the former 2 x 2 layout
# call had no effect on this figure and has been removed.
# NOTE(review): if the 2 x 2 layout was meant for lm diagnostic plots, that
# would be par(mfrow = c(2, 2)) followed by plot(model3) - confirm intent.
plot(model_imp)