Data loading and examination

library(readr)
# Read the maternal-risk dataset into a tibble. The path points at a zip
# archive, which read_csv() parses directly.
# NOTE(review): absolute, machine-specific path - confirm the file location
# before running this on another computer.
data <- read_csv(file = "C:/Users/Fakudze/Downloads/archive (31).zip")
## Rows: 1205 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Risk Level
## dbl (11): Age, Systolic BP, Diastolic, BS, Body Temp, BMI, Previous Complica...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet viewer only in an interactive session; View() is a
# GUI helper and serves no purpose when the script is rendered or run in batch.
if (interactive()) {
  View(data)
}
library(skimr)
## Warning: package 'skimr' was built under R version 4.5.1
# Per-column overview of the raw data: missing counts, quantiles, histograms.
skimr::skim(data)
Data summary
Name data
Number of rows 1205
Number of columns 12
_______________________
Column type frequency:
character 1
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Risk Level 18 0.99 3 4 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Age 0 1.00 27.48 9.20 10 21.00 25.0 31.0 65 ▅▇▂▁▁
Systolic BP 5 1.00 116.82 18.72 70 100.00 120.0 130.0 200 ▂▇▃▁▁
Diastolic 4 1.00 77.17 14.31 40 65.00 80.0 90.0 140 ▃▇▅▁▁
BS 2 1.00 7.50 3.05 3 6.00 6.9 7.9 19 ▅▇▁▁▁
Body Temp 0 1.00 98.40 1.09 97 98.00 98.0 98.0 103 ▇▁▁▁▁
BMI 18 0.99 23.32 3.88 0 20.45 23.0 25.0 37 ▁▁▇▇▁
Previous Complications 2 1.00 0.18 0.38 0 0.00 0.0 0.0 1 ▇▁▁▁▂
Preexisting Diabetes 2 1.00 0.29 0.45 0 0.00 0.0 1.0 1 ▇▁▁▁▃
Gestational Diabetes 0 1.00 0.12 0.32 0 0.00 0.0 0.0 1 ▇▁▁▁▁
Mental Health 0 1.00 0.33 0.47 0 0.00 0.0 1.0 1 ▇▁▁▁▅
Heart Rate 2 1.00 75.82 7.23 58 70.00 76.0 80.0 92 ▁▃▇▂▂

Data cleaning

Reviewing Age variable (checking for outliers and replacing them)

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.1
## Warning: package 'purrr' was built under R version 4.5.1
## Warning: package 'forcats' was built under R version 4.5.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.1.0
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Visual check for outliers in Age (title typo "fopr" corrected).
boxplot(data$Age, main = "Boxplot for outliers detection on Age")

Cleaning Systolic BP variable (checking for outliers and replacing them)

# Visual check for outliers in systolic blood pressure.
boxplot(data$`Systolic BP`, main = "Boxplot for outlier detection on Systolic BP")

# Fill missing systolic readings with the column median (robust to extreme
# values). `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
data <- data %>%
  mutate(`Systolic BP` = if_else(is.na(`Systolic BP`),
                                 median(`Systolic BP`, na.rm = TRUE),
                                 `Systolic BP`))

Cleaning Diastolic variable (checking for outliers and replacing them)

# Visual check for outliers in diastolic blood pressure.
boxplot(data$Diastolic, main = "Boxplot for detecting outliers on Diastolic")

# Fill missing diastolic readings with the column median (robust to extreme
# values). `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
data <- data %>%
  mutate(Diastolic = if_else(is.na(Diastolic),
                             median(Diastolic, na.rm = TRUE),
                             Diastolic))

Cleaning BS variable (checking for outliers and replacing them)

# Visual check for outliers in blood sugar.
boxplot(data$BS, main = "Boxplot for detecting outliers on BS")

# Fill missing blood-sugar values with the column median (robust to extreme
# values). `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
data <- data %>%
  mutate(BS = if_else(is.na(BS),
                      median(BS, na.rm = TRUE),
                      BS))

Cleaning BMI variable (checking for outliers and replacing them)

# Visual check for outliers in BMI.
boxplot(data$BMI, main = "boxplot for outlier detection on BMI")

# Fill missing BMI values with the column median. `na.rm = TRUE` is required:
# without it median() returns NA whenever the column contains an NA, so the
# replacement value is itself NA and no gap is actually filled.
# NOTE(review): the data summary reports a minimum BMI of 0, which is not a
# plausible measurement - consider treating such values as missing as well.
data <- data %>%
  mutate(BMI = if_else(is.na(BMI),
                       median(BMI, na.rm = TRUE),
                       BMI))

Cleaning Heart Rate variable (checking for outliers and replacing them)

# Visual check for outliers in heart rate (the chart is a boxplot, so the
# title now says so instead of "Barplot").
boxplot(data$`Heart Rate`, main = "Boxplot for detecting outliers on Heart Rate")

# Fill missing heart-rate values with the column mean.
# NOTE(review): the other numeric columns are imputed with the median;
# confirm that using the mean here is intentional.
data <- data %>%
  mutate(`Heart Rate` = if_else(is.na(`Heart Rate`),
                                mean(`Heart Rate`, na.rm = TRUE),
                                `Heart Rate`))

Recoding character indicators and converting to factor

library(dplyr)
# Turn the four 0/1 indicator columns into factors and give the outcome an
# explicit level order so that "Low" is the reference level.
data <- data %>%
  mutate(
    across(
      c(`Preexisting Diabetes`, `Gestational Diabetes`,
        `Mental Health`, `Previous Complications`),
      as.factor
    ),
    `Risk Level` = factor(`Risk Level`, levels = c("Low", "High"))
  )

Replacing Missing values with Unknown in factor variables

library(forcats)
# Recode missing values in the indicator factors as an explicit "Unknown"
# level. fct_na_value_to_level() replaces fct_explicit_na(), which was
# deprecated in forcats 1.0.0. fct_drop(only = "Unknown") then removes the
# "Unknown" level again from any column that had no missing values, so
# columns without gaps keep exactly their original levels.
data <- data %>%
  mutate(across(
    c(`Preexisting Diabetes`, `Gestational Diabetes`,
      `Mental Health`, `Previous Complications`),
    ~ fct_drop(
      fct_na_value_to_level(as.factor(.x), level = "Unknown"),
      only = "Unknown"
    )
  ))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(...)`.
## Caused by warning:
## ! `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.

Extracting clean data for analysis

# Keep only the rows that are complete in every column.
clean_data <- na.omit(data)

# Summarise the cleaned data set.
skim(clean_data)
Data summary
Name clean_data
Number of rows 1173
Number of columns 12
_______________________
Column type frequency:
factor 5
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Previous Complications 0 1 FALSE 3 0: 963, 1: 209, Unk: 1
Preexisting Diabetes 0 1 FALSE 3 0: 831, 1: 341, Unk: 1
Gestational Diabetes 0 1 FALSE 2 0: 1036, 1: 137
Mental Health 0 1 FALSE 2 0: 781, 1: 392
Risk Level 0 1 FALSE 2 Low: 706, Hig: 467

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Age 0 1 27.52 9.25 10 21 25.0 32.0 65 ▅▇▂▁▁
Systolic BP 0 1 116.85 18.69 70 100 120.0 130.0 200 ▂▇▃▁▁
Diastolic 0 1 77.26 14.29 40 65 80.0 90.0 140 ▃▇▅▁▁
BS 0 1 7.52 3.06 3 6 6.9 7.9 19 ▅▇▁▁▁
Body Temp 0 1 98.39 1.08 97 98 98.0 98.0 103 ▇▁▁▁▁
BMI 0 1 23.34 3.88 0 21 23.0 25.0 37 ▁▁▇▇▂
Heart Rate 0 1 75.80 7.22 58 70 76.0 80.0 92 ▁▃▇▂▂

Checking for class imbalance on Risk Level

# Count observations per outcome class to gauge class balance.
table(clean_data[["Risk Level"]])
## 
##  Low High 
##  706  467

Data Analysis

Does Age significantly predict Risk Level?

# Distribution of Age within each Risk Level group.
ggplot(clean_data, aes(x = `Risk Level`, y = Age, fill = `Risk Level`)) +
  geom_boxplot() +
  labs(title = "The relationship between Risk Level and Age")

# Single-predictor logistic regression of Risk Level on Age.
model_Age <- glm(
  `Risk Level` ~ Age,
  family = binomial,
  data = clean_data
)
summary(model_Age)
## 
## Call:
## glm(formula = `Risk Level` ~ Age, family = binomial, data = clean_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.624897   0.196880  -8.253  < 2e-16 ***
## Age          0.043679   0.006735   6.485 8.86e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1577.1  on 1172  degrees of freedom
## Residual deviance: 1532.4  on 1171  degrees of freedom
## AIC: 1536.4
## 
## Number of Fisher Scoring iterations: 4

The boxplot shows that women in the High Risk category have a higher median Age than those in the Low Risk category. The logistic model reveals that for every one-year increase in Age, the log-odds of a woman being in the High Risk category increase by 0.0437 (an odds ratio of about 1.04). Age is a statistically significant predictor of Risk Level.

Does Blood Sugar (BS) significantly predict Risk Level?

# Horizontal boxplots of blood sugar within each Risk Level group.
ggplot(clean_data, aes(x = `Risk Level`, y = BS, fill = `Risk Level`)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "The relationship between Risk Level and Blood Sugar")

# Single-predictor logistic regression of Risk Level on blood sugar.
model_BS <- glm(
  `Risk Level` ~ BS,
  family = binomial,
  data = clean_data
)
summary(model_BS)
## 
## Call:
## glm(formula = `Risk Level` ~ BS, family = binomial, data = clean_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.24639    0.48309  -15.00   <2e-16 ***
## BS           0.93987    0.06729   13.97   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1577.1  on 1172  degrees of freedom
## Residual deviance: 1006.4  on 1171  degrees of freedom
## AIC: 1010.4
## 
## Number of Fisher Scoring iterations: 6

The box plot reveals that the High Risk group has a higher median Blood Sugar value than the Low Risk group. The results from this model reveal that Blood Sugar is a statistically significant predictor of Risk Level. The coefficient of 0.93987 means that each one-unit increase in Blood Sugar raises the log-odds of being in the High Risk level by 0.93987 (an odds ratio of about 2.56).

What is the relationship between BMI and Risk Level?

# Jittered points of BMI for each Risk Level group.
ggplot(clean_data, aes(x = `Risk Level`, y = BMI)) +
  geom_jitter(width = 0.2, colour = "red") +
  labs(title = "The relationship between Risk Level and BMI")

# Single-predictor logistic regression of Risk Level on BMI.
model_BMI <- glm(
  `Risk Level` ~ BMI,
  family = binomial,
  data = clean_data
)

summary(model_BMI)
## 
## Call:
## glm(formula = `Risk Level` ~ BMI, family = binomial, data = clean_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.33491    0.58999  -15.82   <2e-16 ***
## BMI          0.37841    0.02479   15.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1577.1  on 1172  degrees of freedom
## Residual deviance: 1208.2  on 1171  degrees of freedom
## AIC: 1212.2
## 
## Number of Fisher Scoring iterations: 4

The plot suggests that the High Risk group tends to have higher Body Mass Index values than the Low Risk group. The results from this model reveal that a pregnant woman’s Body Mass Index is a statistically significant predictor of Risk Level. The positive coefficient of 0.37841 means that as Body Mass Index increases, the odds of being in the High Risk level also increase. For a one-unit increase in a woman’s Body Mass Index, the log-odds of her being in the High Risk category increase by 0.37841 (an odds ratio of about 1.46); the coefficient is a change in log-odds, not a probability.

Do Previous Complications increase the likelihood of High Risk Level?

# Side-by-side counts of Risk Level within each Previous Complications group.
ggplot(clean_data, aes(x = `Previous Complications`, fill = `Risk Level`)) +
  geom_bar(position = "dodge") +
  labs(title = "The relationship between Risk Level and Previous Complications")

# Logistic regression of Risk Level on the Previous Complications factor
# (level "0" is the reference).
model_PC <- glm(
  `Risk Level` ~ `Previous Complications`,
  family = binomial,
  data = clean_data
)

summary(model_PC)
## 
## Call:
## glm(formula = `Risk Level` ~ `Previous Complications`, family = binomial, 
##     data = clean_data)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -0.97893    0.07232 -13.535   <2e-16 ***
## `Previous Complications`1         4.50037    0.42050  10.702   <2e-16 ***
## `Previous Complications`Unknown  14.54499  535.41117   0.027    0.978    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1577.1  on 1172  degrees of freedom
## Residual deviance: 1183.7  on 1170  degrees of freedom
## AIC: 1189.7
## 
## Number of Fisher Scoring iterations: 12

The estimate of 4.50037 means that, compared to the reference group (Previous Complications 0), the log-odds of being in the High Risk Level increase by 4.50037 when Previous Complications is 1 (an odds ratio of about 90). The estimate of 14.54499 is the corresponding change in log-odds when Previous Complications is Unknown; however, this estimate is not statistically significant (p = 0.978) and has a very large standard error (535.41) because only one observation falls in the Unknown category, so it should not be interpreted.

How do Heart Rate and Systolic BP together influence Risk Level?

# Logistic regression of Risk Level on heart rate and systolic blood pressure
# jointly, so each coefficient is adjusted for the other predictor.
model_HS <- glm(
  `Risk Level` ~ `Heart Rate` + `Systolic BP`,
  family = binomial,
  data = clean_data
)

summary(model_HS)
## 
## Call:
## glm(formula = `Risk Level` ~ `Heart Rate` + `Systolic BP`, family = binomial, 
##     data = clean_data)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -17.272401   1.087241 -15.886  < 2e-16 ***
## `Heart Rate`    0.168549   0.012061  13.975  < 2e-16 ***
## `Systolic BP`   0.034087   0.004165   8.185 2.73e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1577.1  on 1172  degrees of freedom
## Residual deviance: 1202.0  on 1170  degrees of freedom
## AIC: 1208
## 
## Number of Fisher Scoring iterations: 4

The results from this model reveal that a pregnant woman’s Heart Rate is a statistically significant predictor of Risk Level. The positive coefficient of 0.168549 means that as Heart Rate increases, the odds of being in the High Risk level also increase. For a one-unit increase in Heart Rate, holding Systolic BP constant, the log-odds of being in the High Risk category increase by 0.168549 (an odds ratio of about 1.18); the coefficient is a change in log-odds, not a probability.

The results from this model reveal that a pregnant woman’s Systolic BP is a statistically significant predictor of Risk Level. The positive coefficient of 0.034087 means that as Systolic BP increases, the odds of being in the High Risk level also increase. For a one-unit increase in Systolic BP, holding Heart Rate constant, the log-odds of being in the High Risk category increase by 0.034087 (an odds ratio of about 1.03); the coefficient is a change in log-odds, not a probability.

Which factors among Age, BS, BMI, and Mental Health best predict Risk Level?

library(caret)
## Warning: package 'caret' was built under R version 4.5.1
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.1
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Multivariable logistic regression of Risk Level on Age, BMI, blood sugar
# and the Mental Health indicator.
model <- glm(
  `Risk Level` ~ Age + BMI + BS + `Mental Health`,
  family = binomial,
  data = clean_data
)

summary(model)
## 
## Call:
## glm(formula = `Risk Level` ~ Age + BMI + BS + `Mental Health`, 
##     family = binomial, data = clean_data)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -10.42337    0.79627 -13.090  < 2e-16 ***
## Age               -0.10490    0.01585  -6.619 3.61e-11 ***
## BMI                0.24158    0.03146   7.678 1.61e-14 ***
## BS                 0.83782    0.07604  11.018  < 2e-16 ***
## `Mental Health`1   2.93717    0.22073  13.307  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1577.1  on 1172  degrees of freedom
## Residual deviance:  637.5  on 1168  degrees of freedom
## AIC: 647.5
## 
## Number of Fisher Scoring iterations: 6
# Rank the four predictors by variable importance.
# train() chooses the algorithm through `method`; it has no `model` argument,
# so a value passed as `model = "glm"` is ignored and the default random
# forest ("rf") is fitted. The method is stated explicitly here so the code
# matches the "rf variable importance" results it produces. Use
# `method = "glm"` instead if logistic-regression importance is wanted.
# Random forests and resampling are stochastic, so a seed is fixed to make the
# ranking reproducible.
set.seed(123)
model_best_predict <- train(`Risk Level` ~ Age + BMI + BS + `Mental Health`,
                            data = clean_data, method = "rf")
varImp(model_best_predict)
## rf variable importance
## 
##                  Overall
## BMI               100.00
## BS                 78.29
## `Mental Health`1   26.30
## Age                 0.00

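According to the random-forest variable importance (scaled from 0 to 100), a woman’s Body Mass Index and Blood Sugar are the strongest predictors of maternal Risk Level, while Mental Health and Age rank lowest. A scaled importance of 0 marks the lowest-ranked variable rather than one with no predictive value; in the logistic regression above, all four predictors are statistically significant.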