# Load necessary libraries
library(tidyverse)
library(readr)
library(naniar)
library(patchwork)
library(ggplot2)
library(ggcorrplot)
library(caret)
library(dplyr)
library(pROC)
library(randomForest)

# Resolve the dataset location.
# The path can be overridden with the HOTEL_BOOKINGS_CSV environment variable
# so the analysis is not tied to one machine; the original location is kept as
# the default, so existing runs are unaffected.
default_file_path <- "C:/Users/sheyi/OneDrive/Documents/rassignment/hotel_bookings.csv"
file_path <- Sys.getenv("HOTEL_BOOKINGS_CSV", unset = default_file_path)
if (!nzchar(file_path)) {
  # An empty override is treated the same as no override
  file_path <- default_file_path
}

# Fail early with an actionable message instead of a low-level reader error
if (!file.exists(file_path)) {
  stop(
    "Dataset not found at '", file_path,
    "'. Set the HOTEL_BOOKINGS_CSV environment variable to the CSV location.",
    call. = FALSE
  )
}

# Load the dataset without showing the column types message
# NOTE(review): the agent and company columns hold the literal string "NULL";
# read_csv() keeps these as text, so they are not counted as missing values
# later on. Confirm this is intended.
hotel_data <- read_csv(file_path, show_col_types = FALSE)

# Check the structure of the dataset
glimpse(hotel_data)

# Print the number of rows and columns
cat("Dataset Dimensions:", dim(hotel_data), "\n")

Rows: 119,390
Columns: 32
$ hotel                          <chr> "Resort Hotel", "Resort Hotel", "Resort…
$ is_canceled                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
$ lead_time                      <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
$ arrival_date_year              <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 201…
$ arrival_date_month             <chr> "July", "July", "July", "July", "July",…
$ arrival_date_week_number       <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
$ arrival_date_day_of_month      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ stays_in_weekend_nights        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ stays_in_week_nights           <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
$ adults                         <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ children                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ babies                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ meal                           <chr> "BB", "BB", "BB", "BB", "BB", "BB", "BB…
$ country                        <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "GBR…
$ market_segment                 <chr> "Direct", "Direct", "Direct", "Corporat…
$ distribution_channel           <chr> "Direct", "Direct", "Direct", "Corporat…
$ is_repeated_guest              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type             <chr> "C", "C", "A", "A", "A", "A", "C", "C",…
$ assigned_room_type             <chr> "C", "C", "C", "A", "A", "A", "C", "C",…
$ booking_changes                <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ deposit_type                   <chr> "No Deposit", "No Deposit", "No Deposit…
$ agent                          <chr> "NULL", "NULL", "NULL", "304", "240", "…
$ company                        <chr> "NULL", "NULL", "NULL", "NULL", "NULL",…
$ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ customer_type                  <chr> "Transient", "Transient", "Transient", …
$ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
$ required_car_parking_spaces    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_of_special_requests      <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
$ reservation_status             <chr> "Check-Out", "Check-Out", "Check-Out", …
$ reservation_status_date        <date> 2015-07-01, 2015-07-01, 2015-07-02, 20…
Dataset Dimensions: 119390 32

# Count the NA entries in every column: one row per variable
missing_summary <- tibble(
  Variable = names(hotel_data),
  MissingCount = vapply(
    hotel_data,
    function(column) sum(is.na(column)),
    integer(1),
    USE.NAMES = FALSE
  )
)

# Keep only the variables that actually contain missing values
missing_summary_with_na <- filter(missing_summary, MissingCount > 0)

print(missing_summary_with_na)

# Visualise the amount of missing data per variable
gg_miss_var(hotel_data) +
  labs(title = "Missing Values by Variable")

# A tibble: 1 × 2
  Variable MissingCount
  <chr>           <int>
1 children            4

# Fill the gaps in 'children' with the median of the observed values
hotel_data <- hotel_data %>%
  mutate(
    children = replace(
      children,
      is.na(children),
      median(children, na.rm = TRUE)
    )
  )

# Recount missing values per column to confirm nothing is left
missing_summary_after_imputation <- tibble(
  Variable = names(hotel_data),
  MissingCount = vapply(
    hotel_data,
    function(column) sum(is.na(column)),
    integer(1),
    USE.NAMES = FALSE
  )
)

print(missing_summary_after_imputation)

# Visual confirmation that no variable has missing values any more
gg_miss_var(hotel_data) +
  labs(title = "Missing Values by Variable")

# A tibble: 32 × 2
   Variable                  MissingCount
   <chr>                            <int>
 1 hotel                                0
 2 is_canceled                          0
 3 lead_time                            0
 4 arrival_date_year                    0
 5 arrival_date_month                   0
 6 arrival_date_week_number             0
 7 arrival_date_day_of_month            0
 8 stays_in_weekend_nights              0
 9 stays_in_week_nights                 0
10 adults                               0
# ℹ 22 more rows

# Numerical variables whose raw distributions are inspected
numeric_vars <- c("lead_time", "adr", "stays_in_weekend_nights", "stays_in_week_nights")

# Build one histogram per numerical variable.
# The column is looked up through the .data pronoun because the name arrives
# as a string; this replaces the deprecated aes_string().
library(ggplot2)
plots <- lapply(numeric_vars, function(var) {
  ggplot(hotel_data, aes(x = .data[[var]])) +
    geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
    labs(title = paste("Distribution of", var), x = var, y = "Frequency") +
    theme_minimal()
})

# Arrange the histograms in a two-column grid
library(patchwork)
wrap_plots(plots, ncol = 2)

# Bar chart: number of bookings that were kept versus canceled
ggplot(
  data = hotel_data,
  mapping = aes(x = factor(is_canceled), fill = factor(is_canceled))
) +
  geom_bar(alpha = 0.8) +
  scale_fill_manual(
    values = c("steelblue", "orange"),
    labels = c("Not Canceled", "Canceled")
  ) +
  labs(
    title = "Booking Cancellation Rates",
    x = "Booking Status",
    y = "Count",
    fill = "Cancellation"
  ) +
  theme_minimal()

# Bar chart: number of bookings per hotel type
ggplot(data = hotel_data, mapping = aes(x = hotel, fill = hotel)) +
  geom_bar(alpha = 0.8) +
  labs(
    title = "Hotel Types Distribution",
    x = "Hotel Type",
    y = "Count",
    fill = "Hotel Type"
  ) +
  theme_minimal()

# Derive the total length of stay (weekend nights + week nights) for plotting
hotel_data$total_stay <- with(
  hotel_data,
  stays_in_weekend_nights + stays_in_week_nights
)

# Lead time compared between kept and canceled bookings
ggplot(
  data = hotel_data,
  mapping = aes(x = factor(is_canceled), y = lead_time, fill = factor(is_canceled))
) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Lead Time by Cancellation Status",
    x = "Cancellation Status (0 = Not Canceled, 1 = Canceled)",
    y = "Lead Time",
    fill = "Cancellation"
  ) +
  theme_minimal()

# Total stay compared between kept and canceled bookings
ggplot(
  data = hotel_data,
  mapping = aes(x = factor(is_canceled), y = total_stay, fill = factor(is_canceled))
) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Total Stay by Cancellation Status",
    x = "Cancellation Status (0 = Not Canceled, 1 = Canceled)",
    y = "Total Stay (Nights)",
    fill = "Cancellation"
  ) +
  theme_minimal()

# Booking counts per market segment, split by cancellation status
ggplot(
  data = hotel_data,
  mapping = aes(x = market_segment, fill = factor(is_canceled))
) +
  geom_bar(position = "dodge", alpha = 0.7) +
  labs(
    title = "Market Segment by Cancellation Status",
    x = "Market Segment",
    y = "Count",
    fill = "Cancellation"
  ) +
  theme_minimal() +
  # Rotate the segment labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Booking counts per deposit type, split by cancellation status
ggplot(
  data = hotel_data,
  mapping = aes(x = deposit_type, fill = factor(is_canceled))
) +
  geom_bar(position = "dodge", alpha = 0.7) +
  labs(
    title = "Deposit Type by Cancellation Status",
    x = "Deposit Type",
    y = "Count",
    fill = "Cancellation"
  ) +
  theme_minimal()

#' Summarise outliers in one numeric column using the 1.5 * IQR rule
#'
#' Values below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR are counted as
#' outliers. Missing values are ignored for the quartiles and the counts.
#'
#' @param data A data frame (or tibble) containing the column.
#' @param variable Name of the numeric column, as a single string.
#' @return A one-row data frame with columns Variable, Lower_Bound,
#'   Upper_Bound, Num_Outliers, Total_Values and Percentage_Outliers.
summarise_outliers <- function(data, variable) {
  values <- data[[variable]]
  if (is.null(values)) {
    stop("Column '", variable, "' not found in data.", call. = FALSE)
  }
  if (!is.numeric(values)) {
    stop("Column '", variable, "' must be numeric.", call. = FALSE)
  }

  # names = FALSE keeps the "25%"/"75%" labels out of the result; otherwise
  # they end up as row names of the returned data frame ("25%", "25%1", ...)
  quartiles <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  IQR_value <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - 1.5 * IQR_value
  upper_bound <- quartiles[2] + 1.5 * IQR_value

  # Count outliers among the non-missing values
  num_outliers <- sum(values < lower_bound | values > upper_bound, na.rm = TRUE)
  total_values <- sum(!is.na(values))
  percentage_outliers <- (num_outliers / total_values) * 100

  data.frame(
    Variable = variable,
    Lower_Bound = lower_bound,
    Upper_Bound = upper_bound,
    Num_Outliers = num_outliers,
    Total_Values = total_values,
    Percentage_Outliers = percentage_outliers
  )
}

# Columns checked for outliers
numerical_vars <- c("adr", "lead_time", "stays_in_week_nights", "stays_in_weekend_nights")

# Build one summary row per column, then stack the rows into a single table
outliers_summary <- do.call(
  rbind,
  lapply(numerical_vars, summarise_outliers, data = hotel_data)
)

# Show the outlier summary
print(outliers_summary)

# Winsorise one column of a data frame with the 1.5 * IQR rule.
#
# Values below Q1 - 1.5 * IQR are raised to that lower fence and values above
# Q3 + 1.5 * IQR are lowered to that upper fence; everything else (including
# missing values) is left as it is.
#
# data:     data frame (or tibble) holding the column
# variable: name of the column to cap, as a string
# returns:  `data` with the capped column in place of the original
winsorise <- function(data, variable) {
  quartiles <- quantile(data[[variable]], probs = c(0.25, 0.75), na.rm = TRUE)
  spread <- quartiles[2] - quartiles[1]
  floor_value <- quartiles[1] - 1.5 * spread
  ceiling_value <- quartiles[2] + 1.5 * spread

  # Cap at the lower fence first, then at the upper fence
  capped <- data[[variable]]
  capped <- ifelse(capped < floor_value, floor_value, capped)
  capped <- ifelse(capped > ceiling_value, ceiling_value, capped)
  data[[variable]] <- capped

  return(data)
}

# Columns whose extreme values are capped
numerical_vars <- c("adr", "lead_time", "stays_in_week_nights", "stays_in_weekend_nights")

# Winsorise the columns one after another, feeding each result into the next
hotel_data <- Reduce(winsorise, numerical_vars, hotel_data)

# Report completion
print("Winsorisation completed for selected variables.")

# Summary statistics of the capped columns
summary(hotel_data[numerical_vars])

                    Variable Lower_Bound Upper_Bound Num_Outliers Total_Values
25%                      adr     -15.775     211.065         3793       119390
25%1               lead_time    -195.000     373.000         3005       119390
25%2    stays_in_week_nights      -2.000       6.000         3354       119390
25%3 stays_in_weekend_nights      -3.000       5.000          265       119390
     Percentage_Outliers
25%            3.1769830
25%1           2.5169612
25%2           2.8092805
25%3           0.2219616
[1] "Winsorisation completed for selected variables."

      adr           lead_time     stays_in_week_nights stays_in_weekend_nights
 Min.   : -6.38   Min.   :  0.0   Min.   :0.000        Min.   :0.0000         
 1st Qu.: 69.29   1st Qu.: 18.0   1st Qu.:1.000        1st Qu.:0.0000         
 Median : 94.58   Median : 69.0   Median :2.000        Median :1.0000         
 Mean   :100.66   Mean   :102.2   Mean   :2.406        Mean   :0.9227         
 3rd Qu.:126.00   3rd Qu.:160.0   3rd Qu.:3.000        3rd Qu.:2.0000         
 Max.   :211.06   Max.   :373.0   Max.   :6.000        Max.   :5.0000

# Numerical variables plotted after winsorisation
numerical_vars <- c("adr", "lead_time", "stays_in_week_nights", "stays_in_weekend_nights")

# Histograms of the winsorised variables.
# The column is looked up through the .data pronoun, matching the boxplot code
# below and replacing the deprecated aes_string().
plots <- lapply(numerical_vars, function(var) {
  ggplot(hotel_data, aes(x = .data[[var]])) +
    geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
    labs(title = paste(var, "After Winsorisation"), x = var, y = "Frequency") +
    theme_minimal()
})

# Arrange the histograms in a two-column grid
library(patchwork)
wrap_plots(plots, ncol = 2)

# Boxplots of the winsorised variables
boxplots <- lapply(numerical_vars, function(var) {
  ggplot(hotel_data, aes(y = .data[[var]])) +
    geom_boxplot(fill = "orange", alpha = 0.7) +
    labs(title = paste(var, "After Winsorisation"), y = var) +
    theme_minimal()
})

# Arrange the boxplots in a two-column grid
wrap_plots(boxplots, ncol = 2)

# Pairwise correlations between the numerical predictors and the target
cor_matrix <- hotel_data %>%
  select(adr, lead_time, stays_in_week_nights, stays_in_weekend_nights, is_canceled) %>%
  cor()

# Load ggcorrplot for visualisation
library(ggcorrplot)

# Draw the correlation matrix as an annotated heatmap
ggcorrplot(
  cor_matrix,
  lab = TRUE,
  outline.color = "white",
  colors = c("red", "white", "blue")
) +
  labs(title = "Correlation Heatmap")

# Select the target and the predictors used for modelling
selected_features <- hotel_data %>%
  select(is_canceled, lead_time, adr, stays_in_week_nights, stays_in_weekend_nights,
         hotel, market_segment, deposit_type, customer_type)

# Encode categorical variables using one-hot encoding.
# fullRank = TRUE drops one level per variable so the dummies are not
# collinear with the intercept.
dummy_vars <- dummyVars("~ .", data = selected_features, fullRank = TRUE)
encoded_data <- data.frame(predict(dummy_vars, newdata = selected_features))

# Split into training and testing subsets
set.seed(123)  # For reproducibility
train_index <- createDataPartition(encoded_data$is_canceled, p = 0.7, list = FALSE)
train_data <- encoded_data[train_index, ]
test_data <- encoded_data[-train_index, ]

# Drop predictors that are constant within the training rows.
# A dummy column for a very rare level can be all zeros in the training split;
# such a column cannot be estimated and makes the model fit rank-deficient
# (the recorded fit reported market_segmentUndefined as "not defined because
# of singularities", presumably for this reason).
is_constant <- vapply(
  train_data,
  function(column) length(unique(column)) < 2,
  logical(1)
)
if (any(is_constant)) {
  message(
    "Dropping predictors that are constant in the training data: ",
    paste(names(train_data)[is_constant], collapse = ", ")
  )
  train_data <- train_data[, !is_constant, drop = FALSE]
  test_data <- test_data[, !is_constant, drop = FALSE]
}

# Normalise numerical variables (for Logistic Regression).
# Centering and scaling parameters are learned from the training data only and
# then applied unchanged to the test data.
num_vars <- c("lead_time", "adr", "stays_in_week_nights", "stays_in_weekend_nights")
preprocess <- preProcess(train_data[, num_vars], method = c("center", "scale"))
train_data[, num_vars] <- predict(preprocess, train_data[, num_vars])
test_data[, num_vars] <- predict(preprocess, test_data[, num_vars])

# Confirm the structure of training and testing data
cat("Training Data Dimensions:", dim(train_data), "\n")
cat("Testing Data Dimensions:", dim(test_data), "\n")

Training Data Dimensions: 83573 18 
Testing Data Dimensions: 35817 18

# Fit a logistic regression of the cancellation flag on all other columns
logistic_model <- glm(is_canceled ~ ., family = binomial, data = train_data)

# Show coefficients and fit statistics
summary(logistic_model)


# Score the test set: predicted probability of cancellation
test_data$predicted_prob <- predict(
  logistic_model,
  newdata = test_data,
  type = "response"
)

# Turn probabilities into a 0/1 label using a 0.5 cut-off
test_data$predicted_class <- with(test_data, ifelse(predicted_prob > 0.5, 1, 0))

Call:
glm(formula = is_canceled ~ ., family = binomial, data = train_data)

Coefficients: (1 not defined because of singularities)
                              Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -1.478764   0.200560  -7.373 1.67e-13 ***
lead_time                     0.410698   0.010349  39.684  < 2e-16 ***
adr                           0.123881   0.009218  13.440  < 2e-16 ***
stays_in_week_nights          0.076243   0.009578   7.960 1.72e-15 ***
stays_in_weekend_nights       0.064280   0.009239   6.958 3.46e-12 ***
hotelResort.Hotel            -0.271546   0.019315 -14.059  < 2e-16 ***
market_segmentComplementary  -0.066100   0.232504  -0.284  0.77618    
market_segmentCorporate      -0.169554   0.200171  -0.847  0.39697    
market_segmentDirect         -0.490157   0.195961  -2.501  0.01237 *  
market_segmentGroups          0.433559   0.197457   2.196  0.02811 *  
market_segmentOffline.TA.TO  -0.393231   0.195465  -2.012  0.04424 *  
market_segmentOnline.TA       0.510557   0.193860   2.634  0.00845 ** 
market_segmentUndefined             NA         NA      NA       NA    
deposit_typeNon.Refund        5.847317   0.130783  44.710  < 2e-16 ***
deposit_typeRefundable       -0.010815   0.217401  -0.050  0.96033    
customer_typeGroup           -0.220390   0.168583  -1.307  0.19111    
customer_typeTransient        0.552017   0.052911  10.433  < 2e-16 ***
customer_typeTransient.Party  0.042291   0.057109   0.741  0.45898    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 110085  on 83572  degrees of freedom
Residual deviance:  81867  on 83556  degrees of freedom
AIC: 81901

Number of Fisher Scoring iterations: 7

Warning message in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
"prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases"

# Confusion matrix for the logistic regression.
# Both factors get the same explicit levels so the table is always 2 x 2, even
# if the model happens to predict a single class. positive = "1" makes
# sensitivity, specificity and the predictive values describe cancellations;
# by default caret treats the first level ("0") as the positive class.
actual_class <- factor(test_data$is_canceled, levels = c(0, 1))
predicted_class <- factor(test_data$predicted_class, levels = c(0, 1))
conf_matrix <- confusionMatrix(predicted_class, actual_class, positive = "1")
print(conf_matrix)

# AUC-ROC curve.
# Control/case levels and the comparison direction are stated explicitly
# instead of being auto-detected: 0 = control, 1 = case, and higher predicted
# probabilities indicate the case.
roc_curve <- roc(
  test_data$is_canceled,
  test_data$predicted_prob,
  levels = c(0, 1),
  direction = "<"
)
plot(roc_curve, col = "blue", main = "ROC Curve for Logistic Regression")
auc_value <- auc(roc_curve)
cat("AUC:", auc_value, "\n")

Confusion Matrix and Statistics

          Reference
Prediction     0     1
         0 21625  7919
         1   837  5436
                                       
               Accuracy : 0.7555       
                 95% CI : (0.751, 0.76)
    No Information Rate : 0.6271       
    P-Value [Acc > NIR] : < 2.2e-16    
                                       
                  Kappa : 0.4143       
                                       
 Mcnemar's Test P-Value : < 2.2e-16    
                                       
            Sensitivity : 0.9627       
            Specificity : 0.4070       
         Pos Pred Value : 0.7320       
         Neg Pred Value : 0.8666       
             Prevalence : 0.6271       
         Detection Rate : 0.6038       
   Detection Prevalence : 0.8249       
      Balanced Accuracy : 0.6849       
                                       
       'Positive' Class : 0

Setting levels: control = 0, case = 1

Setting direction: controls < cases

AUC: 0.7893547

# Random forest: prepare the data, train the model and evaluate it on a
# held-out test set.

# The classifier needs a factor target
hotel_data$is_canceled <- as.factor(hotel_data$is_canceled)

# Create an 80-20 train-test split
set.seed(123)
train_index <- createDataPartition(hotel_data$is_canceled, p = 0.8, list = FALSE)

# Split data
train_data <- hotel_data[train_index, ]
test_data <- hotel_data[-train_index, ]

# Confirm the structure of train and test data
cat("Training Data Dimensions:", dim(train_data), "\n")
cat("Testing Data Dimensions:", dim(test_data), "\n")

# Convert the categorical predictors to factors.
# The training data defines the levels; the test data is encoded with exactly
# those levels before any prediction, so both sets always share one encoding.
categorical_vars <- c("hotel", "market_segment", "deposit_type", "customer_type")
train_data <- train_data %>%
  mutate(across(all_of(categorical_vars), as.factor))

for (var in categorical_vars) {
  test_data[[var]] <- factor(test_data[[var]], levels = levels(train_data[[var]]))
}

# A category that appears only in the test rows becomes NA under the training
# levels; surface that instead of letting it pass silently
if (anyNA(test_data[categorical_vars])) {
  warning(
    "Test data contains categories not seen in training; they were set to NA.",
    call. = FALSE
  )
}

# Verify transformation
str(train_data)


# Train the model
set.seed(123)
rf_model <- randomForest(
  is_canceled ~ lead_time + adr + stays_in_week_nights + stays_in_weekend_nights +
    hotel + market_segment + deposit_type + customer_type,
  data = train_data,
  ntree = 500,
  mtry = 3,
  importance = TRUE
)

# Print model summary
print(rf_model)

# Verify that the test data uses the training levels
str(test_data)


# Make predictions on the test set
rf_predictions <- predict(rf_model, test_data)

# Confusion matrix.
# positive = "1" makes sensitivity, specificity and the predictive values
# describe cancellations; by default caret treats the first level ("0") as the
# positive class.
rf_conf_matrix <- confusionMatrix(
  rf_predictions,
  test_data$is_canceled,
  positive = "1"
)
# Print the confusion matrix results
print(rf_conf_matrix)

Training Data Dimensions: 95513 33 
Testing Data Dimensions: 23877 33 
tibble [95,513 × 33] (S3: tbl_df/tbl/data.frame)
 $ hotel                         : Factor w/ 2 levels "City Hotel","Resort Hotel": 2 2 2 2 2 2 2 2 2 2 ...
 $ is_canceled                   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 2 2 ...
 $ lead_time                     : num [1:95513] 342 373 7 14 14 0 9 85 75 23 ...
 $ arrival_date_year             : num [1:95513] 2015 2015 2015 2015 2015 ...
 $ arrival_date_month            : chr [1:95513] "July" "July" "July" "July" ...
 $ arrival_date_week_number      : num [1:95513] 27 27 27 27 27 27 27 27 27 27 ...
 $ arrival_date_day_of_month     : num [1:95513] 1 1 1 1 1 1 1 1 1 1 ...
 $ stays_in_weekend_nights       : num [1:95513] 0 0 0 0 0 0 0 0 0 0 ...
 $ stays_in_week_nights          : num [1:95513] 0 0 1 2 2 2 2 3 3 4 ...
 $ adults                        : num [1:95513] 2 2 1 2 2 2 2 2 2 2 ...
 $ children                      : num [1:95513] 0 0 0 0 0 0 0 0 0 0 ...
 $ babies                        : num [1:95513] 0 0 0 0 0 0 0 0 0 0 ...
 $ meal                          : chr [1:95513] "BB" "BB" "BB" "BB" ...
 $ country                       : chr [1:95513] "PRT" "PRT" "GBR" "GBR" ...
 $ market_segment                : Factor w/ 8 levels "Aviation","Complementary",..: 4 4 4 7 7 4 4 7 6 7 ...
 $ distribution_channel          : chr [1:95513] "Direct" "Direct" "Direct" "TA/TO" ...
 $ is_repeated_guest             : num [1:95513] 0 0 0 0 0 0 0 0 0 0 ...
 $ previous_cancellations        : num [1:95513] 0 0 0 0 0 0 0 0 0 0 ...
 $ previous_bookings_not_canceled: num [1:95513] 0 0 0 0 0 0 0 0 0 0 ...
 $ reserved_room_type            : chr [1:95513] "C" "C" "A" "A" ...
 $ assigned_room_type            : chr [1:95513] "C" "C" "C" "A" ...
 $ booking_changes               : num [1:95513] 3 4 0 0 0 0 0 0 0 0 ...
 $ deposit_type                  : Factor w/ 3 levels "No Deposit","Non Refund",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ agent                         : chr [1:95513] "NULL" "NULL" "NULL" "240" ...
 $ company                       : chr [1:95513] "NULL" "NULL" "NULL" "NULL" ...
 $ days_in_waiting_list          : num [1:95513] 0 0 0 0 0 0 0 0 0 0 ...
 $ customer_type                 : Factor w/ 4 levels "Contract","Group",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ adr                           : num [1:95513] 0 0 75 98 98 ...
 $ required_car_parking_spaces   : num [1:95513] 0 0 0 0 0 0 0 0 0 0 ...
 $ total_of_special_requests     : num [1:95513] 0 0 0 1 1 0 1 1 0 0 ...
 $ reservation_status            : chr [1:95513] "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
 $ reservation_status_date       : Date[1:95513], format: "2015-07-01" "2015-07-01" ...
 $ total_stay                    : num [1:95513] 0 0 1 2 2 2 2 3 3 4 ...

Call:
 randomForest(formula = is_canceled ~ lead_time + adr + stays_in_week_nights +      stays_in_weekend_nights + hotel + market_segment + deposit_type +      customer_type, data = train_data, ntree = 500, mtry = 3,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 3

        OOB estimate of  error rate: 19.78%
Confusion matrix:
      0     1 class.error
0 57333  2800  0.04656345
1 16097 19283  0.45497456
tibble [23,877 × 33] (S3: tbl_df/tbl/data.frame)
 $ hotel                         : Factor w/ 2 levels "City Hotel","Resort Hotel": 2 2 2 2 2 2 2 2 2 2 ...
 $ is_canceled                   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
 $ lead_time                     : num [1:23877] 13 37 72 77 118 69 43 1 1 14 ...
 $ arrival_date_year             : num [1:23877] 2015 2015 2015 2015 2015 ...
 $ arrival_date_month            : chr [1:23877] "July" "July" "July" "July" ...
 $ arrival_date_week_number      : num [1:23877] 27 27 27 27 27 27 27 27 27 27 ...
 $ arrival_date_day_of_month     : num [1:23877] 1 1 1 1 1 2 2 2 2 2 ...
 $ stays_in_weekend_nights       : num [1:23877] 0 0 2 2 4 2 1 0 0 0 ...
 $ stays_in_week_nights          : num [1:23877] 1 4 4 5 6 4 3 1 1 2 ...
 $ adults                        : num [1:23877] 1 2 2 2 1 2 3 2 2 2 ...
 $ children                      : num [1:23877] 0 0 0 0 0 0 0 0 2 0 ...
 $ babies                        : num [1:23877] 0 0 0 0 0 0 0 0 0 0 ...
 $ meal                          : chr [1:23877] "BB" "BB" "BB" "BB" ...
 $ country                       : chr [1:23877] "GBR" "PRT" "PRT" "PRT" ...
 $ market_segment                : Factor w/ 8 levels "Aviation","Complementary",..: 3 6 4 7 4 6 7 7 4 7 ...
 $ distribution_channel          : chr [1:23877] "Corporate" "TA/TO" "Direct" "TA/TO" ...
 $ is_repeated_guest             : num [1:23877] 0 0 0 0 0 0 0 0 0 0 ...
 $ previous_cancellations        : num [1:23877] 0 0 0 0 0 0 0 0 0 0 ...
 $ previous_bookings_not_canceled: num [1:23877] 0 0 0 0 0 0 0 0 0 0 ...
 $ reserved_room_type            : chr [1:23877] "A" "E" "A" "A" ...
 $ assigned_room_type            : chr [1:23877] "A" "E" "A" "A" ...
 $ booking_changes               : num [1:23877] 0 0 1 0 2 0 0 0 0 0 ...
 $ deposit_type                  : Factor w/ 3 levels "No Deposit","Non Refund",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ agent                         : chr [1:23877] "304" "8" "250" "240" ...
 $ company                       : chr [1:23877] "NULL" "NULL" "NULL" "NULL" ...
 $ days_in_waiting_list          : num [1:23877] 0 0 0 0 0 0 0 0 0 0 ...
 $ customer_type                 : Factor w/ 4 levels "Contract","Group",..: 3 1 3 3 3 3 3 3 3 3 ...
 $ adr                           : num [1:23877] 75 97.5 84.7 94 62 ...
 $ required_car_parking_spaces   : num [1:23877] 0 0 0 0 0 0 0 1 1 0 ...
 $ total_of_special_requests     : num [1:23877] 0 0 1 0 2 0 0 0 2 1 ...
 $ reservation_status            : chr [1:23877] "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
 $ reservation_status_date       : Date[1:23877], format: "2015-07-02" "2015-07-05" ...
 $ total_stay                    : num [1:23877] 1 4 6 7 14 6 4 1 1 2 ...
Confusion Matrix and Statistics

          Reference
Prediction     0     1
         0 14332  3951
         1   701  4893
                                          
               Accuracy : 0.8052          
                 95% CI : (0.8001, 0.8102)
    No Information Rate : 0.6296          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.5481          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.9534          
            Specificity : 0.5533          
         Pos Pred Value : 0.7839          
         Neg Pred Value : 0.8747          
             Prevalence : 0.6296          
         Detection Rate : 0.6002          
   Detection Prevalence : 0.7657          
      Balanced Accuracy : 0.7533          
                                          
       'Positive' Class : 0

# Predicted probability of cancellation.
# The column is selected by its class label ("1") rather than by position, so
# the result does not depend on the order of the factor levels.
rf_probabilities <- predict(rf_model, test_data, type = "prob")[, "1"]

# Compute the ROC curve.
# Control/case levels and the comparison direction are stated explicitly
# instead of being auto-detected: "0" = control, "1" = case, and higher
# predicted probabilities indicate the case.
rf_roc_curve <- roc(
  test_data$is_canceled,
  rf_probabilities,
  levels = c("0", "1"),
  direction = "<"
)

# Plot ROC Curve
plot(rf_roc_curve, col = "red", main = "ROC Curve - Random Forest", lwd = 2)

# Compute AUC
rf_auc <- auc(rf_roc_curve)

# Add AUC to the plot
legend("bottomright", legend = paste("AUC =", round(rf_auc, 3)), col = "red", lwd = 2)

Setting levels: control = 0, case = 1

Setting direction: controls < cases

# Chart the variable importance measures of the random forest
varImpPlot(rf_model, main = "Feature Importance - Random Forest")

# Keep the importance measures as a data frame, with the feature names
# available as a regular column in addition to the row names
feature_importance <- as.data.frame(importance(rf_model))
feature_importance[["Feature"]] <- rownames(feature_importance)

# Optional (disabled): sort by MeanDecreaseGini and print the ranking
#feature_importance <- feature_importance[order(-feature_importance$MeanDecreaseGini), ]
#print(feature_importance)

# Build a boxplot of one numeric column split by cancellation status.
#
# data:     data frame containing `is_canceled` and the column to plot
# variable: name of the numeric column, as a string
# title:    plot title
# y_label:  label for the y axis
# returns:  a ggplot object
plot_by_cancellation <- function(data, variable, title, y_label) {
  ggplot(
    data,
    aes(x = factor(is_canceled), y = .data[[variable]], fill = factor(is_canceled))
  ) +
    geom_boxplot(alpha = 0.7) +
    labs(
      title = title,
      x = "Cancellation Status (0 = Not Canceled, 1 = Canceled)",
      y = y_label,
      fill = "Cancellation"
    ) +
    theme_minimal()
}

# Boxplot for Lead Time by Cancellation Status
boxplot_lead_time <- plot_by_cancellation(
  hotel_data, "lead_time",
  title = "Lead Time by Cancellation Status",
  y_label = "Lead Time (Days)"
)

# Boxplot for ADR by Cancellation Status
boxplot_adr <- plot_by_cancellation(
  hotel_data, "adr",
  title = "ADR by Cancellation Status",
  y_label = "Average Daily Rate (ADR)"
)

# Combine the two plots side by side
boxplot_lead_time + boxplot_adr

# Boxplot for Stays in Week Nights by Cancellation Status
boxplot_week_nights <- plot_by_cancellation(
  hotel_data, "stays_in_week_nights",
  title = "Stays in Week Nights by Cancellation Status",
  y_label = "Stays in Week Nights"
)

# Boxplot for Stays in Weekend Nights by Cancellation Status
boxplot_weekend_nights <- plot_by_cancellation(
  hotel_data, "stays_in_weekend_nights",
  title = "Stays in Weekend Nights by Cancellation Status",
  y_label = "Stays in Weekend Nights"
)

# Combine the two plots side by side
boxplot_week_nights + boxplot_weekend_nights

	Reference Class 0	Reference Class 1
Predicted 0	14,332	3,951
Predicted 1	701	4,893

What Factors Influence Booking Cancellations, and How Accurately Can We Predict Whether a Booking Will Be Cancelled?

Author: Sheyin Avong

Student Number: G21285146

Key Fields in the Dataset¶

Access the file and check the dataset¶

Checking for Missing Values¶

Findings:¶

Visualisation:¶

Imputing Missing Values¶

Exploring the Distribution of Numerical Variables¶

Observations:¶

Booking Cancellation Rates¶

Observation:¶

Hotel Types Distribution¶

Insights:¶

Lead Time by Cancellation Status¶

Insights:¶

Total Stay by Cancellation Status¶

Insights:¶

Market Segment by Cancellation Status¶

Insights:¶

Deposit Type by Cancellation Status¶

Insights:¶

Outlier Detection and Winsorisation¶

Outlier Detection¶

Winsorisation¶

Impact of Winsorisation¶

Histograms of Variables After Winsorisation¶

Insights:¶

Boxplots of Variables After Winsorisation¶

Insights:¶

Correlation Heatmap After Winsorisation¶

Insights:¶

Feature Selection and Data Preprocessing¶

Feature Selection:¶

Encoding Categorical Variables:¶

Train-Test Split:¶

Normalisation:¶

Importance of Preprocessing:¶

Logistic Regression Model Summary¶

Model Fit and Performance¶

Logistic Regression Model Evaluation¶

Key Metrics:¶

Observations:¶

Note on Warnings:¶

Random Forest Model Evaluation¶

Training and Testing Data¶

Model Summary¶

Confusion Matrix on Testing Data:¶

Key Metrics:¶

Observations:¶

Insights:¶

ROC Curve for Random Forest Model¶

Key Observations:¶

Insights:¶

Feature Importance - Random Forest Model¶

Key Findings:¶

Insights:¶

Boxplots of Key Features by Cancellation Status¶

Insights:¶

Stays in Week Nights and Weekend Nights by Cancellation Status¶

Observations:¶

Insights:¶

Business Insights & Conclusion¶

Summary¶

Key Influencing Factors:¶

Predictive Modelling:¶

Conclusion:¶