Housing Affordability Trends in London: An Inflation-Adjusted Analysis for the Last Two Decades¶

In [1]:
#Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Load the datasets¶

The datasets contain monthly house prices, yearly borough-level variables, and annual inflation rates¶

In [4]:
#Load datasets
data_path_monthly = r"C:\Users\sheyi\OneDrive\Documents\python coursework\housing_in_london_monthly_variables.csv"
data_path_yearly = r"C:\Users\sheyi\OneDrive\Documents\python coursework\housing_in_london_yearly_variables.csv"
data_path_inflation = r"C:\Users\sheyi\OneDrive\Documents\python coursework\inflationrate.csv"

monthly_data = pd.read_csv(data_path_monthly)  # Load monthly data
yearly_data = pd.read_csv(data_path_yearly)    # Load yearly data
inflation_data = pd.read_csv(data_path_inflation)  # Load inflation data

Explore the datasets and check their data types¶

In [7]:
#Explore Datasets
print("Monthly Data Overview:")
print(monthly_data.info())
print(monthly_data.head())
Monthly Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13549 entries, 0 to 13548
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           13549 non-null  object 
 1   area           13549 non-null  object 
 2   average_price  13549 non-null  int64  
 3   code           13549 non-null  object 
 4   houses_sold    13455 non-null  float64
 5   no_of_crimes   7439 non-null   float64
 6   borough_flag   13549 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 741.1+ KB
None
         date            area  average_price       code  houses_sold  \
0  1995-01-01  city of london          91449  E09000001         17.0   
1  1995-02-01  city of london          82203  E09000001          7.0   
2  1995-03-01  city of london          79121  E09000001         14.0   
3  1995-04-01  city of london          77101  E09000001          7.0   
4  1995-05-01  city of london          84409  E09000001         10.0   

   no_of_crimes  borough_flag  
0           NaN             1  
1           NaN             1  
2           NaN             1  
3           NaN             1  
4           NaN             1  
In [9]:
print("\nYearly Data Overview:")
print(yearly_data.info())
print(yearly_data.head())
Yearly Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1071 entries, 0 to 1070
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   code               1071 non-null   object 
 1   area               1071 non-null   object 
 2   date               1071 non-null   object 
 3   median_salary      1049 non-null   float64
 4   life_satisfaction  352 non-null    float64
 5   mean_salary        1071 non-null   object 
 6   recycling_pct      860 non-null    object 
 7   population_size    1018 non-null   float64
 8   number_of_jobs     931 non-null    float64
 9   area_size          666 non-null    float64
 10  no_of_houses       666 non-null    float64
 11  borough_flag       1071 non-null   int64  
dtypes: float64(6), int64(1), object(5)
memory usage: 100.5+ KB
None
        code                  area        date  median_salary  \
0  E09000001        city of london  1999-12-01        33020.0   
1  E09000002  barking and dagenham  1999-12-01        21480.0   
2  E09000003                barnet  1999-12-01        19568.0   
3  E09000004                bexley  1999-12-01        18621.0   
4  E09000005                 brent  1999-12-01        18532.0   

   life_satisfaction mean_salary recycling_pct  population_size  \
0                NaN       48922             0           6581.0   
1                NaN       23620             3         162444.0   
2                NaN       23128             8         313469.0   
3                NaN       21386            18         217458.0   
4                NaN       20911             6         260317.0   

   number_of_jobs  area_size  no_of_houses  borough_flag  
0             NaN        NaN           NaN             1  
1             NaN        NaN           NaN             1  
2             NaN        NaN           NaN             1  
3             NaN        NaN           NaN             1  
4             NaN        NaN           NaN             1  
In [11]:
print("\nInflation Data Overview:")
print(inflation_data.info())
print(inflation_data.head())
Inflation Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Year    35 non-null     int64  
 1   Rate    35 non-null     float64
dtypes: float64(1), int64(1)
memory usage: 692.0 bytes
None
   Year  Rate
0  1989   5.7
1  1990   8.0
2  1991   7.5
3  1992   4.6
4  1993   2.6

Select the columns to be used for analysis¶

In [14]:
#Select relevant columns for analysis
monthly_relevant = monthly_data[['date', 'area', 'average_price', 'borough_flag']]
yearly_relevant = yearly_data[['date', 'area', 'median_salary', 'borough_flag', 'population_size']]

Rows where borough_flag == 1 identify London boroughs in the datasets

In [17]:
#Filter for boroughs only (borough_flag = 1)
monthly_boroughs = monthly_relevant[monthly_relevant['borough_flag'] == 1].copy()
yearly_boroughs = yearly_relevant[yearly_relevant['borough_flag'] == 1].copy()

The columns in the inflation dataset are renamed for consistency across the datasets, and only the years 1999 to 2019 are kept to match the period covered by the housing data

In [20]:
#Clean Inflation Data
inflation_data = inflation_data.rename(columns={'Year': 'year', 'Rate': 'inflation_rate'})  # rename the columns
inflation_data = inflation_data[(inflation_data['year'] >= 1999) & (inflation_data['year'] <= 2019)]  # keep 1999 to 2019
inflation_data['cumulative_inflation'] = (1 + inflation_data['inflation_rate'] / 100).cumprod()  # cumulative inflation factor
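As a quick check on the cumulative factor: it is simply the running product of (1 + rate/100) over the selected years. A minimal sketch with made-up rates (the figures below are illustrative only, not values from inflationrate.csv):

# Illustrative only: hypothetical inflation rates, not taken from inflationrate.csv
example = pd.DataFrame({'year': [1999, 2000, 2001], 'inflation_rate': [1.3, 0.8, 1.2]})
example['cumulative_inflation'] = (1 + example['inflation_rate'] / 100).cumprod()
print(example)  # factors: 1.0130, then 1.0130 * 1.008 ≈ 1.0211, then ≈ 1.0334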
In [22]:
#Check for missing values
print("\nMissing Values in Monthly Boroughs Data:")
print(monthly_boroughs.isnull().sum())

print("\nMissing Values in Yearly Boroughs Data:")
print(yearly_boroughs.isnull().sum())
Missing Values in Monthly Boroughs Data:
date             0
area             0
average_price    0
borough_flag     0
dtype: int64

Missing Values in Yearly Boroughs Data:
date                0
area                0
median_salary      10
borough_flag        0
population_size    33
dtype: int64
In [24]:
#Convert 'date' column to datetime format in monthly and yearly dataset
monthly_boroughs['date'] = pd.to_datetime(monthly_boroughs['date'], errors='coerce')
yearly_boroughs['date'] = pd.to_datetime(yearly_boroughs['date'], errors='coerce')

To ensure proper datetime conversion and handle invalid date entries, pd.to_datetime() was used with errors='coerce', replacing invalid dates with NaT for clean and reliable analysis.
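A minimal illustration of the errors='coerce' behaviour, using a made-up invalid string:

# Illustrative only: a malformed date becomes NaT instead of raising an error
demo = pd.to_datetime(pd.Series(['1999-01-01', 'not a date']), errors='coerce')
print(demo)  # index 0 parses to 1999-01-01; index 1 becomes NaT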

In [27]:
#Drop rows with invalid dates
monthly_boroughs = monthly_boroughs.dropna(subset=['date']).copy()
yearly_boroughs = yearly_boroughs.dropna(subset=['date']).copy()
In [29]:
#Extract year from the date column
monthly_boroughs['year'] = monthly_boroughs['date'].dt.year
yearly_boroughs['year'] = yearly_boroughs['date'].dt.year
In [31]:
#Filter data between 1999 and 2019 in the monthly and yearly dataset
monthly_boroughs = monthly_boroughs[(monthly_boroughs['year'] >= 1999) & (monthly_boroughs['year'] <= 2019)]
yearly_boroughs = yearly_boroughs[(yearly_boroughs['year'] >= 1999) & (yearly_boroughs['year'] <= 2019)]
In [33]:
#Aggregate monthly average prices into yearly averages for consistency
monthly_aggregated = monthly_boroughs.groupby(['area', 'year'], as_index=False)['average_price'].mean()
In [55]:
#Merge aggregated data with yearly salary and inflation data
yearly_data_filtered = yearly_boroughs[['area', 'year', 'median_salary', 'population_size']]
merged_data = pd.merge(monthly_aggregated, yearly_data_filtered, on=['area', 'year'], how='inner')
merged_data = pd.merge(merged_data, inflation_data[['year', 'cumulative_inflation', 'inflation_rate']], on='year', how='left')
In [78]:
# Handle Missing Values in Median Salary
# Interpolate missing values for median salary

merged_data['median_salary'] = merged_data['median_salary'].interpolate(method='linear', limit_direction='both')

# Handle any remaining missing values (e.g., forward-fill or backward-fill)
merged_data['median_salary'] = merged_data['median_salary'].ffill().bfill()

To handle missing values in median_salary, a two-step approach was used:

  1. Linear Interpolation: Missing values were interpolated linearly across the merged dataset, filling gaps between available data points (a per-borough variant is sketched after this list).
  2. Edge Cases: For boroughs with missing values at the start or end of the dataset, remaining gaps were filled using forward-fill and backward-fill methods. This ensures a complete and consistent dataset for analysis.
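The interpolation above runs over the median_salary column of the merged frame as a whole. If gaps should only ever be filled from the same borough's own history, a per-borough variant could look like the following sketch (an alternative, not the approach used above):

# Alternative sketch: interpolate within each borough separately so that one
# borough's salaries never fill gaps in another borough's series
merged_per_borough = merged_data.sort_values(['area', 'year']).copy()
merged_per_borough['median_salary'] = (
    merged_per_borough.groupby('area')['median_salary']
    .transform(lambda s: s.interpolate(limit_direction='both'))
)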
In [81]:
#Adjust average_price and median_salary for inflation
merged_data['adjusted_price'] = merged_data['average_price'] / merged_data['cumulative_inflation']
merged_data['adjusted_salary'] = merged_data['median_salary'] / merged_data['cumulative_inflation']
merged_data['adjusted_price_to_income_ratio'] = merged_data['adjusted_price'] / merged_data['adjusted_salary']
#Round values for readability
merged_data = merged_data.round(0)

Dividing average_price and median_salary by the cumulative inflation factor expresses both in constant terms relative to the start of the study period, and the ratio of adjusted price to adjusted salary is used as the affordability measure for the rest of the analysis.
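A worked example with hypothetical figures (not values from merged_data):

# Illustrative only: hypothetical nominal price and inflation factor
nominal_price = 300000        # nominal average price in some later year (£)
cumulative_inflation = 1.5    # price level up 50% relative to the base of the index
adjusted_price = nominal_price / cumulative_inflation
print(adjusted_price)         # 200000.0, the price in constant (base-period) pounds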

In [84]:
#Save the merged dataset to a CSV file
output_path = r"C:\Users\sheyi\OneDrive\Documents\python coursework\merged_dataset.csv"
merged_data.to_csv(output_path, index=False)
print(f"Merged dataset saved to {output_path}")
Merged dataset saved to C:\Users\sheyi\OneDrive\Documents\python coursework\merged_dataset.csv

Line Chart of the Annual Inflation Rate from 1999 to 2019¶

In [87]:
# Visualize Inflation Rate Over Time
plt.figure(figsize=(12, 6))
plt.plot(inflation_data['year'], inflation_data['inflation_rate'], marker='o', label='Inflation Rate', color='blue')
plt.xlabel('Year')
plt.ylabel('Inflation Rate (%)')
plt.title('Annual Inflation Rate Over Time')
plt.xticks(range(1999, 2020), rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
[Figure: Annual Inflation Rate Over Time]

Annual Inflation Rate Over Time (1999-2019)¶

This line chart visualizes the annual inflation rate from 1999 to 2019. Key observations include:

  • Fluctuations in Inflation: The inflation rate shows significant variability over the years, with peaks observed in 2008 and 2017.
  • Economic Trends: The dip in inflation around 2015 coincides with the sharp fall in global oil prices and the broader disinflation seen across advanced economies in that period.
  • Context for Analysis: Understanding inflation trends is critical for adjusting housing prices and salaries to reflect real values, ensuring accurate analysis of affordability.

This visualisation underscores the importance of inflation adjustments in evaluating economic changes over time.

Line Chart for Original vs Inflation-Adjusted Prices¶

In [91]:
#Compare Original vs Adjusted Prices for Selected Boroughs
selected_boroughs = ['city of london', 'barking and dagenham', 'westminster', 'barnet', 'croydon', 'tower hamlets', 'hammersmith and fulham']
plt.figure(figsize=(12, 6))
for borough in selected_boroughs:
    borough_data = merged_data[merged_data['area'] == borough]
    plt.plot(borough_data['year'], borough_data['average_price'], label=f'{borough} (Original)')
    plt.plot(borough_data['year'], borough_data['adjusted_price'], linestyle='--', label=f'{borough} (Adjusted)')
plt.xlabel('Year')
plt.ylabel('Price')
plt.title('Original vs Inflation-Adjusted Prices')
plt.legend()
plt.xticks(range(1999, 2020), rotation=45)
plt.tight_layout()
plt.show()
[Figure: Original vs Inflation-Adjusted Prices]

Original vs Inflation-Adjusted Prices for Selected Boroughs (1999-2019)¶

This line chart compares the nominal (original) and inflation-adjusted housing prices for selected London boroughs from 1999 to 2019. Key insights include:

  • Impact of Inflation: Inflation-adjusted prices (dashed lines) are consistently lower than nominal prices (solid lines), reflecting the diminishing purchasing power of money over time.
  • Borough Disparities: Central boroughs like Westminster and the City of London show steep price increases, highlighting their high demand and premium status. In contrast, outer boroughs such as Barking and Dagenham demonstrate more moderate price growth.
  • Realistic Growth Assessment: Adjusted prices present a more accurate picture of affordability by showing less dramatic growth compared to nominal prices.

This visualisation highlights the importance of inflation adjustments in understanding real housing affordability and regional disparities in price trends.

Heatmap of Adjusted Price-to-Income Ratios (1999-2019)¶

In [95]:
#Heatmap of Adjusted Price-to-Income Ratios
plt.figure(figsize=(14, 10))
pivot_data = merged_data.pivot_table(index='area', columns='year', values='adjusted_price_to_income_ratio')
sns.heatmap(pivot_data, cmap='YlOrRd', annot=True, fmt='.1f', cbar_kws={'label': 'Adjusted Price-to-Income Ratio'})
plt.title('Heatmap of Adjusted Price-to-Income Ratios (1999-2019)')
plt.xlabel('Year')
plt.ylabel('London Boroughs')
plt.tight_layout()
# Save the heatmap as an image
plt.savefig('heatmap_price_to_income.png', dpi=300)
plt.show()
[Figure: Heatmap of Adjusted Price-to-Income Ratios (1999-2019)]

Insights from the Heatmap of Adjusted Price-to-Income Ratios (1999-2019)¶

The heatmap provides a clear, borough-level view of housing affordability trends over the years, measured using the Adjusted Price-to-Income Ratio. Key insights include:

1. Least Affordable Boroughs:¶

  • Kensington and Chelsea consistently had the highest adjusted price-to-income ratios throughout the years, with ratios exceeding 40 in the later years (2014-2019). This indicates severe affordability pressures in this borough.

  • Westminster and Hammersmith and Fulham also show consistently high ratios, making them among the least affordable areas in London.


2. More Affordable Boroughs:¶

  • Barking and Dagenham, Croydon, and Tower Hamlets have consistently lower price-to-income ratios, particularly at the beginning of the observed period (1999-2005).
  • These boroughs have remained relatively affordable compared to central boroughs like Westminster.

3. General Trends:¶

  • Steady Increase: Across most boroughs, the adjusted price-to-income ratios increased steadily over time, reflecting worsening affordability from 1999 to 2019.
  • A significant rise is observed after 2009, which could be linked to economic recovery following the financial crisis.
  • 2014-2019: Ratios peak in many boroughs, suggesting accelerated housing price increases relative to adjusted salaries.

4. Patterns Across Boroughs:¶

  • Central London boroughs (e.g., Westminster, Kensington and Chelsea) exhibit the highest ratios, highlighting the affordability gap in prime real estate locations.
  • Outer boroughs (e.g., Barking and Dagenham, Redbridge) remain more affordable but still experience an upward trend over time.

Key Takeaways:¶

The heatmap highlights growing housing affordability issues across London boroughs:

  • Central boroughs are becoming increasingly unaffordable.
  • Even traditionally affordable boroughs show upward trends, indicating affordability challenges throughout Greater London.
  • Policymakers and stakeholders can use this data to identify areas needing intervention or housing development initiatives.

Summary¶

This analysis explores housing affordability trends across London boroughs over the last two decades (1999–2019), using inflation-adjusted price-to-income ratios as a key measure. By leveraging datasets on housing prices, median salaries, and inflation rates, the study provides insights into how affordability has evolved over time. The findings reveal widening disparities between central and outer boroughs, highlight the significant impact of inflation adjustments, and emphasize the growing housing affordability challenges faced by London residents. This research underscores the importance of accounting for inflation to present a more accurate picture of housing market dynamics.