Utilizing Social Vulnerability Factors to Predict Health Outcomes¶

Fall 2025 Data Science Project

Bhavika Buddi, Srinidhi Gubba, Avishi Gupta, Betty Au, Manasa Vinod Kumar

Contributions¶

  • Project Idea
    • Everyone - Various ideas were proposed until one was decided on; the topic was then reworked when we decided to pivot. Both stages involved extensive group discussion.
  • Dataset Curation and Preprocessing
    • Bhavika and Srinidhi each identified credible data sources, both before and after the team's pivot. Bhavika focused on merging datasets while Srinidhi focused on filtration.
  • Dataset Exploration and Summary Statistics
    • Everyone - Avishi, Betty, and Manasa were involved in the data exploration for the second checkpoint, while Bhavika and Srinidhi contributed after the pivot.
  • ML Algorithm Design/Development
    • Avishi and Manasa - Avishi worked on the linear regression, random forest regressor, and k-means clustering. The remaining models were done by Manasa.
  • ML Algorithm Training and Test Data Analysis
    • Avishi and Manasa - Avishi worked on the linear regression, random forest regressor, and k-means clustering. The remaining models were done by Manasa.
  • Visualizations, Result Analysis, Conclusion
    • Everyone - Srinidhi and Bhavika focused on visualizations while Avishi and Manasa focused on result analysis and Betty focused on fine-tuning our conclusion.
  • Final Tutorial Creation
    • Betty Au - wrote out explanations of the pipeline and justified why decisions for various steps were made.

Introduction¶

This project explores the factors that contribute to the health of Maryland counties. It first examines whether the balance of small versus large businesses in a county correlates with health outcomes. However, the available data suggests a lack of any relationship, which makes it difficult to develop good machine learning models, since those rely on relationships and patterns. The focus therefore pivots to the social vulnerability index (SVI) and its relationship with health outcomes. SVI summarizes a set of demographic and socioeconomic factors that adversely affect communities. The pivot to SVI exemplifies a natural part of the data science pipeline, in which barriers encountered along the way require taking a step back.

A key step toward creating healthier communities is first understanding what affects community health. Data science is well suited to this task: it can efficiently break down a large table of numbers, process a myriad of factors, and identify which are most significant in affecting health.

In this project, we go through the data science pipeline: data curation to clean the dataset until it is understandable and contains only what is relevant; exploratory data analysis as a starting point for deeper insights; primary analysis to explore the data with more complexity; and visualization to help articulate what the numbers mean.

Imports¶

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from scipy.stats import pearsonr
from scipy.stats import f_oneway
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster.hierarchy import fcluster

from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsRegressor

Data Curation¶

The first dataset we will look at contains data on the health of each county in Maryland. This data is provided by County Health Rankings & Roadmaps, a program of the University of Wisconsin Population Health Institute. We first load in the dataset and observe what information is given.

In [ ]:
#Load health data
df_health = pd.read_csv("sample_data/health.csv", header=1)
print(df_health.shape)
df_health
(25, 249)
Out[ ]:
FIPS State County Unreliable Deaths Years of Potential Life Lost Rate 95% CI - Low 95% CI - High Z-Score YPLL Rate (AIAN) ... % Drive Alone (Hispanic) 95% CI - Low % Drive Alone (Hispanic) 95% CI - High % Drive Alone (white) % Drive Alone (white) 95% CI - Low % Drive Alone (white) 95% CI - High # Workers who Drive Alone % Long Commute - Drives Alone 95% CI - Low.19 95% CI - High.19 Z-Score.34
0 24000 Maryland NaN NaN 75564 7547 7467 7627 NaN 4720.0 ... 64.0 67.0 78.0 78.0 79.0 3047112 50 50 51 NaN
1 24001 Maryland Allegany NaN 1199 9538 8662 10414 0.74 NaN ... NaN NaN 86.0 84.0 87.0 26999 22 19 24 -1.90
2 24003 Maryland Anne Arundel NaN 6579 6937 6693 7180 -0.45 NaN ... 68.0 77.0 80.0 80.0 81.0 302608 48 46 49 0.24
3 24005 Maryland Baltimore NaN 11566 8371 8141 8601 0.20 NaN ... 64.0 73.0 82.0 81.0 82.0 412698 47 46 48 0.16
4 24009 Maryland Calvert NaN 1038 6531 5911 7151 -0.64 NaN ... 59.0 77.0 77.0 75.0 79.0 47266 62 59 66 1.45
5 24011 Maryland Caroline NaN 519 9003 7820 10187 0.49 NaN ... NaN NaN 82.0 78.0 86.0 15783 52 47 56 0.55
6 24013 Maryland Carroll NaN 2047 6604 6148 7060 -0.61 NaN ... 53.0 70.0 78.0 76.0 80.0 87217 58 56 60 1.08
7 24015 Maryland Cecil NaN 1772 10405 9673 11138 1.14 NaN ... 46.0 88.0 80.0 77.0 82.0 49939 49 45 53 0.35
8 24017 Maryland Charles NaN 1965 7786 7284 8288 -0.06 NaN ... 66.0 82.0 82.0 80.0 84.0 82067 68 65 71 1.88
9 24019 Maryland Dorchester NaN 575 10398 8973 11822 1.13 NaN ... 32.0 62.0 81.0 78.0 85.0 14310 42 37 47 -0.22
10 24021 Maryland Frederick NaN 2568 5885 5543 6227 -0.94 NaN ... 63.0 71.0 78.0 76.0 80.0 134016 50 48 51 0.40
11 24023 Maryland Garrett NaN 446 7521 6282 8760 -0.19 NaN ... NaN NaN NaN NaN NaN 13668 31 26 35 -1.16
12 24025 Maryland Harford NaN 3103 7005 6619 7391 -0.42 NaN ... 62.0 75.0 80.0 79.0 82.0 130855 53 51 55 0.66
13 24027 Maryland Howard NaN 2256 4342 4065 4618 -1.64 NaN ... 66.0 76.0 81.0 80.0 83.0 169456 48 46 50 0.26
14 24029 Maryland Kent NaN 308 7810 6273 9347 -0.05 NaN ... 27.0 76.0 78.0 73.0 83.0 9258 38 32 44 -0.54
15 24031 Maryland Montgomery NaN 7369 4304 4155 4453 -1.66 NaN ... 61.0 65.0 69.0 68.0 70.0 549986 54 52 55 0.72
16 24033 Maryland Prince George's NaN 11369 7484 7285 7684 -0.20 NaN ... 62.0 67.0 80.0 79.0 81.0 475260 61 59 62 1.31
17 24035 Maryland Queen Anne's NaN 583 6384 5509 7260 -0.71 NaN ... 33.0 72.0 74.0 71.0 76.0 25283 54 51 58 0.79
18 24037 Maryland St. Mary's NaN 1348 6954 6417 7491 -0.45 NaN ... 61.0 86.0 82.0 80.0 84.0 56674 40 37 43 -0.36
19 24039 Maryland Somerset NaN 444 8834 7547 10121 0.42 NaN ... NaN NaN 85.0 79.0 90.0 9200 34 28 41 -0.86
20 24041 Maryland Talbot NaN 504 8256 6975 9537 0.15 NaN ... 54.0 88.0 79.0 76.0 82.0 17089 30 26 34 -1.21
21 24043 Maryland Washington NaN 2384 9490 8919 10061 0.72 NaN ... 64.0 89.0 83.0 82.0 84.0 67878 39 36 41 -0.51
22 24045 Maryland Wicomico NaN 1518 8924 8250 9597 0.46 NaN ... 58.0 81.0 84.0 82.0 87.0 48898 25 23 28 -1.60
23 24047 Maryland Worcester NaN 782 6632 5796 7469 -0.59 NaN ... 61.0 71.0 82.0 79.0 85.0 23732 29 26 32 -1.30
24 24510 Maryland Baltimore City NaN 13322 14845 14497 15193 3.17 NaN ... 55.0 63.0 76.0 75.0 77.0 276972 43 41 44 -0.18

25 rows × 249 columns

In [ ]:
print(list(df_health.columns))
['FIPS', 'State', 'County', 'Unreliable', 'Deaths', 'Years of Potential Life Lost Rate', '95% CI - Low', '95% CI - High', 'Z-Score', 'YPLL Rate (AIAN)', 'YPLL Rate (AIAN) 95% CI - Low', 'YPLL Rate (AIAN) 95% CI - High', 'YPLL Rate (AIAN) Unreliable', 'YPLL Rate (Asian)', 'YPLL Rate (Asian) 95% CI - Low', 'YPLL Rate (Asian) 95% CI - High', 'YPLL Rate (Asian) Unreliable', 'YPLL Rate (Black)', 'YPLL Rate (Black) 95% CI - Low', 'YPLL Rate (Black) 95% CI - High', 'YPLL Rate (Black) Unreliable', 'YPLL Rate (Hispanic)', 'YPLL Rate (Hispanic) 95% CI - Low', 'YPLL Rate (Hispanic) 95% CI - High', 'YPLL Rate (Hispanic) Unreliable', 'YPLL Rate (white)', 'YPLL Rate (white) 95% CI - Low', 'YPLL Rate (white) 95% CI - High', 'YPLL Rate (white) Unreliable', '% Fair or Poor Health', '95% CI - Low.1', '95% CI - High.1', 'Z-Score.1', 'Average Number of Physically Unhealthy Days', '95% CI - Low.2', '95% CI - High.2', 'Z-Score.2', 'Average Number of Mentally Unhealthy Days', '95% CI - Low.3', '95% CI - High.3', 'Z-Score.3', 'Unreliable.1', '% Low birthweight', '95% CI - Low.4', '95% CI - High.4', 'Z-Score.4', '% LBW (AIAN)', '% LBW (AIAN) 95% CI - Low', '% LBW (AIAN) 95% CI - High', '% LBW (Asian)', '% LBW (Asian) 95% CI - Low', '% LBW (Asian) 95% CI - High', '% LBW (Black)', '% LBW (Black) 95% CI - Low', '% LBW (Black) 95% CI - High', '% LBW (Hispanic)', '% LBW (Hispanic) 95% CI - Low', '% LBW (Hispanic) 95% CI - High', '% LBW (white)', '% LBW (white) 95% CI - Low', '% LBW (white) 95% CI - High', '% Smokers', '95% CI - Low.5', '95% CI - High.5', 'Z-Score.5', '% Adults with Obesity', '95% CI - Low.6', '95% CI - High.6', 'Z-Score.6', 'Food Environment Index', 'Z-Score.7', '% Physically Inactive', '95% CI - Low.7', '95% CI - High.7', 'Z-Score.8', '% With Access to Exercise Opportunities', 'Z-Score.9', '% Excessive Drinking', '95% CI - Low.8', '95% CI - High.8', 'Z-Score.10', '# Alcohol-Impaired Driving Deaths', '# Driving Deaths', '% Driving Deaths with Alcohol Involvement', '95% CI - 
Low.9', '95% CI - High.9', 'Z-Score.11', '# Chlamydia Cases', 'Chlamydia Rate', 'Z-Score.12', 'Teen Birth Rate', '95% CI - Low.10', '95% CI - High.10', 'Z-Score.13', 'Teen Birth Rate (AIAN)', 'Teen Birth Rate (AIAN) 95% CI - Low', 'Teen Birth Rate (AIAN) 95% CI - High', 'Teen Birth Rate (Asian)', 'Teen Birth Rate (Asian) 95% CI - Low', 'Teen Birth Rate (Asian) 95% CI - High', 'Teen Birth Rate (Black)', 'Teen Birth Rate (Black) 95% CI - Low', 'Teen Birth Rate (Black) 95% CI - High', 'Teen Birth Rate (Hispanic)', 'Teen Birth Rate (Hispanic) 95% CI - Low', 'Teen Birth Rate (Hispanic) 95% CI - High', 'Teen Birth Rate (white)', 'Teen Birth Rate (white) 95% CI - Low', 'Teen Birth Rate (white) 95% CI - High', '# Uninsured', '% Uninsured', '95% CI - Low.11', '95% CI - High.11', 'Z-Score.14', '# Primary Care Physicians', 'Primary Care Physicians Rate', 'Primary Care Physicians Ratio', 'Z-Score.15', '# Dentists', 'Dentist Rate', 'Dentist Ratio', 'Z-Score.16', '# Mental Health Providers', 'Mental Health Provider Rate', 'Mental Health Provider Ratio', 'Z-Score.17', 'Preventable Hospitalization Rate', 'Z-Score.18', 'Preventable Hosp. Rate (AIAN)', 'Preventable Hosp. Rate (Asian)', 'Preventable Hosp. Rate (Black)', 'Preventable Hosp. Rate (Hispanic)', 'Preventable Hosp. 
Rate (white)', '% With Annual Mammogram', 'Z-Score.19', '% Screened (AIAN)', '% Screened (Asian)', '% Screened (Black)', '% Screened (Hispanic)', '% Screened (white)', '% Vaccinated', 'Z-Score.20', '% Vaccinated (AIAN)', '% Vaccinated (Asian)', '% Vaccinated (Black)', '% Vaccinated (Hispanic)', '% Vaccinated (white)', '# Completed High School', 'Population', '% Completed High School', '95% CI - Low.12', '95% CI - High.12', 'Z-Score.21', '# Some College', 'Population.1', '% Some College', '95% CI - Low.13', '95% CI - High.13', 'Z-Score.22', '# Unemployed', 'Labor Force', '% Unemployed', 'Z-Score.23', '% Children in Poverty', '95% CI - Low.14', '95% CI - High.14', 'Z-Score.24', '% Children in Poverty (AIAN)', '% Children in Poverty (Asian)', '% Children in Poverty (Black)', '% Children in Poverty (Hispanic)', '% Children in Poverty (white)', '80th Percentile Income', '20th Percentile Income', 'Income Ratio', 'Z-Score.25', '# Children in Single-Parent Households', '# Children in Households', '% Children in Single-Parent Households', '95% CI - Low.15', '95% CI - High.15', 'Z-Score.26', '# Associations', 'Social Association Rate', 'Z-Score.27', 'Annual Average Violent Crimes', 'Violent Crime Rate', 'Z-Score.28', '# Injury Deaths', 'Injury Death Rate', '95% CI - Low.16', '95% CI - High.16', 'Z-Score.29', 'Injury Death Rate (AIAN)', 'Injury Death Rate (AIAN) 95% CI - Low', 'Injury Death Rate (AIAN) 95% CI - High', 'Injury Death Rate (Asian)', 'Injury Death Rate (Asian) 95% CI - Low', 'Injury Death Rate (Asian) 95% CI - High', 'Injury Death Rate (Black)', 'Injury Death Rate (Black) 95% CI - Low', 'Injury Death Rate (Black) 95% CI - High', 'Injury Death Rate (Hispanic)', 'Injury Death Rate (Hispanic) 95% CI - Low', 'Injury Death Rate (Hispanic) 95% CI - High', 'Injury Death Rate (white)', 'Injury Death Rate (white) 95% CI - Low', 'Injury Death Rate (white) 95% CI - High', 'Average Daily PM2.5', 'Z-Score.30', 'Presence of Water Violation', 'Z-Score.31', '% Severe Housing 
Problems', '95% CI - Low.17', '95% CI - High.17', 'Severe Housing Cost Burden', 'Severe Housing Cost Burden 95% CI - Low', 'Severe Housing Cost Burden 95% CI - High', 'Overcrowding', 'Overcrowding 95% CI - Low', 'Overcrowding 95% CI - High', 'Inadequate Facilities', 'Inadequate Facilities 95% CI - Low', 'Inadequate Facilities 95% CI - High', 'Z-Score.32', '% Drive Alone to Work', '95% CI - Low.18', '95% CI - High.18', 'Z-Score.33', '% Drive Alone (AIAN)', '% Drive Alone (AIAN) 95% CI - Low', '% Drive Alone (AIAN) 95% CI - High', '% Drive Alone (Asian)', '% Drive Alone (Asian) 95% CI - Low', '% Drive Alone (Asian) 95% CI - High', '% Drive Alone (Black)', '% Drive Alone (Black) 95% CI - Low', '% Drive Alone (Black) 95% CI - High', '% Drive Alone (Hispanic)', '% Drive Alone (Hispanic) 95% CI - Low', '% Drive Alone (Hispanic) 95% CI - High', '% Drive Alone (white)', '% Drive Alone (white) 95% CI - Low', '% Drive Alone (white) 95% CI - High', '# Workers who Drive Alone', '% Long Commute - Drives Alone', '95% CI - Low.19', '95% CI - High.19', 'Z-Score.34']

We see that data is given for all 23 counties in Maryland, plus Baltimore City (an independent city) and the state overall. The state-level row is dropped so that we can focus on county-level data; Baltimore City is kept and treated like a county throughout. Since all of these areas are in Maryland, the "State" column is removed, as is the "FIPS" column, which is irrelevant here. Many health-related features are provided, but from this dataset we only intend to use a single measurement of health. We isolate "% Fair or Poor Health", "Average Number of Physically Unhealthy Days", and "Average Number of Mentally Unhealthy Days" as candidates for that single measure. "% Fair or Poor Health" is the percentage of adults who report fair or poor health, while the other two are the average numbers of days per month that adults report being physically or mentally unhealthy.

In [ ]:
# cleaning the health data, keeping only the columns that we are using to determine health outcomes
cols_to_keep = ['County', '% Fair or Poor Health', 'Average Number of Physically Unhealthy Days', 'Average Number of Mentally Unhealthy Days']
df_filtered_health = df_health[cols_to_keep]
df_filtered_health = df_filtered_health.drop(df_filtered_health.index[0])
df_filtered_health
Out[ ]:
County % Fair or Poor Health Average Number of Physically Unhealthy Days Average Number of Mentally Unhealthy Days
1 Allegany 19 4.2 5.0
2 Anne Arundel 14 3.2 4.1
3 Baltimore 16 3.4 4.3
4 Calvert 14 3.3 4.3
5 Caroline 20 4.3 5.1
6 Carroll 13 3.2 4.3
7 Cecil 16 3.8 4.8
8 Charles 17 3.4 4.3
9 Dorchester 19 4.2 4.9
10 Frederick 14 3.2 4.1
11 Garrett 18 4.0 5.0
12 Harford 14 3.3 4.3
13 Howard 11 2.7 3.5
14 Kent 16 3.7 4.6
15 Montgomery 13 2.9 3.6
16 Prince George's 17 3.5 4.1
17 Queen Anne's 13 3.2 4.3
18 St. Mary's 15 3.5 4.4
19 Somerset 25 4.8 5.2
20 Talbot 14 3.4 4.3
21 Washington 18 4.0 4.8
22 Wicomico 20 4.1 4.9
23 Worcester 16 3.6 4.6
24 Baltimore City 22 4.2 4.8

Now we take a look at the businesses in Maryland. The dataset, "Statistics of U.S. Businesses", is provided by the United States Census Bureau, dated 2022, and includes data on businesses throughout the USA. First we filter to Maryland businesses, and later we remove the statewide rows. We also isolate the features relevant to a business' size: "County Name", "NAICS Description", "Enterprise Size", and "Establishments".

In [ ]:
#load in the business data, filtering for only MD counties
df_county = pd.read_csv("sample_data/county_3digitnaics_2022.csv", header=2)
df_county
df_county_MD = df_county[df_county['State Name'].str.contains('Maryland', case=False, na=False)]
print(df_county_MD.shape)
(6099, 15)
In [ ]:
df_county = pd.read_csv("sample_data/county_3digitnaics_2022.csv", header=2)
# print(df_county.shape) # Keep this commented out for now

# Inspect the DataFrame to understand the column names and data
print("df_county head:")
display(df_county.head())
print("Unique values in 'State Name' column:")
print(df_county['State Name'].unique())

df_county_MD = df_county[df_county['State Name'].str.contains('Maryland', case=False, na=False)]
print(df_county_MD.shape)
df_county head:
State State Name County County Name NAICS NAICS Description Enterprise Size Firms Establishments Employment Employment Noise Flag Annual Payroll\n($1,000) Annual Payroll Noise Flag Receipts\n($1,000) Receipts\nNoise Flag
0 1 Alabama 1 Autauga -- Total 1: Total 874 948 12,409 G 496,158 G 3,422,297 G
1 1 Alabama 1 Autauga -- Total 2: <20 employees 578 579 2,591 G 90,641 G 562,413 G
2 1 Alabama 1 Autauga -- Total 3: 20-99 employees 91 101 2,517 G 98,457 G 530,120 G
3 1 Alabama 1 Autauga -- Total 4: 100-499 employees 38 51 1,406 H 51,404 H 205,532 H
4 1 Alabama 1 Autauga -- Total 5: 500+ employees 167 217 5,895 G 255,656 G 2,124,232 G
Unique values in 'State Name' column:
[' Alabama' 'Alabama' ' Alaska' 'Alaska' ' Arizona' 'Arizona' ' Arkansas'
 'Arkansas' ' California' 'California' ' Colorado' 'Colorado'
 ' Connecticut' 'Connecticut' ' Delaware' 'Delaware'
 ' District of Columbia' ' Florida' 'Florida' ' Georgia' 'Georgia'
 ' Hawaii' 'Hawaii' ' Idaho' 'Idaho' ' Illinois' 'Illinois' ' Indiana'
 'Indiana' ' Iowa' 'Iowa' ' Kansas' 'Kansas' ' Kentucky' 'Kentucky'
 ' Louisiana' 'Louisiana' ' Maine' 'Maine' ' Maryland' 'Maryland'
 ' Massachusetts' 'Massachusetts' ' Michigan' 'Michigan' ' Minnesota'
 'Minnesota' ' Mississippi' 'Mississippi' ' Missouri' 'Missouri'
 ' Montana' 'Montana' ' Nebraska' 'Nebraska' ' Nevada' 'Nevada'
 ' New Hampshire' 'New Hampshire' ' New Jersey' 'New Jersey' ' New Mexico'
 'New Mexico' ' New York' 'New York' ' North Carolina' 'North Carolina'
 ' North Dakota' 'North Dakota' ' Ohio' 'Ohio' ' Oklahoma' 'Oklahoma'
 ' Oregon' 'Oregon' ' Pennsylvania' 'Pennsylvania' ' Rhode Island'
 'Rhode Island' ' South Carolina' 'South Carolina' ' South Dakota'
 'South Dakota' ' Tennessee' 'Tennessee' ' Texas' 'Texas' ' Utah' 'Utah'
 ' Vermont' 'Vermont' ' Virginia' 'Virginia' ' Washington' 'Washington'
 ' West Virginia' 'West Virginia' ' Wisconsin' 'Wisconsin' ' Wyoming'
 'Wyoming']
(6099, 15)
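As an aside, the unique values above show both padded (' Maryland') and unpadded ('Maryland') state names, which is why a substring match with `str.contains` is used rather than an exact comparison. A minimal sketch (on toy data, not the real CSV) of an alternative approach, stripping the whitespace so an exact equality filter works:

```python
import pandas as pd

# Toy example: normalize padded state names before filtering.
toy = pd.DataFrame({'State Name': [' Maryland', 'Maryland', ' Virginia']})
toy['State Name'] = toy['State Name'].str.strip()

# With whitespace stripped, an exact comparison catches both spellings.
toy_md = toy[toy['State Name'] == 'Maryland']
print(toy_md.shape[0])
```

Either approach works here; stripping up front has the advantage that later exact joins and group-bys on the column behave predictably.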
In [ ]:
#Clean the data, keeping only the columns that are needed for analysis
columns_to_keep = ['County Name', 'NAICS Description', 'Enterprise Size', 'Establishments']
df_filtered = df_county_MD[df_county_MD['NAICS Description'].str.contains('Total', case=False, na=False)][columns_to_keep]
display(df_filtered)
County Name NAICS Description Enterprise Size Establishments
188081 Allegany Total 1: Total 1,468
188082 Allegany Total 2: <20 employees 931
188083 Allegany Total 3: 20-99 employees 137
188084 Allegany Total 4: 100-499 employees 106
188085 Allegany Total 5: 500+ employees 294
... ... ... ... ...
194096 Statewide Maryland Total 1: Total 855
194097 Statewide Maryland Total 2: <20 employees 6
194098 Statewide Maryland Total 3: 20-99 employees 10
194099 Statewide Maryland Total 4: 100-499 employees 64
194100 Statewide Maryland Total 5: 500+ employees 775

125 rows × 4 columns

In [ ]:
#remove unneccessary rows (don't need statewide totals, for example)
df_filtered = df_filtered[~df_filtered['Enterprise Size'].str.contains('Total', case=False, na=False)]
df_filtered = df_filtered[~df_filtered['County Name'].str.contains('Statewide Maryland', case=False, na=False)]
df_filtered
Out[ ]:
County Name NAICS Description Enterprise Size Establishments
188082 Allegany Total 2: <20 employees 931
188083 Allegany Total 3: 20-99 employees 137
188084 Allegany Total 4: 100-499 employees 106
188085 Allegany Total 5: 500+ employees 294
188296 Anne Arundel Total 2: <20 employees 9,698
... ... ... ... ...
193519 Worcester Total 5: 500+ employees 243
193725 Baltimore city Total 2: <20 employees 8,190
193726 Baltimore city Total 3: 20-99 employees 1,260
193727 Baltimore city Total 4: 100-499 employees 658
193728 Baltimore city Total 5: 500+ employees 2,257

96 rows × 4 columns

"Enterprise size" is used to interpret whether a business is a small or large business. Those with a size smaller than 100 is classified as a small businesses while those with a size 100 or larger are considered large businesses. A column is added to carry this label, and helps to add meaning to otherwise confusing numbers.

In [ ]:
#classify the size as small or large business
def identify_size(size):
    size = size.lower()
    if '<20' in size or '20-99' in size:
        return 'Small'
    elif '100-499' in size or '500+' in size:
        return 'Large'
    else:
        return None

df_filtered['Business Size'] = df_filtered['Enterprise Size'].apply(identify_size)
df_filtered
Out[ ]:
County Name NAICS Description Enterprise Size Establishments Business Size
188082 Allegany Total 2: <20 employees 931 Small
188083 Allegany Total 3: 20-99 employees 137 Small
188084 Allegany Total 4: 100-499 employees 106 Large
188085 Allegany Total 5: 500+ employees 294 Large
188296 Anne Arundel Total 2: <20 employees 9,698 Small
... ... ... ... ... ...
193519 Worcester Total 5: 500+ employees 243 Large
193725 Baltimore city Total 2: <20 employees 8,190 Small
193726 Baltimore city Total 3: 20-99 employees 1,260 Small
193727 Baltimore city Total 4: 100-499 employees 658 Large
193728 Baltimore city Total 5: 500+ employees 2,257 Large

96 rows × 5 columns

Right now, the dataset contains one row per business-size bracket, but we want per-county data. Using the number of "Establishments" in each row, we total the number of small businesses and the number of large businesses for each county. This gives a dataset where each county has two rows, one for large businesses and one for small businesses. The percentage of small (or large) businesses can then be calculated and added as another column.

In [ ]:
#remove , to convert to numeric type
df_filtered['Establishments'] = df_filtered['Establishments'].str.replace(',', '', regex=False).astype(int)
In [ ]:
#summarize establishments by size for each county
df_summary = (
    df_filtered
    .groupby(['County Name', 'Business Size'], as_index=False)['Establishments']
    .sum()
)

df_summary
Out[ ]:
County Name Business Size Establishments
0 Allegany Large 400
1 Allegany Small 1068
2 Anne Arundel Large 3401
3 Anne Arundel Small 11244
4 Baltimore Large 4554
5 Baltimore Small 15812
6 Baltimore city Large 2915
7 Baltimore city Small 9450
8 Calvert Large 291
9 Calvert Small 1442
10 Caroline Large 107
11 Caroline Small 510
12 Carroll Large 662
13 Carroll Small 3602
14 Cecil Large 345
15 Cecil Small 1405
16 Charles Large 621
17 Charles Small 2135
18 Dorchester Large 127
19 Dorchester Small 539
20 Frederick Large 1371
21 Frederick Small 5097
22 Garrett Large 163
23 Garrett Small 779
24 Harford Large 1198
25 Harford Small 4416
26 Howard Large 2154
27 Howard Small 7712
28 Kent Large 106
29 Kent Small 487
30 Montgomery Large 5868
31 Montgomery Small 21899
32 Prince George's Large 3747
33 Prince George's Small 12497
34 Queen Anne's Large 206
35 Queen Anne's Small 1231
36 Somerset Large 67
37 Somerset Small 288
38 St. Mary's Large 538
39 St. Mary's Small 1541
40 Talbot Large 277
41 Talbot Small 1201
42 Washington Large 993
43 Washington Small 2439
44 Wicomico Large 701
45 Wicomico Small 1786
46 Worcester Large 354
47 Worcester Small 1880
In [ ]:
#calculate the percentages of small vs large businesses for each county
df_summary['Total per County'] = df_summary.groupby('County Name')['Establishments'].transform('sum')
df_summary['Percentage'] = (df_summary['Establishments'] / df_summary['Total per County']) * 100
df_summary['Percentage'] = df_summary['Percentage'].round()
df_summary
Out[ ]:
County Name Business Size Establishments Total per County Percentage
0 Allegany Large 400 1468 27.0
1 Allegany Small 1068 1468 73.0
2 Anne Arundel Large 3401 14645 23.0
3 Anne Arundel Small 11244 14645 77.0
4 Baltimore Large 4554 20366 22.0
5 Baltimore Small 15812 20366 78.0
6 Baltimore city Large 2915 12365 24.0
7 Baltimore city Small 9450 12365 76.0
8 Calvert Large 291 1733 17.0
9 Calvert Small 1442 1733 83.0
10 Caroline Large 107 617 17.0
11 Caroline Small 510 617 83.0
12 Carroll Large 662 4264 16.0
13 Carroll Small 3602 4264 84.0
14 Cecil Large 345 1750 20.0
15 Cecil Small 1405 1750 80.0
16 Charles Large 621 2756 23.0
17 Charles Small 2135 2756 77.0
18 Dorchester Large 127 666 19.0
19 Dorchester Small 539 666 81.0
20 Frederick Large 1371 6468 21.0
21 Frederick Small 5097 6468 79.0
22 Garrett Large 163 942 17.0
23 Garrett Small 779 942 83.0
24 Harford Large 1198 5614 21.0
25 Harford Small 4416 5614 79.0
26 Howard Large 2154 9866 22.0
27 Howard Small 7712 9866 78.0
28 Kent Large 106 593 18.0
29 Kent Small 487 593 82.0
30 Montgomery Large 5868 27767 21.0
31 Montgomery Small 21899 27767 79.0
32 Prince George's Large 3747 16244 23.0
33 Prince George's Small 12497 16244 77.0
34 Queen Anne's Large 206 1437 14.0
35 Queen Anne's Small 1231 1437 86.0
36 Somerset Large 67 355 19.0
37 Somerset Small 288 355 81.0
38 St. Mary's Large 538 2079 26.0
39 St. Mary's Small 1541 2079 74.0
40 Talbot Large 277 1478 19.0
41 Talbot Small 1201 1478 81.0
42 Washington Large 993 3432 29.0
43 Washington Small 2439 3432 71.0
44 Wicomico Large 701 2487 28.0
45 Wicomico Small 1786 2487 72.0
46 Worcester Large 354 2234 16.0
47 Worcester Small 1880 2234 84.0

Having information on both small businesses and large businesses is redundant, as it creates two rows for each county; moreover, focusing on only one is sufficient, since the other can be derived from it if needed. We choose to focus on small businesses.

The percentage of small businesses spans a narrow range without much variability (71% to 86%). To align with our exploration of whether a higher or lower small business presence relates to better county health, we use the percentage to divide counties into "Low" or "High" small business presence, where "Low" counties are those whose small business percentage falls at or below the median (79%).

In [ ]:
#categorize the small business presence as high vs low based on the median percentage value
#use .copy() so we modify an independent DataFrame rather than a view of df_summary
df_categorize_presence = df_summary[df_summary['Business Size'] == "Small"].copy()
median_value = df_categorize_presence['Percentage'].median()

df_categorize_presence['Small_Business_Presence'] = np.where(
    df_categorize_presence['Percentage'] <= median_value,
    'Low',
    'High'
)

cols_to_keep = ['County Name', 'Percentage', 'Small_Business_Presence']
df_final = df_categorize_presence[cols_to_keep]

df_final
Out[ ]:
County Name Percentage Small_Business_Presence
1 Allegany 73.0 Low
3 Anne Arundel 77.0 Low
5 Baltimore 78.0 Low
7 Baltimore city 76.0 Low
9 Calvert 83.0 High
11 Caroline 83.0 High
13 Carroll 84.0 High
15 Cecil 80.0 High
17 Charles 77.0 Low
19 Dorchester 81.0 High
21 Frederick 79.0 Low
23 Garrett 83.0 High
25 Harford 79.0 Low
27 Howard 78.0 Low
29 Kent 82.0 High
31 Montgomery 79.0 Low
33 Prince George's 77.0 Low
35 Queen Anne's 86.0 High
37 Somerset 81.0 High
39 St. Mary's 74.0 Low
41 Talbot 81.0 High
43 Washington 71.0 Low
45 Wicomico 72.0 Low
47 Worcester 84.0 High

The two datasets for county health and business statistics are merged together into a single dataset showing the percentage of small businesses, whether the presence of small businesses is low or high, the percentage of adults who report fair or poor health, the average number of physically unhealthy days, and the average number of mentally unhealthy days.

In [ ]:
# create FINAL MERGED DATASET
df_filtered_health = df_filtered_health.rename(columns={'County': 'County Name'})
df_final.loc[7, 'County Name'] = "Baltimore City"
df_final = pd.merge(df_final, df_filtered_health, on='County Name', how='left')
df_final
Out[ ]:
County Name Percentage Small_Business_Presence % Fair or Poor Health Average Number of Physically Unhealthy Days Average Number of Mentally Unhealthy Days
0 Allegany 73.0 Low 19 4.2 5.0
1 Anne Arundel 77.0 Low 14 3.2 4.1
2 Baltimore 78.0 Low 16 3.4 4.3
3 Baltimore City 76.0 Low 22 4.2 4.8
4 Calvert 83.0 High 14 3.3 4.3
5 Caroline 83.0 High 20 4.3 5.1
6 Carroll 84.0 High 13 3.2 4.3
7 Cecil 80.0 High 16 3.8 4.8
8 Charles 77.0 Low 17 3.4 4.3
9 Dorchester 81.0 High 19 4.2 4.9
10 Frederick 79.0 Low 14 3.2 4.1
11 Garrett 83.0 High 18 4.0 5.0
12 Harford 79.0 Low 14 3.3 4.3
13 Howard 78.0 Low 11 2.7 3.5
14 Kent 82.0 High 16 3.7 4.6
15 Montgomery 79.0 Low 13 2.9 3.6
16 Prince George's 77.0 Low 17 3.5 4.1
17 Queen Anne's 86.0 High 13 3.2 4.3
18 Somerset 81.0 High 25 4.8 5.2
19 St. Mary's 74.0 Low 15 3.5 4.4
20 Talbot 81.0 High 14 3.4 4.3
21 Washington 71.0 Low 18 4.0 4.8
22 Wicomico 72.0 Low 20 4.1 4.9
23 Worcester 84.0 High 16 3.6 4.6
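One caveat worth checking after a left merge: key values that fail to match exactly (such as the "Baltimore city" vs "Baltimore City" casing corrected above) survive the join as rows of NaN rather than raising an error. A minimal sketch on toy data (not the project's actual tables) of how such rows can be detected:

```python
import pandas as pd

# Toy example: a casing mismatch between the two tables leaves a NaN row
# after a left merge, which we can detect explicitly.
left = pd.DataFrame({'County Name': ['Allegany', 'Baltimore city'],
                     'Percentage': [73.0, 76.0]})
right = pd.DataFrame({'County Name': ['Allegany', 'Baltimore City'],
                      '% Fair or Poor Health': [19, 22]})

merged = pd.merge(left, right, on='County Name', how='left')
unmatched = merged.loc[merged['% Fair or Poor Health'].isna(), 'County Name'].tolist()
print(unmatched)
```

In the merged table above, no health columns contain NaN, confirming that every county name matched.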

Exploratory Data Analysis¶

Now that we have prepped our data, let's take a look at how the parts of the data relate to one another.

Summary Statistics¶

We'll start with some descriptive statistics and get a general idea of how the data in each of our features looks.

For All Counties¶

In [ ]:
# Overall summary statistics for all maryland counties

numeric_cols = ['Percentage', '% Fair or Poor Health',
                'Average Number of Physically Unhealthy Days',
                'Average Number of Mentally Unhealthy Days']

summary_stats = df_final[numeric_cols].describe()

iqr = df_final[numeric_cols].quantile(0.75) - df_final[numeric_cols].quantile(0.25)
summary_stats.loc['IQR'] = iqr

summary_stats.loc['median'] = df_final[numeric_cols].median()

summary_stats = summary_stats.loc[['count', 'mean', 'median', 'std', 'min', '25%', '50%', '75%', 'max', 'IQR']]

summary_stats = summary_stats.round(2)

print(summary_stats)
print()
print("Interpretation:")
print("- Percentage: % of establishments that are small businesses in each county")
print("- % Fair or Poor Health: % of adults reporting fair or poor health")
print("- Physically Unhealthy Days: Average number of physically unhealthy days per month")
print("- Mentally Unhealthy Days: Average number of mentally unhealthy days per month")
        Percentage  % Fair or Poor Health  \
count        24.00                  24.00   
mean         79.08                  16.42   
median       79.00                  16.00   
std           3.99                   3.27   
min          71.00                  11.00   
25%          77.00                  14.00   
50%          79.00                  16.00   
75%          82.25                  18.25   
max          86.00                  25.00   
IQR           5.25                   4.25   

        Average Number of Physically Unhealthy Days  \
count                                         24.00   
mean                                           3.63   
median                                         3.50   
std                                            0.51   
min                                            2.70   
25%                                            3.28   
50%                                            3.50   
75%                                            4.03   
max                                            4.80   
IQR                                            0.75   

        Average Number of Mentally Unhealthy Days  
count                                       24.00  
mean                                         4.48  
median                                       4.35  
std                                          0.44  
min                                          3.50  
25%                                          4.30  
50%                                          4.35  
75%                                          4.82  
max                                          5.20  
IQR                                          0.53  

Interpretation:
- Percentage: % of establishments that are small businesses in each county
- % Fair or Poor Health: % of adults reporting fair or poor health
- Physically Unhealthy Days: Average number of physically unhealthy days per month
- Mentally Unhealthy Days: Average number of mentally unhealthy days per month

Here we can see information such as the average percentage of small businesses and the average % Fair or Poor Health across all counties, along with the averages for physically and mentally unhealthy days. For both Average Number of Physically Unhealthy Days and Average Number of Mentally Unhealthy Days, the standard deviation is quite low (0.51 and 0.44, respectively), which suggests little variation across counties in these two features.
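One caveat when reading these numbers: standard deviation is scale-dependent, so the small SDs for the unhealthy-days features partly reflect their small scale. A quick way to compare spread across differently-scaled features is the coefficient of variation (SD divided by mean); the sketch below plugs in the values from the summary table above.

```python
# Coefficient of variation (CV = std / mean) is unitless, so it lets us
# compare spread across features measured on different scales.
# (mean, std) pairs copied from the summary table above.
stats = {
    "Percentage": (79.08, 3.99),
    "% Fair or Poor Health": (16.42, 3.27),
    "Physically Unhealthy Days": (3.63, 0.51),
    "Mentally Unhealthy Days": (4.48, 0.44),
}

cv = {name: round(sd / mean, 3) for name, (mean, sd) in stats.items()}
for name, value in cv.items():
    print(f"{name}: CV = {value}")
```

By this relative measure, % Fair or Poor Health (CV ≈ 0.20) and the unhealthy-days features (CV ≈ 0.14 and 0.10) actually vary more across counties than small business percentage (CV ≈ 0.05) does.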

Grouped by Small Business Presence¶

Let's see what else we can find by comparing descriptive statistics between counties with a low small business presence and those with a high small business presence.

In [ ]:
grouped_stats = df_final.groupby('Small_Business_Presence')[numeric_cols].agg([
    'count', 'mean', 'median', 'std', 'min', 'max',
    lambda x: x.quantile(0.25),
    lambda x: x.quantile(0.75),
    lambda x: x.quantile(0.75) - x.quantile(0.25)
])

grouped_stats = grouped_stats.rename(columns={
    '<lambda_0>': 'Q1',
    '<lambda_1>': 'Q3',
    '<lambda_2>': 'IQR'
})

grouped_stats = grouped_stats.round(2)

print(grouped_stats)
print()

print("\nNumber of counties in each category:")
print(df_final['Small_Business_Presence'].value_counts().sort_index())
                        Percentage                                        \
                             count   mean median   std   min   max    Q1   
Small_Business_Presence                                                    
High                            11  82.55   83.0  1.75  80.0  86.0  81.0   
Low                             13  76.15   77.0  2.76  71.0  79.0  74.0   

                                   % Fair or Poor Health  ...  \
                           Q3  IQR                 count  ...   
Small_Business_Presence                                   ...   
High                     83.5  2.5                    11  ...   
Low                      78.0  4.0                    13  ...   

                        Average Number of Physically Unhealthy Days  \
                                                                IQR   
Small_Business_Presence                                               
High                                                           0.75   
Low                                                            0.80   

                        Average Number of Mentally Unhealthy Days        \
                                                            count  mean   
Small_Business_Presence                                                   
High                                                           11  4.67   
Low                                                            13  4.32   

                                                                 
                        median   std  min  max   Q1    Q3   IQR  
Small_Business_Presence                                          
High                       4.6  0.35  4.3  5.2  4.3  4.95  0.65  
Low                        4.3  0.47  3.5  5.0  4.1  4.80  0.70  

[2 rows x 36 columns]


Number of counties in each category:
Small_Business_Presence
High    11
Low     13
Name: count, dtype: int64
In [ ]:
# detailed comparison of high vs low business presence
comparison = pd.DataFrame()

for col in numeric_cols:
    low_data = df_final[df_final['Small_Business_Presence'] == 'Low'][col]
    high_data = df_final[df_final['Small_Business_Presence'] == 'High'][col]

    comparison[col] = [
        f"Low: {low_data.mean():.2f}, High: {high_data.mean():.2f}",
        f"Low: {low_data.median():.2f}, High: {high_data.median():.2f}",
        f"Low: {low_data.std():.2f}, High: {high_data.std():.2f}",
        f"Low: {low_data.min():.2f}, High: {high_data.min():.2f}",
        f"Low: {low_data.max():.2f}, High: {high_data.max():.2f}",
        f"Low: {(low_data.quantile(0.75) - low_data.quantile(0.25)):.2f}, High: {(high_data.quantile(0.75) - high_data.quantile(0.25)):.2f}"
    ]

comparison.index = ['Mean', 'Median', 'Std Dev', 'Min', 'Max', 'IQR']

print(comparison.to_string())
                      Percentage    % Fair or Poor Health Average Number of Physically Unhealthy Days Average Number of Mentally Unhealthy Days
Mean     Low: 76.15, High: 82.55  Low: 16.15, High: 16.73                       Low: 3.51, High: 3.77                     Low: 4.32, High: 4.67
Median   Low: 77.00, High: 83.00  Low: 16.00, High: 16.00                       Low: 3.40, High: 3.70                     Low: 4.30, High: 4.60
Std Dev    Low: 2.76, High: 1.75    Low: 3.08, High: 3.61                       Low: 0.49, High: 0.51                     Low: 0.47, High: 0.35
Min      Low: 71.00, High: 80.00  Low: 11.00, High: 13.00                       Low: 2.70, High: 3.20                     Low: 3.50, High: 4.30
Max      Low: 79.00, High: 86.00  Low: 22.00, High: 25.00                       Low: 4.20, High: 4.80                     Low: 5.00, High: 5.20
IQR        Low: 4.00, High: 2.50    Low: 4.00, High: 4.50                       Low: 0.80, High: 0.75                     Low: 0.70, High: 0.65

Let's also visualize this data to complement the numbers. Boxplots are helpful here because they map out range and centrality at a glance. Below, boxplots compare counties with a low small business presence to those with a high small business presence on each of the measurements of health.

In [ ]:
#visual representation of summary statistics
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Distribution of Variables by Small Business Presence', fontsize=16, fontweight='bold')

variables = [
    ('Percentage', 'Small Business Percentage'),
    ('% Fair or Poor Health', '% Fair or Poor Health'),
    ('Average Number of Physically Unhealthy Days', 'Avg Physically Unhealthy Days'),
    ('Average Number of Mentally Unhealthy Days', 'Avg Mentally Unhealthy Days')
]

for idx, (col, title) in enumerate(variables):
    ax = axes[idx // 2, idx % 2]

    data_low = df_final[df_final['Small_Business_Presence'] == 'Low'][col]
    data_high = df_final[df_final['Small_Business_Presence'] == 'High'][col]

    bp = ax.boxplot([data_low, data_high],
                     tick_labels=['Low', 'High'],
                     patch_artist=True,
                     showmeans=True,
                     meanprops=dict(marker='D', markerfacecolor='red', markersize=8))

    # Color the boxes
    bp['boxes'][0].set_facecolor('lightblue')
    bp['boxes'][1].set_facecolor('lightgreen')

    ax.set_xlabel('Small Business Presence', fontweight='bold')
    ax.set_ylabel(title, fontweight='bold')
    ax.set_title(title, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()
[Figure: boxplots of each variable, grouped by low vs. high small business presence]
In [ ]:
# key insights from summary statistics
low_group = df_final[df_final['Small_Business_Presence'] == 'Low']
high_group = df_final[df_final['Small_Business_Presence'] == 'High']

print(f"1. SMALL BUSINESS DENSITY:")
print(f"   - Low group: Mean = {low_group['Percentage'].mean():.2f}%, Median = {low_group['Percentage'].median():.2f}%")
print(f"   - High group: Mean = {high_group['Percentage'].mean():.2f}%, Median = {high_group['Percentage'].median():.2f}%")
print(f"   - Difference in means: {high_group['Percentage'].mean() - low_group['Percentage'].mean():.2f} percentage points")
print()

print(f"2. FAIR OR POOR HEALTH:")
print(f"   - Low group: Mean = {low_group['% Fair or Poor Health'].mean():.2f}%, Median = {low_group['% Fair or Poor Health'].median():.2f}%")
print(f"   - High group: Mean = {high_group['% Fair or Poor Health'].mean():.2f}%, Median = {high_group['% Fair or Poor Health'].median():.2f}%")
print(f"   - Difference in means: {high_group['% Fair or Poor Health'].mean() - low_group['% Fair or Poor Health'].mean():.2f} percentage points")
print()

print(f"3. PHYSICALLY UNHEALTHY DAYS:")
print(f"   - Low group: Mean = {low_group['Average Number of Physically Unhealthy Days'].mean():.2f} days")
print(f"   - High group: Mean = {high_group['Average Number of Physically Unhealthy Days'].mean():.2f} days")
print(f"   - Difference in means: {high_group['Average Number of Physically Unhealthy Days'].mean() - low_group['Average Number of Physically Unhealthy Days'].mean():.2f} days")
print()

print(f"4. MENTALLY UNHEALTHY DAYS:")
print(f"   - Low group: Mean = {low_group['Average Number of Mentally Unhealthy Days'].mean():.2f} days")
print(f"   - High group: Mean = {high_group['Average Number of Mentally Unhealthy Days'].mean():.2f} days")
print(f"   - Difference in means: {high_group['Average Number of Mentally Unhealthy Days'].mean() - low_group['Average Number of Mentally Unhealthy Days'].mean():.2f} days")
print()

print("5. VARIABILITY:")
print(f"   - The IQR for small business percentage is {df_final['Percentage'].quantile(0.75) - df_final['Percentage'].quantile(0.25):.2f}")
print(f"   - The IQR for % Fair/Poor Health is {df_final['% Fair or Poor Health'].quantile(0.75) - df_final['% Fair or Poor Health'].quantile(0.25):.2f}")
print(f"   - Counties show {'high' if df_final['Percentage'].std() > 10 else 'moderate'} variability in small business density (SD = {df_final['Percentage'].std():.2f})")
1. SMALL BUSINESS DENSITY:
   - Low group: Mean = 76.15%, Median = 77.00%
   - High group: Mean = 82.55%, Median = 83.00%
   - Difference in means: 6.39 percentage points

2. FAIR OR POOR HEALTH:
   - Low group: Mean = 16.15%, Median = 16.00%
   - High group: Mean = 16.73%, Median = 16.00%
   - Difference in means: 0.57 percentage points

3. PHYSICALLY UNHEALTHY DAYS:
   - Low group: Mean = 3.51 days
   - High group: Mean = 3.77 days
   - Difference in means: 0.27 days

4. MENTALLY UNHEALTHY DAYS:
   - Low group: Mean = 4.32 days
   - High group: Mean = 4.67 days
   - Difference in means: 0.35 days

5. VARIABILITY:
   - The IQR for small business percentage is 5.25
   - The IQR for % Fair/Poor Health is 4.25
   - Counties show moderate variability in small business density (SD = 3.99)

Small business percentage differs substantially between the low and high presence groups (a 6.39 percentage point difference in means), but this is expected since Small_Business_Presence was computed directly from the small business percentage. The feature with the next largest difference between the two groups is % Fair or Poor Health (0.57 percentage points), followed by Average Number of Mentally Unhealthy Days (0.35 days).

Statistical Method #1¶

From our summary statistics, we observe that of the 3 measurements of health, % Fair or Poor Health has the greatest potential to differ between counties with low versus high small business presence. As we start to target our question, "does the presence of small businesses in Maryland counties have an impact on health outcomes in the respective county?", we can use % Fair or Poor Health in our definition of "health".

Is the density of small businesses in a county related to the health of that county? This is explored using Pearson's R.

But first, "health" is usually thought of in positive terms ("good health"), while our dataset reports the opposite: "% Fair or Poor Health". We address this by creating a new column, "% Good Health", computed as 100 minus "% Fair or Poor Health".

In [ ]:
#create a table with only the relevant data (county name, density of small businesses, and percent of poor/fair health)
#include a new column for percentage of good health (100 - percentage of fair/poor health)
df_density_v_health = df_final[['County Name', '% Fair or Poor Health', 'Percentage']].copy()  # .copy() avoids SettingWithCopyWarning
df_density_v_health['% Good Health'] = 100 - df_density_v_health['% Fair or Poor Health']
df_density_v_health
Out[ ]:
County Name % Fair or Poor Health Percentage % Good Health
0 Allegany 19 73.0 81
1 Anne Arundel 14 77.0 86
2 Baltimore 16 78.0 84
3 Baltimore City 22 76.0 78
4 Calvert 14 83.0 86
5 Caroline 20 83.0 80
6 Carroll 13 84.0 87
7 Cecil 16 80.0 84
8 Charles 17 77.0 83
9 Dorchester 19 81.0 81
10 Frederick 14 79.0 86
11 Garrett 18 83.0 82
12 Harford 14 79.0 86
13 Howard 11 78.0 89
14 Kent 16 82.0 84
15 Montgomery 13 79.0 87
16 Prince George's 17 77.0 83
17 Queen Anne's 13 86.0 87
18 Somerset 25 81.0 75
19 St. Mary's 15 74.0 85
20 Talbot 14 81.0 86
21 Washington 18 71.0 82
22 Wicomico 20 72.0 80
23 Worcester 16 84.0 84

Now we can run Pearson's R.

H0 (Null Hypothesis): Counties with a higher density of small businesses do not differ in the percentage of adults with good health

HA (Alternative Hypothesis): Counties with a higher density of small businesses have a higher percentage of adults with good health

In [ ]:
#run and print out results of Pearson's R
p = pearsonr(df_density_v_health['% Good Health'], df_density_v_health['Percentage'])
print("correlation coefficient:", p.statistic)
print("p value:", p.pvalue)
correlation coefficient: 0.2361976963530344
p value: 0.26649506614445556
In [ ]:
#create and show a scatterplot
plt.scatter(df_density_v_health['Percentage'], df_density_v_health['% Good Health'])
plt.ylabel('% Good Health')
plt.xlabel('Density of Small Businesses')
plt.title('Correlation between Density of Small Businesses and Good Health')
m, b = np.polyfit(df_density_v_health['Percentage'], df_density_v_health['% Good Health'], 1)
plt.plot(df_density_v_health['Percentage'], m*df_density_v_health['Percentage'] + b, color='purple')
plt.show()
[Figure: scatterplot of % good health vs. density of small businesses, with fitted regression line]

The Pearson's R correlation coefficient between counties' density of small businesses and percentage of good health is 0.24, which indicates a weak positive correlation: as the density of small businesses increases, the percentage of good health tends only weakly to increase. Furthermore, the p-value of 0.27 is greater than our significance level (α = 0.05), so this relationship is not statistically significant. The scatterplot shows the data points spread at varying distances from the regression line rather than clustering tightly around it. We therefore fail to reject the null hypothesis: there is not enough evidence to conclude that the percentage of good health changes as the density of small businesses within a county increases.
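A side note on the test itself: our alternative hypothesis is directional (higher density goes with better health), while `pearsonr` runs a two-sided test by default. A minimal sketch of the one-sided version, assuming SciPy ≥ 1.9 for the `alternative` parameter and copying the 24 values from the table above:

```python
from scipy.stats import pearsonr

# Density of small businesses and % good health, copied from the table above
density = [73, 77, 78, 76, 83, 83, 84, 80, 77, 81, 79, 83,
           79, 78, 82, 79, 77, 86, 81, 74, 81, 71, 72, 84]
good_health = [81, 86, 84, 78, 86, 80, 87, 84, 83, 81, 86, 82,
               86, 89, 84, 87, 83, 87, 75, 85, 86, 82, 80, 84]

two_sided = pearsonr(density, good_health)
one_sided = pearsonr(density, good_health, alternative='greater')

print(f"r = {two_sided.statistic:.2f}")
print(f"two-sided p = {two_sided.pvalue:.3f}")
print(f"one-sided p = {one_sided.pvalue:.3f}")  # half the two-sided p when r > 0
```

Even the one-sided p-value (about 0.13) stays well above α = 0.05, so the conclusion is unchanged.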

Statistical Method #2¶

How about if we focused on a specific aspect of health, such as mental health?

Does the mean number of mentally unhealthy days differ significantly between counties with low vs. high proportions of small businesses? This is explored using the ANOVA Test.

Again, let's create a dataset with the information we need.

In [ ]:
# create a clean dataset with only the relevant columns
df_density_v_mental_health = df_final[['County Name', 'Percentage', 'Small_Business_Presence', 'Average Number of Mentally Unhealthy Days']]
df_density_v_mental_health
Out[ ]:
County Name Percentage Small_Business_Presence Average Number of Mentally Unhealthy Days
0 Allegany 73.0 Low 5.0
1 Anne Arundel 77.0 Low 4.1
2 Baltimore 78.0 Low 4.3
3 Baltimore City 76.0 Low 4.8
4 Calvert 83.0 High 4.3
5 Caroline 83.0 High 5.1
6 Carroll 84.0 High 4.3
7 Cecil 80.0 High 4.8
8 Charles 77.0 Low 4.3
9 Dorchester 81.0 High 4.9
10 Frederick 79.0 Low 4.1
11 Garrett 83.0 High 5.0
12 Harford 79.0 Low 4.3
13 Howard 78.0 Low 3.5
14 Kent 82.0 High 4.6
15 Montgomery 79.0 Low 3.6
16 Prince George's 77.0 Low 4.1
17 Queen Anne's 86.0 High 4.3
18 Somerset 81.0 High 5.2
19 St. Mary's 74.0 Low 4.4
20 Talbot 81.0 High 4.3
21 Washington 71.0 Low 4.8
22 Wicomico 72.0 Low 4.9
23 Worcester 84.0 High 4.6

Now we can run our ANOVA.

H0 (Null Hypothesis): The mean number of mentally unhealthy days is the same across counties with low and high percentages of small businesses.

HA (Alternative Hypothesis): The mean number of mentally unhealthy days differs significantly across counties with low and high proportions of small businesses.

In [ ]:
low = df_density_v_mental_health[df_density_v_mental_health['Small_Business_Presence'] == 'Low']
high = df_density_v_mental_health[df_density_v_mental_health['Small_Business_Presence'] == 'High']
statistic, p_value = f_oneway(low['Average Number of Mentally Unhealthy Days'], high['Average Number of Mentally Unhealthy Days'])
print(f"P-value = {p_value}")
P-value = 0.05222867166235262
In [ ]:
plt.figure(figsize=(8, 6))
plt.boxplot([low['Average Number of Mentally Unhealthy Days'],
             high['Average Number of Mentally Unhealthy Days']],
            tick_labels=['Low', 'High'])
plt.xlabel('Small Business Presence')
plt.ylabel('Average Number of Mentally Unhealthy Days')
plt.title('Comparison of Low vs High Small Business Presence \nwith Average Mentally Unhealthy Days')
plt.show()
[Figure: boxplots of average mentally unhealthy days for low vs. high small business presence]

A one-way ANOVA test was performed to examine whether there is a significant difference in the mean number of mentally unhealthy days between counties with low and high proportions of small businesses. The resulting p-value of 0.052 is (just barely) greater than the significance level (α = 0.05), indicating no statistically significant difference, so we fail to reject the null hypothesis (H0). Post-hoc tests apply only when an ANOVA with three or more groups is significant, so none are needed here; with just two groups there are no additional pairwise comparisons to make in any case. There is not enough evidence to conclude that the mean number of mentally unhealthy days differs between counties with low and high proportions of small businesses.
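Worth noting: with only two groups, a one-way ANOVA is mathematically equivalent to an independent two-sample t-test (F = t²), so either test yields the same p-value. A quick check, using the mentally-unhealthy-days values copied from the table above:

```python
from scipy.stats import f_oneway, ttest_ind

# Average mentally unhealthy days, copied from the table above
low = [5.0, 4.1, 4.3, 4.8, 4.3, 4.1, 4.3, 3.5, 3.6, 4.1, 4.4, 4.8, 4.9]
high = [4.3, 5.1, 4.3, 4.8, 4.9, 5.0, 4.6, 4.3, 5.2, 4.3, 4.6]

f_stat, p_anova = f_oneway(low, high)
t_stat, p_ttest = ttest_ind(low, high)  # two-sided, equal variances assumed

print(f"ANOVA:  F = {f_stat:.3f}, p = {p_anova:.4f}")
print(f"t-test: t = {t_stat:.3f}, p = {p_ttest:.4f}")  # same p; F = t^2
```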

Modeling¶

Now that we know how our data looks, what if we wanted to observe other data? What if we looked at the business data of Baltimore city or of a county from a different state? Could we use what we know about Maryland counties to predict the health of another location?

We in fact can! We can create a machine learning model and train it on Maryland county data using business data as the features and use it to predict the health outcomes of a location.

Here, we use a decision tree regressor model to predict the % Fair or Poor Health of an area. Decision trees work by identifying the feature splits that best explain the variability in the data, repeatedly partitioning the dataset into subsets of higher purity; the ordered sequence of these splits becomes the model. This is regression because our target is a continuous value rather than a discrete label, and it is supervised learning because the model learns from labelled data and is expected to produce similar outputs.
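To make the splitting idea concrete, here is a small sketch (with made-up values, not our county data) of how a depth-1 `DecisionTreeRegressor` chooses the single threshold that most reduces squared error, then predicts the mean of each resulting subset:

```python
import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Made-up toy data: one feature, a target with an obvious break near 80
X = np.array([[70], [72], [75], [78], [81], [84], [86], [88]])
y = np.array([20.0, 19.0, 18.5, 18.0, 14.0, 13.5, 13.0, 12.5])

# A depth-1 tree (a "stump") makes exactly one split
stump = DecisionTreeRegressor(max_depth=1, random_state=0).fit(X, y)

threshold = stump.tree_.threshold[0]  # chosen split point on the feature
left, right = stump.tree_.children_left[0], stump.tree_.children_right[0]
left_mean = stump.tree_.value[left][0][0]    # prediction for X <= threshold
right_mean = stump.tree_.value[right][0][0]  # prediction for X > threshold

print(f"split: feature <= {threshold:.1f}")
print(f"left-leaf prediction:  {left_mean:.3f}")   # mean of y below the split
print(f"right-leaf prediction: {right_mean:.3f}")  # mean of y above the split
```

The stump splits midway between 78 and 81, where the jump in the target is largest, and each leaf predicts the mean of its subset.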

First, we need to prep the data to be given to the model. Models usually cannot handle categorical strings, so non-numerical data must be converted into numerical data.

Then, we can split our dataset into a subset used for training the model and a subset used for testing it. We chose a 70/30 training-to-testing split, so 70% of the data points are used to train the model.

In [ ]:
# Pre-processing
# Made copy of data
df_model = df_final.copy()

# Encoding the categorical feature since scikit-learn cannot handle categorical strings
l = LabelEncoder()
df_model['Small_Biz_Encoded'] = l.fit_transform(df_model['Small_Business_Presence'])


# define features
X = df_model[['Small_Biz_Encoded']]

# define target
y = df_model['% Fair or Poor Health']

# Modeling

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Using the Decision Tree Regression version since output is a percent
dtree = DecisionTreeRegressor(max_depth=1, random_state=42)
dtree.fit(X_train, y_train)

# Predict
y_pred = dtree.predict(X_test)

# Evaluation

# For regression, we look at how far off the predictions are on average
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

# model's prediction is off by x percent
print(f"Mean Absolute Error: {mae:.2f}")
# explains variance
print(f"R-Squared Score: {r2:.2f}")
Mean Absolute Error: 3.47
R-Squared Score: -0.38

After training, the model is evaluated on the testing set. Here we calculated the Mean Absolute Error (MAE) and the R-Squared score. The MAE of 3.47 means that, on average, predictions were 3.47 percentage points away from actual values; ideally MAE is close to 0, and given the scale of % Fair or Poor Health, being off by 3.47 points is poor. The R-Squared score of -0.38 further confirms the inaccuracy: a negative R² means the model explains none of the variance and actually performs worse than simply predicting the mean.
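To see why a negative R² means worse-than-baseline: R² = 0 corresponds exactly to always predicting the mean of the test targets, so any score below 0 is doing worse than that constant guess. A small sketch with made-up values:

```python
import numpy as np
from sklearn.metrics import r2_score

# Made-up targets, just to anchor the R^2 scale
y_true = np.array([11.0, 14.0, 16.0, 19.0, 25.0])

# A "model" that always predicts the mean scores exactly 0 ...
baseline = np.full_like(y_true, y_true.mean())
print(r2_score(y_true, baseline))  # 0.0

# ... so predictions farther from the truth than the mean go negative
worse = np.array([20.0, 8.0, 25.0, 10.0, 15.0])
print(r2_score(y_true, worse))
```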

A likely issue is that the dataset the model is trained on is not complex enough. A dataset lacking complexity can lead the model to generalize too broadly and underfit, meaning it cannot pick up on underlying patterns. We attempt to increase the complexity by adding a second feature (the percentage of small businesses) to the feature set.

In [ ]:
# 2 FEATURES INSTEAD OF ONE, SAME MAX DEPTH OF 1
# Pre-processing
# Made copy of data
df_model = df_final.copy()

# Encoding the categorical feature since scikit-learn cannot handle categorical strings
l = LabelEncoder()
df_model['Small_Biz_Encoded'] = l.fit_transform(df_model['Small_Business_Presence'])


# define features
X = df_model[['Small_Biz_Encoded', 'Percentage']]

# define target
y = df_model['% Fair or Poor Health']

# Modeling

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Using the Decision Tree Regression version since output is a percent
dtree = DecisionTreeRegressor(max_depth=1, random_state=42)
dtree.fit(X_train, y_train)

# Predict
y_pred = dtree.predict(X_test)

# Evaluation

# For regression, we look at how far off the predictions are on average
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

# model's prediction is off by x percent
print(f"Mean Absolute Error: {mae:.2f}")
# explains variance
print(f"R-Squared Score: {r2:.2f}")
Mean Absolute Error: 3.71
R-Squared Score: -0.48

Again the evaluation metrics show that the model is inaccurate: the Mean Absolute Error is still high and the R-Squared score is still negative.

Given that our initial data exploration suggested no significant relationships between small businesses and different aspects of health, it makes sense that our models are inaccurate. There is no relationship between our variables, and thus no pattern for the model to pick up on.

Evaluation and Pivot¶

When creating the decision trees for this dataset, we utilized two forms of our independent variable: the raw 'Percentage' of small businesses in a county and an encoded version ('Small_Biz_Encoded') representing low versus high presence. We tested Decision Tree Regressors with varying max depths to capture potential patterns.

However, upon evaluation, we observed that the R-Squared score was below 0. This indicates that our model performed worse than a baseline that simply predicts the average for every county. We realized that predicting the percentage of fair or poor health likely needs more inputs, since one or two features are not enough to capture it. We therefore decided to pivot to social vulnerability factors (which include business and economic features) and to examine how they can be used to predict fair or poor health. The SVI is a more comprehensive source that aggregates various economic and social factors. We discussed this pivot with Dr. Alam, and she approved it since our goal is to better predict health behaviors.

To summarize: based on our statistical tests and machine learning models, we determined that there is no statistically significant relationship between health outcomes and small business presence. We therefore pivoted our original research question to focus on the relationship between the social vulnerability index and health outcomes instead.

Data Curation¶

Our new dataset for the independent variable is CDC's Social Vulnerability Index for 2022. We downloaded the data just for Maryland counties.

SVI Dataset Source: https://svi.cdc.gov/map25/data/docs/SVI2022Documentation_ZCTA.pdf

In [ ]:
#load the svi dataset
df_svi = pd.read_csv("sample_data/Maryland_county.csv")
print(df_svi.shape)
df_svi.head(10)
(24, 158)
Out[ ]:
ST STATE ST_ABBR STCNTY COUNTY FIPS LOCATION AREA_SQMI E_TOTPOP M_TOTPOP ... EP_ASIAN MP_ASIAN EP_AIAN MP_AIAN EP_NHPI MP_NHPI EP_TWOMORE MP_TWOMORE EP_OTHERRACE MP_OTHERRACE
0 24 Maryland MD 24001 Allegany County 24001 Allegany County, Maryland 422.198632 68161 0 ... 0.9 0.1 0.1 0.1 0.0 0.1 2.9 0.3 0.2 0.1
1 24 Maryland MD 24003 Anne Arundel County 24003 Anne Arundel County, Maryland 414.806257 588109 0 ... 4.0 0.1 0.1 0.1 0.0 0.1 4.9 0.4 0.5 0.1
2 24 Maryland MD 24005 Baltimore County 24005 Baltimore County, Maryland 598.357965 850737 0 ... 6.1 0.1 0.1 0.1 0.0 0.1 3.6 0.2 0.4 0.1
3 24 Maryland MD 24009 Calvert County 24009 Calvert County, Maryland 213.189517 93244 0 ... 2.0 0.4 0.1 0.1 0.1 0.1 5.2 0.8 0.4 0.2
4 24 Maryland MD 24011 Caroline County 24011 Caroline County, Maryland 319.441587 33320 0 ... 0.5 0.2 0.1 0.1 0.0 0.1 4.1 0.7 0.8 0.5
5 24 Maryland MD 24013 Carroll County 24013 Carroll County, Maryland 447.629615 173225 0 ... 2.2 0.1 0.1 0.1 0.0 0.1 2.6 0.4 0.5 0.2
6 24 Maryland MD 24015 Cecil County 24015 Cecil County, Maryland 346.299723 103876 0 ... 1.3 0.3 0.1 0.1 0.0 0.1 4.0 0.7 0.5 0.3
7 24 Maryland MD 24017 Charles County 24017 Charles County, Maryland 457.825693 167035 0 ... 3.2 0.2 0.3 0.1 0.0 0.1 5.7 0.7 0.5 0.3
8 24 Maryland MD 24019 Dorchester County 24019 Dorchester County, Maryland 540.764325 32557 0 ... 1.3 0.1 0.0 0.1 0.0 0.1 5.1 1.2 0.5 0.3
9 24 Maryland MD 24021 Frederick County 24021 Frederick County, Maryland 660.605502 273829 0 ... 4.9 0.2 0.1 0.1 0.0 0.1 4.4 0.4 0.3 0.1

10 rows × 158 columns

We follow the data science pipeline as before, first loading in our data and then cleaning it up.

We can remove the location columns other than "COUNTY". We also adjust the formatting of the county names by removing the word "County", which both improves legibility and makes this dataset consistent with our health dataset. (Baltimore City is an independent city rather than a county, but the SVI data reports it as a county-equivalent, so it remains in the merged data below.)

From the SVI data, we only need to keep the columns which start with EP_ as those contain the actual percentage data for each variable. We check whether any data is missing that we need to handle.

In [ ]:
#filter the dataset and remove columns that aren't needed. for features, keep columns that start with EP_ as they are percentage values

cols = ["COUNTY"] + [col for col in df_svi.columns if col.startswith("EP_")]

df_svi_filtered = df_svi[cols].copy()  # .copy() so later edits don't trigger SettingWithCopyWarning
#check for null values, overall structure of the new dataset
print(df_svi_filtered.isna().sum())
print(df_svi_filtered.shape)
COUNTY          0
EP_POV150       0
EP_UNEMP        0
EP_HBURD        0
EP_NOHSDP       0
EP_UNINSUR      0
EP_AGE65        0
EP_AGE17        0
EP_DISABL       0
EP_SNGPNT       0
EP_LIMENG       0
EP_MINRTY       0
EP_MUNIT        0
EP_MOBILE       0
EP_CROWD        0
EP_NOVEH        0
EP_GROUPQ       0
EP_NOINT        0
EP_AFAM         0
EP_HISP         0
EP_ASIAN        0
EP_AIAN         0
EP_NHPI         0
EP_TWOMORE      0
EP_OTHERRACE    0
dtype: int64
(24, 25)

There does not seem to be any missing data, so we can go ahead and merge the SVI dataset with the county health dataset where we can use % Fair or Poor Health as a measurement for a county's health outcomes.

In [ ]:
#create a NEW merged final dataset
df_svi_filtered.rename(columns={'COUNTY': 'County Name'}, inplace=True)
df_svi_filtered['County Name'] = df_svi_filtered['County Name'].str.replace("county", '', case=False).str.strip()

df_new_final = pd.merge(df_filtered_health, df_svi_filtered, on='County Name', how='left')

#focusing on '% Fair or Poor Health' as our main target variable
df_new_final = df_new_final.drop(columns=['Average Number of Physically Unhealthy Days', 'Average Number of Mentally Unhealthy Days'])
df_new_final
Out[ ]:
County Name % Fair or Poor Health EP_POV150 EP_UNEMP EP_HBURD EP_NOHSDP EP_UNINSUR EP_AGE65 EP_AGE17 EP_DISABL ... EP_NOVEH EP_GROUPQ EP_NOINT EP_AFAM EP_HISP EP_ASIAN EP_AIAN EP_NHPI EP_TWOMORE EP_OTHERRACE
0 Allegany 19 26.4 6.8 23.3 9.4 4.0 20.8 17.6 18.9 ... 9.6 10.6 17.5 7.2 2.0 0.9 0.1 0.0 2.9 0.2
1 Anne Arundel 14 9.7 4.2 19.3 6.5 4.5 15.4 22.4 10.7 ... 3.8 1.8 5.6 17.2 8.7 4.0 0.1 0.0 4.9 0.5
2 Baltimore 16 16.1 5.2 26.6 8.3 5.3 17.6 21.6 11.9 ... 7.8 2.3 10.0 29.6 6.1 6.1 0.1 0.0 3.6 0.4
3 Calvert 14 7.8 3.2 15.9 5.5 3.2 15.5 23.1 9.6 ... 3.1 0.6 8.1 12.1 4.6 2.0 0.1 0.1 5.2 0.4
4 Caroline 20 26.6 4.6 29.1 15.6 6.8 16.9 23.5 14.6 ... 6.1 1.7 14.1 13.2 8.2 0.5 0.1 0.0 4.1 0.8
5 Carroll 13 8.9 3.1 17.1 5.8 2.9 17.4 21.7 12.5 ... 4.0 2.1 9.5 3.7 4.1 2.2 0.1 0.0 2.6 0.5
6 Cecil 16 17.6 4.9 24.5 8.6 3.9 16.4 22.3 13.0 ... 5.2 1.5 10.3 6.6 4.9 1.3 0.1 0.0 4.0 0.5
7 Charles 17 10.1 4.9 20.0 5.6 4.0 12.9 23.8 10.4 ... 3.3 0.8 8.6 48.3 6.7 3.2 0.3 0.0 5.7 0.5
8 Dorchester 19 25.6 5.5 31.5 12.2 5.2 21.9 21.0 17.0 ... 10.0 1.7 14.1 25.6 6.1 1.3 0.0 0.0 5.1 0.5
9 Frederick 14 9.6 3.7 19.0 6.7 4.5 14.9 23.1 9.9 ... 3.9 1.6 8.0 9.8 11.0 4.9 0.1 0.0 4.4 0.3
10 Garrett 18 20.1 4.6 17.9 9.5 5.8 23.0 18.0 18.2 ... 7.4 2.3 16.9 1.3 1.3 0.4 0.0 0.1 1.0 0.4
11 Harford 14 11.9 3.7 19.6 5.8 3.5 16.7 22.2 11.0 ... 4.0 0.6 7.2 14.0 5.0 2.9 0.0 0.0 3.9 0.5
12 Howard 11 8.7 3.8 17.1 4.8 3.9 14.5 24.0 8.4 ... 4.0 0.8 4.1 19.7 7.5 18.8 0.1 0.1 4.9 0.6
13 Kent 16 17.6 3.3 29.9 9.5 4.3 26.8 15.7 15.0 ... 6.5 8.4 20.1 14.1 4.7 1.0 0.0 0.0 2.8 0.6
14 Montgomery 13 12.0 4.8 22.1 8.7 6.7 16.2 22.9 8.8 ... 7.9 0.8 5.2 18.2 20.0 15.2 0.1 0.0 4.2 0.8
15 Prince George's 17 14.9 6.7 27.9 12.9 10.5 14.1 22.1 9.9 ... 9.3 1.9 7.9 60.4 20.0 4.0 0.2 0.0 3.2 0.6
16 Queen Anne's 13 11.2 3.6 22.2 6.7 5.0 19.7 21.2 10.1 ... 3.1 0.9 13.4 5.8 4.6 1.1 0.0 0.0 3.8 0.6
17 St. Mary's 15 12.5 3.7 19.8 8.2 4.0 13.3 24.0 11.3 ... 4.5 2.2 9.7 14.3 5.7 2.5 0.1 0.0 5.4 0.4
18 Somerset 25 29.5 7.0 31.7 15.3 3.9 17.2 16.7 16.1 ... 7.5 19.8 18.8 38.9 4.0 0.8 0.2 0.0 4.1 0.2
19 Talbot 14 16.4 3.2 24.6 7.8 4.4 29.7 18.2 16.7 ... 7.2 1.2 11.8 12.1 7.3 1.3 0.1 0.0 2.5 0.3
20 Washington 18 20.1 5.0 24.8 11.6 5.5 17.7 21.7 15.3 ... 7.9 5.0 14.1 11.1 6.2 1.7 0.1 0.1 4.3 0.2
21 Wicomico 20 23.0 6.9 28.9 11.5 6.5 16.2 22.1 12.2 ... 7.4 3.8 13.4 25.8 5.7 2.8 0.1 0.0 4.6 0.2
22 Worcester 16 15.0 6.5 27.4 6.5 5.8 28.0 17.1 15.3 ... 5.6 1.4 11.4 12.2 3.8 1.5 0.0 0.0 2.9 0.5
23 Baltimore City 22 29.4 6.9 36.3 12.9 5.5 14.8 20.4 16.4 ... 26.5 3.7 18.8 60.7 5.9 2.5 0.2 0.0 3.2 0.5

24 rows × 26 columns

Exploratory Data Analysis¶

Statistical Method #1¶

In this new dataset, we have a lot of variables to work with. This is great as it can give our data more complexity and help a model learn patterns, but it can also be a weakness. More variables mean more computation is required, and this can be especially inefficient if some variables are highly correlated with one another. As part of our data exploration, we want to see if we need to cut down on any of our variables.

Which features are the most correlated with the main target variable (% of Fair or Poor Health)? Which features are correlated with each other and may negatively impact our analysis?

In [ ]:
#calculate the correlation coefficients with the target variable
corr = df_new_final.corr(numeric_only=True)['% Fair or Poor Health'].sort_values(ascending=False)
print(corr)
#keep only the most important features (correlation of 0.6+)
selected_features = corr[corr > 0.6].index.tolist()
print(selected_features)

#filter the final dataset to only include the selected features
df_selected = df_new_final[selected_features]
% Fair or Poor Health    1.000000
EP_POV150                0.903903
EP_NOHSDP                0.843847
EP_UNEMP                 0.757730
EP_HBURD                 0.742666
EP_NOINT                 0.739843
EP_SNGPNT                0.695800
EP_GROUPQ                0.667893
EP_DISABL                0.639614
EP_MOBILE                0.588332
EP_NOVEH                 0.566284
EP_AFAM                  0.462352
EP_AIAN                  0.326299
EP_UNINSUR               0.239222
EP_CROWD                 0.209673
EP_MINRTY                0.208215
EP_AGE65                 0.000193
EP_TWOMORE              -0.096045
EP_MUNIT                -0.100919
EP_LIMENG               -0.131355
EP_NHPI                 -0.163022
EP_HISP                 -0.227812
EP_OTHERRACE            -0.352418
EP_AGE17                -0.394753
EP_ASIAN                -0.489223
Name: % Fair or Poor Health, dtype: float64
['% Fair or Poor Health', 'EP_POV150', 'EP_NOHSDP', 'EP_UNEMP', 'EP_HBURD', 'EP_NOINT', 'EP_SNGPNT', 'EP_GROUPQ', 'EP_DISABL']
In [ ]:
#correlation heatmap to figure out collinearity
plt.figure(figsize=(8, 6))
sns.heatmap(df_selected.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap of Selected Features")
plt.show()
In [ ]:
#remove the features that have high collinearity and less correlation with the target variable
#threshold is above 0.8
df_selected_final = df_selected.drop(["EP_NOHSDP", "EP_DISABL"], axis=1)
corr_matrix = df_selected_final.corr(numeric_only=True)

plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap of Selected Features")
plt.show()

Collinearity Elimination¶

When examining the correlation matrix, we utilized the 8 most correlated features as the basis of the next step. These were highly correlated with the target variable, with a correlation of 0.6 or above. To ensure that our models are more reliable and avoid redundancy, we then conducted collinearity analysis. From here, we saw that of the top 8 features, there were two pairs that had correlations higher than 0.8 with one another.

The pairs were:

  1. EP_POV150 and EP_NOHSDP
  2. EP_NOINT and EP_DISABL

We decided to keep EP_POV150 and EP_NOINT and remove the other two as they had lower correlation coefficients with the target variable.
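Pairwise correlations catch pairs like these, but collinearity can also involve several features at once. One quick cross-check is the variance inflation factor (VIF); below is a minimal sketch on synthetic data (the column names merely mimic ours and the values are made up, not our county data):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 200
pov = rng.normal(size=n)
nohsdp = 0.9 * pov + rng.normal(scale=0.3, size=n)  # strongly collinear with pov
unemp = rng.normal(size=n)                          # roughly independent
X = pd.DataFrame({"EP_POV150": pov, "EP_NOHSDP": nohsdp, "EP_UNEMP": unemp})

# For each feature, VIF_i = 1 / (1 - R^2_i), where R^2_i comes from regressing
# feature i on the others. Equivalently, it is the i-th diagonal entry of the
# inverse correlation matrix. VIF above ~5-10 is a common multicollinearity flag.
vif = pd.Series(np.diag(np.linalg.inv(X.corr().values)), index=X.columns)
print(vif.round(2))
```

On this synthetic data, EP_POV150 and EP_NOHSDP get VIFs near 10 while EP_UNEMP stays near 1, mirroring the "keep one of each collinear pair" decision above.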

KEY FOR FINAL FEATURES¶

Moving forward, our dataset will only contain the following SVI variables for each county:

  • EP_POV150 (Percentage of persons below 150% poverty estimate)
  • EP_UNEMP (Unemployment Rate estimate)
  • EP_HBURD (Percentage of housing cost-burdened occupied housing units with annual income less than $75,000 (30%+ of income spent on housing costs) estimate, 2018-2022 ACS)
  • EP_NOINT (Adjunct variable - Percentage of households without an internet subscription estimate, 2018-2022 ACS)
  • EP_SNGPNT (Percentage of single-parent households with children under 18 estimate, 2018-2022 ACS)
  • EP_GROUPQ (Percentage of persons in group quarters estimate, 2018-2022 ACS)

Summary Statistics¶

Like before, let's take a look at the descriptive statistics of our dataset to get an idea of what we are working with.

In [ ]:
print(df_selected_final.describe())
       % Fair or Poor Health  EP_POV150   EP_UNEMP   EP_HBURD   EP_NOINT  \
count              24.000000  24.000000  24.000000  24.000000  24.000000   
mean               16.416667  16.695833   4.825000  24.020833  11.608333   
std                 3.269313   6.939144   1.345282   5.484760   4.553443   
min                11.000000   7.800000   3.100000  15.900000   4.100000   
25%                14.000000  10.925000   3.700000  19.525000   8.075000   
50%                16.000000  15.550000   4.700000  23.900000  10.850000   
75%                18.250000  20.825000   5.750000  28.150000  14.100000   
max                25.000000  29.500000   7.000000  36.300000  20.100000   

       EP_SNGPNT  EP_GROUPQ  
count  24.000000  24.000000  
mean    6.483333   3.229167  
std     1.742479   4.283180  
min     3.900000   0.600000  
25%     5.375000   1.125000  
50%     5.850000   1.750000  
75%     7.925000   2.650000  
max     9.800000  19.800000  
In [ ]:
#create a box plot visualizing the data
plt.figure(figsize=(14, 8))

#melt the dataframe to put all columns on the X-axis
cols_to_plot = [col for col in df_selected_final.columns if col != 'County Name']
df_melted = df_selected_final[cols_to_plot].melt(var_name='Variable', value_name='Percentage')

sns.boxplot(
    x='Variable',
    y='Percentage',
    data=df_melted,
    hue='Variable',
    legend=False,
    palette="plasma"
)

plt.title('Distribution of Health & Socioeconomic Factors', fontsize=16)
plt.ylabel('Percentage (%)')
plt.xticks(rotation=45)
plt.grid(True, linestyle='--')

plt.show()

There are noticeable differences between features. Some features have smaller ranges than others, and variability (standard deviation) differs across features as well. From the boxplot above, we can note a couple of outliers for the EP_GROUPQ feature. However, as these outliers come from real Social Vulnerability Index data rather than a recording glitch, it is better to keep them as they are actual values for Maryland counties.
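The outliers flagged in the boxplot follow the standard 1.5×IQR rule; here is a minimal sketch of that rule on made-up values (illustrative only, not the actual EP_GROUPQ column):

```python
import pandas as pd

# Illustrative values only
s = pd.Series([0.6, 0.8, 1.2, 1.7, 1.9, 2.3, 3.7, 5.0, 8.4, 10.6, 19.8])

# A point is an outlier if it falls more than 1.5 * IQR outside the quartiles
q1, q3 = s.quantile([0.25, 0.75])
iqr = q3 - q1
outliers = s[(s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)]
print(outliers.tolist())  # → [19.8]
```

Only the extreme value trips the fence, which is the same behavior seaborn's boxplot uses when it draws individual outlier points.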

Statistical Method #2¶

EP_GROUPQ, the percentage of persons in group quarters, is one of the more visually interesting features in the boxplot: its distribution is the most compressed and it contains the most outliers. Let's see if a Pearson's R test between it and % Fair or Poor Health shows anything interesting.

Is the percentage of persons in group quarters related to the health of that county?

H0 (Null Hypothesis): Counties with a higher percentage of persons in group quarters do not differ in the percentage of adults with good health

HA (Alternative Hypothesis): Counties with a higher percentage of persons in group quarters differ in the percentage of adults with good health

In [ ]:
#create a table with only the relevant data (county name, percentage of persons in group quarters, and percent of poor/fair health)
#include a new column for percentage of good health (100 - percentage of fair/poor health)
df_groupq_v_health = df_new_final[['County Name', '% Fair or Poor Health','EP_GROUPQ']].copy()  #copy to avoid SettingWithCopyWarning
df_groupq_v_health['% Good Health'] = 100 - df_groupq_v_health['% Fair or Poor Health']
df_groupq_v_health
Out[ ]:
County Name % Fair or Poor Health EP_GROUPQ % Good Health
0 Allegany 19 10.6 81
1 Anne Arundel 14 1.8 86
2 Baltimore 16 2.3 84
3 Calvert 14 0.6 86
4 Caroline 20 1.7 80
5 Carroll 13 2.1 87
6 Cecil 16 1.5 84
7 Charles 17 0.8 83
8 Dorchester 19 1.7 81
9 Frederick 14 1.6 86
10 Garrett 18 2.3 82
11 Harford 14 0.6 86
12 Howard 11 0.8 89
13 Kent 16 8.4 84
14 Montgomery 13 0.8 87
15 Prince George's 17 1.9 83
16 Queen Anne's 13 0.9 87
17 St. Mary's 15 2.2 85
18 Somerset 25 19.8 75
19 Talbot 14 1.2 86
20 Washington 18 5.0 82
21 Wicomico 20 3.8 80
22 Worcester 16 1.4 84
23 Baltimore City 22 3.7 78
In [ ]:
#run and print out results of Pearson's R
p = pearsonr(df_groupq_v_health['% Good Health'], df_groupq_v_health['EP_GROUPQ'])
print("correlation coefficient:", p.statistic)
print("p value:", p.pvalue)
correlation coefficient: -0.667892503939244
p value: 0.00036201534815581406
In [ ]:
#create and show a scatterplot
plt.scatter(df_groupq_v_health['EP_GROUPQ'], df_groupq_v_health['% Good Health'])
plt.ylabel('% Good Health')
plt.xlabel('Percentage of Persons Living in Group Quarters')
plt.title('Correlation between Persons in Group Quarters and Good Health')
m, b = np.polyfit(df_groupq_v_health['EP_GROUPQ'], df_groupq_v_health['% Good Health'], 1)
plt.plot(df_groupq_v_health['EP_GROUPQ'], m*df_groupq_v_health['EP_GROUPQ'] + b, color='purple')
plt.show()

The Pearson correlation coefficient between counties' percentage of persons in group quarters and percentage of good health is -0.67, which indicates a moderately strong negative correlation. Furthermore, the p-value of 0.000362 is less than our significance level ($\alpha = 0.05$), so the relationship is statistically significant and we reject the null hypothesis: as the percentage of persons in group quarters increases within a county, the percentage of adults reporting good health tends to decrease.

Modeling¶

Let's try some modeling again. With this dataset, our features are our selected SVI variables which we are using to predict county health, indicated by % Fair or Poor Health.

We split the dataset such that 80% of the data is used for training and 20% is saved for testing.

In [ ]:
# declaring the Target and Feature variables
# We drop '% Fair or Poor Health' from X because it is the target
X = df_selected_final.drop(['% Fair or Poor Health'], axis=1)
y = df_selected_final['% Fair or Poor Health']

# 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training shape: {X_train.shape}")
print(f"Testing shape: {X_test.shape}")
Training shape: (19, 6)
Testing shape: (5, 6)

Linear Regression (The Baseline Model):¶

We started with a Linear Regression model to establish a baseline. This model assumes a straightforward, linear relationship between our independent variables (SVI factors) and the dependent variable (% Fair or Poor Health). We trained the model on 80% of our data and tested it on the remaining 20%, using metrics like R-squared ($R^2$) to measure how well the model explains the variance in health data.

In [ ]:
# declaring the Features and Target variables
features = ['EP_POV150', 'EP_UNEMP', 'EP_HBURD', 'EP_NOINT', 'EP_SNGPNT', 'EP_GROUPQ']
target = '% Fair or Poor Health'

# setting up the smaller subplots
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# creating the Regplot for each feature
for i, feature in enumerate(features):
    sns.regplot(
        x=feature,
        y=target,
        data=df_selected_final,
        ax=axes[i],
        color='purple',
        line_kws={"color": "blue"}
    )
    axes[i].set_title(f'{feature} vs. Health')
    axes[i].set_xlabel(feature)
    axes[i].set_ylabel('% Fair or Poor Health')

plt.tight_layout()
plt.suptitle('Linear Relationships: Individual SVI Factors vs. Poor Health', y=1.02)
plt.show()
In [ ]:
# training model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# predict
lr_preds = lr_model.predict(X_test)

print("Linear Regression Performance: ")
print(f"MAE: {mean_absolute_error(y_test, lr_preds):.2f}")
print(f"MSE: {mean_squared_error(y_test, lr_preds):.2f}")
print(f"R2 Score: {r2_score(y_test, lr_preds):.2f}")
Linear Regression Performance: 
MAE: 1.89
MSE: 7.34
R2 Score: 0.60

R² Score: 0.60 – Our socioeconomic variables explain 60% of the variation in health outcomes across the held-out Maryland counties. This supports the idea that social vulnerability is a substantial determinant of community health, though a fair amount of variance is left unexplained.

MAE (Mean Absolute Error): 1.89 – On average, our model's prediction is off by about 1.89 percentage points. For example, if the actual percentage of people with poor health is 15%, our model might predict roughly 13% or 17%.

The visual evidence is consistent with these metrics. The data points in, for example, the Poverty vs. Health plot cluster fairly tightly around the regression line, which aligns with our moderately strong $R^2$ score of 0.60, and their proximity to the line reflects the low Mean Absolute Error: the model's predictions stay close to reality.

Let's try some other models and see how they compare.

Random Forest Regressor:¶

Socioeconomic factors often interact in complex ways (e.g., high unemployment might only severely impact health when combined with high housing costs). A Random Forest model uses an ensemble of decision trees (100 in our case) to capture these non-linear relationships and interactions. It also provides a robust way to determine Feature Importance, telling us exactly which vulnerability factors are the strongest drivers of poor health.

In [ ]:
# split the data
X_all = df_selected_final.drop(['% Fair or Poor Health'], axis=1)
X_train_all, X_test_all, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=42)

# Training and creating 100 decision trees
rf_model_all_features = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_all_features.fit(X_train_all, y_train)

# Predict
rf_preds_all_features = rf_model_all_features.predict(X_test_all)

print("Random Forest Performance (All Features): ")
print(f"MAE: {mean_absolute_error(y_test, rf_preds_all_features):.2f}")
print(f"MSE: {mean_squared_error(y_test, rf_preds_all_features):.2f}")
print(f"R2 Score: {r2_score(y_test, rf_preds_all_features):.2f}")
Random Forest Performance (All Features): 
MAE: 1.55
MSE: 5.43
R2 Score: 0.70

The Random Forest Regressor model performs better than linear regression. It has both a lower MAE and MSE score as well as a higher R-Squared score. Thus it predicts values close to actual values, is less affected by outliers, and explains more variability of the data than linear regression.

In [ ]:
# dataframe for feature importances from the model trained with all features
feature_importance_all = pd.DataFrame({
    'Feature': X_all.columns,
    'Importance': rf_model_all_features.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', hue='Feature', data=feature_importance_all, palette='plasma', legend=False)
plt.title('Feature Importance: Which SVI Factors Predict Poor Health?')
plt.xlabel('Importance Score')
plt.ylabel('SVI Feature')
plt.show()

print("Top Predictor:", feature_importance_all.iloc[0]['Feature'])
Top Predictor: EP_POV150

Here we see the ranking of feature importance produced by the model. It finds that EP_POV150, the percentage of persons below 150% of the poverty line, is the most informative feature, and interestingly, EP_UNEMP, the unemployment rate, is the next best. This suggests that economic context may be a strong driver when predicting the health of counties.
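Impurity-based importances like these can be biased, so permutation importance on held-out data is a common cross-check: shuffle one feature at a time and measure how much the test score drops. A minimal sketch on synthetic data (the feature layout and coefficients are illustrative, not our SVI columns):

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(42)
X = rng.normal(size=(300, 3))  # column 0: strong signal, 1: weak, 2: pure noise
y = 3 * X[:, 0] + 0.5 * X[:, 1] + rng.normal(scale=0.1, size=300)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42)
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_tr, y_tr)

# Shuffle each column of the test set and record the average drop in R^2
result = permutation_importance(rf, X_te, y_te, n_repeats=10, random_state=42)
print(result.importances_mean)  # largest drop -> most important feature
```

The strong-signal column produces by far the largest score drop, so an agreement between this ranking and the impurity-based one would strengthen the EP_POV150 finding.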

K-Means Clustering:¶

To analyze the data without a specific target, we used K-Means Clustering. This technique groups counties solely based on their socioeconomic similarities. By creating distinct "clusters" of counties (e.g., Low vs. High Vulnerability), we can objectively test if these socioeconomic groupings align with health outcomes, effectively creating a geographic "health tier" system.

In [ ]:
# Features for clustering (excluding the target variable and County Name)
X_cluster = df_selected_final.drop(columns=['% Fair or Poor Health'])

# Standardize the features to ensure all variables contribute equally
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

# Determine the optimal number of clusters using the elbow method or domain knowledge. For this example, we use 3 clusters
k = 3
kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')

# Fit the model and add cluster labels to the DataFrame
df_selected_final['Cluster'] = kmeans.fit_predict(X_scaled)

df_selected_final.head()
Out[ ]:
% Fair or Poor Health EP_POV150 EP_UNEMP EP_HBURD EP_NOINT EP_SNGPNT EP_GROUPQ Cluster
0 19 26.4 6.8 23.3 17.5 5.8 10.6 0
1 14 9.7 4.2 19.3 5.6 5.5 1.8 1
2 16 16.1 5.2 26.6 10.0 7.3 2.3 2
3 14 7.8 3.2 15.9 8.1 4.2 0.6 1
4 20 26.6 4.6 29.1 14.1 9.8 1.7 0
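The choice of k = 3 above can be sanity-checked with the elbow method mentioned in the comment: fit K-Means for a range of k and look for where the drop in inertia levels off. A minimal sketch on synthetic blob data (with our dataset you would pass X_scaled instead):

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Synthetic stand-in for the scaled SVI matrix: three well-separated groups
X, _ = make_blobs(n_samples=120, centers=3, cluster_std=1.0, random_state=42)

# Inertia (within-cluster sum of squares) for k = 1..8; the "elbow" is where
# adding another cluster stops buying a large drop in inertia
inertias = [KMeans(n_clusters=k, random_state=42, n_init=10).fit(X).inertia_
            for k in range(1, 9)]
for k, inert in zip(range(1, 9), inertias):
    print(f"k={k}: inertia={inert:.1f}")
```

Inertia always decreases as k grows, so we look for the bend rather than the minimum; on data with three true groups the curve flattens sharply after k = 3.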
In [ ]:
# Group by Cluster and calculate the average for all columns
cluster_analysis = df_selected_final.groupby('Cluster').mean().sort_values(by='% Fair or Poor Health', ascending=True)

# Show the table
print("Average Health & SVI Metrics by Cluster: ")
print(cluster_analysis)

# bar graph comparison
plt.figure(figsize=(8, 5))
sns.barplot(x=cluster_analysis.index, y=cluster_analysis['% Fair or Poor Health'], hue=cluster_analysis.index, palette='plasma', legend=False)
plt.title('Average Poor Health by Cluster')
plt.xlabel('Cluster ID')
plt.ylabel('Avg % Fair or Poor Health')
plt.show()
Average Health & SVI Metrics by Cluster: 
         % Fair or Poor Health  EP_POV150  EP_UNEMP   EP_HBURD   EP_NOINT  \
Cluster                                                                     
1                    13.818182  10.800000  3.809091  19.700000   8.290909   
2                    16.714286  17.342857  5.171429  25.571429  12.957143   
0                    20.833333  26.750000  6.283333  30.133333  16.116667   

         EP_SNGPNT  EP_GROUPQ  
Cluster                        
1         5.554545   1.218182  
2         6.271429   3.257143  
0         8.433333   6.883333  
In [ ]:
# dictionary of counties for each cluster
df_selected_final_with_county = df_selected_final.copy()
df_selected_final_with_county['County Name'] = df_new_final['County Name']
cluster_groups = df_selected_final_with_county.groupby('Cluster')['County Name'].apply(list)

for cluster_id, counties in cluster_groups.items():
    print(f"\nCluster {cluster_id}:")
    print(", ".join(counties))
Cluster 0:
Allegany, Caroline, Dorchester, Somerset, Wicomico, Baltimore City

Cluster 1:
Anne Arundel, Calvert, Carroll, Charles, Frederick, Harford, Howard, Montgomery, Queen Anne's, St. Mary's, Talbot

Cluster 2:
Baltimore, Cecil, Garrett, Kent, Prince George's, Washington, Worcester

We want to test if there is a statistically significant difference in health outcomes between counties with high poverty versus low poverty.

Null Hypothesis ($H_0$): There is no difference in the mean '% Fair or Poor Health' between high and low poverty counties.

Alternative Hypothesis ($H_1$): There is a significant difference in the mean '% Fair or Poor Health' between high and low poverty counties.

In [ ]:
median_poverty = df_selected_final['EP_POV150'].median()
high_poverty = df_selected_final[df_selected_final['EP_POV150'] > median_poverty]['% Fair or Poor Health']
low_poverty = df_selected_final[df_selected_final['EP_POV150'] <= median_poverty]['% Fair or Poor Health']
t_stat, p_val = stats.ttest_ind(high_poverty, low_poverty)

print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_val:.4f}")
T-statistic: 4.3143
P-value: 0.0003

Since the p-value (0.0003) is well below our significance level ($\alpha = 0.05$), we reject the null hypothesis: counties with above-median poverty have a significantly higher mean '% Fair or Poor Health' than counties with below-median poverty (t = 4.31).

K-Nearest Neighbours¶

The K-Nearest Neighbors (KNN) Regressor model is a non-parametric method used for regression tasks, meaning it does not make any assumptions about the underlying data distribution.

KNN Regressor is a good choice for this analysis for several reasons. Firstly, unlike linear regression, KNN can capture complex, non-linear relationships between the independent variables (SVI factors) and the dependent variable (% Fair or Poor Health). Socioeconomic factors often interact in intricate ways that linear models might miss. KNN makes predictions based on the 'k' most similar data points (neighbors) in the training set. This is beneficial because the impact of social vulnerability factors on health might not be uniform across all counties.

In [ ]:
# Instantiate KNeighborsRegressor model
knn_model = KNeighborsRegressor(n_neighbors=5, weights='distance')

# Train the model
knn_model.fit(X_train, y_train)

# Make predictions on X_test
knn_preds = knn_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, knn_preds)
mse = mean_squared_error(y_test, knn_preds)
r2 = r2_score(y_test, knn_preds)

print("KNeighbors Regressor Performance:")
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"R2 Score: {r2:.2f}")
KNeighbors Regressor Performance:
MAE: 1.82
MSE: 7.40
R2 Score: 0.60
In [ ]:
plt.figure(figsize=(8, 8))
plt.scatter(y_test, knn_preds, alpha=0.6)
plt.title('K-Nearest Neighbors: Actual vs. Predicted % Fair or Poor Health')
plt.xlabel('Actual % Fair or Poor Health (y_test)')
plt.ylabel('Predicted % Fair or Poor Health (knn_preds)')

min_val = min(y_test.min(), knn_preds.min())
max_val = max(y_test.max(), knn_preds.max())
plt.plot([min_val, max_val], [min_val, max_val], 'r--', label='Perfect Prediction')

plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

The results of the K-Nearest Neighbors (KNN) Regressor model are:

  • Mean Absolute Error (MAE): 1.82 - On average, the KNN model's predictions for the percentage of fair or poor health are off by approximately 1.82 percentage points. This is a good level of accuracy as the predictions are relatively close to the actual values.

  • Mean Squared Error (MSE): 7.40 - The MSE measures the average squared difference between predicted and actual values; a lower MSE indicates better performance. It exceeds the MAE because squaring penalizes larger errors more heavily.

  • R-squared (R2) Score: 0.60 - The R2 score represents the proportion of the variance in the dependent variable (% Fair or Poor Health) that is predictable from the independent variables (SVI factors). An R2 of 0.60 means that 60% of the variability in the percentage of fair or poor health can be explained by the social vulnerability factors included in the model. This suggests a moderately strong relationship, indicating that the SVI features are reasonably effective in predicting health outcomes.

Comparison with Linear Regression:

Comparing the KNN Regressor's performance to the previously run Linear Regression model (MAE: 1.89, R2: 0.60):

  • MAE: The KNN model's MAE of 1.82 is slightly better than the Linear Regression's MAE of 1.89, showing marginally more accurate predictions on average.
  • R2 Score: Both models achieved an R2 score of 0.60, suggesting they explain a similar amount of variance in the health outcome.

While KNN provides a slight improvement in absolute error, both models perform similarly in terms of explaining the variance in health outcomes with the selected SVI features. KNN's non-linear approach did not significantly outperform the linear model, suggesting either that the relationships are predominantly linear or that more complex models would need a larger dataset to learn intricate patterns.
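With only 24 counties, scores from a single 80/20 split rest on just 5 test points. Leave-one-out cross-validation reduces that sensitivity by scoring the model on every county once; a minimal sketch on synthetic data shaped like ours (24 rows, 6 features; with our dataset you would pass X and y instead):

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import LeaveOneOut, cross_val_score

rng = np.random.default_rng(0)
X = rng.normal(size=(24, 6))  # 24 synthetic "counties", 6 features
y = X @ np.array([2.0, 1.0, 0.5, 0.0, 0.0, 0.0]) + rng.normal(scale=0.5, size=24)

# Each fold trains on 23 rows and tests on the single held-out row,
# so every row contributes exactly one out-of-sample error
scores = cross_val_score(LinearRegression(), X, y,
                         cv=LeaveOneOut(), scoring='neg_mean_absolute_error')
print(f"LOOCV MAE: {-scores.mean():.2f} (+/- {scores.std():.2f})")
```

Reporting the LOOCV mean and spread gives a more stable model comparison than a single split when the dataset is this small.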

Hierarchical Clustering¶

Hierarchical Clustering is an unsupervised machine learning technique that aims to build a hierarchy of clusters, either by starting with individual data points and merging them into clusters or by starting with one large cluster and dividing it.

Why Hierarchical Clustering for this analysis?¶

Hierarchical Clustering is useful for analyzing this specific dataset and research question for a few main reasons. Unlike K-Means, it reveals natural groupings: we can visualize the clustering process and identify how counties group together based on their social vulnerability profiles without making an initial assumption about the number of clusters. We also don't need to specify 'k' beforehand; the dendrogram, a tree-like diagram, visually represents the nested clusters and helps in deciding the optimal number of clusters post-analysis. The dendrogram also provides insight into the relationships between individual data points (here, counties) and clusters, showing which counties are most similar to each other and at what level of similarity they merge. Lastly, hierarchical clustering can be less sensitive to outliers than K-Means, as it doesn't force every point into a rigid cluster structure defined by a centroid.

We aim to uncover if there are distinct types of counties characterized by specific combinations of social vulnerability factors, and subsequently, how these identified clusters might relate to their health outcomes.

In [ ]:
# Perform hierarchical clustering
linked = linkage(X_scaled, method='ward')

# Plot the dendrogram
plt.figure(figsize=(15, 8))
dendrogram(
    linked,
    orientation='top',
    labels=df_selected_final_with_county['County Name'].tolist(),
    distance_sort='descending',
    show_leaf_counts=True
)
plt.title('Hierarchical Clustering Dendrogram of Maryland Counties (SVI Factors)')
plt.xlabel('County')
plt.ylabel('Euclidean distance (Ward)')
plt.tight_layout()
plt.show()

Results and Interpretation¶

The dendrogram visually represents the hierarchical clustering of Maryland counties based on their social vulnerability factors. This allows us to identify natural groupings and relationships between the counties.

Interpretation:¶

  1. Cluster Formation: The vertical lines represent the clusters, and the height at which two clusters are merged indicates the dissimilarity (Euclidean distance in this case) between them. Shorter lines mean more similar clusters.

  2. Identifying Natural Groups: By drawing a horizontal line across the dendrogram, we can determine the number of clusters. The choice of where to cut the dendrogram often involves looking for the longest vertical lines that do not intersect any other clusters too early. For instance, a cut across the dendrogram at a certain height reveals distinct groups of counties.

Observing the dendrogram, we can see several distinct branches. There appears to be a clear split into approximately three main clusters at a relatively high dissimilarity level. Cluster 1 (Leftmost) seems to contain counties with unique, possibly higher, vulnerability profiles, given its relative isolation from other large groups. For example, Allegany and Somerset counties are often found here, suggesting they share a high degree of social vulnerability across the measured factors. Cluster 2 (Middle) is a larger, more diverse group of counties that are moderately vulnerable. These counties share some similarities but are not as extreme as those in Cluster 1 or as affluent as those in Cluster 3. Finally, Cluster 3 (Rightmost) generally includes counties with lower vulnerability profiles, grouping together counties that are more socioeconomically advantaged.

Counties that are joined together at lower heights are more similar to each other. For example, you might observe that "Calvert" and "Carroll" counties might cluster together early, suggesting similar SVI profiles. Counties like "Baltimore City" and "Somerset" are often grouped with other high-vulnerability counties, reflecting their elevated scores across multiple SVI indicators.

Implications:¶

The identified clusters suggest that a one-size-fits-all approach to health interventions might not be effective. Instead, policies and programs could be tailored to the specific social vulnerability profiles of each cluster. Understanding these natural groupings can aid in more equitable and efficient allocation of resources by allowing policymakers to focus on the unique challenges faced by counties within each cluster. The dendrogram provides a clear visual hypothesis about county groupings, which can be further investigated using other statistical methods or qualitative research to understand the specific drivers behind these clusters.

We analyze the hierarchical clusters by grouping the data by cluster, computing average metrics, visualizing health outcomes, and listing the counties in each cluster.

In [ ]:
# fcluster cuts the linkage tree into flat clusters ('maxclust' caps the count at k)
from scipy.cluster.hierarchy import fcluster

k = 3
df_selected_final['Hierarchical_Cluster'] = fcluster(linked, k, criterion='maxclust')

print("Cluster assignments added to df_selected_final:")
display(df_selected_final.head())
Cluster assignments added to df_selected_final:
% Fair or Poor Health EP_POV150 EP_UNEMP EP_HBURD EP_NOINT EP_SNGPNT EP_GROUPQ Cluster Hierarchical_Cluster
0 19 26.4 6.8 23.3 17.5 5.8 10.6 0 1
1 14 9.7 4.2 19.3 5.6 5.5 1.8 1 2
2 16 16.1 5.2 26.6 10.0 7.3 2.3 2 3
3 14 7.8 3.2 15.9 8.1 4.2 0.6 1 2
4 20 26.6 4.6 29.1 14.1 9.8 1.7 0 1

Grouping the data by these clusters and calculating the mean of all numerical columns, including the SVI factors and '% Fair or Poor Health', helps characterize the average profile of each cluster. The results are sorted by '% Fair or Poor Health' to make the health gradient easy to observe.

In [ ]:
cluster_analysis = df_selected_final.groupby('Hierarchical_Cluster').mean().sort_values(by='% Fair or Poor Health', ascending=True)

print("Average Health & SVI Metrics by Hierarchical Cluster: ")
print(cluster_analysis)
Average Health & SVI Metrics by Hierarchical Cluster: 
                      % Fair or Poor Health  EP_POV150  EP_UNEMP   EP_HBURD  \
Hierarchical_Cluster                                                          
2                                 13.888889  10.133333      3.90  18.877778   
3                                 16.400000  17.200000      4.99  25.470000   
1                                 21.000000  27.500000      6.16  30.380000   

                       EP_NOINT  EP_SNGPNT  EP_GROUPQ  Cluster  
Hierarchical_Cluster                                            
2                      7.333333   5.766667   1.255556      1.0  
3                     12.930000   6.110000   2.870000      1.6  
1                     16.660000   8.520000   7.500000      0.0  

A bar plot is created to visualize the average '% Fair or Poor Health' for each hierarchical cluster.

In [ ]:
plt.figure(figsize=(8, 5))
sns.barplot(x=cluster_analysis.index, y=cluster_analysis['% Fair or Poor Health'], hue=cluster_analysis.index, palette='plasma', legend=False)
plt.title('Average % Fair or Poor Health by Hierarchical Cluster')
plt.xlabel('Hierarchical Cluster ID')
plt.ylabel('Avg % Fair or Poor Health')
plt.show()
[Bar plot: Average % Fair or Poor Health by Hierarchical Cluster]

We then generate a list of counties for each hierarchical cluster to understand the composition of each group.

In [ ]:
df_selected_final_with_county['Hierarchical_Cluster'] = df_selected_final['Hierarchical_Cluster']
cluster_groups_hierarchical = df_selected_final_with_county.groupby('Hierarchical_Cluster')['County Name'].apply(list)

print("Counties in each Hierarchical Cluster:")
for cluster_id, counties in cluster_groups_hierarchical.items():
    print(f"\nCluster {cluster_id}:")
    print(", ".join(counties))
Counties in each Hierarchical Cluster:

Cluster 1:
Allegany, Caroline, Dorchester, Somerset, Baltimore City

Cluster 2:
Anne Arundel, Calvert, Carroll, Charles, Frederick, Harford, Howard, Montgomery, St. Mary's

Cluster 3:
Baltimore, Cecil, Garrett, Kent, Prince George's, Queen Anne's, Talbot, Washington, Wicomico, Worcester

Interpreting Hierarchical Clustering Analysis Results¶

Comparison with K-Means Clustering and Health Outcome Patterns¶

The Hierarchical Clustering analysis, visually represented by the dendrogram, reinforced the presence of natural groupings within Maryland counties based on their social vulnerability factors. After cutting the dendrogram to get three main clusters, we can compare these findings to our earlier K-Means clustering results:

  • Hierarchical Cluster 1 (Highest Vulnerability): This cluster generally groups counties with the highest SVI scores and, consequently, the highest average '% Fair or Poor Health' (approximately 21.0%). This cluster includes counties like Allegany, Caroline, Dorchester, Somerset, and Baltimore City.
  • Hierarchical Cluster 2 (Lowest Vulnerability): This cluster has counties with lower SVI scores and the lowest average '% Fair or Poor Health' (approximately 13.9%). Counties such as Anne Arundel, Calvert, Carroll, Charles, Frederick, Harford, Howard, Montgomery, and St. Mary's fall into this group.
  • Hierarchical Cluster 3 (Moderate Vulnerability): The counties in this cluster exhibit moderate SVI scores and a corresponding moderate average '% Fair or Poor Health' (approximately 16.4%). This includes counties like Baltimore, Cecil, Garrett, Kent, Prince George's, Queen Anne's, Talbot, Washington, Wicomico, and Worcester.

Similarities with K-Means:

Both K-Means and Hierarchical Clustering consistently identified three main vulnerability tiers (low, moderate, high). The assignment of counties to these tiers showed significant overlap:

  • High Vulnerability: Counties like Allegany, Somerset, and Baltimore City were consistently placed in the highest vulnerability cluster by both methods, indicating persistently high social vulnerability profiles.
  • Low Vulnerability: Many of the more affluent, less vulnerable counties (e.g., Howard, Montgomery, Anne Arundel, Calvert, Carroll, Frederick) were grouped together in the lowest vulnerability cluster by both K-Means and Hierarchical Clustering.

Differences:

While the broad patterns were similar, there were some minor reassignments of individual counties between the moderate and high/low groups, reflecting subtle differences in how each algorithm defines cluster boundaries. The dendrogram from hierarchical clustering provided a more granular view of these relationships, showing exactly at what level of dissimilarity specific counties merge or split.
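The degree of overlap between the two partitions can be quantified with a contingency table and the adjusted Rand index; in the notebook this would run on the `Cluster` (K-Means) and `Hierarchical_Cluster` columns of `df_selected_final`. The label vectors below are illustrative stand-ins with one reassigned county:

```python
import pandas as pd
from sklearn.metrics import adjusted_rand_score

# Toy stand-ins for df_selected_final['Cluster'] (K-Means) and
# df_selected_final['Hierarchical_Cluster']; the last point is
# assigned differently by the two methods.
kmeans_labels = [0, 1, 2, 1, 0, 0, 2, 1, 2, 0]
hier_labels = [1, 2, 3, 2, 1, 1, 3, 2, 3, 2]

# Contingency table: rows are K-Means clusters, columns hierarchical ones.
print(pd.crosstab(pd.Series(kmeans_labels, name='KMeans'),
                  pd.Series(hier_labels, name='Hierarchical')))

# Adjusted Rand index: 1.0 means identical partitions up to relabeling.
ari = adjusted_rand_score(kmeans_labels, hier_labels)
print(f"Adjusted Rand index: {ari:.2f}")
```

A high but sub-1.0 index reflects exactly the pattern described above: the two methods agree on the tiers while differing on a few boundary counties.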

Counties in the highest vulnerability clusters (both K-Means and Hierarchical) consistently reported the highest average '% Fair or Poor Health'. Conversely, counties in the lowest vulnerability clusters reported the lowest average '% Fair or Poor Health'.

This consistency across clustering methods further strengthens the conclusion that social vulnerability factors are strong determinants of community health outcomes.


Summary of Insights from All Models¶

Throughout this analysis, we employed several statistical methods and machine learning models to explore the relationship between social vulnerability factors (SVI) and health outcomes in Maryland counties. Here's a summary of the key insights:

1. Initial Statistical Analysis (Pearson's R and ANOVA):¶

  • Pearson's R: Initially, we found a strong, statistically significant negative correlation (Pearson's R = -0.67, p < 0.001) between the percentage of persons in group quarters (EP_GROUPQ) and the percentage of residents reporting good health: as the proportion of individuals living in group quarters increases in a county, reported good health tends to decrease.
  • ANOVA: An ANOVA test for mean mentally unhealthy days between counties with low vs. high proportions of small businesses showed no statistically significant difference (p > 0.05). This led to a pivot in our research focus towards the more comprehensive SVI data.
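Both tests above are one-liners in scipy. The sketch below uses synthetic stand-in data (not the project's county data) with a built-in negative relationship for the correlation and near-identical group means for the ANOVA, mirroring the pattern of results reported:

```python
import numpy as np
from scipy.stats import f_oneway, pearsonr

rng = np.random.default_rng(42)

# Stand-in for EP_GROUPQ and a negatively related "% good health" outcome.
group_quarters = rng.uniform(0, 12, 24)
good_health = 85 - 1.2 * group_quarters + rng.normal(0, 1.5, 24)

r, p = pearsonr(group_quarters, good_health)
print(f"Pearson's r = {r:.2f}, p = {p:.4g}")

# ANOVA: mean mentally unhealthy days in counties with low vs. high
# proportions of small businesses (simulated with similar means).
low_sb = rng.normal(4.0, 0.8, 12)
high_sb = rng.normal(4.1, 0.8, 12)
f_stat, p_anova = f_oneway(low_sb, high_sb)
print(f"ANOVA F = {f_stat:.2f}, p = {p_anova:.3f}")
```

When the ANOVA p-value exceeds 0.05, as it did for the small-business comparison, there is no evidence of a difference in group means, which is what motivated the pivot to SVI data.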

2. Feature Selection and Collinearity:¶

  • Correlation analysis identified several SVI factors highly correlated with '% Fair or Poor Health'.
  • Collinearity analysis was performed, leading to the selection of six key SVI features (EP_POV150, EP_UNEMP, EP_HBURD, EP_NOINT, EP_SNGPNT, EP_GROUPQ) to avoid redundancy and improve model reliability.
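A common way to run the collinearity screen described above is to inspect the upper triangle of the absolute correlation matrix and flag feature pairs above a threshold. The frame below is a toy stand-in (with `EP_NOHSDP`, an SVI education variable, made deliberately collinear with poverty for illustration); the notebook would run this on the real merged SVI columns:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(7)
n = 30

# Toy SVI-style frame; EP_NOHSDP is constructed to be nearly collinear
# with EP_POV150 so the screen has something to flag.
pov = rng.uniform(5, 30, n)
df_svi = pd.DataFrame({
    'EP_POV150': pov,
    'EP_UNEMP': 0.2 * pov + rng.normal(0, 1.5, n),
    'EP_NOHSDP': 0.9 * pov + rng.normal(0, 0.5, n),
    'EP_NOINT': rng.uniform(3, 18, n),
})

# Keep only the upper triangle so each pair appears once, then flag
# pairs whose absolute correlation exceeds a chosen threshold (0.85).
corr = df_svi.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
flagged = upper.stack()
flagged = flagged[flagged > 0.85]
print(flagged)

# One member of each flagged pair is then dropped before modeling.
df_reduced = df_svi.drop(columns=['EP_NOHSDP'])
```

Dropping one member of each highly correlated pair is what reduces redundancy and keeps the downstream regression coefficients interpretable.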

3. Predictive Modeling:¶

  • Linear Regression (Baseline):

    • MAE: 1.89
    • R2 Score: 0.60
    • This model established a strong baseline, indicating that the selected SVI factors explain 60% of the variance in '% Fair or Poor Health', with predictions, on average, being off by less than 2 percentage points. This confirms a substantial linear relationship between social vulnerability and health outcomes.
  • Random Forest Regressor:

    • MAE: 1.55
    • R2 Score: 0.70
    • The Random Forest model slightly improved upon the linear model, explaining 70% of the variance and reducing the MAE. This suggests that while linear relationships are dominant, some non-linear patterns or interactions are captured, leading to better predictive performance.
    • Feature Importance: EP_POV150 (persons below 150% poverty) was identified as the top predictor, underscoring the critical role of economic vulnerability in health disparities.
  • K-Nearest Neighbors (KNN) Regressor:

    • MAE: 1.82
    • R2 Score: 0.60
    • KNN performed similarly to Linear Regression in terms of R2 score but showed a slight improvement in MAE. This indicates that local patterns among counties with similar SVI factors are somewhat predictive, though they do not significantly outperform the global linear relationships captured by the baseline model.
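The three-model comparison above follows a standard sklearn pattern: split once, then fit and score each model on the same held-out set. This sketch uses a synthetic stand-in for the six-feature SVI matrix and the '% Fair or Poor Health' target, so the printed metrics will differ from the project's:

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.default_rng(0)

# Synthetic stand-in: six SVI-like features, outcome driven mostly by
# the first (poverty-like) feature plus noise.
n = 120
X = rng.uniform(0, 30, (n, 6))
y = 10 + 0.3 * X[:, 0] + 0.1 * X[:, 3] + rng.normal(0, 1.5, n)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=0),
    'KNN (k=5)': KNeighborsRegressor(n_neighbors=5),
}
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    results[name] = (mean_absolute_error(y_test, pred),
                     r2_score(y_test, pred))
    print(f"{name}: MAE={results[name][0]:.2f}, R2={results[name][1]:.2f}")
```

Scoring every model on the same split is what makes the MAE and R2 comparisons in the summary above apples-to-apples.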

4. Clustering Analysis:¶

  • K-Means Clustering: Identified three distinct clusters of counties based on SVI factors, which also exhibited varying average '% Fair or Poor Health'. This objective grouping confirmed that counties naturally separate into different vulnerability tiers with corresponding health outcomes.
    • Cluster 1: Highest vulnerability, highest average '% Fair or Poor Health'.
    • Cluster 2: Moderate vulnerability and health outcomes.
    • Cluster 3: Lowest vulnerability, lowest average '% Fair or Poor Health'.
  • Hierarchical Clustering: The dendrogram visually reinforced the presence of natural groupings, particularly highlighting three main clusters of counties with varying SVI profiles and suggesting which counties are most similar in their social vulnerability characteristics.

Overall Conclusion:¶

The analysis consistently demonstrates a robust and significant relationship between social vulnerability factors and health outcomes across Maryland counties. Economic indicators, particularly poverty (EP_POV150), emerge as primary drivers of health disparities. Both supervised (regression) and unsupervised (clustering) methods effectively identified and quantified these relationships, providing valuable insights for targeted public health interventions and resource allocation strategies based on the unique social vulnerability profiles of different county groups.

Summary:¶


Data Analysis Key Findings¶

  • KNN Regressor Performance: The K-Nearest Neighbors (KNN) Regressor model achieved a Mean Absolute Error (MAE) of 1.82, a Mean Squared Error (MSE) of 7.40, and an R-squared (R2) score of 0.60. While slightly improving the MAE over Linear Regression (1.89), it explained the same proportion of variance (R2 of 0.60 for both models).
  • Hierarchical Clustering Identified Three Distinct Clusters:
    • Cluster 1 (Highest Vulnerability): Counties like Allegany, Caroline, Dorchester, Somerset, and Baltimore City exhibited the highest average social vulnerability and the highest average percentage of fair or poor health (approximately 21.0%).
    • Cluster 2 (Lowest Vulnerability): Counties such as Anne Arundel, Calvert, Carroll, Charles, Frederick, Harford, Howard, Montgomery, and St. Mary's showed the lowest average social vulnerability and the lowest average percentage of fair or poor health (approximately 13.9%).
    • Cluster 3 (Moderate Vulnerability): This cluster included counties like Baltimore, Cecil, Garrett, Kent, Prince George's, Queen Anne's, Talbot, Washington, Wicomico, and Worcester, exhibiting moderate social vulnerability and an average percentage of fair or poor health of approximately 16.4%.
  • Consistency Across Clustering Methods: Both K-Means and Hierarchical Clustering consistently identified three main vulnerability tiers with a clear health gradient, where higher vulnerability clusters corresponded to poorer health outcomes.
  • Role of Economic Vulnerability: Throughout the analysis, particularly with the Random Forest Regressor, EP_POV150 (persons below 150% poverty) was identified as the top predictor, underscoring the critical role of economic vulnerability in health disparities.
  • Impact of Group Quarters: Initial analysis showed a strong negative correlation (Pearson's R = -0.67, p < 0.001) between the percentage of persons in group quarters (EP_GROUPQ) and the percentage of good health, indicating that higher proportions of individuals in group quarters are associated with lower reported good health.

Insights or Next Steps¶

Our findings point to several insights about the relationship between county health and social vulnerability.

  • Targeted Interventions: The identified clusters provide a basis for tailoring public health interventions and resource allocation strategies to address the unique social vulnerability profiles and health disparities of different county groups. There is likely no one-size-fits-all solution to increasing health that can be applied to all counties.
  • Focus on Economic Factors: Given the consistent emergence of poverty as a key predictor, further research and policy efforts should prioritize addressing economic vulnerability to improve health outcomes across Maryland counties.

Further steps for this investigation could include comparing these results against additional features, such as average income, degree of urbanization, or prominent local industries, to see how SVI relates to other county characteristics.