An analysis of Heart Disease and Stroke Mortality in Maryland¶
Gayatri Chougule, Hana Fatima Shaikh, Samar Khan, Siddharthsinh Parmar
Contributions:
- Gayatri: 1st EDA test, data cleaning, insights and conclusions
- Hana: 2nd EDA test, exploratory data analysis, insights and conclusions
- Samar: choosing ML model, individual model evaluation
- Siddharthsinh: 3rd EDA test, choosing ML models and EDA
Introduction¶
In recent years, we have seen a spike in the number of deaths due to heart disease. We would like to investigate which variables most strongly affect heart disease mortality in Marylanders, and whether there is a correlation between the amount of physical activity and the likelihood of heart disease. Identifying such factors can help individuals by educating them about the effect of certain lifestyle choices on their health, potentially putting them on a better track to live a longer life.
Data Curation¶
The data used in this analysis are sourced from the federal and Maryland state governments, and both sources have been made publicly available. Together, the datasets categorize records by age, gender, race, and county, giving us the opportunity to explore trends within the demographics of Maryland.
For this analysis, the following datasets were used:
- Rates and Trends in Heart Disease and Stroke Mortality: This dataset presents county-level estimates of heart disease and stroke mortality rates (per 100,000 people) in America from 2000-2019 by age group, race/ethnicity, and sex. Since our analysis is focused on Maryland, the dataset has been pre-filtered to Maryland counties only.
- SHIP Increase Physical Activity 2011-2019: This dataset presents the number of individuals who report at least 150 minutes of moderate or 75 minutes of vigorous physical activity per week in Maryland from 2011-2019. Data is categorized by race and county.
- Maryland Total Personal Income 2010-2022: This dataset presents the total personal income (in thousands of USD) of Maryland counties from 2010-2022. Data is categorized by county.
For Rates and Trends in Heart Disease and Stroke Mortality, the data is filtered down to Maryland only, due to the size of the full dataset and the focus of this analysis.
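The pre-filtering step itself is not shown in this notebook; below is a minimal sketch of how it could be reproduced from the full national file (the input file name here is hypothetical; LocationAbbr holds the state abbreviation).
import pandas as pd
# Hypothetical pre-filtering step: keep only the Maryland rows of the national file
full_df = pd.read_csv("national_heart_disease_stroke_mortality.csv")  # hypothetical file name
md_df = full_df[full_df["LocationAbbr"] == "MD"]
md_df.to_csv("filtered_data.csv", index=False)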
# required imports
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
Data Cleaning¶
Dataset 1: Rates and Trends in Heart Disease¶
df = pd.read_csv("/content/filtered_data.csv")
df.head()
| Year | LocationAbbr | LocationDesc | GeographicLevel | DataSource | Class | Topic | Data_Value | Data_Value_Unit | Data_Value_Type | ... | Data_Value_Footnote | Confidence_limit_Low | Confidence_limit_High | StratificationCategory1 | Stratification1 | StratificationCategory2 | Stratification2 | StratificationCategory3 | Stratification3 | LocationID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2010 | MD | Allegany | County | NVSS | Cardiovascular Diseases | All stroke | 18.2 | per 100,000 | Age-Standardized, Spatiotemporally Smoothed Rate | ... | NaN | 15.1 | 22.9 | Age group | Ages 35-64 years | Race | Overall | Sex | Men | 24001.0 |
| 1 | 2013 | MD | Allegany | County | NVSS | Cardiovascular Diseases | All stroke | 17.9 | per 100,000 | Age-Standardized, Spatiotemporally Smoothed Rate | ... | NaN | 14.4 | 21.7 | Age group | Ages 35-64 years | Race | Overall | Sex | Men | 24001.0 |
| 2 | 1999 | MD | Allegany | County | NVSS | Cardiovascular Diseases | All heart disease | NaN | per 100,000 | Age-Standardized, Spatiotemporally Smoothed Rate | ... | Value suppressed | NaN | NaN | Age group | Ages 35-64 years | Race | American Indian/Alaska Native | Sex | Overall | 24001.0 |
| 3 | 2013 | MD | Allegany | County | NVSS | Cardiovascular Diseases | All heart disease | NaN | per 100,000 | Age-Standardized, Spatiotemporally Smoothed Rate | ... | Value suppressed | NaN | NaN | Age group | Ages 35-64 years | Race | American Indian/Alaska Native | Sex | Overall | 24001.0 |
| 4 | 2014 | MD | Allegany | County | NVSS | Cardiovascular Diseases | All heart disease | NaN | per 100,000 | Age-Standardized, Spatiotemporally Smoothed Rate | ... | Value suppressed | NaN | NaN | Age group | Ages 35-64 years | Race | American Indian/Alaska Native | Sex | Overall | 24001.0 |
5 rows × 21 columns
We have identified some columns that are not essential for our analysis; these columns are dropped before beginning the analysis.
#renamed stratification columns and dropped the redundant columns
df.rename(columns={'Stratification1': 'age_group','Stratification2': 'Race','Stratification3': 'Sex' }, inplace=True)
df.drop(columns=['StratificationCategory1','StratificationCategory2','StratificationCategory3'], inplace=True)
df.head()
| Year | LocationAbbr | LocationDesc | GeographicLevel | DataSource | Class | Topic | Data_Value | Data_Value_Unit | Data_Value_Type | Data_Value_Footnote_Symbol | Data_Value_Footnote | Confidence_limit_Low | Confidence_limit_High | age_group | Race | Sex | LocationID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2010 | MD | Allegany | County | NVSS | Cardiovascular Diseases | All stroke | 18.2 | per 100,000 | Age-Standardized, Spatiotemporally Smoothed Rate | NaN | NaN | 15.1 | 22.9 | Ages 35-64 years | Overall | Men | 24001.0 |
| 1 | 2013 | MD | Allegany | County | NVSS | Cardiovascular Diseases | All stroke | 17.9 | per 100,000 | Age-Standardized, Spatiotemporally Smoothed Rate | NaN | NaN | 14.4 | 21.7 | Ages 35-64 years | Overall | Men | 24001.0 |
| 2 | 1999 | MD | Allegany | County | NVSS | Cardiovascular Diseases | All heart disease | NaN | per 100,000 | Age-Standardized, Spatiotemporally Smoothed Rate | ~ | Value suppressed | NaN | NaN | Ages 35-64 years | American Indian/Alaska Native | Overall | 24001.0 |
| 3 | 2013 | MD | Allegany | County | NVSS | Cardiovascular Diseases | All heart disease | NaN | per 100,000 | Age-Standardized, Spatiotemporally Smoothed Rate | ~ | Value suppressed | NaN | NaN | Ages 35-64 years | American Indian/Alaska Native | Overall | 24001.0 |
| 4 | 2014 | MD | Allegany | County | NVSS | Cardiovascular Diseases | All heart disease | NaN | per 100,000 | Age-Standardized, Spatiotemporally Smoothed Rate | ~ | Value suppressed | NaN | NaN | Ages 35-64 years | American Indian/Alaska Native | Overall | 24001.0 |
print(df.nunique())
Year 21 LocationAbbr 1 LocationDesc 23 GeographicLevel 1 DataSource 1 Class 1 Topic 5 Data_Value 11234 Data_Value_Unit 1 Data_Value_Type 1 Data_Value_Footnote_Symbol 1 Data_Value_Footnote 1 Confidence_limit_Low 10629 Confidence_limit_High 11982 age_group 2 Race 6 Sex 3 LocationID 24 dtype: int64
Certain columns hold only one or two unique values; such columns are unlikely to help our analysis, so they are dropped as well.
#Dropping columns which only have one value
#LocationAbbr is md
#GeographicLevel is county since we're at the md level
#DataSource is NVSS
#Class is cardiovascular disease
#Data_Value_Unit is per 100,000
#Data_Value_Type is Age-Standardized, Spatiotemporally Smoothed Rate
df = df.drop(["LocationAbbr","GeographicLevel", "DataSource", "Class", "Data_Value_Unit", "Data_Value_Type"], axis=1)
#Data_Value_Footnote_Symbol is either ~ or NaN
#Data_Value_Footnote is NaN for symbol == NaN and "Value suppressed" for ~
df = df.drop(["Data_Value_Footnote_Symbol","Data_Value_Footnote"], axis=1)
#df after cleaning
df.head()
| Year | LocationDesc | Topic | Data_Value | Confidence_limit_Low | Confidence_limit_High | age_group | Race | Sex | LocationID | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2010 | Allegany | All stroke | 18.2 | 15.1 | 22.9 | Ages 35-64 years | Overall | Men | 24001.0 |
| 1 | 2013 | Allegany | All stroke | 17.9 | 14.4 | 21.7 | Ages 35-64 years | Overall | Men | 24001.0 |
| 2 | 1999 | Allegany | All heart disease | NaN | NaN | NaN | Ages 35-64 years | American Indian/Alaska Native | Overall | 24001.0 |
| 3 | 2013 | Allegany | All heart disease | NaN | NaN | NaN | Ages 35-64 years | American Indian/Alaska Native | Overall | 24001.0 |
| 4 | 2014 | Allegany | All heart disease | NaN | NaN | NaN | Ages 35-64 years | American Indian/Alaska Native | Overall | 24001.0 |
Dataset 2: SHIP Increase in Physical Activity¶
We begin by dropping unnecessary columns from this dataset, as we did for Dataset 1. There also appear to be rows that aggregate data at the state level for each year; these rows are excluded.
physical_df = pd.read_csv("/content/SHIP_Increase_Physical_Activity_2011-2019.csv")
physical_df = physical_df.drop("Measure", axis=1)
physical_df = physical_df[physical_df['Jurisdiction'] != 'State']
print(physical_df.nunique(), '\n')
races = physical_df['Race/ ethnicity'].unique()
print('Races in dataset:', physical_df['Race/ ethnicity'].unique(), '\n')
for race in races:
print('Missing values for', race, physical_df[physical_df['Race/ ethnicity'] == race]['Value'].isna().sum())
Jurisdiction 24 Value 182 Race/ ethnicity 10 Year 6 dtype: int64 Races in dataset: ['All races/ ethnicities (aggregated)' 'American Indian/Alaska Native Non-Hispanic/Latino' 'Asian/Pacific Islander Non-Hispanic/Latino' 'Black or African American Non-Hispanic/Latino' 'Hispanic' 'Native Hawaiian/Other Pacific Islander Hispanic/Latino' 'White Non-Hispanic/Latino' 'Black Non-Hispanic' 'White Non-Hispanic' 'Asian Non-Hispanic'] Missing values for All races/ ethnicities (aggregated) 0 Missing values for American Indian/Alaska Native Non-Hispanic/Latino 24 Missing values for Asian/Pacific Islander Non-Hispanic/Latino 23 Missing values for Black or African American Non-Hispanic/Latino 13 Missing values for Hispanic 134 Missing values for Native Hawaiian/Other Pacific Islander Hispanic/Latino 24 Missing values for White Non-Hispanic/Latino 0 Missing values for Black Non-Hispanic 78 Missing values for White Non-Hispanic 0 Missing values for Asian Non-Hispanic 115
As we can see, there are many NaN values for several race categories. As such, we will use the aggregated data.
physical_df_filtered = physical_df[
(physical_df['Year'].isin([2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])) &
(physical_df['Race/ ethnicity'] == 'All races/ ethnicities (aggregated)')
]
physical_df_filtered.reset_index().drop("index", axis = 1)
| Jurisdiction | Value | Race/ ethnicity | Year | |
|---|---|---|---|---|
| 0 | Allegany | 45.0 | All races/ ethnicities (aggregated) | 2019 |
| 1 | Anne Arundel | 54.0 | All races/ ethnicities (aggregated) | 2019 |
| 2 | Baltimore County | 50.5 | All races/ ethnicities (aggregated) | 2019 |
| 3 | Calvert | 52.0 | All races/ ethnicities (aggregated) | 2019 |
| 4 | Caroline | 45.3 | All races/ ethnicities (aggregated) | 2019 |
| ... | ... | ... | ... | ... |
| 139 | Somerset | 33.1 | All races/ ethnicities (aggregated) | 2011 |
| 140 | Talbot | 57.6 | All races/ ethnicities (aggregated) | 2011 |
| 141 | Washington | 41.0 | All races/ ethnicities (aggregated) | 2011 |
| 142 | Wicomico | 50.4 | All races/ ethnicities (aggregated) | 2011 |
| 143 | Worcester | 52.1 | All races/ ethnicities (aggregated) | 2011 |
144 rows × 4 columns
Dataset 3: Maryland Total Personal Income in Thousands of Dollars¶
financial_df = pd.read_csv("/content/Maryland_Total_Personal_Income__Thousands_of_Current_Dollars___2010-2022.csv")
financial_df.head(10)
| Date created | Year | MARYLAND | Allegany County | Anne Arundel County | Baltimore City | Baltimore County | Calvert County | Caroline County | Carroll County | ... | Kent County | Montgomery County | Prince George's County | Queen Anne's County | Somerset County | St. Mary's County | Talbot County | Washington County | Wicomico County | Worcester County | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022-10-04T00:00:00.000 | 2010 | 49691 | 31232 | 53040 | 36857 | 48811 | 50095 | 34176 | 48075 | ... | 45132 | 70723 | 40714 | 49171 | 25345 | 46901 | 56452 | 36571 | 34247 | 43403 |
| 1 | 2022-10-04T00:00:00.000 | 2011 | 51970 | 32213 | 55801 | 38775 | 49862 | 52275 | 36309 | 50198 | ... | 46743 | 74818 | 42127 | 51387 | 26148 | 48958 | 58827 | 38048 | 35085 | 45459 |
| 2 | 2022-10-04T00:00:00.000 | 2012 | 53016 | 32945 | 57046 | 39187 | 51260 | 53243 | 38110 | 51653 | ... | 47254 | 76602 | 42389 | 52815 | 26193 | 49315 | 59401 | 39095 | 34800 | 47380 |
| 3 | 2022-10-04T00:00:00.000 | 2013 | 52576 | 33017 | 57308 | 40123 | 51192 | 52850 | 39188 | 51938 | ... | 48059 | 73353 | 42331 | 52969 | 27403 | 49234 | 60606 | 39236 | 35453 | 47851 |
| 4 | 2022-10-04T00:00:00.000 | 2014 | 54100 | 34505 | 59237 | 42332 | 52940 | 54324 | 40232 | 53976 | ... | 50644 | 74559 | 43125 | 54744 | 28941 | 50512 | 63007 | 40761 | 37423 | 49525 |
| 5 | 2022-10-04T00:00:00.000 | 2015 | 56392 | 35510 | 61255 | 44260 | 54698 | 56738 | 41068 | 56343 | ... | 52781 | 79358 | 44249 | 56475 | 29768 | 52333 | 65700 | 42128 | 38871 | 54118 |
| 6 | 2022-10-04T00:00:00.000 | 2016 | 58329 | 37320 | 62801 | 45567 | 56180 | 58114 | 41955 | 57958 | ... | 54349 | 83540 | 45407 | 59158 | 29143 | 53465 | 67419 | 43296 | 39030 | 54048 |
| 7 | 2022-10-04T00:00:00.000 | 2017 | 60014 | 38180 | 64584 | 47076 | 57861 | 59692 | 44051 | 60081 | ... | 57270 | 85351 | 46745 | 61742 | 30107 | 54383 | 68604 | 44187 | 40172 | 55768 |
| 8 | 2022-10-04T00:00:00.000 | 2018 | 61600 | 39391 | 66510 | 49219 | 59695 | 61389 | 44700 | 61328 | ... | 58935 | 86316 | 48421 | 63043 | 29978 | 56372 | 70343 | 45700 | 40321 | 56079 |
| 9 | 2022-10-04T00:00:00.000 | 2019 | 62989 | 40687 | 68314 | 49964 | 62543 | 63247 | 45657 | 63409 | ... | 60407 | 86221 | 49947 | 64996 | 30623 | 57891 | 72729 | 47071 | 41134 | 56536 |
10 rows × 27 columns
From a first glance, we can see that the data contains two unnecessary columns: the 'MARYLAND' column, which aggregates all county data, and the 'Date created' column. We will drop both. We are also only concerned with data from 2010 to 2019, so data beyond this range is removed.
financial_df = financial_df.drop(columns=['MARYLAND', 'Date created'])
financial_df = financial_df.iloc[:10]
financial_df
| Year | Allegany County | Anne Arundel County | Baltimore City | Baltimore County | Calvert County | Caroline County | Carroll County | Cecil County | Charles County | ... | Kent County | Montgomery County | Prince George's County | Queen Anne's County | Somerset County | St. Mary's County | Talbot County | Washington County | Wicomico County | Worcester County | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2010 | 31232 | 53040 | 36857 | 48811 | 50095 | 34176 | 48075 | 37962 | 47949 | ... | 45132 | 70723 | 40714 | 49171 | 25345 | 46901 | 56452 | 36571 | 34247 | 43403 |
| 1 | 2011 | 32213 | 55801 | 38775 | 49862 | 52275 | 36309 | 50198 | 39307 | 49812 | ... | 46743 | 74818 | 42127 | 51387 | 26148 | 48958 | 58827 | 38048 | 35085 | 45459 |
| 2 | 2012 | 32945 | 57046 | 39187 | 51260 | 53243 | 38110 | 51653 | 39861 | 50056 | ... | 47254 | 76602 | 42389 | 52815 | 26193 | 49315 | 59401 | 39095 | 34800 | 47380 |
| 3 | 2013 | 33017 | 57308 | 40123 | 51192 | 52850 | 39188 | 51938 | 40578 | 49832 | ... | 48059 | 73353 | 42331 | 52969 | 27403 | 49234 | 60606 | 39236 | 35453 | 47851 |
| 4 | 2014 | 34505 | 59237 | 42332 | 52940 | 54324 | 40232 | 53976 | 41551 | 50729 | ... | 50644 | 74559 | 43125 | 54744 | 28941 | 50512 | 63007 | 40761 | 37423 | 49525 |
| 5 | 2015 | 35510 | 61255 | 44260 | 54698 | 56738 | 41068 | 56343 | 42988 | 52425 | ... | 52781 | 79358 | 44249 | 56475 | 29768 | 52333 | 65700 | 42128 | 38871 | 54118 |
| 6 | 2016 | 37320 | 62801 | 45567 | 56180 | 58114 | 41955 | 57958 | 44076 | 53721 | ... | 54349 | 83540 | 45407 | 59158 | 29143 | 53465 | 67419 | 43296 | 39030 | 54048 |
| 7 | 2017 | 38180 | 64584 | 47076 | 57861 | 59692 | 44051 | 60081 | 45459 | 55003 | ... | 57270 | 85351 | 46745 | 61742 | 30107 | 54383 | 68604 | 44187 | 40172 | 55768 |
| 8 | 2018 | 39391 | 66510 | 49219 | 59695 | 61389 | 44700 | 61328 | 47516 | 55839 | ... | 58935 | 86316 | 48421 | 63043 | 29978 | 56372 | 70343 | 45700 | 40321 | 56079 |
| 9 | 2019 | 40687 | 68314 | 49964 | 62543 | 63247 | 45657 | 63409 | 48834 | 57196 | ... | 60407 | 86221 | 49947 | 64996 | 30623 | 57891 | 72729 | 47071 | 41134 | 56536 |
10 rows × 25 columns
Now we have the right data, but since each county should be treated as a variable, we reshape (melt) the dataset so that we have only the following columns: ['Year', 'County', 'Income'].
financial_df = pd.melt(financial_df, id_vars=["Year"],
var_name="County",
value_name="Income")
financial_df['County'] = financial_df['County'].str.replace(' County', '', regex=False)
financial_df
| Year | County | Income | |
|---|---|---|---|
| 0 | 2010 | Allegany | 31232 |
| 1 | 2011 | Allegany | 32213 |
| 2 | 2012 | Allegany | 32945 |
| 3 | 2013 | Allegany | 33017 |
| 4 | 2014 | Allegany | 34505 |
| ... | ... | ... | ... |
| 235 | 2015 | Worcester | 54118 |
| 236 | 2016 | Worcester | 54048 |
| 237 | 2017 | Worcester | 55768 |
| 238 | 2018 | Worcester | 56079 |
| 239 | 2019 | Worcester | 56536 |
240 rows × 3 columns
print(financial_df.nunique(), '\n')
counties = financial_df['County'].unique()
print('Counties in dataset:', counties, '\n')
for county in counties:
print('Missing values for', county, financial_df[financial_df['County'] == county]['Income'].isna().sum())
Year 10 County 24 Income 239 dtype: int64 Counties in dataset: ['Allegany' 'Anne Arundel' 'Baltimore City' 'Baltimore' 'Calvert' 'Caroline' 'Carroll' 'Cecil' 'Charles' 'Dorchester' 'Frederick' 'Garrett' 'Harford' 'Howard' 'Kent' 'Montgomery' "Prince George's" "Queen Anne's" 'Somerset' "St. Mary's" 'Talbot' 'Washington' 'Wicomico' 'Worcester'] Missing values for Allegany 0 Missing values for Anne Arundel 0 Missing values for Baltimore City 0 Missing values for Baltimore 0 Missing values for Calvert 0 Missing values for Caroline 0 Missing values for Carroll 0 Missing values for Cecil 0 Missing values for Charles 0 Missing values for Dorchester 0 Missing values for Frederick 0 Missing values for Garrett 0 Missing values for Harford 0 Missing values for Howard 0 Missing values for Kent 0 Missing values for Montgomery 0 Missing values for Prince George's 0 Missing values for Queen Anne's 0 Missing values for Somerset 0 Missing values for St. Mary's 0 Missing values for Talbot 0 Missing values for Washington 0 Missing values for Wicomico 0 Missing values for Worcester 0
This dataset has no missing values, which is good news!
Exploratory Data Analysis¶
print(df.shape, physical_df_filtered.shape, financial_df.shape)
(40320, 10) (144, 4) (240, 3)
print(df.columns, '\n', physical_df_filtered.columns, '\n', financial_df.columns)
Index(['Year', 'LocationDesc', 'Topic', 'Data_Value', 'Confidence_limit_Low',
'Confidence_limit_High', 'age_group', 'Race', 'Sex', 'LocationID'],
dtype='object')
Index(['Jurisdiction', 'Value', 'Race/ ethnicity', 'Year'], dtype='object')
Index(['Year', 'County', 'Income'], dtype='object')
print(df.count(), '\n')
Year 40320 LocationDesc 40320 Topic 40320 Data_Value 31080 Confidence_limit_Low 31080 Confidence_limit_High 31080 age_group 40320 Race 40320 Sex 40320 LocationID 40320 dtype: int64
print(df.dtypes, '\n')
print(physical_df_filtered.dtypes, '\n')
print(financial_df.dtypes, '\n')
Year int64 LocationDesc object Topic object Data_Value float64 Confidence_limit_Low float64 Confidence_limit_High float64 age_group object Race object Sex object LocationID float64 dtype: object Jurisdiction object Value float64 Race/ ethnicity object Year int64 dtype: object Year int64 County object Income int64 dtype: object
Exploring the race column in df:
df[df['Data_Value'].isna()]['Race'].unique()
array(['American Indian/Alaska Native', 'Asian/Pacific Islander',
'Hispanic', 'Black (Non-Hispanic)'], dtype=object)
race_counts = df[df['Race'] != 'Overall']['Race'].value_counts()
race_counts
| count | |
|---|---|
| Race | |
| American Indian/Alaska Native | 5040 |
| Asian/Pacific Islander | 5040 |
| Black (Non-Hispanic) | 5040 |
| Hispanic | 5040 |
| White | 5040 |
plt.pie(race_counts, labels=race_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Race')
plt.axis('equal')
plt.show()
We can see that all the races are equally represented in df. This is to be expected, since the dataset has been created using estimation.
Exploring the County column
county_counts = df['LocationDesc'].value_counts()
plt.pie(county_counts, labels=county_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('County')
plt.axis('equal')
plt.show()
df.nunique()
| 0 | |
|---|---|
| Year | 21 |
| LocationDesc | 23 |
| Topic | 5 |
| Data_Value | 11234 |
| Confidence_limit_Low | 10629 |
| Confidence_limit_High | 11982 |
| age_group | 2 |
| Race | 6 |
| Sex | 3 |
| LocationID | 24 |
#there are 23 counties in md and 23 locationDesc but there are 24 LocationID unique values
#printing what each LocationID maps to
for val in df["LocationID"].unique():
filtered_rows = df[df['LocationID'] == val]
if filtered_rows['LocationDesc'].nunique() == 1:
unique_value = filtered_rows['LocationDesc'].iloc[0]
print(val,"->", unique_value)
24001.0 -> Allegany 24009.0 -> Calvert 24003.0 -> Anne Arundel 24005.0 -> Baltimore 24011.0 -> Caroline 24013.0 -> Carroll 24015.0 -> Cecil 24017.0 -> Charles 24023.0 -> Garrett 24019.0 -> Dorchester 24021.0 -> Frederick 24025.0 -> Harford 24027.0 -> Howard 24029.0 -> Kent 24039.0 -> Somerset 24031.0 -> Montgomery 24033.0 -> Prince George's 24041.0 -> Talbot 24035.0 -> Queen Anne's 24037.0 -> St. Mary's 24043.0 -> Washington 24045.0 -> Wicomico 24047.0 -> Worcester 24510.0 -> Baltimore
We can observe that both 24005.0 and 24510.0 map to 'Baltimore'. These are the FIPS codes for Baltimore County (24005) and Baltimore City (24510), so the label 'Baltimore' actually covers two distinct jurisdictions. This explains why Baltimore appears to have twice as much data as the other locations.
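As a quick check, we can count how many rows each of the two IDs contributes to the 'Baltimore' label:
# Row counts for the two LocationIDs that share the 'Baltimore' label
print(df[df['LocationDesc'] == 'Baltimore']['LocationID'].value_counts())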
Separating the Dataset¶
How the Data is to be Filtered¶
The dataset is structured so that users can filter on the Race and Sex columns to find the slice they are looking for. The data also includes aggregated rows, marked "Overall" in the Race and Sex columns. As a result, to obtain the data that makes distinctions based on sex, we filter to rows with "Overall" in the Race column, and vice versa for race. To obtain the data that is not broken down by either race or sex, we filter to rows where both Race and Sex are "Overall".
#the data is sectioned by year, county, and race
df_race = df[df['Sex'] == 'Overall'].reset_index()
df_race = df_race.drop(columns=['Sex'])
df_race.isna().sum()
df_race
| index | Year | LocationDesc | Topic | Data_Value | Confidence_limit_Low | Confidence_limit_High | age_group | Race | LocationID | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 1999 | Allegany | All heart disease | NaN | NaN | NaN | Ages 35-64 years | American Indian/Alaska Native | 24001.0 |
| 1 | 3 | 2013 | Allegany | All heart disease | NaN | NaN | NaN | Ages 35-64 years | American Indian/Alaska Native | 24001.0 |
| 2 | 4 | 2014 | Allegany | All heart disease | NaN | NaN | NaN | Ages 35-64 years | American Indian/Alaska Native | 24001.0 |
| 3 | 5 | 2005 | Allegany | All heart disease | NaN | NaN | NaN | Ages 35-64 years | American Indian/Alaska Native | 24001.0 |
| 4 | 6 | 2012 | Allegany | All heart disease | NaN | NaN | NaN | Ages 35-64 years | American Indian/Alaska Native | 24001.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30235 | 40315 | 2006 | Baltimore | Heart failure | 385.9 | 339.0 | 432.8 | Ages 65 years and older | White | 24510.0 |
| 30236 | 40316 | 2013 | Baltimore | Heart failure | 367.7 | 323.1 | 410.2 | Ages 65 years and older | White | 24510.0 |
| 30237 | 40317 | 2004 | Baltimore | Heart failure | 394.5 | 356.9 | 436.5 | Ages 65 years and older | White | 24510.0 |
| 30238 | 40318 | 2005 | Baltimore | Heart failure | 388.7 | 348.7 | 439.1 | Ages 65 years and older | White | 24510.0 |
| 30239 | 40319 | 2019 | Baltimore | Heart failure | 340.2 | 292.8 | 390.2 | Ages 65 years and older | White | 24510.0 |
30240 rows × 10 columns
#the data is sectioned by year, county, and sex
df_sex = df[df['Race'] == 'Overall'].reset_index()
df_sex = df_sex.drop(columns=['Race'])
df_sex.isna().sum()
df_sex
| index | Year | LocationDesc | Topic | Data_Value | Confidence_limit_Low | Confidence_limit_High | age_group | Sex | LocationID | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2010 | Allegany | All stroke | 18.2 | 15.1 | 22.9 | Ages 35-64 years | Men | 24001.0 |
| 1 | 1 | 2013 | Allegany | All stroke | 17.9 | 14.4 | 21.7 | Ages 35-64 years | Men | 24001.0 |
| 2 | 107 | 2017 | Allegany | All heart disease | 126.4 | 113.3 | 141.6 | Ages 35-64 years | Overall | 24001.0 |
| 3 | 108 | 2016 | Allegany | All heart disease | 128.6 | 115.0 | 144.5 | Ages 35-64 years | Overall | 24001.0 |
| 4 | 109 | 2019 | Allegany | All heart disease | 128.3 | 110.9 | 145.6 | Ages 35-64 years | Overall | 24001.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15115 | 39895 | 2009 | Baltimore | Heart failure | 355.2 | 316.1 | 394.3 | Ages 65 years and older | Men | 24510.0 |
| 15116 | 39896 | 2001 | Baltimore | Heart failure | 465.4 | 417.1 | 515.3 | Ages 65 years and older | Men | 24510.0 |
| 15117 | 39897 | 2000 | Baltimore | Heart failure | 467.4 | 413.2 | 534.5 | Ages 65 years and older | Men | 24510.0 |
| 15118 | 39898 | 2006 | Baltimore | Heart failure | 388.9 | 345.1 | 435.8 | Ages 65 years and older | Men | 24510.0 |
| 15119 | 39899 | 1999 | Baltimore | Heart failure | 493.4 | 440.1 | 565.1 | Ages 65 years and older | Men | 24510.0 |
15120 rows × 10 columns
#the data is sectioned by year, and county
df_overall = df[(df['Sex'] == 'Overall') & (df['Race'] == 'Overall')].reset_index()
df_overall = df_overall.drop(["Race", "Sex"], axis=1)
df_overall
| index | Year | LocationDesc | Topic | Data_Value | Confidence_limit_Low | Confidence_limit_High | age_group | LocationID | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 107 | 2017 | Allegany | All heart disease | 126.4 | 113.3 | 141.6 | Ages 35-64 years | 24001.0 |
| 1 | 108 | 2016 | Allegany | All heart disease | 128.6 | 115.0 | 144.5 | Ages 35-64 years | 24001.0 |
| 2 | 109 | 2019 | Allegany | All heart disease | 128.3 | 110.9 | 145.6 | Ages 35-64 years | 24001.0 |
| 3 | 110 | 2006 | Allegany | All heart disease | 130.0 | 112.4 | 147.2 | Ages 35-64 years | 24001.0 |
| 4 | 111 | 2003 | Allegany | All heart disease | 143.7 | 125.8 | 161.5 | Ages 35-64 years | 24001.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5035 | 39685 | 2006 | Baltimore | Heart failure | 338.7 | 302.1 | 371.9 | Ages 65 years and older | 24510.0 |
| 5036 | 39686 | 2014 | Baltimore | Heart failure | 288.4 | 263.0 | 317.5 | Ages 65 years and older | 24510.0 |
| 5037 | 39687 | 1999 | Baltimore | Heart failure | 439.7 | 398.4 | 482.2 | Ages 65 years and older | 24510.0 |
| 5038 | 39688 | 2001 | Baltimore | Heart failure | 426.1 | 390.3 | 458.5 | Ages 65 years and older | 24510.0 |
| 5039 | 39689 | 2000 | Baltimore | Heart failure | 425.1 | 389.7 | 465.2 | Ages 65 years and older | 24510.0 |
5040 rows × 9 columns
Checking for Missing Data¶
df_overall.isna().sum()
| 0 | |
|---|---|
| index | 0 |
| Year | 0 |
| LocationDesc | 0 |
| Topic | 0 |
| Data_Value | 0 |
| Confidence_limit_Low | 0 |
| Confidence_limit_High | 0 |
| age_group | 0 |
| LocationID | 0 |
So, after filtering the data down to where the race is 'Overall' and sex is 'Overall', we can observe that the number of missing values is 0 for all the columns.
df_sex.isna().sum()
| 0 | |
|---|---|
| index | 0 |
| Year | 0 |
| LocationDesc | 0 |
| Topic | 0 |
| Data_Value | 0 |
| Confidence_limit_Low | 0 |
| Confidence_limit_High | 0 |
| age_group | 0 |
| Sex | 0 |
| LocationID | 0 |
There is no missing data when the data is separated by sex either.
df_race.isna().sum()
| 0 | |
|---|---|
| index | 0 |
| Year | 0 |
| LocationDesc | 0 |
| Topic | 0 |
| Data_Value | 9240 |
| Confidence_limit_Low | 9240 |
| Confidence_limit_High | 9240 |
| age_group | 0 |
| Race | 0 |
| LocationID | 0 |
We do have missing data when looking at the data separated by race.
Exploring Race and Missingness¶
#printing out the number of missing values for each race
for race in df_race["Race"].unique():
nan_count = df_race[df_race ['Race'] == race]['Data_Value'].isna().sum()
print(race, nan_count)
American Indian/Alaska Native 4305 Overall 0 Asian/Pacific Islander 2520 Black (Non-Hispanic) 315 Hispanic 2100 White 0
#printing out the percentage of missing values for each race
for race in df_race["Race"].unique():
nan_count = df_race[df_race['Race'] == race]['Data_Value'].isna().sum()
total_count = df_race[df_race['Race'] == race]['Data_Value'].count()
if total_count > 0:
percentage_nan = (nan_count / (nan_count + total_count)) * 100
else:
percentage_nan = 0
print(race, "%: ", percentage_nan)
American Indian/Alaska Native %: 85.41666666666666 Overall %: 0.0 Asian/Pacific Islander %: 50.0 Black (Non-Hispanic) %: 6.25 Hispanic %: 41.66666666666667 White %: 0.0
From this we can see that a significant portion of the values for certain races is missing. The "Overall" race group has no missing values because its rates have already been smoothed across demographic groups.
Examining Correlations in Dataset 1¶
It is important that our variables do not exert influence on each other when we perform the analysis, because a model may otherwise struggle to distinguish the effect of each individual variable. When variables are correlated, we also carry redundant data, as multiple variables convey the same effect, leading to an unnecessary increase in computation.
We first check whether any of the potential features are correlated with each other.
An important detail to notice is that most of the variables in df are categorical. Because of this, we can't call df.corr() on the data directly. Instead, we examine the pairwise association between the categorical variables using the chi-squared test of independence.
from scipy.stats import chi2_contingency
variables = ['Topic', 'LocationDesc', 'age_group', 'Race', 'Sex']
for i in range(len(variables)):
for j in range(i + 1, len(variables)):
var1, var2 = variables[i], variables[j]
contingency_table = pd.crosstab(df[var1], df[var2])
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-Square Test between {var1} and {var2}:")
print(f"P-Value: {p_value}")
Chi-Square Test between Topic and LocationDesc: P-Value: 1.0 Chi-Square Test between Topic and age_group: P-Value: 1.0 Chi-Square Test between Topic and Race: P-Value: 1.0 Chi-Square Test between Topic and Sex: P-Value: 1.0 Chi-Square Test between LocationDesc and age_group: P-Value: 1.0 Chi-Square Test between LocationDesc and Race: P-Value: 1.0 Chi-Square Test between LocationDesc and Sex: P-Value: 1.0 Chi-Square Test between age_group and Race: P-Value: 1.0 Chi-Square Test between age_group and Sex: P-Value: 1.0 Chi-Square Test between Race and Sex: P-Value: 0.0
Running pairwise chi-squared tests on each combination of categorical variables, we see that all pairs except Race and Sex have a p-value of 1.0. Under any reasonable significance level α, p > α, so we fail to reject the null hypothesis of independence; there is no evidence of an association, and these categories can be treated as independent of each other.
For the relationship between Race and Sex, however, the p-value is 0. This is surprising, as it suggests that Race and Sex are associated with each other. To understand this association further, we can create dummy variables using pd.get_dummies and build a correlation matrix over all the categories.
pd.get_dummies(...) performs one-hot encoding: each unique value of a categorical feature becomes a binary indicator column. Using these indicator columns, we can compute a correlation matrix and plot it as a heatmap.
df_race_sex_no_dv = pd.get_dummies(df, columns=['Race', 'Sex'])
df_race_sex_no_dv = df_race_sex_no_dv.drop(columns=['Year','Data_Value', 'LocationID', 'Confidence_limit_Low', 'Confidence_limit_High', 'Topic', 'LocationDesc', 'age_group'])
sns.heatmap(df_race_sex_no_dv.corr(), fmt='.2f', annot = True, cmap='coolwarm')
<Axes: >
We can see that the majority of the correlation comes from Race_Overall and Sex_Overall. The "Overall" columns are aggregation columns, which are not needed since we are looking at the effect of each individual category, so we can drop them and plot the heatmap again.
df_race_sex_no_dv_overall = pd.get_dummies(df, columns=['Race', 'Sex'])
df_race_sex_no_dv_overall = df_race_sex_no_dv_overall.drop(columns=['Year','Data_Value', 'LocationID', 'Confidence_limit_Low', 'Confidence_limit_High', 'Topic', 'LocationDesc', 'age_group', 'Race_Overall', 'Sex_Overall'])
sns.heatmap(df_race_sex_no_dv_overall.corr(), fmt='.2f', annot = True, cmap='coolwarm')
<Axes: >
This is in line with what we expect: Race and Sex do not show any notable correlation.
From the correlation heatmap and chi-squared tests, we see that all the variables are effectively independent of each other. This can be seen from the cell colors, which roughly correspond to a correlation coefficient of 0 on the colorbar. This is good, as it means the effect of each variable is not confounded by any other variable.
Next, let's look at the effect of our variables on the mortality rate.
df_with_dummies = pd.get_dummies(df, columns=['Topic', 'LocationDesc', 'age_group', 'Race', 'Sex'])
df_with_dummies = df_with_dummies.drop(columns=['LocationID', 'Confidence_limit_Low', 'Confidence_limit_High', 'Topic_All heart disease', 'Sex_Overall', 'Race_Overall', ])
plt.figure(figsize=(10, 8))
sns.heatmap(df_with_dummies.corr(), cmap='coolwarm')
<Axes: >
The effects of each variable on Mortality can be seen in the 2nd row/column corresponding to Data_Value.
There don't appear to be many strong correlations, with the exception of the age_group categories: the younger age group (35-64) has a negative correlation with the mortality rate, while the older group (65+) is positively correlated with it. This is expected, as the risk of death rises with age.
It also appears that there is variation in mortality based on the county. This is interesting, as it may suggest a weak but non-negligible environmental/geographical relationship with heart disease.
Similarly, there is weak variation across Race and Sex. For example, Sex_Men has a positive correlation with mortality while Sex_Women is slightly negative, implying that men are more likely, and women less likely, to die of heart disease.
For Race, it appears that Black (Non-Hispanic) and White have a slightly positive correlation with mortality, while the remaining races have a slight negative correlation. This might indicate that some races are more susceptible to heart disease than others.
Year, too, has a weak negative correlation with mortality. This again is expected, as we expect fewer people to die from these diseases over time due to improvements in healthcare.
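Rather than reading the colors alone, we can also print the correlations with Data_Value directly (using the df_with_dummies frame constructed above):
# Correlation of each feature with the mortality rate, sorted from most negative to most positive
corr_with_mortality = df_with_dummies.corr()['Data_Value'].sort_values()
print(corr_with_mortality)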
Mortality has a much larger range than the binary dummy variables, so we also rescale it with MinMaxScaler to check whether this changes the picture.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data_value_scaled = scaler.fit_transform(df_with_dummies[['Data_Value']])
df_with_dummies['data_value_scaled'] = data_value_scaled
# df_with_dummies_no_loc = pd.get_dummies(df, columns=['Topic', 'LocationDesc', 'age_group', 'Race', 'Sex'])
df_with_dummies = df_with_dummies.drop(columns=['Data_Value'])
plt.figure(figsize=(10, 8))
sns.heatmap(df_with_dummies.corr(), cmap='coolwarm')
<Axes: >
Scaling Data_Value with MinMaxScaler does not change the correlations: Pearson correlation is invariant to min-max (positive linear) scaling, so the 'data_value_scaled' row/column here matches the Data_Value row/column of the previous heatmap. The relationships identified earlier therefore stand unchanged; the scaling mainly puts the target on the same 0-1 range as the dummy variables.
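A small synthetic check of this invariance (purely illustrative, not using our datasets):
import numpy as np
# Pearson correlation before and after min-max scaling one of the variables
rng = np.random.default_rng(0)
x = rng.normal(size=1000)
y = 2 * x + rng.normal(size=1000)
x_scaled = (x - x.min()) / (x.max() - x.min())
print(np.corrcoef(x, y)[0, 1], np.corrcoef(x_scaled, y)[0, 1])  # the two values are identical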
Effect of Race and Sex on Mortality¶
To confirm the results from the heatmaps, we decided to observe the effect that Race and Sex might potentially have on mortality. The following box plots were drawn:
# Boxplot
df_sex_mf = df_sex[df_sex['Sex'].isin(['Men', 'Women'])]
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_sex_mf, x="Sex", y="Data_Value")
plt.title("Effect of Sex on Mortality Rate")
plt.xlabel("Sex")
plt.ylabel("Mortality Rate (deaths per 100,000)")
plt.show()
From the plot we see that men have a higher heart disease mortality rate than women.
df_race_filtered = df_race[df_race['Race'].isin(['American Indian/Alaska Native',
'Asian/Pacific Islander', 'Black (Non-Hispanic)', 'Hispanic',
'White'])]
plt.figure(figsize=(12, 10))
sns.boxplot(data=df_race_filtered, x="Race", y="Data_Value")
plt.title("Effect of Race on Mortality Rate")
plt.xlabel("Race")
plt.ylabel("Mortality Rate (deaths per 100,000)")
plt.show()
From the plot we see that the Black (Non-Hispanic) and White groups have higher heart disease mortality rates than the other race groups.
First Test¶
For the first hypothesis test, we use an ANOVA test to see whether there is a difference in the means of "Data_Value" (the mortality rate) across the different topics (kinds of heart disease). In other words, we are testing whether the different types of heart disease have an effect on the mortality rate. An ANOVA test is appropriate because we are comparing the means of more than two groups.
Null Hypothesis: There is no difference in "Data_Value" for the different kinds of heart diseases.
Alternative Hypothesis: There is a difference in "Data_Value" for the different kinds of heart diseases.
We are setting alpha to 0.01.
print("Types of Heart Diseases (Topics):")
print(df_sex['Topic'].unique())
#Splitting the dataset using "Topic"
df_overall_all_stroke =df_overall[df_overall['Topic'] == 'All stroke']
df_overall_all_heart_disease = df_overall[df_overall['Topic'] == 'All heart disease']
df_overall_CHD = df_overall[df_overall['Topic'] == 'Coronary heart disease (CHD)']
df_overall_CVD = df_overall[df_overall['Topic'] == 'Cardiovascular disease (CVD)' ]
df_overall_heart_failure = df_overall[df_overall['Topic'] == 'Heart failure']
# Performing the ANOVA test
f_statistic, p_value = stats.f_oneway(df_overall_all_stroke['Data_Value'], df_overall_all_heart_disease['Data_Value'],df_overall_CHD['Data_Value'],
df_overall_CVD['Data_Value'],df_overall_heart_failure['Data_Value'])
print(f"F-statistic: {f_statistic}, p-value: {p_value}")
# Create a boxplot
plt.figure(figsize=(10, 6))
sns.boxplot(x='Topic', y='Data_Value', data=df_sex)
plt.title('Boxplot of Data Values by Topic')
plt.xlabel('Type of Heart Disease')
plt.ylabel('Data Value')
plt.xticks(rotation=45)
plt.show()
Types of Heart Diseases (Topics): ['All stroke' 'All heart disease' 'Coronary heart disease (CHD)' 'Cardiovascular disease (CVD)' 'Heart failure'] F-statistic: 394.7155311884273, p-value: 3.6936075074683683e-296
Given that the p-value is significantly smaller than the alpha level, we reject the null hypothesis. This indicates a significant difference in "Data_Value" among the different types of heart diseases. This can be seen in the box-plots above.
In conclusion, the mean mortality rate varies across the types of heart disease. Notably, there are outliers present in the data for coronary heart disease (CHD). Additionally, the median, 75th percentile, and possibly even the 25th percentile values for cardiovascular disease (CVD) are considerably higher than those for the other types of heart disease.
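As an optional follow-up, a post-hoc Tukey HSD comparison (a sketch, assuming df_overall as defined above and statsmodels available) can show which specific pairs of disease types differ in mean mortality:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# Pairwise comparison of mean mortality rates between the five disease types
tukey = pairwise_tukeyhsd(endog=df_overall['Data_Value'],
                          groups=df_overall['Topic'],
                          alpha=0.01)
print(tukey.summary())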
Visualizing Dataset 2¶
Analyzing how fitness levels have improved over time in each county of Maryland can provide valuable insights into health trends. A time-series plot is particularly effective for visualizing such temporal data, as it clearly illustrates changes and patterns across years.
physical_df_filtered_grouped = physical_df_filtered.groupby('Jurisdiction')
plt.figure(figsize=(8, 6))
for county, group in physical_df_filtered_grouped:
plt.plot(group['Year'], group['Value'], label=county, marker='o')
plt.title("Time-Series of Fitness Levels by County", fontsize=14)
plt.xlabel("Year", fontsize=12)
plt.ylabel("Physical Activity (# Individuals)", fontsize=12)
plt.legend(title="County", loc='upper left', bbox_to_anchor=(1, 1))
plt.grid(True)
plt.show()
Although there is appreciable variation in the plot, the overall trend indicates a positive shift, with the number of individuals who take part in physical activity showing an upward trajectory over the observed time range. This suggests that, despite some fluctuations, there is a general increase in engagement with physical activity.
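To summarize this trend numerically, we can average the reported values across counties for each year:
# Mean physical activity value across all counties, per year
yearly_mean = physical_df_filtered.groupby('Year')['Value'].mean()
print(yearly_mean)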
Second Test¶
In the second test, we first use the physical activity dataset to find which counties had an increase in physical activity from 2011 to 2019. We then use a paired t-test on the 2011 and 2019 values to see whether there was a change in mortality rates (the "Data_Value" column) in the counties that had an increase in physical activity.
Null Hypothesis: There is no difference in "Data_Value" for the counties with increased physical activity from 2011 to 2019
Alternative Hypothesis: There is a difference in "Data_Value" for the counties with increased physical activity from 2011 to 2019
We are setting the alpha to 0.01.
We first filter the data to the years we need (2011 and 2019) and to the aggregated race category.
physical_df = pd.read_csv("/content/SHIP_Increase_Physical_Activity_2011-2019.csv")
physical_df = physical_df.drop("Measure", axis=1)
physical_df_filtered = physical_df[
(physical_df['Year'].isin([2011, 2019])) &
(physical_df['Race/ ethnicity'] == 'All races/ ethnicities (aggregated)')
]
physical_df_filtered.reset_index().drop("index", axis = 1)
| Jurisdiction | Value | Race/ ethnicity | Year | |
|---|---|---|---|---|
| 0 | State | 52.2 | All races/ ethnicities (aggregated) | 2019 |
| 1 | Allegany | 45.0 | All races/ ethnicities (aggregated) | 2019 |
| 2 | Anne Arundel | 54.0 | All races/ ethnicities (aggregated) | 2019 |
| 3 | Baltimore County | 50.5 | All races/ ethnicities (aggregated) | 2019 |
| 4 | Calvert | 52.0 | All races/ ethnicities (aggregated) | 2019 |
| 5 | Caroline | 45.3 | All races/ ethnicities (aggregated) | 2019 |
| 6 | Carroll | 55.9 | All races/ ethnicities (aggregated) | 2019 |
| 7 | Cecil | 50.7 | All races/ ethnicities (aggregated) | 2019 |
| 8 | Charles | 53.2 | All races/ ethnicities (aggregated) | 2019 |
| 9 | Dorchester | 45.2 | All races/ ethnicities (aggregated) | 2019 |
| 10 | Frederick | 56.2 | All races/ ethnicities (aggregated) | 2019 |
| 11 | Garrett | 56.4 | All races/ ethnicities (aggregated) | 2019 |
| 12 | Harford | 58.7 | All races/ ethnicities (aggregated) | 2019 |
| 13 | Howard | 56.4 | All races/ ethnicities (aggregated) | 2019 |
| 14 | Kent | 52.3 | All races/ ethnicities (aggregated) | 2019 |
| 15 | Montgomery | 52.8 | All races/ ethnicities (aggregated) | 2019 |
| 16 | Prince George's | 50.4 | All races/ ethnicities (aggregated) | 2019 |
| 17 | Queen Anne's | 56.4 | All races/ ethnicities (aggregated) | 2019 |
| 18 | Saint Mary's | 55.2 | All races/ ethnicities (aggregated) | 2019 |
| 19 | Somerset | 41.2 | All races/ ethnicities (aggregated) | 2019 |
| 20 | Talbot | 54.2 | All races/ ethnicities (aggregated) | 2019 |
| 21 | Washington | 52.3 | All races/ ethnicities (aggregated) | 2019 |
| 22 | Wicomico | 50.7 | All races/ ethnicities (aggregated) | 2019 |
| 23 | Worcester | 54.3 | All races/ ethnicities (aggregated) | 2019 |
| 24 | Baltimore City | 47.4 | All races/ ethnicities (aggregated) | 2019 |
| 25 | State | 48.7 | All races/ ethnicities (aggregated) | 2011 |
| 26 | Allegany | 52.2 | All races/ ethnicities (aggregated) | 2011 |
| 27 | Anne Arundel | 47.6 | All races/ ethnicities (aggregated) | 2011 |
| 28 | Baltimore City | 43.9 | All races/ ethnicities (aggregated) | 2011 |
| 29 | Baltimore County | 47.7 | All races/ ethnicities (aggregated) | 2011 |
| 30 | Calvert | 47.1 | All races/ ethnicities (aggregated) | 2011 |
| 31 | Caroline | 43.4 | All races/ ethnicities (aggregated) | 2011 |
| 32 | Carroll | 48.0 | All races/ ethnicities (aggregated) | 2011 |
| 33 | Cecil | 55.2 | All races/ ethnicities (aggregated) | 2011 |
| 34 | Charles | 47.4 | All races/ ethnicities (aggregated) | 2011 |
| 35 | Dorchester | 45.1 | All races/ ethnicities (aggregated) | 2011 |
| 36 | Frederick | 50.5 | All races/ ethnicities (aggregated) | 2011 |
| 37 | Garrett | 45.4 | All races/ ethnicities (aggregated) | 2011 |
| 38 | Harford | 43.1 | All races/ ethnicities (aggregated) | 2011 |
| 39 | Howard | 49.6 | All races/ ethnicities (aggregated) | 2011 |
| 40 | Kent | 51.7 | All races/ ethnicities (aggregated) | 2011 |
| 41 | Montgomery | 52.6 | All races/ ethnicities (aggregated) | 2011 |
| 42 | Prince George's | 50.2 | All races/ ethnicities (aggregated) | 2011 |
| 43 | Queen Anne's | 46.7 | All races/ ethnicities (aggregated) | 2011 |
| 44 | Saint Mary's | 55.7 | All races/ ethnicities (aggregated) | 2011 |
| 45 | Somerset | 33.1 | All races/ ethnicities (aggregated) | 2011 |
| 46 | Talbot | 57.6 | All races/ ethnicities (aggregated) | 2011 |
| 47 | Washington | 41.0 | All races/ ethnicities (aggregated) | 2011 |
| 48 | Wicomico | 50.4 | All races/ ethnicities (aggregated) | 2011 |
| 49 | Worcester | 52.1 | All races/ ethnicities (aggregated) | 2011 |
Next, we calculate the change in physical activity between 2011 and 2019 and plot it, with a reference line at an increase of 5.
pivot_df = physical_df_filtered.pivot_table(
index='Jurisdiction',
columns='Year',
values='Value'
).reset_index()
pivot_df.columns = ['Jurisdiction', 'Value_2011', 'Value_2019']
pivot_df['Change'] = pivot_df['Value_2019'] - pivot_df['Value_2011']
plt.bar(pivot_df['Jurisdiction'], pivot_df['Change'], color='skyblue')
plt.axhline(y=5, color='blue', linestyle='--', linewidth=1.5)
plt.xticks(rotation=90)
plt.xlabel('Jurisdiction')
plt.ylabel('Change in Physical Activity (# of Individuals)')
plt.title('Change in Physical Activity from 2011 to 2019 by Jurisdiction')
plt.show()
We decided to use an increase of 5 as the cutoff value for selecting the counties to study.
#list of counties with 5+ change from 2011 to 2019
sig_physical_change = pivot_df[pivot_df['Change'] >= 5].reset_index().drop("index", axis=1)
sig_physical_change
| Jurisdiction | Value_2011 | Value_2019 | Change | |
|---|---|---|---|---|
| 0 | Anne Arundel | 47.6 | 54.0 | 6.4 |
| 1 | Carroll | 48.0 | 55.9 | 7.9 |
| 2 | Charles | 47.4 | 53.2 | 5.8 |
| 3 | Frederick | 50.5 | 56.2 | 5.7 |
| 4 | Garrett | 45.4 | 56.4 | 11.0 |
| 5 | Harford | 43.1 | 58.7 | 15.6 |
| 6 | Howard | 49.6 | 56.4 | 6.8 |
| 7 | Queen Anne's | 46.7 | 56.4 | 9.7 |
| 8 | Somerset | 33.1 | 41.2 | 8.1 |
| 9 | Washington | 41.0 | 52.3 | 11.3 |
We split df_overall (the filtered data where Race and Sex are both 'Overall') into two dataframes, one for 2011 and one for 2019, and restrict the topic to "All heart disease." We group the data by county and take the mean of the mortality rates ('Data_Value'), giving an average over the two age groups for each county. Finally, we keep only the counties that appear in sig_physical_change. This lets us test whether the increase in physical activity is associated with a change in mortality rates in those counties between 2011 and 2019.
#Filtering the dataset to years 2011 & 2019
df_overall_11 = df_overall[(df_overall['Year'] == 2011) & (df_overall['Topic'] == 'All heart disease')].sort_values(by=['Year', 'LocationDesc'])
df_overall_19 = df_overall[(df_overall['Year'] == 2019) & (df_overall['Topic'] == 'All heart disease')].sort_values(by=['Year', 'LocationDesc'])
df_overall_11_mean = df_overall_11.groupby('LocationDesc')['Data_Value'].mean().reset_index()
df_overall_19_mean = df_overall_19.groupby('LocationDesc')['Data_Value'].mean().reset_index()
merged_means = pd.merge(df_overall_11_mean, df_overall_19_mean, on='LocationDesc', suffixes=('_2011', '_2019'))
# Keep only the counties with a significant (5+) increase in physical activity
merged_means = merged_means[merged_means['LocationDesc'].isin(sig_physical_change["Jurisdiction"])]
merged_means
| LocationDesc | Data_Value_2011 | Data_Value_2019 | |
|---|---|---|---|
| 1 | Anne Arundel | 610.05 | 570.20 |
| 5 | Carroll | 622.95 | 585.15 |
| 7 | Charles | 664.05 | 571.45 |
| 9 | Frederick | 604.75 | 542.65 |
| 10 | Garrett | 719.10 | 734.05 |
| 11 | Harford | 625.80 | 571.35 |
| 12 | Howard | 472.05 | 426.40 |
| 16 | Queen Anne's | 565.75 | 564.90 |
| 17 | Somerset | 987.20 | 907.45 |
| 20 | Washington | 678.75 | 631.25 |
# Perform the paired t-test
t_statistic, p_value = stats.ttest_rel(merged_means['Data_Value_2011'], merged_means['Data_Value_2019'])
# Output the results
print(f"T-statistic: {t_statistic}, P-value: {p_value}")
T-statistic: 4.341413390383205, P-value: 0.0018735931821840352
In conclusion, the resulting p-value is much smaller than the alpha value of 0.01, so we reject the null hypothesis and conclude that the counties that experienced an increase in physical activity from 2011 to 2019 also experienced a change in mortality rates (in this case, a decrease on average).
plt.plot(merged_means['LocationDesc'], merged_means['Data_Value_2011'], marker='o', label='2011', color='blue')
plt.plot(merged_means['LocationDesc'], merged_means['Data_Value_2019'], marker='o', label='2019', color='orange')
plt.xlabel('Counties')
plt.ylabel('Mortality Rate')
plt.title('Comparison of Mortality Rates Before and After Significant Increase in Physical Activity')
plt.legend()
plt.grid()
plt.xticks(rotation=45)
plt.show()
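To put a number on the direction of this change:
# Average change in mortality rate (2019 minus 2011) across the selected counties
mean_change = (merged_means['Data_Value_2019'] - merged_means['Data_Value_2011']).mean()
print(f"Mean change in mortality rate: {mean_change:.1f} deaths per 100,000")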
Visualizing Income and Mortality in Maryland¶
Financial status is another key factor that can influence mortality. Here, we visualize the effect of economic status on heart disease mortality.
df_overall_financial = df_overall[(df_overall['Year'] >= 2010) & (df_overall['Year'] <= 2020)]
df_merged = pd.merge(df_overall_financial , financial_df,
left_on=['Year', 'LocationDesc'],
right_on=['Year', 'County'],
how='inner')
df_merged = df_merged.drop(columns=['Confidence_limit_Low', 'LocationID', 'County', 'Confidence_limit_High'])
df_merged = df_merged[~df_merged['Data_Value'].isna()]
df_merged = df_merged [df_merged ['Topic'].isin(['All heart disease',])]
plt.figure(figsize=(8, 6))
plt.scatter(df_merged['Income'], df_merged['Data_Value'], label='All heart diseases', marker='o')
plt.title("Income vs. Mortality", fontsize=14)
plt.xlabel("Income (thousands of $)", fontsize=12)
plt.ylabel("Mortality by Heart Disease (per 100,000)", fontsize=12)
plt.legend()
plt.grid(True)
plt.show()
The scatter plot shows a clear negative relationship between income and heart disease mortality, indicating that higher income levels are associated with lower heart disease mortality rates. This suggests that economic factors may play a role in heart disease mortality.
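We can quantify this relationship with a Pearson correlation coefficient (using the df_merged frame built above):
from scipy.stats import pearsonr
# Correlation between county income and heart disease mortality
r, p = pearsonr(df_merged['Income'], df_merged['Data_Value'])
print(f"Pearson r = {r:.3f}, p-value = {p:.3e}")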
Third Test¶
Hypothesis: Null Hypothesis (H0): There is no significant difference in income levels between counties with high and low mortality rates for stroke in the age groups 35-64 and 65+ in Maryland from 2010 to 2020.
Alternative Hypothesis (H1): There is a significant difference in income levels between counties with high and low mortality rates for stroke in the age groups 35-64 and 65+ in Maryland from 2010 to 2020.
We use an alpha value of 0.05.
#3rd Hypothesis
financial_df = pd.read_csv("/content/Maryland_Total_Personal_Income__Thousands_of_Current_Dollars___2010-2022.csv")
financial_df = financial_df.drop(columns=['MARYLAND', 'Date created'])
financial_df = financial_df.iloc[:10]
financial_df = pd.melt(financial_df, id_vars=["Year"],
var_name="County",
value_name="Income")
financial_df['County'] = financial_df['County'].str.replace(' County', '', regex=False)
df_overall_financial = df_overall[(df_overall['Year'] >= 2010) & (df_overall['Year'] <= 2020)]
df_merged = pd.merge(df_overall_financial , financial_df,
left_on=['Year', 'LocationDesc'],
right_on=['Year', 'County'],
how='inner')
df_merged = df_merged.drop(columns=['Confidence_limit_Low', 'LocationID', 'County', 'Confidence_limit_High' ])
df_merged = df_merged [df_merged ['Topic'] == 'All stroke']
df_merged
| index | Year | LocationDesc | Topic | Data_Value | age_group | Income | |
|---|---|---|---|---|---|---|---|
| 10 | 129 | 2013 | Allegany | All stroke | 16.5 | Ages 35-64 years | 33017 |
| 11 | 130 | 2014 | Allegany | All stroke | 16.5 | Ages 35-64 years | 34505 |
| 12 | 131 | 2019 | Allegany | All stroke | 15.2 | Ages 35-64 years | 40687 |
| 13 | 134 | 2017 | Allegany | All stroke | 15.0 | Ages 35-64 years | 38180 |
| 14 | 135 | 2016 | Allegany | All stroke | 15.5 | Ages 35-64 years | 37320 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2365 | 39616 | 2018 | Baltimore | All stroke | 347.5 | Ages 65 years and older | 59695 |
| 2366 | 39617 | 2013 | Baltimore | All stroke | 345.6 | Ages 65 years and older | 51192 |
| 2367 | 39622 | 2011 | Baltimore | All stroke | 308.0 | Ages 65 years and older | 49862 |
| 2368 | 39624 | 2010 | Baltimore | All stroke | 315.4 | Ages 65 years and older | 48811 |
| 2369 | 39625 | 2012 | Baltimore | All stroke | 300.8 | Ages 65 years and older | 51260 |
480 rows × 7 columns
df_merged_35 = df_merged [df_merged ['age_group'] == 'Ages 35-64 years']
df_merged_35
| index | Year | LocationDesc | Topic | Data_Value | age_group | Income | |
|---|---|---|---|---|---|---|---|
| 10 | 129 | 2013 | Allegany | All stroke | 16.5 | Ages 35-64 years | 33017 |
| 11 | 130 | 2014 | Allegany | All stroke | 16.5 | Ages 35-64 years | 34505 |
| 12 | 131 | 2019 | Allegany | All stroke | 15.2 | Ages 35-64 years | 40687 |
| 13 | 134 | 2017 | Allegany | All stroke | 15.0 | Ages 35-64 years | 38180 |
| 14 | 135 | 2016 | Allegany | All stroke | 15.5 | Ages 35-64 years | 37320 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2315 | 38779 | 2015 | Baltimore | All stroke | 31.6 | Ages 35-64 years | 54698 |
| 2316 | 38780 | 2011 | Baltimore | All stroke | 33.2 | Ages 35-64 years | 49862 |
| 2317 | 38783 | 2018 | Baltimore | All stroke | 31.8 | Ages 35-64 years | 59695 |
| 2318 | 38784 | 2012 | Baltimore | All stroke | 34.8 | Ages 35-64 years | 51260 |
| 2319 | 38786 | 2010 | Baltimore | All stroke | 34.3 | Ages 35-64 years | 48811 |
240 rows × 7 columns
df_merged_65 = df_merged [df_merged ['age_group'] == 'Ages 65 years and older']
df_merged_65
| index | Year | LocationDesc | Topic | Data_Value | age_group | Income | |
|---|---|---|---|---|---|---|---|
| 60 | 968 | 2016 | Allegany | All stroke | 300.5 | Ages 65 years and older | 37320 |
| 61 | 969 | 2014 | Allegany | All stroke | 318.2 | Ages 65 years and older | 34505 |
| 62 | 971 | 2015 | Allegany | All stroke | 311.7 | Ages 65 years and older | 35510 |
| 63 | 974 | 2017 | Allegany | All stroke | 304.0 | Ages 65 years and older | 38180 |
| 64 | 977 | 2019 | Allegany | All stroke | 306.0 | Ages 65 years and older | 40687 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2365 | 39616 | 2018 | Baltimore | All stroke | 347.5 | Ages 65 years and older | 59695 |
| 2366 | 39617 | 2013 | Baltimore | All stroke | 345.6 | Ages 65 years and older | 51192 |
| 2367 | 39622 | 2011 | Baltimore | All stroke | 308.0 | Ages 65 years and older | 49862 |
| 2368 | 39624 | 2010 | Baltimore | All stroke | 315.4 | Ages 65 years and older | 48811 |
| 2369 | 39625 | 2012 | Baltimore | All stroke | 300.8 | Ages 65 years and older | 51260 |
240 rows × 7 columns
With the datasets for ages 35-64 and 65+, we can now run the hypothesis tests.
from statsmodels.stats.weightstats import ztest
median_mortality_35_64 = df_merged_35['Data_Value'].median()
median_mortality_65_plus = df_merged_65['Data_Value'].median()
high_mortality_35_64 = df_merged_35[df_merged_35['Data_Value'] > median_mortality_35_64]
low_mortality_35_64 = df_merged_35[df_merged_35['Data_Value'] <= median_mortality_35_64]
high_mortality_65_plus = df_merged_65[df_merged_65['Data_Value'] > median_mortality_65_plus]
low_mortality_65_plus = df_merged_65[df_merged_65['Data_Value'] <= median_mortality_65_plus]
z_stat_35_64, p_val_35_64 = ztest(high_mortality_35_64['Income'], low_mortality_35_64['Income'])
# For 65+ age group
z_stat_65_plus, p_val_65_plus = ztest(high_mortality_65_plus['Income'], low_mortality_65_plus['Income'])
z_stat_35_64, p_val_35_64, z_stat_65_plus, p_val_65_plus
(-8.780587700873609, 1.6262265163207292e-18, -3.47678832965042, 0.0005074582711123621)
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
def plot_z_test(z_stat, p_value, title):
z_range = np.linspace(-4, 4, 500)
normal_curve = norm.pdf(z_range)
plt.plot(z_range, normal_curve, label='Standard Normal Distribution', color='blue')
plt.axvline(x=z_stat, color='red', linestyle='--', label=f'Z = {z_stat:.2f}, p = {p_value:.3e}')
plt.fill_between(z_range, 0, normal_curve, where=(z_range <= -1.96) | (z_range >= 1.96), color='red', alpha=0.3)
plt.title(title)
plt.xlabel('Z Value')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.show()
plot_z_test(-8.78, 1.63e-18, 'Z-Test for Age Group 35-64: Income vs Mortality')
plot_z_test(-3.48, 0.0005, 'Z-Test for Age Group 65+: Income vs Mortality')
Z-Test Results:
- Age Group 35-64: Z-Statistic: -8.78, P-Value: 1.63e-18
- Age Group 65+: Z-Statistic: -3.48, P-Value: 0.0005
Conclusion: For the age group 35-64: The p-value of 1.63e-18 is far below the 0.05 significance level, allowing us to reject the null hypothesis. This indicates a significant difference in income levels between counties with high and low mortality rates. Higher income is associated with lower mortality rates in this age group.
For the age group 65+: The p-value of 0.0005 is also below 0.05, allowing us to reject the null hypothesis for this age group as well. While the relationship between income and mortality is statistically significant here too, the separation between the income distributions (reflected by the smaller-magnitude Z-statistic of -3.48) is weaker than in the 35-64 age group.
Interpretation: In both age groups, higher income levels are correlated with lower mortality rates for stroke. The relationship is stronger in the age group 35-64, where income appears to have a more pronounced effect on reducing mortality rates.
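To complement the significance tests above, a standardized effect size such as Cohen's d can quantify how large the income gap actually is. The following is a minimal sketch that reuses the high/low mortality splits defined above; the cohens_d helper is our own illustrative function, not part of the original analysis.
import numpy as np
def cohens_d(a, b):
    # pooled standard deviation of the two groups
    pooled_std = np.sqrt((a.std(ddof=1) ** 2 + b.std(ddof=1) ** 2) / 2)
    return (a.mean() - b.mean()) / pooled_std
print(f"Cohen's d (35-64): {cohens_d(high_mortality_35_64['Income'], low_mortality_35_64['Income']):.2f}")
print(f"Cohen's d (65+): {cohens_d(high_mortality_65_plus['Income'], low_mortality_65_plus['Income']):.2f}")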
Merging Datasets¶
So far we have examined the effect of each dataset's variables one by one, but from this step onwards it will be beneficial to merge all the relevant features into one master dataset. A single comprehensive dataset allows for more holistic training and produces a single model rather than multiple small models. By training our model on the combined dataset, we can be sure the ML algorithms will attempt to learn interactions between each feature we are interested in.
In this analysis, the master dataset will comprise of the following features:
- County Location (Categorical, One-Hot Encoded)
- Heart Disease Type (Categorical, One-Hot Encoded)
- Age Group (Categorical, One-Hot Encoded)
- Race (Categorical, One-Hot Encoded)
- Sex (Categorical, One-Hot Encoded)
- Income (Continuous, Standardized)
- Physical Activity (Continuous, Standardized)
The trained model should be able to predict a single output value which is:
- Mortality Rate (Continuous, Standardized)
We have already attempted to merge datasets when conducting EDA, but to keep EDA, merging, and ML training separate, a new dataset will be created here.
# work on a copy so the cleaned EDA dataframe is left untouched
master_df = df.copy()
master_df = master_df.drop(columns=['Confidence_limit_Low', 'Confidence_limit_High', 'LocationID'])
financial_df = pd.read_csv("/content/Maryland_Total_Personal_Income__Thousands_of_Current_Dollars___2010-2022.csv")
financial_df = financial_df.drop(columns=['MARYLAND', 'Date created'])
# keep only the first 10 year rows
financial_df = financial_df.iloc[:10]
# reshape from wide (one column per county) to long (Year, County, Income)
financial_df = pd.melt(financial_df, id_vars=["Year"],
                       var_name="County",
                       value_name="Income")
financial_df['County'] = financial_df['County'].str.replace(' County', '', regex=False)
master_df = master_df[(master_df['Year'] >= 2010) & (master_df['Year'] <= 2020)]
master_df = pd.merge(master_df, financial_df,
                     left_on=['Year', 'LocationDesc'],
                     right_on=['Year', 'County'],
                     how='inner')
master_df = master_df.drop(columns=['County'])
physical_df_filtered = physical_df[
    (physical_df['Year'].isin([2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])) &
    (physical_df['Race/ ethnicity'] == 'All races/ ethnicities (aggregated)')
]
# reset the row index after filtering
physical_df_filtered = physical_df_filtered.reset_index(drop=True)
master_df = pd.merge(master_df, physical_df_filtered[['Year', 'Jurisdiction', 'Value']],
                     left_on=['Year', 'LocationDesc'],
                     right_on=['Year', 'Jurisdiction'],
                     how='inner')
master_df = master_df.drop(columns=['Jurisdiction'])
master_df.rename(columns={'Value': 'Physical Activity', 'Data_Value': 'Mortality', 'age_group':'Age_Group', 'LocationDesc':'County', 'Topic':'Disease Type'}, inplace=True)
master_df
| Year | County | Disease Type | Mortality | Age_Group | Race | Sex | Income | Physical Activity | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2013 | Allegany | All stroke | 17.9 | Ages 35-64 years | Overall | Men | 33017 | 41.2 |
| 1 | 2013 | Allegany | All heart disease | NaN | Ages 35-64 years | American Indian/Alaska Native | Overall | 33017 | 41.2 |
| 2 | 2012 | Allegany | All heart disease | NaN | Ages 35-64 years | American Indian/Alaska Native | Overall | 32945 | 49.8 |
| 3 | 2011 | Allegany | All heart disease | NaN | Ages 35-64 years | American Indian/Alaska Native | Overall | 32213 | 52.2 |
| 4 | 2019 | Allegany | All heart disease | NaN | Ages 35-64 years | American Indian/Alaska Native | Overall | 40687 | 45.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10075 | 2012 | Worcester | Heart failure | 204.9 | Ages 65 years and older | White | Overall | 47380 | 55.3 |
| 10076 | 2015 | Worcester | Heart failure | 204.8 | Ages 65 years and older | White | Overall | 54118 | 48.6 |
| 10077 | 2011 | Worcester | Heart failure | 216.6 | Ages 65 years and older | White | Overall | 45459 | 52.1 |
| 10078 | 2013 | Worcester | Heart failure | 195.6 | Ages 65 years and older | White | Overall | 47851 | 51.9 |
| 10079 | 2019 | Worcester | Heart failure | 211.2 | Ages 65 years and older | White | Overall | 56536 | 54.3 |
10080 rows × 9 columns
master_df.size
90720
master_df.isna().sum()
| 0 | |
|---|---|
| Year | 0 |
| County | 0 |
| Disease Type | 0 |
| Mortality | 2460 |
| Age_Group | 0 |
| Race | 0 |
| Sex | 0 |
| Income | 0 |
| Physical Activity | 0 |
As can be seen, 2460 values of the target (Mortality) are missing (NaN). Although the dataset is quite large, it might be better to fill these values through imputation rather than dropping the NaN records.
To do this, we must first analyze the distribution of Mortality (formerly Data_Value).
master_df.groupby('County')['Mortality'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| County | ||||||||
| Allegany | 270.0 | 523.700370 | 675.419350 | 6.9 | 71.400 | 177.85 | 907.225 | 2403.0 |
| Anne Arundel | 450.0 | 345.580444 | 456.122787 | 3.2 | 28.300 | 118.15 | 442.725 | 1822.5 |
| Calvert | 360.0 | 393.195278 | 547.557994 | 3.2 | 23.325 | 94.25 | 512.075 | 1957.5 |
| Caroline | 330.0 | 445.755758 | 557.633569 | 4.0 | 50.250 | 161.10 | 754.525 | 1955.9 |
| Carroll | 360.0 | 377.565556 | 519.605693 | 3.1 | 24.550 | 85.05 | 515.200 | 1972.0 |
| Cecil | 360.0 | 425.287500 | 578.617779 | 3.4 | 30.200 | 140.75 | 508.925 | 2120.6 |
| Charles | 420.0 | 342.601429 | 482.489074 | 2.8 | 26.900 | 104.75 | 405.850 | 1918.4 |
| Dorchester | 330.0 | 444.836061 | 574.452530 | 3.8 | 42.875 | 180.85 | 704.700 | 2056.7 |
| Frederick | 420.0 | 363.945714 | 468.649856 | 3.1 | 25.925 | 140.30 | 528.175 | 1794.3 |
| Garrett | 240.0 | 529.010417 | 617.119245 | 8.2 | 70.125 | 220.05 | 986.825 | 2105.7 |
| Harford | 420.0 | 367.349048 | 467.455969 | 2.9 | 31.200 | 142.15 | 558.375 | 1786.8 |
| Howard | 420.0 | 280.266429 | 358.773199 | 2.2 | 23.275 | 99.85 | 420.950 | 1371.8 |
| Kent | 300.0 | 441.400333 | 517.448622 | 3.9 | 52.775 | 224.65 | 715.075 | 1914.0 |
| Montgomery | 450.0 | 236.985556 | 315.619408 | 1.7 | 16.475 | 75.70 | 324.300 | 1243.1 |
| Prince George's | 480.0 | 355.559375 | 462.521191 | 3.1 | 40.150 | 140.45 | 519.175 | 1972.3 |
| Queen Anne's | 330.0 | 367.986364 | 476.432975 | 3.2 | 33.125 | 126.60 | 570.675 | 1799.9 |
| Somerset | 300.0 | 656.963333 | 831.346871 | 3.1 | 98.425 | 179.05 | 1590.575 | 2601.4 |
| Talbot | 330.0 | 355.602727 | 453.711526 | 4.4 | 39.400 | 134.80 | 516.050 | 1893.8 |
| Washington | 360.0 | 391.650833 | 534.458511 | 2.9 | 27.325 | 125.60 | 428.625 | 2018.3 |
| Wicomico | 360.0 | 477.405000 | 663.878788 | 2.5 | 34.375 | 148.80 | 596.250 | 2314.2 |
| Worcester | 330.0 | 415.139394 | 542.385620 | 3.2 | 39.425 | 167.60 | 720.350 | 2107.1 |
Since the standard deviations for most counties are within reasonable bounds, we can perform mean imputation: a reasonable standard deviation suggests there are few extreme outliers that would skew the group means.
mean_data_values = master_df.groupby(['Year', 'County'])['Mortality'].mean()
years = [2011, 2012, 2013, 2015, 2017, 2019]
counties = ['Allegany', 'Calvert', 'Anne Arundel', 'Caroline', 'Carroll', 'Cecil',
'Charles', 'Garrett', 'Dorchester', 'Frederick', 'Harford', 'Howard', 'Kent',
'Montgomery', "Prince George's", 'Somerset', 'Talbot', "Queen Anne's",
'Washington', 'Wicomico', 'Worcester']
# fill missing values with the mean Mortality of the matching (Year, County) group
# (only the Mortality column contains NaNs, so filling the whole row slice is safe)
for year in years:
    for county in counties:
        mask = (master_df['Year'] == year) & (master_df['County'] == county)
        master_df[mask] = master_df[mask].fillna(mean_data_values[year][county])
print(master_df.isna().sum())
master_df
Year 0 County 0 Disease Type 0 Mortality 0 Age_Group 0 Race 0 Sex 0 Income 0 Physical Activity 0 dtype: int64
| Year | County | Disease Type | Mortality | Age_Group | Race | Sex | Income | Physical Activity | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2013 | Allegany | All stroke | 17.900000 | Ages 35-64 years | Overall | Men | 33017 | 41.2 |
| 1 | 2013 | Allegany | All heart disease | 568.964444 | Ages 35-64 years | American Indian/Alaska Native | Overall | 33017 | 41.2 |
| 2 | 2012 | Allegany | All heart disease | 546.953333 | Ages 35-64 years | American Indian/Alaska Native | Overall | 32945 | 49.8 |
| 3 | 2011 | Allegany | All heart disease | 547.608889 | Ages 35-64 years | American Indian/Alaska Native | Overall | 32213 | 52.2 |
| 4 | 2019 | Allegany | All heart disease | 473.046667 | Ages 35-64 years | American Indian/Alaska Native | Overall | 40687 | 45.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10075 | 2012 | Worcester | Heart failure | 204.900000 | Ages 65 years and older | White | Overall | 47380 | 55.3 |
| 10076 | 2015 | Worcester | Heart failure | 204.800000 | Ages 65 years and older | White | Overall | 54118 | 48.6 |
| 10077 | 2011 | Worcester | Heart failure | 216.600000 | Ages 65 years and older | White | Overall | 45459 | 52.1 |
| 10078 | 2013 | Worcester | Heart failure | 195.600000 | Ages 65 years and older | White | Overall | 47851 | 51.9 |
| 10079 | 2019 | Worcester | Heart failure | 211.200000 | Ages 65 years and older | White | Overall | 56536 | 54.3 |
10080 rows × 9 columns
Now our dataset is not missing any values.
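For reference, the same (Year, County) mean imputation can be written more compactly with groupby().transform; this is an equivalent sketch of the loop above, assuming Mortality is the only column with missing values.
master_df['Mortality'] = master_df['Mortality'].fillna(
    master_df.groupby(['Year', 'County'])['Mortality'].transform('mean')
)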
Feature Engineering¶
Before performing scaling, it is important to take a look at the underlying distributions of the data we have. This is most important for the continuous variables. One-Hot Encoded variables are really just binary vectors of categorical data, so there is no reason to examine their distribution.
plt.hist(master_df['Mortality'])
plt.title('Mortality Distribution')
plt.show()
plt.hist(master_df['Income'])
plt.title('Income Distribution')
plt.show()
plt.hist(master_df['Physical Activity'])
plt.title('Physical Activity Distribution')
plt.show()
We see that Mortality is close to lognormally distributed, so it will help the later ML algorithms if we apply a log transform to compress the range of this column. The logarithmic behavior makes sense, as mortality rates are often presented on a logarithmic scale because mortality increases roughly exponentially with age.
All other continuous variables appear to be approximately normally distributed.
import numpy as np
master_df['Mortality'] = np.log(master_df['Mortality'])
plt.hist(master_df['Mortality'])
plt.title('Mortality Distribution')
plt.show()
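As a quick numeric check of these distributional claims, the skewness of each continuous column can be computed; values near 0 suggest rough symmetry. This is a small sketch using scipy.stats.skew (note that Mortality has already been log-transformed by the cell above).
from scipy.stats import skew
for col in ['Mortality', 'Income', 'Physical Activity']:
    print(f"{col}: skewness = {skew(master_df[col]):.2f}")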
Filtering out American Indian/Alaska Native¶
When we analyzed the missing data in relation to race earlier, we found that ‘American Indian/Alaska Native Non-Hispanic/Latino’ had 85.416% of its mortality rate data missing. Because such a large portion of the data is absent, imputation would not be reliable for this group, so we dropped the category.
master_df = master_df[master_df['Race'] != 'American Indian/Alaska Native']
master_df
| Year | County | Disease Type | Mortality | Age_Group | Race | Sex | Income | Physical Activity | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2013 | Allegany | All stroke | 2.884801 | Ages 35-64 years | Overall | Men | 33017 | 41.2 |
| 31 | 2017 | Allegany | All heart disease | 4.839451 | Ages 35-64 years | Overall | Overall | 38180 | 46.1 |
| 32 | 2019 | Allegany | All heart disease | 4.854371 | Ages 35-64 years | Overall | Overall | 40687 | 45.0 |
| 33 | 2013 | Allegany | All heart disease | 4.870607 | Ages 35-64 years | Overall | Overall | 33017 | 41.2 |
| 34 | 2015 | Allegany | All heart disease | 4.852811 | Ages 35-64 years | Overall | Overall | 35510 | 47.9 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10075 | 2012 | Worcester | Heart failure | 5.322522 | Ages 65 years and older | White | Overall | 47380 | 55.3 |
| 10076 | 2015 | Worcester | Heart failure | 5.322034 | Ages 65 years and older | White | Overall | 54118 | 48.6 |
| 10077 | 2011 | Worcester | Heart failure | 5.378052 | Ages 65 years and older | White | Overall | 45459 | 52.1 |
| 10078 | 2013 | Worcester | Heart failure | 5.276072 | Ages 65 years and older | White | Overall | 47851 | 51.9 |
| 10079 | 2019 | Worcester | Heart failure | 5.352806 | Ages 65 years and older | White | Overall | 56536 | 54.3 |
8820 rows × 9 columns
Primary Analysis¶
The first step before performing any type of Machine Learning Model/Analysis is to define the features and target variable for the model.
In the case of our master_df, all variables other than Mortality are feature variables, and Mortality itself is the target variable. Accordingly, X is set to the features, while y is set to the target.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
random_state= 42
y = master_df['Mortality']
X = master_df.drop(['Mortality'], axis=1)
With our features and target set, it is now time to split the dataset. This step is key, as we don't want the model to overfit on all the data; it sets aside a portion of the data for testing once the model has been trained. By evaluating performance on unseen data, we can see whether the model is generalizing well.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
X_train
| Year | County | Disease Type | Age_Group | Race | Sex | Income | Physical Activity | |
|---|---|---|---|---|---|---|---|---|
| 3532 | 2011 | Dorchester | Cardiovascular disease (CVD) | Ages 35-64 years | Black (Non-Hispanic) | Overall | 37570 | 45.1 |
| 539 | 2015 | Anne Arundel | Heart failure | Ages 35-64 years | Overall | Overall | 61255 | 54.9 |
| 9295 | 2017 | Wicomico | Heart failure | Ages 35-64 years | Black (Non-Hispanic) | Overall | 40172 | 49.3 |
| 9595 | 2012 | Wicomico | Heart failure | Ages 65 years and older | White | Overall | 34800 | 50.3 |
| 8912 | 2017 | Washington | All heart disease | Ages 65 years and older | Overall | Overall | 44187 | 49.7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6574 | 2015 | Montgomery | All heart disease | Ages 65 years and older | Overall | Men | 79358 | 56.8 |
| 5941 | 2019 | Kent | All heart disease | Ages 35-64 years | Hispanic | Overall | 60407 | 52.3 |
| 6170 | 2011 | Kent | Cardiovascular disease (CVD) | Ages 65 years and older | Black (Non-Hispanic) | Overall | 46743 | 51.7 |
| 1010 | 2012 | Calvert | Coronary heart disease (CHD) | Ages 35-64 years | Overall | Overall | 53243 | 51.0 |
| 8320 | 2012 | Talbot | All stroke | Ages 35-64 years | Black (Non-Hispanic) | Overall | 59401 | 55.3 |
7056 rows × 8 columns
As mentioned earlier, data will be transformed, scaled and encoded to help with Machine learning. The following features will be modified here:
- County (Categorical, One-Hot Encoded)
- Disease Type (Categorical, One-Hot Encoded)
- Age Group (Categorical, One-Hot Encoded)
- Race (Categorical, One-Hot Encoded)
- Sex (Categorical, One-Hot Encoded)
- Income (Continuous, Standardized)
- Physical Activity (Continuous, Standardized)
- Mortality (Continuous, Standardized)
We chose to one-hot encode the categorical features for two reasons. First, ML models usually only work on numerical data, so raw categorical values like Disease Type, Race, Age and Sex cannot be used directly. Second, since the categorical features have no inherent order, one-hot encoding ensures the encoded variables are not artificially ordered either.
One Hot Encoding and Standardization will only be fitted on the X_train data since we don't want information leaking into the X_test dataset. Once fitted, both X_train and X_test will be transformed based on the fitted values from X_train.
encoder = OneHotEncoder(categories='auto', sparse_output=False)
scaler = StandardScaler()
# fit the encoder and scaler on the training data only, then transform X_train
encoder.fit(X_train[['County', 'Disease Type', 'Sex', 'Age_Group', 'Race']])
encoded = encoder.transform(X_train[['County', 'Disease Type', 'Sex', 'Age_Group', 'Race']])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['County', 'Disease Type', 'Sex', 'Age_Group', 'Race']))
scaler.fit(X_train[['Income', 'Physical Activity']])
scaled = scaler.transform(X_train[['Income', 'Physical Activity']])
scaled_df = pd.DataFrame(scaled, columns=['Income_scaled', 'Physical Activity_scaled'])
X_train = pd.concat([encoded_df, scaled_df], axis=1)
# transform X_test using the statistics learned from X_train
encoded = encoder.transform(X_test[['County', 'Disease Type', 'Sex', 'Age_Group', 'Race']])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['County', 'Disease Type', 'Sex', 'Age_Group', 'Race']))
scaled = scaler.transform(X_test[['Income', 'Physical Activity']])
scaled_df = pd.DataFrame(scaled, columns=['Income_scaled', 'Physical Activity_scaled'])
X_test = pd.concat([encoded_df, scaled_df], axis=1)
X_train, X_test
( County_Allegany County_Anne Arundel County_Calvert County_Caroline \
0 0.0 0.0 0.0 0.0
1 0.0 1.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0
... ... ... ... ...
7051 0.0 0.0 0.0 0.0
7052 0.0 0.0 0.0 0.0
7053 0.0 0.0 0.0 0.0
7054 0.0 0.0 1.0 0.0
7055 0.0 0.0 0.0 0.0
County_Carroll County_Cecil County_Charles County_Dorchester \
0 0.0 0.0 0.0 1.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0
... ... ... ... ...
7051 0.0 0.0 0.0 0.0
7052 0.0 0.0 0.0 0.0
7053 0.0 0.0 0.0 0.0
7054 0.0 0.0 0.0 0.0
7055 0.0 0.0 0.0 0.0
County_Frederick County_Garrett ... Sex_Women \
0 0.0 0.0 ... 0.0
1 0.0 0.0 ... 0.0
2 0.0 0.0 ... 0.0
3 0.0 0.0 ... 0.0
4 0.0 0.0 ... 0.0
... ... ... ... ...
7051 0.0 0.0 ... 0.0
7052 0.0 0.0 ... 0.0
7053 0.0 0.0 ... 0.0
7054 0.0 0.0 ... 0.0
7055 0.0 0.0 ... 0.0
Age_Group_Ages 35-64 years Age_Group_Ages 65 years and older \
0 1.0 0.0
1 1.0 0.0
2 1.0 0.0
3 0.0 1.0
4 0.0 1.0
... ... ...
7051 0.0 1.0
7052 1.0 0.0
7053 0.0 1.0
7054 1.0 0.0
7055 1.0 0.0
Race_Asian/Pacific Islander Race_Black (Non-Hispanic) Race_Hispanic \
0 0.0 1.0 0.0
1 0.0 0.0 0.0
2 0.0 1.0 0.0
3 0.0 0.0 0.0
4 0.0 0.0 0.0
... ... ... ...
7051 0.0 0.0 0.0
7052 0.0 0.0 1.0
7053 0.0 1.0 0.0
7054 0.0 0.0 0.0
7055 0.0 1.0 0.0
Race_Overall Race_White Income_scaled Physical Activity_scaled
0 0.0 0.0 -1.018148 -0.754556
1 1.0 0.0 0.848683 0.952862
2 0.0 0.0 -0.813061 -0.022805
3 0.0 1.0 -1.236477 0.151421
4 1.0 0.0 -0.496602 0.046885
... ... ... ... ...
7051 1.0 0.0 2.275546 1.283892
7052 0.0 0.0 0.781844 0.499873
7053 0.0 0.0 -0.295140 0.395338
7054 1.0 0.0 0.217184 0.273379
7055 0.0 0.0 0.702552 1.022552
[7056 rows x 38 columns],
County_Allegany County_Anne Arundel County_Calvert County_Caroline \
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 1.0 0.0
3 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0
... ... ... ... ...
1759 0.0 0.0 0.0 0.0
1760 1.0 0.0 0.0 0.0
1761 0.0 0.0 0.0 0.0
1762 0.0 0.0 0.0 0.0
1763 0.0 1.0 0.0 0.0
County_Carroll County_Cecil County_Charles County_Dorchester \
0 0.0 0.0 0.0 1.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 0.0 0.0 1.0 0.0
... ... ... ... ...
1759 0.0 0.0 0.0 0.0
1760 0.0 0.0 0.0 0.0
1761 0.0 0.0 0.0 0.0
1762 0.0 0.0 1.0 0.0
1763 0.0 0.0 0.0 0.0
County_Frederick County_Garrett ... Sex_Women \
0 0.0 0.0 ... 0.0
1 1.0 0.0 ... 0.0
2 0.0 0.0 ... 0.0
3 0.0 0.0 ... 0.0
4 0.0 0.0 ... 0.0
... ... ... ... ...
1759 0.0 0.0 ... 0.0
1760 0.0 0.0 ... 0.0
1761 0.0 0.0 ... 0.0
1762 0.0 0.0 ... 0.0
1763 0.0 0.0 ... 0.0
Age_Group_Ages 35-64 years Age_Group_Ages 65 years and older \
0 1.0 0.0
1 1.0 0.0
2 0.0 1.0
3 1.0 0.0
4 0.0 1.0
... ... ...
1759 1.0 0.0
1760 1.0 0.0
1761 1.0 0.0
1762 0.0 1.0
1763 0.0 1.0
Race_Asian/Pacific Islander Race_Black (Non-Hispanic) Race_Hispanic \
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
4 0.0 0.0 1.0
... ... ... ...
1759 0.0 0.0 0.0
1760 0.0 0.0 1.0
1761 0.0 0.0 0.0
1762 0.0 1.0 0.0
1763 0.0 0.0 1.0
Race_Overall Race_White Income_scaled Physical Activity_scaled
0 0.0 1.0 -0.715561 -1.869604
1 1.0 0.0 0.708700 1.231624
2 0.0 1.0 0.140887 -0.406103
3 1.0 0.0 -1.185008 -0.667443
4 0.0 0.0 -0.053244 -0.353835
... ... ... ... ...
1759 0.0 1.0 -0.269287 0.499873
1760 0.0 0.0 -1.180515 -0.266722
1761 1.0 0.0 1.635929 2.433785
1762 0.0 0.0 0.355906 -0.510639
1763 0.0 0.0 0.848683 0.952862
[1764 rows x 38 columns])
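As an aside, the fit-on-train / transform-both pattern above can be bundled into a single ColumnTransformer so the encoder and scaler can never be fitted on different data. This is only a sketch of an alternative, using the original (pre-encoding) column names; raw_X_train and raw_X_test are hypothetical names for the frames as they were before the cell above.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
categorical = ['County', 'Disease Type', 'Sex', 'Age_Group', 'Race']
continuous = ['Income', 'Physical Activity']
preprocess = ColumnTransformer([
    ('onehot', OneHotEncoder(sparse_output=False), categorical),
    ('scale', StandardScaler(), continuous),
])
# fit on the training frame only, then apply the same transform to both splits
# X_train_enc = preprocess.fit_transform(raw_X_train)
# X_test_enc = preprocess.transform(raw_X_test)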
ML Model¶
We decided on performing regression because it is well-suited for predicting continuous outcomes and uncovering relationships between variables. This is a perfect fit since Mortality is continuous.
Since Mortality is a continuous value, a regression model can predict the mortality of an input record consisting of Race, Age, Sex, County, Disease Type, Income and Physical Activity.
By modeling these relationships, regression will help us forecast trends as well as understand which features contribute most to mortality.
ML Models: Linear Regression¶
# imports
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from sklearn.model_selection import cross_val_score
Linear Regression is the simplest and most interpretable model, making it an ideal starting point for investigating mortality. Linear Regression assumes a simple linear relationship between the features and the target, resulting in easily interpretable results.
If this model performs well and has desired accuracy and error thresholds, it may eliminate the need to explore more complex models, saving time and computational resources.
Additionally, starting with a simple model helps establish a baseline for performance and provides valuable insights into the data before considering advanced approaches.
# Init model + train
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()
After training the model, we now need to evaluate the results. This can be done by predicting values with the model and comparing them to the true values. Metrics like MAE, MSE and R² are useful here.
# Predict on the test set
y_pred = model.predict(X_test)
# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R^2: {r2}")
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
print("Cross Validation Mean R²:", scores.mean())
MAE: 0.484624365460802 MSE: 0.5066540990984603 R^2: 0.8271614608769846 Cross Validation Mean R²: 0.8347852599774859
The Linear Regression model achieves a decent fit with an R² of 0.827 and a cross-validated mean R² of 0.835, but the MAE of 0.48 indicates notable deviations between predictions and actual values. This model is a good baseline, but the MAE and MSE values indicate room for improvement with more complex models, suggesting the data is not entirely linear.
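Because the target was log-transformed, the MAE can also be read back on the original per-100,000 scale as a multiplicative error; a small sketch of this interpretation, using the mae computed above:
# exp(MAE in log units) ≈ the typical ratio between predicted and actual mortality
print(f"Typical multiplicative error: {np.exp(mae):.2f}x")  # ≈ 1.6x, i.e. off by roughly 60%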
We can now try to visualize the results of the model and compare them to the actual values. To do this, the following plots are produced:
- Residual Plot
- Residual Histogram
- Actual Vs. Predicted
import matplotlib.pyplot as plt
import statsmodels.api as sm
residuals = y_test - y_pred
plt.scatter(y_test, residuals, alpha=0.5)
# plotting y = 0
plt.axhline(y=0, color='r',)
plt.xlabel('Observed Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
The resulting residual plot shows that the fitted Linear Regression model is quite weak. In an ideal residual plot, the residuals should be centered around the line y = 0 with constant variance and should not display any pattern or trend.
Neither requirement is met in the graphed plot. There appears to be some distortion of the data at x = 6, which violates the constant-variance requirement. Furthermore, for lower values (x < 4) the data points are no longer centered on zero, and the residuals appear to follow a non-linear curve across the range of values.
All of this leads us to believe that this is a poor linear regression.
plt.hist(y_train - model.predict(X_train))
plt.xlabel('Residuals')
plt.title('Histogram of Residuals')
plt.show()
The histogram of residuals tells the same story. In an ideal linear regression model, the residuals should closely mirror a normal distribution (bell curve).
In the graph above, however, this doesn't seem to be the case for residuals > 1. This is again likely because the data is not well suited to a linear regression task.
plt.scatter(y_test, y_pred, alpha=0.5)
# prints the line y = x
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linewidth=2)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.show()
Finally, we can also directly compare the model's predictions to the actual values. By plotting actual vs. predicted, we can see just how far off the model's predictions can be for the same input.
For a good fit, the predicted and actual values should correspond closely, which would show up as data points following the line y = x (in red). However, this is not the case with Linear Regression.
ML Models: Decision Tree Regressor¶
The previous ML model indicated that our data might be more complex than originally thought. To better address this issue, a more complex model might be needed. Decision Trees are one such model.
Decision Trees let us fit a non-linear model, which greatly increases the complexity. Because they 'split' on individual features, they can capture far more complex behaviour in the dataset. At the same time, they are fairly resistant to outliers, since splits are chosen by how well they partition the data, minimizing error metrics such as variance (mean squared error) for regression.
However, this type of model is extremely prone to overfitting, since it can simply 'learn' the best splits of the training data or split indefinitely until it captures every record. For this reason, we begin by limiting the max_depth of the Decision Tree to just 10, which is much lower than the number of features in X.
# imports
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
# Init model + train
model = DecisionTreeRegressor(random_state=random_state, max_depth=10)
model.fit(X_train, y_train)
DecisionTreeRegressor(max_depth=10, random_state=42)
After training the model, we now need to evaluate the results. This can be done by predicting values using the model and comparing them to the true values. Metrics like MAE, MSE and R^2 are useful
y_pred = model.predict(X_test)
# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R²: {r2}")
MAE: 0.20990845554995466 MSE: 0.16233456523910997 R²: 0.9446216478756939
The Decision Tree Regression model achieves a much better fit with an R² of 0.945 and a cross-validated mean R² of 0.927. Furthermore, the MAE and MSE values have dropped to 0.210 and 0.162 respectively, meaning there is less deviation between the model's predictions and the true values than with linear regression.
Since the mean R^2 that comes from cross validation is also sufficiently high, it is safe to also conclude that the model generalizes pretty well over the dataset.
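The cross-validated mean R² cited above can be reproduced with cross_val_score (already imported for the Linear Regression step); a brief sketch:
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
print("Cross Validation Mean R²:", scores.mean())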
Like before, we now visualize the results of the model and compare them to the actual values. To do this, the following plots are produced:
- Residual Plot
- Actual Vs. Predicted
Additionally, Decision Trees allow us to view which features contribute the most, so a feature importance graph will also be created.
residuals = y_test - y_pred
plt.scatter(y_test, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Observed Values')
plt.ylabel('Residuals')
plt.title('Residual Analysis')
plt.show()
The residual plot performs substantially better than that of the Linear Regression. It appears that the values are distributed with almost constant variance around y = 0 (red line) and that there is no inherent trend in this plot. There still are a few outliers, but overall the residual plots show encouraging signs.
The main concern in this residual plot is the 'stepped' nature of the data. This, however, is to be expected from a Decision Tree simply because of the way it is trained: the model makes discrete 'splits' on each feature, so it can only output a limited set of distinct predictions, which causes the stepped behavior.
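One quick way to see this piecewise-constant behavior is to count the distinct values the tree predicts; with max_depth=10 there are at most 2^10 = 1024 leaves, so at most 1024 distinct predictions. A sketch using the y_pred from above:
print("Distinct predicted values:", np.unique(y_pred).size)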
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linewidth=2)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.show()
A similar story is present in the predicted vs. actual plot. There is a huge improvement over Linear Regression, as the values mostly follow the red line (y = x), indicating that the model predicts in line with the actual values. But again, the data itself looks 'stepped', which, as explained earlier, is likely a result of the model itself.
# Get contribution of each feature
feature_importances = model.feature_importances_
plt.figure(figsize=(12, 6))
sns.barplot(x=feature_importances, y=X_train.columns)
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
Now looking at Feature Importances, we can see a few features have a high impact on the target variable. Firstly, we can see that Age plays a big role in this Model, and as explained in EDA, this is expected as younger people are less likely to have high mortality.
Next, it seems that the type of heart disease also has a say in the mortality rates. Diseases like Stroke and Heart Failure have a higher contribution than CVD and CHD. This interaction might be a result of certain disease types being more fatal than others.
Race and Income also play a role according to the graph, as do certain counties such as Garrett County and Kent County.
ML Models: Random Forest Regressor¶
A key issue faced with the Decision Tree was the stepped nature of its predictions. This can be addressed with Random Forests, an ensemble method that trains many Decision Trees on bootstrapped samples of the data and averages their predictions to produce a smoother, more robust model. Again, we set max_depth to 10 for each tree to ensure that the model doesn't overfit.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
rf_model = RandomForestRegressor(
n_estimators=100,
max_depth=10,
random_state=42,
n_jobs=-1
)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mse:.2f}")
print(f"MSE: {mse:.2f}")
print(f"R^2: {r2:.2f}")
scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='r2')
print("Cross Validation Mean R^2:", scores.mean())
MAE: 0.14 MSE: 0.14 R^2: 0.95 Cross Validation Mean R^2: 0.9488368648350983
The MAE, MSE, R² and cross-validated mean R² values all improve on the single Decision Tree, so let's visualize the results.
residuals = y_test - y_pred
plt.scatter(y_test, residuals, alpha=0.5)
plt.axhline(y=0, color='r')
plt.xlabel('Observed Values')
plt.ylabel('Residuals')
plt.title('Residual Analysis')
plt.show()
There doesn't seem to be a great improvement in the residual patterns, with the exception of the 'stepped' pattern being gone. This is exactly what the Random Forest model set out to do, which is encouraging.
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linewidth=2)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.show()
Again, there are incremental improvements in the fit of the plots, but the main takeaway is that the Random Forest model was able to eliminate the 'stepped' pattern present in the Decision Tree. Like the Decision Tree, the Random Forest is a good model.
feature_importances = rf_model.feature_importances_
# Create a bar plot of feature importances
plt.figure(figsize=(12, 6))
sns.barplot(x=feature_importances, y=X_train.columns)
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
It appears that the Random Forest has been able to learn from more features than the Decision Tree. New importance is assigned to the older age group and Physical Activity, while the feature importances already identified by the Decision Tree are strengthened.
Improving Random Forest with Grid Search¶
Grid Search is a technique for systematically trying out hyperparameter values and keeping the one that performs best. In particular, we are interested in whether changing the max_depth of the RF model improves it, so we experiment with various depths and select the best result.
from sklearn.model_selection import GridSearchCV
param_grid_rf = {'max_depth': [5, 10, 15, 20, 25, 30, 40, 50]}
rf = RandomForestRegressor(n_estimators=100, random_state=42)
grid_search_rf = GridSearchCV(rf, param_grid_rf, scoring='neg_mean_squared_error', cv=5, n_jobs = -1)
grid_search_rf.fit(X_train, y_train)
print("Best max_depth for Random Forest:", grid_search_rf.best_params_)
Best max_depth for Random Forest: {'max_depth': 30}
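Before committing to a single depth, it is also informative to see how the cross-validated score changes across the grid; a sketch using GridSearchCV's cv_results_ attribute (scores are negative MSE, so values closer to zero are better):
cv_results = pd.DataFrame(grid_search_rf.cv_results_)
print(cv_results[['param_max_depth', 'mean_test_score']])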
Now we train a new RF with the best hyperparameters.
rf = RandomForestRegressor(
n_estimators=100,
max_depth=grid_search_rf.best_params_['max_depth'],
random_state=42,
n_jobs=-1
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
feature_importances = rf.feature_importances_
feature_importance_df_heat = pd.DataFrame({
'Feature': X_train.columns,
'Importance': feature_importances
})
plt.figure(figsize=(12, 6))
sns.barplot(x=feature_importances, y=X_train.columns)
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
residuals = y_test - y_pred
plt.scatter(y_test, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Observed Values')
plt.ylabel('Residuals')
plt.title('Residual Analysis')
plt.show()
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linewidth=2)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.show()
Mean Squared Error (MSE): 0.06 Root Mean Squared Error (RMSE): 0.25 R² Score: 0.98
All the metrics (MSE, R^2) indicate that a RF with max_depth = 30 is the best fitting model so far.
The plots indicate that the performance of the model is marginally better, with both residual and predicted vs. actual plots showing a non-stepped prediction pattern.
Further, the feature importances have been strengthened even more, confirming the contributions of the existing features yet again.
Insights and Conclusions¶
import shap
import pandas as pd
feature_importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({ 'Feature': X_train.columns, 'Importance': feature_importances }).sort_values(by='Importance', ascending=False)
print("\nFeature Importances:")
print(feature_importance_df)
Feature Importances:
Feature Importance
29 Age_Group_Ages 35-64 years 0.352041
30 Age_Group_Ages 65 years and older 0.244830
25 Disease Type_Heart failure 0.077210
22 Disease Type_All stroke 0.058420
36 Income_scaled 0.039931
31 Race_Asian/Pacific Islander 0.031622
33 Race_Hispanic 0.028190
24 Disease Type_Coronary heart disease (CHD) 0.022373
23 Disease Type_Cardiovascular disease (CVD) 0.021986
21 Disease Type_All heart disease 0.019752
9 County_Garrett 0.011778
12 County_Kent 0.010332
32 Race_Black (Non-Hispanic) 0.010093
17 County_Talbot 0.007079
20 County_Worcester 0.006817
37 Physical Activity_scaled 0.006670
15 County_Queen Anne's 0.006414
19 County_Wicomico 0.004864
28 Sex_Women 0.004829
34 Race_Overall 0.004661
0 County_Allegany 0.003931
26 Sex_Men 0.003490
18 County_Washington 0.003228
35 Race_White 0.002980
3 County_Caroline 0.002714
14 County_Prince George's 0.002371
5 County_Cecil 0.002099
7 County_Dorchester 0.001947
27 Sex_Overall 0.001022
13 County_Montgomery 0.000971
10 County_Harford 0.000958
11 County_Howard 0.000867
1 County_Anne Arundel 0.000705
8 County_Frederick 0.000648
16 County_Somerset 0.000634
2 County_Calvert 0.000573
6 County_Charles 0.000524
4 County_Carroll 0.000444
feat_importances = pd.Series(rf.feature_importances_, index = X_train.columns)
feat_importances.nlargest(12).plot(kind = 'barh')
<Axes: >
For models like RandomForestRegressor, feature importance is tracked but the direction of each feature's effect is not. We wanted to find the direction of the relationship each feature has with mortality, so we utilized shap.TreeExplainer to visualize this.
ex = shap.TreeExplainer(rf)
shap_values = ex.shap_values(X_test)
shap.summary_plot(shap_values, X_test)
#positive correlations are blue to red left to right
#negative correlations are red to blue
#the range shows the amount of impact
This graph shows, through color and position, how each feature on the left relates to the mortality rate. For continuous features, a positive relationship appears as a shift from blue to red moving left to right, a negative relationship as a shift from red to blue, and purple points mark the transition. For categorical (one-hot encoded) features, the blue points cluster near 0 (no impact), since not being in the category is not what we are focusing on; the red points show the impact of being in the category. Red points to the right of 0 therefore indicate a positive association, red points to the left a negative one, and the distance from the center line corresponds to the size of the impact of being in the category.
From the graph we see that being in the 35-64 age group is negatively associated with mortality, while being 65 or older is positively associated. The points for the 35-64 group also lie further from the center line than those for the 65+ group, meaning the younger group's effect is stronger. While we aren't completely sure why the age features look different from the other categorical variables, we suspect it is due to the extremity of their values.
Cardiovascular diseases and heart diseases are positively correlated with mortality rates. Income, on the other hand, shows a strong negative correlation with mortality, with physical activity having a much weaker negative correlation.
For gender, women are negatively correlated with mortality rates, whereas men are positively correlated, with the effect for women being stronger than that for men. When analyzing race, Black individuals have a stronger positive correlation with mortality rates compared to the weaker negative correlation observed for White individuals.
Among Maryland’s counties, the top four most influential counties show a positive correlation with mortality rates, with Garrett County exhibiting the strongest positive correlation. Other counties in the state do not show significant correlations.
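To drill into a single continuous feature, a SHAP dependence plot can be used alongside the summary plot; a minimal sketch for the scaled income column, reusing the shap_values and X_test objects from above:
shap.dependence_plot('Income_scaled', shap_values, X_test)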
BONUS ML Model: Neural Network¶
We also decided to see whether attacking this problem with a neural network (NN) is feasible. To do this, we implement a multi-layer feed-forward network trained with the Adam optimizer.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
batch_size = 32
train_features = torch.tensor(X_train.values, dtype=torch.float32)
train_targets = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
test_features = torch.tensor(X_test.values, dtype=torch.float32)
test_targets = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)
train_dataset = TensorDataset(train_features, train_targets)
test_dataset = TensorDataset(test_features, test_targets)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
feat_dim = X_train.shape[1]
The network has an input layer, one hidden layer with 128 units, and a single output unit.
class MLPModel(nn.Module):
def __init__(self, feat_dim):
super(MLPModel, self).__init__()
self.fc1 = nn.Linear(feat_dim, 128)
self.fc2 = nn.Linear(128, 1)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
model = MLPModel(feat_dim)
Using a training method similar to HW5:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())
epochs = 100
losses = []
for epoch in range(epochs):
running_loss = 0.0
for batch_idx, (data, targets) in enumerate(train_loader):
pred = model(data)
loss = criterion(pred, targets)
optimizer.zero_grad()
loss.backward()
optimizer.step()
losses.append(loss.item())
print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')
print('Finished Training')
Epoch 1/100, Loss: 0.6180262565612793 Epoch 2/100, Loss: 0.3505920171737671 Epoch 3/100, Loss: 0.7918599843978882 Epoch 4/100, Loss: 0.1303536742925644 Epoch 5/100, Loss: 0.05712771415710449 Epoch 6/100, Loss: 0.03171912580728531 Epoch 7/100, Loss: 0.12510550022125244 Epoch 8/100, Loss: 0.08974096924066544 Epoch 9/100, Loss: 0.0300369244068861 Epoch 10/100, Loss: 0.04509372636675835 Epoch 11/100, Loss: 0.0794365257024765 Epoch 12/100, Loss: 0.029062602669000626 Epoch 13/100, Loss: 0.03448960930109024 Epoch 14/100, Loss: 0.013601629994809628 Epoch 15/100, Loss: 0.025212010368704796 Epoch 16/100, Loss: 0.007741420064121485 Epoch 17/100, Loss: 0.01651059091091156 Epoch 18/100, Loss: 0.010084506124258041 Epoch 19/100, Loss: 0.010977290570735931 Epoch 20/100, Loss: 0.010945213958621025 Epoch 21/100, Loss: 0.007066465448588133 Epoch 22/100, Loss: 0.016459718346595764 Epoch 23/100, Loss: 0.0425255186855793 Epoch 24/100, Loss: 0.04082633927464485 Epoch 25/100, Loss: 0.014609578996896744 Epoch 26/100, Loss: 0.006224800366908312 Epoch 27/100, Loss: 0.007599109783768654 Epoch 28/100, Loss: 0.009256904944777489 Epoch 29/100, Loss: 0.012539909221231937 Epoch 30/100, Loss: 0.00674212072044611 Epoch 31/100, Loss: 0.009596269577741623 Epoch 32/100, Loss: 0.014535211957991123 Epoch 33/100, Loss: 0.006218832917511463 Epoch 34/100, Loss: 0.014108923263847828 Epoch 35/100, Loss: 0.010604849085211754 Epoch 36/100, Loss: 0.012138333171606064 Epoch 37/100, Loss: 0.004690488800406456 Epoch 38/100, Loss: 0.008386101573705673 Epoch 39/100, Loss: 0.005343279801309109 Epoch 40/100, Loss: 0.006853851955384016 Epoch 41/100, Loss: 0.02031724527478218 Epoch 42/100, Loss: 0.0057910531759262085 Epoch 43/100, Loss: 0.0018556591821834445 Epoch 44/100, Loss: 0.0031365351751446724 Epoch 45/100, Loss: 0.005620170850306749 Epoch 46/100, Loss: 0.006092675030231476 Epoch 47/100, Loss: 0.006983045022934675 Epoch 48/100, Loss: 0.013469272293150425 Epoch 49/100, Loss: 0.006912410259246826 Epoch 50/100, Loss: 0.011649376712739468 Epoch 51/100, Loss: 0.003585381433367729 Epoch 52/100, Loss: 0.012437469325959682 Epoch 53/100, Loss: 0.00227912119589746 Epoch 54/100, Loss: 0.004715257789939642 Epoch 55/100, Loss: 0.02325817383825779 Epoch 56/100, Loss: 0.002720362041145563 Epoch 57/100, Loss: 0.0037705234717577696 Epoch 58/100, Loss: 0.005242420360445976 Epoch 59/100, Loss: 0.010716262273490429 Epoch 60/100, Loss: 0.007987739518284798 Epoch 61/100, Loss: 0.002992952475324273 Epoch 62/100, Loss: 0.005613104905933142 Epoch 63/100, Loss: 0.00384236266836524 Epoch 64/100, Loss: 0.004477276466786861 Epoch 65/100, Loss: 0.0036773867905139923 Epoch 66/100, Loss: 0.003930995706468821 Epoch 67/100, Loss: 0.005508327390998602 Epoch 68/100, Loss: 0.008991721086204052 Epoch 69/100, Loss: 0.008103292435407639 Epoch 70/100, Loss: 0.0033096112310886383 Epoch 71/100, Loss: 0.006649049464613199 Epoch 72/100, Loss: 0.003951608203351498 Epoch 73/100, Loss: 0.004014777019619942 Epoch 74/100, Loss: 0.006757713854312897 Epoch 75/100, Loss: 0.00431868527084589 Epoch 76/100, Loss: 0.006432480178773403 Epoch 77/100, Loss: 0.009876019321382046 Epoch 78/100, Loss: 0.0029915091581642628 Epoch 79/100, Loss: 0.0035177061799913645 Epoch 80/100, Loss: 0.008354071527719498 Epoch 81/100, Loss: 0.002175804227590561 Epoch 82/100, Loss: 0.005512255243957043 Epoch 83/100, Loss: 0.005264884792268276 Epoch 84/100, Loss: 0.013264141976833344 Epoch 85/100, Loss: 0.00263501750305295 Epoch 86/100, Loss: 0.009585960768163204 Epoch 87/100, Loss: 0.0021744996774941683 Epoch 88/100, 
Loss: 0.004340585321187973 Epoch 89/100, Loss: 0.003258319105952978 Epoch 90/100, Loss: 0.004269901663064957 Epoch 91/100, Loss: 0.0020748793613165617 Epoch 92/100, Loss: 0.003359935013577342 Epoch 93/100, Loss: 0.0030603569466620684 Epoch 94/100, Loss: 0.00942076276987791 Epoch 95/100, Loss: 0.0069447108544409275 Epoch 96/100, Loss: 0.003604387864470482 Epoch 97/100, Loss: 0.0056814649142324924 Epoch 98/100, Loss: 0.005357019137591124 Epoch 99/100, Loss: 0.004147527739405632 Epoch 100/100, Loss: 0.004988198168575764 Finished Training
We now plot the loss over time. This code is adapted from HW5.
# average MSE on the held-out test set (no gradients needed for evaluation)
test_loss = 0.0
with torch.no_grad():
    for batch_idx, (data, targets) in enumerate(test_loader):
        loss = criterion(model(data), targets)
        test_loss += loss.item()
test_loss = test_loss / len(test_loader)
# average MSE on the training set, for comparison
train_loss = 0.0
with torch.no_grad():
    for batch_idx, (data, targets) in enumerate(train_loader):
        loss = criterion(model(data), targets)
        train_loss += loss.item()
train_loss = train_loss / len(train_loader)
print("Train Loss:", train_loss)
print("Test Loss:", test_loss)
plt.plot(losses)
plt.title("Losses over Time")
plt.xlabel("Time")
plt.ylabel("Loss (MSE)")
plt.show()
Train Loss: 0.00469449505934753 Test Loss: 0.0072928791133953
The NN trains well, as the loss decreases almost immediately to a very small value. However, we must exercise caution, as the NN might be overfitting. Unfortunately, we were not able to find effective ways of interpreting what the NN has learned and commenting on it.
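To put the NN on the same footing as the earlier regressors, its test-set R² can be computed from the predictions; a short sketch reusing the tensors and the r2_score import from earlier:
with torch.no_grad():
    nn_test_pred = model(test_features).squeeze(1).numpy()
print("NN Test R²:", r2_score(y_test, nn_test_pred))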