# Data Storytelling

 - Explore the data below (ie features, stats, and visualizations).
 - Develop 2-3 questions/theses that might be addressed by these data.
 - Clean and combine data sources.
 - Choose a model. What aspects of the model will help support your thesis or answer your question?
 - Assess and interpret your model.



In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

[data repository](https://github.com/GettysburgDataScience/datasets/tree/main/us_political)

In [47]:
elec_df = pd.read_csv('https://raw.githubusercontent.com/GettysburgDataScience/datasets/refs/heads/main/us_political/countypres_2000-2024.csv')
health_df = pd.read_csv('https://raw.githubusercontent.com/GettysburgDataScience/datasets/refs/heads/main/us_political/countyHealth_2025.csv', skiprows = [1])
agesex_df = pd.read_csv('https://raw.githubusercontent.com/GettysburgDataScience/datasets/refs/heads/main/us_political/stats_america/Population-by-Age-and-Sex/Population%20by%20Age%20and%20Sex%20-%20US%2C%20States%2C%20Counties.csv')
race_df = pd.read_csv('https://raw.githubusercontent.com/GettysburgDataScience/datasets/refs/heads/main/us_political/stats_america/Population-by-Race/Population%20by%20Race%20-%20US%2C%20States%2C%20Counties.csv')
social_df = pd.read_csv('https://raw.githubusercontent.com/GettysburgDataScience/datasets/refs/heads/main/us_political/stats_america/Social-Context/Social%20Context.csv')
dev_df = pd.read_csv('https://raw.githubusercontent.com/GettysburgDataScience/datasets/refs/heads/main/us_political/stats_america/Metrics-For-Development/Metrics%20For%20Development.csv')


## Election Data

In [48]:
elec_df.head()

Unnamed: 0,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,totalvotes,version,mode
0,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,AL GORE,DEMOCRAT,4942,17208,20250821,TOTAL
1,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,11993,17208,20250821,TOTAL
2,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,OTHER,OTHER,113,17208,20250821,TOTAL
3,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,RALPH NADER,GREEN,160,17208,20250821,TOTAL
4,2000,ALABAMA,AL,BALDWIN,1003.0,US PRESIDENT,AL GORE,DEMOCRAT,13997,56480,20250821,TOTAL


In [49]:
columns_to_keep = ['year', 'county_fips', 'candidate', 'candidatevotes', 'totalvotes']
elec_R_df = elec_df.query('party == "REPUBLICAN" and mode.str.startswith("TOTAL")')[columns_to_keep]
elec_D_df = elec_df.query('party == "DEMOCRAT" and mode.str.startswith("TOTAL")')[columns_to_keep]

In [50]:
elections_df = pd.merge(left = elec_D_df, right = elec_R_df,
         how = 'inner', on = ['year', 'county_fips'], suffixes = ['_D', '_R'])

elections_df.drop(columns = 'totalvotes_D', inplace = True)
elections_df.rename(columns = {'totalvotes_R':'totalvotes'})

elections_df.head()

Unnamed: 0,year,county_fips,candidate_D,candidatevotes_D,candidate_R,candidatevotes_R,totalvotes_R
0,2000,1001.0,AL GORE,4942,GEORGE W. BUSH,11993,17208
1,2000,1003.0,AL GORE,13997,GEORGE W. BUSH,40872,56480
2,2000,1005.0,AL GORE,5188,GEORGE W. BUSH,5096,10395
3,2000,1007.0,AL GORE,2710,GEORGE W. BUSH,4273,7101
4,2000,1009.0,AL GORE,4977,GEORGE W. BUSH,12667,17973


In [51]:
fips_df = elec_df[['state_po', 'county_name', 'county_fips']].drop_duplicates().reset_index(drop = True)
fips_df.head()

Unnamed: 0,state_po,county_name,county_fips
0,AL,AUTAUGA,1001.0
1,AL,BALDWIN,1003.0
2,AL,BARBOUR,1005.0
3,AL,BIBB,1007.0
4,AL,BLOUNT,1009.0


In [52]:
## County Health Data

In [53]:
health_df.head(3)

Unnamed: 0,State FIPS Code,County FIPS Code,5-digit FIPS Code,State Abbreviation,Name,Release Year,County Clustered (Yes=1/No=0),Premature Death raw value,Premature Death numerator,Premature Death denominator,...,% Rural raw value,% Rural numerator,% Rural denominator,% Rural CI low,% Rural CI high,Population raw value,Population numerator,Population denominator,Population CI low,Population CI high
0,0,0,0,US,United States,2025,,8351.736549,4763989.0,925367214.0,...,0.200031,66300254.0,331449281.0,,,334914895.0,,,,
1,1,0,1000,AL,Alabama,2025,,11853.247248,102760.0,13958454.0,...,0.422628,2123399.0,5024279.0,,,5108468.0,,,,
2,1,1,1001,AL,Autauga County,2025,1.0,9938.263382,1008.0,163064.0,...,0.406768,23920.0,58805.0,,,60342.0,,,,


In [54]:
def print_cols(df, num_cols = 2):
    col_str = ''
    for k, col in enumerate(df.columns):
        col_str += f'{k:>3}.  {col:<40}'
        if k % num_cols == 0:
            col_str += '\n'
    print(col_str)
    
print_cols(health_df, num_cols = 4)

  0.  State FIPS Code                         
  1.  County FIPS Code                          2.  5-digit FIPS Code                         3.  State Abbreviation                        4.  Name                                    
  5.  Release Year                              6.  County Clustered (Yes=1/No=0)             7.  Premature Death raw value                 8.  Premature Death numerator               
  9.  Premature Death denominator              10.  Premature Death CI low                   11.  Premature Death CI high                  12.  Premature Death flag (0 = No Flag/1=Unreliable/2=Suppressed)
 13.  Premature Death (AIAN)                   14.  Premature Death CI low (AIAN)            15.  Premature Death CI high (AIAN)           16.  Premature Death flag (AIAN) (. = No Flag/1=Unreliable/2=Suppressed)
 17.  Premature Death (Asian)                  18.  Premature Death CI low (Asian)           19.  Premature Death CI high (Asian)          20.  Premature Death flag (

## County Age-Sex Data

## County Age-Sex Data

In [55]:
agesex_df.iloc[40:60]

Unnamed: 0,IBRC_Geo_ID,Statefips,Countyfips,Description,Year,Total Population,Population 0-4,Population 5-17,Population 18-24,Population 25-44,Population 45-64,Population 65+,Population Under 18,Population 18-54,Population 55+,Male Population,Female Population
40,1000,1,0,Alabama,2015,4854803,294097,809062,471910,1230689,1286744,762301,1103159,2351777,1399867.0,2352806,2501997
41,1000,1,0,Alabama,2016,4866824,294616,805845,462098,1231213,1289047,784005,1100461,2337301,1429062.0,2357211,2509613
42,1000,1,0,Alabama,2017,4877989,294572,802005,455547,1233167,1287432,805266,1096577,2325223,1456189.0,2360503,2517486
43,1000,1,0,Alabama,2018,4891628,295520,797079,453230,1236516,1282303,826980,1092599,2316986,1482043.0,2365445,2526183
44,1000,1,0,Alabama,2019,4907965,294274,794453,450764,1242294,1274660,851520,1088727,2309910,1509328.0,2371832,2536133
45,1000,1,0,Alabama,2020,5033094,297347,832329,467246,1270670,1299444,866058,1129676,2364124,1539193.0,2446816,2586278
46,1000,1,0,Alabama,2021,5049196,294227,833920,475424,1277137,1287898,880590,1128147,2372657,1550979.0,2453002,2596194
47,1000,1,0,Alabama,2022,5076181,293480,836216,478391,1285214,1279704,903176,1129696,2382840,1566349.0,2465853,2610328
48,1000,1,0,Alabama,2023,5117673,294447,839471,482743,1296732,1274413,929867,1133918,2399292,1586378.0,2484555,2633118
49,1000,1,0,Alabama,2024,5157699,295446,839411,488175,1309411,1269355,955901,1134857,2417153,1605689.0,2503170,2654529


## County Race Data

In [56]:
race_df.head()

Unnamed: 0,IBRC_Geo_ID,Statefips,Countyfips,Description,Year,Total Population,White Alone,Black Alone,American Indian or Alaskan Native,Asian Alone,Hawaiian or Pacific Islander Alone,Two or More Races,Not Hispanic,Hispanic
0,0,0,0,U.S.,1990,249622814,209366661.0,30648345.0,2058726.0,7549082.0,,,227049976.0,22572838.0
1,0,0,0,U.S.,1991,252980941,211606011.0,31290743.0,2126968.0,7957219.0,,,229425951.0,23554990.0
2,0,0,0,U.S.,1992,256514224,213945622.0,31979982.0,2202176.0,8386444.0,,,231894879.0,24619345.0
3,0,0,0,U.S.,1993,259918588,216187073.0,32634735.0,2282052.0,8814728.0,,,234141927.0,25776661.0
4,0,0,0,U.S.,1994,263125821,218304774.0,33258981.0,2361078.0,9200988.0,,,236179888.0,26945933.0


## County Social Leanings Data

In [57]:
social_df.head()

Unnamed: 0,IBRC_Geo_ID,Statefips,Countyfips,Description,Year,Social_Context_Code,Social_Context_Code_Description,Social_Context_Domain_Data,Time_Period
0,10001,10,1,"Kent County, DE",2019,211,Hopefulness,87.332678,1972-2017
1,10001,10,1,"Kent County, DE",2019,104,Employment Rate,95.3,2017
2,10001,10,1,"Kent County, DE",2019,103,Income Per Capita,26118.0,2017
3,10001,10,1,"Kent County, DE",2019,102,Income Mobility,-0.441463,2017
4,10001,10,1,"Kent County, DE",2019,101,Entrepreneurship,10.732984,2013


In [58]:
social_df.pivot(index = ['IBRC_Geo_ID', 'Time_Period'], columns = 'Social_Context_Code_Description', values = 'Social_Context_Domain_Data')

Unnamed: 0_level_0,Social_Context_Code_Description,Agreeableness,Belief In Science,Collectivism,Conflict Awareness,Conscientiousness,Empathy,Employment Rate,Entrepreneurship,Extraversion,Gender Equality,Hopefulness,Income Mobility,Income Per Capita,Neuroticism,Openness,Religiosity,Risk Taking,Selflessness,Tolerance,Work Ethic
IBRC_Geo_ID,Time_Period,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,1972-2017,,70.833333,57.142857,63.444323,,83.308726,,,,77.063492,91.163142,,,,,91.106719,53.333333,82.142857,70.000000,60.380952
1001,1999-2017,86.279655,,,,84.113820,,,,86.112526,,,,,77.925476,78.222354,,,,,
1001,2013,,,,,,,,12.968300,,,,,,,,,,,,
1001,2017,,,,,,,94.7,,,,,-0.209446,26168.0,,,,,,,
1003,1972-2017,,63.268161,67.948815,63.751017,,78.187835,,,,69.016011,82.484017,,,,,71.771566,67.272980,75.586018,66.983549,70.972246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56043,2017,,,,,,,95.3,,,,,0.527082,26325.0,,,,,,,
56045,1972-2017,,67.710969,69.084172,56.654563,,81.012828,,,,69.933697,80.975991,,,,,71.535519,75.406846,76.730879,68.422269,69.413532
56045,1999-2017,88.251855,,,,84.263823,,,,87.006078,,,,,79.961075,76.977099,,,,,
56045,2013,,,,,,,,7.446809,,,,,,,,,,,,


## County Development Metrics Data

In [59]:
dev_df.head()

Unnamed: 0,IBRC_Geo_ID,Statefips,Countyfips,Description,Year,M4D_Code,Code Description,M4D_Data
0,1001,1,1,"Autauga County, AL",2019,100,Headline M4D Index,0.666112
1,1001,1,1,"Autauga County, AL",2019,1000,Full-Time Work,0.661573
2,1001,1,1,"Autauga County, AL",2019,10000,Grocery stores per capita,0.072548
3,1001,1,1,"Autauga County, AL",2019,10100,Farmers' markets per capita,0.018166
4,1001,1,1,"Autauga County, AL",2019,10200,SNAP benefits per capita,17.905272


In [60]:
# dev_df.drop_duplicates(inplace=True)
# dev_df.pivot(index = ['IBRC_Geo_ID', 'Year'], columns = 'Code Description', values = 'M4D_Data')

## County Age-Sex Data