4.4. Example: West Claude University#

Below is a fictional survey of fictional students from the fictional West Claude University. To reiterate, this is synthetic data produced using Claude.

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Load the survey responses directly from the course's GitHub dataset repository.
# NOTE(review): the recorded run of this cell failed with "HTTP Error 404: Not Found",
# which is why every later cell reports that wcu_df is undefined. Confirm the
# repository path / file name in this URL before re-running the notebook.
wcu_df = pd.read_csv('https://raw.githubusercontent.com/GettysburgDataScience/datasets/refs/heads/main/westclaudeuniversity_survey.csv')
wcu_df.head()  # preview the first rows
---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
Cell In[2], line 1
----> 1 wcu_df = pd.read_csv('https://raw.githubusercontent.com/GettysburgDataScience/datasets/refs/heads/main/westclaudeuniversity_survey.csv')
      2 wcu_df.head()

File ~/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File ~/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
    617 _validate_names(kwds.get("names", None))
    619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
    622 if chunksize or iterator:
    623     return parser

File ~/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
   1617     self.options["has_index_names"] = kwds["has_index_names"]
   1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)

File ~/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
   1878     if "b" not in mode:
   1879         mode += "b"
-> 1880 self.handles = get_handle(
   1881     f,
   1882     mode,
   1883     encoding=self.options.get("encoding", None),
   1884     compression=self.options.get("compression", None),
   1885     memory_map=self.options.get("memory_map", False),
   1886     is_text=is_text,
   1887     errors=self.options.get("encoding_errors", "strict"),
   1888     storage_options=self.options.get("storage_options", None),
   1889 )
   1890 assert self.handles is not None
   1891 f = self.handles.handle

File ~/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/pandas/io/common.py:728, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    725     codecs.lookup_error(errors)
    727 # open URLs
--> 728 ioargs = _get_filepath_or_buffer(
    729     path_or_buf,
    730     encoding=encoding,
    731     compression=compression,
    732     mode=mode,
    733     storage_options=storage_options,
    734 )
    736 handle = ioargs.filepath_or_buffer
    737 handles: list[BaseBuffer]

File ~/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/pandas/io/common.py:384, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
    382 # assuming storage_options is to be interpreted as headers
    383 req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
--> 384 with urlopen(req_info) as req:
    385     content_encoding = req.headers.get("Content-Encoding", None)
    386     if content_encoding == "gzip":
    387         # Override compression based on Content-Encoding header

File ~/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/pandas/io/common.py:289, in urlopen(*args, **kwargs)
    283 """
    284 Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
    285 the stdlib.
    286 """
    287 import urllib.request
--> 289 return urllib.request.urlopen(*args, **kwargs)

File ~/.pyenv/versions/3.13.1/lib/python3.13/urllib/request.py:189, in urlopen(url, data, timeout, context)
    187 else:
    188     opener = _opener
--> 189 return opener.open(url, data, timeout)

File ~/.pyenv/versions/3.13.1/lib/python3.13/urllib/request.py:495, in OpenerDirector.open(self, fullurl, data, timeout)
    493 for processor in self.process_response.get(protocol, []):
    494     meth = getattr(processor, meth_name)
--> 495     response = meth(req, response)
    497 return response

File ~/.pyenv/versions/3.13.1/lib/python3.13/urllib/request.py:604, in HTTPErrorProcessor.http_response(self, request, response)
    601 # According to RFC 2616, "2xx" code indicates that the client's
    602 # request was successfully received, understood, and accepted.
    603 if not (200 <= code < 300):
--> 604     response = self.parent.error(
    605         'http', request, response, code, msg, hdrs)
    607 return response

File ~/.pyenv/versions/3.13.1/lib/python3.13/urllib/request.py:533, in OpenerDirector.error(self, proto, *args)
    531 if http_err:
    532     args = (dict, 'default', 'http_error_default') + orig_args
--> 533     return self._call_chain(*args)

File ~/.pyenv/versions/3.13.1/lib/python3.13/urllib/request.py:466, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
    464 for handler in handlers:
    465     func = getattr(handler, meth_name)
--> 466     result = func(*args)
    467     if result is not None:
    468         return result

File ~/.pyenv/versions/3.13.1/lib/python3.13/urllib/request.py:613, in HTTPDefaultErrorHandler.http_error_default(self, req, fp, code, msg, hdrs)
    612 def http_error_default(self, req, fp, code, msg, hdrs):
--> 613     raise HTTPError(req.full_url, code, msg, hdrs, fp)

HTTPError: HTTP Error 404: Not Found
  • Exploratory Data Analysis

  • Data Cleaning

  • Modeling

    • Feature Engineering/Feature Selection

    • train test split

    • feature scaling

    • model fit and selection

    • model assessment

  • Interpretation

## EDA

wcu_df.info() # column data types and non-null counts (reveals missing data)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 3
      1 ## EDA
----> 3 wcu_df.info() # data types and missing data

NameError: name 'wcu_df' is not defined
wcu_df.describe() # descriptive stats (count, mean, std, min, quartiles, max) for the numeric columns
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 wcu_df.describe() # descriptive stats

NameError: name 'wcu_df' is not defined
# Grid of pairwise plots across the DataFrame's numeric columns, used to
# eyeball relationships between variables before modeling.
sns.pairplot(wcu_df)
plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 sns.pairplot(wcu_df)
      2 plt.show()

NameError: name 'wcu_df' is not defined
# Remove student_id in place so it is not used as a predictor
# (presumably it is only a row identifier; verify against the dataset).
wcu_df.drop(columns = 'student_id', inplace = True)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 1
----> 1 wcu_df.drop(columns = 'student_id', inplace = True)

NameError: name 'wcu_df' is not defined
# Distinct values of the 'major' column. NOTE(review): a notebook only displays
# the value of a cell's last expression, so this result is not shown as written.
wcu_df['major'].unique()
# One-hot encode 'major': one 0/1 integer column per distinct major.
majors_df = pd.get_dummies(wcu_df['major'], dtype = int)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[7], line 1
----> 1 wcu_df['major'].unique()
      2 majors_df = pd.get_dummies(wcu_df['major'], dtype = int)

NameError: name 'wcu_df' is not defined
# Swap the text 'major' column for its one-hot columns (join aligns on the row index).
wcu_df = wcu_df.drop(columns = 'major').join(majors_df)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 1
----> 1 wcu_df = wcu_df.drop(columns = 'major').join(majors_df)

NameError: name 'wcu_df' is not defined
# Encode class standing as ordinal integers 1-4 (Freshman lowest, Senior highest).
# The mapping is applied to every column of the DataFrame, not to one named column.
# NOTE(review): confirm no other column contains these strings.
wcu_df.replace({'Freshman':1,
                'Sophomore': 2,
                'Junior': 3,
                'Senior':4},
               inplace = True)

wcu_df  # display the encoded DataFrame
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 1
----> 1 wcu_df.replace({'Freshman':1,
      2                 'Sophomore': 2,
      3                 'Junior': 3,
      4                 'Senior':4},
      5                inplace = True)
      7 wcu_df

NameError: name 'wcu_df' is not defined
# Drop, in place, every row that has at least one missing value.
wcu_df.dropna(inplace=True)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[10], line 1
----> 1 wcu_df.dropna(inplace=True)

NameError: name 'wcu_df' is not defined

4.4.1. Modeling#

4.4.1.1. Finish Feature Engineering#

  • Make Polynomials

4.4.1.2. Model#

  • Compare LinearRegression, Lasso, Ridge, ElasticNet

# The response variable is the 'gpa' column; every remaining column is a predictor.
target = 'gpa'

y = wcu_df[target]
X = wcu_df.drop(columns = target)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 3
      1 target = 'gpa'
----> 3 y = wcu_df[target]
      4 X = wcu_df.drop(columns = target)

NameError: name 'wcu_df' is not defined
from sklearn.preprocessing import PolynomialFeatures

# Expand the predictors with all degree-2 terms (squares and pairwise products);
# include_bias = False leaves out the constant column.
pf = PolynomialFeatures(degree = 2, include_bias = False)
Xpoly = pf.fit_transform(X)

# Names of the generated columns, in the same order as the columns of Xpoly.
feature_names = pf.get_feature_names_out()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 4
      1 from sklearn.preprocessing import PolynomialFeatures
      3 pf = PolynomialFeatures(degree = 2, include_bias = False)
----> 4 Xpoly = pf.fit_transform(X)
      6 feature_names = pf.get_feature_names_out()

NameError: name 'X' is not defined

4.4.2. Modeling for Real#

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split, and therefore every score computed from it, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    Xpoly, y, test_size = 0.2, random_state = 42)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 2
      1 X_train, X_test, y_train, y_test = train_test_split(
----> 2     Xpoly, y, test_size = 0.2)

NameError: name 'Xpoly' is not defined
scaler = StandardScaler()

# Learn the scaling from the training rows only, then apply that same
# transform to the test rows so the test set does not influence the fit.
Z_train = scaler.fit_transform(X_train)
Z_test = scaler.transform(X_test)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 3
      1 scaler = StandardScaler()
----> 3 Z_train = scaler.fit_transform(X_train)
      4 Z_test = scaler.transform(X_test)

NameError: name 'X_train' is not defined
# Baseline model: ordinary least squares on the standardized polynomial
# features, with no regularization.
linreg = LinearRegression()
linreg.fit(Z_train, y_train)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[16], line 2
      1 linreg = LinearRegression()
----> 2 linreg.fit(Z_train, y_train)

NameError: name 'Z_train' is not defined
def display_coef(model, features):
    """Print a fitted model's coefficients ranked by absolute size, largest first.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a one-dimensional ``coef_`` sequence.
    features : sequence
        Feature names aligned position-by-position with ``model.coef_``.
        Lists and tuples are accepted as well as NumPy arrays.

    Returns
    -------
    None
        One line is printed per coefficient: a right-aligned rank, the feature
        name left-aligned in 40 characters, and the signed coefficient to
        four decimal places.
    """
    # Coerce both inputs to arrays: the index-array lookups below are not
    # supported by plain Python lists or tuples.
    coefs = np.asarray(model.coef_)
    names = np.asarray(features)

    # argsort is ascending, so reverse it to put the largest magnitude first.
    idx = np.argsort(np.abs(coefs))[::-1]

    for k, (f, c) in enumerate(zip(names[idx], coefs[idx]), start = 1):
        print(f'{k:>3}.{f:<40}{c:+5.4f}')
#display_coef(linreg, feature_names)

4.4.2.1. Making Predictions and Assessing the Linear Regression Model#

from sklearn.metrics import r2_score
# Predictions from the unregularized model on the scaled train and test sets.
y_linreg_train = linreg.predict(Z_train)
y_linreg_test = linreg.predict(Z_test)

# R^2 on each split; a large gap between the two indicates overfitting.
r2_linreg_train = r2_score(y_train, y_linreg_train)
r2_linreg_test = r2_score(y_test, y_linreg_test)

print(f'{r2_linreg_train=:0.2f}\n{r2_linreg_test=:0.2f}')
# Overfitting: the fit is great on train but much worse on test.
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[20], line 1
----> 1 y_linreg_train = linreg.predict(Z_train)
      2 y_linreg_test = linreg.predict(Z_test)
      4 r2_linreg_train = r2_score(y_train, y_linreg_train)

NameError: name 'Z_train' is not defined

4.4.3. Let’s improve our model with Regularization#

# Candidate regularization strengths: 25 values log-spaced from 1e-3 to 1e3.
alpha_test = np.logspace(-3, 3, 25)

# RidgeCV picks among the candidate alphas by cross-validation during fit.
ridge = RidgeCV(alphas = alpha_test)
ridge.fit(Z_train, y_train)

ridge.alpha_  # the alpha that was selected
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[21], line 4
      1 alpha_test = np.logspace(-3, 3, 25)
      3 ridge = RidgeCV(alphas = alpha_test)
----> 4 ridge.fit(Z_train, y_train)
      6 ridge.alpha_

NameError: name 'Z_train' is not defined
# Lasso (L1 penalty) over the same alpha grid; max_iter is set to 15000,
# presumably to give the solver enough iterations to converge.
lasso = LassoCV(alphas = alpha_test, max_iter = 15000)
lasso.fit(Z_train, y_train)

lasso.alpha_  # the alpha that was selected
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[22], line 2
      1 lasso = LassoCV(alphas = alpha_test, max_iter = 15000)
----> 2 lasso.fit(Z_train, y_train)
      4 lasso.alpha_

NameError: name 'Z_train' is not defined
# Elastic net over the same alpha grid; l1_ratio = 0.5 weights the L1 and L2
# penalties equally.
elastic = ElasticNetCV(l1_ratio = 0.5,
                       alphas = alpha_test,
                       max_iter = 15000)

elastic.fit(Z_train, y_train)

elastic.alpha_  # the alpha that was selected
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[23], line 5
      1 elastic = ElasticNetCV(l1_ratio = 0.5,
      2                        alphas = alpha_test,
      3                        max_iter = 15000)
----> 5 elastic.fit(Z_train, y_train)
      7 elastic.alpha_

NameError: name 'Z_train' is not defined

| model | Non-Zero | Above 0.1 |
|---|---|---|
| LinearRegression | 209 | 197 |
| Ridge | 208 | 94 |
| Lasso | 20 | 13 |
| Elastic | 24 | 20 |

# display_coef(elastic, feature_names)
# Phase 1: train/test predictions from each regularized model.
y_ridge_train, y_ridge_test = ridge.predict(Z_train), ridge.predict(Z_test)
y_lasso_train, y_lasso_test = lasso.predict(Z_train), lasso.predict(Z_test)
y_elastic_train, y_elastic_test = elastic.predict(Z_train), elastic.predict(Z_test)

# Phase 2: R^2 of every prediction against the matching true values.
r2_ridge_train = r2_score(y_train, y_ridge_train)
r2_lasso_train = r2_score(y_train, y_lasso_train)
r2_elastic_train = r2_score(y_train, y_elastic_train)
r2_ridge_test = r2_score(y_test, y_ridge_test)
r2_lasso_test = r2_score(y_test, y_lasso_test)
r2_elastic_test = r2_score(y_test, y_elastic_test)

# Phase 3: report train vs. test per model, in ridge / lasso / elastic order.
print(f'{r2_ridge_train=:0.2f}\n{r2_ridge_test=:0.2f}')
print(f'\n{r2_lasso_train=:0.2f}\n{r2_lasso_test=:0.2f}')
print(f'\n{r2_elastic_train=:0.2f}\n{r2_elastic_test=:0.2f}')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[25], line 1
----> 1 y_ridge_train = ridge.predict(Z_train)
      2 y_ridge_test = ridge.predict(Z_test)
      4 r2_ridge_train = r2_score(y_train, y_ridge_train)

NameError: name 'Z_train' is not defined
# display_coef(lasso, feature_names)
import matplotlib.pyplot as plt

# Predicted vs. actual GPA for the Lasso model. Points on the dashed diagonal
# are perfect predictions; labels and a legend distinguish the train and test
# series, which are otherwise told apart only by color.
plt.plot(y_train, y_lasso_train, 'r.', label = 'Train')
plt.plot(y_test, y_lasso_test, 'b.', label = 'Test')
plt.plot([1, 4], [1, 4], 'k--', label = 'Perfect prediction')
plt.xlabel('Actual GPA')
plt.ylabel('Predicted GPA')
plt.legend()

plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[27], line 3
      1 import matplotlib.pyplot as plt
----> 3 plt.plot(y_train, y_lasso_train, 'r.')
      4 plt.plot(y_test, y_lasso_test, 'b.')
      5 plt.plot([1, 4], [1, 4], 'k--')

NameError: name 'y_train' is not defined