4.4. Example: West Claude University#
Below is a fictional survey of fictional students from the fictional West Claude University. To reiterate, this is synthetic data produced using Claude.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Load the synthetic West Claude University survey straight from GitHub.
# NOTE(review): this URL returned "HTTP Error 404: Not Found" when the page was
# built (see the traceback below), which leaves wcu_df undefined for every later
# cell — confirm the file's current path in the datasets repository.
wcu_df = pd.read_csv('https://raw.githubusercontent.com/GettysburgDataScience/datasets/refs/heads/main/westclaudeuniversity_survey.csv')
# Preview the first few rows.
wcu_df.head()
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
Cell In[2], line 1
----> 1 wcu_df = pd.read_csv('https://raw.githubusercontent.com/GettysburgDataScience/datasets/refs/heads/main/westclaudeuniversity_survey.csv')
2 wcu_df.head()
File ~/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
1013 kwds_defaults = _refine_defaults_read(
1014 dialect,
1015 delimiter,
(...) 1022 dtype_backend=dtype_backend,
1023 )
1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)
File ~/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
617 _validate_names(kwds.get("names", None))
619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
622 if chunksize or iterator:
623 return parser
File ~/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
1617 self.options["has_index_names"] = kwds["has_index_names"]
1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)
File ~/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
1878 if "b" not in mode:
1879 mode += "b"
-> 1880 self.handles = get_handle(
1881 f,
1882 mode,
1883 encoding=self.options.get("encoding", None),
1884 compression=self.options.get("compression", None),
1885 memory_map=self.options.get("memory_map", False),
1886 is_text=is_text,
1887 errors=self.options.get("encoding_errors", "strict"),
1888 storage_options=self.options.get("storage_options", None),
1889 )
1890 assert self.handles is not None
1891 f = self.handles.handle
File ~/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/pandas/io/common.py:728, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
725 codecs.lookup_error(errors)
727 # open URLs
--> 728 ioargs = _get_filepath_or_buffer(
729 path_or_buf,
730 encoding=encoding,
731 compression=compression,
732 mode=mode,
733 storage_options=storage_options,
734 )
736 handle = ioargs.filepath_or_buffer
737 handles: list[BaseBuffer]
File ~/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/pandas/io/common.py:384, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
382 # assuming storage_options is to be interpreted as headers
383 req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
--> 384 with urlopen(req_info) as req:
385 content_encoding = req.headers.get("Content-Encoding", None)
386 if content_encoding == "gzip":
387 # Override compression based on Content-Encoding header
File ~/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/pandas/io/common.py:289, in urlopen(*args, **kwargs)
283 """
284 Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
285 the stdlib.
286 """
287 import urllib.request
--> 289 return urllib.request.urlopen(*args, **kwargs)
File ~/.pyenv/versions/3.13.1/lib/python3.13/urllib/request.py:189, in urlopen(url, data, timeout, context)
187 else:
188 opener = _opener
--> 189 return opener.open(url, data, timeout)
File ~/.pyenv/versions/3.13.1/lib/python3.13/urllib/request.py:495, in OpenerDirector.open(self, fullurl, data, timeout)
493 for processor in self.process_response.get(protocol, []):
494 meth = getattr(processor, meth_name)
--> 495 response = meth(req, response)
497 return response
File ~/.pyenv/versions/3.13.1/lib/python3.13/urllib/request.py:604, in HTTPErrorProcessor.http_response(self, request, response)
601 # According to RFC 2616, "2xx" code indicates that the client's
602 # request was successfully received, understood, and accepted.
603 if not (200 <= code < 300):
--> 604 response = self.parent.error(
605 'http', request, response, code, msg, hdrs)
607 return response
File ~/.pyenv/versions/3.13.1/lib/python3.13/urllib/request.py:533, in OpenerDirector.error(self, proto, *args)
531 if http_err:
532 args = (dict, 'default', 'http_error_default') + orig_args
--> 533 return self._call_chain(*args)
File ~/.pyenv/versions/3.13.1/lib/python3.13/urllib/request.py:466, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
464 for handler in handlers:
465 func = getattr(handler, meth_name)
--> 466 result = func(*args)
467 if result is not None:
468 return result
File ~/.pyenv/versions/3.13.1/lib/python3.13/urllib/request.py:613, in HTTPDefaultErrorHandler.http_error_default(self, req, fp, code, msg, hdrs)
612 def http_error_default(self, req, fp, code, msg, hdrs):
--> 613 raise HTTPError(req.full_url, code, msg, hdrs, fp)
HTTPError: HTTP Error 404: Not Found
Exploratory Data Analysis
Data Cleaning
Modeling
Feature Engineering/Feature Selection
train test split
feature scaling
model fit and selection
model assessment
Interpretation
## EDA
# First look at the frame's structure before any cleaning is done.
wcu_df.info() # data types and missing data
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[3], line 3
1 ## EDA
----> 3 wcu_df.info() # data types and missing data
NameError: name 'wcu_df' is not defined
# Summary statistics, as a quick check for implausible ranges before cleaning.
wcu_df.describe() # descriptive stats
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[4], line 1
----> 1 wcu_df.describe() # descriptive stats
NameError: name 'wcu_df' is not defined
# Grid of pairwise plots to eyeball relationships between the survey variables.
sns.pairplot(wcu_df)
plt.show()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[5], line 1
----> 1 sns.pairplot(wcu_df)
2 plt.show()
NameError: name 'wcu_df' is not defined
# Remove student_id in place — presumably a row identifier with no predictive
# meaning (TODO confirm against the dataset description).
wcu_df.drop(columns = 'student_id', inplace = True)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[6], line 1
----> 1 wcu_df.drop(columns = 'student_id', inplace = True)
NameError: name 'wcu_df' is not defined
# List the distinct majors. The result is printed explicitly because a notebook
# only auto-displays the value of a cell's final expression, and this is not it.
print(wcu_df['major'].unique())

# One-hot encode major: one integer (0/1) column per distinct major.
majors_df = pd.get_dummies(wcu_df['major'], dtype = int)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[7], line 1
----> 1 wcu_df['major'].unique()
2 majors_df = pd.get_dummies(wcu_df['major'], dtype = int)
NameError: name 'wcu_df' is not defined
# Swap the original major column for its one-hot columns; join() lines the two
# frames up on their row index.
wcu_df = wcu_df.drop(columns = 'major').join(majors_df)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[8], line 1
----> 1 wcu_df = wcu_df.drop(columns = 'major').join(majors_df)
NameError: name 'wcu_df' is not defined
# Encode class standing as an ordinal 1-4 so it can serve as a numeric feature.
# No column is named, so these labels are replaced wherever they occur in the
# frame — NOTE(review): confirm only the class-standing column contains them.
wcu_df.replace({'Freshman':1,
'Sophomore': 2,
'Junior': 3,
'Senior':4},
inplace = True)
# Display the updated frame.
wcu_df
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[9], line 1
----> 1 wcu_df.replace({'Freshman':1,
2 'Sophomore': 2,
3 'Junior': 3,
4 'Senior':4},
5 inplace = True)
7 wcu_df
NameError: name 'wcu_df' is not defined
# Drop, in place, every row that still contains a missing value.
wcu_df.dropna(inplace=True)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[10], line 1
----> 1 wcu_df.dropna(inplace=True)
NameError: name 'wcu_df' is not defined
4.4.1. Modeling#
4.4.1.1. Finish Feature Engineering#
Make Polynomials
4.4.1.2. Model#
Compare LinearRegression, Lasso, Ridge, ElasticNet
# Separate the response from the predictors: y is the gpa column and X is every
# remaining column of the cleaned frame.
target = 'gpa'
y = wcu_df[target]
X = wcu_df.drop(columns = target)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[11], line 3
1 target = 'gpa'
----> 3 y = wcu_df[target]
4 X = wcu_df.drop(columns = target)
NameError: name 'wcu_df' is not defined
from sklearn.preprocessing import PolynomialFeatures
# Expand X to every term up to degree 2 (the original columns, their squares and
# their pairwise products). include_bias = False leaves out the constant column.
pf = PolynomialFeatures(degree = 2, include_bias = False)
Xpoly = pf.fit_transform(X)
# Names of the generated columns, used later to label model coefficients.
feature_names = pf.get_feature_names_out()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[12], line 4
1 from sklearn.preprocessing import PolynomialFeatures
3 pf = PolynomialFeatures(degree = 2, include_bias = False)
----> 4 Xpoly = pf.fit_transform(X)
6 feature_names = pf.get_feature_names_out()
NameError: name 'X' is not defined
4.4.2. Modeling for Real#
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and therefore every score and coefficient count reported afterwards,
# reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    Xpoly, y, test_size = 0.2, random_state = 42)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[14], line 2
1 X_train, X_test, y_train, y_test = train_test_split(
----> 2 Xpoly, y, test_size = 0.2)
NameError: name 'Xpoly' is not defined
# Standardize the features. The scaler is fit on the training set only and then
# applied unchanged to the test set, so no test-set statistics leak into training.
scaler = StandardScaler()
Z_train = scaler.fit_transform(X_train)
Z_test = scaler.transform(X_test)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[15], line 3
1 scaler = StandardScaler()
----> 3 Z_train = scaler.fit_transform(X_train)
4 Z_test = scaler.transform(X_test)
NameError: name 'X_train' is not defined
# Baseline model: ordinary least squares on the standardized polynomial
# features, with no regularization.
linreg = LinearRegression()
linreg.fit(Z_train, y_train)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[16], line 2
1 linreg = LinearRegression()
----> 2 linreg.fit(Z_train, y_train)
NameError: name 'Z_train' is not defined
def display_coef(model, features):
    """Print a fitted model's coefficients ranked by absolute magnitude.

    Each output line holds the 1-based rank, the feature name left-aligned in
    a 40-character field, and the signed coefficient to four decimal places.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a one-dimensional ``coef_`` attribute.
    features : sequence of str
        Feature names aligned with ``model.coef_``. A plain list or tuple is
        accepted as well as a NumPy array.
    """
    # Convert up front: indexing with an index array (below) only works on
    # ndarrays, so a plain Python list of names would raise TypeError.
    names = np.asarray(features)
    coefs = np.asarray(model.coef_)

    # argsort is ascending, so reverse it to list the largest magnitudes first.
    order = np.argsort(np.abs(coefs))[::-1]
    for rank, (name, coef) in enumerate(zip(names[order], coefs[order]), start=1):
        print(f'{rank:>3}.{name:<40}{coef:+5.4f}')
#display_coef(linreg, feature_names)
4.4.2.1. Making Predictions and Assessing the Linear Regression Model#
from sklearn.metrics import r2_score
# Predict on both splits and score each with R^2; a large gap between the
# training and test scores is the signature of overfitting.
y_linreg_train = linreg.predict(Z_train)
y_linreg_test = linreg.predict(Z_test)
r2_linreg_train = r2_score(y_train, y_linreg_train)
r2_linreg_test = r2_score(y_test, y_linreg_test)
# The "=" specifier prints each variable's name alongside its value.
print(f'{r2_linreg_train=:0.2f}\n{r2_linreg_test=:0.2f}')
# OVER FITTING - great on train, much worse on test
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[20], line 1
----> 1 y_linreg_train = linreg.predict(Z_train)
2 y_linreg_test = linreg.predict(Z_test)
4 r2_linreg_train = r2_score(y_train, y_linreg_train)
NameError: name 'Z_train' is not defined
4.4.3. Let’s improve our model with Regularization#
# 25 candidate penalty strengths, log-spaced from 1e-3 to 1e3; the same grid is
# reused for the Lasso and ElasticNet searches.
alpha_test = np.logspace(-3, 3, 25)
# RidgeCV picks among the candidates by cross-validation on the training data.
ridge = RidgeCV(alphas = alpha_test)
ridge.fit(Z_train, y_train)
# Show the alpha that was selected.
ridge.alpha_
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[21], line 4
1 alpha_test = np.logspace(-3, 3, 25)
3 ridge = RidgeCV(alphas = alpha_test)
----> 4 ridge.fit(Z_train, y_train)
6 ridge.alpha_
NameError: name 'Z_train' is not defined
# Same alpha search with an L1 penalty. max_iter is set to 15000 — presumably to
# give the solver room to converge on this many features (TODO confirm).
lasso = LassoCV(alphas = alpha_test, max_iter = 15000)
lasso.fit(Z_train, y_train)
# Show the alpha that was selected.
lasso.alpha_
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[22], line 2
1 lasso = LassoCV(alphas = alpha_test, max_iter = 15000)
----> 2 lasso.fit(Z_train, y_train)
4 lasso.alpha_
NameError: name 'Z_train' is not defined
# ElasticNet blends the L1 and L2 penalties; l1_ratio = 0.5 weights them equally.
# Only alpha is searched here — l1_ratio is held fixed.
elastic = ElasticNetCV(l1_ratio = 0.5,
alphas = alpha_test,
max_iter = 15000)
elastic.fit(Z_train, y_train)
# Show the alpha that was selected.
elastic.alpha_
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[23], line 5
1 elastic = ElasticNetCV(l1_ratio = 0.5,
2 alphas = alpha_test,
3 max_iter = 15000)
----> 5 elastic.fit(Z_train, y_train)
7 elastic.alpha_
NameError: name 'Z_train' is not defined
| model | Non-Zero | Above 0.1 |
|---|---|---|
| LinearRegression | 209 | 197 |
| Ridge | 208 | 94 |
| Lasso | 20 | 13 |
| Elastic | 24 | 20 |
# display_coef(elastic, feature_names)
# Score each regularized model the same way as the linear baseline: predict on
# both splits, then report train and test R^2 side by side.

# Ridge (L2 penalty)
y_ridge_train = ridge.predict(Z_train)
y_ridge_test = ridge.predict(Z_test)
r2_ridge_train = r2_score(y_train, y_ridge_train)
r2_ridge_test = r2_score(y_test, y_ridge_test)
print(f'{r2_ridge_train=:0.2f}\n{r2_ridge_test=:0.2f}')
# Lasso (L1 penalty); the leading \n separates its output from the block above.
y_lasso_train = lasso.predict(Z_train)
y_lasso_test = lasso.predict(Z_test)
r2_lasso_train = r2_score(y_train, y_lasso_train)
r2_lasso_test = r2_score(y_test, y_lasso_test)
print(f'\n{r2_lasso_train=:0.2f}\n{r2_lasso_test=:0.2f}')
# ElasticNet (mixed L1/L2 penalty)
y_elastic_train = elastic.predict(Z_train)
y_elastic_test = elastic.predict(Z_test)
r2_elastic_train = r2_score(y_train, y_elastic_train)
r2_elastic_test = r2_score(y_test, y_elastic_test)
print(f'\n{r2_elastic_train=:0.2f}\n{r2_elastic_test=:0.2f}')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[25], line 1
----> 1 y_ridge_train = ridge.predict(Z_train)
2 y_ridge_test = ridge.predict(Z_test)
4 r2_ridge_train = r2_score(y_train, y_ridge_train)
NameError: name 'Z_train' is not defined
# display_coef(lasso, feature_names)
import matplotlib.pyplot as plt
# Predicted vs. actual GPA for the Lasso model.
# Red dots are training rows, blue dots are test rows.
plt.plot(y_train, y_lasso_train, 'r.')
plt.plot(y_test, y_lasso_test, 'b.')
# Dashed y = x reference line from 1 to 4: points on it are perfect predictions.
plt.plot([1, 4], [1, 4], 'k--')
plt.xlabel('Actual GPA')
plt.ylabel('Predicted GPA')
plt.show()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[27], line 3
1 import matplotlib.pyplot as plt
----> 3 plt.plot(y_train, y_lasso_train, 'r.')
4 plt.plot(y_test, y_lasso_test, 'b.')
5 plt.plot([1, 4], [1, 4], 'k--')
NameError: name 'y_train' is not defined