8. PCA: Examples and Observations#

Principal Component Analysis (PCA) is one of the more conceptually demanding techniques in data science. In this notebook, we work through some examples and make general observations about the (beneficial) effects and uses of PCA.

Working through the examples, observe:

  • Features that vary together appear in the same principal components (PCs). Because the resulting PC scores are uncorrelated, PCA removes collinearity from the feature set.

  • You can approximately reconstruct the original feature values from fewer than all of the PCs, as sketched below.
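
The second observation is just matrix algebra: a fitted PCA gives scores (pca.transform(X)) and loadings (pca.components_) from which the data can be rebuilt. A minimal sketch, assuming pca is a fitted sklearn PCA and X is the data it was fit on (X, k, and X_hat are placeholder names, not objects defined in this notebook):

# Reconstruct X from the first k principal components.
Z = pca.transform(X)                                    # PC scores, one row per instance
X_hat = pca.mean_ + Z[:, :k] @ pca.components_[:k, :]   # approximate reconstruction
# With k equal to the full number of components, X_hat matches X up to
# floating-point error; smaller k gives a low-rank approximation.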

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def plotPCA(data, pca=None, num_components = None, idx = 0, feature_names = None):
    """Plot the principal components (top row) and a running
    reconstruction of one data instance (bottom row)."""

    if pca is None:
        if num_components is None:
            pca = PCA()
        else:
            pca = PCA(n_components = num_components)
    
    X = pca.fit_transform(data)

    if num_components is None:
        num_components = pca.n_components_
    
    
    if isinstance(data, pd.DataFrame):
        x0 = data.iloc[idx,:]
    else:
        x0 = data[idx,:]
        
    x_pca = np.zeros_like(x0, dtype = float)   # running reconstruction of x0

    coeffs = X[idx,:]                          # PC scores of instance idx
    
    if feature_names is None:
        try:
            feature_names = pca.feature_names_in_
        except AttributeError:
            feature_names = [str(i) for i in range(data.shape[1])]

    C0 = 'goldenrod'     # original data instance
    C1 = 'dodgerblue'    # mean / reconstruction
    alpha = 0.6

    # column 0 holds the data/mean bars, column 1 is a narrow spacer,
    # and the remaining columns show one principal component each
    offset = 2
    num_cols = num_components + offset
    width_ratios = [1, 0.2] + [1]*num_components
        
    fig, ax = plt.subplots(2, num_cols, figsize = ((num_cols)*2, 8), 
                        sharex = True, sharey = False,
                        gridspec_kw = dict(width_ratios=width_ratios,
                                            hspace = 0.1, wspace = 0.1)                       
                        )
    
    if not np.all(np.isclose(pca.mean_, 0)):
        ax[0, 0].bar(feature_names, pca.mean_, color = C1, alpha = alpha)
        ax[0, 0].bar(feature_names, x0, color = C0, alpha = alpha)
        
        ax[0, 0].sharey(ax[1,offset])
    
    ax[0,0].set_title('Mean')

    ax[1, 0].bar(feature_names, x0-pca.mean_, color = C0, alpha = alpha)
    ax[1, 0].bar(feature_names, x_pca, color = C1, alpha = alpha)
    ax[1, 0].set_xlabel('Features - Mean = ', fontsize = 16)
    ax[1, 0].tick_params(axis='x', labelrotation = 90, )
    ax[1, 0].sharey(ax[1,offset])


    # hide the spacer column between the data bars and the components
    ax[0,1].set_visible(False)
    ax[1,1].set_visible(False)

    for r_idx, (r, coef) in enumerate(zip(pca.components_, coeffs)):
        x_pca += coef * r
        ax[0, r_idx+offset].bar(feature_names, r, color = C1, alpha = alpha)
        ax[0, r_idx+offset].text(0.5, 0.05, f'coef = {coef:0.2f}', ha = 'center', transform=ax[0, r_idx+offset].transAxes)
        
        ax[1, r_idx+offset].bar(feature_names, x0-pca.mean_, color = C0, alpha = alpha)
        ax[1, r_idx+offset].bar(feature_names, x_pca, alpha = alpha, color = C1)
        
        if r_idx == 0:
            ax_str = f'{coef:0.2f} * PC1'
        else:
            ax[0, r_idx+offset].sharey(ax[0,offset])
            ax[1, r_idx+offset].sharey(ax[1,offset])
            plt.setp(ax[0,r_idx+offset].get_yticklabels(), visible=False)
            plt.setp(ax[1,r_idx+offset].get_yticklabels(), visible=False)

            if coef > 0:
                ax_str = f' + {coef:0.2f} * PC{r_idx+1}'
            elif coef < 0:
                ax_str = f' - {-coef:0.2f} * PC{r_idx+1}'
            else:
                ax_str = ' + 0'
                        
        ax[0, r_idx+offset].set_title(f'PC{r_idx+1}')   
        ax[1, r_idx+offset].set_xlabel(ax_str, fontsize = 16)
        ax[1, r_idx+offset].tick_params(axis='x', labelrotation = 90, )


    ax[0,offset].set_ylim([-1, 1])   # component rows are unit vectors, so entries lie in [-1, 1]
    fig.suptitle('Principal components (top row) and a sample reconstruction (bottom row)')

    return pca, fig, ax

8.1. Example 0: Synthetic data (boxes)#

We generate random box lengths and widths in feet, then add features that are exact linear functions of them: the same dimensions in inches and the perimeter in both units.

num_data = 100

boxes_dict = {'L_ft': 5*np.random.randn(num_data),
              'W_ft': 10*np.random.randn(num_data)
              }

boxes_df = pd.DataFrame(boxes_dict)
boxes_df
L_ft W_ft
0 0.179139 -19.389174
1 0.400746 2.444808
2 -5.241079 2.396724
3 6.555366 1.718895
4 2.938825 7.048128
... ... ...
95 -7.889434 -1.951816
96 -3.437136 2.106237
97 -3.989393 -3.064613
98 0.207975 -6.826787
99 -7.499872 -5.008779

100 rows × 2 columns

boxes_df[['L_in', 'W_in']] = boxes_df[['L_ft','W_ft']]*12    # same dimensions in inches
boxes_df['P_ft'] = 2*(boxes_df['L_ft']+boxes_df['W_ft'])     # perimeter in feet
boxes_df['P_in'] = 2*(boxes_df['L_in']+boxes_df['W_in'])     # perimeter in inches

boxes_df
L_ft W_ft L_in W_in P_ft P_in
0 0.179139 -19.389174 2.149670 -232.670088 -38.420070 -461.040836
1 0.400746 2.444808 4.808955 29.337702 5.691109 68.293313
2 -5.241079 2.396724 -62.892952 28.760693 -5.688710 -68.264519
3 6.555366 1.718895 78.664398 20.626741 16.548523 198.582277
4 2.938825 7.048128 35.265902 84.577536 19.973906 239.686878
... ... ... ... ... ... ...
95 -7.889434 -1.951816 -94.673203 -23.421796 -19.682500 -236.189999
96 -3.437136 2.106237 -41.245636 25.274842 -2.661799 -31.941590
97 -3.989393 -3.064613 -47.872715 -36.775354 -14.108011 -169.296138
98 0.207975 -6.826787 2.495705 -81.921439 -13.237622 -158.851467
99 -7.499872 -5.008779 -89.998461 -60.105348 -25.017302 -300.207620

100 rows × 6 columns
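
All four added columns are linear combinations of L_ft and W_ft, so the six features span only two dimensions. A quick check (a minimal sketch, separate from the plotting code):

# Correlation matrix of the box features: L_in and W_in are perfectly
# correlated with L_ft and W_ft, and P_ft with P_in.
boxes_df.corr().round(2)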

pca, fig, ax = plotPCA(boxes_df)
plt.show()
../_images/5e70736e338a7668e1da1f5e40c7ea333face1efda8fe225a10637ce572cf63c.png
ss = StandardScaler()
boxes_scaled = ss.fit_transform(boxes_df)    # rescale every feature to mean 0, variance 1
boxes_scaled_df = pd.DataFrame(boxes_scaled, columns = boxes_df.columns)

pca_scaled, fig, ax = plotPCA(boxes_scaled_df)
plt.show()
../_images/9db293df61452782e73309664fa66e910a254b84bc5534ff8af38e4c2962d76d.png
# scree plot: variance explained by each component of the scaled data
plt.plot(pca_scaled.explained_variance_, 'b.')
plt.show()
../_images/e99c0f1c673d01c84f95c8ac37515137ad474629a36c4c54adaec0df91b357d2.png
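
The scree plot drops to (numerically) zero after the first two components: the box data has only two underlying degrees of freedom, length and width. A common way to quantify this is the cumulative explained-variance ratio (a minimal sketch using the pca_scaled object fit above):

# Cumulative fraction of the total variance captured by the first k components.
cum_var = np.cumsum(pca_scaled.explained_variance_ratio_)
print(cum_var.round(3))   # should reach ~1.0 by the second entry for this data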

8.2. Example 1: Macro-nutrients#

This dataset records the composition of 300 pizza samples: moisture (mois), protein (prot), fat, ash, sodium, carbohydrates (carb), and calories (cal).

macros_df = pd.read_csv('https://raw.githubusercontent.com/f-imp/Principal-Component-Analysis-PCA-over-3-datasets/refs/heads/master/datasets/Pizza.csv')
macros_df.head()
brand id mois prot fat ash sodium carb cal
0 A 14069 27.82 21.43 44.87 5.11 1.77 0.77 4.93
1 A 14053 28.49 21.26 43.89 5.34 1.79 1.02 4.84
2 A 14025 28.35 19.99 45.78 5.08 1.63 0.80 4.95
3 A 14016 30.55 20.15 43.13 4.79 1.61 1.38 4.74
4 A 14005 30.49 21.28 41.65 4.82 1.64 1.76 4.67
features_to_keep = ['mois', 'prot', 'fat', 'ash', 'sodium', 'carb', 'cal']

macros_df = macros_df[features_to_keep]   # keep the nutrient columns; drop 'brand' and 'id'
macros_df
mois prot fat ash sodium carb cal
0 27.82 21.43 44.87 5.11 1.77 0.77 4.93
1 28.49 21.26 43.89 5.34 1.79 1.02 4.84
2 28.35 19.99 45.78 5.08 1.63 0.80 4.95
3 30.55 20.15 43.13 4.79 1.61 1.38 4.74
4 30.49 21.28 41.65 4.82 1.64 1.76 4.67
... ... ... ... ... ... ... ...
295 44.91 11.07 17.00 2.49 0.66 25.36 2.91
296 43.15 11.79 18.46 2.43 0.67 24.17 3.10
297 44.55 11.01 16.03 2.43 0.64 25.98 2.92
298 47.60 10.43 15.18 2.32 0.56 24.47 2.76
299 46.84 9.91 15.50 2.27 0.57 25.48 2.81

300 rows × 7 columns

pca = PCA(n_components = 6)
macros_pca = pca.fit_transform(macros_df)    # note: unscaled data

pca_df = pd.DataFrame(data = pca.components_, columns = macros_df.columns)
pca_df
mois prot fat ash sodium carb cal
0 -0.276963 -0.266941 -0.278934 -0.055434 -0.011142 0.878084 -0.000603
1 0.747074 -0.055733 -0.657845 -0.040604 -0.023814 0.006818 -0.061254
2 -0.352016 0.809718 -0.467976 0.022225 -0.026245 -0.012469 -0.010062
3 -0.195900 -0.255747 -0.259802 0.871443 0.201453 -0.164525 -0.040678
4 0.059475 0.083719 0.035776 -0.166634 0.978316 0.057470 0.001497
5 0.440974 0.443490 0.448624 0.450220 -0.030463 0.444405 -0.080452
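
On unscaled data, the loadings are dominated by whichever features have the largest raw variance: carb dominates PC1, and mois and fat dominate PC2. The per-column variances make the reason plain (a quick check, not part of the original analysis):

# Raw variance of each nutrient column; unscaled PCA is driven by
# the columns with the largest spread.
macros_df.var().sort_values(ascending = False)
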
feature_names = list(macros_df.columns)
ss = StandardScaler()

X = ss.fit_transform(macros_df)
pca, fig, ax = plotPCA(X, feature_names = feature_names)
plt.show()
../_images/8a6267d80f5c38b85fe9636759f341c3737b4f333735609bb9c9fd0ce5b77e7c.png

8.3. Example 2: Automobile specs#

The automobile dataset mixes numeric specifications with categorical descriptors (make, body style, fuel type, and so on). We run PCA on the numeric spec columns and use the categorical columns only to identify the sample being reconstructed.

cars_df = pd.read_csv('https://raw.githubusercontent.com/shreyamdg/automobile-data-set-analysis/refs/heads/master/cars.csv')
cars_df.head()
Unnamed: 0 symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location ... engine-size fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg price
0 0 3 ? alfa-romero gas std two convertible rwd front ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 13495
1 1 3 ? alfa-romero gas std two convertible rwd front ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 16500
2 2 1 ? alfa-romero gas std two hatchback rwd front ... 152 mpfi 2.68 3.47 9.0 154 5000 19 26 16500
3 3 2 164 audi gas std four sedan fwd front ... 109 mpfi 3.19 3.40 10.0 102 5500 24 30 13950
4 4 2 164 audi gas std four sedan 4wd front ... 136 mpfi 3.19 3.40 8.0 115 5500 18 22 17450

5 rows × 27 columns

# keep the numeric spec columns; drop the row index and the insurance-risk rating.
# Columns containing '?' placeholders (e.g. 'normalized-losses') were parsed as
# strings and are excluded automatically by select_dtypes.
specs_df = cars_df.select_dtypes(include = 'number').drop(columns = ['Unnamed: 0', 'symboling'])
names_df = cars_df[['make',  'num-of-doors', 'body-style', 'fuel-type']]
idx = 7

ss = StandardScaler()
X = ss.fit_transform(specs_df)
feature_names = list(specs_df.columns)

display(names_df.iloc[[idx]])

plotPCA(X, num_components = 5, feature_names = feature_names, idx = idx)
plt.show()
make num-of-doors body-style fuel-type
7 audi four wagon gas
../_images/2cb6253dd18c08a8a8cad377647ec6df2308e10fa854d967b2b68114c9ec7247.png
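
Tying back to the opening observations: reconstruction from a truncated set of PCs is only approximate, and the error shrinks as components are added. A minimal sketch on the scaled car specs, reusing the X defined above (full_pca, X_hat, and k are illustrative names, not objects from the cells above):

# Mean squared reconstruction error of the scaled specs as a function
# of the number of retained principal components.
full_pca = PCA().fit(X)
Z = full_pca.transform(X)
for k in range(1, full_pca.n_components_ + 1):
    X_hat = full_pca.mean_ + Z[:, :k] @ full_pca.components_[:k, :]
    print(f'k = {k:2d}: MSE = {np.mean((X - X_hat)**2):.4f}')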