PCA: Examples and Observations#

Principle Component Analysis (PCA) is one of the more complex concepts in data science. In this notebook, we look at some examples and make some general observations as to the (beneficial) effects and uses of PCA.

Working through the examples, observe:

  • Features that vary together appear in the same Principle Components. As a result, PCA solves the co-linearity problem.

  • You can approximately reconstruct the original feature values with fewer than all the PCs.

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
Hide code cell content
def plotPCA(data, pca=None, num_components = None, idx = 0, feature_names = None):

    if pca is None:
        if num_components is None:
            pca = PCA()
        else:
            pca = PCA(n_components = num_components)
    
    X = pca.fit_transform(data)

    if num_components is None:
        num_components = pca.n_components_
    
    
    if isinstance(data, pd.DataFrame):
        x0 = data.iloc[idx,:]
    else:
        x0 = data[idx,:]
        
    x_pca = np.zeros_like(x0)
    
    coeffs = X[idx,:]
    
    if feature_names is None:
        try:
            feature_names = pca.feature_names_in_
        except:
            feature_names = [str(i) for i in range(data.shape[1])]

    C0 = 'goldenrod'
    C1 = 'dodgerblue'
    alpha = 0.6
    

    offset = 2
    num_cols = num_components + offset
    width_ratios = [1, 0.2] + [1]*num_components
        
    fig, ax = plt.subplots(2, num_cols, figsize = ((num_cols)*2, 8), 
                        sharex = True, sharey = False,
                        gridspec_kw = dict(width_ratios=width_ratios,
                                            hspace = 0.1, wspace = 0.1)                       
                        )
    
    if not np.all(np.isclose(pca.mean_, 0)):
        ax[0, 0].bar(feature_names, pca.mean_, color = C1, alpha = alpha)
        ax[0, 0].bar(feature_names, x0, color = C0, alpha = alpha)
        
        ax[0, 0].sharey(ax[1,offset])
    
    ax[0,0].set_title('Mean')

    ax[1, 0].bar(feature_names, x0-pca.mean_, color = C0, alpha = alpha)
    ax[1, 0].bar(feature_names, x_pca, color = C1, alpha = alpha)
    ax[1, 0].set_xlabel('Features - Mean = ', fontsize = 16)
    ax[1, 0].tick_params(axis='x', labelrotation = 90, )
    ax[1, 0].sharey(ax[1,offset])


    # ax[0, 2].set_title(f'PC1')   

    ax[0,1].set_visible(False)
    ax[1,1].set_visible(False)

    for r_idx, (r, coef) in enumerate(zip(pca.components_, coeffs)):
        x_pca += coef * r
        ax[0, r_idx+offset].bar(feature_names, r, color = C1, alpha = alpha)
        ax[0, r_idx+offset].text(0.5, 0.05, f'coef = {coef:0.2f}', ha = 'center', transform=ax[0, r_idx+offset].transAxes)
        
        ax[1, r_idx+offset].bar(feature_names, x0-pca.mean_, color = C0, alpha = alpha)
        ax[1, r_idx+offset].bar(feature_names, x_pca, alpha = alpha, color = C1)
        
        if r_idx == 0:
            ax_str = f'{coef:0.2f} * PC1'
        
        if r_idx>0:
            ax[0, r_idx+offset].sharey(ax[0,offset])
            ax[1, r_idx+offset].sharey(ax[1,offset])
            plt.setp(ax[0,r_idx+offset].get_yticklabels(), visible=False)
            plt.setp(ax[1,r_idx+offset].get_yticklabels(), visible=False)
            
            if coef > 0:
                ax_str = f' + {coef:4.2} * PC{r_idx+1}'
            elif coef < 0:
                ax_str = f' - {-1*coef:4.2} * PC{r_idx+1}'
            else:
                ax_str = f' + 0'
                        
        ax[0, r_idx+offset].set_title(f'PC{r_idx+1}')   
        ax[1, r_idx+offset].set_xlabel(ax_str, fontsize = 16)
        ax[1, r_idx+offset].tick_params(axis='x', labelrotation = 90, )


    ax[0,2].set_ylim([-1, 1])
    
    return pca, fig, ax
    
plt.show()

Example 0: Synthetic data (boxes)#

num_data = 100

boxes_dict = {'L_ft': 5*np.random.rand(num_data),
              'W_ft': 10*np.random.rand(num_data)
              }

boxes_df = pd.DataFrame(boxes_dict)
boxes_df
L_ft W_ft
0 3.356280 8.981393
1 3.524766 3.182874
2 3.135138 0.376884
3 3.293515 1.891925
4 4.125670 6.172301
... ... ...
95 0.732521 8.485788
96 1.144732 1.500132
97 4.091529 5.685457
98 3.962749 8.728399
99 1.952905 7.703050

100 rows × 2 columns

boxes_df[['L_in', 'W_in']] = boxes_df[['L_ft','W_ft']]*12
boxes_df['P_ft'] = 2*(boxes_df['L_ft']+boxes_df['W_ft'])
boxes_df['P_in'] = 2*(boxes_df['L_in']+boxes_df['W_in'])

boxes_df
L_ft W_ft L_in W_in P_ft P_in
0 3.356280 8.981393 40.275358 107.776720 24.675346 296.104156
1 3.524766 3.182874 42.297188 38.194490 13.415280 160.983355
2 3.135138 0.376884 37.621659 4.522603 7.024044 84.288525
3 3.293515 1.891925 39.522179 22.703100 10.370880 124.450559
4 4.125670 6.172301 49.508035 74.067607 20.595940 247.151285
... ... ... ... ... ... ...
95 0.732521 8.485788 8.790257 101.829451 18.436618 221.239418
96 1.144732 1.500132 13.736778 18.001583 5.289727 63.476723
97 4.091529 5.685457 49.098351 68.225486 19.553973 234.647674
98 3.962749 8.728399 47.552993 104.740784 25.382296 304.587553
99 1.952905 7.703050 23.434861 92.436598 19.311910 231.742917

100 rows × 6 columns

pca, fig, ax = plotPCA(boxes_df)
plt.show()
../_images/535c76c2622f6427b540afa8f93074d1ff9225a97104915696dacd43bef81f94.png
ss = StandardScaler()
boxes_scaled = ss.fit_transform(boxes_df)
boxes_scaled_df = pd.DataFrame(boxes_scaled, columns = boxes_df.columns)

pca_scaled, fig, ax = plotPCA(boxes_scaled_df)
plt.show()
../_images/972b12d9c8361c1ca007ef7445d7d4fcc37e4f2f36aa6ad15f6c425ad9177b92.png
plt.plot(pca_scaled.explained_variance_, 'b.')
plt.show()
../_images/aef5e2a748dd0c9fe3fb6e76058c353c7149d960d778dbe89cec50afcfad5bf9.png

Example 1: Macro-nutrients#

macros_df = pd.read_csv('https://raw.githubusercontent.com/f-imp/Principal-Component-Analysis-PCA-over-3-datasets/refs/heads/master/datasets/Pizza.csv')
macros_df.head()
brand id mois prot fat ash sodium carb cal
0 A 14069 27.82 21.43 44.87 5.11 1.77 0.77 4.93
1 A 14053 28.49 21.26 43.89 5.34 1.79 1.02 4.84
2 A 14025 28.35 19.99 45.78 5.08 1.63 0.80 4.95
3 A 14016 30.55 20.15 43.13 4.79 1.61 1.38 4.74
4 A 14005 30.49 21.28 41.65 4.82 1.64 1.76 4.67
features_to_keep = ['mois', 'prot', 'fat', 'ash', 'sodium', 'carb', 'cal']

macros_df = macros_df[features_to_keep]
macros_df
mois prot fat ash sodium carb cal
0 27.82 21.43 44.87 5.11 1.77 0.77 4.93
1 28.49 21.26 43.89 5.34 1.79 1.02 4.84
2 28.35 19.99 45.78 5.08 1.63 0.80 4.95
3 30.55 20.15 43.13 4.79 1.61 1.38 4.74
4 30.49 21.28 41.65 4.82 1.64 1.76 4.67
... ... ... ... ... ... ... ...
295 44.91 11.07 17.00 2.49 0.66 25.36 2.91
296 43.15 11.79 18.46 2.43 0.67 24.17 3.10
297 44.55 11.01 16.03 2.43 0.64 25.98 2.92
298 47.60 10.43 15.18 2.32 0.56 24.47 2.76
299 46.84 9.91 15.50 2.27 0.57 25.48 2.81

300 rows × 7 columns

pca = PCA(n_components = 6)
macros_pca = pca.fit_transform(macros_df)

pca_df = pd.DataFrame(data = pca.components_, columns = macros_df.columns)
pca_df
/Users/eatai/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/sklearn/decomposition/_base.py:148: RuntimeWarning: divide by zero encountered in matmul
  X_transformed = X @ self.components_.T
/Users/eatai/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/sklearn/decomposition/_base.py:148: RuntimeWarning: overflow encountered in matmul
  X_transformed = X @ self.components_.T
/Users/eatai/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/sklearn/decomposition/_base.py:148: RuntimeWarning: invalid value encountered in matmul
  X_transformed = X @ self.components_.T
mois prot fat ash sodium carb cal
0 -0.276963 -0.266941 -0.278934 -0.055434 -0.011142 0.878084 -0.000603
1 0.747074 -0.055733 -0.657845 -0.040604 -0.023814 0.006818 -0.061254
2 -0.352016 0.809718 -0.467976 0.022225 -0.026245 -0.012469 -0.010062
3 -0.195900 -0.255747 -0.259802 0.871443 0.201453 -0.164525 -0.040678
4 0.059475 0.083719 0.035776 -0.166634 0.978316 0.057470 0.001497
5 0.440974 0.443490 0.448624 0.450220 -0.030463 0.444405 -0.080452
feature_names = list(macros_df.columns)
ss = StandardScaler()

X = ss.fit_transform(macros_df)
fig, ax = plotPCA(X, feature_names = feature_names)
/Users/eatai/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/sklearn/decomposition/_base.py:148: RuntimeWarning: divide by zero encountered in matmul
  X_transformed = X @ self.components_.T
/Users/eatai/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/sklearn/decomposition/_base.py:148: RuntimeWarning: overflow encountered in matmul
  X_transformed = X @ self.components_.T
/Users/eatai/.pyenv/versions/3.13.1/envs/datascience/lib/python3.13/site-packages/sklearn/decomposition/_base.py:148: RuntimeWarning: invalid value encountered in matmul
  X_transformed = X @ self.components_.T
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[11], line 5
      2 ss = StandardScaler()
      4 X = ss.fit_transform(macros_df)
----> 5 fig, ax = plotPCA(X, feature_names = feature_names)

ValueError: too many values to unpack (expected 2)
../_images/28e2d78781f8d81821676529151ae968a5ee9deac89d1c20f73ef7b4d8468681.png

Example 2: Automobile specs#

cars_df = pd.read_csv('https://raw.githubusercontent.com/shreyamdg/automobile-data-set-analysis/refs/heads/master/cars.csv')
cars_df.head()
Unnamed: 0 symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location ... engine-size fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg price
0 0 3 ? alfa-romero gas std two convertible rwd front ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 13495
1 1 3 ? alfa-romero gas std two convertible rwd front ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 16500
2 2 1 ? alfa-romero gas std two hatchback rwd front ... 152 mpfi 2.68 3.47 9.0 154 5000 19 26 16500
3 3 2 164 audi gas std four sedan fwd front ... 109 mpfi 3.19 3.40 10.0 102 5500 24 30 13950
4 4 2 164 audi gas std four sedan 4wd front ... 136 mpfi 3.19 3.40 8.0 115 5500 18 22 17450

5 rows × 27 columns

specs_df = cars_df.select_dtypes(include = 'number').drop(columns = ['Unnamed: 0', 'symboling'])
names_df = cars_df[['make',  'num-of-doors', 'body-style', 'fuel-type']]
idx = 7

ss = StandardScaler()
X = ss.fit_transform(specs_df)
feature_names = list(specs_df.columns)

display(names_df.iloc[[7]])

plotPCA(X, num_components = 5, feature_names = feature_names, idx = idx)
plt.show()
make num-of-doors body-style fuel-type
7 audi four wagon gas
../_images/2cb6253dd18c08a8a8cad377647ec6df2308e10fa854d967b2b68114c9ec7247.png