Deovam Singh - Fab Futures - Data Science

My understanding of PCA

  • My dataset of daily NIFTY 50 data for the last 18 years has 16 parameters (features) such as close, high, low, ema12, ema26, MACD, RSI, etc.
  • PCA will tell me which combinations of these parameters capture most of the variance in the data, and therefore which parameters carry the most information and which add little that the others don't already cover.
  • A principal component can be made up of multiple parameters at once. (Prof Niel) A minimal toy sketch of this idea follows below.
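As a minimal sketch of that idea (using made-up numbers, not the NIFTY data), two strongly correlated features collapse onto a single principal component that is a weighted mix of both:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Two made-up, strongly correlated features (a toy "high" and "low" price series)
rng = np.random.default_rng(0)
high = 100 + np.cumsum(rng.normal(0, 1, 200))
low = high - rng.uniform(0.5, 1.5, 200)
X_toy = np.column_stack([high, low])

# Standardize, then fit PCA keeping both components
X_toy_scaled = StandardScaler().fit_transform(X_toy)
pca_toy = PCA().fit(X_toy_scaled)

print(pca_toy.explained_variance_ratio_)  # PC 1 should carry ~99% of the variance
print(pca_toy.components_[0])             # PC 1 is a roughly equal mix of both features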
In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# Load the data
df = pd.read_csv("datasets/NIFTY_50.csv")

# --- 1. Data Selection and Cleaning ---

# Select all relevant numerical columns
feature_cols = [
    'Adj Close', 'High', 'Low', 'Open', 'Volume', 
    'SMA_20', 'SMA_50', 'EMA_12', 'EMA_26', 'MACD', 
    'Signal_Line', 'RSI_14', 'BB_Mid', 'BB_Upper', 
    'BB_Lower', 'Daily_Return_%' # Total 16 features
]

data_pca = df[feature_cols].copy()

# Drop rows where any technical indicator value is missing (NaN)
data_pca.dropna(inplace=True)
X = data_pca.values

# --- 2. Standardization (Crucial Step for PCA) ---
# Standardize the features so they all contribute equally to variance calculation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 3. Perform PCA ---
# Initialize PCA to find all possible components (up to 16)
pca = PCA()
pca.fit(X_scaled)

# Get the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_

# --- 4. Generate Scree Plot ---
plt.figure(figsize=(10, 6))

# Plot the cumulative explained variance
plt.plot(range(1, len(explained_variance_ratio) + 1), 
         np.cumsum(explained_variance_ratio), 
         marker='o', linestyle='--', color='b')

# Add a reference line for 90% cumulative variance
plt.axhline(y=0.90, color='r', linestyle='-', label='90% Variance Explained')

plt.title('Scree Plot: Explained Variance by Principal Component')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.xticks(range(1, len(explained_variance_ratio) + 1))
plt.grid(True, alpha=0.5)
plt.legend()
plt.show()
[Figure: scree plot of cumulative explained variance ratio vs. number of principal components, with a 90% reference line]
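To read the same threshold off programmatically instead of eyeballing the plot, a short follow-up (reusing explained_variance_ratio from the cell above) could be:

# Smallest number of components whose cumulative variance crosses 90%
cumulative = np.cumsum(explained_variance_ratio)
n_components_90 = int(np.argmax(cumulative >= 0.90)) + 1
print(f"Components needed for >= 90% variance: {n_components_90}")
print(f"Variance explained by the first 3 components: {cumulative[2]:.2%}")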
In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

# Load the data and select features
df = pd.read_csv("datasets/NIFTY_50.csv")
feature_cols = [
    'Adj Close', 'High', 'Low', 'Open', 'Volume', 
    'SMA_20', 'SMA_50', 'EMA_12', 'EMA_26', 'MACD', 
    'Signal_Line', 'RSI_14', 'BB_Mid', 'BB_Upper', 
    'BB_Lower', 'Daily_Return_%'
]
data_pca = df[feature_cols].copy().dropna()
X = data_pca.values

# Standardize the Data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform PCA for 3 components (92% variance explained)
pca = PCA(n_components=3)
pca.fit(X_scaled)

# Get the Loadings (Weightage)
loadings = pca.components_.T

# Create a DataFrame for clear presentation
loadings_df = pd.DataFrame(
    loadings,
    columns=['PC 1', 'PC 2', 'PC 3'],
    index=feature_cols
)

# Print the sorted table (sorted by PC 1 influence)
print(loadings_df.reindex(loadings_df['PC 1'].abs().sort_values(ascending=False).index).to_string())
                    PC 1      PC 2      PC 3
High            0.296044 -0.018261  0.008133
Open            0.296035 -0.017554  0.002917
Adj Close       0.296028 -0.016145  0.013952
Low             0.296013 -0.015497  0.009018
EMA_12          0.295897 -0.029155  0.002328
SMA_20          0.295619 -0.040232  0.002391
BB_Mid          0.295619 -0.040232  0.002391
EMA_26          0.295545 -0.042810  0.005492
BB_Lower        0.295461 -0.035592 -0.000814
BB_Upper        0.295419 -0.044631  0.005459
SMA_50          0.294672 -0.063848  0.013763
Volume          0.156968 -0.021204  0.010504
Signal_Line     0.082216  0.571160 -0.221360
MACD            0.077616  0.614379 -0.143270
RSI_14          0.030878  0.517781  0.208537
Daily_Return_%  0.002132  0.115629  0.941413
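These loadings are the weights applied to the standardized features, so each PC score is just a dot product of a row of X_scaled with the corresponding loading vector. A quick sanity check (a sketch reusing pca, X_scaled and loadings from the cell above):

# PC scores from sklearn vs. a manual weighted sum of the standardized features
scores_sklearn = pca.transform(X_scaled)            # shape: (n_rows, 3)
scores_manual = X_scaled @ loadings                 # dot product with the loading matrix
print(np.allclose(scores_sklearn, scores_manual))   # True (up to floating-point tolerance)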
In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

# --- 1. Data Setup ---
# Load the data and define the features used for PCA
df = pd.read_csv("datasets/NIFTY_50.csv")
feature_cols = [
    'Adj Close', 'High', 'Low', 'Open', 'Volume', 
    'SMA_20', 'SMA_50', 'EMA_12', 'EMA_26', 'MACD', 
    'Signal_Line', 'RSI_14', 'BB_Mid', 'BB_Upper', 
    'BB_Lower', 'Daily_Return_%'
]
data_pca = df[feature_cols].copy().dropna()
X = data_pca.values

# Scale the data (Standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 2. Perform PCA ---
# Set n_components=3 to focus on the most important components
N_COMPONENTS = 3 
pca = PCA(n_components=N_COMPONENTS)
pca.fit(X_scaled)

# --- 3. Extract and Print Component Composition (Loadings) ---

# Get the Loadings: pca.components_ has shape (n_components, n_features), one eigenvector per row;
# transpose it so each row is a feature and each column a component
loadings = pca.components_.T

# Create the column names for the output table
component_names = [f'PC {i+1}' for i in range(N_COMPONENTS)]

# Create a DataFrame to map the loadings to the original feature names
loadings_df = pd.DataFrame(
    loadings,
    columns=component_names,
    index=feature_cols # The original feature names
)

# Sort the DataFrame by the absolute value of PC 1 loading for better readability
loadings_df_sorted = loadings_df.reindex(
    loadings_df['PC 1'].abs().sort_values(ascending=False).index
)

# Print the final result
print("\n--- Component Composition (Loadings / Weightage) ---\n")
print(loadings_df_sorted.to_string())
--- Component Composition (Loadings / Weightage) ---

                    PC 1      PC 2      PC 3
High            0.296044 -0.018261  0.008133
Open            0.296035 -0.017554  0.002917
Adj Close       0.296028 -0.016145  0.013952
Low             0.296013 -0.015497  0.009018
EMA_12          0.295897 -0.029155  0.002328
SMA_20          0.295619 -0.040232  0.002391
BB_Mid          0.295619 -0.040232  0.002391
EMA_26          0.295545 -0.042810  0.005492
BB_Lower        0.295461 -0.035592 -0.000814
BB_Upper        0.295419 -0.044631  0.005459
SMA_50          0.294672 -0.063848  0.013763
Volume          0.156968 -0.021204  0.010504
Signal_Line     0.082216  0.571160 -0.221360
MACD            0.077616  0.614379 -0.143270
RSI_14          0.030878  0.517781  0.208537
Daily_Return_%  0.002132  0.115629  0.941413
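The three retained components can also be used as new features by projecting the standardized data onto them; a short sketch (reusing pca, X_scaled, component_names and data_pca from the cell above):

# Project each trading day onto the 3 principal components ("scores")
pc_scores = pd.DataFrame(
    pca.transform(X_scaled),
    columns=component_names,
    index=data_pca.index
)
print(pc_scores.head())
print(pc_scores.std())  # the spread shrinks from PC 1 to PC 3, matching the explained variance order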
In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# --- Data Preparation ---
df = pd.read_csv("datasets/NIFTY_50.csv")
feature_cols = [
    'Adj Close', 'High', 'Low', 'Open', 'Volume', 
    'SMA_20', 'SMA_50', 'EMA_12', 'EMA_26', 'MACD', 
    'Signal_Line', 'RSI_14', 'BB_Mid', 'BB_Upper', 
    'BB_Lower', 'Daily_Return_%'
]

df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)
data_pca = df[feature_cols].copy().dropna()
X = data_pca.values

adj_close_aligned = data_pca['Adj Close']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- PCA and Transformation ---
pca_1 = PCA(n_components=1)
X_pca_1 = pca_1.fit_transform(X_scaled)
pc1_scores = pd.Series(X_pca_1.flatten(), index=data_pca.index)

# --- Plotting PC 1 Scores ---
plt.figure(figsize=(12, 6))
plt.plot(pc1_scores.index, pc1_scores.values, label='PC 1 Scores', color='navy')
plt.title('Time Series Analysis of Principal Component 1 (PC 1 Scores)')
plt.xlabel('Date')
plt.ylabel('PC 1 Score (Standardized Units)')
plt.grid(True, alpha=0.4)
plt.legend()
plt.show()

# --- Correlation Calculation ---
correlation = pc1_scores.corr(adj_close_aligned)
print(f"Correlation between PC 1 Scores and Adjusted Close Price: {correlation:.4f}")
[Figure: time series plot of PC 1 scores by date]
Correlation between PC 1 Scores and Adjusted Close Price: 0.9986
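Since PC 1 is almost perfectly correlated with the adjusted close, it behaves like a "price level / trend" factor. A quick way to check this against every input feature (a sketch reusing pc1_scores and data_pca from the cell above):

# Correlation of the PC 1 scores with each original feature
pc1_vs_features = data_pca.corrwith(pc1_scores).sort_values(ascending=False)
print(pc1_vs_features.to_string())
# Price-level features (Open, High, Low, Adj Close, the moving averages) should sit near 1.0,
# while MACD, RSI_14 and Daily_Return_% should correlate far less, mirroring the PC 1 loadings.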