My understanding of PCA
- My dataset of daily NIFTY 50 returns for the last 18 years has 16 parameters, such as close, high, low, EMA 12, EMA 26, MACD, and RSI.
- PCA will tell me which directions in the data carry the most variance; from the loadings I can see which parameters dominate each component and which contribute so little that they are not really needed.
- A Principal Component can combine multiple parameters: each component is a weighted sum of all the original features, as the sketch below illustrates. (Prof Niel)
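To make that last point concrete, here is a minimal sketch (on made-up random numbers, not the NIFTY data) showing that a PC score is just the dot product of a standardized observation with that component's loading vector:

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Toy data: 5 observations of 3 hypothetical features
rng = np.random.default_rng(0)
X_toy = StandardScaler().fit_transform(rng.normal(size=(5, 3)))

pca_toy = PCA().fit(X_toy)
# PC 1 score of observation 0 = weighted sum of its standardized features
manual = X_toy[0] @ pca_toy.components_[0]
print(np.isclose(manual, pca_toy.transform(X_toy)[0, 0]))  # True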
In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
# Load the data
df = pd.read_csv("datasets/NIFTY_50.csv")
# --- 1. Data Selection and Cleaning ---
# Select all relevant numerical columns
feature_cols = [
    'Adj Close', 'High', 'Low', 'Open', 'Volume',
    'SMA_20', 'SMA_50', 'EMA_12', 'EMA_26', 'MACD',
    'Signal_Line', 'RSI_14', 'BB_Mid', 'BB_Upper',
    'BB_Lower', 'Daily_Return_%'  # Total 16 features
]
data_pca = df[feature_cols].copy()
# Drop rows where any technical indicator value is missing (NaN)
data_pca.dropna(inplace=True)
X = data_pca.values
# --- 2. Standardization (Crucial Step for PCA) ---
# Standardize the features so they all contribute equally to variance calculation
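# Without scaling, features with large numeric ranges (e.g. Volume) would
# dominate the covariance matrix and hence the leading components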
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# --- 3. Perform PCA ---
# Initialize PCA to find all possible components (up to 16)
pca = PCA()
pca.fit(X_scaled)
# Get the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
# --- 4. Generate Scree Plot ---
plt.figure(figsize=(10, 6))
# Plot the cumulative explained variance
plt.plot(range(1, len(explained_variance_ratio) + 1),
         np.cumsum(explained_variance_ratio),
         marker='o', linestyle='--', color='b')
# Add a reference line for 90% cumulative variance
plt.axhline(y=0.90, color='r', linestyle='-', label='90% Variance Explained')
plt.title('Scree Plot: Explained Variance by Principal Component')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.xticks(range(1, len(explained_variance_ratio) + 1))
plt.grid(True, alpha=0.5)
plt.legend()
plt.show()
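Rather than reading the cutoff off the scree plot by eye, scikit-learn can also pick the component count automatically: passing a float between 0 and 1 as n_components keeps the smallest number of components whose cumulative explained variance reaches that fraction. A minimal sketch, reusing X_scaled from the cell above:

# Keep the fewest components that explain at least 90% of the variance
pca_90 = PCA(n_components=0.90)
X_reduced = pca_90.fit_transform(X_scaled)
print(f"Components kept: {pca_90.n_components_}")
print(f"Variance explained: {pca_90.explained_variance_ratio_.sum():.4f}")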
In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
# --- 1. Data Setup ---
# Load the data and define the features used for PCA
df = pd.read_csv("datasets/NIFTY_50.csv")
feature_cols = [
    'Adj Close', 'High', 'Low', 'Open', 'Volume',
    'SMA_20', 'SMA_50', 'EMA_12', 'EMA_26', 'MACD',
    'Signal_Line', 'RSI_14', 'BB_Mid', 'BB_Upper',
    'BB_Lower', 'Daily_Return_%'
]
data_pca = df[feature_cols].copy().dropna()
X = data_pca.values
# Scale the data (Standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# --- 2. Perform PCA ---
# Set n_components=3: the first three components explain ~92% of the variance (see the scree plot above)
N_COMPONENTS = 3
pca = PCA(n_components=N_COMPONENTS)
pca.fit(X_scaled)
# --- 3. Extract and Print Component Composition (Loadings) ---
# Get the loadings: pca.components_ has shape (n_components, n_features),
# so transpose it to get one row per original feature
loadings = pca.components_.T
# Create the column names for the output table
component_names = [f'PC {i+1}' for i in range(N_COMPONENTS)]
# Create a DataFrame to map the loadings to the original feature names
loadings_df = pd.DataFrame(
    loadings,
    columns=component_names,
    index=feature_cols  # the original feature names
)
# Sort the DataFrame by the absolute value of PC 1 loading for better readability
loadings_df_sorted = loadings_df.reindex(
    loadings_df['PC 1'].abs().sort_values(ascending=False).index
)
# Print the final result
print("\n--- Component Composition (Loadings / Weightage) ---\n")
print(loadings_df_sorted.to_string())
--- Component Composition (Loadings / Weightage) ---
PC 1 PC 2 PC 3
High 0.296044 -0.018261 0.008133
Open 0.296035 -0.017554 0.002917
Adj Close 0.296028 -0.016145 0.013952
Low 0.296013 -0.015497 0.009018
EMA_12 0.295897 -0.029155 0.002328
SMA_20 0.295619 -0.040232 0.002391
BB_Mid 0.295619 -0.040232 0.002391
EMA_26 0.295545 -0.042810 0.005492
BB_Lower 0.295461 -0.035592 -0.000814
BB_Upper 0.295419 -0.044631 0.005459
SMA_50 0.294672 -0.063848 0.013763
Volume 0.156968 -0.021204 0.010504
Signal_Line 0.082216 0.571160 -0.221360
MACD 0.077616 0.614379 -0.143270
RSI_14 0.030878 0.517781 0.208537
Daily_Return_% 0.002132 0.115629 0.941413
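Reading the table: PC 1 loads almost uniformly (about 0.296) on the price-level and trend features, PC 2 on the momentum indicators (MACD, Signal_Line, RSI_14), and PC 3 almost entirely on Daily_Return_%. A quick sanity check, reusing the fitted pca object above, is that the loading vectors are orthonormal, as PCA axes must be:

# Each row of pca.components_ is a unit-length principal axis,
# and distinct axes are mutually orthogonal
gram = pca.components_ @ pca.components_.T  # 3 x 3
print(np.allclose(gram, np.eye(N_COMPONENTS)))  # True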
In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
# --- Data Preparation ---
df = pd.read_csv("datasets/NIFTY_50.csv")
feature_cols = [
    'Adj Close', 'High', 'Low', 'Open', 'Volume',
    'SMA_20', 'SMA_50', 'EMA_12', 'EMA_26', 'MACD',
    'Signal_Line', 'RSI_14', 'BB_Mid', 'BB_Upper',
    'BB_Lower', 'Daily_Return_%'
]
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)
data_pca = df[feature_cols].copy().dropna()
X = data_pca.values
adj_close_aligned = data_pca['Adj Close']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# --- PCA and Transformation ---
pca_1 = PCA(n_components=1)
X_pca_1 = pca_1.fit_transform(X_scaled)
pc1_scores = pd.Series(X_pca_1.flatten(), index=data_pca.index)
# --- Plotting PC 1 Scores ---
plt.figure(figsize=(12, 6))
plt.plot(pc1_scores.index, pc1_scores.values, label='PC 1 Scores', color='navy')
plt.title('Time Series Analysis of Principal Component 1 (PC 1 Scores)')
plt.xlabel('Date')
plt.ylabel('PC 1 Score (Standardized Units)')
plt.grid(True, alpha=0.4)
plt.legend()
plt.show()
# --- Correlation Calculation ---
correlation = pc1_scores.corr(adj_close_aligned)
print(f"Correlation between PC 1 Scores and Adjusted Close Price: {correlation:.4f}")
Correlation between PC 1 Scores and Adjusted Close Price: 0.9986
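A correlation of ~0.999 says PC 1 essentially tracks the price level itself, which is consistent with its near-uniform loadings on the price and moving-average features. The same comparison can be extended to all three components; a minimal sketch, reusing X_scaled, data_pca and adj_close_aligned from the cell above (note that the sign of any PC is arbitrary, so only the magnitude of each correlation is meaningful):

# Correlate each of the first three PC score series with Adj Close
pca_3 = PCA(n_components=3)
scores = pd.DataFrame(pca_3.fit_transform(X_scaled),
                      index=data_pca.index,
                      columns=['PC 1', 'PC 2', 'PC 3'])
for pc in scores.columns:
    print(f"{pc} vs Adj Close: {scores[pc].corr(adj_close_aligned):+.4f}")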