Chandra B. Pradhan - Fab Futures - Data Science

Week 4: Machine Learning

Scikit-learn
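The cell below fits a degree-3 polynomial to noisy synthetic data using a scikit-learn pipeline (PolynomialFeatures followed by LinearRegression), plots the fitted curve against the original points, and prints the learned coefficients and intercept.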

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# -----------------------------
# 1. Create Sample Data (Simulated Data)
# -----------------------------
# Replace this section with your file loading (e.g., df = pd.read_csv("your_data.csv"))
np.random.seed(42)
X = np.linspace(-3, 3, 50)
true_Y = 0.5 * X**3 - 2 * X**2 + 5 * X + 10
Y = true_Y + np.random.normal(0, 5, 50)
df = pd.DataFrame({'Input_X': X, 'Output_Y': Y})

# -----------------------------
# 2. Define Parameters
# -----------------------------
X_COLUMN = 'Input_X'
Y_COLUMN = 'Output_Y'
POLYNOMIAL_DEGREE = 3 # Set the degree of the polynomial

# -----------------------------
# 3. Prepare Data for Scikit-learn
# -----------------------------
# Scikit-learn requires the input X to be a 2D array: (n_samples, n_features)
X_data = df[X_COLUMN].values.reshape(-1, 1) 
Y_data = df[Y_COLUMN].values

# -----------------------------
# 4. Create and Fit the Scikit-learn Model
# -----------------------------

# Create a pipeline: 
# 1. PolynomialFeatures expands X into [X, X^2, X^3, ...]
# 2. LinearRegression fits a linear model to these expanded features.
model = make_pipeline(PolynomialFeatures(degree=POLYNOMIAL_DEGREE, include_bias=False),
                      LinearRegression())

# Fit the model to the data
model.fit(X_data, Y_data)

# -----------------------------
# 5. Generate Predictions for Plotting
# -----------------------------
# Create 100 evenly spaced points for a smooth fitted curve
X_fit = np.linspace(X_data.min(), X_data.max(), 100).reshape(-1, 1) 
Y_pred = model.predict(X_fit)

# -----------------------------
# 6. Visualization (Plotting)
# -----------------------------
plt.figure(figsize=(10, 6))

# Plot the original data points
plt.scatter(X_data, Y_data, label='Original Data', color='darkblue', alpha=0.6)

# Plot the fitted polynomial curve
plt.plot(X_fit, Y_pred, label=f'Scikit-learn Polynomial Fit (Degree {POLYNOMIAL_DEGREE})', color='red', linewidth=2)

plt.title(f'Polynomial Regression using scikit-learn (Degree {POLYNOMIAL_DEGREE})')
plt.xlabel(X_COLUMN)
plt.ylabel(Y_COLUMN)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

# Print coefficients
linear_model = model.named_steps['linearregression']
# Coefficients are returned in order of increasing power: x^1, x^2, x^3, ...
print(f"Scikit-learn Coefficients (for x^1, x^2, x^3...): {linear_model.coef_}")
print(f"Scikit-learn Intercept (Constant Term): {linear_model.intercept_}")
[Figure: scatter plot of the original data with the fitted degree-3 polynomial curve overlaid]
Scikit-learn Coefficients (for x^1, x^2, x^3...): [ 5.68361597 -1.70256003  0.29228664]
Scikit-learn Intercept (Constant Term): 7.943889342887482
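To quantify how well the fitted curve matches the data, one option is to report R² and the mean squared error on the training points. A minimal sketch, assuming the fitted model, X_data, and Y_data from the cell above are still in scope:

In [ ]:
from sklearn.metrics import mean_squared_error

# R^2 of the pipeline on the training data (the default score for regressors)
r2 = model.score(X_data, Y_data)

# Mean squared error between the observed Y and the model's predictions
mse = mean_squared_error(Y_data, model.predict(X_data))

print(f"R^2 on training data: {r2:.3f}")
print(f"Training MSE:         {mse:.3f}")

Note that both numbers are computed on the same points used for fitting, so they describe fit quality rather than predictive performance on new data.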
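POLYNOMIAL_DEGREE is fixed at 3 above. One way to check whether that choice is reasonable is to compare a few candidate degrees with cross-validation. A rough sketch, again assuming X_data and Y_data are available and reusing the imports from the first cell; the degree range 1-6 and the 5-fold split are arbitrary choices here:

In [ ]:
from sklearn.model_selection import KFold, cross_val_score

# Shuffle the folds because X_data was generated with linspace and is sorted
cv = KFold(n_splits=5, shuffle=True, random_state=0)

for degree in range(1, 7):
    candidate = make_pipeline(PolynomialFeatures(degree=degree, include_bias=False),
                              LinearRegression())
    scores = cross_val_score(candidate, X_data, Y_data, cv=cv, scoring='r2')
    print(f"degree {degree}: mean CV R^2 = {scores.mean():.3f} (+/- {scores.std():.3f})")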