Machine Learning¶
✅ What is Machine Learning?
Machine learning (ML) is a field of artificial intelligence in which computers learn patterns from data and use those patterns to make decisions or predictions without being explicitly programmed. Popular Python libraries for ML include:
- scikit-learn
- TensorFlow
- PyTorch
- NumPy / Pandas
- Matplotlib / Seaborn
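As a quick sanity check that these libraries are available, a minimal sketch (assuming they are installed in the current environment) imports each one and prints its version; TensorFlow and PyTorch (`import tensorflow`, `import torch`) can be checked the same way:
In [ ]:
# Minimal environment check (assumes the packages are installed)
import sklearn, numpy, pandas, matplotlib, seaborn

for lib in (sklearn, numpy, pandas, matplotlib, seaborn):
    print(lib.__name__, lib.__version__)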
Simple ML example in Jupyter¶
In [2]:
from sklearn.linear_model import LinearRegression
import numpy as np
# Training data
x = np.array([[1], [2], [3], [4]])
y = np.array([2, 4, 6, 8])
# Create model
model = LinearRegression()
model.fit(x, y)
# Predict
print(model.predict([[5]]))  # Output: [10.]
[10.]
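After fitting, the learned parameters are available on the model itself, which is a quick way to inspect what was learned (for this perfectly linear toy data the slope should be close to 2 and the intercept close to 0):
In [ ]:
# Inspect the fitted parameters: y ≈ coef_ * x + intercept_
print(model.coef_)       # expected: about [2.]
print(model.intercept_)  # expected: about 0.0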
In [1]:
import jax
import jax.numpy as jnp
from jax import random
import pandas as pd
import altair as alt
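Unlike NumPy, JAX handles randomness through explicit, splittable keys: the same key always produces the same numbers, so a key is split before every random call. A minimal sketch of the pattern used in the simulation below:
In [ ]:
# Explicit PRNG handling in JAX: split the key before each random draw
key = random.PRNGKey(42)
key, subkey = random.split(key)
print(random.normal(subkey, (3,)))  # reusing subkey would repeat these numbers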
Imported dataset¶
In [2]:
# Data
data = {
    "Section": ["A", "B", "C"],
    "Total Students": [28, 28, 28],
    "Total Stds Passed": [22, 24, 13],
    "Total Stds Fail": [6, 5, 15],
    "Total Pass %": [78.57, 85.71, 46.43],
    "Total Fail %": [21.43, 17.86, 53.57],
    "Mean Mark": [75.7, 71.7, 57.9],
    "National Mean Mark": [67.09, 75, 67.09]
}
df = pd.DataFrame(data)
df
Out[2]:
| | Section | Total Students | Total Stds Passed | Total Stds Fail | Total Pass % | Total Fail % | Mean Mark | National Mean Mark |
|---|---|---|---|---|---|---|---|---|
| 0 | A | 28 | 22 | 6 | 78.57 | 21.43 | 75.7 | 67.09 |
| 1 | B | 28 | 24 | 5 | 85.71 | 17.86 | 71.7 | 75.00 |
| 2 | C | 28 | 13 | 15 | 46.43 | 53.57 | 57.9 | 67.09 |
In [3]:
key = random.PRNGKey(0)
# Function to simulate student pass/fail
def simulate_pass_fail(key, total_students, total_pass):
    # 1 = Pass, 0 = Fail
    outcomes = jnp.array([1]*total_pass + [0]*(total_students - total_pass))
    shuffled = random.permutation(key, outcomes)
    return shuffled

# Compute pass probability for each section
pass_prob = []
for i, row in df.iterrows():
    key, subkey = random.split(key)
    sim = simulate_pass_fail(subkey, row["Total Students"], row["Total Stds Passed"])
    prob = jnp.mean(sim)  # average = probability of pass
    pass_prob.append(prob)
df["Simulated Pass Prob"] = jnp.array(pass_prob)
df
Out[3]:
| | Section | Total Students | Total Stds Passed | Total Stds Fail | Total Pass % | Total Fail % | Mean Mark | National Mean Mark | Simulated Pass Prob |
|---|---|---|---|---|---|---|---|---|---|
| 0 | A | 28 | 22 | 6 | 78.57 | 21.43 | 75.7 | 67.09 | 0.785714 |
| 1 | B | 28 | 24 | 5 | 85.71 | 17.86 | 71.7 | 75.00 | 0.857143 |
| 2 | C | 28 | 13 | 15 | 46.43 | 53.57 | 57.9 | 67.09 | 0.464286 |
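A caveat: permuting a fixed set of 1s and 0s always averages to exactly the observed pass rate, so the "simulated" probabilities reproduce Total Pass % by construction. To see genuine sampling variability, each student's outcome could instead be drawn from a Bernoulli distribution, for example:
In [ ]:
# Sketch: Bernoulli draws instead of a fixed permutation, so repeated runs
# show sampling variability around the observed pass rate
key = random.PRNGKey(1)
for _, row in df.iterrows():
    key, subkey = random.split(key)
    p = row["Total Stds Passed"] / row["Total Students"]
    draws = random.bernoulli(subkey, p=p, shape=(int(row["Total Students"]),))
    print(row["Section"], float(jnp.mean(draws)))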
In [4]:
# Prepare DataFrame for Altair
df_viz = df.melt(
    id_vars=["Section"],
    value_vars=["Total Pass %", "Total Fail %", "Simulated Pass Prob"],
    var_name="Category",
    value_name="Percentage"
)

# Convert simulated pass probability to % for visualization
df_viz["Percentage"] = df_viz.apply(
    lambda row: row["Percentage"]*100 if row["Category"] == "Simulated Pass Prob" else row["Percentage"],
    axis=1
)

# Plot
chart = alt.Chart(df_viz).mark_bar().encode(
    x=alt.X('Section:N', title='Section'),
    y=alt.Y('Percentage:Q', title='Percentage'),
    color='Category:N',
    tooltip=['Section', 'Category', 'Percentage']
).properties(
    title="Pass % and Fail % by Section (with JAX Simulation)",
    width=500,
    height=300
)
chart
Out[4]:
Regression¶
In [5]:
import pandas as pd
data = {
    "Section": ["A", "B", "C"],
    "Total Students": [28, 28, 28],
    "Total Stds Passed": [22, 24, 13],
    "Total Stds Fail": [6, 5, 15],
    "Total Pass %": [78.57, 85.71, 46.43],
    "Total Fail %": [21.43, 17.86, 53.57],
    "Mean Mark": [75.7, 71.7, 57.9],
    "National Mean Mark": [67.09, 75, 67.09]
}
df = pd.DataFrame(data)
df
Out[5]:
| | Section | Total Students | Total Stds Passed | Total Stds Fail | Total Pass % | Total Fail % | Mean Mark | National Mean Mark |
|---|---|---|---|---|---|---|---|---|
| 0 | A | 28 | 22 | 6 | 78.57 | 21.43 | 75.7 | 67.09 |
| 1 | B | 28 | 24 | 5 | 85.71 | 17.86 | 71.7 | 75.00 |
| 2 | C | 28 | 13 | 15 | 46.43 | 53.57 | 57.9 | 67.09 |
JAX¶
In [6]:
import jax.numpy as jnp
from jax import grad

# Convert data to JAX arrays
x = jnp.array(df["Total Pass %"])
y = jnp.array(df["Mean Mark"])

# Standardize x: with raw percentages (values near 100), a learning rate of
# 0.001 makes gradient descent diverge and the parameters become NaN
x_mean, x_sigma = x.mean(), x.std()
x_scaled = (x - x_mean) / x_sigma

learning_rate = 0.1
epochs = 1000

# Define loss function (Mean Squared Error)
def loss(params, x, y):
    m, c = params
    y_pred = m * x + c
    return jnp.mean((y - y_pred)**2)

# Gradient descent
params = jnp.zeros(2)  # [m, c]
grad_loss = grad(loss)
for _ in range(epochs):
    grads = grad_loss(params, x_scaled, y)
    params = params - learning_rate * grads

# Map the slope and intercept back to the original scale of x
m_s, c_s = params
m = m_s / x_sigma
c = c_s - m_s * x_mean / x_sigma
print(f"Fitted line: y = {m:.2f}x + {c:.2f}")
Fitted line: y = 0.41x + 39.41
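Since this is an ordinary least-squares problem, the result can be cross-checked against the closed-form fit, e.g. with numpy.polyfit; the two should essentially agree:
In [ ]:
import numpy as np
# Closed-form least-squares line for comparison with the gradient-descent fit
print(np.polyfit(np.asarray(x), np.asarray(y), deg=1))  # [slope, intercept]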
In [7]:
import matplotlib.pyplot as plt
y_pred = m * x + c
plt.figure(figsize=(8,5))
plt.scatter(x, y, color='blue', label='Actual Mean Marks')
plt.plot(x, y_pred, color='red', linestyle='--', label='Regression Line')
plt.xlabel('Total Pass %')
plt.ylabel('Mean Mark')
plt.title('Regression: Mean Mark vs Total Pass %')
plt.legend()
plt.show()
In [8]:
import altair as alt

df_viz = df.copy()
# Use plain Python floats so the JAX scalars mix cleanly with pandas
df_viz["Predicted Mean Mark"] = float(m) * df_viz["Total Pass %"] + float(c)

chart = alt.Chart(df_viz).mark_circle(size=100).encode(
    x='Total Pass %',
    y='Mean Mark',
    tooltip=['Section', 'Mean Mark', 'Total Pass %']
)

line = alt.Chart(df_viz).mark_line(color='red').encode(
    x='Total Pass %',
    y='Predicted Mean Mark'
)

chart + line
Out[8]:
Multiple Linear Regression using JAX¶
In [11]:
import pandas as pd
import jax.numpy as jnp
from jax import grad
# Dataset
data = {
    "Section": ["A", "B", "C"],
    "Total Students": [28, 28, 28],
    "Total Stds Passed": [22, 24, 13],
    "Total Stds Fail": [6, 5, 15],
    "Total Pass %": [78.57, 85.71, 46.43],
    "Total Fail %": [21.43, 17.86, 53.57],
    "Mean Mark": [75.7, 71.7, 57.9],
    "National Mean Mark": [67.09, 75, 67.09]
}
df = pd.DataFrame(data)
df
Out[11]:
| | Section | Total Students | Total Stds Passed | Total Stds Fail | Total Pass % | Total Fail % | Mean Mark | National Mean Mark |
|---|---|---|---|---|---|---|---|---|
| 0 | A | 28 | 22 | 6 | 78.57 | 21.43 | 75.7 | 67.09 |
| 1 | B | 28 | 24 | 5 | 85.71 | 17.86 | 71.7 | 75.00 |
| 2 | C | 28 | 13 | 15 | 46.43 | 53.57 | 57.9 | 67.09 |
$y = w_1 x_1 + w_2 x_2 + w_3 x_3 + b$¶
In [13]:
# Features and target
X = jnp.array(df[["Total Pass %", "Total Fail %", "National Mean Mark"]])
y = jnp.array(df["Mean Mark"])

# Initialize weights and bias
w = jnp.zeros(3)
b = 0.0

# NOTE: with unscaled features (values up to ~86), this learning rate is too
# large and gradient descent diverges, so the parameters below come out as NaN
learning_rate = 0.0001
epochs = 10000

# Define Mean Squared Error loss
def loss(params, X, y):
    w, b = params[:3], params[3]
    y_pred = jnp.dot(X, w) + b
    return jnp.mean((y - y_pred)**2)

# Gradient function
grad_loss = grad(loss)

# Combine weights and bias for optimization
params = jnp.append(w, b)

# Gradient Descent
for _ in range(epochs):
    grads = grad_loss(params, X, y)
    params = params - learning_rate * grads

w_opt = params[:3]
b_opt = params[3]
print(f"Optimized weights: {w_opt}")
print(f"Optimized bias: {b_opt}")
Optimized weights: [nan nan nan]
Optimized bias: nan
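The NaNs come from a learning rate that is too large for the unscaled features. A minimal sketch of one fix, standardizing the features before running the same gradient descent (X_mu, X_sd, X_scaled, A, and theta are names introduced here for illustration; grad_loss and y come from the cell above). Because Total Pass % and Total Fail % are almost perfectly collinear, the individual weights are poorly determined, so the two methods below may disagree on the weights even when their predictions agree:
In [ ]:
# Sketch: standardize features so gradient descent converges (no NaN)
X_mu, X_sd = X.mean(axis=0), X.std(axis=0)
X_scaled = (X - X_mu) / X_sd

params = jnp.zeros(4)  # [w1, w2, w3, b]
for _ in range(10000):
    params = params - 0.01 * grad_loss(params, X_scaled, y)
print("Weights (standardized features):", params[:3])
print("Bias:", params[3])

# Cross-check against the closed-form least-squares solution
A = jnp.hstack([X_scaled, jnp.ones((X_scaled.shape[0], 1))])
theta, *_ = jnp.linalg.lstsq(A, y)
print("Closed-form parameters:", theta)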
In [14]:
y_pred = jnp.dot(X, w_opt) + b_opt
df["Predicted Mean Mark"] = y_pred
df
Out[14]:
| | Section | Total Students | Total Stds Passed | Total Stds Fail | Total Pass % | Total Fail % | Mean Mark | National Mean Mark | Predicted Mean Mark |
|---|---|---|---|---|---|---|---|---|---|
| 0 | A | 28 | 22 | 6 | 78.57 | 21.43 | 75.7 | 67.09 | NaN |
| 1 | B | 28 | 24 | 5 | 85.71 | 17.86 | 71.7 | 75.00 | NaN |
| 2 | C | 28 | 13 | 15 | 46.43 | 53.57 | 57.9 | 67.09 | NaN |
In [16]:
import altair as alt
chart = alt.Chart(df).mark_circle(size=100).encode(
    x='Section',
    y='Mean Mark',
    color=alt.value('blue'),
    tooltip=['Section', 'Mean Mark', 'Predicted Mean Mark']
)

predicted_line = alt.Chart(df).mark_line(color='red').encode(
    x='Section',
    y='Predicted Mean Mark'
)
chart + predicted_line
Out[16]:
Activities¶
In [1]:
# ===============================
# Beginner-Friendly ML in Jupyter
# ===============================
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix
# -------------------------------
# 2. Load Dataset
# -------------------------------
# Example dataset for regression: House Prices
# You can replace this with your own CSV file
data = pd.DataFrame({
    'size': [750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200],
    'bedrooms': [2, 2, 3, 3, 3, 3, 4, 4, 4, 4],
    'price': [150000, 160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000],
    'sold': [1, 1, 0, 0, 1, 1, 0, 0, 1, 1]  # Example label for classification
})
# Preview data
print("Dataset:")
display(data.head())
# -------------------------------
# 3. Data Visualization
# -------------------------------
sns.pairplot(data, x_vars=['size', 'bedrooms'], y_vars='price', height=4, kind='scatter')
plt.show()
sns.countplot(x='sold', data=data)
plt.title("Sold Status Count")
plt.show()
# -------------------------------
# 4. Regression: Predict House Price
# -------------------------------
X = data[['size', 'bedrooms']] # Features
y = data['price'] # Target
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train model
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)
# Predict
y_pred = reg_model.predict(X_test)
# Evaluate
mse = mean_squared_error(y_test, y_pred)
print(f"Regression Mean Squared Error: {mse}")
# Plot
plt.scatter(X_test['size'], y_test, color='blue', label='Actual')
plt.scatter(X_test['size'], y_pred, color='red', label='Predicted')
plt.xlabel('Size')
plt.ylabel('Price')
plt.legend()
plt.show()
# -------------------------------
# 5. Classification: Predict Sold Status
# -------------------------------
X_class = data[['size', 'bedrooms']]
y_class = data['sold']
# Split
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_class, y_class, test_size=0.2, random_state=42)
# Train
clf_model = LogisticRegression()
clf_model.fit(X_train_c, y_train_c)
# Predict
y_pred_c = clf_model.predict(X_test_c)
# Evaluate
acc = accuracy_score(y_test_c, y_pred_c)
cm = confusion_matrix(y_test_c, y_pred_c)
print(f"Classification Accuracy: {acc}")
print("Confusion Matrix:")
print(cm)
Dataset:
| | size | bedrooms | price | sold |
|---|---|---|---|---|
| 0 | 750 | 2 | 150000 | 1 |
| 1 | 800 | 2 | 160000 | 1 |
| 2 | 850 | 3 | 170000 | 0 |
| 3 | 900 | 3 | 180000 | 0 |
| 4 | 950 | 3 | 190000 | 1 |
Regression Mean Squared Error: 8.470329472543003e-22
Classification Accuracy: 1.0
Confusion Matrix:
[[2]]
/opt/conda/lib/python3.13/site-packages/sklearn/metrics/_classification.py:534: UserWarning: A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.
  warnings.warn(
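The warning appears because the 20% test split contains only two rows, and both happen to share one label, so the confusion matrix collapses to 1×1. A sketch of two possible fixes: pass all known labels explicitly, and/or stratify the split so both classes appear in the test set:
In [ ]:
# Fix 1: request the full 2x2 shape by listing all known labels
cm = confusion_matrix(y_test_c, y_pred_c, labels=[0, 1])
print(cm)

# Fix 2: a stratified split keeps both classes in the (tiny) test set
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42, stratify=y_class
)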
XOR¶
The XOR (Exclusive OR) function is a logical operation used in mathematics, computer science, and programming. It compares two inputs and returns true only when the inputs are different.
| Input A | Input B | A XOR B |
|---|---|---|
| 0 | 0 | 0 |
| 0 | 1 | 1 |
| 1 | 0 | 1 |
| 1 | 1 | 0 |
Simple explanation:
- If both inputs are the same → output is 0
- If the inputs are different → output is 1
In [ ]:
a = 1
b = 0
result = a ^ b # XOR operator
print(result) # Output: 1
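In machine learning, XOR is the classic example of a problem that is not linearly separable: no single straight line separates the 1s from the 0s, so a linear classifier cannot learn it, while a small neural network can. A minimal sketch with scikit-learn (the hidden-layer size and random_state here are illustrative choices):
In [ ]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# The XOR truth table as a tiny dataset
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 1, 1, 0])

# A linear model cannot separate XOR (accuracy typically stays near 0.5)
lin = LogisticRegression().fit(X, y)
print("Logistic regression accuracy:", lin.score(X, y))

# One hidden layer is enough to represent XOR (usually reaches 1.0 once trained)
mlp = MLPClassifier(hidden_layer_sizes=(8,), max_iter=5000, random_state=0).fit(X, y)
print("MLP accuracy:", mlp.score(X, y))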