[Wangd Lhamo] - Fab Futures - Data Science
Home About

Machine Learning¶

✅ What is Machine Learning?

Machine learning (ML) is a field of artificial intelligence where computers learn patterns from data and use those patterns to make decisions or predictions without being explicitly programmed. Work easily with ML libraries like:

  • scikit-learn
  • TensorFlow
  • PyTorch
  • NumPy / Pandas
  • Matplotlib / Seaborn

Simple ML example in Jupyter¶

In [3]:
import sklearn.linear_model import LinearRegression
import numpy as np
  Cell In[3], line 1
    import sklearn.linear_model import LinearRegression
                                ^
SyntaxError: invalid syntax
In [2]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Training data
x = np.array([[1], [2], [3], [4]])
y = np.array([2, 4, 6, 8])

# Create model
model = LinearRegression()
model.fit(x, y)

# Predict
print(model.predict([[5]]))  # Output: 10
[10.]
In [1]:
import jax
import jax.numpy as jnp
from jax import random
import pandas as pd
import altair as alt
In [ ]:
 

Imported dataset¶

In [2]:
# Data
data = {
    "Section": ["A", "B", "C"],
    "Total Students": [28, 28, 28],
    "Total Stds Passed": [22, 24, 13],
    "Total Stds Fail": [6, 5, 15],
    "Total Pass %": [78.57, 85.71, 46.43],
    "Total Fail %": [21.43, 17.86, 53.57],
    "Mean Mark": [75.7, 71.7, 57.9],
    "National Mean Mark": [67.09, 75, 67.09]
}

df = pd.DataFrame(data)
df
Out[2]:
Section Total Students Total Stds Passed Total Stds Fail Total Pass % Total Fail % Mean Mark National Mean Mark
0 A 28 22 6 78.57 21.43 75.7 67.09
1 B 28 24 5 85.71 17.86 71.7 75.00
2 C 28 13 15 46.43 53.57 57.9 67.09
In [3]:
key = random.PRNGKey(0)

# Function to simulate student pass/fail
def simulate_pass_fail(key, total_students, total_pass):
    # 1 = Pass, 0 = Fail
    outcomes = jnp.array([1]*total_pass + [0]*(total_students - total_pass))
    shuffled = random.permutation(key, outcomes)
    return shuffled

# Compute pass probability for each section
pass_prob = []
for i, row in df.iterrows():
    key, subkey = random.split(key)
    sim = simulate_pass_fail(subkey, row["Total Students"], row["Total Stds Passed"])
    prob = jnp.mean(sim)  # average = probability of pass
    pass_prob.append(prob)

df["Simulated Pass Prob"] = jnp.array(pass_prob)
df
Out[3]:
Section Total Students Total Stds Passed Total Stds Fail Total Pass % Total Fail % Mean Mark National Mean Mark Simulated Pass Prob
0 A 28 22 6 78.57 21.43 75.7 67.09 0.785714
1 B 28 24 5 85.71 17.86 71.7 75.00 0.857143
2 C 28 13 15 46.43 53.57 57.9 67.09 0.464286
In [4]:
# Prepare DataFrame for Altair
df_viz = df.melt(
    id_vars=["Section"],
    value_vars=["Total Pass %", "Total Fail %", "Simulated Pass Prob"],
    var_name="Category",
    value_name="Percentage"
)

# Convert simulated pass probability to % for visualization
df_viz["Percentage"] = df_viz.apply(
    lambda row: row["Percentage"]*100 if row["Category"]=="Simulated Pass Prob" else row["Percentage"], axis=1
)

# Plot
chart = alt.Chart(df_viz).mark_bar().encode(
    x=alt.X('Section:N', title='Section'),
    y=alt.Y('Percentage:Q', title='Percentage'),
    color='Category:N',
    tooltip=['Section', 'Category', 'Percentage']
).properties(
    title="Pass % and Fail % by Section (with JAX Simulation)",
    width=500,
    height=300
)

chart
Out[4]:

Regression¶

In [5]:
import pandas as pd

data = {
    "Section": ["A", "B", "C"],
    "Total Students": [28, 28, 28],
    "Total Stds Passed": [22, 24, 13],
    "Total Stds Fail": [6, 5, 15],
    "Total Pass %": [78.57, 85.71, 46.43],
    "Total Fail %": [21.43, 17.86, 53.57],
    "Mean Mark": [75.7, 71.7, 57.9],
    "National Mean Mark": [67.09, 75, 67.09]
}

df = pd.DataFrame(data)
df
Out[5]:
Section Total Students Total Stds Passed Total Stds Fail Total Pass % Total Fail % Mean Mark National Mean Mark
0 A 28 22 6 78.57 21.43 75.7 67.09
1 B 28 24 5 85.71 17.86 71.7 75.00
2 C 28 13 15 46.43 53.57 57.9 67.09

JAX¶

In [6]:
import jax.numpy as jnp
from jax import grad, jit

# Convert data to JAX arrays
x = jnp.array(df["Total Pass %"])
y = jnp.array(df["Mean Mark"])

# Initialize parameters
m = jnp.array(0.0)
c = jnp.array(0.0)
learning_rate = 0.001
epochs = 1000

# Define loss function (Mean Squared Error)
def loss(params, x, y):
    m, c = params
    y_pred = m * x + c
    return jnp.mean((y - y_pred)**2)

# Gradient descent
params = jnp.array([m, c])
grad_loss = grad(loss)

for _ in range(epochs):
    grads = grad_loss(params, x, y)
    params = params - learning_rate * grads

m, c = params
print(f"Fitted line: y = {m:.2f}x + {c:.2f}")
Fitted line: y = nanx + nan
In [7]:
import matplotlib.pyplot as plt

y_pred = m * x + c

plt.figure(figsize=(8,5))
plt.scatter(x, y, color='blue', label='Actual Mean Marks')
plt.plot(x, y_pred, color='red', linestyle='--', label='Regression Line')
plt.xlabel('Total Pass %')
plt.ylabel('Mean Mark')
plt.title('Regression: Mean Mark vs Total Pass %')
plt.legend()
plt.show()
No description has been provided for this image
In [8]:
import altair as alt

df_viz = df.copy()
df_viz["Predicted Mean Mark"] = m * df_viz["Total Pass %"] + c

chart = alt.Chart(df_viz).mark_circle(size=100).encode(
    x='Total Pass %',
    y='Mean Mark',
    tooltip=['Section', 'Mean Mark', 'Total Pass %']
)

line = alt.Chart(df_viz).mark_line(color='red').encode(
    x='Total Pass %',
    y='Predicted Mean Mark'
)

chart + line
Out[8]:

Multiple Linear Regression using JAX¶

In [11]:
import pandas as pd
import jax.numpy as jnp
from jax import grad

# Dataset
data = {
    "Section": ["A", "B", "C"],
    "Total Students": [28, 28, 28],
    "Total Stds Passed": [22, 24, 13],
    "Total Stds Fail": [6, 5, 15],
    "Total Pass %": [78.57, 85.71, 46.43],
    "Total Fail %": [21.43, 17.86, 53.57],
    "Mean Mark": [75.7, 71.7, 57.9],
    "National Mean Mark": [67.09, 75, 67.09]
}

df = pd.DataFrame(data)
df
Out[11]:
Section Total Students Total Stds Passed Total Stds Fail Total Pass % Total Fail % Mean Mark National Mean Mark
0 A 28 22 6 78.57 21.43 75.7 67.09
1 B 28 24 5 85.71 17.86 71.7 75.00
2 C 28 13 15 46.43 53.57 57.9 67.09

y=w1​x1​+w2​x2​+w3​x3​+b¶

In [13]:
# Features and target
X = jnp.array(df[["Total Pass %", "Total Fail %", "National Mean Mark"]])
y = jnp.array(df["Mean Mark"])

# Initialize weights and bias
w = jnp.zeros(3)
b = 0.0
learning_rate = 0.0001
epochs = 10000

# Define Mean Squared Error loss
def loss(params, X, y):
    w, b = params[:3], params[3]
    y_pred = jnp.dot(X, w) + b
    return jnp.mean((y - y_pred)**2)

# Gradient function
grad_loss = grad(loss)

# Combine weights and bias for optimization
params = jnp.append(w, b)

# Gradient Descent
for _ in range(epochs):
    grads = grad_loss(params, X, y)
    params = params - learning_rate * grads

w_opt = params[:3]
b_opt = params[3]
print(f"Optimized weights: {w_opt}")
print(f"Optimized bias: {b_opt}")
Optimized weights: [nan nan nan]
Optimized bias: nan
In [14]:
y_pred = jnp.dot(X, w_opt) + b_opt
df["Predicted Mean Mark"] = y_pred
df
Out[14]:
Section Total Students Total Stds Passed Total Stds Fail Total Pass % Total Fail % Mean Mark National Mean Mark Predicted Mean Mark
0 A 28 22 6 78.57 21.43 75.7 67.09 NaN
1 B 28 24 5 85.71 17.86 71.7 75.00 NaN
2 C 28 13 15 46.43 53.57 57.9 67.09 NaN
In [16]:
import altair as alt

chart = alt.Chart(df).mark_circle(size=100).encode(
    x='Section',
    y='Mean Mark',
    color=alt.value('blue'),
    tooltip=['Section', 'Mean Mark', 'Predicted Mean Mark']
)

predicted_line = alt.Chart(df).mark_line(color='red').encode(
    x='Section',
    y='Predicted Mean Mark'
)

chart + predicted_line
Out[16]:

Activities¶

In [1]:
# ===============================
# Beginner-Friendly ML in Jupyter
# ===============================

# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix

# -------------------------------
# 2. Load Dataset
# -------------------------------
# Example dataset for regression: House Prices
# You can replace this with your own CSV file
data = pd.DataFrame({
    'size': [750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200],
    'bedrooms': [2, 2, 3, 3, 3, 3, 4, 4, 4, 4],
    'price': [150000, 160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000],
    'sold': [1, 1, 0, 0, 1, 1, 0, 0, 1, 1]  # Example for classification
})

# Preview data
print("Dataset:")
display(data.head())

# -------------------------------
# 3. Data Visualization
# -------------------------------
sns.pairplot(data, x_vars=['size', 'bedrooms'], y_vars='price', height=4, kind='scatter')
plt.show()

sns.countplot(x='sold', data=data)
plt.title("Sold Status Count")
plt.show()

# -------------------------------
# 4. Regression: Predict House Price
# -------------------------------
X = data[['size', 'bedrooms']]  # Features
y = data['price']               # Target

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)

# Predict
y_pred = reg_model.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
print(f"Regression Mean Squared Error: {mse}")

# Plot
plt.scatter(X_test['size'], y_test, color='blue', label='Actual')
plt.scatter(X_test['size'], y_pred, color='red', label='Predicted')
plt.xlabel('Size')
plt.ylabel('Price')
plt.legend()
plt.show()

# -------------------------------
# 5. Classification: Predict Sold Status
# -------------------------------
X_class = data[['size', 'bedrooms']]
y_class = data['sold']

# Split
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_class, y_class, test_size=0.2, random_state=42)

# Train
clf_model = LogisticRegression()
clf_model.fit(X_train_c, y_train_c)

# Predict
y_pred_c = clf_model.predict(X_test_c)

# Evaluate
acc = accuracy_score(y_test_c, y_pred_c)
cm = confusion_matrix(y_test_c, y_pred_c)

print(f"Classification Accuracy: {acc}")
print("Confusion Matrix:")
print(cm)
Dataset:
size bedrooms price sold
0 750 2 150000 1
1 800 2 160000 1
2 850 3 170000 0
3 900 3 180000 0
4 950 3 190000 1
No description has been provided for this image
No description has been provided for this image
Regression Mean Squared Error: 8.470329472543003e-22
No description has been provided for this image
Classification Accuracy: 1.0
Confusion Matrix:
[[2]]
/opt/conda/lib/python3.13/site-packages/sklearn/metrics/_classification.py:534: UserWarning: A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.
  warnings.warn(
In [ ]:
 

XOR¶

The XOR (Exclusive OR) function is a logical operation used in mathematics, computer science, and programming. It compares two inputs and returns true only when the inputs are different.

Input A Input B A XOR B
0 0 0
0 1 1
1 0 1
1 1 0

Simple Explanation If both inputs are the same → Output is 0 If the inputs are different → Output is 1

In [ ]:
a = 1
b = 0
result = a ^ b   # XOR operator
print(result)  # Output: 1