Machine Learning Model¶
What is a machine learning model?¶
A machine learning model is a mathematical or computational representation of a pattern in data. It “learns” from examples and can then make predictions or decisions on new, unseen data.
Think of it like this:
You teach it → feed it examples of input–output pairs
It learns → Finds patterns and relationships in the data
It predicts → Given new inputs, it predicts the output
Example in Real Life¶
Suppose you want to predict a student’s exam score based on how many hours they studied:
Hours Studied    Score
      1            50
      2            55
      3            65
      4            70
A machine learning model will look at these examples, figure out the pattern (more hours → higher score), and then can predict the score if the student studies 5 hours.
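As a concrete (if tiny) sketch, scikit-learn's LinearRegression can be fit on the four example rows above and then asked about 5 hours of study; the predicted score is simply whatever the fitted line extrapolates:

import numpy as np
from sklearn.linear_model import LinearRegression

# Input (feature): hours studied; Output (target): exam score
hours = np.array([[1], [2], [3], [4]])   # shape (n_samples, n_features)
scores = np.array([50, 55, 65, 70])

# Learning algorithm: fit a straight line through the examples
model = LinearRegression()
model.fit(hours, scores)

# Prediction on new, unseen input: 5 hours of study
print(model.predict([[5]]))   # about 77.5 for this data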
Key Components¶
Data (Input): Features or independent variables (e.g., hours studied, rainfall, temperature)
Output (Target): What you want to predict (e.g., exam score, crop yield, disease presence)
Learning Algorithm: Method/model that finds patterns in the data (e.g., Linear Regression, Decision Tree, Neural Networks)
Predictions: The model’s output on new/unseen data
For beginners starting ML on tabular data, the most important libraries are:¶
NumPy, Pandas, Matplotlib/Seaborn, scikit-learn, JAX
import numpy as np
import matplotlib.pyplot as plt

# Common neural-network activation functions plotted over the range [-3, 3]
x = np.linspace(-3, 3, 100)
plt.plot(x, 1 / (1 + np.exp(-x)), label='sigmoid')
plt.plot(x, np.tanh(x), label='tanh')
plt.plot(x, np.where(x < 0, 0, x), label='ReLU')
plt.plot(x, np.where(x < 0, 0.1 * x, x), '--', label='leaky ReLU')
plt.legend()
plt.show()
Rainfall_mm,Temperature_C,Soil_pH,Fertilizer_kg,Pesticide_kg,Irrigation,Yield_ton_ha
820,22.5,6.4,120,3.2,1,4.5
650,24.1,5.8,90,2.1,0,3.1
900,21.3,6.8,140,3.8,1,5.2
720,23.0,6.1,100,2.5,1,4.0
580,25.4,5.6,80,1.9,0,2.8
870,22.1,6.7,130,3.5,1,5.0
760,23.7,6.2,110,2.9,1,4.2
610,24.9,5.7,85,2.0,0,3.0
940,20.8,7.0,150,4.1,1,5.6
700,23.5,6.0,105,2.6,1,4.1
680,24.0,5.9,95,2.3,0,3.3
910,21.5,6.9,145,3.9,1,5.4
790,22.8,6.3,115,3.0,1,4.4
560,26.1,5.5,75,1.8,0,2.6
880,21.9,6.6,135,3.6,1,5.1
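The block above is a small sample crop-yield dataset in CSV form, matching the features mentioned earlier (rainfall, temperature, yield). A minimal sketch of loading and modelling it with pandas and scikit-learn is shown below; the file name crop_yield.csv is an assumption, so adjust the path to wherever the CSV is saved.

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hypothetical file name: assumes the CSV block above was saved as "crop_yield.csv"
crops = pd.read_csv("crop_yield.csv")

# Features (inputs) vs. target (what we want to predict)
X = crops.drop(columns="Yield_ton_ha")
y = crops["Yield_ton_ha"]

# Hold out a few rows to check predictions on unseen data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
print(model.predict(X_test))   # predicted yield (t/ha) for the held-out rows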
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
data = {
'Name': ['Dorji', 'Tashi', 'Pema', 'Dawa', 'Nima', 'Karma', 'Dema', 'Dechen', 'Kelzang', 'Zam'],
'Math': [20, 35, 70, 40, 50, 67, 88, 46, 67, 46],
'Sci': [54, 60, 54, 34, 36, 67, 89, 90, 57, 67],
'Eng': [67, 76, 55, 45, 34, 25, 78, 47, 67, 76],
'Dzo': [93, 59, 76, 77, 59, 47, 29, 39, 71, 62],
'Total': [234, 230, 255, 196, 179, 206, 284, 222, 262, 251]
}
df = pd.DataFrame(data)
X = df[['Math', 'Sci', 'Eng', 'Dzo']]  # Features
y = df['Total']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)
coefficients = pd.DataFrame({
'Feature': X.columns,
'Coefficient': model.coef_
})
print(coefficients)
Mean Squared Error: 1.6155871338926322e-27
R-squared: 1.0
  Feature  Coefficient
0    Math          1.0
1     Sci          1.0
2     Eng          1.0
3     Dzo          1.0

The fit is exact (MSE ≈ 0, R² = 1.0) because Total is simply the sum of the four subject marks, so the model recovers a coefficient of 1.0 for each subject.
import jax.numpy as jnp
from jax import grad, jit
import numpy as np
import pandas as pd
# Your dataset
data = {
'Math': [20, 35, 70, 40, 50, 67, 88, 46, 67, 46],
'Sci': [54, 60, 54, 34, 36, 67, 89, 90, 57, 67],
'Eng': [67, 76, 55, 45, 34, 25, 78, 47, 67, 76],
'Dzo': [93, 59, 76, 77, 59, 47, 29, 39, 71, 62],
'Total': [234, 230, 255, 196, 179, 206, 284, 222, 262, 251]
}
df = pd.DataFrame(data)
# Convert to numpy arrays and add bias term (intercept)
X = df[['Math', 'Sci', 'Eng', 'Dzo']].values
y = df['Total'].values
# Add column of ones for bias (intercept)
X = np.hstack([np.ones((X.shape[0], 1)), X]) # Shape (10,5)
# Convert numpy arrays to jax arrays
X = jnp.array(X)
y = jnp.array(y)
# Initialize weights (parameters) to zeros; for random initialization use jax.random.PRNGKey
weights = jnp.zeros(X.shape[1])  # 5 weights including the bias term
# Define prediction function
def predict(params, inputs):
return jnp.dot(inputs, params)
# Define loss function (Mean Squared Error)
def loss(params, inputs, targets):
preds = predict(params, inputs)
return jnp.mean((preds - targets) ** 2)
# Gradient of loss function w.r.t params
grad_loss = jit(grad(loss))
# Training loop parameters
learning_rate = 0.0001
epochs = 5000
params = weights
for epoch in range(epochs):
grads = grad_loss(params, X, y)
params = params - learning_rate * grads
if epoch % 500 == 0:
current_loss = loss(params, X, y)
print(f"Epoch {epoch}, Loss: {current_loss:.4f}")
print("Final parameters:", params)
# Make predictions
predictions = predict(params, X)
print("Predictions:", predictions)
print("Actual:", y)
Epoch 0, Loss: 166311.7656
Epoch 500, Loss: nan
Epoch 1000, Loss: nan
Epoch 1500, Loss: nan
Epoch 2000, Loss: nan
Epoch 2500, Loss: nan
Epoch 3000, Loss: nan
Epoch 3500, Loss: nan
Epoch 4000, Loss: nan
Epoch 4500, Loss: nan
Final parameters: [nan nan nan nan nan]
Predictions: [nan nan nan nan nan nan nan nan nan nan]
Actual: [234 230 255 196 179 206 284 222 262 251]
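The loss turns into nan because the raw inputs (marks up to ~90 sitting next to a bias column of 1s) are on very different scales, so plain gradient descent with this learning rate overshoots and diverges. A minimal sketch of one common fix, standardising the features before training, is below; the learning rate of 0.01 and 5000 epochs are illustrative choices, not tuned values.

import jax.numpy as jnp
from jax import grad, jit
import numpy as np

# Same marks data as the cell above, as plain arrays (columns: Math, Sci, Eng, Dzo)
X_raw = np.array([[20, 54, 67, 93], [35, 60, 76, 59], [70, 54, 55, 76],
                  [40, 34, 45, 77], [50, 36, 34, 59], [67, 67, 25, 47],
                  [88, 89, 78, 29], [46, 90, 47, 39], [67, 57, 67, 71],
                  [46, 67, 76, 62]], dtype=float)
y = jnp.array([234, 230, 255, 196, 179, 206, 284, 222, 262, 251], dtype=float)

# Standardise each feature (zero mean, unit variance), then add the bias column
X_std = (X_raw - X_raw.mean(axis=0)) / X_raw.std(axis=0)
X = jnp.array(np.hstack([np.ones((X_std.shape[0], 1)), X_std]))

def loss(params, inputs, targets):
    return jnp.mean((jnp.dot(inputs, params) - targets) ** 2)

grad_loss = jit(grad(loss))

params = jnp.zeros(X.shape[1])
for epoch in range(5000):
    # A larger learning rate is now stable because all features share one scale
    params = params - 0.01 * grad_loss(params, X, y)

print("Final loss:", loss(params, X, y))

The next cell fits the same regression with scikit-learn's LinearRegression, which solves the least-squares problem directly rather than by gradient descent and is therefore not sensitive to feature scale in the same way.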
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Dataset
data = {
'Math': [20, 35, 70, 40, 50, 67, 88, 46, 67, 46],
'Sci': [54, 60, 54, 34, 36, 67, 89, 90, 57, 67],
'Eng': [67, 76, 55, 45, 34, 25, 78, 47, 67, 76],
'Dzo': [93, 59, 76, 77, 59, 47, 29, 39, 71, 62],
'Total': [234, 230, 255, 196, 179, 206, 284, 222, 262, 251]
}
# Convert to DataFrame
df = pd.DataFrame(data)
# Features and target
X = df[['Math', 'Sci', 'Eng', 'Dzo']]
y = df['Total']
# Initialize and fit the model
model = LinearRegression()
model.fit(X, y)
# Predictions
y_pred = model.predict(X)
# Print coefficients and intercept
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)
# Evaluate model
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")
# Show actual vs predicted
results = pd.DataFrame({'Actual': y, 'Predicted': y_pred})
print(results)
Coefficients: [1. 1. 1. 1.]
Intercept: -1.4210854715202004e-13
Mean Squared Error: 0.00
R^2 Score: 1.00
   Actual  Predicted
0     234      234.0
1     230      230.0
2     255      255.0
3     196      196.0
4     179      179.0
5     206      206.0
6     284      284.0
7     222      222.0
8     262      262.0
9     251      251.0
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
# -------------------------------
# 1️⃣ Load data
data = {
'Name': ['Dorji','Tashi','Pema','Dawa','Nima','Karma','Dema','Dechen','Kelzang','Zam'],
'Math': [20, 35, 70, 40, 50, 67, 88, 46, 67, 46],
'Sci': [54, 60, 54, 34, 36, 67, 89, 90, 57, 67],
'Eng': [67, 76, 55, 45, 34, 25, 78, 47, 67, 76],
'Dzo': [93, 59, 76, 77, 59, 47, 29, 39, 71, 62],
'Total': [234, 230, 255, 196, 179, 206, 284, 222, 262, 251]
}
df = pd.DataFrame(data)
# -------------------------------
# 2️⃣ Categorize Total scores
def categorize(total):
if total < 210:
return 'Low'
elif total < 250:
return 'Medium'
else:
return 'High'
df['Category'] = df['Total'].apply(categorize)
# -------------------------------
# 3️⃣ Features and target
X = df[['Math','Sci','Eng','Dzo']]
y = df['Category']
# -------------------------------
# 4️⃣ Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# -------------------------------
# 5️⃣ Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
# -------------------------------
# 6️⃣ Train Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
# -------------------------------
# 7️⃣ Predict and evaluate
y_pred = clf.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# -------------------------------
# 8️⃣ Optional: Feature Importance
import matplotlib.pyplot as plt
feature_importance = clf.feature_importances_
features = X.columns
plt.figure(figsize=(8,5))
plt.bar(features, feature_importance, color='skyblue')
plt.title("Feature Importance in Random Forest")
plt.ylabel("Importance")
plt.show()
Confusion Matrix:
[[1 0 0]
[1 0 0]
[1 0 0]]
Classification Report:
precision recall f1-score support
High 0.33 1.00 0.50 1
Low 0.00 0.00 0.00 1
Medium 0.00 0.00 0.00 1
accuracy 0.33 3
macro avg 0.11 0.33 0.17 3
weighted avg 0.11 0.33 0.17 3
/opt/conda/lib/python3.13/site-packages/sklearn/metrics/_classification.py:1731: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
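With only ten students the test set holds just three rows, and without stratification the training data can under-represent some categories, so a classifier that predicts a single class and scores 33% accuracy is an expected artefact of the tiny split rather than a bug. A hedged sketch of two small adjustments, stratifying the split and setting zero_division when reporting, is shown below; with this little data the scores will still vary a lot between splits.

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Reuses df, X_scaled and y from the cells above.
# stratify=y keeps the Low/Medium/High proportions similar in train and test,
# and zero_division=0 silences the ill-defined-precision warnings.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred, zero_division=0))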
What is Probability in Data Science?¶
Probability in data science is a way to measure uncertainty and quantify how likely an event is to happen using data and mathematics.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Data
total_marks = [234, 230, 255, 196, 179, 206, 284, 222, 262, 251]
# Convert to DataFrame
df = pd.DataFrame({"Total": total_marks})
# Sort total marks
sorted_total = np.sort(df["Total"])
# Calculate cumulative probability
cum_prob = np.arange(1, len(sorted_total) + 1) / len(sorted_total)
# Plot CDF
plt.figure(figsize=(7,5))
plt.plot(sorted_total, cum_prob, marker='o', linestyle='-')
plt.xlabel("Total Marks")
plt.ylabel("Cumulative Probability")
plt.title("Cumulative Probability Distribution (CDF) of Total Marks")
plt.grid(True)
plt.show()
Probability Tools¶
1️⃣ Probability Mass Function (PMF)
Used for discrete data
Gives probability of exact values
$P(X = x)$
2️⃣ Probability Density Function (PDF)
Used for continuous data
Area under the curve = probability
Example: Distribution of total marks across students.
3️⃣ Histogram
Graphical approximation of probability distribution
Shows frequency of values
Used before PDF & CDF
4️⃣ Joint Probability
Probability of two events together
$P(A \cap B)$
Example: Probability of scoring high in Math AND Science (see the sketch after this list)
5️⃣ Conditional Probability
Probability of one event given another
$P(A \mid B)$
Example: Probability of high total marks given high Math marks
6️⃣ Bayes’ Theorem
Updates probability with new evidence
$P(A \mid B) = \dfrac{P(B \mid A)\,P(A)}{P(B)}$
Used in: Naive Bayes, medical diagnosis, spam detection
7️⃣ Expectation (Expected Value)
Average outcome over many trials
$E(X) = \sum_x x\,P(x)$
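The joint and conditional probabilities from items 4 and 5 can be read straight off the marks table with boolean columns, as in the sketch below (the ≥ 60 thresholds for "high" Math and Science marks are illustrative choices, not taken from the text). The cell after it applies the same idea to Bayes' theorem on the Total and Math columns.

import pandas as pd

marks = pd.DataFrame({
    "Math": [20, 35, 70, 40, 50, 67, 88, 46, 67, 46],
    "Sci":  [54, 60, 54, 34, 36, 67, 89, 90, 57, 67],
})

# Events (the >= 60 thresholds are illustrative)
high_math = marks["Math"] >= 60
high_sci = marks["Sci"] >= 60

# Joint probability P(high Math AND high Sci): fraction of students with both
p_joint = (high_math & high_sci).mean()

# Conditional probability P(high Sci | high Math): restricted to high-Math students
p_cond = high_sci[high_math].mean()

print(f"P(A ∩ B) = {p_joint:.2f}")
print(f"P(B | A) = {p_cond:.2f}")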
import pandas as pd
# -----------------------
# 1. Dataset
# -----------------------
data = {
"Name": ["Dorji","Tashi","Pema","Dawa","Nima","Karma","Dema","Dechen","Kelzang","Zam"],
"Math": [20,35,70,40,50,67,88,46,67,46],
"Sci": [54,60,54,34,36,67,89,90,57,67],
"Eng": [67,76,55,45,34,25,78,47,67,76],
"Dzo": [93,59,76,77,59,47,29,39,71,62],
"Total": [234,230,255,196,179,206,284,222,262,251]
}
df = pd.DataFrame(data)
# -----------------------
# 2. Define Events
# -----------------------
# Event A: Total >= 250
df['A'] = df['Total'] >= 250
# Event B: Math >= 60
df['B'] = df['Math'] >= 60
# -----------------------
# 3. Compute Probabilities
# -----------------------
P_A = df['A'].mean() # P(A)
P_B = df['B'].mean() # P(B)
P_B_given_A = df[df['A']]['B'].mean() # P(B|A)
# Bayes theorem
P_A_given_B = (P_B_given_A * P_A) / P_B
print(f"P(A) = {P_A:.2f}")
print(f"P(B) = {P_B:.2f}")
print(f"P(B|A) = {P_B_given_A:.2f}")
print(f"P(A|B) = {P_A_given_B:.2f}")
P(A) = 0.40
P(B) = 0.40
P(B|A) = 0.75
P(A|B) = 0.75
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
# -----------------------
# 1. Dataset
# -----------------------
data = {
"Math": [20,35,70,40,50,67,88,46,67,46],
"Sci": [54,60,54,34,36,67,89,90,57,67],
"Eng": [67,76,55,45,34,25,78,47,67,76],
"Dzo": [93,59,76,77,59,47,29,39,71,62],
"Total": [234,230,255,196,179,206,284,222,262,251]
}
df = pd.DataFrame(data)
# -----------------------
# 2. Histogram for Total Marks
# -----------------------
plt.figure(figsize=(7,5))
plt.hist(df['Total'], bins=5, color='skyblue', edgecolor='black', density=True)
plt.title("Histogram of Total Marks (Probability Density)")
plt.xlabel("Total Marks")
plt.ylabel("Probability Density")
plt.grid(axis='y')
plt.show()
# -----------------------
# 3. PMF for Total Marks
# -----------------------
counts = Counter(df['Total'])
total_students = len(df)
pmf = {k: v/total_students for k,v in counts.items()}
# Plot PMF
plt.figure(figsize=(7,5))
plt.bar(pmf.keys(), pmf.values(), color='lightgreen', edgecolor='black')
plt.title("PMF of Total Marks")
plt.xlabel("Total Marks")
plt.ylabel("Probability")
plt.grid(axis='y')
plt.show()
# Optional: print PMF values
print("PMF values for Total Marks:")
for mark, prob in pmf.items():
print(f"{mark}: {prob:.2f}")
PMF values for Total Marks:
234: 0.10
230: 0.10
255: 0.10
196: 0.10
179: 0.10
206: 0.10
284: 0.10
222: 0.10
262: 0.10
251: 0.10
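Item 7 above, the expected value, follows directly from this PMF: because each of the ten totals occurs exactly once, $E(X) = \sum_x x\,P(x)$ collapses to the ordinary mean of the Total column. A short check:

import numpy as np

totals = np.array([234, 230, 255, 196, 179, 206, 284, 222, 262, 251])

# E(X) = sum over distinct values x of x * P(x); here every value has P(x) = 1/10
values, counts = np.unique(totals, return_counts=True)
pmf = counts / counts.sum()
expected = np.sum(values * pmf)

print("E(Total) =", expected)      # same as the plain mean
print("mean     =", totals.mean())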