Presentation¶
While working on my assignments, I used the Python code provided in the class folders as a foundational reference to understand the methods and procedures. To deepen my comprehension of the logic behind the code and to assist in writing parts of my own scripts, I consulted AI-based tools, particularly ChatGPT (OpenAI, 2023), which provided explanations, step-by-step clarifications, and guidance on implementing functions effectively. Additionally, I referred to the Grokai and Gemini platforms for minor clarifications and supplementary examples. All external assistance was used strictly to enhance my understanding and support my learning process; I ensured that the final work submitted is my own, and these sources have been properly acknowledged to maintain academic integrity.
Presentation Slide¶
from IPython.display import HTML
HTML('<img src="images/presentation.png" width="900">')
Week 01¶
Introduction to Data Science¶
The course began with an introduction to data science, where I learned how data is collected, processed, analyzed, and interpreted to support decision-making. I was introduced to key concepts such as datasets, variables, data types, and basic data analysis workflows. This topic helped me understand the importance of data in today’s digital world and how data science is applied across various fields such as education, health, engineering, and social sciences. It also laid a strong foundation for using programming tools to extract meaningful insights from raw data.
Fab Labs¶
I was introduced to the concept of Fab Labs (Fabrication Laboratories) and their role in innovation, digital fabrication, and hands-on learning. Fab Labs provide access to modern tools such as 3D printers, laser cutters, CNC machines, and electronics workstations, enabling learners to turn ideas into physical prototypes. This topic helped me understand how Fab Labs promote creativity, problem-solving, and collaboration while supporting local and global innovation networks. It also highlighted how Fab Labs connect technology, design, and data-driven thinking.
21st-Century Vocational Skills¶
The course emphasized the importance of 21st-century vocational skills, including digital literacy, critical thinking, collaboration, creativity, and adaptability. I learned how technical skills such as coding and data analysis must be combined with soft skills like communication and teamwork to succeed in modern workplaces. This topic helped me recognize the value of lifelong learning and the need to continuously update skills in response to technological advancements.
Selecting Datasets from Open Sources¶
I learned how to identify and select appropriate datasets from open-source platforms for data analysis projects. This included understanding data relevance, quality, structure, and ethical considerations such as data privacy and proper attribution. Exploring open-source datasets helped me gain practical experience in working with real-world data and improved my ability to choose datasets that align with specific research questions or project goals.
Introduction to the JupyterLab Interface¶
The course also introduced me to the JupyterLab interface, which is a powerful environment for data science and programming. I learned how to create and manage notebooks, write and execute Python code, add markdown text for documentation, and visualize data using plots and charts. This topic helped me understand how JupyterLab supports interactive learning, experimentation, and reproducible research, making it an essential tool for data analysis and collaborative work.
Dataset for this course¶
For this course, I decided to use the "School Performance Analysis" dataset from the U.S. Department of Education's College Scorecard (https://collegescorecard.ed.gov/data). The data examines how gender, parental education, and test preparation affect exam results.
Tools¶
In this lesson on tools, I learned how Pandas, NumPy, and Matplotlib work together as core Python libraries for data science.
NumPy¶
NumPy was introduced as the foundation for numerical computing, enabling efficient handling of arrays, mathematical operations, and large datasets. Through NumPy, I learned how to perform calculations, manipulate numerical data, and work with multidimensional arrays, which are essential for data processing and scientific computing.
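As a minimal sketch of these array operations (the numbers below are invented for illustration and are not taken from my dataset), a few basic NumPy calls look like this:
import numpy as np
# a small 2-D array of made-up exam scores: 3 students x 2 subjects
scores = np.array([[72, 74],
                   [69, 88],
                   [90, 93]])
print(scores.shape)         # (3, 2): rows and columns of the array
print(scores.mean(axis=0))  # column means: average score per subject
print(scores.max(axis=1))   # row maxima: best score per student
print(scores * 1.05)        # element-wise operation applied to every entry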
Pandas¶
Pandas was presented as a powerful library for data manipulation and analysis. I learned how to load datasets from CSV and other file formats, explore data using dataframes, clean and preprocess data, handle missing values, and perform operations such as filtering, grouping, and aggregation. Pandas helped me understand how to organize and analyze structured data in a clear and efficient manner, making it easier to extract insights and prepare data for further analysis.
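For example, a small grouping and aggregation step on the StudentsPerformance.csv file used later in this notebook could look like the sketch below (the column names and file path match the ones used in my assignment cells):
import pandas as pd
df = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
# average of the three numeric scores for each gender group
print(df.groupby("gender")[["math score", "reading score", "writing score"]].mean())
# keep only the rows where the test preparation course was completed
completed = df[df["test preparation course"] == "completed"]
print(f"{len(completed)} students completed the preparation course")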
Matplotlib¶
Matplotlib was introduced as a visualization tool for presenting data in a meaningful way. I learned how to create various types of plots such as line graphs, bar charts, histograms, and scatter plots to visualize trends and patterns in data. Using Matplotlib helped me understand how visual representation of data supports better interpretation and communication of results. Together, Pandas, NumPy, and Matplotlib formed a complete workflow for data analysis, from data loading and processing to visualization and interpretation.
Assignment (Data Visualization using tools)¶
In this work, I used Pandas, NumPy, and Matplotlib to complete the full data analysis process, starting from data cleaning to data visualization. Pandas helped me load and explore the dataset, clean missing or inconsistent values, filter and organize the data, and prepare it for analysis. NumPy supported efficient numerical operations and transformations, enabling me to handle calculations and improve performance when working with numerical data. Finally, I used Matplotlib to visualize the cleaned data through charts and graphs such as bar plots, line graphs, and histograms, which helped me identify patterns, trends, and insights and communicate the results clearly and effectively.
Example¶
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
data = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
data
| | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score |
|---|---|---|---|---|---|---|---|---|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
| 2 | female | group B | master's degree | standard | none | 90 | 95 | 93 |
| 3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 |
| 4 | male | group C | some college | standard | none | 76 | 78 | 75 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | female | group E | master's degree | standard | completed | 88 | 99 | 95 |
| 996 | male | group C | high school | free/reduced | none | 62 | 55 | 55 |
| 997 | female | group C | high school | free/reduced | completed | 59 | 71 | 65 |
| 998 | female | group D | some college | standard | completed | 68 | 78 | 77 |
| 999 | female | group D | some college | free/reduced | none | 77 | 86 | 86 |
1000 rows × 8 columns
Show first and last five rows¶
data.head()
| | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score |
|---|---|---|---|---|---|---|---|---|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
| 2 | female | group B | master's degree | standard | none | 90 | 95 | 93 |
| 3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 |
| 4 | male | group C | some college | standard | none | 76 | 78 | 75 |
data.tail()
| | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score |
|---|---|---|---|---|---|---|---|---|
| 995 | female | group E | master's degree | standard | completed | 88 | 99 | 95 |
| 996 | male | group C | high school | free/reduced | none | 62 | 55 | 55 |
| 997 | female | group C | high school | free/reduced | completed | 59 | 71 | 65 |
| 998 | female | group D | some college | standard | completed | 68 | 78 | 77 |
| 999 | female | group D | some college | free/reduced | none | 77 | 86 | 86 |
Descriptive statistics¶
data.describe()
| | math score | reading score | writing score |
|---|---|---|---|
| count | 1000.00000 | 1000.000000 | 1000.000000 |
| mean | 66.08900 | 69.169000 | 68.054000 |
| std | 15.16308 | 14.600192 | 15.195657 |
| min | 0.00000 | 17.000000 | 10.000000 |
| 25% | 57.00000 | 59.000000 | 57.750000 |
| 50% | 66.00000 | 70.000000 | 69.000000 |
| 75% | 77.00000 | 79.000000 | 79.000000 |
| max | 100.00000 | 100.000000 | 100.000000 |
To show column names¶
data.columns
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
'test preparation course', 'math score', 'reading score',
'writing score'],
dtype='object')
Data Information¶
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   gender                        1000 non-null   object
 1   race/ethnicity                1000 non-null   object
 2   parental level of education   1000 non-null   object
 3   lunch                         1000 non-null   object
 4   test preparation course       1000 non-null   object
 5   math score                    1000 non-null   int64
 6   reading score                 1000 non-null   int64
 7   writing score                 1000 non-null   int64
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
Find Missing Values¶
data.isnull()
| | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score |
|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | False | False | False | False | False | False | False | False |
| 996 | False | False | False | False | False | False | False | False |
| 997 | False | False | False | False | False | False | False | False |
| 998 | False | False | False | False | False | False | False | False |
| 999 | False | False | False | False | False | False | False | False |
1000 rows × 8 columns
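Since the full isnull() table is hard to read, a more compact check (a small sketch using the same data variable as above) is to count the missing values per column:
# count missing values in each column; every count is 0 for this dataset
print(data.isnull().sum())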
Data Visualization¶
import matplotlib.pyplot as plt
genders = data['gender'].unique()
for g in genders:
subset = data[data['gender'] == g]
plt.scatter(subset['reading score'], subset['writing score'], label=g)
plt.xlabel('Reading Score')
plt.ylabel('Writing Score')
plt.title('Male and Female (Writing Score vs Reading Score)')
plt.legend()
plt.show()
Week 02¶
Function Fitting¶
In this lesson, I learned how mathematical functions can be fitted to data in order to model relationships between variables and make predictions. I began by understanding the role of variables, distinguishing between independent and dependent variables, and seeing how linear and quadratic models can represent real-world trends. I then explored function fitting with polynomial models, particularly linear and quadratic functions, using the following routines.
polyfit routine¶
The polyfit routine finds the best-fitting polynomial coefficients for a given dataset. The lesson also introduced the algorithm underlying it, singular value decomposition.
Singular Value Decomposition (SVD)¶
Singular value decomposition helps solve least-squares problems in a stable and efficient way, especially when the data is noisy or the variables are nearly dependent.
lstsq routine¶
The lstsq routine computes least-squares solutions for over-determined systems, and the least_squares routine offers more flexible, advanced optimization of model parameters. Finally, I was introduced to the concept of regularization, which helps prevent overfitting by adding constraints to the model, improving its ability to generalize to new data. Together, these topics strengthened my understanding of how mathematical modeling and numerical methods are applied to real data analysis problems.
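The assignment below uses the polyfit routine; as a minimal sketch of the lstsq routine described above (using the same writing score and math score columns as the assignment), the same straight-line fit can be written as an over-determined system solved by least squares:
import numpy as np
import pandas as pd
df = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
x = df["writing score"].values
y = df["math score"].values
# design matrix for a straight line y ≈ m*x + c: one column of x values, one column of ones
A = np.vstack([x, np.ones(len(x))]).T
# least-squares solution of the over-determined system A @ [m, c] ≈ y
(m, c), residuals, rank, sing_vals = np.linalg.lstsq(A, y, rcond=None)
print(f"slope: {m:.3f}, intercept: {c:.3f}")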
Assignment¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
x_column = 'writing score'
y_column = 'math score'
x = df[x_column].dropna().values
y = df[y_column].dropna().values
np.set_printoptions(precision=3)
coeff1 = np.polyfit(x, y, 1)
pfit1 = np.poly1d(coeff1)
coeff2 = np.polyfit(x, y, 2)
pfit2 = np.poly1d(coeff2)
xfit = np.linspace(np.min(x), np.max(x), 100)
yfit1 = pfit1(xfit)
yfit2 = pfit2(xfit)
print(f"first-order fit coefficients: {coeff1}")
print(f"second-order fit coefficients: {coeff2}")
plt.figure(figsize=(8,6))
plt.plot(x, y, 'o', alpha=0.6, label='data')
plt.plot(xfit, yfit1, 'g-', label='linear fit')
plt.plot(xfit, yfit2, 'r-', label='quadratic fit')
plt.xlabel(x_column)
plt.ylabel(y_column)
plt.title(f"Polynomial Fit: {y_column} vs {x_column}")
plt.legend()
plt.show()
first-order fit coefficients: [ 0.801 11.583]
second-order fit coefficients: [-1.238e-03 9.640e-01 6.503e+00]
Machine Learning¶
Different Functions¶
1. Sigmoid
2. Tanh
3. ReLU
4. Leaky ReLU
These functions help neural networks learn complex patterns by introducing non-linearity.
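As a minimal sketch (these are the standard textbook definitions, not code taken from the class folders), the four activations can be written directly with NumPy; the Functions cell further below plots them against values from my dataset:
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))           # squashes any input into (0, 1)

def tanh(z):
    return np.tanh(z)                      # squashes input into (-1, 1), centred at zero

def relu(z):
    return np.where(z < 0, 0, z)           # passes positive values, zeroes out negatives

def leaky_relu(z, slope=0.1):
    return np.where(z < 0, slope * z, z)   # keeps a small gradient for negative inputs

z = np.linspace(-5, 5, 5)
print(sigmoid(z), tanh(z), relu(z), leaky_relu(z), sep="\n")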
Scikit-Learn Library for Machine Learning¶
The Scikit-learn library provides a unified and easy-to-use platform for implementing a wide range of machine learning algorithms in Python, making it accessible to both beginners and experienced users. It offers consistent APIs for tasks such as data preprocessing, classification, regression, clustering, dimensionality reduction, and model evaluation, which simplifies the entire machine learning workflow. By integrating seamlessly with libraries like NumPy, Pandas, and Matplotlib, Scikit-learn allows users to efficiently prepare data, train models, tune parameters, and assess performance using well-documented and reliable tools. This consistency and simplicity enable faster experimentation, reproducibility, and practical application of machine learning techniques to real-world problems.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
df = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
feature_cols = ['math score', 'reading score']
target_col = 'writing score'
X = df[feature_cols].values
y = df[target_col].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
model = MLPRegressor(hidden_layer_sizes=(50,50),
activation='tanh',
solver='adam',
max_iter=1000,
random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.3f}")
print(f"R2 Score: {r2:.3f}")
plt.figure(figsize=(8,6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.xlabel("True Writing Score")
plt.ylabel("Predicted Writing Score")
plt.title("MLP Regressor Predictions with Tanh Activation")
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.show()
Mean Squared Error: 26.222
R2 Score: 0.891
/opt/conda/lib/python3.13/site-packages/sklearn/neural_network/_multilayer_perceptron.py:781: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet. warnings.warn(
Functions¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
x = data['writing score'].values
x = np.sort(x)
# center and scale the scores so the inputs span negative and positive values;
# raw scores (all positive, up to 100) would saturate the sigmoid and tanh curves
x = (x - x.mean()) / x.std()
plt.plot(x, 1/(1+np.exp(-x)), label='Sigmoid')
plt.plot(x, np.tanh(x), label='Tanh')
plt.plot(x, np.where(x < 0, 0, x), label='ReLU')
plt.plot(x, np.where(x < 0, 0.1*x, x), '--', label='Leaky ReLU')
plt.legend()
plt.xlabel("Input values (from your data)")
plt.ylabel("Activation output")
plt.title("Activation Functions using Uploaded Data")
plt.show()
Probability¶
Definition¶
Probability is a branch of mathematics that deals with measuring the likelihood of events occurring, expressed as a value between 0 and 1. It provides a systematic way to quantify uncertainty and make informed predictions based on available data. In simple terms, probability helps answer questions such as how likely an outcome is to happen and how confident we can be in a particular result.
Probability in Data Science¶
In data science, probability plays a fundamental role in analyzing data and making decisions under uncertainty. It is used in statistical modeling, hypothesis testing, and predictive analytics to estimate future outcomes based on historical data. Probability forms the basis of many machine learning algorithms, such as Naive Bayes classifiers, probabilistic graphical models, and Bayesian inference. It also helps in understanding data distributions, measuring risk, handling randomness and noise in data, and evaluating model performance. Overall, probability enables data scientists to draw reliable conclusions, make predictions, and build models that can generalize well to real-world situations.
Histogram¶
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
scores = data['math score']
# Plot histogram (normalized)
plt.hist(scores, bins=20, density=True, alpha=0.6)
mean = scores.mean()
std = scores.std()
x = np.linspace(scores.min(), scores.max(), 200)
gaussian = (1 / (std * np.sqrt(2 * np.pi))) * np.exp(-0.5 * ((x - mean) / std) ** 2)
plt.plot(x, gaussian)
plt.xlabel("Math Score")
plt.ylabel("Density")
plt.title("Histogram of Math Scores with Gaussian Curve")
plt.show()
Averaging¶
Averaging in probability refers to the process of finding the expected or mean value of a random variable by considering all possible outcomes and their probabilities. Instead of simply taking a regular arithmetic average of observed values, probabilistic averaging weights each outcome by how likely it is to occur. This gives a more accurate representation of what we expect to happen over many repeated trials. In probability and data science, this concept is known as the expected value. It is calculated by multiplying each possible outcome by its probability and then summing these products. Averaging in probability is widely used to summarize uncertain situations, evaluate risks, compare outcomes, and support decision-making. For example, in data science and machine learning, probabilistic averaging helps estimate model predictions, reduce noise through repeated sampling, and understand long-term behavior of random processes.
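As a tiny worked example of this weighted average (the outcomes and probabilities below are invented for illustration, not taken from my dataset), the expected value E[X] = Σ xᵢ·pᵢ of a fair die roll can be computed directly; the cell after this sketch then demonstrates averaging over repeated samples from my data:
import numpy as np
outcomes = np.array([1, 2, 3, 4, 5, 6])   # possible results of a fair die roll
probs = np.full(6, 1 / 6)                 # each outcome is equally likely
# weight each outcome by its probability and sum the products
expected_value = np.sum(outcomes * probs)
print(expected_value)                     # 3.5: the long-run average over many rolls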
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
column = "math score"
x = data[column].dropna().values
true_mean = np.mean(x)
true_std = np.std(x)
trials = 200
points = np.arange(10, 500, 25)
means = np.zeros((trials, len(points)))
for p in range(len(points)):
N = points[p]
for t in range(trials):
sample = np.random.choice(x, size=N, replace=True)
means[t, p] = np.mean(sample)
plt.plot(points, true_mean + true_std / np.sqrt(points), 'r', label='calculated')
plt.plot(points, true_mean - true_std / np.sqrt(points), 'r')
estimated_mean = np.mean(means, axis=0)
estimated_std = np.std(means, axis=0)
plt.errorbar(points, estimated_mean, yerr=estimated_std,
fmt='k-o', capsize=7, label='estimated')
for p in range(len(points)):
plt.plot(np.full(trials, points[p]), means[:, p], 'o', markersize=2)
plt.xlabel('number of samples averaged')
plt.ylabel('mean estimates')
plt.legend()
plt.show()
Explanation¶
This graph illustrates the concept of the Central Limit Theorem and the reduction of uncertainty through averaging. The horizontal axis represents the number of samples averaged (the sample size N), and the vertical axis shows the resulting mean estimates from multiple simulations. The individual colored dots represent the many different mean estimates obtained for each sample size. The black line with error bars (labeled "estimated") shows the mean of these estimates and their standard deviation (the error bars) at each N. The red lines (labeled "calculated") represent the theoretical population mean plus or minus the theoretical standard error σ/√N, showing how the expected range of sample means shrinks as N increases. As the number of samples averaged increases, the spread of the individual mean estimates (the dots) decreases, and both the estimated and calculated uncertainty bands narrow, demonstrating that a larger sample size leads to more consistent and precise estimates that converge toward the true population mean.
Density Estimation¶
Density estimation in data science is the process of estimating the underlying probability distribution of a dataset based on observed data points. Instead of assuming that data follows a known distribution, density estimation helps describe how data values are spread across the range of possible values. The result is a probability density function (PDF) that shows where data points are concentrated and how likely different values are to occur. In data science, density estimation is used to understand data distributions, detect patterns, and identify anomalies or outliers. Common methods include parametric approaches, such as assuming a normal (Gaussian) distribution and estimating its parameters (mean and variance), and non-parametric approaches, such as kernel density estimation (KDE), which makes fewer assumptions about the data’s shape. Density estimation is important in tasks like data visualization, probabilistic modeling, clustering, anomaly detection, and feature analysis, as it provides a foundation for making probabilistic inferences from data.
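A minimal sketch of the non-parametric approach, applying scipy's gaussian_kde to the math scores from my dataset (the bandwidth is left at scipy's default):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

data = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
scores = data["math score"].values
# fit a kernel density estimate and evaluate it over the score range
kde = gaussian_kde(scores)
grid = np.linspace(scores.min(), scores.max(), 200)
plt.hist(scores, bins=20, density=True, alpha=0.5, label="histogram")
plt.plot(grid, kde(grid), label="KDE")
plt.xlabel("Math Score")
plt.ylabel("Density")
plt.legend()
plt.show()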
Voronoi Cluster, K-Means, Elbow Plot¶
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d
data = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
xcol = "reading score"
ycol = "math score"
x = data[xcol].dropna().values
y = data[ycol].dropna().values
N = min(len(x), len(y))
x = x[:N]
y = y[:N]
nsteps = 1000
momentum = 0.
def kmeans(x, y, momentum, nclusters):
    # pick random data points as the initial centroids; cast to float so the
    # centroid updates below are not truncated to integer scores
    indices = np.random.uniform(low=0, high=len(x), size=nclusters).astype(int)
    mux = x[indices].astype(float)
    muy = y[indices].astype(float)
for _ in range(nsteps):
X = np.outer(x, np.ones(len(mux)))
Y = np.outer(y, np.ones(len(muy)))
Mux = np.outer(np.ones(len(x)), mux)
Muy = np.outer(np.ones(len(x)), muy)
distances = np.sqrt((X - Mux)**2 + (Y - Muy)**2)
mins = np.argmin(distances, axis=1)
for i in range(len(mux)):
index = np.where(mins == i)
if len(index[0]) > 0:
mux[i] = np.mean(x[index])
muy[i] = np.mean(y[index])
distances = 0
for i in range(len(mux)):
index = np.where(mins == i)
distances += np.sum(np.sqrt((x[index] - mux[i])**2 + (y[index] - muy[i])**2))
return mux, muy, distances
def plot_kmeans(x, y, mux, muy):
fig, ax = plt.subplots()
ax.plot(x, y, ".", alpha=0.5)
ax.plot(mux, muy, "r.", markersize=20)
ax.set_xlabel(xcol)
ax.set_ylabel(ycol)
ax.set_title(f"{len(mux)} clusters (centroids)")
plt.show()
def plot_Voronoi(x, y, mux, muy):
fig, ax = plt.subplots()
ax.plot(x, y, ".", alpha=0.5)
vor = Voronoi(np.stack((mux, muy), axis=1))
voronoi_plot_2d(vor, ax=ax, show_points=True, show_vertices=False, point_size=20)
ax.set_xlabel(xcol)
ax.set_ylabel(ycol)
ax.set_title(f"{len(mux)} clusters (Voronoi)")
plt.show()
distances = []
for k in range(1, 6):
mux, muy, d = kmeans(x, y, momentum, k)
distances.append(d)
if k <= 2:
plot_kmeans(x, y, mux, muy)
else:
plot_Voronoi(x, y, mux, muy)
plt.figure()
plt.plot(range(1, 6), distances, "o-")
plt.xlabel("Number of clusters")
plt.ylabel("Total distance to clusters")
plt.title("Elbow Plot")
plt.xticks(range(1, 6))
plt.show()
Explanation¶
K-Means is a popular unsupervised machine learning algorithm used for clustering data by partitioning it into K groups based on similarity, where each data point is assigned to the nearest cluster center (centroid). The resulting clusters can be visualized as Voronoi clusters, where the data space is divided into regions such that every point in a region is closer to its corresponding centroid than to any other centroid, forming Voronoi cells around each cluster center. An Elbow Plot is a diagnostic tool used to determine the optimal number of clusters in K-Means by plotting the number of clusters against the within-cluster sum of squares (inertia); the “elbow” point indicates where adding more clusters yields diminishing improvements, helping to select a balanced and meaningful value of K.
Transform¶
In data science, a transform refers to the process of changing or converting data from one format, scale, or representation to another to make it more suitable for analysis, modeling, or visualization. Transformations can be applied to individual variables (columns), entire datasets, or features, depending on the goal.
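For instance, a standardization transform (rescaling each score column to zero mean and unit variance, the same idea applied before the PCA below) might look like this sketch:
import pandas as pd
df = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
score_cols = ["math score", "reading score", "writing score"]
# subtract each column's mean and divide by its standard deviation
standardized = (df[score_cols] - df[score_cols].mean()) / df[score_cols].std()
print(standardized.describe().loc[["mean", "std"]])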
Principal Components Analysis (PCA)¶
Principal Component Analysis (PCA) is important because it simplifies complex datasets by reducing their dimensionality while preserving most of the essential information. By transforming correlated variables into a smaller set of uncorrelated components, PCA helps reveal hidden patterns, improves data visualization, and reduces noise. This makes it especially valuable for speeding up machine learning algorithms, preventing overfitting, and handling high-dimensional data more efficiently. Overall, PCA enhances both the interpretability and performance of data analysis processes.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import pandas as pd
data = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
print("Columns available in your dataset:\n", data.columns)
y = data["parental level of education"].astype('category').cat.codes
X = data[["math score", "reading score", "writing score"]].values
print(f"Your data shape (records, features): {X.shape}")
plt.scatter(X[:,0], X[:,1], c=y)
plt.xlabel("math score")
plt.ylabel("reading score")
plt.title("Students Performance: Two Features")
plt.colorbar(label="parental level of education")
plt.show()
X = X - np.mean(X, axis=0)
std = np.std(X, axis=0)
Xscale = X / np.where(std > 0, std, 1)
pca = PCA(n_components=3)
Xpca = pca.fit_transform(Xscale)
plt.plot(pca.explained_variance_, 'o')
plt.xlabel("component")
plt.ylabel("explained variance")
plt.show()
plt.scatter(Xpca[:,0], Xpca[:,1], c=y)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PCA of Student Performance")
plt.colorbar(label="parental level of education")
plt.show()
Columns available in your dataset:
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
'test preparation course', 'math score', 'reading score',
'writing score'],
dtype='object')
Your data shape (records, features): (1000, 3)
References¶
- OpenAI. (2023). ChatGPT: Optimizing language models for dialogue. Available at: https://openai.com/chatgpt
- Grokai. (n.d.). AI-powered coding assistance platform. Available at: https://grok.com/
- Gemini. (n.d.). AI-based learning and coding platform. Available at: https://gemini.google.com/
Thank You¶