Fab Future DataScience Class 2025¶
My Data Analysis¶
Import essential Python libraries¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
Week01: Tools and Data Visualization¶
df = pd.read_csv('~/work/sonam-dendup/datasets/StudentsPerformance.csv')
df.head(2)
|   | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score |
|---|---|---|---|---|---|---|---|---|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
Data Filtering¶
df.columns
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
'test preparation course', 'math score', 'reading score',
'writing score'],
dtype='object')
df.isna().sum()
gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64
df.shape
(1000, 8)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   gender                        1000 non-null   object
 1   race/ethnicity                1000 non-null   object
 2   parental level of education   1000 non-null   object
 3   lunch                         1000 non-null   object
 4   test preparation course       1000 non-null   object
 5   math score                    1000 non-null   int64
 6   reading score                 1000 non-null   int64
 7   writing score                 1000 non-null   int64
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
df.describe()
|   | math score | reading score | writing score |
|---|---|---|---|
| count | 1000.00000 | 1000.000000 | 1000.000000 |
| mean | 66.08900 | 69.169000 | 68.054000 |
| std | 15.16308 | 14.600192 | 15.195657 |
| min | 0.00000 | 17.000000 | 10.000000 |
| 25% | 57.00000 | 59.000000 | 57.750000 |
| 50% | 66.00000 | 70.000000 | 69.000000 |
| 75% | 77.00000 | 79.000000 | 79.000000 |
| max | 100.00000 | 100.000000 | 100.000000 |
df['gender'].nunique()
2
df['gender'].value_counts()
gender
female    518
male      482
Name: count, dtype: int64
df['parental level of education'].nunique() # check the unique data
6
df['parental level of education'].value_counts()
parental level of education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64
Data Visualization¶
# Plot 01=> Pie Chart
count = df['parental level of education'].value_counts()
count
plt.pie(count, labels=count.index, autopct="%1.0f%%", explode=[0, 0.05, 0.05, 0, 0.05, 0])
plt.axis('equal')
plt.title("Parental level of education")
#plt.savefig("fig2.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()
Plot01 Analysis: The dataset shows a diverse but uneven distribution of parental education levels. The most common backgrounds are some college and associate's degree, which together account for the largest share (roughly 45% of parents). Bachelor's degree, high school, and some high school sit in the middle of the range, while master's degree is the least frequent category at about 6%. No single level dominates the chart, and every level is represented to a meaningful degree across the student population.
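To back up the statements about segment sizes, a minimal sketch (reusing the df and count already defined above) that prints the exact percentage share of each parental-education level:
count = df['parental level of education'].value_counts()
shares = (count / count.sum() * 100).round(1)  # percentage share of each category
print(shares)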
new_df = df['math score'].iloc[0:50]
# Line Plot
plt.plot(new_df, color="black", marker='o', linestyle="-", linewidth=1.5)
plt.xlabel("Student Index (First 50)")
plt.ylabel("Math score")
plt.title("Student Performance Math (First 50 Students)")
#plt.savefig("fig3.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()
Plot02 Analysis: There is no discernible correlation between a student's index number (position in the first 50 entries of the dataset) and their math score. The curve displays high variability in performance among this group, with scores fluctuating frequently between high and low points, indicating a mix of strong, average, and weak performers spread randomly throughout the initial dataset sequence.
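The "no discernible correlation" reading can be checked numerically; a minimal sketch that reuses new_df from the cell above and computes the Pearson correlation between position and score (a value near 0 supports the claim):
idx = np.arange(len(new_df))        # student position 0..49
r = np.corrcoef(idx, new_df)[0, 1]  # Pearson correlation with the math score
print(f"Correlation between student index and math score: {r:.3f}")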
# Plot 03
color_map = {"male": "skyblue", "female": "gray"}
for gender, color in color_map.items():
df_gender = df[df["gender"] == gender]
# Plot only the filtered data (df_gender) using the corresponding color and label
plt.scatter(df_gender["reading score"], df_gender["writing score"], c=color, label=gender)
plt.legend()
plt.xlabel("Reading score")
plt.ylabel("Writing score")
plt.title("Scatter Plot ")
#plt.savefig("fig2.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()
Plot03: The scatter plot reveals a strong positive correlation between reading scores and writing scores for both genders. Students who score high in reading generally score high in writing, and vice-versa, indicating that these two skills are closely linked within this student population. Furthermore, the data points for males and females largely overlap across the score range. While slight variations in clustering might exist, neither gender consistently outperforms the other across the full spectrum of scores, suggesting that the strength of the correlation applies equally regardless of gender identity.
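A quick numerical check of the correlation claim, using the same df; a minimal sketch that prints the reading-writing correlation overall and per gender:
overall_r = df['reading score'].corr(df['writing score'])
print(f"Overall reading-writing correlation: {overall_r:.3f}")
for g, sub in df.groupby('gender'):
    print(f"{g}: {sub['reading score'].corr(sub['writing score']):.3f}")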
#Plot 04
mal_mat =df[df["gender"] == "male"]["math score"]
fem_mat =df[df["gender"] == "female"]["math score"]
#print(mal_mat, fem_mat)
plt.boxplot([mal_mat, fem_mat], tick_labels=["Male", "Female"])  # 'labels' was renamed to 'tick_labels' in Matplotlib 3.9
plt.title("BOX PLOT")
plt.show()
Plot04: The box plot suggests that male students achieved a slightly higher median math score than female students (the center line of the male box sits a little above the female box). However, the two distributions overlap considerably, indicating that both high and low scores occur in both groups. The overall range (from minimum to maximum, shown by the whiskers) and the interquartile range (the boxes themselves) are similar in spread for males and females.
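The medians and quartiles read off the box plot can be confirmed directly from the data; a minimal sketch:
# Quartiles of math score per gender (25%, median, 75%)
print(df.groupby('gender')['math score'].describe()[['25%', '50%', '75%']])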
# Plot 05
color_map = {"male": "skyblue", "female": "black"}
for gender, color in color_map.items():
df_gender = df[df["gender"] == gender]
# Plot only the filtered data (df_gender) using the corresponding color and label
plt.scatter(df_gender["reading score"], df_gender["math score"], c=color, label=gender)
plt.legend()
plt.xlabel("Reading score")
plt.ylabel("math")
plt.title("Scatter Plot ")
#plt.savefig("fig2.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()
Plot 05: The scatter plot reveals a strong positive correlation between reading scores and math scores across the entire student population. This indicates that a student's performance in one subject is a reliable predictor of their performance in the other. When comparing genders: The data points for males and females show significant overlap across the entire range of the plot, meaning that the relationship between reading and math performance is consistent regardless of gender. The plot does not show distinct clusters for either gender, suggesting that neither males nor females dominate the highest or lowest performance tiers in a way that separates them clearly from the general trend.
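To put numbers on the relationship described above, a minimal sketch that prints the pairwise correlations among all three scores:
print(df[['math score', 'reading score', 'writing score']].corr())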
Week02-Fitting and Machine Learning¶
Polynomial Fit (polyfit)¶
df_new = df.sort_values(by='reading score', ascending=True)
npts = 200
c = [0.01, 0.01, 20]
noise = 10
min_x = min(df_new['reading score'])
max_x = max(df_new['reading score'])
x = min_x + (max_x - min_x) * np.random.rand(npts)
y = c[2] + c[1] * x + c[0] * x * x + np.random.normal(0, noise, npts)
# Polynomial Fit (polyfit)
c1 = np.polyfit(x, y, 1) # fit first-order polynomial
c2 = np.polyfit(x, y, 2) # fit second-order polynomial
xfit = np.linspace(min(x), max(x), 100)
p1 = np.poly1d(c1)
y1 = p1(xfit)
p2 = np.poly1d(c2)
y2 = p2(xfit)
plt.figure()
plt.plot(x, y, 'o')
plt.plot(xfit, y1, 'g-', label='linear')
plt.plot(xfit, y2, 'r-', label='quadratic')
plt.xlabel(" X Values (based on Reading Score range)")
plt.ylabel("Y Values (Quadratic Model)")
plt.legend()
plt.title("Polynomial Fit")
plt.savefig("fig1.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()
Plot 06: For data that follow a curved (non-linear) pattern, a straight-line fit is insufficient; a higher-order polynomial fit, like the quadratic one shown here, captures the curvature and follows the points more closely. The order should still be kept as low as the data allow, since very high-order polynomials tend to overfit the noise.
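"More accurate" can be quantified by comparing the residual error of the two fits; a minimal sketch reusing x, y, c1, and c2 from the cell above (with the small quadratic coefficient used here, the improvement may be modest):
res1 = y - np.polyval(c1, x)   # residuals of the linear fit
res2 = y - np.polyval(c2, x)   # residuals of the quadratic fit
print(f"Linear fit    RMSE: {np.sqrt(np.mean(res1**2)):.2f}")
print(f"Quadratic fit RMSE: {np.sqrt(np.mean(res2**2)):.2f}")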
Week 03: Probability and Density Estimation¶
Probability¶
def show_distribution(mat_score):
import seaborn as sns
min_val = mat_score.min()
max_val = mat_score.max()
mean_val = mat_score.mean()
med_val = mat_score.median()
mode_val = mat_score.mode()[0]
    print(f'Minimum: {min_val}\nMaximum: {max_val}\nMean: {mean_val}\nMedian: {med_val}\nMode: {mode_val}')
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(10, 6))
#add line for mean, mode and median
ax[0].hist(mat_score, edgecolor ='black')
ax[0].set_ylabel('Frequency')
ax[0].axvline(x =min_val, color ='red',linestyle='dashed', linewidth=2)
ax[0].axvline(x =max_val, color ='red',linestyle='dashed', linewidth=2)
ax[0].axvline(x =mean_val, color ='cyan',linestyle='dashed',linewidth=2)
ax[0].axvline(x =med_val, color ='gray',linestyle='dashed', linewidth=2)
ax[0].axvline(x =mode_val, color ='orange',linestyle='dashed', linewidth=2)
# plot the boxplot
ax[1].boxplot(mat_score, vert = False)
ax[1].set_xlabel('Score')
# plot the Density plot
sns.kdeplot(mat_score, ax=ax[2])
ax[2].set_xlabel('Score')
    fig.suptitle('Data Distribution')
    plt.show()
col = df['math score']
# call the function
show_distribution(col)
Minimum: 0
Maximum: 100
Mean: 66.089
Median: 66.0
Mode: 65
# Remove low-score outliers (math scores below 30)
col = df[df['math score'] > 29]['math score']
#fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(10, 6))
show_distribution(col)
Minimum: 30
Maximum: 100
Mean: 66.71399594320486
Median: 66.5
Mode: 65
df.boxplot(column = 'math score' , by = 'parental level of education', figsize=(10,5))
plt.grid(False)
plt.show()
The distribution of 'math scores' appears to be largely symmetrical and unimodal because the mean, median, and mode are closely aligned. The majority of students score between approximately 50 and 80 points, as indicated by the tight clustering in the box plot's IQR. There are a few outliers visible in the box plot, suggesting a small number of students achieved exceptionally high or low scores compared to their peers.
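The symmetry claim can be checked with a skewness statistic; a minimal sketch using scipy (values close to 0 support the "largely symmetrical" reading):
from scipy.stats import skew
print(f"Skewness of math score: {skew(df['math score']):.3f}")
print(f"Skewness after removing scores below 30: {skew(df[df['math score'] > 29]['math score']):.3f}")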
df.columns
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
'test preparation course', 'math score', 'reading score',
'writing score'],
dtype='object')
df['parental level of education'].value_counts()
parental level of education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64
df_map = df['parental level of education'].map(
{
'some college': 1 ,
"associate's degree": 2,
'high school': 3 ,
'some high school': 4,
"bachelor's degree": 5,
"master's degree" : 6
}
)
df["math score"].map(df_map)
math_score = df['math score'].iloc[0:50]
trials = 100
points = np.arange(1, len(math_score)+1) # sample sizes from 1 to number of students
means = np.zeros((trials, len(points)))
for i, n in enumerate(points):
for t in range(trials):
means[t, i] = np.mean(np.random.choice(math_score, size=n, replace=True)) # sampling with replacement
mean_estimates = np.mean(means, axis=0)
std_estimates = np.std(means, axis=0)
plt.errorbar(points, mean_estimates, yerr=std_estimates, fmt='k-o', capsize=5, label='estimated')
mean_score = np.mean(math_score)
std_score = np.std(math_score)
plt.plot(points, mean_score + std_score/np.sqrt(points), 'r', label='calculated')
plt.plot(points, mean_score - std_score/np.sqrt(points), 'r')
for i, n in enumerate(points):
plt.plot(np.full(trials, n), means[:, i], 'o', markersize=3, alpha=0.5)
plt.xlabel("number of samples averaged")
plt.ylabel("mean estimates of Math score")
plt.title('Averaging')
plt.legend()
plt.show()
df["math score"].map(df_map)
math_score = df['math score']
npts = len(math_score)
mean = np.mean(math_score)
stddev = np.std(math_score)
#Plot
plt.hist(math_score, bins=npts//50, density=True, alpha=0.6, color='gray', edgecolor='black')
plt.plot(math_score, 0*math_score, '|', ms=20, color='black')
xi = np.linspace(mean-3*stddev, mean+3*stddev, 100)
yi = np.exp(-(xi-mean)**2/(2*stddev**2))/np.sqrt(2*np.pi*stddev**2)
plt.plot(xi, yi, 'b', lw=2, label='Gaussian Fit')
plt.xlabel("Math Score")
plt.ylabel("Density")
plt.title("Density Estimation: Gaussian Curve")
plt.legend()
plt.savefig("fig2.1.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()
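As a complement to the single parametric Gaussian above, a minimal sketch (reusing math_score, mean, and stddev from the previous cell) that overlays a kernel density estimate from scipy, which makes no normality assumption:
from scipy.stats import gaussian_kde, norm
kde = gaussian_kde(math_score)   # bandwidth picked automatically (Scott's rule)
xi = np.linspace(0, 100, 200)
plt.hist(math_score, bins=20, density=True, alpha=0.4, color='gray', edgecolor='black')
plt.plot(xi, norm.pdf(xi, mean, stddev), 'b', lw=2, label='Gaussian fit')
plt.plot(xi, kde(xi), 'r--', lw=2, label='KDE')
plt.xlabel("Math Score")
plt.ylabel("Density")
plt.legend()
plt.show()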
k-means Clustering¶
from sklearn.cluster import KMeans
from scipy.spatial import Voronoi, voronoi_plot_2d
X = df[['math score', 'reading score']].values
# Run K-Means
K = 3
kmeans = KMeans(n_clusters=K, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)
centers = kmeans.cluster_centers_
#Create Voronoi Diagram
vor = Voronoi(centers)
# Plotting
fig, ax = plt.subplots(figsize=(10, 8))
# Define colors for the regions to match the 'viridis' scatter plot
cmap = plt.cm.viridis
colors = cmap(np.linspace(0, 1, K))
for i, region_idx in enumerate(vor.point_region):
    region = vor.regions[region_idx]
    if -1 not in region:
        polygon = [vor.vertices[v] for v in region]
        ax.fill(*zip(*polygon), color=colors[i], alpha=0.2)
# Plot Voronoi boundaries
voronoi_plot_2d(vor, ax=ax, show_points=True, show_vertices=True,
line_colors='black', line_width=2, line_alpha=0.6)
# Scatter plot of student scores
ax.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=50, edgecolors='k', alpha=0.8, label='Students')
# Plot the centroids
ax.scatter(centers[:, 0], centers[:, 1], c='red', marker='X', s=200, edgecolors='black', label='Centroids')
# Formatting
ax.set_title(f'Student Score Clusters (K={K}) with Voronoi Regions')
ax.set_xlabel('Math Score')
ax.set_ylabel('Reading Score')
ax.set_xlim(X[:, 0].min() - 5, X[:, 0].max() + 5)
ax.set_ylim(X[:, 1].min() - 5, X[:, 1].max() + 5)
ax.legend()
plt.show()
from sklearn.cluster import KMeans
def plot_kmeans_elbow(data, K_max):
    k_values = []
    inertias = []
    for k in range(1, K_max + 1):
        kmeans = KMeans(n_clusters=k, n_init="auto", random_state=42)
        kmeans.fit(data)
        k_values.append(k)
        inertias.append(kmeans.inertia_)
    # Generate the Elbow Plot
    plt.figure(figsize=(8, 6))
    plt.plot(k_values, inertias, 'o-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.grid(True)
plt.show()
plot_kmeans_elbow(df[['reading score', 'writing score']], 10)
Inference: Based on the elbow plot, what is a suitable value of k for k-means clustering of this data? The point where the inertia curve bends and flattens marks the best trade-off between fit and simplicity; the silhouette sketch below offers a complementary check.
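One way to answer this is to complement the elbow plot with silhouette scores, where higher values indicate better-separated clusters; a minimal sketch, assuming the same reading/writing features used above:
from sklearn.metrics import silhouette_score
X2 = df[['reading score', 'writing score']].values
for k in range(2, 7):
    labels_k = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(X2)
    print(f"k={k}: silhouette = {silhouette_score(X2, labels_k):.3f}")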
df2 = df[['reading score', 'math score']].copy()
df2
|   | reading score | math score |
|---|---|---|
| 0 | 72 | 72 |
| 1 | 90 | 69 |
| 2 | 95 | 90 |
| 3 | 57 | 47 |
| 4 | 78 | 76 |
| ... | ... | ... |
| 995 | 99 | 88 |
| 996 | 55 | 62 |
| 997 | 71 | 59 |
| 998 | 78 | 68 |
| 999 | 86 | 77 |
1000 rows × 2 columns
Data Science: Machine Learning¶
Neural Networks¶
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
X = df2.drop('math score', axis=1) # Independent features
y = df2['math score'] # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Build & Train Model
model = MLPRegressor(hidden_layer_sizes=(64, 64), activation='relu', max_iter=2000, random_state=42)
model.fit(X_train_scaled, y_train)
# Evaluate Performance
predictions = model.predict(X_test_scaled)
print(f"R-squared Score: {r2_score(y_test, predictions)}")
R-squared Score: 0.6753082526328942
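mean_squared_error is imported above but not used; a minimal sketch that reports the error in score points alongside R²:
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"RMSE: {rmse:.2f} points")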
plt.figure(figsize=(8, 6))
plt.plot(model.loss_curve_)
plt.title('Neural Network Training Loss Curve')
plt.xlabel('Iterations')
plt.ylabel('Loss (Mean Squared Error)')
plt.grid(False)
plt.show()
# Entropy calculation
import numpy as np
import pandas as pd
from scipy.stats import entropy
# 1. Get probability distribution (binning is recommended for continuous scores)
# Using value_counts(normalize=True) creates the p(x) distribution
prob_dist = df2['math score'].value_counts(normalize=True)
# 2. Calculate Shannon Entropy (base 2 for bits)
math_entropy = entropy(prob_dist, base=2)
print(f"Shannon Entropy: {math_entropy:.4f} bits")
Shannon Entropy: 5.8767 bits
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10, 6))
# Plot the distribution of scores
sns.histplot(df2['math score'], kde=True, color='black', stat="probability")
# Overlay entropy value as text
plt.text(df2['math score'].min(), 0.05, f'Calculated Entropy: {math_entropy:.2f} bits',
fontsize=12, bbox=dict(facecolor='white', alpha=0.5))
plt.title('Math Score Distribution and Entropy Analysis')
plt.xlabel('Math Score')
plt.ylabel('Probability')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
- Low Entropy: if scores are tightly clustered (e.g., most students scored between 80–85), entropy will be low because the "surprise" of a new score is minimal.
- High Entropy: if scores are spread evenly from 0 to 100, entropy will be high, indicating maximum randomness.
- Binning Sensitivity: the choice of bins can change the measured entropy; bins should be wide enough to capture the diversity of the math scores without being so wide that they obscure trends (see the sketch below).
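To see the binning sensitivity in practice, a minimal sketch that recomputes the entropy of df2['math score'] over a few illustrative bin counts:
from scipy.stats import entropy
for bins in [5, 10, 20, 50]:
    p = pd.cut(df2['math score'], bins=bins).value_counts(normalize=True)
    print(f"{bins:>2} bins -> entropy = {entropy(p, base=2):.3f} bits")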
Data Science: Transforms¶
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
# Select numeric features (here: reading score and math score; drop 'math score' first if it is the prediction target)
features = df2.select_dtypes(include=[np.number]).columns
x = df2[features].values
# Standardize (Crucial for PCA)
x_scaled = StandardScaler().fit_transform(x)
# Apply PCA (e.g., reduce to 2 components for visualization)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x_scaled)
# Results
print(f"Explained Variance Ratio: {pca.explained_variance_ratio_}")
Explained Variance Ratio: [0.90878983 0.09121017]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# 1. Select the numeric features (here: reading score and math score)
features = df2.select_dtypes(include=[np.number]).columns
x = df2[features].values
# 2. Standardization (Crucial: PCA is sensitive to scale)
# Standardizes to mean=0 and variance=1
x_scaled = StandardScaler().fit_transform(x)
# 3. Apply PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x_scaled)
# 4. Create a DataFrame for results
pca_df = pd.DataFrame(data=principalComponents,
columns=['PC1', 'PC2'])
# Plotting the 2D Projection
plt.figure(figsize=(8, 6))
sns.scatterplot(x='PC1', y='PC2', data=pca_df, alpha=0.7)
plt.axhline(0, color='gray', linestyle='--')
plt.axvline(0, color='gray', linestyle='--')
plt.title('PCA of Math Score Data')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)')
plt.show()
Visualization & Interpretation: to understand how the scores are distributed and which variables drive the components, use a biplot or a loadings plot.
- Scree Plot (Variance Explained): check pca.explained_variance_ratio_ to see how much information each component holds. Here PC1 explains about 91% of the total variance, representing the overall proficiency shared by reading and math.
- Loadings Plot: examine pca.components_. When reading score and math score both have large loadings of the same sign on PC1, it confirms that the two scores rise and fall together (the sketch below prints the loadings).
- Scatter Plot: plotting PC1 vs. PC2 lets you spot groupings of students; the students at one extreme of PC1 are typically the strongest overall performers.
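The loadings mentioned above can be inspected directly; a minimal sketch reusing pca and features from the previous cell:
loadings = pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2'], index=features)
print(loadings)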
Fast Fourier Transform (FFT)¶
import numpy as np
import matplotlib.pyplot as plt
from scipy.fft import fft, fftfreq
# 1. Prepare the signal
# Remove mean to focus on variations (DC offset removal)
scores = df2['math score'].values
scores_detrended = scores - np.mean(scores)
N = len(scores)
# 2. Compute FFT
yf = fft(scores_detrended)
xf = fftfreq(N, 1) # Assumes 1 sample per unit of time (e.g., 1 test per month)
# 3. Plot Magnitude Spectrum
plt.figure(figsize=(8, 5))
plt.plot(xf[:N//2], 2.0/N * np.abs(yf[:N//2])) # Plot positive frequencies
plt.title('Frequency Domain: Math Score Periodicities')
plt.xlabel('Frequency (Cycles per unit time)')
plt.ylabel('Magnitude (Strength of Cycle)')
plt.grid(False)
plt.show()
- Non-Stationarity: the FFT assumes that the frequency content of the signal is constant over the whole series. If a student's study habits changed mid-year, a Short-Time Fourier Transform (STFT) or Wavelet Transform is more appropriate, since it captures localized changes.
- Data Size: the FFT is most efficient when the number of data points (N) is a power of 2. If the sample count is irregular, zero-padding to the next power of two is sometimes used, which can introduce "spectral leakage". The sketch below extracts the strongest component from the spectrum above.
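If a dominant cycle exists it can be read off programmatically; a minimal sketch reusing N, xf, and yf from the cell above (the ordering of students in this file is arbitrary, so this illustrates the technique rather than a substantive finding):
mag = 2.0 / N * np.abs(yf[:N // 2])
peak = np.argmax(mag[1:]) + 1   # skip index 0 (the removed DC component)
print(f"Dominant frequency: {xf[peak]:.4f} cycles per sample, magnitude {mag[peak]:.2f}")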
Final Conclusion¶
- Dataset Characteristics for ML
- Target Variables: The dataset supports both Regression (predicting a continuous score such as the math score) and Classification (for example, a binary Pass/Fail label derived from a score threshold).
- Feature Integration: It combines behavioral (test preparation course), demographic (gender, race/ethnicity), and socio-economic (parental level of education, lunch type) factors.
- Key Machine Learning Predictors
- Based on feature-importance analyses from similar student-performance studies: behavioral factors such as attendance and weekly study hours typically show the strongest relationship with final math scores; in this dataset their closest analogue is completion of the test preparation course. Socio-economic factors such as parental education level and lunch type are significant secondary predictors that shift a student's baseline performance.
- Previous Performance: scores on related assessments are usually the most accurate predictors of the target; here the reading and writing scores play that role, as the strong correlations and the regression R² above demonstrate.
- Recommended Algorithm Performance
- Best for Accuracy: Ensemble methods (like Random Forest or Gradient Boosting) and Neural Networks (MLP) typically outperform traditional linear models in identifying non-linear patterns between study habits and scores.
- Best for Interpretability: Linear Regression has shown high effectiveness (up to 93% accuracy in some 2024–2025 benchmarks) when the relationship between study time and total score is relatively direct.
- Clustering Potential: Unsupervised methods like K-Means are effective for segmenting students into "At-Risk," "Average," and "High-Achiever" groups for targeted interventions.
The dataset is an ideal "clean" resource for building early-warning systems. In a machine learning pipeline on this data, using the other exam scores together with test preparation status and parental education as primary features will likely yield the highest predictive power for math scores. For 2025 applications, such a model should be used to drive personalized learning interventions rather than purely for institutional grading.