# Print a greeting message.
print("Hello world All greeting from Bhutan")

Hello world All greeting from Bhutan

# Import relevant libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

 data = pd.read_csv ('~/work/sonam-dendup/datasets/StudentsPerformance.csv') # Dataset import from Kaggel Database

# Preview the first three rows.
data.head(3)

# List the column labels.
data.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

# Summarise the columns: non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB

# Descriptive statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

# Number of distinct values in the 'gender' column.
data['gender'].nunique()

2

data['parental level of education'].nunique()  # number of distinct education levels

6

# Frequency of each parental education level, most common first.
data['parental level of education'].value_counts()

parental level of education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

# Number of distinct values in the 'lunch' column.
data['lunch'].nunique()

2

# Number of students per gender.
data['gender'].value_counts()

gender
female    518
male      482
Name: count, dtype: int64

# Count missing values in each column.
data.isnull().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

# Math scores of the first 50 students (row positions 0-49).
x_first_50 = data['math score'].iloc[:50]

# Line plot of the sliced data.
# linewidth is given as a number (points); the original passed the string "1.5".
plt.plot(x_first_50, color="black", marker='o', linestyle="-", linewidth=1.5)
plt.xlabel("Student Index (First 50)")
plt.ylabel("Math score")
plt.title("Student Performance Math (First 50 Students)")
#plt.savefig("fig3.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()

# Box plot of the reading scores of all students.
plt.boxplot(data.loc[:, 'reading score'])
plt.show()

# Shape of the reading-score column.
data.loc[:, 'reading score'].shape

(1000,)

# Univariate => categorical data

# Column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB

# Pie Chart
# Frequency of each parental education level; used as the wedge sizes of the pie chart.
count = data['parental level of education'].value_counts()
count

parental level of education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

# Pie chart of the parental education levels. `explode` holds one radial
# offset per wedge (in `count` order); the non-zero entries pull the 2nd, 3rd
# and 5th wedges slightly out of the pie.
plt.pie(
    count,
    labels=count.index,
    autopct="%1.0f%%",  # '%%' prints a literal percent sign after each value
    explode=[0, 0.05, 0.05, 0, 0.05, 0],
)
plt.axis('equal')  # equal axis scaling so the pie is drawn as a circle
plt.title("Parental level of education")
#plt.savefig("fig2.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()

# Count Plot

# Number of students per gender; used as the bar heights of the bar plot.
gen = data['gender'].value_counts()
gen

gender
female    518
male      482
Name: count, dtype: int64

# Bar plot: one bar per gender, bar height = number of students.
plt.bar(
    x=gen.index,
    height=gen,
    color=['gray', 'black'],
)
plt.xlabel(" Gender")
plt.ylabel("Count")
plt.title("Gender  count")
plt.show()

# Copy of the data sorted by ascending reading score (`data` itself is not modified).
sort_read = data.sort_values("reading score")

# NOTE(review): this previews the unsorted `data`, not `sort_read` — confirm which was intended.
data.head(2)

# Bivariate => numerical + categorical
# Math scores of the male students, selected with a boolean row mask.
mal_mat = data.loc[data["gender"] == "male", "math score"]
mal_mat

3      47
4      76
7      40
8      64
10     58
       ..
985    57
987    81
990    86
994    63
996    62
Name: math score, Length: 482, dtype: int64

# Math scores of the female students, selected with a boolean row mask.
fem_mat = data.loc[data["gender"] == "female", "math score"]
fem_mat

0      72
1      69
2      90
5      71
6      88
       ..
993    62
995    88
997    59
998    68
999    77
Name: math score, Length: 518, dtype: int64

# Box Plot
# Side-by-side box plots of the math scores of male and female students.
# `tick_labels` is the new name of the `labels` keyword, which was renamed in
# Matplotlib 3.9 and is scheduled for removal in 3.11.
plt.boxplot([mal_mat, fem_mat], tick_labels=["Male", "Female"])
plt.title("BOX PLOT")
plt.show()

/tmp/ipykernel_8900/1166900140.py:2: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  plt.boxplot([mal_mat, fem_mat], labels = ["Male" , "Female"])

# Colour used for each gender in the scatter plot.
color_map = {"male": "skyblue", "female": "gray"}

# One scatter call per gender, so every group gets its own colour and legend entry.
for gender, color in color_map.items():
    df_gender = data.loc[data["gender"] == gender]
    plt.scatter(
        x=df_gender["reading score"],
        y=df_gender["writing score"],
        c=color,
        label=gender,
    )

plt.legend()
plt.title("Scatter Plot ")
plt.xlabel("Reading score")
plt.ylabel("Writing score")
#plt.savefig("fig2.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()

# One figure with three side-by-side panels: line plot, histogram, box plot.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: reading scores of the first 50 students (row positions 0-49).
axs[0].plot(data["reading score"].iloc[:50], color='black', marker='o',
            linestyle='-', markersize=2, linewidth=2)
axs[0].grid()
axs[0].set_title('LINE PLOT')
axs[0].set_xlabel('Index')
# The original called set_xlabel twice, so 'Reading Score' was overwritten by
# 'Index' and the y-axis stayed unlabeled; the score belongs on the y-axis.
axs[0].set_ylabel('Reading Score')

# Panel 2: distribution of the math scores in 10 bins.
axs[1].hist(data["math score"], bins=10, color='skyblue', edgecolor='black',
            linewidth=2)
axs[1].set_title('HISTOGRAM')
axs[1].set_xlabel('Math Score')
axs[1].set_ylabel('Frequency')

# Panel 3: box plot of the math scores.
axs[2].boxplot(data["math score"])
axs[2].set_title('BOX PLOT')
axs[2].set_xlabel('Math Score')
# plt.savefig("fig.png")
plt.show()

import kagglehub
from kagglehub import KaggleDatasetAdapter

# File to read from inside the Kaggle dataset.
file_path = "Exam_Score_Prediction.csv"

# Load the latest version of the dataset file as a pandas DataFrame.
# dataset_load() replaces load_dataset(), which is deprecated and will be
# removed in a future kagglehub release.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "kundanbedmutha/exam-score-prediction-dataset",
    file_path,
)

/tmp/ipykernel_8900/1666180015.py:6: DeprecationWarning: Use dataset_load() instead of load_dataset(). load_dataset() will be removed in a future version.
  df = kagglehub.load_dataset(

Downloading from https://www.kaggle.com/api/v1/datasets/download/kundanbedmutha/exam-score-prediction-dataset?dataset_version_number=2&file_name=Exam_Score_Prediction.csv...

100%|██████████| 1.37M/1.37M [00:00<00:00, 3.07MB/s]

import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot of exam score against study hours, coloured by gender.
# df.loc[:100] is a label-based, end-inclusive slice (index labels up to and
# including 100).
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x='study_hours',
    y='exam_score',
    hue='gender',
    data=df.loc[:100],
)
plt.title('Scatter Plot of X vs Y by Gender')
plt.show()

import seaborn as sns
import matplotlib.pyplot as plt

# Scatter of exam score against study hours with seaborn's default (linear)
# regression fit, on the label-based, end-inclusive slice df.loc[:100].
plt.figure(figsize=(7, 5))
sns.regplot(
    data=df.loc[:100],
    x="study_hours",
    y="exam_score",
)
plt.title("Linear Best Fit Line (Default)")
plt.show()

import seaborn as sns
import matplotlib.pyplot as plt

# Same scatter as the linear fit, but with a second-order polynomial
# regression curve (order=2).
plt.figure(figsize=(7, 5))
sns.regplot(data=df.loc[:100], x="study_hours", y="exam_score", order=2)
plt.title("Non-Linear Best Fit Line (Polynomial Order 2)")
plt.show()

	math score	reading score	writing score
count	1000.00000	1000.000000	1000.000000
mean	66.08900	69.169000	68.054000
std	15.16308	14.600192	15.195657
min	0.00000	17.000000	10.000000
25%	57.00000	59.000000	57.750000
50%	66.00000	70.000000	69.000000
75%	77.00000	79.000000	79.000000
max	100.00000	100.000000	100.000000

Week 1: Playground

Python code

Week 01 (Introduction and Tools)

Import dataset 01: Kaggle

Data Visualization with Matplotlib => pyplot API

Object Oriented API (Application Programming Interface)

	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
0	female	group B	bachelor's degree	standard	none	72	72	74
1	female	group C	some college	standard	completed	69	90	88
2	female	group B	master's degree	standard	none	90	95	93