import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Load the students-performance dataset from a CSV under the user's home directory.
data = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")
# Bare expression: shows the whole DataFrame when run as a notebook cell.
data

# Preview the first rows of the table.
data.head()

# Preview the last rows of the table.
data.tail()

# Summary statistics (count, mean, std, min, quartiles, max) of the score columns.
data.describe()

# List the column labels.
data.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

# Print each column's non-null count and dtype, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB

# Show the gender column (bracket access, matching the other column lookups in this file).
data['gender']

0      female
1      female
2      female
3        male
4        male
        ...  
995    female
996      male
997    female
998    female
999    female
Name: gender, Length: 1000, dtype: object

# Count how many rows fall under each distinct gender value.
data['gender'].value_counts()

gender
female    518
male      482
Name: count, dtype: int64

# Element-wise mask of missing values: True wherever a cell is null.
data.isna()

# Line chart of the score columns against the row index (one row per student).
data.plot.line(
    title='Students Performance',
    xlabel='No. of Students',
    ylabel='Scores',
)

<Axes: title={'center': 'Students Performance'}, xlabel='No. of Students', ylabel='Scores'>

# Bar chart of the average score per subject.
# Plotting the raw frame with kind='bar' draws one group of bars for every
# row (1000 students), which is unreadable and does not match the
# 'Reading/writing/Math' x-axis label.  Averaging each score column first
# puts the three subjects on the x-axis, in the order the label names them.
# The result is kept as a one-column DataFrame so that, as before, pandas
# draws on a fresh figure rather than on whatever axes are currently open.
subject_means = data[['reading score', 'writing score', 'math score']].mean()
subject_means.to_frame(name='average score').plot(
    kind='bar',
    title='Students Performance',
    xlabel='Reading/writing/Math',
    ylabel='Scores',
)

<Axes: title={'center': 'Students Performance'}, xlabel='Reading/writing/Math', ylabel='Scores'>

# Scatter plot of reading score (x-axis) against writing score (y-axis) for all students.
# Alternative using pyplot directly:
#   plt.scatter(data['reading score'], data['writing score'])
data.plot.scatter(x='reading score', y='writing score')

<Axes: xlabel='reading score', ylabel='writing score'>

import matplotlib.pyplot as plt

# Distinct gender labels present in the dataset (e.g. "male" and "female").
genders = data['gender'].unique()

# Draw one scatter series per gender so each group gets its own legend entry.
for gender_label in genders:
    is_this_gender = data['gender'].eq(gender_label)  # row mask for the current gender
    plt.scatter(
        data.loc[is_this_gender, 'reading score'],
        data.loc[is_this_gender, 'writing score'],
        label=gender_label,
    )

plt.xlabel('Reading Score')
plt.ylabel('Writing Score')
plt.legend()
plt.show()

import matplotlib.pyplot as plt

# Keep only the rows whose gender is 'male'.
male_data = data.loc[data['gender'].eq('male')]

# Reading vs. writing scores for the male students only.
plt.scatter(x=male_data['reading score'], y=male_data['writing score'])
plt.title('Reading vs Writing Scores (Male Students)')
plt.xlabel('Reading Score')
plt.ylabel('Writing Score')
plt.show()

import matplotlib.pyplot as plt

# Mean math score within each parental-education group.
grouped = data.groupby('parental level of education')['math score'].mean()

# Draw the group means as bars, then label the chart through the Axes that
# the plot call returns.
ax = grouped.plot.bar()
ax.set_xlabel('Parental Level of Education')
ax.set_ylabel('Average Math Score')
ax.set_title('Math Score by Parental Level of Education')
plt.show()

import matplotlib.pyplot as plt

# Chart label -> DataFrame column holding that subject's scores.
columns_by_label = {
    'Math Score': 'math score',
    'Reading Score': 'reading score',
    'Writing Score': 'writing score',
}

labels = list(columns_by_label)
# Average of each subject's scores, in the same order as the labels.
scores = [data[column].mean() for column in columns_by_label.values()]

# Pie chart of the three averages; each wedge is annotated with its share to one decimal place.
plt.pie(scores, labels=labels, autopct='%1.1f%%')
plt.title('Proportion of Average Scores')
plt.show()

	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
0	female	group B	bachelor's degree	standard	none	72	72	74
1	female	group C	some college	standard	completed	69	90	88
2	female	group B	master's degree	standard	none	90	95	93
3	male	group A	associate's degree	free/reduced	none	47	57	44
4	male	group C	some college	standard	none	76	78	75
...	...	...	...	...	...	...	...	...
995	female	group E	master's degree	standard	completed	88	99	95
996	male	group C	high school	free/reduced	none	62	55	55
997	female	group C	high school	free/reduced	completed	59	71	65
998	female	group D	some college	standard	completed	68	78	77
999	female	group D	some college	free/reduced	none	77	86	86

	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
0	female	group B	bachelor's degree	standard	none	72	72	74
1	female	group C	some college	standard	completed	69	90	88
2	female	group B	master's degree	standard	none	90	95	93
3	male	group A	associate's degree	free/reduced	none	47	57	44
4	male	group C	some college	standard	none	76	78	75

	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
995	female	group E	master's degree	standard	completed	88	99	95
996	male	group C	high school	free/reduced	none	62	55	55
997	female	group C	high school	free/reduced	completed	59	71	65
998	female	group D	some college	standard	completed	68	78	77
999	female	group D	some college	free/reduced	none	77	86	86

	math score	reading score	writing score
count	1000.00000	1000.000000	1000.000000
mean	66.08900	69.169000	68.054000
std	15.16308	14.600192	15.195657
min	0.00000	17.000000	10.000000
25%	57.00000	59.000000	57.750000
50%	66.00000	70.000000	69.000000
75%	77.00000	79.000000	79.000000
max	100.00000	100.000000	100.000000

	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
0	False	False	False	False	False	False	False	False
1	False	False	False	False	False	False	False	False
2	False	False	False	False	False	False	False	False
3	False	False	False	False	False	False	False	False
4	False	False	False	False	False	False	False	False
...	...	...	...	...	...	...	...	...
995	False	False	False	False	False	False	False	False
996	False	False	False	False	False	False	False	False
997	False	False	False	False	False	False	False	False
998	False	False	False	False	False	False	False	False
999	False	False	False	False	False	False	False	False

Tools

Data Visualization

Line graph

Students Performance