[Kelzang Wangdi] - Fab Futures - Data Science
Home About

< Home

Tools¶

Data Visualization¶

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
In [3]:
# Load the student-performance CSV; the leading "~" refers to the user's home directory.
csv_path = "~/work/kelzang-wangdi/datasets/StudentsPerformance.csv"
data = pd.read_csv(csv_path)
# Bare expression so the notebook renders the (truncated) frame.
data
Out[3]:
gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score
0 female group B bachelor's degree standard none 72 72 74
1 female group C some college standard completed 69 90 88
2 female group B master's degree standard none 90 95 93
3 male group A associate's degree free/reduced none 47 57 44
4 male group C some college standard none 76 78 75
... ... ... ... ... ... ... ... ...
995 female group E master's degree standard completed 88 99 95
996 male group C high school free/reduced none 62 55 55
997 female group C high school free/reduced completed 59 71 65
998 female group D some college standard completed 68 78 77
999 female group D some college free/reduced none 77 86 86

1000 rows × 8 columns

In [4]:
# Preview the first rows of the frame (five rows with the default argument).
data.head()
Out[4]:
gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score
0 female group B bachelor's degree standard none 72 72 74
1 female group C some college standard completed 69 90 88
2 female group B master's degree standard none 90 95 93
3 male group A associate's degree free/reduced none 47 57 44
4 male group C some college standard none 76 78 75
In [5]:
# Preview the last rows of the frame (five rows with the default argument).
data.tail()
Out[5]:
gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score
995 female group E master's degree standard completed 88 99 95
996 male group C high school free/reduced none 62 55 55
997 female group C high school free/reduced completed 59 71 65
998 female group D some college standard completed 68 78 77
999 female group D some college free/reduced none 77 86 86
In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the three numeric score columns.
data.describe()
Out[6]:
math score reading score writing score
count 1000.00000 1000.000000 1000.000000
mean 66.08900 69.169000 68.054000
std 15.16308 14.600192 15.195657
min 0.00000 17.000000 10.000000
25% 57.00000 59.000000 57.750000
50% 66.00000 70.000000 69.000000
75% 77.00000 79.000000 79.000000
max 100.00000 100.000000 100.000000
In [7]:
# List the column labels of the frame.
data.columns
Out[7]:
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')
In [8]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
In [10]:
# Select the gender column as a Series (bracket access, as in the value_counts cell).
data['gender']
Out[10]:
0      female
1      female
2      female
3        male
4        male
        ...  
995    female
996      male
997    female
998    female
999    female
Name: gender, Length: 1000, dtype: object
In [11]:
# Count how many rows carry each distinct gender value.
data['gender'].value_counts()
Out[11]:
gender
female    518
male      482
Name: count, dtype: int64
In [12]:
# Element-wise missing-value mask: True wherever a cell is null.
# NOTE(review): data.isnull().sum() would condense this into one count per column.
data.isnull()
Out[12]:
gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score
0 False False False False False False False False
1 False False False False False False False False
2 False False False False False False False False
3 False False False False False False False False
4 False False False False False False False False
... ... ... ... ... ... ... ... ...
995 False False False False False False False False
996 False False False False False False False False
997 False False False False False False False False
998 False False False False False False False False
999 False False False False False False False False

1000 rows × 8 columns

Line graph¶

Students Performance¶

In [21]:
# Line plot of the numeric score columns against the row index (one line per column).
data.plot.line(
    title='Students Performance',
    xlabel='No. of Students',
    ylabel='Scores',
)
Out[21]:
<Axes: title={'center': 'Students Performance'}, xlabel='No. of Students', ylabel='Scores'>
Figure: line plot titled "Students Performance" showing the math, reading and writing scores against the student row index.
In [23]:
# Bar chart of the average score per subject.
# Plotting the raw frame with kind='bar' draws three bars for each of the 1000
# rows against the row index, which is unreadable and contradicts the x label;
# aggregating to one mean per subject puts the subjects on the x-axis.
# Columns are listed in the order named in the x label (reading, writing, math).
subject_columns = ['reading score', 'writing score', 'math score']
data[subject_columns].mean().plot(kind = 'bar', title = 'Students Performance', xlabel = 'Reading/writing/Math', ylabel = 'Scores')
Out[23]:
<Axes: title={'center': 'Students Performance'}, xlabel='Reading/writing/Math', ylabel='Scores'>
No description has been provided for this image
In [25]:
# Scatter plot of writing score against reading score for all students.
# Alternative using pyplot directly: plt.scatter(data['reading score'], data['writing score'])
data.plot.scatter(x='reading score', y='writing score')
Out[25]:
<Axes: xlabel='reading score', ylabel='writing score'>
Figure: scatter plot of writing score against reading score for all students.
In [26]:
import matplotlib.pyplot as plt

# Draw one scatter series per gender so each group gets its own colour and legend entry.
genders = data['gender'].unique()  # distinct gender labels, in order of first appearance

fig, ax = plt.subplots()
for g in genders:
    subset = data.loc[data['gender'] == g]  # keep only the rows for this gender
    ax.scatter(subset['reading score'], subset['writing score'], label=g)

ax.set_xlabel('Reading Score')
ax.set_ylabel('Writing Score')
ax.legend()
plt.show()
Figure: scatter plot of writing score against reading score, with one coloured series per gender and a legend.
In [29]:
import matplotlib.pyplot as plt

# Reading vs. writing scores restricted to the rows whose gender is 'male'.
male_data = data.loc[data['gender'] == 'male']

fig, ax = plt.subplots()
ax.scatter(male_data['reading score'], male_data['writing score'])
ax.set(
    xlabel='Reading Score',
    ylabel='Writing Score',
    title='Reading vs Writing Scores (Male Students)',
)
plt.show()
Figure: scatter plot of writing score against reading score for male students only.
In [30]:
import matplotlib.pyplot as plt

# Mean math score within each parental education level
education_groups = data.groupby('parental level of education')
grouped = education_groups['math score'].mean()

# One bar per education level; label the Axes returned by the pandas plot call
ax = grouped.plot.bar()
ax.set_xlabel('Parental Level of Education')
ax.set_ylabel('Average Math Score')
ax.set_title('Math Score by Parental Level of Education')
plt.show()
Figure: bar chart of the average math score for each parental level of education.
In [32]:
import matplotlib.pyplot as plt

# Mean of each score column, in the same order as the wedge labels below
score_columns = ['math score', 'reading score', 'writing score']
scores = [data[column].mean() for column in score_columns]

labels = ['Math Score', 'Reading Score', 'Writing Score']

# Pie chart; autopct annotates each wedge with its share of the summed means
fig, ax = plt.subplots()
ax.pie(scores, labels=labels, autopct='%1.1f%%')
ax.set_title('Proportion of Average Scores')
plt.show()
Figure: pie chart titled "Proportion of Average Scores" with one wedge each for the mean math, reading and writing scores.