< Home
Tools¶
Data Visualization¶
In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
In [3]:
# Load the students-performance CSV into a DataFrame and display it.
# `data` is the frame that every later cell in this notebook reads from.
csv_path = "~/work/kelzang-wangdi/datasets/StudentsPerformance.csv"
data = pd.read_csv(csv_path)
data
Out[3]:
| | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score |
|---|---|---|---|---|---|---|---|---|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
| 2 | female | group B | master's degree | standard | none | 90 | 95 | 93 |
| 3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 |
| 4 | male | group C | some college | standard | none | 76 | 78 | 75 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | female | group E | master's degree | standard | completed | 88 | 99 | 95 |
| 996 | male | group C | high school | free/reduced | none | 62 | 55 | 55 |
| 997 | female | group C | high school | free/reduced | completed | 59 | 71 | 65 |
| 998 | female | group D | some college | standard | completed | 68 | 78 | 77 |
| 999 | female | group D | some college | free/reduced | none | 77 | 86 | 86 |
1000 rows × 8 columns
In [4]:
# Preview the first five rows to confirm the columns loaded as expected.
data.head(n=5)
Out[4]:
| | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score |
|---|---|---|---|---|---|---|---|---|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
| 2 | female | group B | master's degree | standard | none | 90 | 95 | 93 |
| 3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 |
| 4 | male | group C | some college | standard | none | 76 | 78 | 75 |
In [5]:
# Preview the last five rows to confirm the file was read through to the end.
data.tail(n=5)
Out[5]:
| | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score |
|---|---|---|---|---|---|---|---|---|
| 995 | female | group E | master's degree | standard | completed | 88 | 99 | 95 |
| 996 | male | group C | high school | free/reduced | none | 62 | 55 | 55 |
| 997 | female | group C | high school | free/reduced | completed | 59 | 71 | 65 |
| 998 | female | group D | some college | standard | completed | 68 | 78 | 77 |
| 999 | female | group D | some college | free/reduced | none | 77 | 86 | 86 |
In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric score columns.
data.describe()
Out[6]:
| | math score | reading score | writing score |
|---|---|---|---|
| count | 1000.00000 | 1000.000000 | 1000.000000 |
| mean | 66.08900 | 69.169000 | 68.054000 |
| std | 15.16308 | 14.600192 | 15.195657 |
| min | 0.00000 | 17.000000 | 10.000000 |
| 25% | 57.00000 | 59.000000 | 57.750000 |
| 50% | 66.00000 | 70.000000 | 69.000000 |
| 75% | 77.00000 | 79.000000 | 79.000000 |
| max | 100.00000 | 100.000000 | 100.000000 |
In [7]:
# List the column labels of the frame (exact spelling is needed for bracket indexing).
data.columns
Out[7]:
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
'test preparation course', 'math score', 'reading score',
'writing score'],
dtype='object')
In [8]:
# Print the index range, the non-null count and dtype of each column, and the memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64
 6   reading score                1000 non-null   int64
 7   writing score                1000 non-null   int64
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
In [10]:
# Select the gender column as a Series.
data['gender']
Out[10]:
0 female
1 female
2 female
3 male
4 male
...
995 female
996 male
997 female
998 female
999 female
Name: gender, Length: 1000, dtype: object
In [11]:
# Count how many rows carry each gender label.
data['gender'].value_counts()
Out[11]:
gender
female    518
male      482
Name: count, dtype: int64
In [12]:
# Boolean frame of the same shape as `data`: True wherever a value is missing.
# `isna` is the method that `isnull` is an alias for.
data.isna()
Out[12]:
| | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score |
|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | False | False | False | False | False | False | False | False |
| 996 | False | False | False | False | False | False | False | False |
| 997 | False | False | False | False | False | False | False | False |
| 998 | False | False | False | False | False | False | False | False |
| 999 | False | False | False | False | False | False | False | False |
1000 rows × 8 columns
In [21]:
# Line plot of the score columns against the row index (one line per column);
# pandas draws only the numeric columns of the frame.
data.plot.line(
    title='Students Performance',
    xlabel='No. of Students',
    ylabel='Scores',
)
Out[21]:
<Axes: title={'center': 'Students Performance'}, xlabel='No. of Students', ylabel='Scores'>
In [23]:
# Compare the three subjects by their mean score.
# Plotting the raw frame as bars draws one bar group per student (1000 groups)
# along the row index, which is unreadable and does not match the
# 'Reading/writing/Math' x-axis label. Averaging first gives one bar per subject.
subject_means = data.mean(numeric_only=True)  # numeric_only skips the text columns
subject_means.plot(
    kind='bar',
    title='Students Performance',
    xlabel='Reading/writing/Math',
    ylabel='Scores',
    rot=0,  # keep the three subject names horizontal
)
Out[23]:
<Axes: title={'center': 'Students Performance'}, xlabel='Reading/writing/Math', ylabel='Scores'>
In [25]:
# Reading score against writing score, one point per student.
# Equivalent pyplot call: plt.scatter(data['reading score'], data['writing score'])
data.plot.scatter(x='reading score', y='writing score')
Out[25]:
<Axes: xlabel='reading score', ylabel='writing score'>
In [26]:
# Reading vs writing scores, drawn as one scatter series per gender.
# pyplot is already imported as `plt` in the notebook's import cell.
genders = data['gender'].unique()  # distinct gender labels, in order of first appearance
for gender_label in genders:
    # Keep only the rows that belong to this gender.
    gender_rows = data.loc[data['gender'] == gender_label]
    # Partial transparency keeps the earlier series visible where the clouds overlap.
    plt.scatter(
        gender_rows['reading score'],
        gender_rows['writing score'],
        label=gender_label,
        alpha=0.6,
    )
plt.xlabel('Reading Score')
plt.ylabel('Writing Score')
plt.title('Reading vs Writing Scores by Gender')
plt.legend()
plt.show()
In [29]:
# Reading vs writing scores for male students only.
# pyplot is already imported as `plt` in the notebook's import cell.
male_data = data.loc[data['gender'] == 'male']

# Explicit figure/axes so the labels are set on the axes that was drawn on.
fig, ax = plt.subplots()
ax.scatter(male_data['reading score'], male_data['writing score'])
ax.set_xlabel('Reading Score')
ax.set_ylabel('Writing Score')
ax.set_title('Reading vs Writing Scores (Male Students)')
plt.show()
In [30]:
# Average math score for each parental education level.
# pyplot is already imported as `plt` in the notebook's import cell.
# Sorting by the mean makes the bars read low-to-high rather than in
# alphabetical label order.
grouped = (
    data.groupby('parental level of education')['math score']
    .mean()
    .sort_values()
)

# Bar chart on an explicit axes; the labels below override the pandas defaults.
fig, ax = plt.subplots()
grouped.plot(kind='bar', ax=ax)
ax.set_xlabel('Parental Level of Education')
ax.set_ylabel('Average Math Score')
ax.set_title('Math Score by Parental Level of Education')
plt.show()
In [32]:
# Pie chart of the three average scores, each shown as a share of their sum.
# pyplot is already imported as `plt` in the notebook's import cell.
score_columns = ['math score', 'reading score', 'writing score']

# Values and labels both come from the same column list, so they cannot get
# out of step with each other.
scores = data[score_columns].mean().tolist()
labels = [column.title() for column in score_columns]  # e.g. 'math score' -> 'Math Score'

fig, ax = plt.subplots()
ax.pie(scores, labels=labels, autopct='%1.1f%%')
ax.set_title('Proportion of Average Scores')
plt.show()