import pandas as pd
import matplotlib.pyplot as plt

# Read the student habits / performance data
df = pd.read_csv('datasets/enhanced_student_habits_performance_dataset.csv')

# Scatter plot of daily study hours (x) against exam score (y),
# drawn through an explicit Axes object on an 8x5 inch figure.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df['study_hours_per_day'], df['exam_score'])
ax.set_xlabel('Study Hours Per Day')
ax.set_ylabel('Exam Score')
ax.set_title('Study Hours vs Exam Score', fontsize=13)

# Render the figure
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the dataset
df = pd.read_csv('datasets/enhanced_student_habits_performance_dataset.csv')

# Keep only the numeric columns. student_id is an identifier rather than a
# measurement, so it is excluded from the correlation analysis; errors='ignore'
# makes the drop a no-op when the column is absent or was not numeric.
numeric_df = df.select_dtypes(include=np.number).drop(
    columns=['student_id'], errors='ignore'
)

# Calculate the pairwise correlation matrix
corr = numeric_df.corr()

# Draw the matrix as an image. Correlations are bounded by [-1, 1]; pinning the
# colour limits to that range keeps the neutral midpoint of the diverging
# 'coolwarm' map at zero, instead of at the midpoint of whatever value range
# happens to occur in the data.
plt.figure(figsize=(16, 12))
plt.imshow(corr, cmap='coolwarm', vmin=-1, vmax=1)

# Add color bar to indicate the scale
plt.colorbar()

# Label both axes with the column names; x labels are rotated vertically
# so that they do not overlap.
labels = corr.columns
tick_positions = np.arange(len(labels))
plt.xticks(tick_positions, labels, rotation=90)
plt.yticks(tick_positions, labels)

# Show the correlation value in each cell (row index -> y, column index -> x)
for (row, col), value in np.ndenumerate(corr.to_numpy()):
    plt.text(col, row, f"{value:.2f}", ha="center", va="center")

# Title and layout
plt.title("Correlation Heatmap of Student Performance Variables")
plt.tight_layout()

# Optional: uncomment to save the image (must run before plt.show())
#plt.savefig("correlation_heatmap.png")

# Show the plot
plt.show()

import pandas as pd
import matplotlib.pyplot as plt

# Read the student habits / performance data
df = pd.read_csv('datasets/enhanced_student_habits_performance_dataset.csv')

# Scatter plot of daily social media hours (x) against exam score (y)
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df['social_media_hours'], df['exam_score'])
ax.set(xlabel='Social Media Hours', ylabel='Exam Score')
ax.set_title('Social Media Hours vs Exam Score', fontsize=13)

# Render the figure
plt.show()

import pandas as pd
import matplotlib.pyplot as plt

# Read the student habits / performance data
df = pd.read_csv('datasets/enhanced_student_habits_performance_dataset.csv')

# Draw 50 rows at random; the fixed random_state makes the draw repeatable
sample_df = df.sample(n=50, random_state=42)

# Scatter plot of previous GPA (x) against exam score (y) for the sample only
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(sample_df['previous_gpa'], sample_df['exam_score'])
ax.set_xlabel('Previous GPA')
ax.set_ylabel('Exam Score')
ax.set_title('Sampled (50) Previous GPA vs Exam Score', fontsize=13)

# Render the figure
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load your dataset
df = pd.read_csv('datasets/enhanced_student_habits_performance_dataset.csv')

# Select only 100 random data points (fixed random_state => reproducible sample)
df_sample = df.sample(n=100, random_state=42)

# Variables to compare with exam_score
variables = [
    'study_hours_per_day',
    'social_media_hours',
    'netflix_hours',
    'previous_gpa',
    'stress_level',
    'motivation_level',
    'exam_anxiety_score',
    'social_activity',
]

# exam_score is the y-axis of every panel, so convert it once up front.
# errors='coerce' turns any non-numeric entry into NaN instead of raising.
exam_score = pd.to_numeric(df_sample['exam_score'], errors='coerce')

# Create a grid of subplots (2 rows x 4 columns), flattened for 1-D iteration
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()

for ax, var in zip(axes, variables):

    # Coerce the predictor to numeric as well, then keep only the rows
    # where both the predictor and the exam score are present.
    x = pd.to_numeric(df_sample[var], errors='coerce')
    mask = x.notna() & exam_score.notna()
    x = x[mask]
    y = exam_score[mask]

    # Scatter plot
    ax.scatter(x, y, alpha=0.6)

    # Trend line. A degree-1 fit is only well defined with at least two
    # distinct x values; with a single distinct value the least-squares
    # problem is rank deficient, so the line is skipped in that case.
    if x.nunique() > 1:
        m, b = np.polyfit(x, y, 1)
        # A straight line is fully determined by its two end points.
        x_ends = np.array([x.min(), x.max()])
        ax.plot(x_ends, m * x_ends + b)

    # Labels & title
    ax.set_title(f"{var} vs exam_score", fontsize=11)
    ax.set_xlabel(var)
    ax.set_ylabel("exam_score")

plt.tight_layout()
plt.show()

Parameter	Description
student_id	Unique student identifier
age	Age of the student (16 to 28)
gender	Male, Female, or Other
major	Field of study (e.g., Computer Science, Engineering, Arts)
study_hours_per_day	Average hours studied daily
social_media_hours	Daily hours spent on social media
netflix_hours	Daily hours spent watching Netflix/streaming
screen_time	Total daily screen time across devices
part_time_job	Whether the student has a job (Yes/No)
attendance_percentage	Academic attendance in percentage
sleep_hours	Average hours of sleep per night
exercise_frequency	How often the student exercises
diet_quality	Perceived quality of the student's diet
mental_health_rating	Mental health score (1 to 10)
stress_level	Stress rating (1 to 10)
exam_anxiety_score	Exam anxiety level (1 to 10)
extracurricular_participation	Participation in extracurricular activities
access_to_tutoring	Whether the student has access to tutoring
family_income_range	Student's family income range
parental_support_level	Degree of support from parents
parental_education_level	Highest education level of parents
motivation_level	Motivation rating (1 to 10)
time_management_score	Time management ability (1 to 10)
learning_style	Preferred learning method
study_environment	Common location where the student studies
dropout_risk	Yes/No: derived from stress and motivation levels
previous_gpa	Student's previous GPA
exam_score	Target or actual exam score

Week 1: Introductory Session

Assignment for Week 1

Identifying the data set

Parameter Names

Representation of Data