import pandas as pd

# Import the dataset: parse the factory sensor CSV into a DataFrame and echo it
# (pandas abbreviates the printout of a frame this large).
df = pd.read_csv(filepath_or_buffer="datasets/factory_sensor_simulator_2040.csv")
print(df)

       Machine_ID        Machine_Type  Installation_Year  Operational_Hours  \
0       MC_000000               Mixer               2027              81769   
1       MC_000001  Industrial_Chiller               2032              74966   
2       MC_000002      Pick_and_Place               2003              94006   
3       MC_000003       Vision_System               2007              76637   
4       MC_000004      Shuttle_System               2016              20870   
...           ...                 ...                ...                ...   
499995  MC_499995       Vacuum_Packer               2011              14425   
499996  MC_499996       Conveyor_Belt               2003              75501   
499997  MC_499997                 CMM               2039              19855   
499998  MC_499998               Dryer               2035              86823   
499999  MC_499999  Industrial_Chiller               2021              52505   

        Temperature_C  Vibration_mms  Sound_dB  Oil_Level_pct  \
0               73.43          12.78     83.72          36.76   
1               58.32          14.99     77.04         100.00   
2               49.63          23.78     69.08          42.96   
3               63.73          12.38     85.58          94.90   
4               42.77           4.42     96.72          47.56   
...               ...            ...       ...            ...   
499995          65.42          16.50     81.95          59.21   
499996          44.83          12.88     64.94          73.69   
499997          37.26          11.46     70.70          70.70   
499998          67.72          16.76     77.45          97.00   
499999          46.01           8.94     67.88          51.11   

        Coolant_Level_pct  Power_Consumption_kW  ...  Failure_History_Count  \
0                   68.74                 84.95  ...                      5   
1                   62.13                154.61  ...                      2   
2                   35.96                 51.90  ...                      1   
3                   48.94                 75.61  ...                      1   
4                   53.78                224.93  ...                      2   
...                   ...                   ...  ...                    ...   
499995              73.67                255.87  ...                      3   
499996              29.25                198.37  ...                      1   
499997              49.04                156.59  ...                      2   
499998              15.40                132.33  ...                      2   
499999              31.90                138.69  ...                      1   

        AI_Supervision  Error_Codes_Last_30_Days  Remaining_Useful_Life_days  \
0                 True                         3                       162.0   
1                 True                         4                       147.0   
2                 True                         6                         0.0   
3                False                         4                       161.0   
4                False                         1                       765.0   
...                ...                       ...                         ...   
499995           False                         0                       820.0   
499996           False                         4                        34.0   
499997           False                         4                       815.0   
499998            True                         0                        99.0   
499999           False                         4                       489.0   

        Failure_Within_7_Days  Laser_Intensity  Hydraulic_Pressure_bar  \
0                       False              NaN                     NaN   
1                       False              NaN                     NaN   
2                        True              NaN                     NaN   
3                       False              NaN                     NaN   
4                       False              NaN                     NaN   
...                       ...              ...                     ...   
499995                  False              NaN                     NaN   
499996                  False              NaN                     NaN   
499997                  False              NaN                     NaN   
499998                  False              NaN                     NaN   
499999                  False              NaN                     NaN   

        Coolant_Flow_L_min  Heat_Index  AI_Override_Events  
0                      NaN         NaN                   2  
1                    40.92         NaN                   2  
2                      NaN         NaN                   2  
3                      NaN         NaN                   0  
4                      NaN         NaN                   0  
...                    ...         ...                 ...  
499995                 NaN         NaN                   0  
499996                 NaN         NaN                   0  
499997                 NaN         NaN                   0  
499998                 NaN         NaN                   2  
499999               41.09         NaN                   0  

[500000 rows x 22 columns]

# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
#from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score
#from sklearn.impute import SimpleImputer
#from sklearn.decomposition import PCA
#from sklearn.cluster import KMeans
#import shap
#import lightgbm as lgb
#import xgboost as xgb
#import warnings
#warnings.filterwarnings("ignore")

import os
# List the entries of the local "datasets" folder
dataset_entries = os.listdir("datasets")
print(dataset_entries)

['olympic_athlete_events.csv', 'factory_sensor_simulator_2040.csv', 'olympic_noc_regions.csv', '.gitignore']

# Load data
df = pd.read_csv("datasets/factory_sensor_simulator_2040.csv")

# Basic overview: banner first, then the (rows, columns) shape, the per-column
# dtypes and the leading rows, each on its own line
print("\n--- Dataset Overview ---")
print(df.shape, df.dtypes, df.head(), sep="\n")

--- Dataset Overview ---
(500000, 22)
Machine_ID                     object
Machine_Type                   object
Installation_Year               int64
Operational_Hours               int64
Temperature_C                 float64
Vibration_mms                 float64
Sound_dB                      float64
Oil_Level_pct                 float64
Coolant_Level_pct             float64
Power_Consumption_kW          float64
Last_Maintenance_Days_Ago       int64
Maintenance_History_Count       int64
Failure_History_Count           int64
AI_Supervision                   bool
Error_Codes_Last_30_Days        int64
Remaining_Useful_Life_days    float64
Failure_Within_7_Days            bool
Laser_Intensity               float64
Hydraulic_Pressure_bar        float64
Coolant_Flow_L_min            float64
Heat_Index                    float64
AI_Override_Events              int64
dtype: object
  Machine_ID        Machine_Type  Installation_Year  Operational_Hours  \
0  MC_000000               Mixer               2027              81769   
1  MC_000001  Industrial_Chiller               2032              74966   
2  MC_000002      Pick_and_Place               2003              94006   
3  MC_000003       Vision_System               2007              76637   
4  MC_000004      Shuttle_System               2016              20870   

   Temperature_C  Vibration_mms  Sound_dB  Oil_Level_pct  Coolant_Level_pct  \
0          73.43          12.78     83.72          36.76              68.74   
1          58.32          14.99     77.04         100.00              62.13   
2          49.63          23.78     69.08          42.96              35.96   
3          63.73          12.38     85.58          94.90              48.94   
4          42.77           4.42     96.72          47.56              53.78   

   Power_Consumption_kW  ...  Failure_History_Count  AI_Supervision  \
0                 84.95  ...                      5            True   
1                154.61  ...                      2            True   
2                 51.90  ...                      1            True   
3                 75.61  ...                      1           False   
4                224.93  ...                      2           False   

   Error_Codes_Last_30_Days  Remaining_Useful_Life_days  \
0                         3                       162.0   
1                         4                       147.0   
2                         6                         0.0   
3                         4                       161.0   
4                         1                       765.0   

   Failure_Within_7_Days  Laser_Intensity  Hydraulic_Pressure_bar  \
0                  False              NaN                     NaN   
1                  False              NaN                     NaN   
2                   True              NaN                     NaN   
3                  False              NaN                     NaN   
4                  False              NaN                     NaN   

   Coolant_Flow_L_min  Heat_Index  AI_Override_Events  
0                 NaN         NaN                   2  
1               40.92         NaN                   2  
2                 NaN         NaN                   2  
3                 NaN         NaN                   0  
4                 NaN         NaN                   0  

[5 rows x 22 columns]

# Null values: count missing entries per column, most-affected columns first
missing_per_column = df.isnull().sum()
print("\n--- Null Value Count ---")
print(missing_per_column.sort_values(ascending=False))

--- Null Value Count ---
Laser_Intensity               484844
Hydraulic_Pressure_bar        469660
Heat_Index                    454786
Coolant_Flow_L_min            454376
Machine_ID                         0
Machine_Type                       0
Failure_Within_7_Days              0
Remaining_Useful_Life_days         0
Error_Codes_Last_30_Days           0
AI_Supervision                     0
Failure_History_Count              0
Maintenance_History_Count          0
Last_Maintenance_Days_Ago          0
Power_Consumption_kW               0
Coolant_Level_pct                  0
Oil_Level_pct                      0
Sound_dB                           0
Vibration_mms                      0
Temperature_C                      0
Operational_Hours                  0
Installation_Year                  0
AI_Override_Events                 0
dtype: int64

# Target distribution: 50-bin histogram of the Remaining_Useful_Life_days
# column with a KDE curve overlaid, drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df["Remaining_Useful_Life_days"], bins=50, kde=True, ax=ax)
ax.set_title("Remaining Useful Life (RUL) Distribution")
ax.set_xlabel("Days")
ax.set_ylabel("Machine Count")
ax.grid(True)
fig.tight_layout()
plt.show()

Week 2: Tools

Class memo

Assignment: "Visualize your data set(s)"