i just started learning about H20 AutoMl, so i have this project i'm working on google colab, i'm trying to write a code for Machine Failure Prediction using a NASA Turbofan Jet Engine Data Set from , but when i run the AutoMl RMSE is not right, it ether return 0, close to zero 0.06, or values like this 5.72724e-05, i tried a lot of things but nothing worked, as i mentioned before i'm still learning, can someone check my code and explain to me what i should do? or just fix my code but add comments please because i want to understand my mistake, thanks.
Note: a friend sent the code to a person who claim to have a PHD, and that person sent back a screenshot after an hour showing 18 on RMSE, but when my friend asked for the code, the person requested a 2000$ for the code which i don't understand why? why so much? maybe he thought i need it for a master or phd thesis or something.
My code:
# Mount Google Drive to access the dataset
from google.colab import drive
drive.mount('/content/drive')
# Install necessary packages
!pip install h2o pandas numpy scikit-learn matplotlib seaborn
# Import required libraries
import h2o
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from h2o.automl import H2OAutoML
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Initialize H2O
h2o.init()
# Define base path for dataset folder
dataset_path = "/content/drive/MyDrive/CMaps/"
# Function to load and preprocess the dataset
def load_dataset(file_path, rul_file=None, is_train=True):
"""Loads and preprocesses the dataset.
Args:
file_path (str): Path to the dataset file.
rul_file (str, optional): Path to the RUL file (for test data). Defaults to None.
is_train (bool, optional): Whether it's training data. Defaults to True.
Returns:
pandas.DataFrame: The loaded and preprocessed dataframe.
"""
# Define column names
columns = ["unit_number", "time_in_cycles", "operational_setting_1", "operational_setting_2", "operational_setting_3"] + \
[f"sensor_{i}" for i in range(1, 22)] # 21 sensors
# Load data into Pandas DataFrame
df = pd.read_csv(file_path, sep=" ", header=None, names=columns, engine="python")
# Replace missing values (NaN) with 0
df = df.fillna(0) # Replace NaN with 0
# Calculate Remaining Useful Life (RUL) for each engine
max_cycles = df.groupby("unit_number")["time_in_cycles"].max()
df["RUL"] = df.apply(lambda row: max_cycles[row["unit_number"]] - row["time_in_cycles"], axis=1)
return df
# Load training and test data
train_file = dataset_path + "train_FD001.txt"
test_file = dataset_path + "test_FD001.txt"
rul_file = dataset_path + "RUL_FD001.txt" # Corrected to match the actual filename
train_df = load_dataset(train_file, is_train=True)
test_df = load_dataset(test_file, rul_file, is_train=False)
# Check if data is loaded correctly
print(train_df.head())
print(test_df.head())
# Define path to the training dataset file (update to the correct path)
file_path = "/content/drive/MyDrive/CMaps/train_FD001.txt" # Update to the correct path
# Define column names for the dataset
columns = ["unit_number", "time_in_cycles", "operational_setting_1", "operational_setting_2", "operational_setting_3"] + \
[f"sensor_{i}" for i in range(1, 22)] # 21 sensors
# Load data into Pandas DataFrame
df = pd.read_csv(file_path, sep=" ", header=None, names=columns, engine="python")
# Remove empty columns (if any) due to formatting issues
df = df.dropna(axis=1, how="all")
# Calculate Remaining Useful Life (RUL) for each engine
max_cycles = df.groupby("unit_number")["time_in_cycles"].max()
df["RUL"] = df.apply(lambda row: max_cycles[row["unit_number"]] - row["time_in_cycles"], axis=1)
# Loading Data
df = pd.read_csv(file_path, sep=" ", header=None, names=columns, engine="python")
# Replace missing values (NaN) with 0 instead of removing rows/columns
df = df.fillna(0) # Replace NaN with 0
# Calculate Remaining Useful Life (RUL) again for each engine after filling missing values
max_cycles = df.groupby("unit_number")["time_in_cycles"].max()
df["RUL"] = df.apply(lambda row: max_cycles[row["unit_number"]] - row["time_in_cycles"], axis=1)
# Select relevant sensors for the analysis
selected_sensors = [
"sensor_2", "sensor_3", "sensor_4", "sensor_7", "sensor_8",
"sensor_9", "sensor_11", "sensor_12", "sensor_13", "sensor_14",
"sensor_15", "sensor_17", "sensor_20", "sensor_21"
]
# Define features and target variable
features = ["time_in_cycles"] + selected_sensors
X = df[features]
y = df["RUL"]
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Convert the data into H2OFrame format
train_h2o = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
test_h2o = h2o.H2OFrame(pd.concat([X_test, y_test], axis=1))
# Define input columns and target column
target = "RUL"
features = X_train.columns.tolist()
# Initialize AutoML and train the model
aml = H2OAutoML(max_models=20, seed=42, max_runtime_secs=600) # You can adjust max_runtime_secs as per your preference
aml.train(x=features, y=target, training_frame=train_h2o, validation_frame=test_h2o)
# Check the leaderboard to view the models' performance
leaderboard = aml.leaderboard
print(leaderboard)