I have an environment with TensorFlow 2.17 and ROCm (AMD GPU). The training loss always becomes NaN after batch 120; training then resumes normally in the next epoch, again up to batch 120. I have changed the seed multiple times to rule out a data issue, clipped the gradient values, and re-installed the packages several times, all with no effect. The same code runs fine on Google Colab. Do you have any other ideas for finding where the problem occurs?
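One debugging step I am about to try, in case it helps: tf.debugging.enable_check_numerics() should make TF raise an InvalidArgumentError at the first op that produces a NaN or Inf and name the offending op, instead of just surfacing a NaN loss (I have not yet measured its overhead on ROCm):

import tensorflow as tf

# Fail fast at the first op whose output contains NaN/Inf; the op name
# is included in the error message.
tf.debugging.enable_check_numerics()

Minimal reproduction of my setup: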
import os
import datetime as dt
import pickle
import random
from types import SimpleNamespace

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import mixed_precision

# Enable memory growth on all visible GPUs (MI210s) before they are initialized.
MI210_list = tf.config.list_physical_devices('GPU')
for device in MI210_list:
    tf.config.experimental.set_memory_growth(device, True)

# Fix seeds for reproducibility.
tf.random.set_seed(42)
np.random.seed(42)
random.seed(9001)

# Global mixed-precision policy.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

# Synthetic stand-in data.
X_random = np.random.rand(2900, 12, 307)
y_random = np.random.rand(2900, 1, 290)
# Write 200 identical pickled (X, y) batches to disk.
pickle_dir = 'pickles_files'
if not os.path.exists(pickle_dir):
    os.makedirs(pickle_dir)

for i in range(200):
    pickle_file = os.path.join(pickle_dir, f'batch_{i}.pickle')
    with open(pickle_file, 'wb') as f:
        pickle.dump((X_random, y_random), f)
def load_batch(pickle_file):
    with open(pickle_file, 'rb') as f:
        X_batch, y_batch = pickle.load(f)
    return X_batch, y_batch

def dataset_generator():
    for i in range(200):
        pickle_file = os.path.join(pickle_dir, f'batch_{i}.pickle')
        X_batch, y_batch = load_batch(pickle_file)
        # Cast to float32 so the yielded arrays match output_signature below
        # (np.random.rand produces float64).
        yield X_batch.astype(np.float32), y_batch.astype(np.float32)
output_signature = (
    tf.TensorSpec(shape=(None, 12, 307), dtype=tf.float32),
    tf.TensorSpec(shape=(None, 1, 290), dtype=tf.float32),
)

dataset = tf.data.Dataset.from_generator(
    dataset_generator, output_signature=output_signature
).apply(tf.data.experimental.assert_cardinality(200))
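# Data sanity check I added while debugging (my own addition, not part of the
# original training script): scan every batch for NaN/Inf before training.
# tf.debugging.check_numerics raises InvalidArgumentError on the first bad value.
for step, (X_batch, y_batch) in enumerate(dataset):
    tf.debugging.check_numerics(X_batch, f'X at batch {step}')
    tf.debugging.check_numerics(y_batch, f'y at batch {step}')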
def create_simple_linear_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(12, 307), dtype=tf.float32),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(290, dtype=tf.float32),
        tf.keras.layers.Reshape((1, 290), dtype=tf.float32),
    ])
    return model
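# Debugging callback (a sketch of my own; the class name is hypothetical):
# after every batch, flag a non-finite loss and check all trainable weights,
# so I can tell whether the weights or the loss go bad first. I would pass it
# to fit() via callbacks=[NaNMonitor()] when debugging.
class NaNMonitor(tf.keras.callbacks.Callback):
    def on_train_batch_end(self, batch, logs=None):
        logs = logs or {}
        loss = logs.get('loss')
        if loss is not None and not np.isfinite(loss):
            print(f'Loss became non-finite at batch {batch}')
        for w in self.model.trainable_weights:
            if not bool(tf.reduce_all(tf.math.is_finite(tf.cast(w, tf.float32)))):
                print(f'Non-finite values in weight {w.name} at batch {batch}')
                self.model.stop_training = True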
strategy = tf.distribute.MirroredStrategy()

# Open a strategy scope.
with strategy.scope():
    itransformer = create_simple_linear_model()
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.0001,
        decay_steps=2500,
        decay_rate=0.98,
        staircase=True)
    itransformer.compile(
        loss=tf.keras.losses.MeanSquaredError(),
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=lr_schedule, amsgrad=True, clipvalue=1.0),
        metrics=[tf.keras.metrics.MeanAbsoluteError()])

# Train the model. The dataset already yields whole batches, so no batch_size
# is passed here (Keras rejects batch_size together with a tf.data.Dataset).
itransformer.fit(dataset, epochs=10)
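Two experiments I have queued up that might implicate the mixed-precision path on ROCm: (1) run once with the global policy forced to float32; if the NaN at batch 120 disappears, fp16 overflow or loss scaling is the prime suspect; (2) wrap the optimizer in an explicit dynamic LossScaleOptimizer so the scaling is under my control rather than whatever compile() applies automatically under mixed_float16. A sketch, assuming tf.keras.mixed_precision.LossScaleOptimizer is exported on this build:

# Experiment 1: disable mixed precision entirely for one run.
tf.keras.mixed_precision.set_global_policy('float32')

# Experiment 2: explicit dynamic loss scaling around the same Adam settings.
base_opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule,
                                    amsgrad=True, clipvalue=1.0)
scaled_opt = tf.keras.mixed_precision.LossScaleOptimizer(base_opt)

Would either of these help isolate whether this is a ROCm kernel issue rather than something in my code?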