I have an environment with TensorFlow 2.17 and ROCm (AMD GPU). The training loss always becomes NaN after batch 120; training then resumes normally in the next epoch, again up to batch 120. I have changed the seed multiple times to rule out a data issue, clipped the gradient values, and re-installed the packages several times, all with no effect. The same code runs fine on Google Colab. Do you have any other ideas for finding where the problem occurs?
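One debugging step I am about to try, in case it helps: tf.debugging.enable_check_numerics() should make TF raise an InvalidArgumentError at the first op that produces a NaN or Inf and name the offending op, instead of just surfacing a NaN loss (I have not yet measured its overhead on ROCm):

import tensorflow as tf

# Fail fast at the first op whose output contains NaN/Inf; the op name
# is included in the error message.
tf.debugging.enable_check_numerics()

Minimal reproduction of my setup: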
import os
import datetime as dt
import pickle
import random
from types import SimpleNamespace

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import mixed_precision

# Enable memory growth on all visible GPUs (MI210s) before they are initialized.
MI210_list = tf.config.list_physical_devices('GPU')
for device in MI210_list:
    tf.config.experimental.set_memory_growth(device, True)

# Fix seeds for reproducibility.
tf.random.set_seed(42)
np.random.seed(42)
random.seed(9001)

# Global mixed-precision policy.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

# Synthetic stand-in data.
X_random = np.random.rand(2900, 12, 307)
y_random = np.random.rand(2900, 1, 290)
# Write 200 identical pickled (X, y) batches to disk.
pickle_dir = 'pickles_files'
if not os.path.exists(pickle_dir):
    os.makedirs(pickle_dir)

for i in range(200):
    pickle_file = os.path.join(pickle_dir, f'batch_{i}.pickle')
    with open(pickle_file, 'wb') as f:
        pickle.dump((X_random, y_random), f)
def load_batch(pickle_file):
    with open(pickle_file, 'rb') as f:
        X_batch, y_batch = pickle.load(f)
    return X_batch, y_batch

def dataset_generator():
    for i in range(200):
        pickle_file = os.path.join(pickle_dir, f'batch_{i}.pickle')
        X_batch, y_batch = load_batch(pickle_file)
        # Cast to float32 so the yielded arrays match output_signature below
        # (np.random.rand produces float64).
        yield X_batch.astype(np.float32), y_batch.astype(np.float32)
output_signature = (
    tf.TensorSpec(shape=(None, 12, 307), dtype=tf.float32),
    tf.TensorSpec(shape=(None, 1, 290), dtype=tf.float32),
)

dataset = tf.data.Dataset.from_generator(
    dataset_generator, output_signature=output_signature
).apply(tf.data.experimental.assert_cardinality(200))
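# Data sanity check I added while debugging (my own addition, not part of the
# original training script): scan every batch for NaN/Inf before training.
# tf.debugging.check_numerics raises InvalidArgumentError on the first bad value.
for step, (X_batch, y_batch) in enumerate(dataset):
    tf.debugging.check_numerics(X_batch, f'X at batch {step}')
    tf.debugging.check_numerics(y_batch, f'y at batch {step}')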
def create_simple_linear_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(12, 307), dtype=tf.float32),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(290, dtype=tf.float32),
        tf.keras.layers.Reshape((1, 290), dtype=tf.float32),
    ])
    return model
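# Debugging callback (a sketch of my own; the class name is hypothetical):
# after every batch, flag a non-finite loss and check all trainable weights,
# so I can tell whether the weights or the loss go bad first. I would pass it
# to fit() via callbacks=[NaNMonitor()] when debugging.
class NaNMonitor(tf.keras.callbacks.Callback):
    def on_train_batch_end(self, batch, logs=None):
        logs = logs or {}
        loss = logs.get('loss')
        if loss is not None and not np.isfinite(loss):
            print(f'Loss became non-finite at batch {batch}')
        for w in self.model.trainable_weights:
            if not bool(tf.reduce_all(tf.math.is_finite(tf.cast(w, tf.float32)))):
                print(f'Non-finite values in weight {w.name} at batch {batch}')
                self.model.stop_training = True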
strategy = tf.distribute.MirroredStrategy()

# Open a strategy scope.
with strategy.scope():
    itransformer = create_simple_linear_model()
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.0001,
        decay_steps=2500,
        decay_rate=0.98,
        staircase=True)
    itransformer.compile(
        loss=tf.keras.losses.MeanSquaredError(),
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=lr_schedule, amsgrad=True, clipvalue=1.0),
        metrics=[tf.keras.metrics.MeanAbsoluteError()])

# Train the model. The dataset already yields whole batches, so no batch_size
# is passed here (Keras rejects batch_size together with a tf.data.Dataset).
itransformer.fit(dataset, epochs=10)
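Two experiments I have queued up that might implicate the mixed-precision path on ROCm: (1) run once with the global policy forced to float32; if the NaN at batch 120 disappears, fp16 overflow or loss scaling is the prime suspect; (2) wrap the optimizer in an explicit dynamic LossScaleOptimizer so the scaling is under my control rather than whatever compile() applies automatically under mixed_float16. A sketch, assuming tf.keras.mixed_precision.LossScaleOptimizer is exported on this build:

# Experiment 1: disable mixed precision entirely for one run.
tf.keras.mixed_precision.set_global_policy('float32')

# Experiment 2: explicit dynamic loss scaling around the same Adam settings.
base_opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule,
                                    amsgrad=True, clipvalue=1.0)
scaled_opt = tf.keras.mixed_precision.LossScaleOptimizer(base_opt)

Would either of these help isolate whether this is a ROCm kernel issue rather than something in my code?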