The training job keeps running indefinitely and never terminates. Additionally, the SageMaker "Training jobs" section does not show any active or ongoing training. My IAM role has full access to both S3 and SageMaker, so there should not be a permissions issue. What could be the possible reasons for this problem?
# Set your SageMaker session
sagemaker_session = sagemaker.Session()
# Define the S3 bucket and paths
bucket = "xxxxxxxxxxxxxxxxxx"
prefix = "pytorch/mnist"
role = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# Define the training script
train_script = """
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import argparse
from sagemaker.pytorch import PyTorch
# Define model
class SimpleNN(nn.Module):
def __init__(self):
super(SimpleNN, self).__init__()
self.fc = nn.Sequential(
nn.Flatten(),
nn.Linear(28 * 28, 128),
nn.ReLU(),
nn.Linear(128, 10)
)
def forward(self, x):
return self.fc(x)
def main(args):
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
train_dataset = torchvision.datasets.MNIST(root='data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
model = SimpleNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=args.lr)
for epoch in range(args.epochs):
for i, (inputs, labels) in enumerate(train_loader):
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
if i % 100 == 0:
print(f'Epoch [{epoch+1}/{args.epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--batch-size', type=int, default=64)
parser.add_argument('--epochs', type=int, default=10)
parser.add_argument('--lr', type=float, default=0.001)
args = parser.parse_args()
main(args)
"""
# Save the training script
with open("train_script.py", "w") as f:
f.write(train_script)
# Set up the SageMaker estimator
estimator = PyTorch(
entry_point="train_script.py",
role=role,
framework_version="1.6.0",
py_version="py3",
instance_count=1,
instance_type="ml.p2.xlarge",
sagemaker_session=sagemaker_session,
hyperparameters={"batch-size": 64, "epochs": 10, "lr": 0.001},
)
# Train the model
estimator.fit({"training": train_input, "testing": test_input})