ubuntu - torch.OutOfMemoryError: CUDA out of memory

[1][1: Dataset shape] [2: Error]

I am working on a video anomaly detection model on Google Colab using the code from BN-WVAD. The authors reported high accuracy on two datasets (XD-Violence and UCF-Crime), but since the UCF-Crime-specific code wasn't released, I used code shared by others in the issue tab of the repository.

However, I keep encountering a CUDA Out of Memory (OOM) error during training, both on my local GPU and on Google Colab.

Regarding the feature shapes of the two datasets: UCF-Crime is the smaller one, at (88, 1024), while XD-Violence is larger, at (360, 1024). I trained successfully on XD-Violence, but with UCF-Crime I get an error saying there is not enough GPU memory. Does this make sense?

The main script, main.py, is shown below:

import pdb
import numpy as np
import torch.utils.data as data
import utils
import time
import wandb
import torch

from options import *

from train import train
from losses import LossComputer
from test import test
from models import WSAD

from dataset_loader import *
from tqdm import tqdm
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch_xla
import torch_xla.core.xla_model as xm

# Capture the launch time once so the date stamp and the time stamp both
# describe the same instant.
localtime = time.localtime()
time_ymd, time_hms = (
    time.strftime(pattern, localtime) for pattern in ("%Y-%m-%d", "%H:%M:%S")
)

def _seed_worker(worker_id):
    """Seed NumPy inside a DataLoader worker process.

    Passed to ``DataLoader`` as ``worker_init_fn``. PyTorch gives every worker
    its own torch seed; deriving the NumPy seed from it keeps NumPy-based
    sampling reproducible while still differing between workers.

    Defined at module level (outside the ``__main__`` guard) so that it can be
    pickled when workers are started with the ``spawn`` method.

    Args:
        worker_id: Index of the worker, supplied by the DataLoader (unused).
    """
    # NumPy only accepts seeds below 2**32.
    np.random.seed(torch.initial_seed() % 2**32)


if __name__ == "__main__":
    # Read the run configuration from the command line.
    args = parse_args()
    if args.debug:
        pdb.set_trace()

    # Logs and checkpoints go under <base>/<date>/ucf/<version>.
    args.log_path = os.path.join(args.log_path, time_ymd, 'ucf', args.version)
    args.model_path = os.path.join(args.model_path, time_ymd, 'ucf', args.version)
    os.makedirs(args.log_path, exist_ok=True)
    os.makedirs(args.model_path, exist_ok=True)

    wandb.init(
        project="BN-WVAD",
        name=args.version,
        config={
            'optimization:lr': args.lr[0],
            'optimization:iters': args.num_iters,
            'dataset:dataset': 'ucf-crime',
            'model:kernel_sizes': args.kernel_sizes,
            'model:channel_ratios': args.ratios,
            'triplet_loss:abn_ratio_sample': args.ratio_sample,
            'triplet_loss:abn_ratio_batch': args.ratio_batch,
        },
        settings=wandb.Settings(code_dir=os.path.dirname(os.path.abspath(__file__))),
        save_code=True,
    )

    worker_init_fn = None

    if args.seed >= 0:
        utils.set_seed(args.seed)
        # Keep seeding NumPy in the main process, as the original code did.
        np.random.seed(args.seed)
        # The original assigned the *return value* of np.random.seed(...),
        # which is None, so workers were never seeded. Pass a callable instead.
        worker_init_fn = _seed_worker
    # Temporarily commented out for the experiments with different seeds
    # (plot_freq=5).
    #device = xm.xla_device()
    #print(device)

    net = WSAD(args.len_feature, flag="Train", args=args)
    net = net.cuda()
    #net = net.to(device)

    # Separate loaders for normal and abnormal training videos; each training
    # step draws from both iterators.
    normal_train_loader = data.DataLoader(
        UCFVideo(root_dir=args.root_dir, mode='Train', num_segments=args.num_segments,
                 len_feature=args.len_feature, is_normal=True),
        batch_size=args.batch_size,
        shuffle=True, num_workers=args.num_workers,
        worker_init_fn=worker_init_fn, drop_last=True)
    abnormal_train_loader = data.DataLoader(
        UCFVideo(root_dir=args.root_dir, mode='Train', num_segments=args.num_segments,
                 len_feature=args.len_feature, is_normal=False),
        batch_size=args.batch_size,
        shuffle=True, num_workers=args.num_workers,
        worker_init_fn=worker_init_fn, drop_last=True)
    test_loader = data.DataLoader(
        UCFVideo(root_dir=args.root_dir, mode='Test', num_segments=args.num_segments,
                 len_feature=args.len_feature),
        batch_size=10,
        shuffle=False, num_workers=args.num_workers,
        worker_init_fn=worker_init_fn)

    test_info = {'step': [], 'AUC': [], 'AP': []}

    criterion = LossComputer()

    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr[0],
                                 betas=(0.9, 0.999), weight_decay=args.weight_decay)

    best_scores = {
        'best_AUC': -1,
        'best_AP': -1,
    }

    # Baseline evaluation of the untrained network (logged as step 0 in
    # test_info).
    metric = test(net, test_loader, test_info, 0)

    # Steps at which an unconditional checkpoint is written. The original
    # condition also listed step 0, which the loop (starting at 1) never reaches.
    checkpoint_steps = (300, 700, 997)

    for step in tqdm(
            range(1, args.num_iters + 1),
            total=args.num_iters,
            dynamic_ncols=True
        ):
        # Apply the per-step learning rate whenever the schedule changes.
        if step > 1 and args.lr[step - 1] != args.lr[step - 2]:
            for param_group in optimizer.param_groups:
                param_group["lr"] = args.lr[step - 1]

        # Restart each iterator once its loader has been fully consumed.
        if (step - 1) % len(normal_train_loader) == 0:
            normal_loader_iter = iter(normal_train_loader)

        if (step - 1) % len(abnormal_train_loader) == 0:
            abnormal_loader_iter = iter(abnormal_train_loader)

        # Run one training step and log the returned losses.
        losses = train(net, normal_loader_iter, abnormal_loader_iter, optimizer, criterion)
        wandb.log(losses, step=step)
        if step in checkpoint_steps:
            torch.save(net.state_dict(), os.path.join(args.model_path, f"wsad_epoch_{step}.pt"))
            print(f"Model saved at epoch {step}")

        # Periodically evaluate on the test set.
        if step % args.plot_freq == 0 and step > 0:
            metric = test(net, test_loader, test_info, step)
            print('AUC: ', test_info['AUC'][-1])
            # Compare against the previous best before best_scores is updated
            # below, so a new best AUC triggers a record and a model save.
            if test_info["AUC"][-1] > best_scores['best_AUC']:
                utils.save_best_record(test_info, os.path.join(args.log_path, "ucf_best_record_{}.txt".format(args.seed)))

                torch.save(net.state_dict(), os.path.join(args.model_path, "ucf_best_{}.pkl".format(args.seed)))

            for n, v in metric.items():
                best_name = 'best_' + n
                best_scores[best_name] = max(v, best_scores[best_name])

        # The most recent evaluation metric and running bests are logged on
        # every step, not only on evaluation steps.
        wandb.log(metric, step=step)
        wandb.log(best_scores, step=step)

Below is the error log:

WARNING:root:Found CUDA without GPU_NUM_DEVICES. Defaulting to PJRT_DEVICE=CUDA with GPU_NUM_DEVICES=1
wandb: Using wandb-core as the SDK backend.  Please refer to  for more information.
wandb: Currently logged in as: jiaqifoo (jiaqifoo-universiti-tunku-abdul-rahman). Use `wandb login --relogin` to force relogin
wandb: Tracking run with wandb version 0.19.1
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run train


Traceback (most recent call last):
  File "/content/drive/MyDrive/fyp/main.py", line 106, in <module>
    metric = test(net, test_loader, test_info, 0)
  File "/content/drive/MyDrive/fyp/test.py", line 61, in test
    frame_predicts = get_predicts(test_loader, net)
  File "/content/drive/MyDrive/fyp/test.py", line 22, in get_predicts
    res = net(_data)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/content/drive/MyDrive/fyp/models/model.py", line 122, in forward
    x = self.selfatt(x)             # feat enhfance(attn+ff)    (128, 200, 512)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/content/drive/MyDrive/fyp/models/translayer.py", line 84, in forward
    x = attn(x) + x             #                   (128, 200, 512)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/content/drive/MyDrive/fyp/models/translayer.py", line 20, in forward
    return self.fn(self.norm(x), **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/content/drive/MyDrive/fyp/models/translayer.py", line 57, in forward
    dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale  # root(d_k) (128, 4, 200, 200)
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.79 GiB. GPU 0 has a total capacity of 14.75 GiB of which 6.61 GiB is free. Process 28173 has 100.00 MiB memory in use. Process 272388 has 8.04 GiB memory in use. Of the allocated memory 7.85 GiB is allocated by PyTorch, and 70.97 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (.html#environment-variables)
wandb: Find logs at: wandb/run-20241228_094147-fx4f8y5f/logs

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

ubuntu - torch.OutOfMemoryError: CUDA out of memory - Stack Overflow

与本文相关的文章

评论列表(0)