I am trying to fine-tune Llama 3.1 8B on 4 A10G GPUs with 24 GB of memory each.
The model weights are stored locally, and I point to them with base_model = '<model_weight_folder>'.
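For context, the tokenizer, dataset, and output directory referenced in the snippet below are set up beforehand, roughly like this (the dataset file, split, and output folder names here are placeholders, not my exact values):

from datasets import load_dataset
from transformers import AutoTokenizer

base_model = '<model_weight_folder>'  # local folder with the Llama 3.1 8B weights
new_model = '<output_folder>'         # where checkpoints and the LoRA adapter are written

tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token  # Llama has no pad token by default

# dataset with a text field, split into train/test
dataset = load_dataset('json', data_files='<train_data.jsonl>')['train'].train_test_split(test_size=0.1)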
import torch
from accelerate import PartialState
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

# Local rank of this process, used to pin the model to the matching GPU
device_string = PartialState().process_index

torch_dtype = torch.float16
attn_implementation = "eager"

# QLoRA-style 4-bit NF4 quantization with double quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    # device_map={'': torch.cuda.current_device()},
    # device_map="auto",
    device_map={'': device_string},
    attn_implementation=attn_implementation,
)
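# If I understand correctly, device_map={'': device_string} places a full copy of the
# quantized model on this process's GPU, i.e. each of the 4 processes holds its own
# 4-bit copy for data parallelism rather than sharding the model across GPUs.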
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
)
model = get_peft_model(model, peft_config)
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    # gradient_checkpointing_kwargs={'use_reentrant': False},
    optim="paged_adamw_32bit",
    num_train_epochs=5,
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="tensorboard",
)
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    # max_seq_length=512,
    # dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    # packing=False,
)
trainer.train()
I am launching it with:
python -m torch.distributed.launch trainer.py
However, this results in an out-of-memory error while loading the base model.
Can anyone suggest whether I can use multiple GPUs for this fine-tuning? If yes, what configuration changes are needed?
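For example, would switching to one of the launchers below be the right direction? I have not verified these flags against my setup; they are just what I was planning to try:

# one process per GPU (4 GPUs) with torchrun
torchrun --nproc_per_node=4 trainer.py

# or via accelerate
accelerate launch --num_processes 4 trainer.py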