I'm encountering an error during the supervised fine-tuning (SFT) of Qwen2.5-Coder-1.5B. The error, shown in the log below, seems to indicate that something is interrupting gradient computation during backpropagation, but I haven't been able to pinpoint the cause. Could someone help me understand what might be triggering this issue?
Below is a simplified version of my code and the corresponding log output:
My code:
# Download model
import os
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
model_id = "Qwen/Qwen2.5-Coder-1.5B"
save_dir = f"/root/autodl-tmp/NL2SQL/models/{model_id[5:]}/"
os.makedirs(save_dir, exist_ok=True)
# snapshot_download(repo_id=model_id, local_dir=save_dir)
# Load model
model = AutoModelForCausalLM.from_pretrained(save_dir, device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(save_dir)
# Data processing
import pandas as pd
from datasets import Dataset
# Read CSV file and create Dataset
data_dir = '/root/autodl-tmp/NL2SQL/cot-qa.csv'
df = pd.read_csv(data_dir)
dataset = Dataset.from_pandas(df)
def combined_preprocess(batch):
    texts = []
    # Iterate over each sample to construct the complete prompt and completion text
    for q, a, t in zip(batch["query"], batch["answer"], batch["thinking_process"]):
        question = str(q)
        answer = str(a)
        thinking = str(t)
        prompt = (
            f"For the question: {question}.\n"
            "Please think step by step, list your thinking process between <think> and </think> and then show the final SQL answer:"
        )
        completion = (
            f"<think>{thinking}</think>\nMy final answer is: ```sql\n{answer}\n```"
        )
        texts.append(prompt + "\n" + completion)
    # Do not perform padding or return torch.Tensor; return a list for the collator to pad later
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=1024 * 2,
        padding=False,
    )
    return tokenized
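# Apply the preprocessing to the whole dataset in batches and drop the original CSV columns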
processed_dataset = dataset.map(combined_preprocess, batched=True, remove_columns=dataset.column_names)
# print(processed_dataset[0])
# LoRA configuration
from peft import LoraConfig, TaskType, get_peft_model
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    r=8,
    lora_alpha=16,  # 8 * 2
    lora_dropout=0.05,
    bias='none',
    inference_mode=False,
)
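# Wrap the base model with the LoRA adapters; only the adapter weights will be trainable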
model = get_peft_model(model, lora_config)
print(model.print_trainable_parameters())
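# Disable the KV cache for training; it is only needed at generation time and is incompatible with gradient checkpointing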
model.config.use_cache = False
# Training configuration
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="./output/sft/",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    logging_first_step=5,
    num_train_epochs=2,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="none",
    remove_unused_columns=False,
)
# Swanlab setup
import swanlab
from swanlab.integration.transformers import SwanLabCallback
swanlab_callback = SwanLabCallback(
    ...
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    callbacks=[swanlab_callback],
)
trainer.train()
Log output:
root@autodl-container:~/autodl-tmp/NL2SQL# python sft.py
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 9399/9399 [00:03<00:00, 2396.28 examples/s]
trainable params: 9,232,384 || all params: 1,552,946,688 || trainable%: 0.5945
None
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
swanlab: Tracking run with swanlab version 0.4.11
swanlab: Run data will be saved locally in /root/autodl-tmp/NL2SQL/swanlog/run-
swanlab:
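In case it helps narrow this down, here is a small diagnostic I can run in the same session, reusing the model and processed_dataset objects built above (this is only a debugging sketch, not part of the failing script): it checks that the LoRA parameters are the ones marked trainable and that a single forward pass with labels produces a loss attached to the autograd graph.

import torch

# List the parameters that still require gradients after wrapping with LoRA
trainable = [name for name, param in model.named_parameters() if param.requires_grad]
print(len(trainable), trainable[:3])

# Run one forward pass with labels; the loss should report requires_grad=True
sample = processed_dataset[0]
input_ids = torch.tensor([sample["input_ids"]], device=model.device)
outputs = model(input_ids=input_ids, labels=input_ids)
print(outputs.loss, outputs.loss.requires_grad)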