I have one set of weights, one tokenizer, the same prompt, and identical generation parameters. Yet somehow, when I load the model using AutoModelForCausalLM, I get one output, and when I construct it manually with LlamaForCausalLM plus the same config and state_dict, I get another output entirely.
The following code reproduces the difference on both an A6000 and an A100.
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    LlamaForCausalLM,
    LlamaConfig,
)

# 1) Adjust these as needed
model_name = "meta-llama/Llama-3.1-8B"
prompt = "Hello from Llama 3.1! Tell me something interesting."
dtype = torch.float16  # or torch.float32 if needed

# 2) Get the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Prepare input
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

############################################
# A) Load with AutoModelForCausalLM
############################################
print("=== Loading with AutoModelForCausalLM ===")
model_auto = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="eager",  # matches your usage
    torch_dtype=dtype,
).cuda()
model_auto.eval()  # turn off dropout
config = model_auto.config

with torch.no_grad():
    out_auto = model_auto(**inputs)
    logits_auto = out_auto.logits  # shape: [batch_size, seq_len, vocab_size]

del model_auto
torch.cuda.empty_cache()

############################################
# B) Load with LlamaForCausalLM + config
############################################
print("=== Loading with LlamaForCausalLM + config ===")
# Get config from the same checkpoint
# Build Llama model directly
model_llama = LlamaForCausalLM(config).cuda()
model_llama.eval()

# Load the same weights that AutoModelForCausalLM used
model_auto_temp = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
model_llama.load_state_dict(model_auto_temp.state_dict())
del model_auto_temp
torch.cuda.empty_cache()

with torch.no_grad():
    out_llama = model_llama(**inputs)
    logits_llama = out_llama.logits

############################################
# C) Compare the logits
############################################
# Compute maximum absolute difference
max_diff = (logits_auto - logits_llama).abs().max()
print(f"\nMax absolute difference between logits: {max_diff.item()}")

if max_diff < 1e-7:
    print("→ The logits are effectively identical (within floating-point precision).")
else:
    print("→ There is a non-trivial difference in logits!")
1 Answer
In your example specifically, you set attn_implementation="eager" in the first AutoModelForCausalLM call (the one whose config you save), but not in the second AutoModelForCausalLM call (the one you actually load the weights from).
model_auto = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="eager",  # <---- you set this here
    torch_dtype=dtype,
).cuda()
config = model_auto.config

# Later...
model_auto_temp = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=dtype,
)
model_llama.load_state_dict(model_auto_temp.state_dict())
One of those "Auto" calls picks up a different attention-implementation default than the other (the second one falls back to the library default instead of eager), and that mismatch can lead to differences in the logits.
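You can confirm this by printing the attention implementation each model resolved to right after it is loaded (a quick sketch; _attn_implementation is an internal attribute whose exact name may change between transformers versions):

# Sketch: check which attention backend each load resolved to.
print(model_auto.config._attn_implementation)       # "eager" (explicitly requested)
print(model_auto_temp.config._attn_implementation)  # library default, e.g. "sdpa"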
Pass identical arguments to every load
If you rely on AutoModelForCausalLM for everything:
model_auto_1 = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="eager",
    torch_dtype=dtype,
).cuda()
config = model_auto_1.config

# Ensure 2nd time also uses the same attn_implementation etc.
model_auto_2 = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="eager",
    torch_dtype=dtype,
)

model_llama = LlamaForCausalLM(config).cuda()
model_llama.eval()
model_llama.load_state_dict(model_auto_2.state_dict())
Now both Auto calls use the same arguments.
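Before copying the weights, you can sanity-check that the two loads really agree (a minimal sketch; it assumes the standard Llama module layout with model.embed_tokens):

# Sketch: both loads should report the same attention backend and identical weights.
print(model_auto_1.config._attn_implementation, model_auto_2.config._attn_implementation)
same_embeddings = torch.equal(
    model_auto_1.model.embed_tokens.weight.cpu(),
    model_auto_2.model.embed_tokens.weight.cpu(),
)
print("Embedding weights identical:", same_embeddings)  # expect True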
Skip the intermediate model
Better yet, let LlamaForCausalLM do the work directly; it supports from_pretrained as well:
# A) "Auto" way
model_auto = AutoModelForCausalLM.from_pretrained(
model_name,
attn_implementation="eager",
torch_dtype=dtype
).cuda()
# B) Direct Llama way
model_llama = LlamaForCausalLM.from_pretrained(
model_name,
attn_implementation="eager",
torch_dtype=dtype
).cuda()
Now both models read the same config and the same checkpoint weights, with no manual .state_dict() copy needed.
If you compare their outputs:
out_auto = model_auto(**inputs).logits
out_llama = model_llama(**inputs).logits
diff = (out_auto - out_llama).abs().max()
print(diff.item())
…you should see essentially no difference.
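Note that in float16 an exact-zero difference is not guaranteed even when both models are configured identically, so a tolerance-based check is more informative than the 1e-7 threshold in your script (a sketch; the tolerances here are illustrative, not canonical):

# Sketch: compare logits with tolerances appropriate for float16,
# and check that both models agree on the greedy next-token choice.
close = torch.allclose(out_auto, out_llama, rtol=1e-3, atol=1e-3)
print("Logits match within float16 tolerance:", close)
print("Same greedy tokens:", torch.equal(out_auto.argmax(dim=-1), out_llama.argmax(dim=-1)))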