I have one set of weights, one tokenizer, the same prompt, and identical generation parameters. Yet somehow, when I load the model using AutoModelForCausalLM, I get one output, and when I construct it manually with LlamaForCausalLM plus the same config and state_dict, I get another output entirely.
The following code reproduces the difference on both an A6000 and an A100.
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    LlamaForCausalLM,
    LlamaConfig,
)

# 1) Adjust these as needed
model_name = "meta-llama/Llama-3.1-8B"
prompt = "Hello from Llama 3.1! Tell me something interesting."
dtype = torch.float16  # or torch.float32 if needed

# 2) Get the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Prepare input
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

############################################
# A) Load with AutoModelForCausalLM
############################################
print("=== Loading with AutoModelForCausalLM ===")
model_auto = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="eager",  # matches your usage
    torch_dtype=dtype,
).cuda()
model_auto.eval()  # turn off dropout
config = model_auto.config

with torch.no_grad():
    out_auto = model_auto(**inputs)
    logits_auto = out_auto.logits  # shape: [batch_size, seq_len, vocab_size]

del model_auto
torch.cuda.empty_cache()

############################################
# B) Load with LlamaForCausalLM + config
############################################
print("=== Loading with LlamaForCausalLM + config ===")
# Get config from the same checkpoint
# Build Llama model directly
model_llama = LlamaForCausalLM(config).cuda()
model_llama.eval()

# Load the same weights that AutoModelForCausalLM used
model_auto_temp = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
model_llama.load_state_dict(model_auto_temp.state_dict())
del model_auto_temp
torch.cuda.empty_cache()

with torch.no_grad():
    out_llama = model_llama(**inputs)
    logits_llama = out_llama.logits

############################################
# C) Compare the logits
############################################
# Compute maximum absolute difference
max_diff = (logits_auto - logits_llama).abs().max()
print(f"\nMax absolute difference between logits: {max_diff.item()}")

if max_diff < 1e-7:
    print("→ The logits are effectively identical (within floating-point precision).")
else:
    print("→ There is a non-trivial difference in logits!")
1 Answer
In your example specifically, you set attn_implementation="eager" in the first AutoModelForCausalLM call (the one whose config you save), but not in the second AutoModelForCausalLM call (the one you actually load the weights from).
model_auto = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="eager",  # <---- you set this here
    torch_dtype=dtype,
).cuda()
config = model_auto.config

# Later...
model_auto_temp = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=dtype,
)
model_llama.load_state_dict(model_auto_temp.state_dict())
One of those "Auto" calls picks up a different attention-implementation default than the other (the second one falls back to the library default instead of eager), and that mismatch can lead to differences in the logits.
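You can confirm this by printing the attention implementation each model resolved to right after it is loaded (a quick sketch; _attn_implementation is an internal attribute whose exact name may change between transformers versions):

# Sketch: check which attention backend each load resolved to.
print(model_auto.config._attn_implementation)       # "eager" (explicitly requested)
print(model_auto_temp.config._attn_implementation)  # library default, e.g. "sdpa"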
Pass identical arguments to every load
If you rely on AutoModelForCausalLM for everything:
model_auto_1 = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="eager",
    torch_dtype=dtype,
).cuda()
config = model_auto_1.config

# Ensure 2nd time also uses the same attn_implementation etc.
model_auto_2 = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="eager",
    torch_dtype=dtype,
)

model_llama = LlamaForCausalLM(config).cuda()
model_llama.eval()
model_llama.load_state_dict(model_auto_2.state_dict())
Now both Auto calls use the same arguments.
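Before copying the weights, you can sanity-check that the two loads really agree (a minimal sketch; it assumes the standard Llama module layout with model.embed_tokens):

# Sketch: both loads should report the same attention backend and identical weights.
print(model_auto_1.config._attn_implementation, model_auto_2.config._attn_implementation)
same_embeddings = torch.equal(
    model_auto_1.model.embed_tokens.weight.cpu(),
    model_auto_2.model.embed_tokens.weight.cpu(),
)
print("Embedding weights identical:", same_embeddings)  # expect True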
Skip the intermediate model
Better yet, let LlamaForCausalLM do the work directly; it supports from_pretrained as well:
# A) "Auto" way
model_auto = AutoModelForCausalLM.from_pretrained(
model_name,
attn_implementation="eager",
torch_dtype=dtype
).cuda()
# B) Direct Llama way
model_llama = LlamaForCausalLM.from_pretrained(
model_name,
attn_implementation="eager",
torch_dtype=dtype
).cuda()
Now both models read the same config and the same checkpoint weights, with no manual .state_dict() copy needed.
If you compare their outputs:
out_auto = model_auto(**inputs).logits
out_llama = model_llama(**inputs).logits
diff = (out_auto - out_llama).abs().max()
print(diff.item())
…you should see essentially no difference.
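Note that in float16 an exact-zero difference is not guaranteed even when both models are configured identically, so a tolerance-based check is more informative than the 1e-7 threshold in your script (a sketch; the tolerances here are illustrative, not canonical):

# Sketch: compare logits with tolerances appropriate for float16,
# and check that both models agree on the greedy next-token choice.
close = torch.allclose(out_auto, out_llama, rtol=1e-3, atol=1e-3)
print("Logits match within float16 tolerance:", close)
print("Same greedy tokens:", torch.equal(out_auto.argmax(dim=-1), out_llama.argmax(dim=-1)))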