I'm new to this kind of project and I wanted to try the translation tutorial from Hugging Face (here is the link: text). I'm using my own custom dataset, which contains two columns: one for Spanish (español) and the other for Mapudungun. I think I adjusted the dataset so it fits the tutorial, but when I try to train the model this error appears:
> Epoch 1/3
62/62 [==============================] - ETA: 0s - loss: 4.2643
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-36-aa371efdfe52> in <cell line: 0>()
----> 1 model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks)
11 frames
/usr/local/lib/python3.11/dist-packages/transformers/models/t5/modeling_tf_t5.py in else_body()
54 nonlocal input_shape
55 err_msg_prefix = ag__.if_exp(ag__.ld(self).is_decoder, lambda: 'decoder_', lambda: '', 'self.is_decoder')
---> 56 raise ag__.converted_call(ag__.ld(ValueError), (f'You have to specify either {ag__.ld(err_msg_prefix)}input_ids or {ag__.ld(err_msg_prefix)}inputs_embeds',), None, fscope)
57 err_msg_prefix = ag__.Undefined('err_msg_prefix')
58 input_shape = ag__.Undefined('input_shape')
ValueError: in user code:
File "/usr/local/lib/python3.11/dist-packages/tf_keras/src/engine/training.py", line 2436, in predict_function *
return step_function(self, iterator)
File "/usr/local/lib/python3.11/dist-packages/tf_keras/src/engine/training.py", line 2421, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/usr/local/lib/python3.11/dist-packages/tf_keras/src/engine/training.py", line 2409, in run_step **
outputs = model.predict_step(data)
File "/usr/local/lib/python3.11/dist-packages/tf_keras/src/engine/training.py", line 2377, in predict_step
return self(x, training=False)
File "/usr/local/lib/python3.11/dist-packages/tf_keras/src/utils/traceback_utils.py", line 70, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/tmp/__autograph_generated_file0_hgym1q.py", line 40, in tf__run_call_with_unpacked_inputs
raise
File "/tmp/__autograph_generated_filedk4yrcz4.py", line 91, in tf__call
decoder_outputs = ag__.converted_call(ag__.ld(self).decoder, (ag__.ld(decoder_input_ids),), dict(attention_mask=ag__.ld(decoder_attention_mask), encoder_hidden_states=ag__.ld(hidden_states), encoder_attention_mask=ag__.ld(attention_mask), inputs_embeds=ag__.ld(decoder_inputs_embeds), head_mask=ag__.ld(decoder_head_mask), past_key_values=ag__.ld(past_key_values), use_cache=ag__.ld(use_cache), output_attentions=ag__.ld(output_attentions), output_hidden_states=ag__.ld(output_hidden_states), return_dict=ag__.ld(return_dict), training=ag__.ld(training)), fscope)
File "/tmp/__autograph_generated_file0_hgym1q.py", line 40, in tf__run_call_with_unpacked_inputs
raise
File "/tmp/__autograph_generated_filezh8cqxq8.py", line 65, in tf__call
ag__.if_stmt(ag__.and_(lambda: ag__.ld(input_ids) is not None, lambda: ag__.ld(inputs_embeds) is not None), if_body_2, else_body_2, get_state_2, set_state_2, ('input_ids', 'input_shape'), 2)
File "/tmp/__autograph_generated_filezh8cqxq8.py", line 62, in else_body_2
ag__.if_stmt(ag__.ld(input_ids) is not None, if_body_1, else_body_1, get_state_1, set_state_1, ('input_ids', 'input_shape'), 2)
File "/tmp/__autograph_generated_filezh8cqxq8.py", line 59, in else_body_1
ag__.if_stmt(ag__.ld(inputs_embeds) is not None, if_body, else_body, get_state, set_state, ('input_shape',), 1)
File "/tmp/__autograph_generated_filezh8cqxq8.py", line 56, in else_body
raise ag__.converted_call(ag__.ld(ValueError), (f'You have to specify either {ag__.ld(err_msg_prefix)}input_ids or {ag__.ld(err_msg_prefix)}inputs_embeds',), None, fscope)
ValueError: Exception encountered when calling layer 'tft5_for_conditional_generation' (type TFT5ForConditionalGeneration).
in user code:
File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_tf_utils.py", line 1395, in run_call_with_unpacked_inputs *
return func(self, **unpacked_inputs)
File "/usr/local/lib/python3.11/dist-packages/transformers/models/t5/modeling_tf_t5.py", line 1455, in call *
decoder_outputs = self.decoder(
File "/usr/local/lib/python3.11/dist-packages/tf_keras/src/utils/traceback_utils.py", line 70, in error_handler **
raise e.with_traceback(filtered_tb) from None
File "/tmp/__autograph_generated_file0_hgym1q.py", line 40, in tf__run_call_with_unpacked_inputs
raise
File "/tmp/__autograph_generated_filezh8cqxq8.py", line 65, in tf__call
ag__.if_stmt(ag__.and_(lambda: ag__.ld(input_ids) is not None, lambda: ag__.ld(inputs_embeds) is not None), if_body_2, else_body_2, get_state_2, set_state_2, ('input_ids', 'input_shape'), 2)
File "/tmp/__autograph_generated_filezh8cqxq8.py", line 62, in else_body_2
ag__.if_stmt(ag__.ld(input_ids) is not None, if_body_1, else_body_1, get_state_1, set_state_1, ('input_ids', 'input_shape'), 2)
File "/tmp/__autograph_generated_filezh8cqxq8.py", line 59, in else_body_1
ag__.if_stmt(ag__.ld(inputs_embeds) is not None, if_body, else_body, get_state, set_state, ('input_shape',), 1)
File "/tmp/__autograph_generated_filezh8cqxq8.py", line 56, in else_body
raise ag__.converted_call(ag__.ld(ValueError), (f'You have to specify either {ag__.ld(err_msg_prefix)}input_ids or {ag__.ld(err_msg_prefix)}inputs_embeds',), None, fscope)
ValueError: Exception encountered when calling layer 'decoder' (type TFT5MainLayer).
in user code:
File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_tf_utils.py", line 1395, in run_call_with_unpacked_inputs *
return func(self, **unpacked_inputs)
File "/usr/local/lib/python3.11/dist-packages/transformers/models/t5/modeling_tf_t5.py", line 754, in call *
raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")
ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds
Call arguments received by layer 'decoder' (type TFT5MainLayer):
• input_ids=None
• attention_mask=None
• encoder_hidden_states=tf.Tensor(shape=(16, 36, 512), dtype=float32)
• encoder_attention_mask=tf.Tensor(shape=(16, 36), dtype=int32)
• inputs_embeds=None
• head_mask=None
• encoder_head_mask=None
• past_key_values=None
• use_cache=True
• output_attentions=False
• output_hidden_states=False
• return_dict=True
• training=False
Call arguments received by layer 'tft5_for_conditional_generation' (type TFT5ForConditionalGeneration):
• input_ids={'input_ids': 'tf.Tensor(shape=(16, 36), dtype=int64)', 'attention_mask': 'tf.Tensor(shape=(16, 36), dtype=int64)'}
• attention_mask=None
• decoder_input_ids=None
• decoder_attention_mask=None
• head_mask=None
• decoder_head_mask=None
• encoder_outputs=None
• past_key_values=None
• inputs_embeds=None
• decoder_inputs_embeds=None
• labels=None
• use_cache=None
• output_attentions=None
• output_hidden_states=None
• return_dict=None
• training=False
The first epoch itself seems to finish (the loss is printed), and judging by the predict_function frames in the traceback, the error is raised during prediction on the validation set rather than during training. My full code is this:
from huggingface_hub import notebook_login
notebook_login()
import pandas as pd
from datasets import Dataset
# Load the CSV
csv_path = "es_mapu_limpio.csv"  # path to your CSV file
df = pd.read_csv(csv_path)

# Make sure the columns are named "español" and "mapudungun";
# if they have other names, adjust here
df = df.rename(columns={"español": "es", "mapudungun": "map"})

# Transform the DataFrame into the desired structure
structured_data = [
    {
        "id": str(index),  # generate a unique ID for each entry
        "translation": {
            "es": row["es"],  # Spanish translation
            "map": row["map"],  # Mapudungun translation
        },
    }
    for index, row in df.iterrows()
]

# Create the Hugging Face dataset
books = Dataset.from_list(structured_data)

# Split into train and test
books = books.train_test_split(test_size=0.2)

# Result
print(books["train"][0])
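For reference, the print above gives entries shaped like this (the values are only an illustration, not my real data):

# {'id': '42', 'translation': {'es': 'hola', 'map': 'mari mari'}}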
from transformers import AutoTokenizer
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
source_lang = "es" # Spanish column name
target_lang = "map" # Mapudungun column name
prefix = "traducir español a mapudungun: "
def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs
tokenized_books = books.map(preprocess_function, batched=True)
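As a sanity check (my own addition, not part of the tutorial), the tokenized examples can be inspected directly:

# Each tokenized example should now have input_ids, attention_mask and
# labels (the tokenized Mapudungun targets) next to the original columns.
print(tokenized_books["train"][0].keys())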
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")
import evaluate
metric = evaluate.load("sacrebleu")
import numpy as np
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
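For example (my understanding: sacrebleu expects a list of reference translations per prediction, hence the extra nesting):

# postprocess_text(["mari mari "], ["mari mari"])
# -> (["mari mari"], [["mari mari"]])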
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result
from transformers import AdamWeightDecay
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
from transformers import TFAutoModelForSeq2SeqLM
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)
tf_train_set = model.prepare_tf_dataset(
    tokenized_books["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_test_set = model.prepare_tf_dataset(
    tokenized_books["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

import tensorflow as tf
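If it is useful, here is a quick check I can add (not from the tutorial; my assumption is that each batch should carry decoder_input_ids, or at least labels for the model to shift into them):

# Peek at one batch; prepare_tf_dataset may yield a dict or a
# (features, labels) tuple, so handle both cases.
batch = next(iter(tf_train_set))
features = batch[0] if isinstance(batch, tuple) else batch
print(features.keys())  # is decoder_input_ids (or labels) in here?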
model.compile(optimizer=optimizer)  # No loss argument!
from transformers.keras_callbacks import KerasMetricCallback
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)
from transformers.keras_callbacks import PushToHubCallback
push_to_hub_callback = PushToHubCallback(
    output_dir="es_mapu_model",
    tokenizer=tokenizer,
)
callbacks = [metric_callback, push_to_hub_callback]
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks)
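As the call arguments at the bottom of the traceback show, the model receives input_ids and attention_mask but decoder_input_ids=None with training=False, so the failure seems to happen when the KerasMetricCallback predicts on tf_test_set. Why is decoder_input_ids missing there, and what do I need to change so that evaluation works?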