I can't figure out where the issue is or what is causing it. My function only returns 2 values (caption and title), so I'm not sure where the other 2 expected values are coming from.
Here is my function for the image model:
def generate_caption_and_title(image_path, device):
    # Load the BLIP model and processor
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    # Move the model to the selected device (CPU or GPU)
    model.to(device)
    try:
        # Open the image
        image = Image.open(image_path).convert("RGB")

        # Generate the caption (Description)
        description_prompt = "Describe this image in detail."
        inputs_description = processor(image, text=description_prompt, return_tensors="pt").to(device)
        description_ids = model.generate(
            inputs_description['input_ids'],
            max_length=50,
            num_beams=5,
            early_stopping=True
        )
        caption = processor.decode(description_ids[0], skip_special_tokens=True)

        # Generate a title (can be the first few words or a summary of the caption)
        title_prompt = "Provide a one-word title for this image."
        inputs_title = processor(image, text=title_prompt, return_tensors="pt").to(device)
        title_ids = model.generate(
            inputs_title['input_ids'],
            max_length=15,
            num_beams=5,
            early_stopping=True
        )
        title = processor.decode(title_ids[0], skip_special_tokens=True)

        return caption, title  # Ensure the order matches the unpacking
    except Exception as e:
        print(f"Error in generate_caption_and_title for {image_path}: {e}")
        return "Error Title", "Error Description"  # Default return for failed cases
Here is where I'm calling generate_caption_and_title:
def process_images(image_dir, output_csv, device):
    image_files = []     # paths of the images to process
    metadata_list = []   # one metadata dict per image

    # Collect only image files
    for root, dirs, files in os.walk(image_dir):
        for filename in files:
            file_path = os.path.join(root, filename)
            if os.path.isfile(file_path) and filename.lower().endswith((".jpg", ".jpeg", ".png", ".tiff")):
                image_files.append(file_path)

    for file_path in tqdm(image_files, desc="Processing Images", unit="image"):
        try:
            # Generate description and title using AI
            title, caption = generate_caption_and_title(file_path, device)
            # Open the image
            with Image.open(file_path) as img:
                # Extract EXIF data
                exif_data = img._getexif()
                metadata = {
                    "Filename": filename,
                    "File Path": file_path,
                    "Resolution": f"{img.width}x{img.height}",
                    "Description": caption,
                    "Title": title}
                # Extract specific EXIF tags
                if exif_data:
                    for tag_id, value in exif_data.items():
                        tag_name = TAGS.get(tag_id, tag_id)
                        metadata[tag_name] = value
                # Append metadata to the list
                metadata_list.append(metadata)
        except Exception as e:
            print(f"Error processing {filename}: {e}")

    # Create a DataFrame and save it to a CSV file
    df = pd.DataFrame(metadata_list)
    df.to_csv(output_csv, index=False)
    print(f"Metadata has been saved to {output_csv}")
Also, here is the full error message:
Error in generate_caption_and_title for D:\Pictures\Jacob Brockwell Graphic Arts and Pictures\Photo Gallaries\Photo Gallary\JPEG Photos\RAW Exports\Other\Other #2\Food\DSC_0006.jpg: not enough values to unpack (expected 4, got 2)
Full Traceback:
Traceback (most recent call last):
File "d:\other-files\school\database_dev\Personal Projects\Image Database\imageScan.py", line 32, in generate_caption_and_title
description_ids = model.generate(
^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\torch\utils\_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\transformers\models\blip\modeling_blip.py", line 1187, in generate
vision_outputs = self.vision_model(
^^^^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\transformers\models\blip\modeling_blip.py", line 726, in forward
hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\transformers\models\blip\modeling_blip.py", line 277, in forward
batch_size, _, height, width = pixel_values.shape
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: not enough values to unpack (expected 4, got 2)
1 Answer
If you look at the signature of the generate function in the BLIP source code, you will find the arguments it expects:
def generate(
    self,
    pixel_values: torch.FloatTensor,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.LongTensor] = None,
    interpolate_pos_encoding: bool = False,
    **generate_kwargs,
) -> torch.LongTensor:
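To see what the processor actually returns in your code, you can add a quick check along these lines (reusing your variable names; the exact keys can vary slightly between transformers versions):

inputs_description = processor(image, text=description_prompt, return_tensors="pt").to(device)
print(list(inputs_description.keys()))           # ['pixel_values', 'input_ids', 'attention_mask']
print(inputs_description['pixel_values'].shape)  # 4-D: (batch, channels, height, width)
print(inputs_description['input_ids'].shape)     # 2-D: (batch, sequence_length)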
Those keys match the first arguments expected by generate: ['pixel_values', 'input_ids', 'attention_mask']. In your code, however, the first positional argument passed to generate is input_ids (a 2-D tensor), whereas it should be pixel_values (a 4-D tensor); that is why the shape unpacking inside the vision model fails with "expected 4, got 2". The following call should work:
description_ids = model.generate(
    **inputs_description,  # unpacks pixel_values, input_ids and attention_mask as keyword arguments
    max_length=50,
    num_beams=5,
    early_stopping=True
)
and similarly for the other call to generate().
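For completeness, here is a minimal sketch of how both calls inside your generate_caption_and_title could look after the fix (variable names taken from your function; not tested against your exact setup):

inputs_description = processor(image, text=description_prompt, return_tensors="pt").to(device)
description_ids = model.generate(
    **inputs_description,  # pixel_values, input_ids, attention_mask
    max_length=50,
    num_beams=5,
    early_stopping=True
)
caption = processor.decode(description_ids[0], skip_special_tokens=True)

inputs_title = processor(image, text=title_prompt, return_tensors="pt").to(device)
title_ids = model.generate(
    **inputs_title,
    max_length=15,
    num_beams=5,
    early_stopping=True
)
title = processor.decode(title_ids[0], skip_special_tokens=True)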