I can't figure out where the issue is or what is causing it. My function only returns 2 values (caption and title), so I'm not sure where the other 2 expected values are coming from.
Here is my function for the image model:
def generate_caption_and_title(image_path, device):
    # Load the BLIP model and processor
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    # Move the model to the selected device (CPU or GPU)
    model.to(device)
    try:
        # Open the image
        image = Image.open(image_path).convert("RGB")

        # Generate the caption (Description)
        description_prompt = "Describe this image in detail."
        inputs_description = processor(image, text=description_prompt, return_tensors="pt").to(device)
        description_ids = model.generate(
            inputs_description['input_ids'],
            max_length=50,
            num_beams=5,
            early_stopping=True
        )
        caption = processor.decode(description_ids[0], skip_special_tokens=True)

        # Generate a title (can be the first few words or a summary of the caption)
        title_prompt = "Provide a one-word title for this image."
        inputs_title = processor(image, text=title_prompt, return_tensors="pt").to(device)
        title_ids = model.generate(
            inputs_title['input_ids'],
            max_length=15,
            num_beams=5,
            early_stopping=True
        )
        title = processor.decode(title_ids[0], skip_special_tokens=True)

        return caption, title  # Ensure the order matches the unpacking
    except Exception as e:
        print(f"Error in generate_caption_and_title for {image_path}: {e}")
        return "Error Title", "Error Description"  # Default return for failed cases
Here is where I'm calling generate_caption_and_title:
def process_images(image_dir, output_csv, device):
    image_files = []     # paths of the images to process
    metadata_list = []   # one metadata dict per image

    # Collect only image files
    for root, dirs, files in os.walk(image_dir):
        for filename in files:
            file_path = os.path.join(root, filename)
            if os.path.isfile(file_path) and filename.lower().endswith((".jpg", ".jpeg", ".png", ".tiff")):
                image_files.append(file_path)

    for file_path in tqdm(image_files, desc="Processing Images", unit="image"):
        try:
            # Generate description and title using AI
            title, caption = generate_caption_and_title(file_path, device)
            # Open the image
            with Image.open(file_path) as img:
                # Extract EXIF data
                exif_data = img._getexif()
                metadata = {
                    "Filename": filename,
                    "File Path": file_path,
                    "Resolution": f"{img.width}x{img.height}",
                    "Description": caption,
                    "Title": title}
                # Extract specific EXIF tags
                if exif_data:
                    for tag_id, value in exif_data.items():
                        tag_name = TAGS.get(tag_id, tag_id)
                        metadata[tag_name] = value
                # Append metadata to the list
                metadata_list.append(metadata)
        except Exception as e:
            print(f"Error processing {filename}: {e}")

    # Create a DataFrame and save it to a CSV file
    df = pd.DataFrame(metadata_list)
    df.to_csv(output_csv, index=False)
    print(f"Metadata has been saved to {output_csv}")
Also, here is the full error message:
Error in generate_caption_and_title for D:\Pictures\Jacob Brockwell Graphic Arts and Pictures\Photo Gallaries\Photo Gallary\JPEG Photos\RAW Exports\Other\Other #2\Food\DSC_0006.jpg: not enough values to unpack (expected 4, got 2)
Full Traceback:
Traceback (most recent call last):
File "d:\other-files\school\database_dev\Personal Projects\Image Database\imageScan.py", line 32, in generate_caption_and_title
description_ids = model.generate(
^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\torch\utils\_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\transformers\models\blip\modeling_blip.py", line 1187, in generate
vision_outputs = self.vision_model(
^^^^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\transformers\models\blip\modeling_blip.py", line 726, in forward
hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Jacob Brockwell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\transformers\models\blip\modeling_blip.py", line 277, in forward
batch_size, _, height, width = pixel_values.shape
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: not enough values to unpack (expected 4, got 2)
1 Answer
If you look at the signature of the generate function in the BLIP source code, you will find the arguments it expects:
def generate(
    self,
    pixel_values: torch.FloatTensor,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.LongTensor] = None,
    interpolate_pos_encoding: bool = False,
    **generate_kwargs,
) -> torch.LongTensor:
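To see what the processor actually returns in your code, you can add a quick check along these lines (reusing your variable names; the exact keys can vary slightly between transformers versions):

inputs_description = processor(image, text=description_prompt, return_tensors="pt").to(device)
print(list(inputs_description.keys()))           # ['pixel_values', 'input_ids', 'attention_mask']
print(inputs_description['pixel_values'].shape)  # 4-D: (batch, channels, height, width)
print(inputs_description['input_ids'].shape)     # 2-D: (batch, sequence_length)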
Those keys match the first arguments expected by generate: ['pixel_values', 'input_ids', 'attention_mask']. In your code, however, the first positional argument passed to generate is input_ids (a 2-D tensor), whereas it should be pixel_values (a 4-D tensor); that is why the shape unpacking inside the vision model fails with "expected 4, got 2". The following call should work:
description_ids = model.generate(
    **inputs_description,  # unpacks pixel_values, input_ids and attention_mask as keyword arguments
    max_length=50,
    num_beams=5,
    early_stopping=True
)
and similarly for the other call to generate().
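For completeness, here is a minimal sketch of how both calls inside your generate_caption_and_title could look after the fix (variable names taken from your function; not tested against your exact setup):

inputs_description = processor(image, text=description_prompt, return_tensors="pt").to(device)
description_ids = model.generate(
    **inputs_description,  # pixel_values, input_ids, attention_mask
    max_length=50,
    num_beams=5,
    early_stopping=True
)
caption = processor.decode(description_ids[0], skip_special_tokens=True)

inputs_title = processor(image, text=title_prompt, return_tensors="pt").to(device)
title_ids = model.generate(
    **inputs_title,
    max_length=15,
    num_beams=5,
    early_stopping=True
)
title = processor.decode(title_ids[0], skip_special_tokens=True)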