From d876ca230134a8df78c00ff2c1d4316a71bfabea Mon Sep 17 00:00:00 2001 From: "Christoph J. Scherr" Date: Thu, 27 Mar 2025 15:23:19 +0100 Subject: [PATCH] docs: add sphinx documentation for image_reco module Refs: OPS-92 OPS-93 --- senju/image_reco.py | 84 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 75 insertions(+), 9 deletions(-) diff --git a/senju/image_reco.py b/senju/image_reco.py index 49d2db8..272e985 100644 --- a/senju/image_reco.py +++ b/senju/image_reco.py @@ -1,3 +1,40 @@ +""" +Senju Image Recognition Module +============================== + +A module providing image description generation capabilities for the Senju haiku application. + +This module leverages pre-trained vision-language models (specifically BLIP) to generate +textual descriptions of uploaded images. These descriptions can then be used as input +for the haiku generation process, enabling image-to-haiku functionality. + +Classes +------- +ImageDescriptionGenerator + The primary class responsible for loading the vision-language model + and generating descriptions from image data. + +Functions +--------- +gen_response + A helper function that wraps the description generation process + for API integration. + +Dependencies +------------ +* torch: Deep learning framework required for model operations +* PIL.Image: Image processing capabilities +* io: Utilities for working with binary data streams +* transformers: Hugging Face's library providing access to pre-trained models + +Implementation Details +---------------------- +The module initializes a BLIP model (Bootstrapped Language-Image Pre-training) +which can understand visual content and generate natural language descriptions. +The implementation handles image loading, preprocessing, model inference, +and post-processing to return structured description data. 
+""" + import torch from PIL import Image import io @@ -5,14 +42,28 @@ from transformers import BlipProcessor, BlipForConditionalGeneration class ImageDescriptionGenerator: + """ + A class for generating textual descriptions of images using a vision-language model. + + This class handles the loading of a pre-trained BLIP model, image preprocessing, + and caption generation. It provides an interface for converting raw image data + into natural language descriptions that can be used for haiku inspiration. + + :ivar processor: The BLIP processor for handling image inputs + :type processor: BlipProcessor + :ivar model: The BLIP model for conditional text generation + :type model: BlipForConditionalGeneration + :ivar device: The computation device (CUDA or CPU) + :type device: str + """ + def __init__(self, model_name="Salesforce/blip-image-captioning-base"): """ Initialize an image description generator using a vision-language model. - Args: - model_name: The name of the model to use - (default: BLIP captioning model) + :param model_name: The name of the pre-trained model to use + :type model_name: str """ self.device = "cuda" if torch.cuda.is_available() else "cpu" print(f"Using device: {self.device}") @@ -24,13 +75,15 @@ class ImageDescriptionGenerator: """ Generate a descriptive caption for the given image. - Args: - image_data: Raw image data (bytes) - max_length: Maximum length of the generated caption + This method processes the raw image data, runs inference with the BLIP model, + and returns a structured response with the generated description. 
- Returns: - dict: A dictionary containing the generated description - and confidence score + :param image_data: Raw binary image data + :type image_data: bytes + :param max_length: Maximum token length for the generated caption + :type max_length: int + :return: Dictionary containing the generated description and confidence score + :rtype: dict """ # Convert uploaded bytes to image img = Image.open(io.BytesIO(image_data)).convert("RGB") @@ -59,8 +112,21 @@ class ImageDescriptionGenerator: } +# Global instance of the description generator g_descriptor: ImageDescriptionGenerator = ImageDescriptionGenerator() def gen_response(image_data) -> dict: + """ + Generate a description for an image using the global description generator. + + This function provides a simplified interface to the image description functionality + for use in API endpoints. + + :param image_data: Raw binary image data + :type image_data: bytes + :return: Dictionary containing the image description and confidence information + :rtype: dict + :raises Exception: If image processing or description generation fails + """ return g_descriptor.generate_description(image_data)