mirror of https://github.com/senju1337/senju.git
synced 2025-12-23 23:39:27 +00:00
docs: add sphinx documentation for image_reco module
Refs: OPS-92 OPS-93
This commit is contained in:
parent 97b57916de
commit d876ca2301
1 changed file with 75 additions and 9 deletions
@@ -1,3 +1,40 @@
+"""
+Senju Image Recognition Module
+==============================
+
+A module providing image description generation capabilities for the Senju haiku application.
+
+This module leverages pre-trained vision-language models (specifically BLIP) to generate
+textual descriptions of uploaded images. These descriptions can then be used as input
+for the haiku generation process, enabling image-to-haiku functionality.
+
+Classes
+-------
+ImageDescriptionGenerator
+    The primary class responsible for loading the vision-language model
+    and generating descriptions from image data.
+
+Functions
+---------
+gen_response
+    A helper function that wraps the description generation process
+    for API integration.
+
+Dependencies
+------------
+* torch: Deep learning framework required for model operations
+* PIL.Image: Image processing capabilities
+* io: Utilities for working with binary data streams
+* transformers: Hugging Face's library providing access to pre-trained models
+
+Implementation Details
+----------------------
+The module initializes a BLIP model (Bootstrapped Language-Image Pre-training)
+which can understand visual content and generate natural language descriptions.
+The implementation handles image loading, preprocessing, model inference,
+and post-processing to return structured description data.
+"""
+
 import torch
 from PIL import Image
 import io
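For context, the setup the module docstring describes is roughly the standard transformers BLIP captioning pipeline. The sketch below assumes the default checkpoint named in the module; the variable names are illustrative only and are not part of this commit.

import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# Illustrative setup sketch, not the module's exact code.
model_name = "Salesforce/blip-image-captioning-base"
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained(model_name)   # handles image preprocessing and text decoding
model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)  # caption generator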
@@ -5,14 +42,28 @@ from transformers import BlipProcessor, BlipForConditionalGeneration
 
 
 class ImageDescriptionGenerator:
+    """
+    A class for generating textual descriptions of images using a vision-language model.
+
+    This class handles the loading of a pre-trained BLIP model, image preprocessing,
+    and caption generation. It provides an interface for converting raw image data
+    into natural language descriptions that can be used for haiku inspiration.
+
+    :ivar processor: The BLIP processor for handling image inputs
+    :type processor: BlipProcessor
+    :ivar model: The BLIP model for conditional text generation
+    :type model: BlipForConditionalGeneration
+    :ivar device: The computation device (CUDA or CPU)
+    :type device: str
+    """
+
     def __init__(self, model_name="Salesforce/blip-image-captioning-base"):
         """
         Initialize an image description generator using a vision-language
         model.
 
-        Args:
-            model_name: The name of the model to use
-                (default: BLIP captioning model)
+        :param model_name: The name of the pre-trained model to use
+        :type model_name: str
        """
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Using device: {self.device}")
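As a quick illustration of the constructor documented above, instantiation might look like the following. This assumes the class has been imported from the module; the larger checkpoint in the second line is only an example of a compatible BLIP model, not something this commit uses.

# Default construction: loads Salesforce/blip-image-captioning-base and
# selects CUDA when available, falling back to CPU.
generator = ImageDescriptionGenerator()

# A different compatible checkpoint could be passed instead (illustrative only).
big_generator = ImageDescriptionGenerator(model_name="Salesforce/blip-image-captioning-large")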
@@ -24,13 +75,15 @@ class ImageDescriptionGenerator:
         """
         Generate a descriptive caption for the given image.
 
-        Args:
-            image_data: Raw image data (bytes)
-            max_length: Maximum length of the generated caption
-
-        Returns:
-            dict: A dictionary containing the generated description
-                and confidence score
+        This method processes the raw image data, runs inference with the BLIP model,
+        and returns a structured response with the generated description.
+
+        :param image_data: Raw binary image data
+        :type image_data: bytes
+        :param max_length: Maximum token length for the generated caption
+        :type max_length: int
+        :return: Dictionary containing the generated description and confidence score
+        :rtype: dict
         """
         # Convert uploaded bytes to image
         img = Image.open(io.BytesIO(image_data)).convert("RGB")
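The diff truncates the body of generate_description after the image-decoding line. The sketch below reconstructs one plausible version of the full flow, assuming the standard transformers BLIP captioning calls; the function name describe_image, the per-call model loading, and the "description" key are illustrative assumptions, not code from this commit.

import io
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

def describe_image(image_data: bytes, max_length: int = 30) -> dict:
    """Illustrative stand-in for generate_description(); names and return keys are assumed."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Loaded per call only to keep the sketch self-contained; the real class loads once in __init__.
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

    # Convert uploaded bytes to a PIL image, as the documented method does.
    img = Image.open(io.BytesIO(image_data)).convert("RGB")

    # Preprocess, run BLIP inference, and decode the generated token IDs.
    inputs = processor(images=img, return_tensors="pt").to(device)
    output_ids = model.generate(**inputs, max_length=max_length)
    caption = processor.decode(output_ids[0], skip_special_tokens=True)

    return {"description": caption}  # the documented method also reports a confidence score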
@@ -59,8 +112,21 @@ class ImageDescriptionGenerator:
         }
 
 
+# Global instance of the description generator
 g_descriptor: ImageDescriptionGenerator = ImageDescriptionGenerator()
 
 
 def gen_response(image_data) -> dict:
+    """
+    Generate a description for an image using the global description generator.
+
+    This function provides a simplified interface to the image description functionality
+    for use in API endpoints.
+
+    :param image_data: Raw binary image data
+    :type image_data: bytes
+    :return: Dictionary containing the image description and confidence information
+    :rtype: dict
+    :raises Exception: If image processing or description generation fails
+    """
     return g_descriptor.generate_description(image_data)
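To show how the documented helper is meant to be consumed, here is a minimal usage sketch. The import path, the file name, and the "description" key are assumptions for illustration; the commit does not show the caller or the exact response keys.

from image_reco import gen_response  # import path assumed for illustration

# Read raw bytes, as an API endpoint would receive them from an upload.
with open("photo.jpg", "rb") as f:
    image_bytes = f.read()

result = gen_response(image_bytes)  # returns the dict described in the docstring
print(result.get("description"))   # generated caption, usable as input for haiku generation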