# NOTE(review): the lines "Spaces: / Running / Running" were a Hugging Face
# Spaces status banner captured when this file was copied from the web UI —
# they are not part of the program.
# Standard library
import os

# Third-party
import gradio as gr
from diffusers import DiffusionPipeline, StableDiffusion3Pipeline
from dotenv import load_dotenv
from huggingface_hub import login
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
load_dotenv() | |
Hugging_face_token=os.getenv('HFToken') | |
login(Hugging_face_token) | |
# loading image captionning model | |
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large") | |
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large") | |
# Set the model name for our LLMs. | |
GEMINI_MODEL = "gemini-2.0-flash" | |
# Store the API key in a variable. | |
GEMINI_API_KEY = os.getenv("google_api_key") | |
class stable_dif:
    """Factory that loads a Stable Diffusion pipeline by size tier.

    Parameters
    ----------
    sizes : str
        One of ``'small'``, ``'medium'`` or ``'large'``.
    """

    def __init__(self, sizes):
        # Requested size tier; validated lazily in model().
        self.sizes = sizes

    def model(self):
        """Download/load and return the diffusion pipeline for ``self.sizes``.

        Returns
        -------
        The loaded ``diffusers`` pipeline object.

        Raises
        ------
        ValueError
            If ``self.sizes`` is not a supported tier.  (The original code
            fell through all branches and crashed with UnboundLocalError.)
        """
        if self.sizes == 'medium':
            pipe = StableDiffusion3Pipeline.from_pretrained(
                "stabilityai/stable-diffusion-3.5-medium")
        elif self.sizes == 'large':
            pipe = StableDiffusion3Pipeline.from_pretrained(
                "stabilityai/stable-diffusion-3.5-large-turbo")
        elif self.sizes == 'small':
            pipe = DiffusionPipeline.from_pretrained(
                "stable-diffusion-v1-5/stable-diffusion-v1-5")
        else:
            raise ValueError(
                f"unknown pipeline size {self.sizes!r}; "
                "expected 'small', 'medium' or 'large'")
        return pipe
stable=stable_dif('small') | |
pipe=stable.model() | |
def image_story_generator(image, requirement, style):
    """Generate a four-chapter illustrated story from an uploaded image.

    Pipeline: BLIP captions the image -> Gemini writes a 4-chapter story
    from the caption -> Gemini turns each chapter into a Stable Diffusion
    prompt -> the shared ``pipe`` renders one image per chapter.

    Parameters
    ----------
    image : str
        Filepath of the uploaded image (Gradio passes ``type='filepath'``).
    requirement : str
        Free-text constraints the story must satisfy.
    style : str
        Visual style to apply to every generated illustration.

    Returns
    -------
    tuple[list, str]
        The four generated images and the story text.
    """
    raw_image = Image.open(image)

    # Caption the uploaded image with BLIP; the caption seeds the story.
    inputs = processor(raw_image, return_tensors="pt")
    out = model.generate(**inputs, min_length=20)
    model_prompt = processor.decode(out[0], skip_special_tokens=True)

    # One Gemini client serves both calls below (the original constructed
    # two identical clients).
    llm = ChatGoogleGenerativeAI(
        google_api_key=GEMINI_API_KEY, model=GEMINI_MODEL, temperature=0.3)

    # Typos fixed vs. original prompt ("charaters appearences").
    query = (f'Write a 4 chapters story based on {model_prompt} and '
             f'that fits the following requirements: {requirement}. Give a '
             'detailed description of the characters appearances.')
    result = llm.invoke(query)
    story = result.content.replace('\n', ' ')

    # Response schemas force the LLM to return exactly four named prompts
    # in a structure the parser can decode.
    schemas = [ResponseSchema(name=f'prompt {i}', description='the prompt')
               for i in range(1, 5)]
    parser = StructuredOutputParser.from_response_schemas(schemas)
    instructions = parser.get_format_instructions()

    query = (f'Based on this story: {story}. Create 4 prompts for stable '
             'diffusion that tells of a maximum of 77 tokens what happens in '
             'each chapters. Describe the characters everytime their name is '
             f'mentioned. Each image should be created in the same exact '
             f'style {style}.\n\n' + instructions)
    result = llm.invoke(query)
    image_prompts = parser.parse(result.content)

    # Render one image per chapter prompt with the shared SD pipeline.
    images = [pipe(prompt).images[0] for prompt in image_prompts.values()]
    return images, story
# Gradio UI wiring.
# BUG FIX: the first positional argument of gr.Textbox is the default *value*,
# not the label, so the original pre-filled the inputs with placeholder text
# ('enter story requirements', 'pick a style for the images') that was sent
# to the model verbatim.  Use label= instead.
interface = gr.Interface(
    fn=image_story_generator,
    inputs=[
        gr.Image(type='filepath'),
        gr.Textbox(label='enter story requirements'),
        gr.Textbox(label='pick a style for the images'),
    ],
    outputs=[
        gr.Gallery(),
        gr.Textbox(label='story'),
    ],
    description='Upload an image to start the story generation process.',
)
interface.launch()