import os
import time

import cv2
import gradio as gr
import numpy as np
import torch
from gtts import gTTS
from gtts.tts import gTTSError
from huggingface_hub import login
from PIL import Image
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BlipForConditionalGeneration,
    BlipProcessor,
)

# Hugging Face token (read from the environment)
hf_token = os.getenv("HUGGINGFACE_TOKEN2")
if hf_token:
    login(token=hf_token)

# YOLOv5 object-detection model
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')


# Compute the GLCM and its contrast
def calculate_glcm_contrast(image):
    # PIL images are RGB, so convert from RGB (not BGR) to grayscale
    gray_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    max_value = int(gray_image.max()) + 1
    glcm = np.zeros((max_value, max_value), dtype=np.float64)

    # Co-occurrence of each pixel with its diagonal neighbor
    for i in range(gray_image.shape[0] - 1):
        for j in range(gray_image.shape[1] - 1):
            x = gray_image[i, j]
            y = gray_image[i + 1, j + 1]
            glcm[x, y] += 1

    glcm = glcm / glcm.sum()

    contrast = 0.0
    for i in range(max_value):
        for j in range(max_value):
            contrast += (i - j) ** 2 * glcm[i, j]

    return contrast


# Analyze texture and color temperature
def analyze_image_properties(image):
    # Color analysis (mean RGB); the PIL image is already in RGB order
    image_rgb = np.array(image)
    avg_color_per_row = np.average(image_rgb, axis=0)
    avg_color = np.average(avg_color_per_row, axis=0)

    # Determine color temperature
    if avg_color[0] > avg_color[2]:  # more red than blue
        temperature = 'quente'
    else:
        temperature = 'fria'

    # Texture analysis
    texture_contrast = calculate_glcm_contrast(image)
    texture = 'lisa' if texture_contrast < 100 else 'texturizada'

    return temperature, texture


# Describe the image using BLIP
def describe_image(image):
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    inputs = processor(image, return_tensors="pt")
    out = blip_model.generate(**inputs)
    description = processor.decode(out[0], skip_special_tokens=True)
    return description


# Translate the description to Portuguese
def translate_description(description):
    model_name = 'Helsinki-NLP/opus-mt-tc-big-en-pt'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    translation_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    translated = translation_model.generate(**tokenizer(description, return_tensors="pt", padding=True))
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text


# Process the image and generate the spoken output
def process_image(image):
    # Object detection
    results = model(image)
    detected_image = results.render()[0]

    # Texture and color-temperature analysis
    temperature, texture = analyze_image_properties(image)

    # Image description
    description = describe_image(image)
    translated_description = translate_description(description)

    # Build the final description
    final_description = (
        f"{translated_description}. A textura é {texture} e a temperatura de cor é {temperature}."
    )

    # Text to speech, retrying on transient errors (e.g. HTTP 429 "Too Many Requests")
    tts = gTTS(text=final_description, lang='pt')
    attempts = 0
    while attempts < 5:
        try:
            tts.save("output.mp3")
            break
        except gTTSError:
            print("gTTS request failed. Waiting before retrying...")
            time.sleep(5)
            attempts += 1

    # Outputs: annotated image, text description, and audio file
    return Image.fromarray(detected_image), final_description, "output.mp3"


example_image_path = "example1.JPG"

# Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Image(type="pil"), gr.Textbox(), gr.Audio(type="filepath")],
    examples=[example_image_path],
)

iface.launch()