import os
import time

import cv2
import gradio as gr
import numpy as np
import torch
from gtts import gTTS
from gtts.tts import gTTSError
from huggingface_hub import login
from PIL import Image
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BlipForConditionalGeneration,
    BlipProcessor,
)

# Hugging Face token (read from the environment)
hf_token = os.getenv("HUGGINGFACE_TOKEN2")
if hf_token:
    login(token=hf_token)

# YOLOv5 object-detection model
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')


# Compute the GLCM and its contrast
def calculate_glcm_contrast(image):
    # PIL images are RGB, so convert from RGB (not BGR) to grayscale
    gray_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    max_value = int(gray_image.max()) + 1
    glcm = np.zeros((max_value, max_value), dtype=np.float64)

    # Co-occurrence of each pixel with its diagonal neighbor
    for i in range(gray_image.shape[0] - 1):
        for j in range(gray_image.shape[1] - 1):
            x = gray_image[i, j]
            y = gray_image[i + 1, j + 1]
            glcm[x, y] += 1

    glcm = glcm / glcm.sum()

    contrast = 0.0
    for i in range(max_value):
        for j in range(max_value):
            contrast += (i - j) ** 2 * glcm[i, j]

    return contrast


# Analyze texture and color temperature
def analyze_image_properties(image):
    # Color analysis (mean RGB); the PIL image is already in RGB order
    image_rgb = np.array(image)
    avg_color_per_row = np.average(image_rgb, axis=0)
    avg_color = np.average(avg_color_per_row, axis=0)

    # Determine color temperature
    if avg_color[0] > avg_color[2]:  # more red than blue
        temperature = 'quente'
    else:
        temperature = 'fria'

    # Texture analysis
    texture_contrast = calculate_glcm_contrast(image)
    texture = 'lisa' if texture_contrast < 100 else 'texturizada'

    return temperature, texture


# Describe the image using BLIP
def describe_image(image):
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    inputs = processor(image, return_tensors="pt")
    out = blip_model.generate(**inputs)
    description = processor.decode(out[0], skip_special_tokens=True)
    return description


# Translate the description to Portuguese
def translate_description(description):
    model_name = 'Helsinki-NLP/opus-mt-tc-big-en-pt'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    translation_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    translated = translation_model.generate(**tokenizer(description, return_tensors="pt", padding=True))
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text


# Process the image and generate the spoken output
def process_image(image):
    # Object detection
    results = model(image)
    detected_image = results.render()[0]

    # Texture and color-temperature analysis
    temperature, texture = analyze_image_properties(image)

    # Image description
    description = describe_image(image)
    translated_description = translate_description(description)

    # Build the final description
    final_description = (
        f"{translated_description}. A textura é {texture} e a temperatura de cor é {temperature}."
    )

    # Text to speech, retrying on transient errors (e.g. HTTP 429 "Too Many Requests")
    tts = gTTS(text=final_description, lang='pt')
    attempts = 0
    while attempts < 5:
        try:
            tts.save("output.mp3")
            break
        except gTTSError:
            print("gTTS request failed. Waiting before retrying...")
            time.sleep(5)
            attempts += 1

    # Outputs: annotated image, text description, and audio file
    return Image.fromarray(detected_image), final_description, "output.mp3"


example_image_path = "example1.JPG"

# Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Image(type="pil"), gr.Textbox(), gr.Audio(type="filepath")],
    examples=[example_image_path],
)

iface.launch()