"""Gradio app: extract text from an uploaded PDF, chunk it by whitespace
tokens, embed each chunk with OpenAI, and upsert the vectors into a
Pinecone serverless index named 'ghana'."""

import gradio as gr
import PyPDF2
import io
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import uuid
import re

# Load environment variables from .env file
load_dotenv()

# Initialize OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Pinecone configuration
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
INDEX_NAME = "ghana"
EMBEDDING_MODEL = "text-embedding-3-large"
EMBEDDING_DIMENSION = 3072  # output dimension of text-embedding-3-large

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index if it does not exist yet
if INDEX_NAME not in pc.list_indexes().names():
    # PINECONE_ENVIRONMENT is expected as '<cloud>-<region>', e.g. 'aws-us-east-1'.
    if not PINECONE_ENVIRONMENT or '-' not in PINECONE_ENVIRONMENT:
        raise ValueError(
            "PINECONE_ENVIRONMENT must be set in the form '<cloud>-<region>', "
            "e.g. 'aws-us-east-1'."
        )
    # Split at the FIRST hyphen only, so multi-part region names such as
    # 'us-east-1' survive intact (a plain split('-')[1] would truncate them).
    cloud, _, region = PINECONE_ENVIRONMENT.partition('-')
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud=cloud, region=region),
    )
else:
    # Verify the existing index's dimension matches the embedding model's output
    existing_index = pc.describe_index(INDEX_NAME)
    if existing_index.dimension != EMBEDDING_DIMENSION:
        raise ValueError(
            f"Existing index '{INDEX_NAME}' has dimension "
            f"{existing_index.dimension}, expected {EMBEDDING_DIMENSION}. "
            "Please choose a different index name or adjust accordingly."
        )

# Connect to the Pinecone index
index = pc.Index(INDEX_NAME)


def transcribe_pdf(pdf_file):
    """Extract text from a PDF, chunk it, embed the chunks, and upsert them.

    Args:
        pdf_file: Raw PDF bytes (Gradio ``type="binary"`` upload).

    Returns:
        A human-readable status message describing the outcome.
    """
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
    # Collect per-page text; extract_text() may return None/'' for image-only pages.
    pages = [p for p in (page.extract_text() for page in pdf_reader.pages) if p]
    text = "".join(p + "\n" for p in pages)

    if not text.strip():
        # Avoid calling the embeddings API with an empty input list.
        return "No extractable text found in the PDF; nothing was upserted."

    # Dynamic chunking
    chunks = dynamic_chunking(text, max_tokens=500, overlap=50)
    if not chunks:
        return "No chunks produced from the PDF text; nothing was upserted."

    # Generate embeddings for each chunk
    embeddings = get_embeddings(chunks)

    # Pair each chunk with a fresh UUID and its embedding vector
    upsert_data = [
        (str(uuid.uuid4()), emb, {"text": chunk})
        for chunk, emb in zip(chunks, embeddings)
    ]
    index.upsert(vectors=upsert_data)

    return f"Successfully upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."


def dynamic_chunking(text, max_tokens=500, overlap=50):
    """Split *text* into chunks of at most *max_tokens* whitespace tokens.

    Consecutive chunks share *overlap* tokens so context is not lost at
    chunk boundaries.

    Args:
        text: Input text to split.
        max_tokens: Maximum number of tokens per chunk.
        overlap: Tokens repeated at the start of each subsequent chunk.

    Returns:
        List of chunk strings; empty list for empty/whitespace-only input.

    Raises:
        ValueError: If ``overlap >= max_tokens`` (the window would never
            advance, which previously caused an infinite loop).
    """
    if overlap >= max_tokens:
        raise ValueError("overlap must be smaller than max_tokens")
    # Simple tokenization on runs of non-whitespace
    tokens = re.findall(r'\S+', text)
    step = max_tokens - overlap
    return [
        ' '.join(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), step)
    ]


def get_embeddings(chunks):
    """Generate embeddings for each chunk using OpenAI's embedding API.

    Args:
        chunks: Non-empty list of text chunks.

    Returns:
        List of embedding vectors (one list of floats per chunk), in the
        same order as *chunks*.
    """
    response = client.embeddings.create(
        input=chunks,
        model=EMBEDDING_MODEL,
    )
    return [data.embedding for data in response.data]


iface = gr.Interface(
    fn=transcribe_pdf,
    inputs=gr.File(label="Upload PDF", type="binary"),
    outputs=gr.Textbox(label="Transcription"),
    title="PDF Transcription and Upsert to Pinecone",
    description="Upload a PDF file to extract its text content, chunk it dynamically, and upsert the chunks to a Pinecone index named 'ghana'.",
)

if __name__ == "__main__":
    iface.launch()