import gradio as gr
import PyPDF2
import io
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import uuid
import re
# Load environment variables from .env file
load_dotenv()
# Initialize OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Pinecone configuration
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
INDEX_NAME = "ghana"
EMBEDDING_MODEL = "text-embedding-3-large"
EMBEDDING_DIMENSION = 3072
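# Note: text-embedding-3-large returns 3072-dimensional vectors by default, so the
# index dimension must match EMBEDDING_DIMENSION or upserts will be rejected.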
# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)
# Check if the index exists
if INDEX_NAME not in pc.list_indexes().names():
    # Create the index with updated dimensions
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(
            cloud=PINECONE_ENVIRONMENT.split('-')[0],  # Assumes environment is in the form 'gcp-starter'
            region=PINECONE_ENVIRONMENT.split('-')[1]
        )
    )
else:
    # Optionally, verify that the existing index's dimension matches
    existing_index = pc.describe_index(INDEX_NAME)
    if existing_index.dimension != EMBEDDING_DIMENSION:
        raise ValueError(
            f"Existing index '{INDEX_NAME}' has dimension {existing_index.dimension}, "
            f"expected {EMBEDDING_DIMENSION}. Please choose a different index name or adjust accordingly."
        )
# Connect to the Pinecone index
index = pc.Index(INDEX_NAME)
def transcribe_pdf(pdf_file):
    # Read the PDF and extract text from each page
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
    text = ""
    for page in pdf_reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"

    # Split the text into overlapping chunks
    chunks = dynamic_chunking(text, max_tokens=500, overlap=50)

    # Generate embeddings for each chunk
    embeddings = get_embeddings(chunks)

    # Prepare upsert data as (id, vector, metadata) tuples
    upsert_data = [
        (str(uuid.uuid4()), emb, {"text": chunk})
        for chunk, emb in zip(chunks, embeddings)
    ]
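    # Note: this upserts every vector in a single request; for very large PDFs it may be
    # safer to send smaller batches (Pinecone's docs suggest on the order of 100 vectors
    # per request) to stay within per-request size limits.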
    # Upsert to Pinecone
    index.upsert(vectors=upsert_data)

    return f"Successfully upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."
def dynamic_chunking(text, max_tokens=500, overlap=50):
    """
    Splits text into chunks with a maximum number of tokens and a specified overlap.
    """
    # Simple whitespace tokenization (approximate; not the embedding model's tokenizer)
    tokens = re.findall(r'\S+', text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_tokens
        chunk = ' '.join(tokens[start:end])
        chunks.append(chunk)
        start += max_tokens - overlap
    return chunks
def get_embeddings(chunks):
    """
    Generates embeddings for each chunk using OpenAI's embedding API.
    """
    response = client.embeddings.create(
        input=chunks,
        model=EMBEDDING_MODEL
    )
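    # response.data corresponds positionally to the input list (each item also carries
    # an .index field), so the embeddings below line up with `chunks`.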
    embeddings = [data.embedding for data in response.data]
    return embeddings
iface = gr.Interface(
    fn=transcribe_pdf,
    inputs=gr.File(label="Upload PDF", type="binary"),
    outputs=gr.Textbox(label="Upsert Status"),
    title="PDF Transcription and Upsert to Pinecone",
    description="Upload a PDF file to extract its text content, chunk it dynamically, and upsert the chunks to a Pinecone index named 'ghana'."
)
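# gr.File(type="binary") passes the uploaded file to transcribe_pdf as raw bytes,
# which io.BytesIO wraps so PyPDF2 can read it.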
if __name__ == "__main__":
    iface.launch()