File size: 3,392 Bytes
0d1233c
 
 
35b634b
 
03ccacf
81c2749
35b634b
 
 
 
 
 
81c2749
 
35b634b
 
 
 
 
03ccacf
 
35b634b
 
03ccacf
35b634b
 
03ccacf
35b634b
03ccacf
 
 
 
 
 
 
 
 
35b634b
 
03ccacf
35b634b
 
03ccacf
35b634b
03ccacf
0d1233c
 
35b634b
0d1233c
 
 
35b634b
 
 
03ccacf
35b634b
 
03ccacf
35b634b
 
03ccacf
35b634b
 
03ccacf
35b634b
 
03ccacf
35b634b
 
03ccacf
35b634b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81c2749
35b634b
03ccacf
35b634b
81c2749
35b634b
0d1233c
 
 
 
 
35b634b
 
0d1233c
 
 
03ccacf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import gradio as gr
import PyPDF2
import io
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import uuid
import re

# Load environment variables (API keys, Pinecone environment) from a local .env file.
load_dotenv()

# OpenAI client used to generate the chunk embeddings.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Pinecone / embedding configuration.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
INDEX_NAME = "ghana"
EMBEDDING_MODEL = "text-embedding-3-large"
EMBEDDING_DIMENSION = 3072  # vector size used to create / validate the index

# Pinecone client.
pc = Pinecone(api_key=PINECONE_API_KEY)

if INDEX_NAME not in pc.list_indexes().names():
    # The environment is only needed when the index has to be created, so it
    # is validated here rather than at import time. It is expected in the
    # form '<cloud>-<region>' (e.g. 'gcp-starter').
    if not PINECONE_ENVIRONMENT or '-' not in PINECONE_ENVIRONMENT:
        raise ValueError(
            "PINECONE_ENVIRONMENT must be set to '<cloud>-<region>' "
            f"(e.g. 'gcp-starter') to create index '{INDEX_NAME}'; "
            f"got {PINECONE_ENVIRONMENT!r}."
        )
    # Split on the FIRST hyphen only so that regions which themselves contain
    # hyphens (e.g. 'aws-us-east-1' -> 'aws', 'us-east-1') are kept intact.
    cloud, region = PINECONE_ENVIRONMENT.split('-', 1)
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud=cloud, region=region),
    )
else:
    # Refuse to reuse an index whose dimension does not match the embeddings
    # this script produces; upserts would fail later otherwise.
    existing_index = pc.describe_index(INDEX_NAME)
    if existing_index.dimension != EMBEDDING_DIMENSION:
        raise ValueError(f"Existing index '{INDEX_NAME}' has dimension {existing_index.dimension}, expected {EMBEDDING_DIMENSION}. Please choose a different index name or adjust accordingly.")

# Handle to the Pinecone index used for upserts.
index = pc.Index(INDEX_NAME)

def transcribe_pdf(pdf_file):
    """
    Extract the text of a PDF, chunk it, embed the chunks and upsert them to Pinecone.

    Args:
        pdf_file: Raw bytes of the uploaded PDF. ``None`` or empty bytes
            (nothing uploaded) is reported instead of raising.

    Returns:
        A human-readable status message describing what was upserted, or why
        nothing was.
    """
    # Nothing uploaded: report it instead of failing inside the PDF parser.
    if not pdf_file:
        return "No PDF file was provided."

    # Read the PDF and collect the text of every page that yields any.
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
    page_texts = (page.extract_text() for page in pdf_reader.pages)
    text = "".join(page_text + "\n" for page_text in page_texts if page_text)

    # Dynamic chunking
    chunks = dynamic_chunking(text, max_tokens=500, overlap=50)
    if not chunks:
        # Scanned / image-only PDFs produce no text; sending an empty list to
        # the embedding API or to Pinecone would only raise.
        return "No extractable text found in the PDF; nothing was upserted."

    # Generate one embedding per chunk
    embeddings = get_embeddings(chunks)

    # (id, vector, metadata) tuples; the chunk text is kept as metadata.
    upsert_data = [
        (str(uuid.uuid4()), emb, {"text": chunk})
        for chunk, emb in zip(chunks, embeddings)
    ]

    # Upsert in small batches rather than one request: a large PDF yields many
    # high-dimensional vectors and a single request can exceed Pinecone's
    # request-size limit. NOTE(review): 20 is a conservative guess for
    # 3072-dim vectors -- tune against the Pinecone upsert limits.
    batch_size = 20
    for offset in range(0, len(upsert_data), batch_size):
        index.upsert(vectors=upsert_data[offset:offset + batch_size])

    return f"Successfully upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."

def dynamic_chunking(text, max_tokens=500, overlap=50):
    """
    Split text into overlapping chunks of whitespace-separated tokens.

    Args:
        text: The text to split.
        max_tokens: Maximum number of whitespace-separated tokens per chunk.
            Must be positive.
        overlap: Number of trailing tokens of one chunk repeated at the start
            of the next. Must satisfy ``0 <= overlap < max_tokens``.

    Returns:
        A list of chunk strings (tokens re-joined with single spaces); empty
        if the text contains no tokens.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap`` is out of range. An
            overlap >= max_tokens would make the window never advance.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, got overlap={overlap}, max_tokens={max_tokens}"
        )

    # Simple tokenization based on whitespace
    tokens = re.findall(r'\S+', text)
    step = max_tokens - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        end = start + max_tokens
        chunks.append(' '.join(tokens[start:end]))
        # Once a chunk reaches the last token, any further window would lie
        # entirely inside this chunk's overlap region, so stop here.
        if end >= len(tokens):
            break
    return chunks

def get_embeddings(chunks):
    """
    Generate one embedding per chunk using OpenAI's embedding API.

    Args:
        chunks: List of text chunks to embed.

    Returns:
        A list of embedding vectors, in the same order as ``chunks``; an empty
        list if ``chunks`` is empty.
    """
    # The API rejects an empty input list, so short-circuit.
    if not chunks:
        return []

    # Send the chunks in batches: the embeddings endpoint limits how many
    # inputs / tokens one request may carry.
    # NOTE(review): 100 chunks per request is a conservative guess -- confirm
    # against the current OpenAI limits for the chosen model.
    batch_size = 100
    embeddings = []
    for offset in range(0, len(chunks), batch_size):
        response = client.embeddings.create(
            input=chunks[offset:offset + batch_size],
            model=EMBEDDING_MODEL
        )
        # Each result carries the index of the input it belongs to; sort on it
        # so the output order never depends on response ordering.
        ordered = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered)
    return embeddings

# Gradio UI: a single file upload (delivered to the handler as raw bytes)
# and a text box showing the status message returned by transcribe_pdf.
pdf_upload = gr.File(label="Upload PDF", type="binary")
status_box = gr.Textbox(label="Transcription")

iface = gr.Interface(
    fn=transcribe_pdf,
    inputs=pdf_upload,
    outputs=status_box,
    title="PDF Transcription and Upsert to Pinecone",
    description="Upload a PDF file to extract its text content, chunk it dynamically, and upsert the chunks to a Pinecone index named 'ghana'.",
)

# Start the web app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()