Spaces:
Sleeping
Sleeping
poemsforaphrodite
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -3,7 +3,7 @@ import PyPDF2
|
|
3 |
import io
|
4 |
import os
|
5 |
from dotenv import load_dotenv
|
6 |
-
import
|
7 |
import openai
|
8 |
import uuid
|
9 |
import re
|
@@ -18,24 +18,32 @@ openai.api_key = os.getenv("OPENAI_API_KEY")
|
|
18 |
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
|
19 |
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
|
20 |
INDEX_NAME = "ghana"
|
21 |
-
EMBEDDING_MODEL = "text-embedding-3-large"
|
22 |
-
EMBEDDING_DIMENSION = 3072
|
23 |
|
24 |
# Initialize Pinecone
|
25 |
-
|
26 |
|
27 |
# Check if the index exists
|
28 |
-
if INDEX_NAME not in
|
29 |
# Create the index with updated dimensions
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
else:
|
32 |
# Optionally, verify the existing index's dimension matches
|
33 |
-
existing_index =
|
34 |
if existing_index.dimension != EMBEDDING_DIMENSION:
|
35 |
raise ValueError(f"Existing index '{INDEX_NAME}' has dimension {existing_index.dimension}, expected {EMBEDDING_DIMENSION}. Please choose a different index name or adjust accordingly.")
|
36 |
-
|
37 |
# Connect to the Pinecone index
|
38 |
-
index =
|
39 |
|
40 |
def transcribe_pdf(pdf_file):
|
41 |
# Read PDF and extract text
|
@@ -45,22 +53,22 @@ def transcribe_pdf(pdf_file):
|
|
45 |
page_text = page.extract_text()
|
46 |
if page_text:
|
47 |
text += page_text + "\n"
|
48 |
-
|
49 |
# Dynamic Chunking
|
50 |
chunks = dynamic_chunking(text, max_tokens=500, overlap=50)
|
51 |
-
|
52 |
# Generate embeddings for each chunk
|
53 |
embeddings = get_embeddings(chunks)
|
54 |
-
|
55 |
# Prepare upsert data
|
56 |
upsert_data = [
|
57 |
-
|
58 |
for chunk, emb in zip(chunks, embeddings)
|
59 |
]
|
60 |
-
|
61 |
# Upsert to Pinecone
|
62 |
index.upsert(vectors=upsert_data)
|
63 |
-
|
64 |
return f"Successfully upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."
|
65 |
|
66 |
def dynamic_chunking(text, max_tokens=500, overlap=50):
|
@@ -84,7 +92,7 @@ def get_embeddings(chunks):
|
|
84 |
"""
|
85 |
response = openai.Embedding.create(
|
86 |
input=chunks,
|
87 |
-
model=EMBEDDING_MODEL
|
88 |
)
|
89 |
embeddings = [record['embedding'] for record in response['data']]
|
90 |
return embeddings
|
@@ -98,4 +106,4 @@ iface = gr.Interface(
|
|
98 |
)
|
99 |
|
100 |
if __name__ == "__main__":
|
101 |
-
iface.launch()
|
|
|
3 |
import io
|
4 |
import os
|
5 |
from dotenv import load_dotenv
|
6 |
+
from pinecone import Pinecone, ServerlessSpec
|
7 |
import openai
|
8 |
import uuid
|
9 |
import re
|
|
|
18 |
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
|
19 |
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
|
20 |
INDEX_NAME = "ghana"
|
21 |
+
EMBEDDING_MODEL = "text-embedding-3-large"
|
22 |
+
EMBEDDING_DIMENSION = 3072
|
23 |
|
24 |
# Initialize Pinecone
|
25 |
+
pc = Pinecone(api_key=PINECONE_API_KEY)
|
26 |
|
27 |
# Check if the index exists
|
28 |
+
if INDEX_NAME not in pc.list_indexes().names():
|
29 |
# Create the index with updated dimensions
|
30 |
+
pc.create_index(
|
31 |
+
name=INDEX_NAME,
|
32 |
+
dimension=EMBEDDING_DIMENSION,
|
33 |
+
metric="cosine",
|
34 |
+
spec=ServerlessSpec(
|
35 |
+
cloud=PINECONE_ENVIRONMENT.split('-')[0], # Assuming environment is in format 'gcp-starter'
|
36 |
+
region=PINECONE_ENVIRONMENT.split('-')[1]
|
37 |
+
)
|
38 |
+
)
|
39 |
else:
|
40 |
# Optionally, verify the existing index's dimension matches
|
41 |
+
existing_index = pc.describe_index(INDEX_NAME)
|
42 |
if existing_index.dimension != EMBEDDING_DIMENSION:
|
43 |
raise ValueError(f"Existing index '{INDEX_NAME}' has dimension {existing_index.dimension}, expected {EMBEDDING_DIMENSION}. Please choose a different index name or adjust accordingly.")
|
44 |
+
|
45 |
# Connect to the Pinecone index
|
46 |
+
index = pc.Index(INDEX_NAME)
|
47 |
|
48 |
def transcribe_pdf(pdf_file):
|
49 |
# Read PDF and extract text
|
|
|
53 |
page_text = page.extract_text()
|
54 |
if page_text:
|
55 |
text += page_text + "\n"
|
56 |
+
|
57 |
# Dynamic Chunking
|
58 |
chunks = dynamic_chunking(text, max_tokens=500, overlap=50)
|
59 |
+
|
60 |
# Generate embeddings for each chunk
|
61 |
embeddings = get_embeddings(chunks)
|
62 |
+
|
63 |
# Prepare upsert data
|
64 |
upsert_data = [
|
65 |
+
(str(uuid.uuid4()), emb, {"text": chunk})
|
66 |
for chunk, emb in zip(chunks, embeddings)
|
67 |
]
|
68 |
+
|
69 |
# Upsert to Pinecone
|
70 |
index.upsert(vectors=upsert_data)
|
71 |
+
|
72 |
return f"Successfully upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."
|
73 |
|
74 |
def dynamic_chunking(text, max_tokens=500, overlap=50):
|
|
|
92 |
"""
|
93 |
response = openai.Embedding.create(
|
94 |
input=chunks,
|
95 |
+
model=EMBEDDING_MODEL
|
96 |
)
|
97 |
embeddings = [record['embedding'] for record in response['data']]
|
98 |
return embeddings
|
|
|
106 |
)
|
107 |
|
108 |
if __name__ == "__main__":
|
109 |
+
iface.launch()
|