Spaces:
Sleeping
Sleeping
poemsforaphrodite
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -3,7 +3,7 @@ import PyPDF2
|
|
3 |
import io
|
4 |
import os
|
5 |
from dotenv import load_dotenv
|
6 |
-
import
|
7 |
import openai
|
8 |
import uuid
|
9 |
import re
|
@@ -18,24 +18,32 @@ openai.api_key = os.getenv("OPENAI_API_KEY")
|
|
18 |
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
|
19 |
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
|
20 |
INDEX_NAME = "ghana"
|
21 |
-
EMBEDDING_MODEL = "text-embedding-3-large"
|
22 |
-
EMBEDDING_DIMENSION = 3072
|
23 |
|
24 |
# Initialize Pinecone
|
25 |
-
|
26 |
|
27 |
# Check if the index exists
|
28 |
-
if INDEX_NAME not in
|
29 |
# Create the index with updated dimensions
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
else:
|
32 |
# Optionally, verify the existing index's dimension matches
|
33 |
-
existing_index =
|
34 |
if existing_index.dimension != EMBEDDING_DIMENSION:
|
35 |
raise ValueError(f"Existing index '{INDEX_NAME}' has dimension {existing_index.dimension}, expected {EMBEDDING_DIMENSION}. Please choose a different index name or adjust accordingly.")
|
36 |
-
|
37 |
# Connect to the Pinecone index
|
38 |
-
index =
|
39 |
|
40 |
def transcribe_pdf(pdf_file):
|
41 |
# Read PDF and extract text
|
@@ -45,22 +53,22 @@ def transcribe_pdf(pdf_file):
|
|
45 |
page_text = page.extract_text()
|
46 |
if page_text:
|
47 |
text += page_text + "\n"
|
48 |
-
|
49 |
# Dynamic Chunking
|
50 |
chunks = dynamic_chunking(text, max_tokens=500, overlap=50)
|
51 |
-
|
52 |
# Generate embeddings for each chunk
|
53 |
embeddings = get_embeddings(chunks)
|
54 |
-
|
55 |
# Prepare upsert data
|
56 |
upsert_data = [
|
57 |
-
|
58 |
for chunk, emb in zip(chunks, embeddings)
|
59 |
]
|
60 |
-
|
61 |
# Upsert to Pinecone
|
62 |
index.upsert(vectors=upsert_data)
|
63 |
-
|
64 |
return f"Successfully upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."
|
65 |
|
66 |
def dynamic_chunking(text, max_tokens=500, overlap=50):
|
@@ -84,7 +92,7 @@ def get_embeddings(chunks):
|
|
84 |
"""
|
85 |
response = openai.Embedding.create(
|
86 |
input=chunks,
|
87 |
-
model=EMBEDDING_MODEL
|
88 |
)
|
89 |
embeddings = [record['embedding'] for record in response['data']]
|
90 |
return embeddings
|
@@ -98,4 +106,4 @@ iface = gr.Interface(
|
|
98 |
)
|
99 |
|
100 |
if __name__ == "__main__":
|
101 |
-
iface.launch()
|
|
|
3 |
import io
|
4 |
import os
|
5 |
from dotenv import load_dotenv
|
6 |
+
from pinecone import Pinecone, ServerlessSpec
|
7 |
import openai
|
8 |
import uuid
|
9 |
import re
|
|
|
18 |
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
|
19 |
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
|
20 |
INDEX_NAME = "ghana"
|
21 |
+
EMBEDDING_MODEL = "text-embedding-3-large"
|
22 |
+
EMBEDDING_DIMENSION = 3072
|
23 |
|
24 |
# Initialize Pinecone
|
25 |
+
pc = Pinecone(api_key=PINECONE_API_KEY)
|
26 |
|
27 |
# Check if the index exists
|
28 |
+
if INDEX_NAME not in pc.list_indexes().names():
|
29 |
# Create the index with updated dimensions
|
30 |
+
pc.create_index(
|
31 |
+
name=INDEX_NAME,
|
32 |
+
dimension=EMBEDDING_DIMENSION,
|
33 |
+
metric="cosine",
|
34 |
+
spec=ServerlessSpec(
|
35 |
+
cloud=PINECONE_ENVIRONMENT.split('-')[0], # Assuming environment is in format 'gcp-starter'
|
36 |
+
region=PINECONE_ENVIRONMENT.split('-')[1]
|
37 |
+
)
|
38 |
+
)
|
39 |
else:
|
40 |
# Optionally, verify the existing index's dimension matches
|
41 |
+
existing_index = pc.describe_index(INDEX_NAME)
|
42 |
if existing_index.dimension != EMBEDDING_DIMENSION:
|
43 |
raise ValueError(f"Existing index '{INDEX_NAME}' has dimension {existing_index.dimension}, expected {EMBEDDING_DIMENSION}. Please choose a different index name or adjust accordingly.")
|
44 |
+
|
45 |
# Connect to the Pinecone index
|
46 |
+
index = pc.Index(INDEX_NAME)
|
47 |
|
48 |
def transcribe_pdf(pdf_file):
|
49 |
# Read PDF and extract text
|
|
|
53 |
page_text = page.extract_text()
|
54 |
if page_text:
|
55 |
text += page_text + "\n"
|
56 |
+
|
57 |
# Dynamic Chunking
|
58 |
chunks = dynamic_chunking(text, max_tokens=500, overlap=50)
|
59 |
+
|
60 |
# Generate embeddings for each chunk
|
61 |
embeddings = get_embeddings(chunks)
|
62 |
+
|
63 |
# Prepare upsert data
|
64 |
upsert_data = [
|
65 |
+
(str(uuid.uuid4()), emb, {"text": chunk})
|
66 |
for chunk, emb in zip(chunks, embeddings)
|
67 |
]
|
68 |
+
|
69 |
# Upsert to Pinecone
|
70 |
index.upsert(vectors=upsert_data)
|
71 |
+
|
72 |
return f"Successfully upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."
|
73 |
|
74 |
def dynamic_chunking(text, max_tokens=500, overlap=50):
|
|
|
92 |
"""
|
93 |
response = openai.Embedding.create(
|
94 |
input=chunks,
|
95 |
+
model=EMBEDDING_MODEL
|
96 |
)
|
97 |
embeddings = [record['embedding'] for record in response['data']]
|
98 |
return embeddings
|
|
|
106 |
)
|
107 |
|
108 |
if __name__ == "__main__":
|
109 |
+
iface.launch()
|