poemsforaphrodite committed on
Commit 03ccacf · verified · 1 Parent(s): 6c21711

Update app.py

Files changed (1)
  1. app.py +25 -17
app.py CHANGED
@@ -3,7 +3,7 @@ import PyPDF2
 import io
 import os
 from dotenv import load_dotenv
-import pinecone
+from pinecone import Pinecone, ServerlessSpec
 import openai
 import uuid
 import re
@@ -18,24 +18,32 @@ openai.api_key = os.getenv("OPENAI_API_KEY")
 PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
 PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
 INDEX_NAME = "ghana"
-EMBEDDING_MODEL = "text-embedding-3-large" # Updated model name
-EMBEDDING_DIMENSION = 3072 # Updated dimension
+EMBEDDING_MODEL = "text-embedding-3-large"
+EMBEDDING_DIMENSION = 3072
 
 # Initialize Pinecone
-pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
+pc = Pinecone(api_key=PINECONE_API_KEY)
 
 # Check if the index exists
-if INDEX_NAME not in pinecone.list_indexes():
+if INDEX_NAME not in pc.list_indexes().names():
     # Create the index with updated dimensions
-    pinecone.create_index(name=INDEX_NAME, dimension=EMBEDDING_DIMENSION, metric="cosine")
+    pc.create_index(
+        name=INDEX_NAME,
+        dimension=EMBEDDING_DIMENSION,
+        metric="cosine",
+        spec=ServerlessSpec(
+            cloud=PINECONE_ENVIRONMENT.split('-')[0],  # Assuming environment is in format 'gcp-starter'
+            region=PINECONE_ENVIRONMENT.split('-')[1]
+        )
+    )
 else:
     # Optionally, verify the existing index's dimension matches
-    existing_index = pinecone.describe_index(INDEX_NAME)
+    existing_index = pc.describe_index(INDEX_NAME)
     if existing_index.dimension != EMBEDDING_DIMENSION:
         raise ValueError(f"Existing index '{INDEX_NAME}' has dimension {existing_index.dimension}, expected {EMBEDDING_DIMENSION}. Please choose a different index name or adjust accordingly.")
-
+
 # Connect to the Pinecone index
-index = pinecone.Index(INDEX_NAME)
+index = pc.Index(INDEX_NAME)
 
 def transcribe_pdf(pdf_file):
     # Read PDF and extract text
@@ -45,22 +53,22 @@ def transcribe_pdf(pdf_file):
         page_text = page.extract_text()
         if page_text:
             text += page_text + "\n"
-
+
     # Dynamic Chunking
     chunks = dynamic_chunking(text, max_tokens=500, overlap=50)
-
+
    # Generate embeddings for each chunk
    embeddings = get_embeddings(chunks)
-
+
    # Prepare upsert data
    upsert_data = [
-        {"id": str(uuid.uuid4()), "values": emb, "metadata": {"text": chunk}}
+        (str(uuid.uuid4()), emb, {"text": chunk})
        for chunk, emb in zip(chunks, embeddings)
    ]
-
+
    # Upsert to Pinecone
    index.upsert(vectors=upsert_data)
-
+
    return f"Successfully upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."
 
 def dynamic_chunking(text, max_tokens=500, overlap=50):
@@ -84,7 +92,7 @@ def get_embeddings(chunks):
     """
     response = openai.Embedding.create(
         input=chunks,
-        model=EMBEDDING_MODEL # Updated model
+        model=EMBEDDING_MODEL
     )
     embeddings = [record['embedding'] for record in response['data']]
     return embeddings
@@ -98,4 +106,4 @@ iface = gr.Interface(
 )
 
 if __name__ == "__main__":
-    iface.launch()
+    iface.launch()