poemsforaphrodite committed on
Commit
2434cea
β€’
1 Parent(s): 1eca854

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +225 -0
app.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import PyPDF2
3
+ import io
4
+ import os
5
+ from dotenv import load_dotenv
6
+ from pinecone import Pinecone, ServerlessSpec
7
+ from openai import OpenAI
8
+ import uuid
9
+ import re
10
+ import time
11
+
12
+ # Load environment variables from .env file
13
+ load_dotenv()
14
+
15
+ # Initialize OpenAI client
16
+ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
17
+
18
+ # Initialize Pinecone
19
+ PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
20
+ PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
21
+ INDEX_NAME = "ghana"
22
+ EMBEDDING_MODEL = "text-embedding-3-large"
23
+ EMBEDDING_DIMENSION = 3072
24
+
25
+ # Initialize Pinecone
26
+ pc = Pinecone(api_key=PINECONE_API_KEY)
27
+
28
+ # Check if the index exists
29
+ if INDEX_NAME not in pc.list_indexes().names():
30
+ # Create the index with updated dimensions
31
+ pc.create_index(
32
+ name=INDEX_NAME,
33
+ dimension=EMBEDDING_DIMENSION,
34
+ metric="cosine",
35
+ spec=ServerlessSpec(
36
+ cloud=PINECONE_ENVIRONMENT.split('-')[0], # Assuming environment is in format 'gcp-starter'
37
+ region=PINECONE_ENVIRONMENT.split('-')[1]
38
+ )
39
+ )
40
+ else:
41
+ # Optionally, verify the existing index's dimension matches
42
+ existing_index = pc.describe_index(INDEX_NAME)
43
+ if existing_index.dimension != EMBEDDING_DIMENSION:
44
+ raise ValueError(f"Existing index '{INDEX_NAME}' has dimension {existing_index.dimension}, expected {EMBEDDING_DIMENSION}. Please choose a different index name or adjust accordingly.")
45
+
46
+ # Connect to the Pinecone index
47
+ index = pc.Index(INDEX_NAME)
48
+
49
def transcribe_pdf(pdf_file):
    """Extract text from a PDF, chunk it, embed the chunks and upsert them.

    Args:
        pdf_file: Raw PDF file contents as bytes.

    Returns:
        A human-readable status string summarising how many chunks were
        stored in the Pinecone index.
    """
    print("Starting PDF transcription...")
    # Read PDF and extract text page by page.
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
    text = ""
    for page in pdf_reader.pages:
        page_text = page.extract_text()
        if page_text:  # extract_text() can return None/'' for image-only pages
            text += page_text + "\n"

    print(f"Extracted {len(text)} characters from PDF.")

    # Dynamic chunking into overlapping token windows.
    chunks = dynamic_chunking(text, max_tokens=500, overlap=50)
    print(f"Created {len(chunks)} chunks from the extracted text.")

    progress_bar = st.progress(0)
    batch = []
    batch_size = 50  # Pinecone accepts batched upserts; one round trip per batch
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")

        # Generate embedding for the chunk and queue it for upsert.
        embedding = get_embedding(chunk)
        batch.append((str(uuid.uuid4()), embedding, {"text": chunk}))

        # Flush a full batch (or the final partial one) to Pinecone.
        # Batching replaces the original one-vector-per-request loop,
        # which paid a network round trip plus a 0.5 s sleep per chunk.
        if len(batch) >= batch_size or i == len(chunks) - 1:
            print(f"Upserting {len(batch)} vectors to Pinecone index '{INDEX_NAME}'...")
            index.upsert(vectors=batch)
            batch = []
            # Small delay between network batches to avoid potential rate limits.
            time.sleep(0.5)

        # Update progress bar.
        progress_bar.progress((i + 1) / len(chunks))

    progress_bar.empty()
    return f"Successfully processed and upserted {len(chunks)} chunks to Pinecone index '{INDEX_NAME}'."
+
90
def dynamic_chunking(text, max_tokens=200, overlap=100):
    """Split *text* into overlapping chunks of whitespace-delimited tokens.

    Args:
        text: The raw text to split.
        max_tokens: Maximum number of whitespace-delimited tokens per chunk.
        overlap: Number of tokens shared between consecutive chunks; must
            be smaller than max_tokens so the window always advances.

    Returns:
        A list of chunk strings (empty list for empty/whitespace-only text).

    Raises:
        ValueError: If overlap >= max_tokens. The original code looped
            forever in that case because the window step was <= 0.
    """
    if overlap >= max_tokens:
        raise ValueError(f"overlap ({overlap}) must be smaller than max_tokens ({max_tokens})")
    print(f"Starting dynamic chunking with max_tokens={max_tokens} and overlap={overlap}...")
    tokens = re.findall(r'\S+', text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_tokens
        chunks.append(' '.join(tokens[start:end]))
        if end >= len(tokens):
            # The chunk just appended already reaches the end of the text;
            # stepping further would only emit redundant sub-chunks of it.
            break
        start += max_tokens - overlap
    print(f"Dynamic chunking complete. Created {len(chunks)} chunks.")
    return chunks
+
103
def get_embedding(chunk):
    """Return the embedding vector for *chunk* via the OpenAI API.

    Args:
        chunk: Text to embed.

    Returns:
        The embedding as returned by the API (a list of floats).

    Raises:
        Exception: Re-raises whatever the OpenAI client raised, after
            logging it.
    """
    print("Generating embedding for chunk...")
    try:
        response = client.embeddings.create(
            input=chunk,  # a plain string is accepted directly
            model=EMBEDDING_MODEL
        )
        embedding = response.data[0].embedding
        print("Successfully generated embedding.")
        return embedding
    except Exception as e:
        print(f"Error during embedding generation: {str(e)}")
        # Bare `raise` is the idiomatic re-raise; it preserves the original
        # exception and traceback exactly (the old `raise e` was redundant).
        raise
+
117
def clear_database():
    """Delete every vector from the Pinecone index.

    Returns:
        A status string describing success, or the failure reason.
    """
    print("Clearing the Pinecone index...")
    try:
        index.delete(delete_all=True)
    except Exception as e:
        # Surface the failure to the UI as a message rather than raising.
        print(f"Error clearing the Pinecone index: {str(e)}")
        return f"Error clearing the Pinecone index: {str(e)}"
    return "Successfully cleared all vectors from the Pinecone index."
+
126
def query_database(query_text):
    """Answer *query_text* with retrieval-augmented generation.

    Embeds the query, fetches the top-5 nearest chunks from Pinecone,
    concatenates their stored text into a context block, and asks the
    chat model to answer from that context.

    Args:
        query_text: The user's natural-language question.

    Returns:
        The generated answer, a no-results notice, or an error message.
    """
    print(f"Querying database with: {query_text}")
    try:
        query_embedding = get_embedding(query_text)
        results = index.query(vector=query_embedding, top_k=5, include_metadata=True)

        # Stitch the retrieved chunk texts together as the model's context.
        texts = [m.get('metadata', {}).get('text', '') for m in results['matches']]
        context = "".join(f"{t}\n\n" for t in texts)

        if not context:
            return "No relevant information found in the database."

        return generate_answer(query_text, context)
    except Exception as e:
        print(f"Error querying the database: {str(e)}")
        return f"Error querying the database: {str(e)}"
+
146
def generate_answer(query, context):
    """Ask the chat model to answer *query* grounded in *context*.

    Args:
        query: The user's question.
        context: Retrieved text the model should answer from.

    Returns:
        The model's answer, or an error message string on failure.
    """
    system_msg = "You are an assistant for the Ghana Labor Act. Use the provided context to answer the user's question accurately and concisely."
    user_msg = f"Context:\n{context}\n\nQuestion: {query}"
    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": user_msg},
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        print(f"Error generating answer: {str(e)}")
        return f"Error generating answer: {str(e)}"
+
160
def generate_hr_document(prompt):
    """Generate a professional HR document from a free-form prompt.

    Args:
        prompt: Description of the HR document the user wants.

    Returns:
        The generated document text, or an error message string.
    """
    print(f"Generating HR document with prompt: {prompt}")
    messages = [
        {"role": "system", "content": "You are an HR assistant. Generate a professional HR document based on the given prompt."},
        {"role": "user", "content": prompt},
    ]
    try:
        completion = client.chat.completions.create(model="gpt-4o-mini", messages=messages)
        return completion.choices[0].message.content
    except Exception as e:
        print(f"Error generating HR document: {str(e)}")
        return f"Error generating HR document: {str(e)}"
+
175
def main():
    """Streamlit entry point: render the four-tab HR assistant UI."""
    st.set_page_config(page_title="HR Document Assistant", layout="wide")
    st.title("HR Document Assistant")

    # NOTE: the tab/button label emoji were mojibake in the committed file
    # (UTF-8 bytes decoded as Latin-1, e.g. 'πŸ“€'); restored to the intended
    # characters here.
    tab1, tab2, tab3, tab4 = st.tabs(["📤 Upload PDF", "🔍 Query Database", "📝 Generate HR Document", "🗑️ Clear Database"])

    with tab1:
        st.header("Upload PDF")
        st.write("Upload a PDF file to extract its text content, chunk it dynamically, and upsert the chunks to the Pinecone index.")
        pdf_file = st.file_uploader("Upload PDF", type="pdf")
        if st.button("📥 Transcribe and Upsert"):
            if pdf_file is not None:
                with st.spinner("Processing PDF..."):
                    result = transcribe_pdf(pdf_file.read())
                st.success(result)
            else:
                st.error("Please upload a PDF file first.")

    with tab2:
        st.header("Query Database")
        st.write("Enter a query about the Ghana Labor Act.")
        query = st.text_input("Enter your query", placeholder="What does the Act say about...?")
        if st.button("🔎 Get Answer"):
            # Guard against firing embedding/LLM requests on an empty query
            # (mirrors the empty-input check on the upload tab).
            if query.strip():
                answer = query_database(query)
                st.markdown("### Answer:")
                st.write(answer)
            else:
                st.error("Please enter a query first.")

    with tab3:
        st.header("Generate HR Document")
        st.write("Enter a prompt to generate an HR document using GPT-4.")
        prompt = st.text_area("Enter your prompt", placeholder="Describe the HR document you need...")
        if st.button("✍️ Generate Document"):
            # Same guard: don't call the chat API with an empty prompt.
            if prompt.strip():
                document = generate_hr_document(prompt)
                st.text_area("Generated Document", value=document, height=400)
            else:
                st.error("Please enter a prompt first.")

    with tab4:
        st.header("Clear Database")
        st.write("Use this option carefully. It will remove all data from the Pinecone index.")
        if st.button("🗑️ Clear Database"):
            result = clear_database()
            st.success(result)

    st.markdown("""
    ### 📌 Note
    - Ensure you have the necessary API keys set up for OpenAI and Pinecone.
    - The PDF upload process may take some time depending on the file size.
    - Generated HR documents are based on AI and may require human review.
    """)

if __name__ == "__main__":
    main()