Spaces:
Running
Running
Nagesh Muralidhar
committed on
Commit
·
f6b7a05
1
Parent(s):
32402f2
Focus
Browse files
- server/__pycache__/agents.cpython-311.pyc +0 -0
- server/__pycache__/main.cpython-311.pyc +0 -0
- server/__pycache__/workflow.cpython-311.pyc +0 -0
- server/logs/agents.log +0 -0
- server/main.py +154 -47
- server/transcripts/podcasts.json +0 -1
- server/utils.py +15 -4
server/__pycache__/agents.cpython-311.pyc
CHANGED
Binary files a/server/__pycache__/agents.cpython-311.pyc and b/server/__pycache__/agents.cpython-311.pyc differ
|
|
server/__pycache__/main.cpython-311.pyc
CHANGED
Binary files a/server/__pycache__/main.cpython-311.pyc and b/server/__pycache__/main.cpython-311.pyc differ
|
|
server/__pycache__/workflow.cpython-311.pyc
CHANGED
Binary files a/server/__pycache__/workflow.cpython-311.pyc and b/server/__pycache__/workflow.cpython-311.pyc differ
|
|
server/logs/agents.log
ADDED
The diff for this file is too large to render.
See raw diff
|
|
server/main.py
CHANGED
@@ -338,6 +338,27 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
|
|
338 |
"""Handle chat messages for a specific podcast."""
|
339 |
try:
|
340 |
logger.info(f"Processing chat message for podcast {podcast_id}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
|
342 |
# Path to transcripts file
|
343 |
transcripts_file = os.path.join(os.path.dirname(__file__), "transcripts", "podcasts.json")
|
@@ -352,50 +373,74 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
|
|
352 |
with open(transcripts_file, 'r') as f:
|
353 |
transcripts = json.load(f)
|
354 |
logger.info(f"Loaded {len(transcripts)} transcripts")
|
|
|
355 |
except json.JSONDecodeError as e:
|
356 |
logger.error(f"Error decoding transcripts file: {str(e)}")
|
357 |
raise HTTPException(status_code=500, detail="Error reading transcripts file")
|
358 |
-
|
359 |
-
#
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
logger.info(f"Found transcript for podcast {podcast_id}")
|
377 |
-
logger.debug(f"Transcript content: {podcast_transcript[:200]}...") # Log first 200 chars
|
378 |
-
except (IndexError, KeyError) as e:
|
379 |
-
logger.error(f"Error accessing podcast transcript: {str(e)}")
|
380 |
-
raise HTTPException(status_code=404, detail="Transcript not found for this podcast")
|
381 |
|
382 |
# Split text into chunks
|
383 |
text_splitter = RecursiveCharacterTextSplitter(
|
384 |
-
chunk_size=
|
385 |
-
chunk_overlap=
|
386 |
length_function=len,
|
|
|
387 |
)
|
388 |
|
389 |
# Use split_text for strings instead of split_documents
|
390 |
-
|
391 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
392 |
|
393 |
if not chunks:
|
394 |
logger.error("No content chunks found in transcript")
|
395 |
raise HTTPException(status_code=404, detail="No content chunks found in transcript")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
396 |
|
397 |
# Initialize embedding model
|
398 |
-
embedding_model = OpenAIEmbeddings(
|
|
|
|
|
|
|
399 |
|
400 |
# Create a unique collection name for this podcast
|
401 |
collection_name = f"podcast_{podcast_id}"
|
@@ -411,40 +456,94 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
|
|
411 |
|
412 |
# Configure the retriever with search parameters
|
413 |
qdrant_retriever = vectorstore.as_retriever(
|
414 |
-
search_type="similarity",
|
415 |
-
search_kwargs={
|
|
|
|
|
|
|
416 |
)
|
417 |
|
418 |
base_rag_prompt_template = """\
|
419 |
You are a helpful podcast assistant. Answer the user's question based on the provided context from the podcast transcript.
|
420 |
-
If
|
|
|
421 |
Keep your responses concise and focused on the question.
|
|
|
|
|
422 |
|
423 |
Context:
|
424 |
{context}
|
425 |
|
426 |
Question:
|
427 |
{question}
|
|
|
|
|
428 |
"""
|
429 |
|
430 |
base_rag_prompt = ChatPromptTemplate.from_template(base_rag_prompt_template)
|
431 |
-
base_llm = ChatOpenAI(
|
|
|
|
|
|
|
|
|
432 |
|
433 |
# Create the RAG chain
|
434 |
def format_docs(docs):
|
435 |
-
|
|
|
|
|
436 |
|
437 |
# Add logging for the retrieved documents and final prompt
|
438 |
def get_context_and_log(input_dict):
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
448 |
|
449 |
# Create the chain
|
450 |
chain = (
|
@@ -454,11 +553,19 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
|
|
454 |
| base_llm
|
455 |
)
|
456 |
|
457 |
-
# Get response
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
462 |
|
463 |
except HTTPException:
|
464 |
raise
|
|
|
338 |
"""Handle chat messages for a specific podcast."""
|
339 |
try:
|
340 |
logger.info(f"Processing chat message for podcast {podcast_id}")
|
341 |
+
logger.info(f"User message: {request.message}")
|
342 |
+
|
343 |
+
# Get list of audio files
|
344 |
+
audio_files = [f for f in os.listdir(audio_dir) if f.endswith('.mp3')]
|
345 |
+
logger.info(f"Found {len(audio_files)} audio files: {audio_files}")
|
346 |
+
|
347 |
+
# Convert podcast_id to zero-based index and get the filename
|
348 |
+
try:
|
349 |
+
podcast_index = int(podcast_id) - 1
|
350 |
+
if podcast_index < 0 or podcast_index >= len(audio_files):
|
351 |
+
logger.error(f"Invalid podcast index: {podcast_index} (total files: {len(audio_files)})")
|
352 |
+
raise ValueError(f"Invalid podcast ID: {podcast_id}")
|
353 |
+
podcast_filename = audio_files[podcast_index]
|
354 |
+
logger.info(f"Found podcast file: {podcast_filename}")
|
355 |
+
except ValueError as e:
|
356 |
+
logger.error(f"Error converting podcast ID: {str(e)}")
|
357 |
+
raise HTTPException(status_code=404, detail=str(e))
|
358 |
+
|
359 |
+
# Extract topic from filename
|
360 |
+
topic = podcast_filename.split('-')[0].replace('_', ' ')
|
361 |
+
logger.info(f"Extracted topic: {topic}")
|
362 |
|
363 |
# Path to transcripts file
|
364 |
transcripts_file = os.path.join(os.path.dirname(__file__), "transcripts", "podcasts.json")
|
|
|
373 |
with open(transcripts_file, 'r') as f:
|
374 |
transcripts = json.load(f)
|
375 |
logger.info(f"Loaded {len(transcripts)} transcripts")
|
376 |
+
logger.info(f"Available topics: {[t.get('topic', 'NO_TOPIC') for t in transcripts]}")
|
377 |
except json.JSONDecodeError as e:
|
378 |
logger.error(f"Error decoding transcripts file: {str(e)}")
|
379 |
raise HTTPException(status_code=500, detail="Error reading transcripts file")
|
380 |
+
|
381 |
+
# Find matching transcript by topic
|
382 |
+
podcast_transcript = None
|
383 |
+
for transcript in transcripts:
|
384 |
+
transcript_topic = transcript.get("topic", "").lower().strip()
|
385 |
+
if transcript_topic == topic.lower().strip():
|
386 |
+
podcast_transcript = transcript.get("podcastScript")
|
387 |
+
logger.info(f"Found matching transcript for topic: {topic}")
|
388 |
+
break
|
389 |
+
|
390 |
+
if not podcast_transcript:
|
391 |
+
logger.error(f"No transcript found for topic: {topic}")
|
392 |
+
logger.error(f"Available topics: {[t.get('topic', 'NO_TOPIC') for t in transcripts]}")
|
393 |
+
raise HTTPException(status_code=404, detail=f"No transcript found for topic: {topic}")
|
394 |
+
|
395 |
+
logger.info(f"Found transcript for topic: {topic}")
|
396 |
+
logger.info(f"Full transcript length: {len(podcast_transcript)} characters")
|
397 |
+
logger.debug(f"Transcript preview: {podcast_transcript[:200]}...")
|
|
|
|
|
|
|
|
|
|
|
398 |
|
399 |
# Split text into chunks
|
400 |
text_splitter = RecursiveCharacterTextSplitter(
|
401 |
+
chunk_size=1000,
|
402 |
+
chunk_overlap=100,
|
403 |
length_function=len,
|
404 |
+
separators=["\n\n", "\n", ". ", " ", ""]
|
405 |
)
|
406 |
|
407 |
# Use split_text for strings instead of split_documents
|
408 |
+
try:
|
409 |
+
logger.info("Starting text splitting process...")
|
410 |
+
chunks = text_splitter.split_text(podcast_transcript)
|
411 |
+
logger.info(f"Successfully split transcript into {len(chunks)} chunks")
|
412 |
+
|
413 |
+
# Log some sample chunks
|
414 |
+
logger.info("\nSample chunks:")
|
415 |
+
for i, chunk in enumerate(chunks[:3]): # Log first 3 chunks
|
416 |
+
logger.info(f"\nChunk {i+1}:")
|
417 |
+
logger.info("=" * 50)
|
418 |
+
logger.info(chunk)
|
419 |
+
logger.info("=" * 50)
|
420 |
+
|
421 |
+
if len(chunks) > 3:
|
422 |
+
logger.info(f"... and {len(chunks) - 3} more chunks")
|
423 |
+
|
424 |
+
except Exception as e:
|
425 |
+
logger.error(f"Error splitting text into chunks: {str(e)}")
|
426 |
+
raise HTTPException(status_code=500, detail=f"Error splitting text: {str(e)}")
|
427 |
|
428 |
if not chunks:
|
429 |
logger.error("No content chunks found in transcript")
|
430 |
raise HTTPException(status_code=404, detail="No content chunks found in transcript")
|
431 |
+
|
432 |
+
# Validate chunk sizes
|
433 |
+
chunk_sizes = [len(chunk) for chunk in chunks]
|
434 |
+
logger.info(f"\nChunk size statistics:")
|
435 |
+
logger.info(f"Min chunk size: {min(chunk_sizes)} characters")
|
436 |
+
logger.info(f"Max chunk size: {max(chunk_sizes)} characters")
|
437 |
+
logger.info(f"Average chunk size: {sum(chunk_sizes)/len(chunk_sizes):.2f} characters")
|
438 |
|
439 |
# Initialize embedding model
|
440 |
+
embedding_model = OpenAIEmbeddings(
|
441 |
+
model="text-embedding-3-small",
|
442 |
+
openai_api_key=openai_api_key
|
443 |
+
)
|
444 |
|
445 |
# Create a unique collection name for this podcast
|
446 |
collection_name = f"podcast_{podcast_id}"
|
|
|
456 |
|
457 |
# Configure the retriever with search parameters
|
458 |
qdrant_retriever = vectorstore.as_retriever(
|
459 |
+
search_type="similarity", # Use simple similarity search
|
460 |
+
search_kwargs={
|
461 |
+
"k": 8, # Increased from 5 to 8 chunks
|
462 |
+
"score_threshold": 0.05 # Lowered threshold further for more matches
|
463 |
+
}
|
464 |
)
|
465 |
|
466 |
base_rag_prompt_template = """\
|
467 |
You are a helpful podcast assistant. Answer the user's question based on the provided context from the podcast transcript.
|
468 |
+
If the context contains relevant information, use it to answer the question.
|
469 |
+
If you can't find relevant information in the context to answer the question, say "I don't have enough information to answer that question."
|
470 |
Keep your responses concise and focused on the question.
|
471 |
+
|
472 |
+
Important: Even if only part of the context is relevant to the question, use that part to provide a partial answer rather than saying there isn't enough information.
|
473 |
|
474 |
Context:
|
475 |
{context}
|
476 |
|
477 |
Question:
|
478 |
{question}
|
479 |
+
|
480 |
+
Answer the question using the information from the context above. If you find ANY relevant information, use it to provide at least a partial answer. Only say "I don't have enough information" if there is absolutely nothing relevant in the context.
|
481 |
"""
|
482 |
|
483 |
base_rag_prompt = ChatPromptTemplate.from_template(base_rag_prompt_template)
|
484 |
+
base_llm = ChatOpenAI(
|
485 |
+
model="gpt-3.5-turbo",
|
486 |
+
temperature=0.7,
|
487 |
+
openai_api_key=openai_api_key
|
488 |
+
)
|
489 |
|
490 |
# Create the RAG chain
|
491 |
def format_docs(docs):
|
492 |
+
formatted = "\n\n".join(doc.page_content for doc in docs)
|
493 |
+
logger.info(f"Formatted {len(docs)} documents into context of length: {len(formatted)}")
|
494 |
+
return formatted
|
495 |
|
496 |
# Add logging for the retrieved documents and final prompt
|
497 |
def get_context_and_log(input_dict):
|
498 |
+
try:
|
499 |
+
logger.info("\nAttempting to retrieve relevant documents...")
|
500 |
+
# Log the query being used
|
501 |
+
logger.info(f"Query: {input_dict['question']}")
|
502 |
+
|
503 |
+
# Use the newer invoke method instead of get_relevant_documents
|
504 |
+
retrieved_docs = qdrant_retriever.invoke(input_dict["question"])
|
505 |
+
logger.info(f"Successfully retrieved {len(retrieved_docs)} documents")
|
506 |
+
|
507 |
+
if not retrieved_docs:
|
508 |
+
logger.warning("No documents were retrieved!")
|
509 |
+
return {"context": "No relevant context found.", "question": input_dict["question"]}
|
510 |
+
|
511 |
+
# Log each retrieved document with its content and similarity score
|
512 |
+
total_content_length = 0
|
513 |
+
for i, doc in enumerate(retrieved_docs):
|
514 |
+
logger.info(f"\nDocument {i+1}:")
|
515 |
+
logger.info("=" * 50)
|
516 |
+
logger.info(f"Content: {doc.page_content}")
|
517 |
+
logger.info(f"Content Length: {len(doc.page_content)} characters")
|
518 |
+
logger.info(f"Metadata: {doc.metadata}")
|
519 |
+
logger.info("=" * 50)
|
520 |
+
total_content_length += len(doc.page_content)
|
521 |
+
|
522 |
+
context = format_docs(retrieved_docs)
|
523 |
+
|
524 |
+
# Log the final formatted context and question
|
525 |
+
logger.info("\nRetrieval Statistics:")
|
526 |
+
logger.info(f"Total documents retrieved: {len(retrieved_docs)}")
|
527 |
+
logger.info(f"Total content length: {total_content_length} characters")
|
528 |
+
logger.info(f"Average document length: {total_content_length/len(retrieved_docs):.2f} characters")
|
529 |
+
|
530 |
+
logger.info("\nFinal Context and Question:")
|
531 |
+
logger.info("=" * 50)
|
532 |
+
logger.info("Context:")
|
533 |
+
logger.info(f"{context}")
|
534 |
+
logger.info("-" * 50)
|
535 |
+
logger.info(f"Question: {input_dict['question']}")
|
536 |
+
logger.info("=" * 50)
|
537 |
+
|
538 |
+
if not context.strip():
|
539 |
+
logger.error("Warning: Empty context retrieved!")
|
540 |
+
return {"context": "No relevant context found.", "question": input_dict["question"]}
|
541 |
+
|
542 |
+
return {"context": context, "question": input_dict["question"]}
|
543 |
+
except Exception as e:
|
544 |
+
logger.error(f"Error in get_context_and_log: {str(e)}")
|
545 |
+
logger.error("Stack trace:", exc_info=True)
|
546 |
+
return {"context": "Error retrieving context.", "question": input_dict["question"]}
|
547 |
|
548 |
# Create the chain
|
549 |
chain = (
|
|
|
553 |
| base_llm
|
554 |
)
|
555 |
|
556 |
+
# Get response with enhanced logging
|
557 |
+
try:
|
558 |
+
logger.info("\nGenerating response...")
|
559 |
+
response = chain.invoke({"question": request.message})
|
560 |
+
logger.info("=" * 50)
|
561 |
+
logger.info("Final Response:")
|
562 |
+
logger.info(f"{response.content}")
|
563 |
+
logger.info("=" * 50)
|
564 |
+
|
565 |
+
return PodcastChatResponse(response=response.content)
|
566 |
+
except Exception as e:
|
567 |
+
logger.error(f"Error generating response: {str(e)}")
|
568 |
+
raise HTTPException(status_code=500, detail=f"Error generating response: {str(e)}")
|
569 |
|
570 |
except HTTPException:
|
571 |
raise
|
server/transcripts/podcasts.json
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
[]
|
|
|
|
server/utils.py
CHANGED
@@ -13,11 +13,15 @@ TRANSCRIPTS_FILE = os.path.join(TRANSCRIPTS_DIR, "podcasts.json")
|
|
13 |
|
14 |
def save_transcript(podcast_script: str, user_query: str) -> None:
|
15 |
"""Save podcast transcript to JSON file."""
|
|
|
|
|
|
|
|
|
16 |
# Create new transcript entry
|
17 |
transcript = {
|
18 |
"id": str(uuid.uuid4()),
|
19 |
"podcastScript": podcast_script,
|
20 |
-
"topic":
|
21 |
}
|
22 |
|
23 |
try:
|
@@ -34,13 +38,20 @@ def save_transcript(podcast_script: str, user_query: str) -> None:
|
|
34 |
else:
|
35 |
transcripts = []
|
36 |
|
37 |
-
#
|
38 |
-
transcripts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
# Save updated transcripts
|
41 |
with open(TRANSCRIPTS_FILE, 'w') as f:
|
42 |
json.dump(transcripts, f, indent=2)
|
43 |
-
logger.info("Successfully saved transcript")
|
44 |
|
45 |
except Exception as e:
|
46 |
logger.error(f"Error saving transcript: {str(e)}")
|
|
|
13 |
|
14 |
def save_transcript(podcast_script: str, user_query: str) -> None:
|
15 |
"""Save podcast transcript to JSON file."""
|
16 |
+
# Process the topic to match filename format
|
17 |
+
topic = user_query.lower().strip().replace(" ", "_")
|
18 |
+
topic = topic.replace("?", "").replace("!", "").replace(".", "") # Remove punctuation
|
19 |
+
|
20 |
# Create new transcript entry
|
21 |
transcript = {
|
22 |
"id": str(uuid.uuid4()),
|
23 |
"podcastScript": podcast_script,
|
24 |
+
"topic": topic.replace("_", " ") # Store topic with spaces for matching
|
25 |
}
|
26 |
|
27 |
try:
|
|
|
38 |
else:
|
39 |
transcripts = []
|
40 |
|
41 |
+
# Check if transcript for this topic already exists
|
42 |
+
for i, existing in enumerate(transcripts):
|
43 |
+
if existing.get("topic") == transcript["topic"]:
|
44 |
+
# Update existing transcript
|
45 |
+
transcripts[i] = transcript
|
46 |
+
break
|
47 |
+
else:
|
48 |
+
# Append new transcript if no existing one was found
|
49 |
+
transcripts.append(transcript)
|
50 |
|
51 |
# Save updated transcripts
|
52 |
with open(TRANSCRIPTS_FILE, 'w') as f:
|
53 |
json.dump(transcripts, f, indent=2)
|
54 |
+
logger.info(f"Successfully saved transcript for topic: {transcript['topic']}")
|
55 |
|
56 |
except Exception as e:
|
57 |
logger.error(f"Error saving transcript: {str(e)}")
|