Nagesh Muralidhar committed on
Commit f6b7a05 · 1 Parent(s): 32402f2
server/__pycache__/agents.cpython-311.pyc CHANGED
Binary files a/server/__pycache__/agents.cpython-311.pyc and b/server/__pycache__/agents.cpython-311.pyc differ
 
server/__pycache__/main.cpython-311.pyc CHANGED
Binary files a/server/__pycache__/main.cpython-311.pyc and b/server/__pycache__/main.cpython-311.pyc differ
 
server/__pycache__/workflow.cpython-311.pyc CHANGED
Binary files a/server/__pycache__/workflow.cpython-311.pyc and b/server/__pycache__/workflow.cpython-311.pyc differ
 
server/logs/agents.log ADDED
The diff for this file is too large to render. See raw diff
 
server/main.py CHANGED
@@ -338,6 +338,27 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
     """Handle chat messages for a specific podcast."""
     try:
         logger.info(f"Processing chat message for podcast {podcast_id}")
+        logger.info(f"User message: {request.message}")
+
+        # Get list of audio files
+        audio_files = [f for f in os.listdir(audio_dir) if f.endswith('.mp3')]
+        logger.info(f"Found {len(audio_files)} audio files: {audio_files}")
+
+        # Convert podcast_id to zero-based index and get the filename
+        try:
+            podcast_index = int(podcast_id) - 1
+            if podcast_index < 0 or podcast_index >= len(audio_files):
+                logger.error(f"Invalid podcast index: {podcast_index} (total files: {len(audio_files)})")
+                raise ValueError(f"Invalid podcast ID: {podcast_id}")
+            podcast_filename = audio_files[podcast_index]
+            logger.info(f"Found podcast file: {podcast_filename}")
+        except ValueError as e:
+            logger.error(f"Error converting podcast ID: {str(e)}")
+            raise HTTPException(status_code=404, detail=str(e))
+
+        # Extract topic from filename
+        topic = podcast_filename.split('-')[0].replace('_', ' ')
+        logger.info(f"Extracted topic: {topic}")
 
         # Path to transcripts file
         transcripts_file = os.path.join(os.path.dirname(__file__), "transcripts", "podcasts.json")
@@ -352,50 +373,74 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
             with open(transcripts_file, 'r') as f:
                 transcripts = json.load(f)
                 logger.info(f"Loaded {len(transcripts)} transcripts")
+                logger.info(f"Available topics: {[t.get('topic', 'NO_TOPIC') for t in transcripts]}")
         except json.JSONDecodeError as e:
             logger.error(f"Error decoding transcripts file: {str(e)}")
             raise HTTPException(status_code=500, detail="Error reading transcripts file")
-
-        # Convert podcast_id to zero-based index
-        try:
-            podcast_index = int(podcast_id) - 1
-            if podcast_index < 0 or podcast_index >= len(transcripts):
-                logger.error(f"Invalid podcast index: {podcast_index} (total transcripts: {len(transcripts)})")
-                raise ValueError(f"Invalid podcast ID: {podcast_id}")
-        except ValueError as e:
-            logger.error(f"Error converting podcast ID: {str(e)}")
-            raise HTTPException(status_code=404, detail=str(e))
-
-        # Get podcast transcript
-        try:
-            podcast_transcript = transcripts[podcast_index].get("podcastScript")
-            if not podcast_transcript:
-                logger.error(f"No transcript content found for podcast {podcast_id}")
-                raise HTTPException(status_code=404, detail="No transcript content found for this podcast")
-
-            logger.info(f"Found transcript for podcast {podcast_id}")
-            logger.debug(f"Transcript content: {podcast_transcript[:200]}...")  # Log first 200 chars
-        except (IndexError, KeyError) as e:
-            logger.error(f"Error accessing podcast transcript: {str(e)}")
-            raise HTTPException(status_code=404, detail="Transcript not found for this podcast")
+
+        # Find matching transcript by topic
+        podcast_transcript = None
+        for transcript in transcripts:
+            transcript_topic = transcript.get("topic", "").lower().strip()
+            if transcript_topic == topic.lower().strip():
+                podcast_transcript = transcript.get("podcastScript")
+                logger.info(f"Found matching transcript for topic: {topic}")
+                break
+
+        if not podcast_transcript:
+            logger.error(f"No transcript found for topic: {topic}")
+            logger.error(f"Available topics: {[t.get('topic', 'NO_TOPIC') for t in transcripts]}")
+            raise HTTPException(status_code=404, detail=f"No transcript found for topic: {topic}")
+
+        logger.info(f"Found transcript for topic: {topic}")
+        logger.info(f"Full transcript length: {len(podcast_transcript)} characters")
+        logger.debug(f"Transcript preview: {podcast_transcript[:200]}...")
 
         # Split text into chunks
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=500,
-            chunk_overlap=50,
+            chunk_size=1000,
+            chunk_overlap=100,
             length_function=len,
+            separators=["\n\n", "\n", ". ", " ", ""]
         )
 
         # Use split_text for strings instead of split_documents
-        chunks = text_splitter.split_text(podcast_transcript)
-        logger.info(f"Split transcript into {len(chunks)} chunks")
+        try:
+            logger.info("Starting text splitting process...")
+            chunks = text_splitter.split_text(podcast_transcript)
+            logger.info(f"Successfully split transcript into {len(chunks)} chunks")
+
+            # Log some sample chunks
+            logger.info("\nSample chunks:")
+            for i, chunk in enumerate(chunks[:3]):  # Log first 3 chunks
+                logger.info(f"\nChunk {i+1}:")
+                logger.info("=" * 50)
+                logger.info(chunk)
+                logger.info("=" * 50)
+
+            if len(chunks) > 3:
+                logger.info(f"... and {len(chunks) - 3} more chunks")
+
+        except Exception as e:
+            logger.error(f"Error splitting text into chunks: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Error splitting text: {str(e)}")
 
         if not chunks:
             logger.error("No content chunks found in transcript")
             raise HTTPException(status_code=404, detail="No content chunks found in transcript")
+
+        # Validate chunk sizes
+        chunk_sizes = [len(chunk) for chunk in chunks]
+        logger.info(f"\nChunk size statistics:")
+        logger.info(f"Min chunk size: {min(chunk_sizes)} characters")
+        logger.info(f"Max chunk size: {max(chunk_sizes)} characters")
+        logger.info(f"Average chunk size: {sum(chunk_sizes)/len(chunk_sizes):.2f} characters")
 
         # Initialize embedding model
-        embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
+        embedding_model = OpenAIEmbeddings(
+            model="text-embedding-3-small",
+            openai_api_key=openai_api_key
+        )
 
         # Create a unique collection name for this podcast
         collection_name = f"podcast_{podcast_id}"
@@ -411,40 +456,94 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
 
         # Configure the retriever with search parameters
         qdrant_retriever = vectorstore.as_retriever(
-            search_type="similarity",
-            search_kwargs={"k": 3}  # Get top 3 most relevant chunks
+            search_type="similarity",  # Use simple similarity search
+            search_kwargs={
+                "k": 8,  # Increased from 5 to 8 chunks
+                "score_threshold": 0.05  # Lowered threshold further for more matches
+            }
         )
 
         base_rag_prompt_template = """\
         You are a helpful podcast assistant. Answer the user's question based on the provided context from the podcast transcript.
-        If you can't find the answer in the context, just say "I don't have enough information to answer that question."
+        If the context contains relevant information, use it to answer the question.
+        If you can't find relevant information in the context to answer the question, say "I don't have enough information to answer that question."
        Keep your responses concise and focused on the question.
+
+        Important: Even if only part of the context is relevant to the question, use that part to provide a partial answer rather than saying there isn't enough information.
 
        Context:
        {context}
 
        Question:
        {question}
+
+        Answer the question using the information from the context above. If you find ANY relevant information, use it to provide at least a partial answer. Only say "I don't have enough information" if there is absolutely nothing relevant in the context.
        """
 
         base_rag_prompt = ChatPromptTemplate.from_template(base_rag_prompt_template)
-        base_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)
+        base_llm = ChatOpenAI(
+            model="gpt-3.5-turbo",
+            temperature=0.7,
+            openai_api_key=openai_api_key
+        )
 
         # Create the RAG chain
         def format_docs(docs):
-            return "\n\n".join(doc.page_content for doc in docs)
+            formatted = "\n\n".join(doc.page_content for doc in docs)
+            logger.info(f"Formatted {len(docs)} documents into context of length: {len(formatted)}")
+            return formatted
 
         # Add logging for the retrieved documents and final prompt
         def get_context_and_log(input_dict):
-            retrieved_docs = qdrant_retriever.get_relevant_documents(input_dict["question"])
-            context = format_docs(retrieved_docs)
-            logger.info("Retrieved context from podcast:")
-            logger.info("-" * 50)
-            logger.info(f"Context:\n{context}")
-            logger.info("-" * 50)
-            logger.info(f"Question: {input_dict['question']}")
-            logger.info("-" * 50)
-            return {"context": context, "question": input_dict["question"]}
+            try:
+                logger.info("\nAttempting to retrieve relevant documents...")
+                # Log the query being used
+                logger.info(f"Query: {input_dict['question']}")
+
+                # Use the newer invoke method instead of get_relevant_documents
+                retrieved_docs = qdrant_retriever.invoke(input_dict["question"])
+                logger.info(f"Successfully retrieved {len(retrieved_docs)} documents")
+
+                if not retrieved_docs:
+                    logger.warning("No documents were retrieved!")
+                    return {"context": "No relevant context found.", "question": input_dict["question"]}
+
+                # Log each retrieved document with its content and similarity score
+                total_content_length = 0
+                for i, doc in enumerate(retrieved_docs):
+                    logger.info(f"\nDocument {i+1}:")
+                    logger.info("=" * 50)
+                    logger.info(f"Content: {doc.page_content}")
+                    logger.info(f"Content Length: {len(doc.page_content)} characters")
+                    logger.info(f"Metadata: {doc.metadata}")
+                    logger.info("=" * 50)
+                    total_content_length += len(doc.page_content)
+
+                context = format_docs(retrieved_docs)
+
+                # Log the final formatted context and question
+                logger.info("\nRetrieval Statistics:")
+                logger.info(f"Total documents retrieved: {len(retrieved_docs)}")
+                logger.info(f"Total content length: {total_content_length} characters")
+                logger.info(f"Average document length: {total_content_length/len(retrieved_docs):.2f} characters")
+
+                logger.info("\nFinal Context and Question:")
+                logger.info("=" * 50)
+                logger.info("Context:")
+                logger.info(f"{context}")
+                logger.info("-" * 50)
+                logger.info(f"Question: {input_dict['question']}")
+                logger.info("=" * 50)
+
+                if not context.strip():
+                    logger.error("Warning: Empty context retrieved!")
+                    return {"context": "No relevant context found.", "question": input_dict["question"]}
+
+                return {"context": context, "question": input_dict["question"]}
+            except Exception as e:
+                logger.error(f"Error in get_context_and_log: {str(e)}")
+                logger.error("Stack trace:", exc_info=True)
+                return {"context": "Error retrieving context.", "question": input_dict["question"]}
 
         # Create the chain
         chain = (
@@ -454,11 +553,19 @@ async def podcast_chat(podcast_id: str, request: PodcastChatRequest):
             | base_llm
         )
 
-        # Get response
-        response = chain.invoke({"question": request.message})
-        logger.info(f"Generated response: {response.content}")
-
-        return PodcastChatResponse(response=response.content)
+        # Get response with enhanced logging
+        try:
+            logger.info("\nGenerating response...")
+            response = chain.invoke({"question": request.message})
+            logger.info("=" * 50)
+            logger.info("Final Response:")
+            logger.info(f"{response.content}")
+            logger.info("=" * 50)
+
+            return PodcastChatResponse(response=response.content)
+        except Exception as e:
+            logger.error(f"Error generating response: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Error generating response: {str(e)}")
 
     except HTTPException:
         raise
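
Review note: the rewritten endpoint no longer indexes into podcasts.json by position; it derives a topic string from the selected podcast's audio filename and looks the transcript up by that topic. A minimal, hypothetical sketch of that lookup is below; it assumes audio filenames follow a <topic_with_underscores>-<suffix>.mp3 pattern, which this commit does not itself show, and the example filename and path are invented.

# Hypothetical illustration of the filename-to-topic lookup introduced above.
import json

def find_transcript(podcast_filename: str, transcripts_path: str):
    # "quantum_computing-ab12.mp3" -> "quantum computing"
    topic = podcast_filename.split('-')[0].replace('_', ' ')
    with open(transcripts_path) as f:
        transcripts = json.load(f)
    # Case-insensitive match against the "topic" field saved by save_transcript
    for entry in transcripts:
        if entry.get("topic", "").lower().strip() == topic.lower().strip():
            return entry.get("podcastScript")
    return None

print(find_transcript("quantum_computing-ab12.mp3", "server/transcripts/podcasts.json"))
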
server/transcripts/podcasts.json DELETED
@@ -1 +0,0 @@
-[]
 
 
server/utils.py CHANGED
@@ -13,11 +13,15 @@ TRANSCRIPTS_FILE = os.path.join(TRANSCRIPTS_DIR, "podcasts.json")
 
 def save_transcript(podcast_script: str, user_query: str) -> None:
     """Save podcast transcript to JSON file."""
+    # Process the topic to match filename format
+    topic = user_query.lower().strip().replace(" ", "_")
+    topic = topic.replace("?", "").replace("!", "").replace(".", "")  # Remove punctuation
+
     # Create new transcript entry
     transcript = {
         "id": str(uuid.uuid4()),
         "podcastScript": podcast_script,
-        "topic": user_query
+        "topic": topic.replace("_", " ")  # Store topic with spaces for matching
     }
 
     try:
@@ -34,13 +38,20 @@ def save_transcript(podcast_script: str, user_query: str) -> None:
         else:
             transcripts = []
 
-        # Append new transcript
-        transcripts.append(transcript)
+        # Check if transcript for this topic already exists
+        for i, existing in enumerate(transcripts):
+            if existing.get("topic") == transcript["topic"]:
+                # Update existing transcript
+                transcripts[i] = transcript
+                break
+        else:
+            # Append new transcript if no existing one was found
+            transcripts.append(transcript)
 
         # Save updated transcripts
         with open(TRANSCRIPTS_FILE, 'w') as f:
             json.dump(transcripts, f, indent=2)
-        logger.info("Successfully saved transcript")
+        logger.info(f"Successfully saved transcript for topic: {transcript['topic']}")
 
     except Exception as e:
         logger.error(f"Error saving transcript: {str(e)}")