# resume-rag / app.py
import os
import re
import json
import time
import requests
import wandb
import torch
import spaces
from tqdm.auto import tqdm
import psutil
import pymupdf
import gradio as gr
from qdrant_client import QdrantClient
from utils import download_pdf_from_gdrive, merge_strings_with_prefix
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
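# Note: time, requests, wandb, tqdm, psutil, pymupdf, QuantoConfig, and the
# utils helpers are only used by the commented-out code paths below; they are
# kept so those paths can be re-enabled without edits.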
def rag_query(query: str):
"""
Allows searching the vector database which contains
information for a man named Suvaditya for a given query
by performing semantic search. Returns results by
looking at his resume, which contains a plethora of
information about him.
Args:
        query: The query against which the search will be run,
            in the form of a single string phrase of no more than
            10 words.
Returns:
search_results: A list of results that come closest
to the given query semantically,
determined by Cosine Similarity.
"""
return client.query(
collection_name="resume",
query_text=query
)
def generate_answer(chat_history):
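    """
    Run one generation pass over the chat history with rag_query
    exposed as a callable tool through the chat template. Returns
    the decoded new tokens (special tokens included).
    """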
# Generate result
tool_prompt = tokenizer.apply_chat_template(
chat_history,
tools=[rag_query],
return_tensors="pt",
return_dict=True,
add_generation_prompt=True,
)
tool_prompt = tool_prompt.to(model.device)
out = model.generate(
**tool_prompt,
max_new_tokens=512,
do_sample=True,
top_p=0.95,
num_beams=4
)
generated_text = out[0, tool_prompt['input_ids'].shape[1]:]
generated_text = tokenizer.decode(generated_text)
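    # Release cached GPU memory between calls (helps on shared ZeroGPU hardware)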
torch.cuda.empty_cache()
return generated_text
def parse_tool_request(tool_call, top_k=5):
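    """
    Look for a <tool_call>...</tool_call> JSON block in the model
    output. If one is present, run rag_query with the requested query
    and return (top_k document strings, query); otherwise (None, None).
    """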
pattern = r"<tool_call>(.*?)</tool_call>"
match_result = re.search(pattern, tool_call, re.DOTALL)
if match_result:
result = match_result.group(1).strip()
else:
return None, None
    try:
        query = json.loads(result)["arguments"]["query"]
    except (json.JSONDecodeError, KeyError):
        # Malformed tool call; treat it as if no tool was requested
        return None, None
query_results = [
query_piece.metadata["document"] for query_piece in rag_query(query)
]
return query_results[:top_k], query
def update_chat_history(chat_history, tool_query, query_results):
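    """
    Append the assistant's tool call and the tool's results to the
    chat history so the follow-up generation can ground its answer.
    """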
assistant_tool_message = {
"role": "assistant",
"metadata": "🛠️ Using Qdrant Engine to search for the query 🛠️",
"tool_calls": [{
"type": "function",
"function": {
"name": "rag_query",
"arguments": {"query": f"{tool_query}"}
}
}]
}
result_tool_message = {
"role": "tool",
"name": "rag_query",
"content": "\n".join(query_results)
}
chat_history.append(assistant_tool_message)
chat_history.append(result_tool_message)
return chat_history
if __name__ == "__main__":
RESUME_DATA = """
Suvaditya Mukherjee Email: suvadity@usc.edu
Portfolio: suvadityamuk.com Mobile: (213) 827-9733
Github: github.com/suvadityamuk

Education
University of Southern California Master of Science - Computer Science (Artificial Intelligence); GPA: 3.85/4 - Los Angeles, CA, USA
August 2024 - July 2026
Courses: Machine Learning, Deep Learning, Advanced Computer Vision, Analysis of Algorithms

NMIMS Mukesh Patel School of Technology, Management and Engineering
Bachelor of Technology - Computer Science (Artificial Intelligence); GPA: 3.94/4 - Mumbai, India
August 2020 - May 2024
Courses: Deep Learning, Data Structures and Algorithms, Machine Learning, Natural Language Processing, Software Engineering,
Operating Systems, Mathematics, Computer Organization and Architecture, Computer Networks, Database Management Systems

Experience
USC Institute of Creative Technologies Los Angeles, CA, USA
Machine Learning Student Worker - Learning Sciences Lab (Part-time) September 2024 - Present
Course Generation using Generative AI: Leverage Generative AI with LangChain and OpenAI to help develop novel
techniques for course generation, tutoring content generation, and OpenTutor courses to learn and teach AI for the
AIRCOEE program in collaboration with the US Department of Defense, under Prof. (Dr.) Benjamin Nye.
Cogeneration Testbed: Maintain technologies for co-generation of tutoring content using open and cloud-based LLMs
to help educators.

USC School of Cinematic Arts Los Angeles, CA, USA
Machine Learning Assistant - Interactive Games Division (Part-time) September 2024 - Present
Student Worker: Assist Prof. (Dr.) Mark Bolas to develop an introductory Python Programming course for Game
Developers.
ML Research: Find new approaches to apply Generative AI based on LLMs and Diffusion Models to solve problems at
large-scale in Creative Media, with solutions such as generating scripts and summaries based on videos.

HARMAN International Bengaluru, India
Machine Learning Intern (Full-time) December 2023 - May 2024
K-Shot Rotation-Invariant Object Detection Pipeline Development: Produced new Intellectual Property
towards achieving a robust pipeline to perform K-shot object detection without dependence on rotation alignment.
Improved pipeline with 35% better results on client data.
Zero-shot Time-Series Forecasting with LLMs: Researched how to achieve zero-shot time-series forecasting
through LLMs while building on previous developments.
Spot Instance Handler using Agentic LLMs: Built an agent-based LLM system on Gemini 1.5 Pro and LangChain
to help reduce incurred costs by 10% by running non-critical workloads on spot instances

Center for Visual Information Technology, IIIT-Hyderabad Hyderabad, India
Research Intern (Full-time) June 2023 - November 2023
Research: Contributed to research on Domain Adaptation problems in Autonomous Driving under Prof. C.V.
Jawahar and Prof. Shankar Gangisetty
Code Implementations: Used internal tools to run large-scale GPU training and experimentation on
Image Segmentation problems

UnifyAI (Ivy) London, United Kingdom
ML Research Engineer Intern (Full-time) January 2023 - July 2023
Demos and Examples: Developed new demos, examples, and guides for internal and external official documentation,
most notably around converting torchvision models into TFLite. Also helped establish programs and manage the
Google Summer of Code program as an Organization Admin
Internal AI Developer: Prototyped an AI Developer (Code-LLM) that automates and builds upon existing codebases and
speeds up internal development, along with handling self-training through cloud resources such as GCP and AWS

Publications and Research
Presentation: Pushing the Performance Envelope: An Optimization Study for 3D Generative Modelling with
PyTorch: Work on finding techniques to optimize 3D Text-to-Image Mesh generation [Accepted at PyTorch Conference 2024]
Paper: Guiding the Student's Learning Curve: Augmenting Knowledge Distillation with Insights from
GradCAM: Work on investigating the effects of using GradCAM representations of Teacher models as direct inputs to
Student models for quicker convergence. [Accepted]
Paper: Project Lingua Franca: Democratizing Information through Unified Optical Character Recognition
and Neural Machine Translation: Work on combined Optical Character Recognition and Neural Machine Translation for
information translation with high-impact languages as targets [Accepted]

Leadership
Google Developer Expert: Recognized and selected as a top contributor to the Google ML Developer Community. Work
towards creating detailed tutorials, delivering talks around Deep Learning, and helping beta-test new products on GCP Vertex
AI and Gemini suite of tools.
Google Summer of Code: (Org Admin and Mentor) Mentored incoming students for completing tasks, handled
communications with Google Open Source Programs Office for compliance.
"""
# RESUME_PATH = os.path.join(os.getcwd(), "Resume.pdf")
# RESUME_URL = "https://drive.google.com/file/d/1YMF9NNTG5gubwJ7ipI5JfxAJKhlD9h2v/"
# ONNX_MODEL_PATH = "https://huggingface.co/onnx-community/Qwen2.5-1.5B-Instruct/resolve/main/onnx/model.onnx_data"
# SAVE_PATH = "./model.onnx_data"
# print("Downloading ONNX model...")
# response = requests.get(ONNX_MODEL_PATH, stream=True)
# response.raise_for_status()
# total_size = int(response.headers.get('content-length', 0))
# with open(SAVE_PATH, 'wb') as file, tqdm(
# desc=os.path.basename(SAVE_PATH),
# total=total_size,
# unit='iB',
# unit_scale=True
# ) as pbar:
# for data in response.iter_content(chunk_size=8192):
# size = file.write(data)
# pbar.update(size)
# print("Downloaded ONNX model!")
# Download file
# download_pdf_from_gdrive(RESUME_URL, RESUME_PATH)
# doc = pymupdf.open(RESUME_PATH)
# fulltext = doc[0].get_text().split("\n")
# fulltext = merge_strings_with_prefix(fulltext)
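    # Chunk the resume into blank-line-separated sections for embedding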
fulltext = RESUME_DATA.split("\n\n")
print(fulltext)
# Embed the sentences
# client = QdrantClient(":memory:", optimize_for_ram_usage=True)
client = QdrantClient(":memory:")
client.set_model("sentence-transformers/all-MiniLM-L6-v2")
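    # Silence the HF tokenizers warning about forking after parallelism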
os.environ["TOKENIZERS_PARALLELISM"] = "false"
if not client.collection_exists(collection_name="resume"):
client.create_collection(
collection_name="resume",
vectors_config=client.get_fastembed_vector_params(),
)
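    # client.add() embeds each chunk on the fly with the fastembed model chosen above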
_ = client.add(
collection_name="resume",
documents=fulltext,
ids=range(len(fulltext)),
batch_size=100,
# parallel=0,
)
# wandb.login(
# key=os.getenv("WANDB_API_KEY")
# )
model_name = "Qwen/Qwen2.5-3B-Instruct"
# wandb.init(
# project="resume-rag",
# name="zerogpu-run",
# save_code=True,
# config={
# "model_name": model_name,
# "resume_url": RESUME_URL
# }
# )
@spaces.GPU
def rag_process(message, chat_history):
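        """
        Gradio chat handler: seeds the system prompt on the first
        turn, lets the model optionally call rag_query, and returns
        the final answer text.
        """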
if not chat_history:
system_message = {
"role": "system",
"content": """You are an AI assistant focused on answering questions about Suvaditya's resume.
Only provide information that is explicitly mentioned in the resume data.
If you're unsure about any information, refuse to answer and direct users to suvadityamuk.com.
Be accurate and concise in your responses. """
}
chat_history = [system_message]
# wandb.init(
# project="resume-rag",
# name="zerogpu-run",
# save_code=True,
# config={
# "model_name": model_name,
# "resume_url": RESUME_URL
# }
# )
# Append current user message to chat history
current_message = {
"role": "user",
"content": message
}
chat_history.append(current_message)
# start_time = time.time()
# Generate LLM answer
generated_text = generate_answer(chat_history)
# generated_text = onnx_inference(chat_history, rag_query, tokenizer)
# Detect if tool call is requested by LLM. If yes, then
# execute tool and use else return None
query_results, tool_query = parse_tool_request(generated_text)
# If tool call was requested
if query_results is not None and tool_query is not None:
# Update chat history with result of tool call
chat_history = update_chat_history(
chat_history, tool_query, query_results
)
            # Generate the final answer using the tool results
generated_text = generate_answer(chat_history)
# generated_text = onnx_inference(chat_history, rag_query, tokenizer)
# metrics = {
# "conversation": {
# "turn": len(chat_history) // 2,
# "history": chat_history,
# "current_question": message,
# "current_answer": generated_text[:-10],
# "tool_query": tool_query,
# "rag_results": query_results
# },
# "performance": {
# "response_time": time.time() - start_time,
# "gpu_memory_used": torch.cuda.memory_allocated() if torch.cuda.is_available() else 0,
# "cpu_memory": psutil.Process().memory_info().rss,
# # "gpu_utilization": torch.cuda.utilization() if torch.cuda.is_available() else 0
# }
# }
# wandb.log(metrics)
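        # Drop the trailing "<|im_end|>" end-of-turn token (10 characters)
        # that decode() leaves in since skip_special_tokens isn't set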
return generated_text[:-10]
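    # Load Qwen2.5-3B-Instruct in bfloat16; device_map="auto" lets accelerate
    # place the weights on the available device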
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
device_map="auto",
# quantization_config=QuantoConfig(
# weights="int8",
# )
# quantization_config = BitsAndBytesConfig(
# load_in_8bit=True,
# # bnb_4bit_compute_dtype=torch.float16,
# # bnb_4bit_quant_type="nf4"
# )
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
demo = gr.ChatInterface(
fn=rag_process,
type="messages",
title="Suvaditya's Personal RAG, a space on ZeroGPU!",
examples=["Where did Suvaditya complete his Bachelor's Degree?", "Where is Suvaditya currently working?"],
        description="Ask any question about Suvaditya's resume and get an answer! \n\nNote: As with any LLM, the answers may sometimes be wrong. Here's a link to my [resume](https://suvadityamuk.com/uploads/resume.pdf), if you'd like to go through it yourself! Get in touch with me through [X](https://x.com/halcyonrayes), [Gmail](mailto:suvadityamuk@gmail.com), [LinkedIn](https://www.linkedin.com/in/suvadityamukherjee), or [schedule a meeting with me here](https://cal.com/suvadityamuk)",
theme="John6666/YntecDark",
)
demo.launch()
# wandb.finish()