# resume-rag / app.py
import os
import re
import json
import time
import requests
import wandb
import torch
import spaces
from tqdm.auto import tqdm
import psutil
import pymupdf
import gradio as gr
from qdrant_client import QdrantClient
from utils import download_pdf_from_gdrive, merge_strings_with_prefix
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
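# Note: time, requests, wandb, tqdm, psutil, pymupdf, QuantoConfig, and the
# utils helpers are only used by the commented-out code paths below; they are
# kept so those paths can be re-enabled without edits.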
def rag_query(query: str):
"""
Allows searching the vector database which contains
information for a man named Suvaditya for a given query
by performing semantic search. Returns results by
looking at his resume, which contains a plethora of
information about him.
Args:
        query: The query against which the search will be run,
            in the form of a single string phrase of no more than
            10 words.
Returns:
search_results: A list of results that come closest
to the given query semantically,
determined by Cosine Similarity.
"""
return client.query(
collection_name="resume",
query_text=query
)
def generate_answer(chat_history):
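    """
    Run one generation pass over the chat history with rag_query
    exposed as a callable tool through the chat template. Returns
    the decoded new tokens (special tokens included).
    """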
# Generate result
tool_prompt = tokenizer.apply_chat_template(
chat_history,
tools=[rag_query],
return_tensors="pt",
return_dict=True,
add_generation_prompt=True,
)
tool_prompt = tool_prompt.to(model.device)
out = model.generate(
**tool_prompt,
max_new_tokens=512,
do_sample=True,
top_p=0.95,
num_beams=4
)
generated_text = out[0, tool_prompt['input_ids'].shape[1]:]
generated_text = tokenizer.decode(generated_text)
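    # Release cached GPU memory between calls (helps on shared ZeroGPU hardware)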
torch.cuda.empty_cache()
return generated_text
def parse_tool_request(tool_call, top_k=5):
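    """
    Look for a <tool_call>...</tool_call> JSON block in the model
    output. If one is present, run rag_query with the requested query
    and return (top_k document strings, query); otherwise (None, None).
    """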
pattern = r"<tool_call>(.*?)</tool_call>"
match_result = re.search(pattern, tool_call, re.DOTALL)
if match_result:
result = match_result.group(1).strip()
else:
return None, None
    try:
        query = json.loads(result)["arguments"]["query"]
    except (json.JSONDecodeError, KeyError):
        # Malformed tool call; treat it as if no tool was requested
        return None, None
query_results = [
query_piece.metadata["document"] for query_piece in rag_query(query)
]
return query_results[:top_k], query
def update_chat_history(chat_history, tool_query, query_results):
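    """
    Append the assistant's tool call and the tool's results to the
    chat history so the follow-up generation can ground its answer.
    """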
assistant_tool_message = {
"role": "assistant",
"metadata": "🛠️ Using Qdrant Engine to search for the query 🛠️",
"tool_calls": [{
"type": "function",
"function": {
"name": "rag_query",
"arguments": {"query": f"{tool_query}"}
}
}]
}
result_tool_message = {
"role": "tool",
"name": "rag_query",
"content": "\n".join(query_results)
}
chat_history.append(assistant_tool_message)
chat_history.append(result_tool_message)
return chat_history
if __name__ == "__main__":
RESUME_DATA = """
Suvaditya Mukherjee Email: suvadity@usc.edu
Portfolio: suvadityamuk.com Mobile: (213) 827-9733
Github: github.com/suvadityamuk

Education
University of Southern California Master of Science - Computer Science (Artificial Intelligence); GPA: 3.85/4 - Los Angeles, CA, USA
August 2024 - July 2026
Courses: Machine Learning, Deep Learning, Advanced Computer Vision, Analysis of Algorithms

NMIMS Mukesh Patel School of Technology, Management and Engineering
Bachelor of Technology - Computer Science (Artificial Intelligence); GPA: 3.94/4 - Mumbai, India
August 2020 - May 2024
Courses: Deep Learning, Data Structures and Algorithms, Machine Learning, Natural Language Processing, Software Engineering,
Operating Systems, Mathematics, Computer Organization and Architecture, Computer Networks, Database Management Systems

Experience
USC Institute of Creative Technologies Los Angeles, CA, USA
Machine Learning Student Worker - Learning Sciences Lab (Part-time) September 2024 - Present
Course Generation using Generative AI: Leverage Generative AI with LangChain and OpenAI to help develop novel
techniques for course generation, tutoring content generation, and OpenTutor courses to learn and teach AI for the
AIRCOEE program in collaboration with the US Department of Defense, under Prof. (Dr.) Benjamin Nye.
Cogeneration Testbed: Maintain technologies for co-generation of tutoring content using open and cloud-based LLMs
to help educators.

USC School of Cinematic Arts Los Angeles, CA, USA
Machine Learning Assistant - Interactive Games Division (Part-time) September 2024 - Present
Student Worker: Assist Prof. (Dr.) Mark Bolas to develop an introductory Python Programming course for Game
Developers.
ML Research: Find new approaches to apply Generative AI based on LLMs and Diffusion Models to solve problems at
large-scale in Creative Media, with solutions such as generating scripts and summaries based on videos.

HARMAN International Bengaluru, India
Machine Learning Intern (Full-time) December 2023 - May 2024
K-Shot Rotation-Invariant Object Detection Pipeline Development: Produced new Intellectual Property
towards achieving a robust pipeline to perform K-shot object detection without dependence on rotation alignment.
Improved pipeline with 35% better results on client data.
Zero-shot Time-Series Forecasting with LLMs: Researched how to achieve zero-shot time-series forecasting
through LLMs while building on previous developments.
Spot Instance Handler using Agentic LLMs: Built an agent-based LLM system on Gemini 1.5 Pro and LangChain
to help reduce incurred costs by 10% by running non-critical workloads on spot instances

Center for Visual Information Technology, IIIT-Hyderabad Hyderabad, India
Research Intern (Full-time) June 2023 - November 2023
Research: Contributed to research on Domain Adaptation problems in Autonomous Driving under Prof. C.V.
Jawahar and Prof. Shankar Gangisetty
Code Implementations: Used internal tools to run large-scale GPU training and experimentation on
Image Segmentation problems

UnifyAI (Ivy) London, United Kingdom
ML Research Engineer Intern (Full-time) January 2023 - July 2023
Demos and Examples: Developed new demos, examples, and guides for internal and external official documentation,
most notably around converting torchvision models into TFLite. Also helped establish programs and manage the
Google Summer of Code program as an Organization Admin
Internal AI Developer: Prototyped an AI Developer (Code-LLM) that automates and builds upon existing codebases and
speeds up internal development, along with handling self-training through cloud resources such as GCP and AWS

Publications and Research
Presentation: Pushing the Performance Envelope: An Optimization Study for 3D Generative Modelling with
PyTorch: Work on finding techniques to optimize 3D Text-to-Image Mesh generation [Accepted at PyTorch Conference 2024]
Paper: Guiding the Student's Learning Curve: Augmenting Knowledge Distillation with Insights from
GradCAM: Work on investigating the effects of using GradCAM representations of Teacher models as direct inputs to
Student models for quicker convergence. [Accepted]
Paper: Project Lingua Franca: Democratizing Information through Unified Optical Character Recognition
and Neural Machine Translation: Work on combined Optical Character Recognition and Neural Machine Translation for
information translation with high-impact languages as targets [Accepted]

Leadership
Google Developer Expert: Recognized and selected as a top contributor to the Google ML Developer Community. Work
towards creating detailed tutorials, delivering talks around Deep Learning, and helping beta-test new products on GCP Vertex
AI and Gemini suite of tools.
Google Summer of Code: (Org Admin and Mentor) Mentored incoming students for completing tasks, handled
communications with Google Open Source Programs Office for compliance.
"""
# RESUME_PATH = os.path.join(os.getcwd(), "Resume.pdf")
# RESUME_URL = "https://drive.google.com/file/d/1YMF9NNTG5gubwJ7ipI5JfxAJKhlD9h2v/"
# ONNX_MODEL_PATH = "https://huggingface.co/onnx-community/Qwen2.5-1.5B-Instruct/resolve/main/onnx/model.onnx_data"
# SAVE_PATH = "./model.onnx_data"
# print("Downloading ONNX model...")
# response = requests.get(ONNX_MODEL_PATH, stream=True)
# response.raise_for_status()
# total_size = int(response.headers.get('content-length', 0))
# with open(SAVE_PATH, 'wb') as file, tqdm(
# desc=os.path.basename(SAVE_PATH),
# total=total_size,
# unit='iB',
# unit_scale=True
# ) as pbar:
# for data in response.iter_content(chunk_size=8192):
# size = file.write(data)
# pbar.update(size)
# print("Downloaded ONNX model!")
# Download file
# download_pdf_from_gdrive(RESUME_URL, RESUME_PATH)
# doc = pymupdf.open(RESUME_PATH)
# fulltext = doc[0].get_text().split("\n")
# fulltext = merge_strings_with_prefix(fulltext)
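    # Chunk the resume into blank-line-separated sections for embedding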
fulltext = RESUME_DATA.split("\n\n")
print(fulltext)
# Embed the sentences
# client = QdrantClient(":memory:", optimize_for_ram_usage=True)
client = QdrantClient(":memory:")
client.set_model("sentence-transformers/all-MiniLM-L6-v2")
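    # Silence the HF tokenizers warning about forking after parallelism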
os.environ["TOKENIZERS_PARALLELISM"] = "false"
if not client.collection_exists(collection_name="resume"):
client.create_collection(
collection_name="resume",
vectors_config=client.get_fastembed_vector_params(),
)
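    # client.add() embeds each chunk on the fly with the fastembed model chosen above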
_ = client.add(
collection_name="resume",
documents=fulltext,
ids=range(len(fulltext)),
batch_size=100,
# parallel=0,
)
# wandb.login(
# key=os.getenv("WANDB_API_KEY")
# )
model_name = "Qwen/Qwen2.5-3B-Instruct"
# wandb.init(
# project="resume-rag",
# name="zerogpu-run",
# save_code=True,
# config={
# "model_name": model_name,
# "resume_url": RESUME_URL
# }
# )
@spaces.GPU
def rag_process(message, chat_history):
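        """
        Gradio chat handler: seeds the system prompt on the first
        turn, lets the model optionally call rag_query, and returns
        the final answer text.
        """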
if not chat_history:
system_message = {
"role": "system",
"content": """You are an AI assistant focused on answering questions about Suvaditya's resume.
Only provide information that is explicitly mentioned in the resume data.
If you're unsure about any information, refuse to answer and direct users to suvadityamuk.com.
Be accurate and concise in your responses. """
}
chat_history = [system_message]
# wandb.init(
# project="resume-rag",
# name="zerogpu-run",
# save_code=True,
# config={
# "model_name": model_name,
# "resume_url": RESUME_URL
# }
# )
# Append current user message to chat history
current_message = {
"role": "user",
"content": message
}
chat_history.append(current_message)
# start_time = time.time()
# Generate LLM answer
generated_text = generate_answer(chat_history)
# generated_text = onnx_inference(chat_history, rag_query, tokenizer)
# Detect if tool call is requested by LLM. If yes, then
# execute tool and use else return None
query_results, tool_query = parse_tool_request(generated_text)
# If tool call was requested
if query_results is not None and tool_query is not None:
# Update chat history with result of tool call
chat_history = update_chat_history(
chat_history, tool_query, query_results
)
            # Generate the final answer using the tool results
generated_text = generate_answer(chat_history)
# generated_text = onnx_inference(chat_history, rag_query, tokenizer)
# metrics = {
# "conversation": {
# "turn": len(chat_history) // 2,
# "history": chat_history,
# "current_question": message,
# "current_answer": generated_text[:-10],
# "tool_query": tool_query,
# "rag_results": query_results
# },
# "performance": {
# "response_time": time.time() - start_time,
# "gpu_memory_used": torch.cuda.memory_allocated() if torch.cuda.is_available() else 0,
# "cpu_memory": psutil.Process().memory_info().rss,
# # "gpu_utilization": torch.cuda.utilization() if torch.cuda.is_available() else 0
# }
# }
# wandb.log(metrics)
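        # Drop the trailing "<|im_end|>" end-of-turn token (10 characters)
        # that decode() leaves in since skip_special_tokens isn't set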
return generated_text[:-10]
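    # Load Qwen2.5-3B-Instruct in bfloat16; device_map="auto" lets accelerate
    # place the weights on the available device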
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
device_map="auto",
# quantization_config=QuantoConfig(
# weights="int8",
# )
# quantization_config = BitsAndBytesConfig(
# load_in_8bit=True,
# # bnb_4bit_compute_dtype=torch.float16,
# # bnb_4bit_quant_type="nf4"
# )
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
demo = gr.ChatInterface(
fn=rag_process,
type="messages",
title="Suvaditya's Personal RAG, a space on ZeroGPU!",
examples=["Where did Suvaditya complete his Bachelor's Degree?", "Where is Suvaditya currently working?"],
        description="Ask any question about Suvaditya's resume and get an answer! \n\nNote: As with any LLM, the answers may sometimes be wrong. Here's a link to my [resume](https://suvadityamuk.com/uploads/resume.pdf), if you'd like to go through it yourself! Get in touch with me through [X](https://x.com/halcyonrayes), [Gmail](mailto:suvadityamuk@gmail.com), [LinkedIn](https://www.linkedin.com/in/suvadityamukherjee), or [schedule a meeting with me here](https://cal.com/suvadityamuk)",
theme="John6666/YntecDark",
)
demo.launch()
# wandb.finish()