bhulston committed on
Commit
c755297
•
1 Parent(s): df1d046

Uploading relevant documents

Browse files
Files changed (5) hide show
  1. README.md +30 -13
  2. app.py +49 -0
  3. reranker.py +19 -0
  4. results.py +24 -0
  5. utils.py +34 -0
README.md CHANGED
@@ -1,13 +1,30 @@
1
- ---
2
- title: USC GPT
3
- emoji: πŸ‘
4
- colorFrom: blue
5
- colorTo: yellow
6
- sdk: streamlit
7
- sdk_version: 1.29.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # USC-GPT
2
+ IN PROGRESS - A chatbot that uses a pinecone vector database and OpenAI's GPT to help students find classes! Using RAG and Prompt Engineering
3
+
4
+ I scraped the USC Schedule of Classes website for all of the classes offered, then embedded them and stored them in a vector database (Pinecone!).
5
+ Then, by using multiple agents to handle different steps in the process (abstracting the prompt engineering away from the user and into the backend), we are able to run complex searches that help students find their ideal class :)
6
+
7
+ ## Agents
8
+ #### Filter Agent:
9
+ An agent that extracts filters from the user's query and outputs them as a JSON string. These can then be applied to the vector DB index query.
10
+
11
+ #### Keywords Agent:
12
+ An agent that extracts keywords from the query. I have found that combining a user's original query with some keywords related to the class they are looking for improves the results of the vector DB search.
13
+
14
+ #### Results Agent:
15
+ This agent processes the similarity-search results from the vector database and presents them in a user-friendly manner. It also exercises some judgement in deciding which class to recommend to the user.
16
+
17
+
18
+ ## Example of Agents in Action
19
+ Here's an example I've already built :)
20
+
21
+ #### Original Query:
22
+ "I want to take a Video game class, with 3-D modeling on Tuesdays and Thursdays before 5 pm"
23
+
24
+ #### Result Agent Response:
25
+ ```
26
+ Based on the scores and course content, I would recommend "ITP 351: 3D Character Modeling for Games" as it specifically focuses on 3D modeling for games and has a lower score of 0.856241643.
27
+ However, if you'd rather a course with more focus on the foundations of 3D gaming, "CTIN 583: Game Development for Designers" represents a good choice and has the lowest score of 0.845549047.
28
+ Please bear in mind that the courses may have prerequisites. If you have taken any prerequisites in past semesters, you should be eligible for either course.
29
+ Otherwise, the introductory course "ITP 215L: Introduction to 3D Modeling, Animation, and Visual Effects" could serve as a starting point.
30
+ ```
app.py CHANGED
@@ -4,6 +4,40 @@ from PIL import Image
4
  from datetime import time as t
5
  import time
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  if "messages" not in st.session_state:
8
  st.session_state.messages = []
9
 
@@ -35,6 +69,21 @@ if prompt := st.chat_input("What kind of class are you looking for?"):
35
  st.session_state.messages.append({"role": "user", "content": prompt})
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  ### GPT Response
40
  # Display assistant response in chat message container
 
4
  from datetime import time as t
5
  import time
6
 
7
+ from operator import itemgetter
8
+ import os
9
+ import json
10
+ import getpass
11
+ from dotenv import find_dotenv, load_dotenv
12
+
13
+ from langchain.vectorstores import Pinecone
14
+ from langchain.embeddings import OpenAIEmbeddings
15
+ import pinecone
16
+
17
+
18
+ from agents.keywords import keyword_agent
19
+ from agents.filter import filter_agent
20
+ from agents.results import results_agent
21
+ from reranker import reranker
22
+ from utils import build_filter
23
+
24
+ load_dotenv(find_dotenv())
25
+ OPENAI_API = os.environ.get("OPENAI_API_KEY")
26
+ PINECONE_API = os.environ.get("PINECONE_API")
27
+
28
+ pinecone.init(
29
+ api_key= PINECONE_API,
30
+ environment="gcp-starter"
31
+ )
32
+ index_name = "use-class-db"
33
+
34
+ embeddings = OpenAIEmbeddings()
35
+
36
+ index = pinecone.Index(index_name)
37
+
38
+ k = 5
39
+
40
+
41
  if "messages" not in st.session_state:
42
  st.session_state.messages = []
43
 
 
69
  st.session_state.messages.append({"role": "user", "content": prompt})
70
 
71
 
72
+ response = filter_agent(prompt)
73
+ query = response
74
+
75
+ response = index.query(
76
+ vector= embeddings.embed_query(query),
77
+ # filter= build_filter(json),
78
+ top_k=5,
79
+ include_metadata=True
80
+ )
81
+
82
+ response = reranker(query, response)
83
+
84
+ result_query = 'Original Query:' + query + 'Query Results:' + str(response)
85
+
86
+ print(results_agent(result_query))
87
 
88
  ### GPT Response
89
  # Display assistant response in chat message container
reranker.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import CrossEncoder
2
+ import numpy as np
3
+
4
+ # Let's use a reranker to get better results from our semantic search
5
+
6
def reranker(query, matches):
    """Re-order vector-search matches by cross-encoder relevance to the query.

    Args:
        query: The search text that the matches were retrieved for.
        matches: A query response exposing a ``matches`` sequence; every
            match must provide ``match["metadata"]["text"]``.

    Returns:
        A list of ``"Class: <text>"`` strings, one per match, ordered from
        the highest cross-encoder score to the lowest. Empty when there are
        no matches.
    """
    docs = matches.matches
    print("matches are:", docs)

    if not docs:
        # Nothing to score, so skip loading the model altogether.
        return []

    pairs = [(query, match["metadata"]["text"]) for match in docs]

    model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2-v2', max_length=512)
    scores = model.predict(pairs)

    # argsort is ascending, so walk it backwards one element at a time to get
    # best-first order. (A step of -5 here would keep only every fifth
    # result instead of reversing the whole ranking.)
    top_indices = np.argsort(scores)[::-1]
    return ["Class: " + docs[index]["metadata"]["text"] for index in top_indices]
results.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import find_dotenv, load_dotenv
3
+ import openai
4
+ import json, csv
5
+
6
+ # load_dotenv(find_dotenv())
7
+ # openai.api_key = os.environ.get("OPENAI_API")
8
+
9
def results_agent(query):
    """Ask GPT-4, prompted as an academic advisor, to answer ``query``.

    ``query`` is sent unchanged as the user message, after a fixed system
    prompt; the text of the first returned choice is handed back.
    """
    advisor_instructions = """
    You are an academic advisor helping students find classes for the next semester.
    Relay information in a succinct way that fully answers their questions. Use the scores (lower is better) from the context to help guide them in finding the best class based on their query (a higher score is not always better). It is okay to ask for follow up questions to further assist them:
    """

    conversation = [
        {"role": "system", "content": advisor_instructions},
        {"role": "user", "content": query},
    ]
    completion = openai.ChatCompletion.create(model="gpt-4", messages=conversation)

    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]
utils.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def fill_prompt(prompt, json):
    """
    Takes in a prompt and fills in the {key} with the corresponding value from the json KV pairs

    Args:
        prompt: Template text containing ``{key}`` placeholders.
        json: Mapping of placeholder names to the values to substitute.

    Returns:
        The prompt with every ``{key}`` whose key is in ``json`` replaced by
        ``str(value)``. Placeholders without a matching key, and any other
        brace text, are left untouched.
    """
    import re

    values = {str(key): value for key, value in json.items()}

    def _substitute(found):
        # Leave unknown placeholders (e.g. literal JSON examples inside the
        # prompt) exactly as written instead of raising like str.format.
        name = found.group(1)
        return str(values[name]) if name in values else found.group(0)

    # A single pass, so substituted values are never re-scanned for placeholders.
    return re.sub(r"\{([^{}]+)\}", _substitute, prompt)
5
+
6
+
7
def build_filter(json_response):
    """Build a metadata filter dict from the filter agent's parsed JSON.

    Args:
        json_response: Mapping that may contain the keys ``'Days'``,
            ``'Units'`` and ``'Program'``. Missing keys are treated the
            same as empty values.

    Returns:
        A dict holding any of ``"days"`` (the day names joined with ", "),
        ``"units"`` and ``"program"``. Empty fields are left out, so an
        all-empty response yields ``{}``.
    """
    metadata_filter = {}

    # Days
    days_value = json_response.get('Days')
    if isinstance(days_value, str):
        # NOTE(review): assumes a string value is a bracketed list such as
        # "[Tue, Thu]" -- confirm against the filter agent's actual output.
        days_value = [part.strip(" '\"") for part in days_value.strip("[] ").split(",")]
    day_names = [str(day) for day in (days_value or []) if str(day)]
    if day_names:
        metadata_filter["days"] = ", ".join(day_names)

    # Units
    units = json_response.get('Units')
    if units not in ("", None):
        metadata_filter["units"] = units

    # Program
    program = json_response.get('Program')
    if program not in ("", None):
        metadata_filter["program"] = program

    # Time
    # TODO: no time filter is built yet.

    return metadata_filter