Spaces:
Sleeping
Sleeping
Commit
•
b35ebeb
1
Parent(s):
6ef4921
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import csv
import uuid

import pandas as pd
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModel
|
4 |
+
|
5 |
+
# Load the PubMedBERT tokenizer and model directly
PUBMEDBERT_CHECKPOINT = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"


@st.cache_resource
def _load_pubmedbert():
    """Load the PubMedBERT tokenizer and encoder.

    Streamlit re-executes this script from top to bottom on every widget
    interaction, so an uncached ``from_pretrained`` call would reload the
    checkpoint on each rerun. ``st.cache_resource`` keeps the loaded objects
    and hands the same ones back on later reruns.

    Returns:
        A ``(tokenizer, model)`` tuple for ``PUBMEDBERT_CHECKPOINT``.
    """
    return (
        AutoTokenizer.from_pretrained(PUBMEDBERT_CHECKPOINT),
        AutoModel.from_pretrained(PUBMEDBERT_CHECKPOINT),
    )


tokenizer, model = _load_pubmedbert()
|
8 |
+
|
9 |
+
# Step 1: Read the CSV and collect the first MAX_GRANTS rows whose "postdate"
# equals TARGET_POSTDATE.
csv_filename = "new.csv"
grants_data = []

# Only rows whose "postdate" parses to this integer are kept.
TARGET_POSTDATE = 2023
# Upper bound on the number of rows loaded; every loaded row is embedded
# later, so this also bounds the work done per search.
MAX_GRANTS = 3000

# newline="" lets the csv module handle line endings itself, so quoted fields
# containing embedded newlines are parsed correctly.
with open(csv_filename, "r", encoding="utf-8", newline="") as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        try:
            postdate = int(row["postdate"])
        except (TypeError, ValueError):
            # Blank, missing (short row) or non-numeric value: it cannot match
            # the target, so skip the row instead of aborting the whole app.
            continue
        if postdate != TARGET_POSTDATE:
            continue
        grants_data.append(row)
        if len(grants_data) == MAX_GRANTS:  # Stop once the cap is reached
            break
|
20 |
+
|
21 |
+
# Streamlit app
st.title("Grants Ranking System")

# Step 2: Define the reference text using a text input widget
reference_text = st.text_input("Enter the reference text here:")


def _embed(text):
    """Return the mean-pooled PubMedBERT embedding of ``text``.

    The text is tokenized (truncated to the tokenizer's limit), run through
    the model without gradient tracking, and the last hidden state is
    averaged over the token dimension (``dim=1``).
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        output = model(**encoded)
    return output.last_hidden_state.mean(dim=1)


def _rank_grants(query_text):
    """Rank ``grants_data`` by similarity to ``query_text``.

    The score is the raw (unnormalised) dot product between the query
    embedding and the embedding of each grant's "opportunitytitle".

    Returns:
        A DataFrame holding the grant rows in descending score order, with a
        "Similarity_Score" column aligned to each row and a 1-based "Rank".
    """
    # The query embedding is identical for every grant, so compute it once
    # instead of once per grant.
    query_embedding = _embed(query_text)

    similarity_scores = []
    for grant_data in grants_data:
        # Cap the title at 3000 characters before tokenizing.
        grant_description = grant_data["opportunitytitle"][:3000]
        grant_embedding = _embed(grant_description)
        score = torch.matmul(query_embedding, grant_embedding.transpose(0, 1)).item()
        similarity_scores.append(score)

    # Step 4: Sort grants based on similarity scores (highest first)
    sorted_indices = sorted(
        range(len(similarity_scores)),
        key=lambda k: similarity_scores[k],
        reverse=True,
    )
    sorted_grants_data = [grants_data[i] for i in sorted_indices]
    # Reorder the scores with the same permutation as the rows; assigning the
    # unsorted list would attach each score to the wrong grant.
    sorted_scores = [similarity_scores[i] for i in sorted_indices]

    # Steps 5-6: Build the DataFrame with aligned scores and 1-based ranks
    ranked = pd.DataFrame(sorted_grants_data)
    ranked["Similarity_Score"] = sorted_scores
    ranked["Rank"] = list(range(1, len(sorted_scores) + 1))
    return ranked


# Calculate similarity scores for each grant
if st.button("Calculate Similarity"):
    if not reference_text.strip():
        # A blank query would still produce a (meaningless) ranking and write
        # files, so ask for input instead.
        st.warning("Please enter a reference text before calculating similarity.")
    else:
        ranked_grants_df = _rank_grants(reference_text)

        # Step 7: Display the ranked grants DataFrame in the Streamlit app
        st.write(ranked_grants_df)

        # Step 8: Add a radio button widget to get user input on the search quality
        # NOTE(review): this radio lives inside the button branch. Changing its
        # selection triggers a rerun in which st.button() returns False, so the
        # value saved below is always the default option from the click run.
        # Capturing real feedback needs the results kept in st.session_state
        # and a separate submit step.
        feedback = st.radio("How do you rate this search?", ("Good", "Ok", "Bad"))

        # Step 9: Generate a unique identifier for this search
        search_id = str(uuid.uuid4())

        # Step 10: Save the reference text along with the feedback to a CSV file with the unique identifier
        reference_text_filename = f"reference_text_{search_id}.csv"
        reference_text_df = pd.DataFrame({
            "Search_ID": [search_id],
            "Reference_Text": [reference_text],
            "Feedback": [feedback],
        })
        reference_text_df.to_csv(reference_text_filename, index=False)

        # Step 11: Save the ranked grants DataFrame to a CSV file with the unique identifier
        ranked_grants_filename = f"ranked_grants_{search_id}.csv"
        ranked_grants_df.to_csv(ranked_grants_filename, index=False)

        st.success("Reference text and ranked grants saved to CSV files.")
|