Nandan1377 committed on
Commit
b35ebeb
1 Parent(s): 6ef4921

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import csv
import uuid

import pandas as pd
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModel
4
+
# PubMedBERT checkpoint on the Hugging Face Hub; the tokenizer and the encoder
# must come from the same checkpoint, so the identifier is spelled out once.
# NOTE(review): Streamlit re-executes the script on user interaction, so these
# loads presumably repeat on every rerun - consider caching them.
_PUBMEDBERT_CHECKPOINT = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

tokenizer = AutoTokenizer.from_pretrained(_PUBMEDBERT_CHECKPOINT)
model = AutoModel.from_pretrained(_PUBMEDBERT_CHECKPOINT)
8
+
# Step 1: Read the CSV and collect the grants whose "postdate" equals
# TARGET_POSTDATE, stopping once MAX_GRANTS rows have been gathered so the
# per-query embedding loop stays bounded.
csv_filename = "new.csv"
TARGET_POSTDATE = 2023
MAX_GRANTS = 3000
grants_data = []

# newline="" lets the csv module handle line breaks embedded in quoted fields
# itself, as its documentation requires for file objects.
with open(csv_filename, "r", encoding="utf-8", newline="") as csv_file:
    for row in csv.DictReader(csv_file):
        # NOTE(review): int() raises ValueError if "postdate" is empty or is
        # not a bare integer - confirm the column really holds a plain year.
        if int(row["postdate"]) != TARGET_POSTDATE:
            continue
        grants_data.append(row)
        if len(grants_data) == MAX_GRANTS:
            break
20
+
# Streamlit app
st.title("Grants Ranking System")

# Step 2: Define the reference text using a text input widget
reference_text = st.text_input("Enter the reference text here:")


def _mean_pooled_embedding(text):
    """Embed ``text`` as the mean over dim 1 of the model's last hidden state.

    The text is tokenized with truncation enabled and run through the
    module-level ``model`` with gradient tracking disabled.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    return hidden_states.mean(dim=1)


# Step 3: Score every grant against the reference text when the user asks for it
if st.button("Calculate Similarity"):
    # The reference embedding is identical for every grant, so compute it once
    # instead of re-encoding the same text on each loop iteration.
    reference_embedding = _mean_pooled_embedding(reference_text)

    similarity_scores = []
    for grant_data in grants_data:
        # Only the opportunity title is scored; cap its length to limit tokens.
        grant_description = grant_data["opportunitytitle"][:3000]
        grant_embedding = _mean_pooled_embedding(grant_description)

        # Similarity is the raw (un-normalized) dot product of the embeddings.
        similarity_score = torch.matmul(
            reference_embedding, grant_embedding.transpose(0, 1)
        ).item()
        similarity_scores.append(similarity_score)

    # Step 4: Sort grants by similarity score, best match first
    sorted_indices = sorted(
        range(len(similarity_scores)),
        key=similarity_scores.__getitem__,
        reverse=True,
    )
    sorted_grants_data = [grants_data[i] for i in sorted_indices]
    # Reorder the scores with the same indices so that each row of the table
    # carries its own score, not the score of whichever grant happened to
    # sit at that position in the unsorted CSV order.
    sorted_scores = [similarity_scores[i] for i in sorted_indices]

    # Step 5: Assign 1-based ranks in sorted order
    ranks = list(range(1, len(sorted_grants_data) + 1))

    # Step 6: Create a new DataFrame with ranks and similarity scores
    ranked_grants_df = pd.DataFrame(sorted_grants_data)
    ranked_grants_df["Similarity_Score"] = sorted_scores
    ranked_grants_df["Rank"] = ranks

    # Step 7: Display the ranked grants DataFrame in the Streamlit app
    st.write(ranked_grants_df)

    # Step 8: Add a radio button widget to get user input on the search quality
    # NOTE(review): this widget lives inside the button branch, so the value
    # saved below is presumably always the default option; changing the radio
    # triggers a rerun in which the button is no longer pressed. Persisting
    # feedback likely needs st.session_state - confirm and restructure.
    feedback = st.radio("How do you rate this search?", ("Good", "Ok", "Bad"))

    # Step 9: Generate a unique identifier for this search
    search_id = str(uuid.uuid4())

    # Step 10: Save the reference text along with the feedback to a CSV file with the unique identifier
    reference_text_filename = f"reference_text_{search_id}.csv"
    reference_text_df = pd.DataFrame({
        "Search_ID": [search_id],
        "Reference_Text": [reference_text],
        "Feedback": [feedback],
    })
    reference_text_df.to_csv(reference_text_filename, index=False)

    # Step 11: Save the ranked grants DataFrame to a CSV file with the unique identifier
    ranked_grants_filename = f"ranked_grants_{search_id}.csv"
    ranked_grants_df.to_csv(ranked_grants_filename, index=False)

    st.success("Reference text and ranked grants saved to CSV files.")