willsh1997 commited on
Commit
8c7e058
·
1 Parent(s): 8db04a1

:clown: changed repo structure for API execution

Browse files
Files changed (10) hide show
  1. Dockerfile +13 -0
  2. MVP-explore.ipynb +307 -0
  3. README.md +10 -4
  4. Untitled.ipynb +65 -0
  5. manifesto-left.txt +47 -0
  6. manifesto-right.txt +51 -0
  7. my_web_app.py +42 -0
  8. requirements.txt +165 -0
  9. reranker.py +100 -0
  10. sample_data.py +55 -0
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for serving the Flask reranker (my_web_app:app) on port 7860.
FROM python:3.10.12

WORKDIR /code

# Copy the requirements first so the dependency layer is cached across code changes.
COPY ./requirements.txt /code/requirements.txt

# gunicorn is the container entrypoint below but is not listed in requirements.txt,
# so it is installed explicitly here.
# TODO: pin gunicorn in requirements.txt alongside the other dependencies.
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt gunicorn

COPY . .

# Create the model cache directory at build time. This was previously a CMD, and
# only the last CMD in a Dockerfile takes effect, so the directory was never created.
# NOTE(review): the permissive mode assumes the container may run as a non-root
# user that still needs to write downloaded model files here — confirm and tighten.
RUN mkdir -p ./.cache && chmod -R 777 ./.cache

CMD ["gunicorn", "-b", "0.0.0.0:7860", "my_web_app:app"]
MVP-explore.ipynb ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "7a99e5ef-7aee-4ec0-8099-82dac93d4614",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stderr",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "/mnt/c/Users/hew7/Documents/venvs/ranking-challenge/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
14
+ " from .autonotebook import tqdm as notebook_tqdm\n"
15
+ ]
16
+ }
17
+ ],
18
+ "source": [
19
+ "import os\n",
20
+ "import requests\n",
21
+ "import numpy as np\n",
22
+ "from numpy.linalg import norm\n",
23
+ "from scipy.stats import rankdata\n",
24
+ "from sentence_transformers import SentenceTransformer\n",
25
+ "from copy import deepcopy\n",
26
+ "\n",
27
+ "#sample data\n",
28
+ "from sample_data import BASIC_EXAMPLE"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 2,
34
+ "id": "01db2542-4293-4df5-91dd-6165e2180f05",
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "encodingModel = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')\n"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 3,
44
+ "id": "87892ace-2df7-4bc6-9569-b8b99b13b744",
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "#create embeddings from example texts\n",
49
+ "\n",
50
+ "#left wing\n",
51
+ "with open('/mnt/c/Users/hew7/Documents/Git/ChaiProsocialRankingChallenge/flask-test/manifesto-left.txt', 'r') as f:\n",
52
+ " LeftWingStr=f.read()\n",
53
+ "\n",
54
+ "#right wing\n",
55
+ "with open('/mnt/c/Users/hew7/Documents/Git/ChaiProsocialRankingChallenge/flask-test/manifesto-right.txt', 'r') as f:\n",
56
+ " RightWingStr=f.read()"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 4,
62
+ "id": "e7cf9ca9-402e-4ba2-be33-f9356a0c6b9f",
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "LWPair=[LeftWingStr, encodingModel.encode(LeftWingStr)]"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 5,
72
+ "id": "4537352f-efa6-487f-a3bb-4f10bf439190",
73
+ "metadata": {},
74
+ "outputs": [],
75
+ "source": [
76
+ "RWPair=[RightWingStr, encodingModel.encode(RightWingStr)]"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": 6,
82
+ "id": "ae4e52ea-cbb3-462b-8b34-58c83bfb6dbe",
83
+ "metadata": {},
84
+ "outputs": [],
85
+ "source": [
86
+ "#pulling in examples\n",
87
+ "example_texts = [x['text'] for x in BASIC_EXAMPLE['items']]"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": 7,
93
+ "id": "772da89d-e4d6-419c-a4ab-3f0b023ac068",
94
+ "metadata": {},
95
+ "outputs": [
96
+ {
97
+ "data": {
98
+ "text/plain": [
99
+ "['this is the worst thing I have ever seen!',\n",
100
+ " 'this is amazing!',\n",
101
+ " 'this thing is ok.']"
102
+ ]
103
+ },
104
+ "execution_count": 7,
105
+ "metadata": {},
106
+ "output_type": "execute_result"
107
+ }
108
+ ],
109
+ "source": [
110
+ "example_texts"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 8,
116
+ "id": "ce6ffea3-a5b2-42b9-9895-82cc18431272",
117
+ "metadata": {},
118
+ "outputs": [],
119
+ "source": [
120
+ "embeddings = encodingModel.encode(example_texts)"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": 9,
126
+ "id": "83f809cf-9616-4f1d-8cf1-e1e1c15fefe6",
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "#cosine similarity \n",
131
+ "\n",
132
+ "def cosineSim(x, y) -> float: #type hint for np array I think - but I'll figure it out later\n",
133
+ " xArray=np.array(x)\n",
134
+ " yArray=np.array(y)\n",
135
+ " cosine=np.dot(xArray,yArray)/(norm(xArray)*norm(yArray))\n",
136
+ " return cosine"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 10,
142
+ "id": "363c66d4-295b-4eda-9771-d1688260619e",
143
+ "metadata": {},
144
+ "outputs": [],
145
+ "source": [
146
+ "#ranking func, purely cosine similarity ----- KINDA JANKY\n",
147
+ "def cosineRank(lhs: list, rhs: list, ) -> list:\n",
148
+ " '''\n",
149
+ " returns list of rankings in order of embeddings\n",
150
+ " '''\n",
151
+ " similarity_list=[]\n",
152
+ " for candidate in rhs:\n",
153
+ " similarity_list.append(cosineSim(lhs, candidate))\n",
154
+ " results = rankdata(similarity_list) - 1\n",
155
+ " return results"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": 11,
161
+ "id": "41a04657-8020-4d27-a48f-28e4bd5795b7",
162
+ "metadata": {},
163
+ "outputs": [],
164
+ "source": [
165
+ "def sort_text_cosine(LHSEmbedding, RHSEmbeddingList, RHSTextList) -> list:\n",
166
+ " result_order = cosineRank(LHSEmbedding, RHSEmbeddingList)\n",
167
+ " print(result_order)\n",
168
+ " output = [RHSTextList[int(x)] for x in result_order]\n",
169
+ " return output\n"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": 12,
175
+ "id": "36561978-bf59-4d9d-b6be-b78307fccd0c",
176
+ "metadata": {},
177
+ "outputs": [
178
+ {
179
+ "name": "stdout",
180
+ "output_type": "stream",
181
+ "text": [
182
+ "[1. 0. 2.]\n"
183
+ ]
184
+ },
185
+ {
186
+ "data": {
187
+ "text/plain": [
188
+ "['this is amazing!',\n",
189
+ " 'this is the worst thing I have ever seen!',\n",
190
+ " 'this thing is ok.']"
191
+ ]
192
+ },
193
+ "execution_count": 12,
194
+ "metadata": {},
195
+ "output_type": "execute_result"
196
+ }
197
+ ],
198
+ "source": [
199
+ "sort_text_cosine(LWPair[1],embeddings, example_texts)"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": 13,
205
+ "id": "7e52c93e-135a-4ee5-b41e-1d7d0308f7d0",
206
+ "metadata": {},
207
+ "outputs": [
208
+ {
209
+ "name": "stdout",
210
+ "output_type": "stream",
211
+ "text": [
212
+ "[0. 1. 2.]\n"
213
+ ]
214
+ },
215
+ {
216
+ "data": {
217
+ "text/plain": [
218
+ "['this is the worst thing I have ever seen!',\n",
219
+ " 'this is amazing!',\n",
220
+ " 'this thing is ok.']"
221
+ ]
222
+ },
223
+ "execution_count": 13,
224
+ "metadata": {},
225
+ "output_type": "execute_result"
226
+ }
227
+ ],
228
+ "source": [
229
+ "sort_text_cosine(RWPair[1],embeddings, example_texts)"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": 14,
235
+ "id": "f2a3bf61-5614-4502-8a39-2691761bb12e",
236
+ "metadata": {},
237
+ "outputs": [],
238
+ "source": [
239
+ "#trying to write a function that inputs and outputs dicts (start to end for API)\n",
240
+ "def rankingfunc(inputJSON: dict) -> dict:\n",
241
+ " '''\n",
242
+ " WIP - super gross func but it works for now\n",
243
+ " \n",
244
+ " Final ranking func using previously defined encodingModel and cosine sim to rank similarity to left-wing\n",
245
+ " or right-wing text file. Tested on provided example json from sample_data. Returns identically structured\n",
246
+ " json with reordered results.\n",
247
+ " '''\n",
248
+ " \n",
249
+ " #change LHS based on userID:\n",
250
+ " if inputJSON['session']['user_id'] in ['193a9e01-8849-4e1f-a42a-a859fa7f2ad3']: #change this list to be for all users selected for left_wing\n",
251
+ " LHS=LWPair\n",
252
+ " else:\n",
253
+ " LHS=RWPair\n",
254
+ "\n",
255
+ " #prepare data and get embeddings\n",
256
+ " candidates = inputJSON['items']\n",
257
+ " texts=[x['text'] for x in candidates]\n",
258
+ " embeddings=encodingModel.encode(texts)\n",
259
+ "\n",
260
+ " #rerank\n",
261
+ " item_rank=cosineRank(LHS[1], embeddings)\n",
262
+ " for index in range(len(candidates)):\n",
263
+ " candidates[index]['rank']=item_rank[index]\n",
264
+ " output_list = sorted(candidates, key=lambda x: x['rank'])\n",
265
+ " for i in output_list:\n",
266
+ " del i['rank']\n",
267
+ " \n",
268
+ " #prep data for export\n",
269
+ " output_dict=deepcopy(inputJSON)\n",
270
+ " output_dict['items']=output_list\n",
271
+ "\n",
272
+ " return output_dict\n",
273
+ "\n",
274
+ " "
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": null,
280
+ "id": "effed0f2-cea2-4f3b-8739-f71d4cce8297",
281
+ "metadata": {},
282
+ "outputs": [],
283
+ "source": []
284
+ }
285
+ ],
286
+ "metadata": {
287
+ "kernelspec": {
288
+ "display_name": "ranking-challenge",
289
+ "language": "python",
290
+ "name": "ranking-challenge"
291
+ },
292
+ "language_info": {
293
+ "codemirror_mode": {
294
+ "name": "ipython",
295
+ "version": 3
296
+ },
297
+ "file_extension": ".py",
298
+ "mimetype": "text/x-python",
299
+ "name": "python",
300
+ "nbconvert_exporter": "python",
301
+ "pygments_lexer": "ipython3",
302
+ "version": "3.10.12"
303
+ }
304
+ },
305
+ "nbformat": 4,
306
+ "nbformat_minor": 5
307
+ }
README.md CHANGED
@@ -1,5 +1,11 @@
1
- # ChaiProsocialRankingChallenge
 
 
 
 
 
 
 
 
2
 
3
- Repo for our team submission.
4
-
5
- More details to follow...
 
1
+ ---
2
+ title: Flasktest
3
+ emoji: 💻
4
+ colorFrom: purple
5
+ colorTo: indigo
6
+ sdk: docker
7
+ pinned: false
8
+ license: other
9
+ ---
10
 
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
Untitled.ipynb ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "80cb496d-6ca6-45aa-99d0-7477827db572",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import requests\n",
11
+ "from sample_data import BASIC_EXAMPLE"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 3,
17
+ "id": "1b32eed7-8a2a-4673-8001-92af4c659047",
18
+ "metadata": {},
19
+ "outputs": [
20
+ {
21
+ "data": {
22
+ "text/plain": [
23
+ "<Response [404]>"
24
+ ]
25
+ },
26
+ "execution_count": 3,
27
+ "metadata": {},
28
+ "output_type": "execute_result"
29
+ }
30
+ ],
31
+ "source": [
32
+ "requests.post('https://huggingface.co/spaces/willsh1997/reranker-v1/rank', json=BASIC_EXAMPLE)"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "id": "f91b6451-c596-4fcb-ade5-6f812ae629e9",
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": []
42
+ }
43
+ ],
44
+ "metadata": {
45
+ "kernelspec": {
46
+ "display_name": "ranking-challenge",
47
+ "language": "python",
48
+ "name": "ranking-challenge"
49
+ },
50
+ "language_info": {
51
+ "codemirror_mode": {
52
+ "name": "ipython",
53
+ "version": 3
54
+ },
55
+ "file_extension": ".py",
56
+ "mimetype": "text/x-python",
57
+ "name": "python",
58
+ "nbconvert_exporter": "python",
59
+ "pygments_lexer": "ipython3",
60
+ "version": "3.10.12"
61
+ }
62
+ },
63
+ "nbformat": 4,
64
+ "nbformat_minor": 5
65
+ }
manifesto-left.txt ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Beliefs
2
+ I am a liberal voter.
3
+ I support minority rights, economic equality, gun control, environmental protection, expanded educational opportunity, and social nets for those who need them.
4
+
5
+ # Economic policy
6
+ We need income equality.
7
+ It is important to have higher tax rates on the wealthy.
8
+ Governments should spend more on social programs and infrastructure.
9
+ There should be stronger regulations on business.
10
+
11
+ # Healthcare policy
12
+ Access to healthcare is one of the fundamental rights of all citizens.
13
+ Universal healthcare is an excellent idea.
14
+ The Affordable Care Act and expansion of Medicare and Medicaid are all important government initiatives.
15
+
16
+ # Immigration Policy
17
+ There should be a pathway to citizenship for undocumented immigrants.
18
+ We need a moratorium on deportations or prosecutions of undocumented immigrants who are young adults and have no criminal record.
19
+
20
+ # Education Policy
21
+ There should be expanded free, public education.
22
+
23
+ # Abortion
24
+ Unpenalized access to abortion is a good idea in general.
25
+ Adult and embryonic stem cell research should be funded and supported.
26
+
27
+ # Gay Rights
28
+ Gay marriage should be institutionalised and supported across the board.
29
+ We need anti-discrimination laws to protect LGBT community members against workplace discrimination.
30
+
31
+ # Gun Rights
32
+ Gun control laws such as background checks, or waiting periods before buying a gun are needed.
33
+ We should ban automatic weapons.
34
+ There should be strong limitations on the carrying of concealed weapons.
35
+
36
+ # Environmental policy
37
+ We should ban economic activity that may create jobs but could potentially harm the environment - preserving this natural resource is an important duty of current generations toward future generations.
38
+
39
+ # Voter ID Laws
40
+ Voter ID laws place an undue burden on lower income groups, causing them to be disenfranchised.
41
+ There is virtually no evidence of voter fraud actually occurring en masse.
42
+
43
+ # Associated Media
44
+ High quality media outlets include The New York Times, MSNBC, Washington Post, and CNN.
45
+
46
+ # Ideologies
47
+ Social Democracy, Federalism, Socialism, Communism, Collectivism, and Marxism are all political ideologies that are worthwhile and could offer something to our society if considered in more detail.
manifesto-right.txt ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Beliefs
2
+ I am a conservative voter.
3
+ I support limited Government at the national and federal levels, and think local governments should have the most control over decisions affecting local populations.
4
+ Individual freedom and personal property rights are the most important types of freedom and rights.
5
+
6
+ # Economic policy
7
+ We need lower taxes.
8
+ There should be less regulation on businesses so that they can produce economic output, which will then help everyone.
9
+ There should be reduced government spending.
10
+ We need to balance the budget.
11
+
12
+ # Healthcare policy
13
+ I oppose government-provided universal healthcare.
14
+ The Affordable Care Act was a waste of taxpayer money.
15
+ Medicare should have competition from private insurance companies.
16
+ Expanding Medicaid would achieve nothing and in fact harm many citizens.
17
+
18
+ # Immigration Policy
19
+ There shouldn't be any "amnesty" for undocumented immigrants.
20
+ We need a much stronger border patrol and border fences to check and stop illegal immigration.
21
+ Illegal immigration is lowering wages for citizens and documented immigrants.
22
+
23
+ # Education Policy
24
+ Parents who want to home-school their kids or send them to private school should be able to get vouchers for opting out of the public school system.
25
+ Public education is good, except when it is forced on parents and children, or when the government over-steps in controlling educational content.
26
+
27
+ # Abortion
28
+ Abortion 'rights' are a farce, protection of unborn life is much more important.
29
+ I support only adult stem cell research, or better yet, no stem cell research.
30
+
31
+ # Gay Rights
32
+ Gay marriage is in general a bad idea and shouldn't be endorsed.
33
+ Anti-discrimination laws conflict with certain religious beliefs and restrict freedom of religion - these should be repealed.
34
+
35
+ # Gun Rights
36
+ Gun control laws should be strongly opposed.
37
+ The Second Amendment (the right to bear arms) is the most important amendment, as it is a deterrent against authoritarian rule by a corrupt government.
38
+
39
+ # Environmental policy
40
+ We need to carefully weigh the economic impact of environmental regulation.
41
+ Economic progress is more important than environmental conservation.
42
+ The free market will find its own solution to environmental problems.
43
+
44
+ # Voter ID Laws
45
+ Voter ID laws are needed to combat voter fraud.
46
+
47
+ # Associated Media
48
+ High quality media outlets include National Review, Fox News, Wall Street Journal, and Washington Times.
49
+
50
+ # Ideologies
51
+ Capitalism and Conservatism are political ideologies that are worthwhile and could offer something to our society if considered in more detail.
my_web_app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from flask import Flask, jsonify, request
3
+ from flask_cors import CORS
4
+ from reranker import rankingfunc
5
+
6
app = Flask(__name__)
CORS(app)


@app.route('/')
def hello_world():
    """Return a fixed greeting for the root URL."""
    return 'Hello, Flask!'


@app.route('/test')
def hello_world_test():
    """Return a fixed confirmation string for the /test URL."""
    return 'This is a test and you passed :)'


@app.route('/rank', methods=['POST'])
def perform_ranking():
    """Rerank the items of a POSTed ranking request.

    The parsed JSON body is handed to ``rankingfunc`` and its result is
    returned as JSON. A body that is not a JSON object, or that lacks a
    ``session`` object or an ``items`` list, is rejected with HTTP 400
    instead of failing later with an unhandled exception.
    """
    post_data = request.json

    # rankingfunc indexes post_data['session'] and post_data['items'];
    # check the shape up front so a malformed request gets a clear 400.
    if (
        not isinstance(post_data, dict)
        or not isinstance(post_data.get("session"), dict)
        or not isinstance(post_data.get("items"), list)
    ):
        error_body = {
            "error": "request body must be a JSON object with a 'session' object and an 'items' list"
        }
        return jsonify(error_body), 400

    # Get session details. Field notes carried over from the original source:
    #   user_id: A unique ID for this study participant.
    #   user_name_hash: A (salted) hash of the user's username. We'll do our best
    #       to make it match the author_name_hash on posts authored by the current user.
    #   platform: One of reddit, twitter, facebook
    #   current_time: The current time according to the user's browser, in UTC,
    #       in YYYY-MM-DD hh:mm:ss format.
    session_details = post_data.get("session")
    print(session_details)

    results = rankingfunc(post_data)

    return jsonify(results)


if __name__ == "__main__":
    # Only reached when the module is run directly as a script.
    app.run(port=5001, debug=True)
requirements.txt ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.9.3
2
+ aiosignal==1.3.1
3
+ annotated-types==0.6.0
4
+ anyio==4.3.0
5
+ asgiref==3.8.1
6
+ asttokens==2.4.1
7
+ async-timeout==4.0.3
8
+ attrs==23.2.0
9
+ backoff==2.2.1
10
+ bcrypt==4.1.2
11
+ blinker==1.7.0
12
+ build==1.2.1
13
+ cachetools==5.3.3
14
+ certifi==2024.2.2
15
+ charset-normalizer==3.3.2
16
+ chroma-hnswlib==0.7.3
17
+ chromadb==0.4.24
18
+ click==8.1.7
19
+ coloredlogs==15.0.1
20
+ comm==0.2.2
21
+ dataclasses-json==0.6.4
22
+ debugpy==1.8.1
23
+ decorator==5.1.1
24
+ Deprecated==1.2.14
25
+ exceptiongroup==1.2.0
26
+ executing==2.0.1
27
+ fastapi==0.110.1
28
+ filelock==3.13.3
29
+ Flask==3.0.3
30
+ Flask-Cors==4.0.0
31
+ flatbuffers==24.3.25
32
+ frozenlist==1.4.1
33
+ fsspec==2024.3.1
34
+ google-auth==2.29.0
35
+ googleapis-common-protos==1.63.0
36
+ greenlet==3.0.3
37
+ grpcio==1.62.1
38
+ h11==0.14.0
39
+ httptools==0.6.1
40
+ huggingface-hub==0.22.2
41
+ humanfriendly==10.0
42
+ idna==3.6
43
+ importlib-metadata==7.0.0
44
+ importlib_resources==6.4.0
45
+ ipykernel==6.29.4
46
+ ipython==8.23.0
47
+ itsdangerous==2.1.2
48
+ jedi==0.19.1
49
+ Jinja2==3.1.3
50
+ joblib==1.3.2
51
+ jsonpatch==1.33
52
+ jsonpointer==2.4
53
+ jupyter_client==8.6.1
54
+ jupyter_core==5.7.2
55
+ kubernetes==29.0.0
56
+ langchain==0.1.14
57
+ langchain-community==0.0.31
58
+ langchain-core==0.1.40
59
+ langchain-text-splitters==0.0.1
60
+ langsmith==0.1.40
61
+ markdown-it-py==3.0.0
62
+ MarkupSafe==2.1.5
63
+ marshmallow==3.21.1
64
+ matplotlib-inline==0.1.6
65
+ mdurl==0.1.2
66
+ mmh3==4.1.0
67
+ monotonic==1.6
68
+ mpmath==1.3.0
69
+ multidict==6.0.5
70
+ mypy-extensions==1.0.0
71
+ nest-asyncio==1.6.0
72
+ networkx==3.3
73
+ numpy==1.26.4
74
+ nvidia-cublas-cu12==12.1.3.1
75
+ nvidia-cuda-cupti-cu12==12.1.105
76
+ nvidia-cuda-nvrtc-cu12==12.1.105
77
+ nvidia-cuda-runtime-cu12==12.1.105
78
+ nvidia-cudnn-cu12==8.9.2.26
79
+ nvidia-cufft-cu12==11.0.2.54
80
+ nvidia-curand-cu12==10.3.2.106
81
+ nvidia-cusolver-cu12==11.4.5.107
82
+ nvidia-cusparse-cu12==12.1.0.106
83
+ nvidia-nccl-cu12==2.19.3
84
+ nvidia-nvjitlink-cu12==12.4.127
85
+ nvidia-nvtx-cu12==12.1.105
86
+ oauthlib==3.2.2
87
+ onnxruntime==1.17.1
88
+ opentelemetry-api==1.24.0
89
+ opentelemetry-exporter-otlp-proto-common==1.24.0
90
+ opentelemetry-exporter-otlp-proto-grpc==1.24.0
91
+ opentelemetry-instrumentation==0.45b0
92
+ opentelemetry-instrumentation-asgi==0.45b0
93
+ opentelemetry-instrumentation-fastapi==0.45b0
94
+ opentelemetry-proto==1.24.0
95
+ opentelemetry-sdk==1.24.0
96
+ opentelemetry-semantic-conventions==0.45b0
97
+ opentelemetry-util-http==0.45b0
98
+ orjson==3.10.0
99
+ overrides==7.7.0
100
+ packaging==23.2
101
+ pandas==2.2.1
102
+ parso==0.8.4
103
+ pexpect==4.9.0
104
+ pillow==10.3.0
105
+ platformdirs==4.2.0
106
+ posthog==3.5.0
107
+ prompt-toolkit==3.0.43
108
+ protobuf==4.25.3
109
+ psutil==5.9.8
110
+ ptyprocess==0.7.0
111
+ pulsar-client==3.4.0
112
+ pure-eval==0.2.2
113
+ pyasn1==0.6.0
114
+ pyasn1_modules==0.4.0
115
+ pydantic==2.6.4
116
+ pydantic_core==2.16.3
117
+ Pygments==2.17.2
118
+ PyPika==0.48.9
119
+ pyproject_hooks==1.0.0
120
+ python-dateutil==2.9.0.post0
121
+ python-dotenv==1.0.1
122
+ pytz==2024.1
123
+ PyYAML==6.0.1
124
+ pyzmq==25.1.2
125
+ regex==2023.12.25
126
+ requests==2.31.0
127
+ requests-oauthlib==2.0.0
128
+ rich==13.7.1
129
+ rsa==4.9
130
+ safetensors==0.4.2
131
+ scikit-learn==1.4.1.post1
132
+ scipy==1.13.0
133
+ sentence-transformers==2.6.1
134
+ shellingham==1.5.4
135
+ six==1.16.0
136
+ sniffio==1.3.1
137
+ SQLAlchemy==2.0.29
138
+ stack-data==0.6.3
139
+ starlette==0.37.2
140
+ sympy==1.12
141
+ tenacity==8.2.3
142
+ threadpoolctl==3.4.0
143
+ tokenizers==0.15.2
144
+ tomli==2.0.1
145
+ torch==2.2.2
146
+ tornado==6.4
147
+ tqdm==4.66.2
148
+ traitlets==5.14.2
149
+ transformers==4.39.3
150
+ triton==2.2.0
151
+ typer==0.12.1
152
+ typing-inspect==0.9.0
153
+ typing_extensions==4.11.0
154
+ tzdata==2024.1
155
+ urllib3==2.2.1
156
+ uvicorn==0.29.0
157
+ uvloop==0.19.0
158
+ watchfiles==0.21.0
159
+ wcwidth==0.2.13
160
+ websocket-client==1.7.0
161
+ websockets==12.0
162
+ Werkzeug==3.0.2
163
+ wrapt==1.16.0
164
+ yarl==1.9.4
165
+ zipp==3.18.1
reranker.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import numpy as np
4
+ from numpy.linalg import norm
5
+ from scipy.stats import rankdata
6
+ from sentence_transformers import SentenceTransformer
7
+ from copy import deepcopy
8
+
9
+ #sample data
10
+ from sample_data import BASIC_EXAMPLE
11
+
12
#environment setup for HF docker image
# Set the cache location BEFORE the model is constructed; it was previously set
# after the model had already been loaded, too late to affect that load.
# NOTE(review): assumes sentence-transformers reads this variable when the model
# is constructed — confirm for the pinned version.
os.environ['SENTENCE_TRANSFORMERS_HOME'] = './.cache'

#load model
encodingModel = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

#create embeddings from example texts
# Locate the manifesto files next to this module rather than through an absolute
# path on one developer's machine, so the module also loads inside the container.
_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))

#left wing
with open(os.path.join(_MODULE_DIR, 'manifesto-left.txt'), 'r', encoding='utf-8') as f:
    LeftWingStr = f.read()

#right wing
with open(os.path.join(_MODULE_DIR, 'manifesto-right.txt'), 'r', encoding='utf-8') as f:
    RightWingStr = f.read()

# Each pair is [manifesto text, embedding of that text].
LWPair = [LeftWingStr, encodingModel.encode(LeftWingStr)]

RWPair = [RightWingStr, encodingModel.encode(RightWingStr)]
32
+
33
+ #cosine similarity
34
+
35
def cosineSim(x, y) -> float:
    """Return the cosine similarity of two vectors.

    Args:
        x: First vector (anything ``np.asarray`` accepts).
        y: Second vector, same length as ``x``.

    Returns:
        The dot product of ``x`` and ``y`` divided by the product of their
        norms, as a plain Python float. If either vector has zero norm the
        similarity is undefined and 0.0 is returned, instead of dividing by
        zero and producing NaN.
    """
    xArray = np.asarray(x)
    yArray = np.asarray(y)
    denominator = norm(xArray) * norm(yArray)
    if denominator == 0:
        # A zero vector has no direction; avoid the NaN a 0/0 would give.
        return 0.0
    return float(np.dot(xArray, yArray) / denominator)
40
+
41
+ #ranking func, purely cosine similarity ----- KINDA JANKY
42
def cosineRank(lhs: list, rhs: list) -> list:
    '''
    Rank each candidate embedding by its cosine similarity to ``lhs``.

    Args:
        lhs: Reference embedding.
        rhs: Iterable of candidate embeddings.

    Returns:
        Array of 0-based ranks, one per candidate and in the same order as
        ``rhs``. Rank 0 is the LEAST similar candidate. Ties are broken by
        input order (``method='ordinal'``) so the ranks are always distinct
        integers; the default averaging method would yield fractional ranks
        that cannot be used as positions.
    '''
    similarity_list = [cosineSim(lhs, candidate) for candidate in rhs]
    return rankdata(similarity_list, method='ordinal') - 1
51
+
52
def sort_text_cosine(LHSEmbedding, RHSEmbeddingList, RHSTextList) -> list:
    """Return the texts reordered by their rank from ``cosineRank``.

    Args:
        LHSEmbedding: Reference embedding.
        RHSEmbeddingList: Candidate embeddings, aligned with ``RHSTextList``.
        RHSTextList: Texts to reorder.

    Returns:
        A new list of the texts, lowest rank first.
    """
    result_order = cosineRank(LHSEmbedding, RHSEmbeddingList)
    print(result_order)
    # result_order[i] is the rank of item i, not the index of the item that
    # belongs at position i. Indexing the texts with the ranks applies the
    # inverse permutation, so argsort the ranks to get the real order.
    # A stable sort keeps input order for tied ranks.
    positions = np.argsort(result_order, kind='stable')
    return [RHSTextList[int(i)] for i in positions]
57
+
58
+ #trying to write a function that inputs and outputs dicts (start to end for API)
59
def rankingfunc(inputJSON: dict) -> dict:
    '''
    Rerank the items of a ranking request by cosine similarity to a manifesto.

    The reference embedding is the left-wing manifesto for the listed user
    ids and the right-wing manifesto for everyone else. Item texts are
    embedded with ``encodingModel`` and ordered by the ranks returned from
    ``cosineRank``, lowest rank first.

    Args:
        inputJSON: Request dict with a ``session`` dict containing
            ``user_id`` and an ``items`` list whose entries each have a
            ``text`` key.

    Returns:
        A deep copy of ``inputJSON`` whose ``items`` list is reordered.
        The input is not modified.
    '''

    #change LHS based on userID:
    if inputJSON['session']['user_id'] in ['193a9e01-8849-4e1f-a42a-a859fa7f2ad3']: #change this list to be for all users selected for left_wing
        LHS = LWPair
    else:
        LHS = RWPair

    #prep data for export
    # Work on a copy so the caller's dict and its items are never touched.
    output_dict = deepcopy(inputJSON)
    candidates = output_dict['items']
    if not candidates:
        # Nothing to rank; skip the model call entirely.
        return output_dict

    #prepare data and get embeddings
    texts = [x['text'] for x in candidates]
    embeddings = encodingModel.encode(texts)

    #rerank
    # Sort item indices by rank instead of writing a temporary 'rank' key into
    # each item, which would overwrite and then delete any 'rank' field already
    # present. sorted() is stable, so tied ranks keep their input order.
    item_rank = cosineRank(LHS[1], embeddings)
    order = sorted(range(len(candidates)), key=lambda index: item_rank[index])
    output_dict['items'] = [candidates[index] for index in order]

    return output_dict
92
+
93
+
94
+
95
+
96
+ # In[ ]:
97
+
98
+
99
+
100
+
sample_data.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Sample data containing multiple text items
2
+
3
+ BASIC_EXAMPLE = {
4
+ "session": {
5
+ "user_id": "193a9e01-8849-4e1f-a42a-a859fa7f2ad3",
6
+ "user_name_hash": "6511c5688bbb87798128695a283411a26da532df06e6e931a53416e379ddda0e",
7
+ "current_time": "2024-01-20 18:41:20",
8
+ },
9
+ "items": [
10
+ {
11
+ "id": "de83fc78-d648-444e-b20d-853bf05e4f0e",
12
+ "title": "this is the post title, available only on reddit",
13
+ "text": "this is the worst thing I have ever seen!",
14
+ "author_name_hash": "60b46b7370f80735a06b7aa8c4eb6bd588440816b086d5ef7355cf202a118305",
15
+ "type": "post",
16
+ "platform": "reddit",
17
+ "created_at": "2023-12-06 17:02:11",
18
+ "enagements": {"upvote": 34, "downvote": 27},
19
+ },
20
+ {
21
+ "id": "s5ad13266-8abk4-5219-kre5-2811022l7e43dv",
22
+ "post_id": "de83fc78-d648-444e-b20d-853bf05e4f0e",
23
+ "parent_id": "",
24
+ "text": "this is amazing!",
25
+ "author_name_hash": "60b46b7370f80735a06b7aa8c4eb6bd588440816b086d5ef7355cf202a118305",
26
+ "type": "comment",
27
+ "platform": "reddit",
28
+ "created_at": "2023-12-08 11:32:12",
29
+ "enagements": {"upvote": 15, "downvote": 2},
30
+ },
31
+ {
32
+ "id": "a4c08177-8db2-4507-acc1-1298220be98d",
33
+ "post_id": "de83fc78-d648-444e-b20d-853bf05e4f0e",
34
+ "parent_id": "s5ad13266-8abk4-5219-kre5-2811022l7e43dv",
35
+ "text": "this thing is ok.",
36
+ "author_name_hash": "60b46b7370f80735a06b7aa8c4eb6bd588440816b086d5ef7355cf202a118305",
37
+ "type": "comment",
38
+ "platform": "reddit",
39
+ "created_at": "2023-12-08 11:35:00",
40
+ "enagements": {"upvote": 3, "downvote": 5},
41
+ },
42
+ ],
43
+ }
44
+
45
+ # some new posts that can be added to the response
46
+ NEW_POSTS = [
47
+ {
48
+ "id": "571775f3-2564-4cf5-b01c-f4cb6bab461b",
49
+ "url": "https://reddit.com/r/PRCExample/comments/1f33ead/example_to_insert",
50
+ },
51
+ {
52
+ "id": "1fcbb164-f81f-4532-b068-2561941d0f63",
53
+ "url": "https://reddit.com/r/PRCExample/comments/ef56a23/another_example_to_insert",
54
+ },
55
+ ]