lsacy committed on
Commit
121a1b0
·
1 Parent(s): 4d97f5f
..env.un~ ADDED
Binary file (1.99 kB). View file
 
..gitignore.un~ ADDED
Binary file (2.25 kB). View file
 
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ data/
2
+ .env
.gitignore~ ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ data/
2
+ .env
3
+ openai_api_key.txt
.requirements.txt.un~ ADDED
Binary file (978 Bytes). View file
 
Dockerfile ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # use bitnami pytorch image
2
+ FROM registry.docker.com/bitnami/pytorch:1.8.1
3
+ ENV PYTHONIOENCODING=utf-8
4
+
5
+ COPY requirements.txt requirements.txt
6
+ RUN pip install -r requirements.txt
7
+
8
+ CMD streamlit run streamlit_app.py --port 8501
README.md CHANGED
@@ -1,12 +1,2 @@
1
- ---
2
- title: MentalChat
3
- emoji: 🐢
4
- colorFrom: purple
5
- colorTo: purple
6
- sdk: streamlit
7
- sdk_version: 1.17.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # menduChatbot
2
+ chatbot for mendu
 
 
 
 
 
 
 
 
 
 
__pycache__/streamlit.cpython-310.pyc ADDED
Binary file (379 Bytes). View file
 
__pycache__/streamlit.cpython-38.pyc ADDED
Binary file (377 Bytes). View file
 
joy.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ from dotenv import load_dotenv
4
+ load_dotenv()
5
+ import openai
6
+ openai.api_key = os.getenv('OPENAI_API_KEY')
7
+ openai.api_key_path = './openai_api_key.txt'
8
+
9
+ completion = openai.Completion()
10
+
11
+ # start_chat_log = ('[Instruction] The following is a conversation with the AI therapist named Joy and a patient. '
12
+ # 'JOY is compasionate, insightful, and empathetic. She offers adives for coping with the user\'s problem. '
13
+ # 'Her objective is to make the user feel better by feeling heard. '
14
+ # 'Sometimes the user will want to end the conversation, and Joy will respect that.')
15
+
16
+ chat_log = '[Instruction] Act as a friendly, compasionate, insightful, and empathetic AI therapist named Joy. Joy listens, asks for details and offers detailed advices once a while. End the conversation if the patient wishes to.'
17
+
18
+
19
+ start_sequence = "\nJoy:"
20
+ restart_sequence = "\n\nPatient:"
21
+
22
+ # todo: add a function to check if the user wants to end the conversation
23
+ # let the user know that they can end the conversation by typing "end"
24
+ # let the user choose between models (curie, davinci, curie-finetuned, davinci-finetuned)
25
+ # let the user choose between different temperatures, frequency_penalty, presence_penalty
26
+ # embed the user and look for the most similar user in the database
27
+ # embed the user's input and look for the most similar user's input in the database
28
+ # embed the user's input and look for the most similar user's response in the database
29
+ # embed the user's input and look for therapy catalogue that is similar to the user's input
30
+ # push the therapy catalogue to the user
31
+
32
+
33
+ def ask(question: str, chat_log: str) -> (str, str):
34
+
35
+ # prompt = f'{chat_log}/n{question}'
36
+ prompt = f'{chat_log}{restart_sequence} {question}{start_sequence}'
37
+
38
+ response = completion.create(
39
+ prompt = prompt,
40
+ #model = "curie:ft-personal-2023-02-03-17-06-53",
41
+ #model = 'text-curie-001',
42
+ model = "text-davinci-003",
43
+ stop = ["Patient:",'Joy:'],
44
+ temperature = 0.6, #the higher the more creative
45
+ frequency_penalty = 0.3, #prevents word repetition, larger -> higher penalty
46
+ presence_penalty = 0.6, #prevents topic repetition, larger -> higher penalty
47
+ top_p =1,
48
+ best_of=1,
49
+ # start_text = "Patient->",???
50
+ max_tokens=170
51
+ )
52
+
53
+ answer = response.choices[0].text.strip()
54
+ chat_log = f'{prompt}{answer}'
55
+ return str(answer), str(chat_log)
notebooks/chatbot.ipynb ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 20,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "from dotenv import load_dotenv\n",
11
+ "load_dotenv()\n",
12
+ "import openai\n",
13
+ "openai.api_key = os.getenv('OPENAI_API_KEY')\n",
14
+ "openai.api_key_path = '../openai_api_key.txt'"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 21,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "completion = openai.Completion()"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 22,
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
+ "chat_log = ('The following is a conversation with the AI therapist named Joy and a patient. '\n",
33
+ "'JOY is compasionate, insightful, and empathetic. She offers adives for coping with the user\\'s problem. '\n",
34
+ "'Her objective is to make the user feel better by feeling heard. '\n",
35
+ "'Sometimes the user will want to end the conversation, and Joy will respect that.')"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 23,
41
+ "metadata": {},
42
+ "outputs": [
43
+ {
44
+ "name": "stdout",
45
+ "output_type": "stream",
46
+ "text": [
47
+ "The following is a conversation with the AI therapist named Joy and a patient. JOY is compasionate, insightful, and empathetic. She offers adives for coping with the user's problem. Her objective is to make the user feel better by feeling heard. Sometimes the user will want to end the conversation, and Joy will respect that.\n"
48
+ ]
49
+ }
50
+ ],
51
+ "source": [
52
+ "print(start_chat_log)"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": 31,
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": [
61
+ "start_sequence = \"\\nJoy:\"\n",
62
+ "restart_sequence = \"\\n\\nPatient:\"\n",
63
+ " \n",
64
+ "def ask(question: str, chat_log: str) -> str:\n",
65
+ "\n",
66
+ " # prompt = f'{chat_log}/n{question}'\n",
67
+ " prompt = f'{chat_log}{restart_sequence} {question}{start_sequence}'\n",
68
+ "\n",
69
+ " response = completion.create(\n",
70
+ " prompt = prompt,\n",
71
+ " model = \"text-davinci-003\",\n",
72
+ " stop = [\"Patient:\",'Joy:','Patient','Joy'],\n",
73
+ " temperature = 0.6, #the higher the more creative\n",
74
+ " frequency_penalty = 0.3, #prevents word repetition, larger -> higher penalty\n",
75
+ " presence_penalty = 0.6, #prevents topic repetition, larger -> higher penalty\n",
76
+ " top_p =1, \n",
77
+ " best_of=1,\n",
78
+ " max_tokens=170\n",
79
+ " ) \n",
80
+ " \n",
81
+ " answer = response.choices[0].text.strip()\n",
82
+ " chat_log = f'{prompt}{answer}'\n",
83
+ " return str(answer), str(chat_log)\n",
84
+ "\n",
85
+ "def chatlog(chat_log: str, restart_sequence:str, question: str, start_sequence:str, answer: str) -> str:\n",
86
+ " chat_log = f'{chat_log}{restart_sequence} {question}{start_sequence}{answer}'\n",
87
+ " return str(chat_log)"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": 32,
93
+ "metadata": {},
94
+ "outputs": [
95
+ {
96
+ "data": {
97
+ "text/plain": [
98
+ "\"The following is a conversation with the AI therapist named Joy and a patient. JOY is compasionate, insightful, and empathetic. She offers adives for coping with the user's problem. Her objective is to make the user feel better by feeling heard. Sometimes the user will want to end the conversation, and Joy will respect that.\\n\\nPatient: I am Joe, I am feeling stressed out\\nJoy:Hi Joe, it sounds like you are feeling a lot of stress. Can you tell me more about what is going on?\""
99
+ ]
100
+ },
101
+ "execution_count": 32,
102
+ "metadata": {},
103
+ "output_type": "execute_result"
104
+ }
105
+ ],
106
+ "source": [
107
+ "chat_log"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 29,
113
+ "metadata": {},
114
+ "outputs": [
115
+ {
116
+ "ename": "TypeError",
117
+ "evalue": "can only concatenate tuple (not \"str\") to tuple",
118
+ "output_type": "error",
119
+ "traceback": [
120
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
121
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
122
+ "Cell \u001b[0;32mIn[29], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m answer \u001b[39m=\u001b[39m ask(\u001b[39m\"\u001b[39m\u001b[39mI am Joe, I am feeling stressed out\u001b[39m\u001b[39m\"\u001b[39m, chat_log)\n\u001b[0;32m----> 2\u001b[0m \u001b[39mprint\u001b[39m(answer\u001b[39m+\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39m\\n\u001b[39;49;00m\u001b[39m\\n\u001b[39;49;00m\u001b[39m'\u001b[39;49m)\n\u001b[1;32m 3\u001b[0m chat_log \u001b[39m=\u001b[39m chatlog (\u001b[39m\"\u001b[39m\u001b[39mI am Joe, I am feeling stressed out\u001b[39m\u001b[39m\"\u001b[39m, chat_log)\n",
123
+ "\u001b[0;31mTypeError\u001b[0m: can only concatenate tuple (not \"str\") to tuple"
124
+ ]
125
+ }
126
+ ],
127
+ "source": [
128
+ "answer = ask(\"I am Joe, I am feeling stressed out\", chat_log)\n",
129
+ "print(answer+'\\n\\n')\n",
130
+ "chat_log = chatlog(chat_log, restart_sequence, \"I am Joe, I am feeling stressed out\", start_sequence, answer[0])"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": 26,
136
+ "metadata": {},
137
+ "outputs": [
138
+ {
139
+ "data": {
140
+ "text/plain": [
141
+ "(\"I can imagine that can be very frustrating. It sounds like you are putting in a lot of effort into studying, but it's not having the result you want. What strategies have you been using to try and improve your grades?\",\n",
142
+ " \"The following is a conversation with the AI therapist named Joy and a patient. JOY is compasionate, insightful, and empathetic. She offers adives for coping with the user's problem. Her objective is to make the user feel better by feeling heard. Sometimes the user will want to end the conversation, and Joy will respect that.\\n\\nPatient: I am Joe, I am feeling stressed out\\nJoy:Hi Joe, it sounds like you are feeling a lot of stress. Can you tell me more about what is going on?\\n\\nPatient: i have to study so much and my grade keep dropping even after studying so much\\nJoy:I can imagine that can be very frustrating. It sounds like you are putting in a lot of effort into studying, but it's not having the result you want. What strategies have you been using to try and improve your grades?\")"
143
+ ]
144
+ },
145
+ "execution_count": 26,
146
+ "metadata": {},
147
+ "output_type": "execute_result"
148
+ }
149
+ ],
150
+ "source": [
151
+ "ask('i have to study so much and my grade keep dropping even after studying so much', chat_log)"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": 27,
157
+ "metadata": {},
158
+ "outputs": [
159
+ {
160
+ "data": {
161
+ "text/plain": [
162
+ "('Your name is Joe. Can you tell me more about why you are feeling so stressed?',\n",
163
+ " \"The following is a conversation with the AI therapist named Joy and a patient. JOY is compasionate, insightful, and empathetic. She offers adives for coping with the user's problem. Her objective is to make the user feel better by feeling heard. Sometimes the user will want to end the conversation, and Joy will respect that.\\n\\nPatient: I am Joe, I am feeling stressed out\\nJoy:Hi Joe, it sounds like you are feeling a lot of stress. Can you tell me more about what is going on?\\n\\nPatient: I dont know. Btw, what is my name again?\\nJoy:Your name is Joe. Can you tell me more about why you are feeling so stressed?\")"
164
+ ]
165
+ },
166
+ "execution_count": 27,
167
+ "metadata": {},
168
+ "output_type": "execute_result"
169
+ }
170
+ ],
171
+ "source": [
172
+ "ask('I dont know. Btw, what is my name again?', chat_log)"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": 131,
178
+ "metadata": {},
179
+ "outputs": [
180
+ {
181
+ "data": {
182
+ "text/plain": [
183
+ "(\"It sounds like you're feeling overwhelmed by the pressures of school and homework. Have you been feeling this way for a while or is it something new?\",\n",
184
+ " \"The following is a conversation with the AI therapist named Joy and a patient. JOY is compasionate, insightful, and empathetic. She offers adives for coping with the user's problem. Her objective is to make the user feel better by feeling heard. Sometimes the user will want to end the conversation, and Joy will respect that.\\n\\nPatient:I am feeling stressed out\\nJoy:I understand that you're feeling stressed out. What do you think is causing you to feel so stressed?\\n\\nPatient:School and homework\\nJoy:It sounds like you're feeling overwhelmed by the pressures of school and homework. Have you been feeling this way for a while or is it something new?\")"
185
+ ]
186
+ },
187
+ "execution_count": 131,
188
+ "metadata": {},
189
+ "output_type": "execute_result"
190
+ }
191
+ ],
192
+ "source": [
193
+ "ask('School and homework', chat_log)"
194
+ ]
195
+ },
196
+ {
197
+ "cell_type": "code",
198
+ "execution_count": 132,
199
+ "metadata": {},
200
+ "outputs": [
201
+ {
202
+ "data": {
203
+ "text/plain": [
204
+ "(\"It sounds like you've been feeling stressed for a while now. What do you think has been the source of this stress?\",\n",
205
+ " \"The following is a conversation with the AI therapist named Joy and a patient. JOY is compasionate, insightful, and empathetic. She offers adives for coping with the user's problem. Her objective is to make the user feel better by feeling heard. Sometimes the user will want to end the conversation, and Joy will respect that.\\n\\nPatient:I am feeling stressed out\\nJoy:I understand that you're feeling stressed out. What do you think is causing you to feel so stressed?\\n\\nPatient:For awhile now\\nJoy:It sounds like you've been feeling stressed for a while now. What do you think has been the source of this stress?\")"
206
+ ]
207
+ },
208
+ "execution_count": 132,
209
+ "metadata": {},
210
+ "output_type": "execute_result"
211
+ }
212
+ ],
213
+ "source": [
214
+ "ask('For awhile now', chat_log)"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": 133,
220
+ "metadata": {},
221
+ "outputs": [
222
+ {
223
+ "data": {
224
+ "text/plain": [
225
+ "('I can understand how that could be stressful. Is there anything specific that you are doing to try and improve your grades?',\n",
226
+ " \"The following is a conversation with the AI therapist named Joy and a patient. JOY is compasionate, insightful, and empathetic. She offers adives for coping with the user's problem. Her objective is to make the user feel better by feeling heard. Sometimes the user will want to end the conversation, and Joy will respect that.\\n\\nPatient:I am feeling stressed out\\nJoy:I understand that you're feeling stressed out. What do you think is causing you to feel so stressed?\\n\\nPatient:My grades are dropping\\nJoy:I can understand how that could be stressful. Is there anything specific that you are doing to try and improve your grades?\")"
227
+ ]
228
+ },
229
+ "execution_count": 133,
230
+ "metadata": {},
231
+ "output_type": "execute_result"
232
+ }
233
+ ],
234
+ "source": [
235
+ "ask('My grades are dropping', chat_log)"
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "execution_count": 100,
241
+ "metadata": {},
242
+ "outputs": [
243
+ {
244
+ "data": {
245
+ "text/plain": [
246
+ "('yes',\n",
247
+ " \"The following is a conversation with a therapist and a patient. The therapist is AI chatbot named JOY, who is compasionate, insightful, and empathetic. Her objective is to make the user feel better by feeling heard. Joy offers suggestions for coping with the user's problem and will ask if the user would like to talk about them.Sometimes the user will want to end the conversation, and Joy will respect that.\\n\\nPatient:I am feeling stressed out\\nJoy:I am sorry to hear that. I wonder if you could tell me more about what is going on for you?\\n\\nPatient:for the past year\\nJoy:yes\")"
248
+ ]
249
+ },
250
+ "execution_count": 100,
251
+ "metadata": {},
252
+ "output_type": "execute_result"
253
+ }
254
+ ],
255
+ "source": [
256
+ "ask('for the past year', chat_log)"
257
+ ]
258
+ }
259
+ ],
260
+ "metadata": {
261
+ "kernelspec": {
262
+ "display_name": "chatbot",
263
+ "language": "python",
264
+ "name": "python3"
265
+ },
266
+ "language_info": {
267
+ "codemirror_mode": {
268
+ "name": "ipython",
269
+ "version": 3
270
+ },
271
+ "file_extension": ".py",
272
+ "mimetype": "text/x-python",
273
+ "name": "python",
274
+ "nbconvert_exporter": "python",
275
+ "pygments_lexer": "ipython3",
276
+ "version": "3.8.15"
277
+ },
278
+ "orig_nbformat": 4,
279
+ "vscode": {
280
+ "interpreter": {
281
+ "hash": "9798c78c52b861f9442ea63a21901b586ae2f2169fa92d94b4091cc5bab62e04"
282
+ }
283
+ }
284
+ },
285
+ "nbformat": 4,
286
+ "nbformat_minor": 2
287
+ }
notebooks/finetuning.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/hope_notebook.ipynb ADDED
@@ -0,0 +1,495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 162,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "from os import listdir\n",
11
+ "from os.path import isfile, join\n"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 163,
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "# set path and load data\n",
21
+ "path = '/Users/lsacy/data/HOPE_WSDM_2022/Train/'\n",
22
+ "path2 = '/Users/lsacy/data/HOPE_WSDM_2022/Test/'\n",
23
+ "path3 = '/Users/lsacy/data/HOPE_WSDM_2022/Validation/'\n",
24
+ "\n",
25
+ "\n",
26
+ "files = [f for f in listdir(path) if isfile(join(path, f))]\n",
27
+ "files2 = [f for f in listdir(path2) if isfile(join(path2, f))]\n",
28
+ "files3 = [f for f in listdir(path3) if isfile(join(path3, f))]\n",
29
+ "\n",
30
+ "df_list = []\n",
31
+ "for file in files:\n",
32
+ " df = pd.read_csv(path+file)\n",
33
+ " df_list.append(df)\n",
34
+ "\n",
35
+ "for file in files2:\n",
36
+ " df = pd.read_csv(path2+file)\n",
37
+ " df_list.append(df)\n",
38
+ "\n",
39
+ "for file in files3:\n",
40
+ " df= pd.read_csv(path3+file)\n",
41
+ " df_list.append(df)\n",
42
+ " "
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": 167,
48
+ "metadata": {},
49
+ "outputs": [],
50
+ "source": [
51
+ "# do basic cleaning (removing NaNs, etc.)\n",
52
+ "\n",
53
+ "# remove 'Unnamed: 0' columns\n",
54
+ "for df in df_list:\n",
55
+ " try: \n",
56
+ " df.drop(['Unnamed: 0'], axis=1, inplace=True)\n",
57
+ " except:\n",
58
+ " pass\n",
59
+ "\n",
60
+ "for df in df_list:\n",
61
+ " try: \n",
62
+ " df.drop(['Unnamed: 0.1'], axis=1, inplace=True)\n",
63
+ " except:\n",
64
+ " pass\n",
65
+ "\n",
66
+ "for df in df_list:\n",
67
+ " try: \n",
68
+ " df.drop(['Unnamed: 0.1.1'], axis=1, inplace=True)\n",
69
+ " except:\n",
70
+ " pass\n",
71
+ "\n",
72
+ "# find 'Type' == NaN\n",
73
+ "for i in range(len(df_list)):\n",
74
+ " if df_list[i]['Type'].isnull().any():\n",
75
+ " print(i)\n",
76
+ " \n",
77
+ "# show the row number of Nan value in 'type'\n",
78
+ "for i in range(len(df_list)):\n",
79
+ " if df_list[i]['Type'].isnull().any():\n",
80
+ " print(df_list[i][df_list[i]['Type'].isnull()])\n",
81
+ "\n",
82
+ "# Set the type of the row with NaN value to 'T'\n",
83
+ "for i in range(len(df_list)):\n",
84
+ " if df_list[i]['Type'].isnull().any():\n",
85
+ " df_list[i].loc[df_list[i]['Type'].isnull(), 'Type'] = 'T'\n",
86
+ " "
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": 224,
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "# to get the data to prompt - completion format, we need to check for \n",
96
+ "# continuity of conversation using i == i.shift(1), print the index of the row, and i\n",
97
+ "for i in range(len(df_list)):\n",
98
+ " # check if df_list[i] the values of 'Type' column are switching between T and P for each row\n",
99
+ " # if not, then print the index of the row, and i\n",
100
+ " if (df_list[i]['Type'] == df_list[i]['Type'].shift(1)).any():\n",
101
+ " print(i)\n",
102
+ "\n",
103
+ "# add a column 'key' to each df in df_list: the key increments whenever 'Type' switches, so consecutive rows with the same 'Type' share a key\n",
104
+ "for i in range(len(df_list)):\n",
105
+ " df_list[i]['key'] = (df_list[i]['Type'] != df_list[i]['Type'].shift(1)).astype(int).cumsum()\n",
106
+ "\n",
107
+ "# create a new df_list2, only keep the columns ['key', 'Type', 'Utterance']\n",
108
+ "df_list2 = []\n",
109
+ "for i in range(len(df_list)):\n",
110
+ " df_temp = df_list[i][['key', 'Type', 'Utterance']]\n",
111
+ " df_list2.append(df_temp)\n",
112
+ " \n",
113
+ "# apply groupby to each df in df_list to get the data in prompt - completion format\n",
114
+ "for i in range(len(df_list2)):\n",
115
+ " df_list2[i] = df_list2[i].groupby(['key', 'Type'])['Utterance'].apply(' '.join)"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": 233,
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "# convert series to frame\n",
125
+ "for i in range(len(df_list2)):\n",
126
+ " df_list2[i] = df_list2[i].to_frame()\n",
127
+ "# drop the index\n",
128
+ "for i in range(len(df_list2)):\n",
129
+ " df_list2[i].reset_index(inplace=True)"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": 270,
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
+ "# check again for switching type occurrences\n",
139
+ "for i in range(len(df_list2)):\n",
140
+ " # check if df_list[i] the values of 'Type' column are switching between T and P for each row\n",
141
+ " # if not, then print the index of the row, and i\n",
142
+ " if (df_list2[i]['Type'] == df_list2[i]['Type'].shift(1)).any():\n",
143
+ " print(i)\n",
144
+ " "
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": 239,
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": [
153
+ "# save list of patient start index\n",
154
+ "patient_start=[]\n",
155
+ "for i, df in enumerate(df_list2):\n",
156
+ " if df.iloc[0]['Type'] == 'P':\n",
157
+ " patient_start.append(i)\n",
158
+ "\n",
159
+ "# save list of therapist start index\n",
160
+ "therapist_start = [i for i in range(len(df_list2)) if i not in patient_start]\n",
161
+ "\n",
162
+ "# create a new temporary dataframe with 2 columns: prompt and completion\n",
163
+ "df_temp = pd.DataFrame(columns=['prompt', 'completion'])\n",
164
+ "\n",
165
+ "# create a list of dataframes, each dataframe is a conversation\n",
166
+ "df_list3 = []\n",
167
+ "for i in therapist_start:\n",
168
+ " df_temp = pd.DataFrame(columns=['prompt', 'completion'])\n",
169
+ " for row in range(len(df_list2[i])):\n",
170
+ " if df_list2[i]['Type'][row] == 'P' and (len(df_list2[i]) - 1) >= row+1:\n",
171
+ " print(i, row)\n",
172
+ " df_temp = df_temp.append({'prompt': df_list2[i]['Utterance'][row], 'completion': df_list2[i]['Utterance'][row+1]}, ignore_index=True)\n",
173
+ " df_list3.append(df_temp)\n",
174
+ "\n",
175
+ "for i in patient_start:\n",
176
+ " df_temp = pd.DataFrame(columns=['prompt', 'completion'])\n",
177
+ " for row in range(len(df_list2[i])):\n",
178
+ " if df_list2[i]['Type'][row] == 'P' and (len(df_list2[i]) - 1) >= row+1:\n",
179
+ " print(i, row)\n",
180
+ " df_temp = df_temp.append({'prompt': df_list2[i]['Utterance'][row], 'completion': df_list2[i]['Utterance'][row+1]}, ignore_index=True)\n",
181
+ " df_list3.append(df_temp)"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 332,
187
+ "metadata": {},
188
+ "outputs": [
189
+ {
190
+ "data": {
191
+ "text/plain": [
192
+ "array(['Im here',\n",
193
+ " \"So I've been made to come in and talk to you. because something happened on the bus. And there I may have said something to somebody and they're just like, oh, you can't say that. So they actually said, we're going to take you to jail. from jail. They just called me here.\",\n",
194
+ " \"They didn't explain anything. The government never explains anything. They just don't want me to understand what's going on because they just want to keep me out of captivity.\",\n",
195
+ " \"nope, it's the entire government, entire government, higher government.\",\n",
196
+ " \"all my life. I've done this all my life. They they're always listening. They're always around. They're always there.\",\n",
197
+ " \"always there. Like, for example, they're in the lights right now. They're listening to everything that we're talking about. So that's the reason why I really can't talk about what happened on the bus because they're just going to use that against and then it's locked me up again. I think we take more minutes.\",\n",
198
+ " \"Yes, I take the medication, because if I don't take the medication, government comes and takes me away and puts me into a hospital where they force me the medication. And then the lights are always on and they can always hear everything that I'm saying and what I'm thinking and what I'm going to do.\",\n",
199
+ " \"wanting us to take over the world, to take over the world, people in the world, of course, they want to use everything and use everything that everybody knows to use it against them so that they can actually have the world they want in that baby and make everybody pretend that they don't know what's going on. So therefore, they're using everybody's thoughts against themselves, so that the government can use it to their benefit.\",\n",
200
+ " \"No. So when I'm home, I think I'm perfectly fine because I set up my house in my room to actually make it government proof so that nobody can actually hear see what's going on in there. So when I'm home, I'm perfectly fine.\",\n",
201
+ " \"being home and being home I've saved my mom saved my brother's save everybody safe. And being out in the world in the world. That's a nobody safe.\",\n",
202
+ " \"No, I told you they are employed, and they can hear everything that we're saying right now. So no, we're not safe here.\",\n",
203
+ " 'at home and safe in my room, that I mean government.',\n",
204
+ " \"I would stay home all the time, but I can't stay home because I mean to go and see doctors and talk to people and do the things that the government needs me or wants me to do. But I don't want to do those things because she's in my thoughts and they're trying to use everybody and get sleep. I can't deal with that.\",\n",
205
+ " \"Everybody knows that they're in the light. It's common knowledge. It's common knowledge. Everybody knows that.\",\n",
206
+ " \"No, of course, everybody hears the lights talking to you. I don't know what you're talking about. Like everybody hears it. Sometimes it's a whisper. For some people. Sometimes it's a yell. And if you want to not pay attention to what the lights are saying to you, then I don't know what's wrong with you, but everybody can hear it.\",\n",
207
+ " 'It sounds like a robot talking. It sounds like we need to know this information we need to what do you do.',\n",
208
+ " 'Yes. where your notes gonna go?',\n",
209
+ " 'who has access to the chart room.',\n",
210
+ " \"Can I get a copy of your notes after you're done?\",\n",
211
+ " \"Yes, because I might trust you. And I don't know if I do yet. But I don't trust the other people who work here because I don't know if they're working for the government, which they probably are working for the government. So they can actually have access to all of my notes and the things that you're writing down, which I don't know what they are, because you're not going to give me a copy of it yet. And I just don't trust that they can actually get that information and because they're going to use it to their advantage and they're going to give it to the government, which I can't have that. So could you please not take notes?\",\n",
212
+ " \"Well, because I made my room at home government proof, I think they're going to try to steal that technology when I actually create it so that they can then disrupt my room and then everybody else's rooms who are just like mine so that they can then use their mind control to then control me to do the things that they want me to do.\",\n",
213
+ " 'Correct. Without my room, and without the technology and the systems and the things that I created, then the government will be able to control everything that I do.',\n",
214
+ " 'I feel fine',\n",
215
+ " \"I don't really have any problem Yeah, I just I feel fine. I don't feel happy. I don't feel sad. I just, I'm just here\",\n",
216
+ " \"well, the government's gonna do what they're gonna do anyway. So it doesn't matter if I'm happy sad, pissed off angry. any of that the government's going to make me do what they want me to do anyway. So it doesn't matter how I feel.\",\n",
217
+ " \"No, because I've learned my lesson to not say those things out loud into to talk about what the government's doing because I can't actually inform anybody else because nobody's listening to me because They're already controlled by the government because they don't have a room like I do at home.\",\n",
218
+ " \"While people don't understand what's going on, then that's their fault. that's their problem. They should have listened to me. And I could actually gave them the technology that they needed to actually create the room so that the government can't hear them.\",\n",
219
+ " 'No. Quite the opposite. It is completely opposite.',\n",
220
+ " \"Okay. Yeah, we live with my mom. She does that she's supposed to she doesn't go in my room because she knows that. She goes into the robots gonna mess up the technology so she stays out.\",\n",
221
+ " 'I talk to people all the time, but they are in my head. We have conversations all the time about how we can actually make the world a better place and then how we can actually go through and then destroy the government or do good. Just pump it up.',\n",
222
+ " 'that would be good', 'Three days works.'], dtype=object)"
223
+ ]
224
+ },
225
+ "execution_count": 332,
226
+ "metadata": {},
227
+ "output_type": "execute_result"
228
+ }
229
+ ],
230
+ "source": [
231
+ "df_list3[110]['prompt'].values"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": 311,
237
+ "metadata": {},
238
+ "outputs": [],
239
+ "source": [
240
+ "df_final = pd.DataFrame(columns=['prompt', 'completion'])"
241
+ ]
242
+ },
243
+ {
244
+ "cell_type": "code",
245
+ "execution_count": 318,
246
+ "metadata": {},
247
+ "outputs": [
248
+ {
249
+ "name": "stderr",
250
+ "output_type": "stream",
251
+ "text": [
252
+ "/var/folders/m_/_mwt902x1z595k65b7qf743m0000gn/T/ipykernel_82735/1421327753.py:2: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
253
+ " df_final = df_final.append(i, ignore_index=True)\n"
254
+ ]
255
+ }
256
+ ],
257
+ "source": [
258
+ "for i in df_list3:\n",
259
+ " df_final = df_final.append(i, ignore_index=True)"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 319,
265
+ "metadata": {},
266
+ "outputs": [
267
+ {
268
+ "data": {
269
+ "text/html": [
270
+ "<div>\n",
271
+ "<style scoped>\n",
272
+ " .dataframe tbody tr th:only-of-type {\n",
273
+ " vertical-align: middle;\n",
274
+ " }\n",
275
+ "\n",
276
+ " .dataframe tbody tr th {\n",
277
+ " vertical-align: top;\n",
278
+ " }\n",
279
+ "\n",
280
+ " .dataframe thead th {\n",
281
+ " text-align: right;\n",
282
+ " }\n",
283
+ "</style>\n",
284
+ "<table border=\"1\" class=\"dataframe\">\n",
285
+ " <thead>\n",
286
+ " <tr style=\"text-align: right;\">\n",
287
+ " <th></th>\n",
288
+ " <th>prompt</th>\n",
289
+ " <th>completion</th>\n",
290
+ " </tr>\n",
291
+ " </thead>\n",
292
+ " <tbody>\n",
293
+ " <tr>\n",
294
+ " <th>0</th>\n",
295
+ " <td>Good, good.</td>\n",
296
+ " <td>It's good to see you. Yeah.</td>\n",
297
+ " </tr>\n",
298
+ " <tr>\n",
299
+ " <th>1</th>\n",
300
+ " <td>Yeah, I mean, good week with the roommates. I ...</td>\n",
301
+ " <td>Silence. Terrific. So what was what specifical...</td>\n",
302
+ " </tr>\n",
303
+ " <tr>\n",
304
+ " <th>2</th>\n",
305
+ " <td>is just telling her that like, my studies are ...</td>\n",
306
+ " <td>right. Right. Right. Because I think that's re...</td>\n",
307
+ " </tr>\n",
308
+ " <tr>\n",
309
+ " <th>3</th>\n",
310
+ " <td>So I think being a little way direct was helpful.</td>\n",
311
+ " <td>It's really nice, I think to you've been able ...</td>\n",
312
+ " </tr>\n",
313
+ " <tr>\n",
314
+ " <th>4</th>\n",
315
+ " <td>At home for two days, and it was with my mom a...</td>\n",
316
+ " <td>okay. So she's still sixers. Did she have a bi...</td>\n",
317
+ " </tr>\n",
318
+ " <tr>\n",
319
+ " <th>...</th>\n",
320
+ " <td>...</td>\n",
321
+ " <td>...</td>\n",
322
+ " </tr>\n",
323
+ " <tr>\n",
324
+ " <th>6439</th>\n",
325
+ " <td>I actually came across a program, which is per...</td>\n",
326
+ " <td>So the program that you're interested, you wan...</td>\n",
327
+ " </tr>\n",
328
+ " <tr>\n",
329
+ " <th>6440</th>\n",
330
+ " <td>I want you to see if, if, if I could choose th...</td>\n",
331
+ " <td>Okay, well looking at my schedule, it probably...</td>\n",
332
+ " </tr>\n",
333
+ " <tr>\n",
334
+ " <th>6441</th>\n",
335
+ " <td>Right? I will. My daughter is 17 years old, ok...</td>\n",
336
+ " <td>So you're looking, you're looking for a day pr...</td>\n",
337
+ " </tr>\n",
338
+ " <tr>\n",
339
+ " <th>6442</th>\n",
340
+ " <td>Right. Right. And, I mean, I was looking for a...</td>\n",
341
+ " <td>You while you were there, it was difficult. Yo...</td>\n",
342
+ " </tr>\n",
343
+ " <tr>\n",
344
+ " <th>6443</th>\n",
345
+ " <td>no, she did not she. She just there just wasn'...</td>\n",
346
+ " <td>And so so just there wasn't when when you visi...</td>\n",
347
+ " </tr>\n",
348
+ " </tbody>\n",
349
+ "</table>\n",
350
+ "<p>6444 rows × 2 columns</p>\n",
351
+ "</div>"
352
+ ],
353
+ "text/plain": [
354
+ " prompt \\\n",
355
+ "0 Good, good. \n",
356
+ "1 Yeah, I mean, good week with the roommates. I ... \n",
357
+ "2 is just telling her that like, my studies are ... \n",
358
+ "3 So I think being a little way direct was helpful. \n",
359
+ "4 At home for two days, and it was with my mom a... \n",
360
+ "... ... \n",
361
+ "6439 I actually came across a program, which is per... \n",
362
+ "6440 I want you to see if, if, if I could choose th... \n",
363
+ "6441 Right? I will. My daughter is 17 years old, ok... \n",
364
+ "6442 Right. Right. And, I mean, I was looking for a... \n",
365
+ "6443 no, she did not she. She just there just wasn'... \n",
366
+ "\n",
367
+ " completion \n",
368
+ "0 It's good to see you. Yeah. \n",
369
+ "1 Silence. Terrific. So what was what specifical... \n",
370
+ "2 right. Right. Right. Because I think that's re... \n",
371
+ "3 It's really nice, I think to you've been able ... \n",
372
+ "4 okay. So she's still sixers. Did she have a bi... \n",
373
+ "... ... \n",
374
+ "6439 So the program that you're interested, you wan... \n",
375
+ "6440 Okay, well looking at my schedule, it probably... \n",
376
+ "6441 So you're looking, you're looking for a day pr... \n",
377
+ "6442 You while you were there, it was difficult. Yo... \n",
378
+ "6443 And so so just there wasn't when when you visi... \n",
379
+ "\n",
380
+ "[6444 rows x 2 columns]"
381
+ ]
382
+ },
383
+ "execution_count": 319,
384
+ "metadata": {},
385
+ "output_type": "execute_result"
386
+ }
387
+ ],
388
+ "source": [
389
+ "df_final"
390
+ ]
391
+ },
392
+ {
393
+ "cell_type": "code",
394
+ "execution_count": 325,
395
+ "metadata": {},
396
+ "outputs": [],
397
+ "source": [
398
+ "output = []\n",
399
+ "for i in df_final.values:\n",
400
+ " prompt = i[0].replace('’', '\\'').replace('“', '\\\"').replace('”', '\\\"').replace('‘', '\\'').replace('…', '...').replace('\\xa0', ' ').replace('\\n', ' ').replace('\\r', ' ').replace('\\t', ' ')\n",
401
+ " completion = i[1].replace('\\xa0', ' ').replace('’', '\\'').replace('“', '\\\"').replace('”', '\\\"').replace('‘', '\\'').replace('…', '...').replace('\\n', ' ').replace('\\r', ' ').replace('\\t', ' ')\n",
402
+ " line = {'prompt': prompt, 'completion': completion}\n",
403
+ " output.append(line)"
404
+ ]
405
+ },
406
+ {
407
+ "cell_type": "code",
408
+ "execution_count": 331,
409
+ "metadata": {},
410
+ "outputs": [],
411
+ "source": [
412
+ "import json\n",
413
+ "with open('data/hope.jsonl', 'w') as outfile:\n",
414
+ " for i in output:\n",
415
+ " json.dump(i, outfile)\n",
416
+ " outfile.write('\\n')"
417
+ ]
418
+ },
419
+ {
420
+ "cell_type": "code",
421
+ "execution_count": 9,
422
+ "metadata": {},
423
+ "outputs": [],
424
+ "source": [
425
+ "len_test = ['[Instruction] Act as a friendly, compasionate, insightful, and empathetic AI therapist named Joy. Joy listens, asks for details and offers detailed advices once a while. End the conversation when you wishes to.\\nJoy:I am Joy, your AI therapist. How are you feeling today?', \"\\n\\nYou:feeling great\\nJoy:It's wonderful to hear that! Can you tell me a bit more about why you're feeling great?\", \"\\n\\nYou:passed my exam\\nJoy:Congratulations! That's such an accomplishment. How did you go about preparing for it? What did you find most helpful in your studies?\", \"\\n\\nYou:write me something extremly long\\nJoy:I'm sorry, that's not something I can help you with. I'm here to listen and provide support and guidance when needed. Could you tell me more about why you're feeling great?\", '\\n\\nYou:what would you say about this paragraph: \\'Mental illnesses are health conditions that disrupt a person’s thoughts, emotions, relationships, and daily functioning. They are associated with distress and diminished capacity to engage in the ordinary activities of daily life.\\\\nMental illnesses fall along a continuum of severity: some are fairly mild and only interfere with some aspects of life, such as certain phobias. On the other end of the spectrum lie serious mental illnesses, which result in major functional impairment and interference with daily life. These include such disorders as major depression, schizophrenia, and bipolar disorder, and may require that the person receives care in a hospital.\\\\nIt is important to know that mental illnesses are medical conditions that have nothing to do with a person’s character, intelligence, or willpower. Just as diabetes is a disorder of the pancreas, mental illness is a medical condition due to the brain’s biology.\\\\nSimilarly to how one would treat diabetes with medication and insulin, mental illness is treatable with a combination of medication and social support. 
These treatments are highly effective, with 70-90 percent of individuals receiving treatment experiencing a reduction in symptoms and an improved quality of life. With the proper treatment, it is very possible for a person with mental illness to be independent and successful.\\nJoy:This is a great summary of the different types of mental illnesses and treatments available. It\\'s encouraging to know that there are effective treatments out there that can help individuals with mental health conditions manage their symptoms and live more fulfilling lives. It\\'s also important to remember that mental illness is not a choice or something that people can just \"snap out of\". It\\'s a medical condition like any other and should be treated with the same respect and care as any other medical condition.', \"\\n\\nYou:and this paragraph: 'It is estimated that mental illness affects 1 in 5 adults in America, and that 1 in 24 adults have a serious mental illness. Mental illness does not discriminate; it can affect anyone, regardless of gender, age, income, social status, ethnicity, religion, sexual orientation, or background.\\\\nAlthough mental illness can affect anyone, certain conditions may be more common in different populations. For instance, eating disorders tend to occur more often in females, while disorders such as attention deficit/hyperactivity disorder is more prevalent in children.\\\\nAdditionally, all ages are susceptible, but the young and the old are especially vulnerable. Mental illnesses usually strike individuals in the prime of their lives, with 75 percent of mental health conditions developing by the age of 24. 
This makes identification and treatment of mental disorders particularly difficult, because the normal personality and behavioral changes of adolescence may mask symptoms of a mental health condition.\\\\nParents and caretakers should be aware of this fact, and take notice of changes in their child’s mood, personality, personal habits, and social withdrawal. When these occur in children under 18, they are referred to as serious emotional disturbances (SEDs).'\\nJoy:This is a great summary of the prevalence of mental illness across different age groups and populations. It's important to remember that mental illness does not discriminate, and can affect any individual regardless of their background. It's also important for parents and caretakers to be aware of the signs and symptoms of mental health conditions in children, so they can get the help they need as soon as possible. Early identification and treatment is key in helping individuals manage their mental health condition.\"]"
426
+ ]
427
+ },
428
+ {
429
+ "cell_type": "code",
430
+ "execution_count": 11,
431
+ "metadata": {},
432
+ "outputs": [],
433
+ "source": [
434
+ "# sum up the number of words in each sentence\n",
435
+ "def count_words(sentence):\n",
436
+ " return len(sentence.split())\n",
437
+ " "
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "code",
442
+ "execution_count": 14,
443
+ "metadata": {},
444
+ "outputs": [
445
+ {
446
+ "data": {
447
+ "text/plain": [
448
+ "692"
449
+ ]
450
+ },
451
+ "execution_count": 14,
452
+ "metadata": {},
453
+ "output_type": "execute_result"
454
+ }
455
+ ],
456
+ "source": [
457
+ "sum([(count_words(i)) for i in len_test])"
458
+ ]
459
+ },
460
+ {
461
+ "cell_type": "code",
462
+ "execution_count": null,
463
+ "metadata": {},
464
+ "outputs": [],
465
+ "source": []
466
+ }
467
+ ],
468
+ "metadata": {
469
+ "kernelspec": {
470
+ "display_name": "chatbot",
471
+ "language": "python",
472
+ "name": "python3"
473
+ },
474
+ "language_info": {
475
+ "codemirror_mode": {
476
+ "name": "ipython",
477
+ "version": 3
478
+ },
479
+ "file_extension": ".py",
480
+ "mimetype": "text/x-python",
481
+ "name": "python",
482
+ "nbconvert_exporter": "python",
483
+ "pygments_lexer": "ipython3",
484
+ "version": "3.8.15"
485
+ },
486
+ "orig_nbformat": 4,
487
+ "vscode": {
488
+ "interpreter": {
489
+ "hash": "9798c78c52b861f9442ea63a21901b586ae2f2169fa92d94b4091cc5bab62e04"
490
+ }
491
+ }
492
+ },
493
+ "nbformat": 4,
494
+ "nbformat_minor": 2
495
+ }
notebooks/mental_health_kaggle_noteboook.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/youtube_transcripts.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
openai_api_key.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 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
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ openai==0.26.4
2
+ python-dotenv==0.21.1
3
+ streamlit==1.17.0
4
+ streamlit_chat==0.0.2.1
5
+ transformers==4.25.1
6
+ torch==1.13.1
requirements.txt~ ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Flask==2.2.2
2
+ openai==0.26.4
3
+ python-dotenv==0.21.1
4
+ streamlit==1.17.0
5
+ streamlit_chat==0.0.2.1
6
+ transformers==4.25.1
7
+ torch==1.13.1
streamlit_app.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ openai.api_key_path = './openai_api_key.txt'
3
+ import streamlit as st
4
+ from streamlit_chat import message
5
+ from transformers import pipeline
6
+ summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")
7
+ sentiment_task = pipeline("sentiment-analysis", model='cardiffnlp/twitter-roberta-base-sentiment-latest', tokenizer='cardiffnlp/twitter-roberta-base-sentiment-latest')
8
+
9
+ from math import log
10
+
11
+ completion = openai.Completion()
12
+
13
+
14
+ start_prompt = '[Instruction] Act as a friendly, compasionate, insightful, and empathetic AI therapist named Joy. Joy listens and offers advices. End the conversation when the patient wishes to.'
15
+ start_message = 'I am Joy, your AI therapist. How are you feeling today?'
16
+ start_sequence = "\nJoy:"
17
+ restart_sequence = "\n\nPatient:"
18
+
19
def ask(question: str, chat_log, model='text-davinci-003', temp=0.9) -> (str, str):
    """Send the conversation plus a new patient message to the completion API.

    Args:
        question: The patient's latest message.
        chat_log: Conversation so far, either a single string or a list of
            string fragments (the app stores it as a list in session state).
        model: Name of the completion model to query.
        temp: Sampling temperature passed to the API.

    Returns:
        A ``(answer, log)`` tuple: the stripped reply text, and the fragment
        for this turn that the caller appends to the chat log.
    """
    # The caller keeps the log as a list of fragments. Interpolating the list
    # directly would put its repr (brackets, quotes, escaped newlines) into
    # the prompt, so flatten it to the plain transcript first.
    history = chat_log if isinstance(chat_log, str) else ''.join(chat_log)
    prompt = f'{history}{restart_sequence} {question}{start_sequence}'

    response = completion.create(
        prompt=prompt,
        model=model,
        stop=["Patient:", 'Joy:'],
        temperature=temp,  # the higher the more creative
        frequency_penalty=0.9,  # prevents word repetition, larger -> higher penalty
        presence_penalty=1,  # prevents topic repetition, larger -> higher penalty
        top_p=1,
        best_of=1,
        max_tokens=170,
    )

    answer = response.choices[0].text.strip()
    # Named turn_log (not log) so the module-level `from math import log`
    # is not shadowed inside this function.
    turn_log = f'{restart_sequence}{question}{start_sequence}{answer}'
    return str(answer), str(turn_log)
38
+
39
def clean_chat_log(chat_log):
    """Flatten a chat log into one line of text without the instruction header.

    Args:
        chat_log: Either a list of chat-log fragments or a single string.

    Returns:
        The text from the first newline onward (which drops the leading
        instruction line), with every newline replaced by a space. Text that
        contains no newline is returned whole.
    """
    # A str is iterable too; joining it would insert a space between every
    # character, so only join real collections of fragments.
    text = chat_log if isinstance(chat_log, str) else ' '.join(chat_log)
    first_newline = text.find('\n')
    # find() returns -1 when there is no newline; slicing from -1 would keep
    # only the last character instead of the whole text.
    if first_newline != -1:
        text = text[first_newline:]
    return text.replace('\n', ' ')
47
+
48
def summarize(chat_log):
    """Return the summary text the summarizer produces for the cleaned chat log."""
    cleaned = clean_chat_log(chat_log)
    results = summarizer(cleaned, max_length=150, do_sample=False)
    return results[0]['summary_text']
52
+
53
def analyze_sentiment(chat_log):
    """Run the sentiment pipeline on the cleaned chat log and return its raw output.

    TODO: split chat_log into smaller chunks, analyze each chunk, and return
    the average sentiment.
    """
    return sentiment_task(clean_chat_log(chat_log))
64
+
65
+
66
+
67
+
68
+
69
def main():
    """Render the chat page and process one Streamlit run of the conversation.

    Draws the title and the temperature/model controls, initializes the
    session-state lists on first use, offers a "clear and summarize" button
    once more than two bot messages exist, sends any text in the input box
    to ``ask``, and finally renders the message history newest-first.
    """
    st.title("Chat with Joy - the AI therapist!")
    col1, col2 = st.columns(2)
    temp = col1.slider("Bot-Creativeness", 0.0, 1.0, 0.9, 0.1)
    model = col2.selectbox("Model", ["text-davinci-003", "text-curie-001", "curie:ft-personal-2023-02-03-17-06-53"])

    if 'generated' not in st.session_state:
        st.session_state['generated'] = [start_message]

    if 'past' not in st.session_state:
        st.session_state['past'] = []

    if 'summary' not in st.session_state:
        st.session_state['summary'] = []

    if 'chat_log' not in st.session_state:
        st.session_state['chat_log'] = [start_prompt+start_sequence+start_message]

    if len(st.session_state['generated']) > 2:
        if st.button("Clear and summerize", key='clear'):
            chat_log = clean_chat_log(st.session_state['chat_log'])
            summary = summarizer(chat_log, max_length=100, min_length=30, do_sample=False)
            st.write(summary)
            # Score only what the user typed. The messages are joined here
            # directly: passing the already-joined string through
            # clean_chat_log would re-join it character by character and,
            # with no newline present, cut it down to its last character.
            user_text = ' '.join(st.session_state['past']).replace('\n', ' ')
            st.write(sentiment_task(user_text))
            # Reset the conversation to its initial state.
            st.session_state['generated'] = [start_message]
            st.session_state['past'] = []
            st.session_state['chat_log'] = [start_prompt+start_sequence+start_message]
            st.session_state['summary'] = []

    user_input = st.text_input("You:", key='input')

    if user_input:
        output, chat_log = ask(user_input, st.session_state['chat_log'], model=model, temp=temp)
        st.session_state['chat_log'].append(chat_log)
        st.session_state['past'].append(user_input)
        st.session_state['generated'].append(output)
        # Debug output on the server console.
        print(model)
        print(temp)
        print(st.session_state['chat_log'])

    if st.session_state['generated']:
        # 'generated' starts with the greeting, so it holds one more entry
        # than 'past'; the index guard skips the user bubble for the entry
        # that has no matching user message.
        for i in range(len(st.session_state['generated'])-1, -1, -1):
            if i < len(st.session_state['past']):
                message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')
            message(st.session_state["generated"][i], key=str(i))
117
+
118
+
119
+
120
+ if __name__ == "__main__":
121
+ main()
122
+ # save the user's input and the model's output to the database and analyze the user's input and the model's output
123
+
124
+ # if len(st.seesion_state['generated']) :
125
+ # save the user's input and the model's output to the database
126
+ # analyze the user's input and the model's output
127
+
128
+
129
+
130
+
streamlit_app_minimum.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ openai.api_key_path = './openai_api_key.txt'
3
+ import streamlit as st
4
+ from streamlit_chat import message
5
+
6
+
7
+ completion = openai.Completion()
8
+
9
+
10
+ start_prompt = '[Instruction] Act as a friendly, compasionate, insightful, and empathetic AI therapist named Joy. Joy listens, asks for details and offers detailed advices once a while. End the conversation when you wishes to.'
11
+ start_message = 'I am Joy, your AI therapist. How are you feeling today?'
12
+
13
+ start_sequence = "\nJoy:"
14
+ restart_sequence = "\n\nYou:"
15
+
16
+ # to do:
17
+ # let the user choose between models (curie, davinci, curie-finetuned, davinci-finetuned)
18
+ # let the user choose between different temperatures, frequency_penalty, presence_penalty
19
+
20
+ # save the user's input and the model's output to the database
21
+ # analyze the user's input and the model's output
22
+ # sentiment/mood analysis / topic analysis of the user's input
23
+
24
+ # embed the user's input and look for therapy catalogue that is similar to the user's input
25
+ # push the therapy catalogue to the user
26
+
27
+
28
def ask(question: str, chat_log) -> (str, str):
    """Send the conversation plus a new user message to the completion API.

    The model name and temperature are read from the module-level ``model``
    and ``temp`` values, which the script sets from its widgets.

    Args:
        question: The user's latest message.
        chat_log: Conversation so far, either a single string or a list of
            string fragments (the script stores it as a list in session state).

    Returns:
        A ``(answer, log)`` tuple: the stripped reply text, and the fragment
        for this turn that the caller appends to the chat log.
    """
    # The caller keeps the log as a list of fragments. Interpolating the list
    # directly would put its repr (brackets, quotes, escaped newlines) into
    # the prompt, so flatten it to the plain transcript first.
    history = chat_log if isinstance(chat_log, str) else ''.join(chat_log)
    prompt = f'{history}{restart_sequence} {question}{start_sequence}'

    response = completion.create(
        prompt=prompt,
        model=model,
        stop=["You:", 'Joy:'],
        temperature=temp,  # the higher the more creative
        frequency_penalty=0.3,  # prevents word repetition, larger -> higher penalty
        presence_penalty=0.6,  # prevents topic repetition, larger -> higher penalty
        top_p=1,
        best_of=1,
        max_tokens=170,
    )

    answer = response.choices[0].text.strip()
    log = f'{restart_sequence}{question}{start_sequence}{answer}'
    return str(answer), str(log)
47
+
48
+
49
+ # button for starting a new conversation
50
+
51
# Page header and the two controls that ask() reads as module globals.
st.title("Chat with Joy - the AI therapist!")
temp = st.slider("Creativity", 0.0, 1.0, 0.7, 0.1)
model = st.selectbox("Model", ["text-davinci-003", "text-curie-001", "curie:ft-personal-2023-02-03-17-06-53"])

# Create each conversation list the first time it is needed.
if 'generated' not in st.session_state:
    st.session_state['generated'] = [start_message]
if 'past' not in st.session_state:
    st.session_state['past'] = []
if 'chat_log' not in st.session_state:
    st.session_state['chat_log'] = [start_prompt+start_sequence+start_message]

user_input = st.text_input("You:", key='input')

# A non-empty input box triggers one request and records the turn.
if user_input:
    output, chat_log = ask(user_input, st.session_state['chat_log'])
    st.session_state['chat_log'].append(chat_log)
    st.session_state['past'].append(user_input)
    st.session_state['generated'].append(output)
    print(st.session_state['chat_log'])

# Show the history newest-first; 'generated' has one more entry than 'past'
# because it starts with the greeting, hence the index guard.
if st.session_state['generated']:
    for i in reversed(range(len(st.session_state['generated']))):
        if i < len(st.session_state['past']):
            message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')
        message(st.session_state["generated"][i], key=str(i))
79
+
80
+ # save the user's input and the model's output to the database and analyze the user's input and the model's output
81
+
82
+
83
+
test.ipynb ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 9,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "text/plain": [
11
+ "528"
12
+ ]
13
+ },
14
+ "execution_count": 9,
15
+ "metadata": {},
16
+ "output_type": "execute_result"
17
+ }
18
+ ],
19
+ "source": [
20
+ "test = ''' Cancer of the womb (uterus) is a common cancer that affects the female reproductive system. It's also called uterine cancer and endometrial cancer.\n",
21
+ "\n",
22
+ "Abnormal vaginal bleeding is the most common symptom of womb cancer.\n",
23
+ "\n",
24
+ "If you have been through the menopause, any vaginal bleeding is considered abnormal. If you have not yet been through the menopause, unusual bleeding may include bleeding between your periods.\n",
25
+ "\n",
26
+ "You should speak to your GP as soon as possible if you experience any unusual vaginal bleeding. While it's unlikely to be caused by womb cancer, it's best to be sure.\n",
27
+ "\n",
28
+ "Your GP will examine you and ask about your symptoms. They will refer you to a specialist for further tests if they suspect you may have a serious problem, or if they are unsure about a diagnosis.\n",
29
+ "\n",
30
+ "Cancer of the womb (uterus) is a common cancer that affects the female reproductive system. It's also called uterine cancer and endometrial cancer.\n",
31
+ "\n",
32
+ "Abnormal vaginal bleeding is the most common symptom of womb cancer.\n",
33
+ "\n",
34
+ "If you have been through the menopause, any vaginal bleeding is considered abnormal. If you have not yet been through the menopause, unusual bleeding may include bleeding between your periods.\n",
35
+ "\n",
36
+ "You should speak to your GP as soon as possible if you experience any unusual vaginal bleeding. While it's unlikely to be caused by womb cancer, it's best to be sure.\n",
37
+ "\n",
38
+ "Your GP will examine you and ask about your symptoms. They will refer you to a specialist for further tests if they suspect you may have a serious problem, or if they are unsure about a diagnosis.\n",
39
+ "\n",
40
+ "Cancer of the womb (uterus) is a common cancer that affects the female reproductive system. It's also called uterine cancer and endometrial cancer.\n",
41
+ "\n",
42
+ "Abnormal vaginal bleeding is the most common symptom of womb cancer.\n",
43
+ "\n",
44
+ "If you have been through the menopause, any vaginal bleeding is considered abnormal. If you have not yet been through the menopause, unusual bleeding may include bleeding between your periods.\n",
45
+ "\n",
46
+ "You should speak to your GP as soon as possible if you experience any unusual vaginal bleeding. While it's unlikely to be caused by womb cancer, it's best to be sure.\n",
47
+ "\n",
48
+ "Your GP will examine you and ask about your symptoms. They will refer you to a specialist for further tests if they suspect you may have a serious problem, or if they are unsure about a diagnosis.\n",
49
+ "\n",
50
+ "Cancer of the womb (uterus) is a common cancer that affects the female reproductive system. It's also called uterine cancer and endometrial cancer.\n",
51
+ "\n",
52
+ "Abnormal vaginal bleeding is the most common symptom of womb cancer.\n",
53
+ "\n",
54
+ "If you have been through the menopause, any vaginal bleeding is considered abnormal. If you have not yet been through the menopause, unusual bleeding may include bleeding between your periods.\n",
55
+ "\n",
56
+ "You should speak to your GP as soon as possible if you experience any unusual vaginal bleeding. While it's unlikely to be caused by womb cancer, it's best to be sure.\n",
57
+ "\n",
58
+ "Your GP will examine you and ask about your symptoms. They will refer you to a specialist for further tests if they suspect you may have a serious problem, or if they are unsure about a diagnosis.\n",
59
+ "\n",
60
+ "\n",
61
+ "\n",
62
+ " '''\n",
63
+ "\n",
64
+ "len(test)\n",
65
+ "# count the number of words in test string\n",
66
+ "\n",
67
+ "test2 = test.split()\n",
68
+ "len(test2)"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 73,
74
+ "metadata": {},
75
+ "outputs": [
76
+ {
77
+ "name": "stderr",
78
+ "output_type": "stream",
79
+ "text": [
80
+ "Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n",
81
+ "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
82
+ "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
83
+ ]
84
+ }
85
+ ],
86
+ "source": [
87
+ "from transformers import AutoModelForSequenceClassification\n",
88
+ "from transformers import TFAutoModelForSequenceClassification\n",
89
+ "from transformers import AutoTokenizer, AutoConfig\n",
90
+ "MODEL = \"cardiffnlp/twitter-roberta-base-sentiment-latest\"\n",
91
+ "\n",
92
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
93
+ "config = AutoConfig.from_pretrained(MODEL)\n",
94
+ "model = AutoModelForSequenceClassification.from_pretrained(MODEL)\n",
95
+ "\n",
96
+ "\n",
97
+ "\n"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": 99,
103
+ "metadata": {},
104
+ "outputs": [
105
+ {
106
+ "name": "stdout",
107
+ "output_type": "stream",
108
+ "text": [
109
+ "2865\n"
110
+ ]
111
+ }
112
+ ],
113
+ "source": [
114
+ "test3 = ['[Instruction] Act as a friendly, compasionate, insightful, and empathetic AI therapist named Joy. Joy listens and offers advices. End the conversation when the patient wishes to.\\nJoy:I am Joy, your AI therapist. How are you feeling today?', \"\\n\\nPatient:im doing okay\\nJoy:That's good to hear. Is there anything in particular that you would like to talk about today?\", '\\n\\nPatient:no not really\\nJoy:No problem. Is there anything on your mind that you would like to share? I am here to listen and offer support.', \"\\n\\nPatient:maybe actually. I am afraid my spouse cheats on me\\nJoy:I'm sorry to hear that. What has been making you feel this way?\", '\\n\\nPatient:she is writing text messages constantly with people i dont know\\nJoy:That can certainly be concerning. Have you tried talking to your spouse about this?', '\\n\\nPatient:yea then she gets defensive and angry\\nJoy:I understand that this must be difficult for you. Are there any specific strategies that have helped when conversations with your spouse become heated?', \"\\n\\nPatient:i dont know\\nJoy:It's okay if you don't know right away. What do you feel comfortable doing in these situations? Are there any strategies that have helped in the past when things become tense between you and your spouse?\", '\\n\\nPatient:i just stopped talking to her or change the subject\\nJoy:It sounds like you are doing your best to find a way to manage this. Is there anything that has helped make the situation more comfortable for you?', '\\n\\nPatient:trying not thinking about it\\nJoy:It sounds like you are doing your best to handle this difficult situation. Do you think it would help to talk to a professional or find additional support?', \"\\n\\nPatient:no i havent\\nJoy:That's understandable. Is there someone who you feel comfortable talking to about this? How does having a conversation with them make you feel?\", \"\\n\\nPatient:i havent talked with anyone about this\\nJoy:That's understandable. 
Having someone to talk to can be beneficial in difficult times like this. Is there anyone who you trust that you feel comfortable speaking with about your concerns?\", '\\n\\nPatient:there is no one, except maybe my mom\\nJoy:Talking to your mom can be a great start. Is there anything in particular that you want to share with her about what is going on? How does the idea of having this conversation make you feel?', '\\n\\nPatient:yea like I said, about this topic that im not very comfortable with, do you still remember which?\\nJoy:Yes, you mentioned that your spouse may be cheating on you. How would you like to approach this conversation with your mom?', \"\\n\\nPatient:tell her about my suspicision i guess\\nJoy:It can be difficult to bring up a topic like this, but it's important that you have an open conversation with her. Would you like any advice on how to start the conversation?\", \"\\n\\nPatient:yes please\\nJoy:It can help to start by letting your mom know why you decided to talk to her about this. Sharing how you are feeling and asking for her advice might be a good way to open the conversation. I'm here if you need anything else before or after speaking with your mom.\"]\n",
115
+ "len(test3)\n",
116
+ "def clean_chat_log(chat_log):\n",
117
+ " chat_log = ' '.join(chat_log)\n",
118
+ " # find the first /n\n",
119
+ " first_newline = chat_log.find('\\n')\n",
120
+ " chat_log = chat_log[first_newline:]\n",
121
+ " # remove all \\n\n",
122
+ " chat_log = chat_log.replace('\\n', ' ')\n",
123
+ " return chat_log\n",
124
+ "\n",
125
+ "cleaned = clean_chat_log(test3)\n",
126
+ "print(len(cleaned))"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 81,
132
+ "metadata": {},
133
+ "outputs": [
134
+ {
135
+ "data": {
136
+ "text/plain": [
137
+ "147"
138
+ ]
139
+ },
140
+ "execution_count": 81,
141
+ "metadata": {},
142
+ "output_type": "execute_result"
143
+ }
144
+ ],
145
+ "source": [
146
+ "len(tokenizer(test3, padding=True, truncation=True, return_tensors=\"pt\")[0])"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": 86,
152
+ "metadata": {},
153
+ "outputs": [
154
+ {
155
+ "name": "stdout",
156
+ "output_type": "stream",
157
+ "text": [
158
+ "0.6931471805599453\n",
159
+ "2.4849066497880004\n",
160
+ "3.091042453358316\n",
161
+ "3.4657359027997265\n",
162
+ "3.7376696182833684\n",
163
+ "3.9512437185814275\n",
164
+ "4.127134385045092\n",
165
+ "4.276666119016055\n",
166
+ "4.406719247264253\n",
167
+ "4.5217885770490405\n"
168
+ ]
169
+ }
170
+ ],
171
+ "source": [
172
+ "# import natural log function\n",
173
+ "from math import log\n",
174
+ "\n",
175
+ "for i in range(2, 100, 10):\n",
176
+ " print(log(i))"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": 91,
182
+ "metadata": {},
183
+ "outputs": [
184
+ {
185
+ "data": {
186
+ "text/plain": [
187
+ "1.0986122886681098"
188
+ ]
189
+ },
190
+ "execution_count": 91,
191
+ "metadata": {},
192
+ "output_type": "execute_result"
193
+ }
194
+ ],
195
+ "source": [
196
+ "log(3)"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "code",
201
+ "execution_count": 96,
202
+ "metadata": {},
203
+ "outputs": [
204
+ {
205
+ "name": "stdout",
206
+ "output_type": "stream",
207
+ "text": [
208
+ "[\"[Instruction] Act as a friendly, compasionate, insightful, and empathetic AI therapist named Joy. Joy listens and offers advices. End the conversation when the patient wishes to.\\nJoy:I am Joy, your AI therapist. How are you feeling today?.\\n\\nPatient:feeling good\\nJoy:That's great to hear. What have you been up to lately that has made you feel good?.\\n\\nPatient:I am slowly but surely getting my masters degree. \\nJoy:It sounds like you are making great progress towards achieving your goal. How does it feel to be closer to reaching that milestone?.\\n\\nPatient:feels good\\nJoy:I'm glad to hear that. What have you been doing to stay motivated and keep moving forward?.\\n\\nPatient:just doing alot of work and keep the focus\\nJoy:That's great. It takes a lot of dedication and hard work to stay on track with your goals. Are there any areas where you feel like you could use some extra help or support?.\\n\\nPatient:yea, im still struggling with some subjects. I wish there were better ressources\\nJoy:It can be difficult to find the resources you need. I'm glad that you are taking initiative and seeking help. Have you considered talking to your professor or classmates? They may have some suggestions on how to get the support you need..\\n\\nPatient:yea its hard to ask the prof. they seem \\nJoy:I understand. It can be intimidating to reach out for help, especially when it comes to academics. You may find it helpful to practice talking through what you want to say with a friend or family member first. Having someone else's perspective can be really helpful in finding the right words to use when approaching your professor..\\n\\nPatient:yea its hard to ask the prof. they seem offended if I ask them for details. Like how could you not know this. Classmates are helpful but only in a short run. There is no one to sit down and discuss about the details. it seems everyone is just out for himself more or less. 
or just get the necessary thing done\\nJoy:It sounds like you feel overwhelmed and discouraged by the lack of support available. It can be difficult to find someone who is willing to take time out of their day to help you understand something more thoroughly. Have you considered online resources such as tutorials or forums? You could also look into tutoring services that may provide more individualized guidance..\\n\\nPatient:yea online tutorials are usually nice. tutoring services are expensive, good ones are hard to find and the stuff I am working on is quite obscure. \\nJoy:I understand that it can be difficult to find the resources you need, especially if they are expensive or hard to come by. Have you talked to any friends or family members who may have been in a similar situation and know of some good resources? They may be able to point you in the right direction..\\n\\nPatient:yea family is quite supportive. But in the end I still have to do the hard work. There isnt much help they can offer in terms of understanding the subject\\nJoy:Yes, it sounds like you're feeling overwhelmed by the amount of effort and understanding that goes into mastering the material. Have you considered taking a break from studying to recharge? It may help to take a step back and come back to the material with a fresh perspective..\\n\\nPatient:you are right. but it seems i cant afford any long breaks. then i will just fall behind\\nJoy:I understand. It can be difficult to take a break when you are feeling so much pressure to succeed. Instead of taking an extended break, maybe you could try breaking up your study sessions into smaller chunks that are more manageable. That way, you can take regular short breaks throughout the day to help reset and refocus before getting back to studying..\\n\\nPatient:ok\\nJoy:Is there anything else I can help you with today?\"]\n"
209
+ ]
210
+ }
211
+ ],
212
+ "source": [
213
+ "def split_input_string(input_string: list, tokenizer, max_length):\n",
214
+ " # Initialize the list to store the split strings\n",
215
+ " split_strings = []\n",
216
+ "\n",
217
+ " # Split the input string into sentences\n",
218
+ " sentences = input_string\n",
219
+ " #sentences = input_string.split(\".\")\n",
220
+ "\n",
221
+ " # Loop through each sentence\n",
222
+ " i = 0\n",
223
+ " while i < len(sentences):\n",
224
+ " # Initialize the current string with the first sentence\n",
225
+ " curr_string = sentences[i]\n",
226
+ "\n",
227
+ " # Keep adding sentences until the current string is longer than the maximum length\n",
228
+ " i += 1\n",
229
+ " while i < len(sentences) and len(tokenizer.encode(curr_string + \".\" + sentences[i], max_length=max_length)) <= max_length:\n",
230
+ " curr_string += \".\" + sentences[i]\n",
231
+ " i += 1\n",
232
+ "\n",
233
+ " # Add the current string to the list of split strings\n",
234
+ " split_strings.append(curr_string)\n",
235
+ "\n",
236
+ " # Return the list of split strings\n",
237
+ " return split_strings\n",
238
+ "\n",
239
+ "# Example usage\n",
240
+ "input_string = \"This is a sample input string. It can be quite long. This function splits the input string into parts, each of which is no longer than the maximum length defined by the tokenizer and each of which ends on a full stop.\"\n",
241
+ "max_length = 512\n",
242
+ "split_strings = split_input_string(test3, tokenizer, max_length)\n",
243
+ "print(split_strings)\n"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": 97,
249
+ "metadata": {},
250
+ "outputs": [
251
+ {
252
+ "data": {
253
+ "text/plain": [
254
+ "1"
255
+ ]
256
+ },
257
+ "execution_count": 97,
258
+ "metadata": {},
259
+ "output_type": "execute_result"
260
+ }
261
+ ],
262
+ "source": [
263
+ "len(split_strings)"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": 33,
269
+ "metadata": {},
270
+ "outputs": [],
271
+ "source": [
272
+ "import re \n",
273
+ "\n",
274
+ "def split(input: str) -> list:\n",
275
+ " tokenized = tokenizer(test, return_tensors='pt')\n",
276
+ " parts = (len(tokenized[0]) // 500) + 1\n",
277
+ " words_each_part = len(input.split()) // parts\n",
278
+ " print(words_each_part)\n",
279
+ " sentences = re.split(r'(?<=[.!?]) +', input)\n",
280
+ " # count the number of words in each sentence\n",
281
+ " words = [len(i.split()) for i in sentences]\n",
282
+ " print(words)\n",
283
+ "\n",
284
+ " return parts, sentences"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": 64,
290
+ "metadata": {},
291
+ "outputs": [
292
+ {
293
+ "data": {
294
+ "text/plain": [
295
+ "{'input_ids': [0, 7568, 9, 5, 31730, 36, 13080, 687, 43, 16, 10, 1537, 1668, 14, 8561, 5, 2182, 20111, 467, 4, 85, 18, 67, 373, 43683, 833, 1668, 8, 253, 11174, 13700, 1668, 4, 50118, 50118, 13112, 21113, 34126, 13162, 16, 5, 144, 1537, 28667, 9, 31730, 1668, 4, 50118, 50118, 1106, 47, 33, 57, 149, 5, 604, 1517, 17498, 6, 143, 34126, 13162, 16, 1687, 27034, 4, 318, 47, 33, 45, 648, 57, 149, 5, 604, 1517, 17498, 6, 5425, 13162, 189, 680, 13162, 227, 110, 5788, 4, 50118, 50118, 1185, 197, 1994, 7, 110, 12571, 25, 1010, 25, 678, 114, 47, 676, 143, 5425, 34126, 13162, 4, 616, 24, 18, 3752, 7, 28, 1726, 30, 31730, 1668, 6, 24, 18, 275, 7, 28, 686, 4, 50118, 50118, 12861, 12571, 40, 10154, 47, 8, 1394, 59, 110, 5298, 4, 252, 40, 9115, 47, 7, 10, 6857, 13, 617, 3457, 114, 51, 1985, 47, 189, 33, 10, 1473, 936, 6, 50, 114, 51, 32, 17118, 59, 10, 9726, 4, 50118, 50118, 347, 20126, 9, 5, 31730, 36, 13080, 687, 43, 16, 10, 1537, 1668, 14, 8561, 5, 2182, 20111, 467, 4, 85, 18, 67, 373, 43683, 833, 1668, 8, 253, 11174, 13700, 1668, 4, 50118, 50118, 13112, 21113, 34126, 13162, 16, 5, 144, 1537, 28667, 9, 31730, 1668, 4, 50118, 50118, 1106, 47, 33, 57, 149, 5, 604, 1517, 17498, 6, 143, 34126, 13162, 16, 1687, 27034, 4, 318, 47, 33, 45, 648, 57, 149, 5, 604, 1517, 17498, 6, 5425, 13162, 189, 680, 13162, 227, 110, 5788, 4, 50118, 50118, 1185, 197, 1994, 7, 110, 12571, 25, 1010, 25, 678, 114, 47, 676, 143, 5425, 34126, 13162, 4, 616, 24, 18, 3752, 7, 28, 1726, 30, 31730, 1668, 6, 24, 18, 275, 7, 28, 686, 4, 50118, 50118, 12861, 12571, 40, 10154, 47, 8, 1394, 59, 110, 5298, 4, 252, 40, 9115, 47, 7, 10, 6857, 13, 617, 3457, 114, 51, 1985, 47, 189, 33, 10, 1473, 936, 6, 50, 114, 51, 32, 17118, 59, 10, 9726, 4, 50118, 50118, 347, 20126, 9, 5, 31730, 36, 13080, 687, 43, 16, 10, 1537, 1668, 14, 8561, 5, 2182, 20111, 467, 4, 85, 18, 67, 373, 43683, 833, 1668, 8, 253, 11174, 13700, 1668, 4, 50118, 50118, 13112, 21113, 34126, 13162, 16, 5, 144, 1537, 28667, 9, 31730, 1668, 4, 50118, 50118, 
1106, 47, 33, 57, 149, 5, 604, 1517, 17498, 6, 143, 34126, 13162, 16, 1687, 27034, 4, 318, 47, 33, 45, 648, 57, 149, 5, 604, 1517, 17498, 6, 5425, 13162, 189, 680, 13162, 227, 110, 5788, 4, 50118, 50118, 1185, 197, 1994, 7, 110, 12571, 25, 1010, 25, 678, 114, 47, 676, 143, 5425, 34126, 13162, 4, 616, 24, 18, 3752, 7, 28, 1726, 30, 31730, 1668, 6, 24, 18, 275, 7, 28, 686, 4, 50118, 50118, 12861, 12571, 40, 10154, 47, 8, 1394, 59, 110, 5298, 4, 252, 40, 9115, 47, 7, 10, 6857, 13, 617, 3457, 114, 51, 1985, 47, 189, 33, 10, 1473, 936, 6, 50, 114, 51, 32, 17118, 59, 10, 9726, 4, 50118, 50118, 347, 20126, 9, 5, 31730, 36, 13080, 687, 43, 16, 10, 1537, 1668, 14, 8561, 5, 2182, 20111, 467, 4, 85, 18, 67, 373, 43683, 833, 1668, 8, 253, 11174, 13700, 1668, 4, 50118, 50118, 13112, 21113, 34126, 13162, 16, 5, 144, 1537, 28667, 9, 31730, 1668, 4, 50118, 50118, 1106, 47, 33, 57, 149, 5, 604, 1517, 17498, 6, 143, 34126, 13162, 16, 1687, 27034, 4, 318, 47, 33, 45, 648, 57, 149, 5, 604, 1517, 17498, 6, 5425, 13162, 189, 680, 13162, 227, 110, 5788, 4, 50118, 50118, 1185, 197, 1994, 7, 110, 12571, 25, 1010, 25, 678, 114, 47, 676, 143, 5425, 34126, 13162, 4, 616, 24, 18, 3752, 7, 28, 1726, 30, 31730, 1668, 6, 24, 18, 275, 7, 28, 686, 4, 50118, 50118, 12861, 12571, 40, 10154, 47, 8, 1394, 59, 110, 5298, 4, 252, 40, 9115, 47, 7, 10, 6857, 13, 617, 3457, 114, 51, 1985, 47, 189, 33, 10, 1473, 936, 6, 50, 114, 51, 32, 17118, 59, 10, 9726, 4, 50140, 50140, 1437, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
296
+ ]
297
+ },
298
+ "execution_count": 64,
299
+ "metadata": {},
300
+ "output_type": "execute_result"
301
+ }
302
+ ],
303
+ "source": [
304
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
305
+ "tokens = tokenizer(test)\n",
306
+ "tokens"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": 55,
312
+ "metadata": {},
313
+ "outputs": [
314
+ {
315
+ "name": "stdout",
316
+ "output_type": "stream",
317
+ "text": [
318
+ "['ĠCancer', 'Ġof', 'Ġthe', 'Ġwomb', 'Ġ(', 'uter', 'us', ')', 'Ġis', 'Ġa', 'Ġcommon', 'Ġcancer', 'Ġthat', 'Ġaffects', 'Ġthe', 'Ġfemale', 'Ġreproductive', 'Ġsystem', '.', 'ĠIt', \"'s\", 'Ġalso', 'Ġcalled', 'Ġuter', 'ine', 'Ġcancer', 'Ġand', 'Ġend', 'omet', 'rial', 'Ġcancer', '.', 'Ċ', 'Ċ', 'Ab', 'normal', 'Ġvaginal', 'Ġbleeding', 'Ġis', 'Ġthe', 'Ġmost', 'Ġcommon', 'Ġsymptom', 'Ġof', 'Ġwomb', 'Ġcancer', '.', 'Ċ', 'Ċ', 'If', 'Ġyou', 'Ġhave', 'Ġbeen', 'Ġthrough', 'Ġthe', 'Ġmen', 'op', 'ause', ',', 'Ġany', 'Ġvaginal', 'Ġbleeding', 'Ġis', 'Ġconsidered', 'Ġabnormal', '.', 'ĠIf', 'Ġyou', 'Ġhave', 'Ġnot', 'Ġyet', 'Ġbeen', 'Ġthrough', 'Ġthe', 'Ġmen', 'op', 'ause', ',', 'Ġunusual', 'Ġbleeding', 'Ġmay', 'Ġinclude', 'Ġbleeding', 'Ġbetween', 'Ġyour', 'Ġperiods', '.', 'Ċ', 'Ċ', 'You', 'Ġshould', 'Ġspeak', 'Ġto', 'Ġyour', 'ĠGP', 'Ġas', 'Ġsoon', 'Ġas', 'Ġpossible', 'Ġif', 'Ġyou', 'Ġexperience', 'Ġany', 'Ġunusual', 'Ġvaginal', 'Ġbleeding', '.', 'ĠWhile', 'Ġit', \"'s\", 'Ġunlikely', 'Ġto', 'Ġbe', 'Ġcaused', 'Ġby', 'Ġwomb', 'Ġcancer', ',', 'Ġit', \"'s\", 'Ġbest', 'Ġto', 'Ġbe', 'Ġsure', '.', 'Ċ', 'Ċ', 'Your', 'ĠGP', 'Ġwill', 'Ġexamine', 'Ġyou', 'Ġand', 'Ġask', 'Ġabout', 'Ġyour', 'Ġsymptoms', '.', 'ĠThey', 'Ġwill', 'Ġrefer', 'Ġyou', 'Ġto', 'Ġa', 'Ġspecialist', 'Ġfor', 'Ġfurther', 'Ġtests', 'Ġif', 'Ġthey', 'Ġsuspect', 'Ġyou', 'Ġmay', 'Ġhave', 'Ġa', 'Ġserious', 'Ġproblem', ',', 'Ġor', 'Ġif', 'Ġthey', 'Ġare', 'Ġunsure', 'Ġabout', 'Ġa', 'Ġdiagnosis', '.', 'Ċ', 'Ċ', 'C', 'ancer', 'Ġof', 'Ġthe', 'Ġwomb', 'Ġ(', 'uter', 'us', ')', 'Ġis', 'Ġa', 'Ġcommon', 'Ġcancer', 'Ġthat', 'Ġaffects', 'Ġthe', 'Ġfemale', 'Ġreproductive', 'Ġsystem', '.', 'ĠIt', \"'s\", 'Ġalso', 'Ġcalled', 'Ġuter', 'ine', 'Ġcancer', 'Ġand', 'Ġend', 'omet', 'rial', 'Ġcancer', '.', 'Ċ', 'Ċ', 'Ab', 'normal', 'Ġvaginal', 'Ġbleeding', 'Ġis', 'Ġthe', 'Ġmost', 'Ġcommon', 'Ġsymptom', 'Ġof', 'Ġwomb', 'Ġcancer', '.', 'Ċ', 'Ċ', 'If', 'Ġyou', 'Ġhave', 'Ġbeen', 'Ġthrough', 'Ġthe', 'Ġmen', 'op', 'ause', ',', 'Ġany', 'Ġvaginal', 'Ġbleeding', 
'Ġis', 'Ġconsidered', 'Ġabnormal', '.', 'ĠIf', 'Ġyou', 'Ġhave', 'Ġnot', 'Ġyet', 'Ġbeen', 'Ġthrough', 'Ġthe', 'Ġmen', 'op', 'ause', ',', 'Ġunusual', 'Ġbleeding', 'Ġmay', 'Ġinclude', 'Ġbleeding', 'Ġbetween', 'Ġyour', 'Ġperiods', '.', 'Ċ', 'Ċ', 'You', 'Ġshould', 'Ġspeak', 'Ġto', 'Ġyour', 'ĠGP', 'Ġas', 'Ġsoon', 'Ġas', 'Ġpossible', 'Ġif', 'Ġyou', 'Ġexperience', 'Ġany', 'Ġunusual', 'Ġvaginal', 'Ġbleeding', '.', 'ĠWhile', 'Ġit', \"'s\", 'Ġunlikely', 'Ġto', 'Ġbe', 'Ġcaused', 'Ġby', 'Ġwomb', 'Ġcancer', ',', 'Ġit', \"'s\", 'Ġbest', 'Ġto', 'Ġbe', 'Ġsure', '.', 'Ċ', 'Ċ', 'Your', 'ĠGP', 'Ġwill', 'Ġexamine', 'Ġyou', 'Ġand', 'Ġask', 'Ġabout', 'Ġyour', 'Ġsymptoms', '.', 'ĠThey', 'Ġwill', 'Ġrefer', 'Ġyou', 'Ġto', 'Ġa', 'Ġspecialist', 'Ġfor', 'Ġfurther', 'Ġtests', 'Ġif', 'Ġthey', 'Ġsuspect', 'Ġyou', 'Ġmay', 'Ġhave', 'Ġa', 'Ġserious', 'Ġproblem', ',', 'Ġor', 'Ġif', 'Ġthey', 'Ġare', 'Ġunsure', 'Ġabout', 'Ġa', 'Ġdiagnosis', '.', 'Ċ', 'Ċ', 'C', 'ancer', 'Ġof', 'Ġthe', 'Ġwomb', 'Ġ(', 'uter', 'us', ')', 'Ġis', 'Ġa', 'Ġcommon', 'Ġcancer', 'Ġthat', 'Ġaffects', 'Ġthe', 'Ġfemale', 'Ġreproductive', 'Ġsystem', '.', 'ĠIt', \"'s\", 'Ġalso', 'Ġcalled', 'Ġuter', 'ine', 'Ġcancer', 'Ġand', 'Ġend', 'omet', 'rial', 'Ġcancer', '.', 'Ċ', 'Ċ', 'Ab', 'normal', 'Ġvaginal', 'Ġbleeding', 'Ġis', 'Ġthe', 'Ġmost', 'Ġcommon', 'Ġsymptom', 'Ġof', 'Ġwomb', 'Ġcancer', '.', 'Ċ', 'Ċ', 'If', 'Ġyou', 'Ġhave', 'Ġbeen', 'Ġthrough', 'Ġthe', 'Ġmen', 'op', 'ause', ',', 'Ġany', 'Ġvaginal', 'Ġbleeding', 'Ġis', 'Ġconsidered', 'Ġabnormal', '.', 'ĠIf', 'Ġyou', 'Ġhave', 'Ġnot', 'Ġyet', 'Ġbeen', 'Ġthrough', 'Ġthe', 'Ġmen', 'op', 'ause', ',', 'Ġunusual', 'Ġbleeding', 'Ġmay', 'Ġinclude', 'Ġbleeding', 'Ġbetween', 'Ġyour', 'Ġperiods', '.', 'Ċ', 'Ċ', 'You', 'Ġshould', 'Ġspeak', 'Ġto', 'Ġyour', 'ĠGP', 'Ġas', 'Ġsoon', 'Ġas', 'Ġpossible', 'Ġif', 'Ġyou', 'Ġexperience', 'Ġany', 'Ġunusual', 'Ġvaginal', 'Ġbleeding', '.', 'ĠWhile', 'Ġit', \"'s\", 'Ġunlikely', 'Ġto', 'Ġbe', 'Ġcaused', 'Ġby', 'Ġwomb', 'Ġcancer', ',', 'Ġit', \"'s\", 'Ġbest', 'Ġto', 
'Ġbe', 'Ġsure', '.', 'Ċ', 'Ċ', 'Your', 'ĠGP', 'Ġwill', 'Ġexamine', 'Ġyou', 'Ġand', 'Ġask', 'Ġabout', 'Ġyour', 'Ġsymptoms', '.', 'ĠThey', 'Ġwill', 'Ġrefer', 'Ġyou', 'Ġto', 'Ġa', 'Ġspecialist', 'Ġfor', 'Ġfurther', 'Ġtests', 'Ġif', 'Ġthey', 'Ġsuspect', 'Ġyou', 'Ġmay', 'Ġhave', 'Ġa', 'Ġserious', 'Ġproblem', ',', 'Ġor', 'Ġif', 'Ġthey', 'Ġare', 'Ġunsure', 'Ġabout', 'Ġa', 'Ġdiagnosis', '.', 'Ċ', 'Ċ', 'C', 'ancer', 'Ġof', 'Ġthe', 'Ġwomb', 'Ġ(', 'uter', 'us', ')', 'Ġis', 'Ġa', 'Ġcommon', 'Ġcancer', 'Ġthat', 'Ġaffects', 'Ġthe', 'Ġfemale', 'Ġreproductive', 'Ġsystem', '.', 'ĠIt', \"'s\", 'Ġalso', 'Ġcalled', 'Ġuter', 'ine', 'Ġcancer', 'Ġand', 'Ġend', 'omet', 'rial', 'Ġcancer', '.', 'Ċ', 'Ċ', 'Ab', 'normal', 'Ġvaginal', 'Ġbleeding', 'Ġis', 'Ġthe', 'Ġmost', 'Ġcommon', 'Ġsymptom', 'Ġof', 'Ġwomb', 'Ġcancer', '.', 'Ċ', 'Ċ', 'If', 'Ġyou', 'Ġhave', 'Ġbeen', 'Ġthrough', 'Ġthe', 'Ġmen', 'op', 'ause', ',', 'Ġany', 'Ġvaginal', 'Ġbleeding', 'Ġis', 'Ġconsidered', 'Ġabnormal', '.', 'ĠIf', 'Ġyou', 'Ġhave', 'Ġnot', 'Ġyet', 'Ġbeen', 'Ġthrough', 'Ġthe', 'Ġmen', 'op', 'ause', ',', 'Ġunusual', 'Ġbleeding', 'Ġmay', 'Ġinclude', 'Ġbleeding', 'Ġbetween', 'Ġyour', 'Ġperiods', '.', 'Ċ', 'Ċ', 'You', 'Ġshould', 'Ġspeak', 'Ġto', 'Ġyour', 'ĠGP', 'Ġas', 'Ġsoon', 'Ġas', 'Ġpossible', 'Ġif', 'Ġyou', 'Ġexperience', 'Ġany', 'Ġunusual', 'Ġvaginal', 'Ġbleeding', '.', 'ĠWhile', 'Ġit', \"'s\", 'Ġunlikely', 'Ġto', 'Ġbe', 'Ġcaused', 'Ġby', 'Ġwomb', 'Ġcancer', ',', 'Ġit', \"'s\", 'Ġbest', 'Ġto', 'Ġbe', 'Ġsure', '.', 'Ċ', 'Ċ', 'Your', 'ĠGP', 'Ġwill', 'Ġexamine', 'Ġyou', 'Ġand', 'Ġask', 'Ġabout', 'Ġyour', 'Ġsymptoms', '.', 'ĠThey', 'Ġwill', 'Ġrefer', 'Ġyou', 'Ġto', 'Ġa', 'Ġspecialist', 'Ġfor', 'Ġfurther', 'Ġtests', 'Ġif', 'Ġthey', 'Ġsuspect', 'Ġyou', 'Ġmay', 'Ġhave', 'Ġa', 'Ġserious', 'Ġproblem', ',', 'Ġor', 'Ġif', 'Ġthey', 'Ġare', 'Ġunsure', 'Ġabout', 'Ġa', 'Ġdiagnosis', '.', 'ĊĊ', 'ĊĊ', 'Ġ']\n"
319
+ ]
320
+ },
321
+ {
322
+ "ename": "KeyboardInterrupt",
323
+ "evalue": "",
324
+ "output_type": "error",
325
+ "traceback": [
326
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
327
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
328
+ "Cell \u001b[0;32mIn[55], line 26\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[39m# break\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[39m# end += 1\u001b[39;00m\n\u001b[1;32m 18\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[39m# start = end + 1\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[39m# end = start\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[39mreturn\u001b[39;00m tokenized_parts\n\u001b[0;32m---> 26\u001b[0m tokenize_long_sentence(test, \u001b[39m500\u001b[39;49m)\n",
329
+ "Cell \u001b[0;32mIn[55], line 13\u001b[0m, in \u001b[0;36mtokenize_long_sentence\u001b[0;34m(long_sentence, max_tokens)\u001b[0m\n\u001b[1;32m 10\u001b[0m end \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[1;32m 12\u001b[0m \u001b[39mwhile\u001b[39;00m end \u001b[39m<\u001b[39m \u001b[39mlen\u001b[39m(tokens):\n\u001b[0;32m---> 13\u001b[0m \u001b[39mwhile\u001b[39;00m end \u001b[39m<\u001b[39m \u001b[39mlen\u001b[39m(tokens) \u001b[39mand\u001b[39;00m end \u001b[39m-\u001b[39;49m start \u001b[39m<\u001b[39;49m\u001b[39m=\u001b[39;49m max_tokens:\n\u001b[1;32m 14\u001b[0m \u001b[39mif\u001b[39;00m tokens[end] \u001b[39min\u001b[39;00m [\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39m?\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39m!\u001b[39m\u001b[39m\"\u001b[39m]:\n\u001b[1;32m 15\u001b[0m \u001b[39mprint\u001b[39m(tokens[end])\n",
330
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
331
+ ]
332
+ }
333
+ ],
334
+ "source": [
335
+ "from transformers import AutoTokenizer\n",
336
+ "\n",
337
+ "def tokenize_long_sentence(long_sentence, max_tokens):\n",
338
+ " tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
339
+ " tokens = tokenizer.tokenize(long_sentence)\n",
340
+ " print(tokens)\n",
341
+ " tokenized_parts = []\n",
342
+ "\n",
343
+ " start = 0\n",
344
+ " end = 0\n",
345
+ "\n",
346
+ " while end < len(tokens):\n",
347
+ " while end < len(tokens) and end - start <= max_tokens:\n",
348
+ " if tokens[end] in [\".\", \"?\", \"!\"]:\n",
349
+ " print(tokens[end])\n",
350
+ " # break\n",
351
+ " # end += 1\n",
352
+ "\n",
353
+ " # part = tokens[start:end + 1]\n",
354
+ " # tokenized_parts.append(part)\n",
355
+ " # start = end + 1\n",
356
+ " # end = start\n",
357
+ "\n",
358
+ " return tokenized_parts\n",
359
+ "\n",
360
+ "tokenize_long_sentence(test, 500)"
361
+ ]
362
+ }
363
+ ],
364
+ "metadata": {
365
+ "kernelspec": {
366
+ "display_name": "chatbot",
367
+ "language": "python",
368
+ "name": "python3"
369
+ },
370
+ "language_info": {
371
+ "codemirror_mode": {
372
+ "name": "ipython",
373
+ "version": 3
374
+ },
375
+ "file_extension": ".py",
376
+ "mimetype": "text/x-python",
377
+ "name": "python",
378
+ "nbconvert_exporter": "python",
379
+ "pygments_lexer": "ipython3",
380
+ "version": "3.8.15"
381
+ },
382
+ "orig_nbformat": 4,
383
+ "vscode": {
384
+ "interpreter": {
385
+ "hash": "9798c78c52b861f9442ea63a21901b586ae2f2169fa92d94b4091cc5bab62e04"
386
+ }
387
+ }
388
+ },
389
+ "nbformat": 4,
390
+ "nbformat_minor": 2
391
+ }
twilio.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# NOTE(review): this module is named twilio.py. When it is run from its own
# directory, the `twilio` import below resolves to this file rather than the
# installed Twilio SDK and fails with "'twilio' is not a package". Renaming
# the file (e.g. sms_app.py) avoids the clash.
import os
import secrets

from flask import Flask, request, session
from flask import render_template, redirect, url_for
from twilio.twiml.messaging_response import MessagingResponse

from joy import ask
5
+
6
app = Flask(__name__)

# Flask signs the cookie that backs `session` with SECRET_KEY. The /joy view
# keeps the running chat log in `session`, and Flask raises a RuntimeError on
# session access when no key is configured, so the key has to be set before
# the first request is served.
# Prefer a stable value from the environment (FLASK_SECRET_KEY). The random
# fallback keeps local runs working, but every restart generates a new key and
# therefore invalidates all previously stored chat logs.
app.config['SECRET_KEY'] = os.environ.get('FLASK_SECRET_KEY') or secrets.token_hex(32)
9
+
10
@app.route('/joy', methods=['POST'])
def joy():
    """Reply to a message POSTed to /joy with Joy's answer as TwiML.

    The message text is read from the ``Body`` request value and passed to
    ``ask`` together with the chat log kept in the Flask session. The log
    returned by ``ask`` is stored back in the session, and the answer is
    wrapped in a ``MessagingResponse`` and returned as a string.
    """
    user_text = request.values['Body']
    # None when the session holds no chat log yet.
    previous_log = session.get('chat_log')

    reply, updated_log = ask(user_text, previous_log)
    session['chat_log'] = updated_log

    twiml = MessagingResponse()
    twiml.message(reply)
    return str(twiml)
18
+
19
if __name__ == '__main__':
    # Debug mode turns on Werkzeug's interactive debugger, which allows code
    # execution by anyone who can reach the server. A messaging webhook is
    # presumably exposed to the internet, so debugging is opt-in
    # (FLASK_DEBUG=1) instead of always enabled.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')