Sonnyjim committed · Commit 9dbf344 · Parent(s): none

first commit
Files changed:

- .github/workflows/check_file_size.yml +16 -0
- .github/workflows/sync_to_hf.yml +20 -0
- .gitignore +8 -0
- Dockerfile +30 -0
- LICENSE +201 -0
- README.md +13 -0
- app.py +250 -0
- funcs/__init__.py +0 -0
- funcs/anonymiser.py +251 -0
- funcs/embeddings.py +78 -0
- funcs/helper_functions.py +89 -0
- funcs/prompts.py +106 -0
- funcs/representation_model.py +171 -0
- requirements.txt +11 -0
.github/workflows/check_file_size.yml
ADDED
@@ -0,0 +1,16 @@
name: Check file size
on:               # or directly `on: [push]` to run the action on every push on any branch
  pull_request:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - name: Check large files
        uses: ActionsDesk/lfs-warning@v2.0
        with:
          filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
.github/workflows/sync_to_hf.yml
ADDED
@@ -0,0 +1,20 @@
name: Sync to Hugging Face hub
on:
  push:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
          lfs: true
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: git push https://seanpedrickcase:$HF_TOKEN@huggingface.co/spaces/seanpedrickcase/topic_modelling main
.gitignore
ADDED
@@ -0,0 +1,8 @@
*.pyc
*.ipynb
*.npz
*.csv
*.pkl
.ipynb_checkpoints/*
old_code/*
model/*
Dockerfile
ADDED
@@ -0,0 +1,30 @@
FROM python:3.10

WORKDIR /src

COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user
# Switch to the "user" user
USER user
# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    PYTHONPATH=$HOME/app \
    PYTHONUNBUFFERED=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_THEME=huggingface \
    SYSTEM=spaces

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Copy the current directory contents into the container at $HOME/app, setting the owner to the user
COPY --chown=user . $HOME/app

CMD ["python", "app.py"]
LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md
ADDED
@@ -0,0 +1,13 @@
---
title: Topic modelling
emoji: 🚀
colorFrom: red
colorTo: yellow
sdk: gradio
sdk_version: 3.50.0
app_file: app.py
pinned: false
license: apache-2.0
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,250 @@
import os

#os.environ["TOKENIZERS_PARALLELISM"] = "true"
#os.environ["HF_HOME"] = "/mnt/c/..."
#os.environ["CUDA_PATH"] = "/mnt/c/..."

print("HF_HOME:", os.environ.get("HF_HOME"))

import gradio as gr
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoModel
import funcs.anonymiser as anon

from torch import cuda, backends, version

# Check for torch cuda
print("Is CUDA enabled? ", cuda.is_available())
print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
if cuda.is_available():
    torch_device = "gpu"
    print("Cuda version installed is: ", version.cuda)
    low_resource_mode = "No"
    #os.system("nvidia-smi")
else:
    torch_device = "cpu"
    low_resource_mode = "Yes"

print("Device used is: ", torch_device)

#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

from bertopic import BERTopic
#from sentence_transformers import SentenceTransformer
#from bertopic.backend._hftransformers import HFTransformerBackend

#from cuml.manifold import UMAP

#umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)

today = datetime.now().strftime("%d%m%Y")
today_rev = datetime.now().strftime("%Y%m%d")

from funcs.helper_functions import dummy_function, put_columns_in_df, read_file, get_file_path_end
from funcs.representation_model import representation_model
from funcs.embeddings import make_or_load_embeddings

# Load embeddings
#embedding_model_name = "BAAI/bge-small-en-v1.5"
#embedding_model = SentenceTransformer(embedding_model_name)

# Pinning a Jina revision for security purposes: https://www.baseten.co/blog/pinning-ml-model-revisions-for-compatibility-and-security/
# Save Jina model locally as described here: https://huggingface.co/jinaai/jina-embeddings-v2-base-en/discussions/29
embeddings_name = "jinaai/jina-embeddings-v2-small-en"
local_embeddings_location = "model/jina/"
revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"

try:
    embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision=revision_choice, trust_remote_code=True, local_files_only=True, device_map="auto")
except Exception:
    embedding_model = AutoModel.from_pretrained(embeddings_name, revision=revision_choice, trust_remote_code=True, device_map="auto")


def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt):

    file_list = [string.name for string in in_file]

    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
    data_file_name = data_file_names[0]
    data_file_name_no_ext = get_file_path_end(data_file_name)

    in_colnames_list_first = in_colnames[0]

    if in_label:
        in_label_list_first = in_label[0]
    else:
        in_label_list_first = in_colnames_list_first

    if anonymise_drop == "Yes":
        in_files_anon_col, anonymisation_success = anon.anonymise_script(in_files, in_colnames_list_first, anon_strat="replace")
        in_files[in_colnames_list_first] = in_files_anon_col[in_colnames_list_first]
        in_files.to_csv("anonymised_data.csv")

    docs = list(in_files[in_colnames_list_first].str.lower())
    label_col = in_files[in_label_list_first]

    # Check if embeddings are being loaded in
    ## Load in pre-embedded file if exists
    file_list = [string.name for string in in_file]

    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt)

    # all_lengths = [len(embedding) for embedding in embeddings_out]
    # if len(set(all_lengths)) > 1:
    #     print("Inconsistent lengths found in embeddings_out:", set(all_lengths))
    # else:
    #     print("All lengths are the same.")

    # print("Embeddings type: ", type(embeddings_out))

    # if isinstance(embeddings_out, np.ndarray):
    #     print("my_object is a NumPy ndarray")
    # else:
    #     print("my_object is not a NumPy ndarray")

    # Clustering set to K-means (not used)
    #cluster_model = KMeans(n_clusters=max_topics_slider)

    # CountVectorizer removes stopwords and combines terms up to two words long
    if min_docs_slider < 3:
        min_df_val = min_docs_slider
    else:
        min_df_val = 3

    print(min_df_val)

    vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=min_df_val)

    if not candidate_topics:
        topic_model = BERTopic(embedding_model=embedding_model,
                               #hdbscan_model=cluster_model,
                               vectorizer_model=vectoriser_model,
                               min_topic_size=min_docs_slider,
                               nr_topics=max_topics_slider,
                               representation_model=representation_model,
                               verbose=True)

        topics_text, probs = topic_model.fit_transform(docs, embeddings_out)

    # Do this if you have pre-assigned topics
    else:
        zero_shot_topics = read_file(candidate_topics.name)
        # Topics are taken from the first column of the file
        zero_shot_topics_list = list(zero_shot_topics.iloc[:, 0])
        zero_shot_topics_list_lower = [x.lower() for x in zero_shot_topics_list]

        print(zero_shot_topics_list_lower)

        topic_model = BERTopic(embedding_model=embedding_model,
                               #hdbscan_model=cluster_model,
                               vectorizer_model=vectoriser_model,
                               min_topic_size=min_docs_slider,
                               nr_topics=max_topics_slider,
                               zeroshot_topic_list=zero_shot_topics_list_lower,
                               zeroshot_min_similarity=0.7,
                               representation_model=representation_model,
                               verbose=True)

        topics_text, probs = topic_model.fit_transform(docs, embeddings_out)

    if not topics_text:
        return "No topics found, original file returned", data_file_name, None

    else:
        topics_text_out = topics_text
        topics_scores_out = probs

    topic_det_output_name = "topic_details_" + today_rev + ".csv"

    topic_dets = topic_model.get_topic_info()

    topic_dets.to_csv(topic_det_output_name)
    #print(topic_dets)

    doc_det_output_name = "doc_details_" + today_rev + ".csv"
    doc_dets = topic_model.get_document_info(docs)[["Document", "Topic", "Probability", "Name", "Representative_document"]]
    doc_dets.to_csv(doc_det_output_name)
    #print(doc_dets)

    #topics_text_out_str = ', '.join(list(topic_dets["KeyBERT"]))

    topics_text_out_str = str(topic_dets["KeyBERT"])
    #topics_scores_out_str = str(doc_dets["Probability"][0])

    output_text = "Topics: " + topics_text_out_str #+ "\n\nProbability scores: " + topics_scores_out_str

    # Outputs
    embedding_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
    np.savez_compressed(embedding_file_name, embeddings_out)

    topic_model_save_name = data_file_name_no_ext + "_topics_" + today_rev + ".pkl"
    topic_model.save(topic_model_save_name, serialization='pickle', save_embedding_model=False, save_ctfidf=False)

    # Visualise the topics:
    topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)

    return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name, topic_model_save_name], topics_vis


# ## Gradio app - extract topics

block = gr.Blocks(theme=gr.themes.Base())

with block:

    data_state = gr.State(pd.DataFrame())

    gr.Markdown(
    """
    # Extract topics from text
    Enter open text below to get topics. You can copy and paste text directly, or upload a file and specify the column that you want to extract topics from.
    """)

    #with gr.Accordion("I will copy and paste my open text", open = False):
    #    in_text = gr.Textbox(label="Copy and paste your open text here", lines = 5)

    with gr.Tab("Load files and find topics"):
        with gr.Accordion("Load data file", open = True):
            in_files = gr.File(label="Input text from file", file_count="multiple")
            with gr.Row():
                in_colnames = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to find topics (first will be chosen if multiple selected).")
                in_label = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column for labelling documents in the output visualisation.")

        with gr.Accordion("I have my own list of topics. File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file", open = False):
            candidate_topics = gr.File(label="Input topics from file (csv)")

        with gr.Row():
            min_docs_slider = gr.Slider(minimum = 1, maximum = 1000, value = 15, step = 1, label = "Minimum number of documents needed to create topic")
            max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 3, step = 1, label = "Maximum number of topics")

        with gr.Row():
            topics_btn = gr.Button("Extract topics")

        with gr.Row():
            output_single_text = gr.Textbox(label="Output example (first example in dataset)")
            output_file = gr.File(label="Output file")

        plot = gr.Plot(label="Visualise your topics here:")

    with gr.Tab("Load and data processing options"):
        with gr.Accordion("Process data on load", open = True):
            anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load.")
            return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
            embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
            low_resource_mode_opt = gr.Dropdown(label = "Low resource mode (non-AI embeddings, no LLM-generated topic names).", value=low_resource_mode, choices=["Yes", "No"])

    # Update column names dropdown when file uploaded
    in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
    in_colnames.change(dummy_function, in_colnames, None)

    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt], outputs=[output_single_text, output_file, plot], api_name="topics")

block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
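
For reference, a minimal sketch of preparing the candidate-topics file that the zero-shot accordion above expects — one column with a header and one topic per cell (the file name and topic names here are illustrative):

import pandas as pd

# Illustrative candidate topics; the app reads topics from the first column
pd.DataFrame({"topics": ["environment", "transport", "housing"]}).to_csv("candidate_topics.csv", index=False)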
funcs/__init__.py
ADDED
File without changes
funcs/anonymiser.py
ADDED
@@ -0,0 +1,251 @@
import spacy
import os

def is_model_installed(model_name):
    try:
        # Try to load the model
        spacy.load(model_name)
        return True
    except OSError:
        return False

model_name = "en_core_web_sm"
if not is_model_installed(model_name):
    os.system(f"python -m spacy download {model_name}")

# Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
#os.system("pip uninstall -y gradio")
#os.system("pip install gradio==3.50.0")
#os.system("python -m spacy download en_core_web_lg")

spacy.load(model_name)

import re
import secrets
import base64
import time

import pandas as pd
import gradio as gr

from faker import Faker

from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

# Create one Faker instance for the module; Faker's British English locale is "en_GB"
# (note that the custom anonymisation operator below has to receive a value)
fake = Faker("en_GB")

def fake_first_name(x):
    return fake.first_name()


def anon_consistent_names(df):
    # ## Pick out common names and replace them with the same person value
    df_dict = df.to_dict(orient="list")

    analyzer = AnalyzerEngine()
    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)

    analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
    analyzer_results = list(analyzer_results)

    text = analyzer_results[3].value
    recognizer_result = str(analyzer_results[3].recognizer_results)

    data_str = recognizer_result  # abbreviated for brevity

    # Adjusting the parse_dict function to handle trailing ']'
    # Splitting the main data string into individual list strings
    list_strs = data_str[1:-1].split('], [')

    def parse_dict(s):
        s = s.strip('[]')  # Removing any surrounding brackets
        items = s.split(', ')
        d = {}
        for item in items:
            key, value = item.split(': ')
            if key == 'score':
                d[key] = float(value)
            elif key in ['start', 'end']:
                d[key] = int(value)
            else:
                d[key] = value
        return d

    # Re-running the improved processing code
    result = []

    for lst_str in list_strs:
        # Splitting each list string into individual dictionary strings
        dict_strs = lst_str.split(', type: ')
        dict_strs = [dict_strs[0]] + ['type: ' + s for s in dict_strs[1:]]  # Prepending "type: " back to the split strings

        # Parsing each dictionary string
        dicts = [parse_dict(d) for d in dict_strs]
        result.append(dicts)

    names = []

    for idx, paragraph in enumerate(text):
        paragraph_texts = []
        for dictionary in result[idx]:
            if dictionary['type'] == 'PERSON':
                paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
        names.append(paragraph_texts)

    # Flatten the list of lists and extract unique names
    unique_names = list(set(name for sublist in names for name in sublist))

    fake_names = pd.Series(unique_names).apply(fake_first_name)

    mapping_df = pd.DataFrame(data={"Unique names": unique_names,
                                    "Fake names": fake_names})

    # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
    name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}

    scrubbed_df_consistent_names = df.replace(name_map, regex=True)

    return scrubbed_df_consistent_names

def detect_file_type(filename):
    """Detect the file type based on its extension."""
    if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
        return 'csv'
    elif filename.endswith('.xlsx'):
        return 'xlsx'
    elif filename.endswith('.parquet'):
        return 'parquet'
    else:
        raise ValueError("Unsupported file type.")

def read_file(filename):
    """Read the file based on its detected type."""
    file_type = detect_file_type(filename)

    if file_type == 'csv':
        return pd.read_csv(filename, low_memory=False)
    elif file_type == 'xlsx':
        return pd.read_excel(filename)
    elif file_type == 'parquet':
        return pd.read_parquet(filename)

def anonymise_script(df, chosen_col, anon_strat):

    # DataFrame to dict
    df_dict = pd.DataFrame(data={chosen_col: df[chosen_col].astype(str)}).to_dict(orient="list")

    analyzer = AnalyzerEngine()
    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)

    anonymizer = AnonymizerEngine()

    batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine=anonymizer)

    print("Identifying personal data")
    analyse_tic = time.perf_counter()
    analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
    #print(analyzer_results)
    analyzer_results = list(analyzer_results)

    analyse_toc = time.perf_counter()
    analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
    print(analyse_time_out)

    # Generate a 128-bit AES key, then encode the key using base64 to get a string representation
    key = secrets.token_bytes(16)  # 128 bits = 16 bytes
    key_string = base64.b64encode(key).decode('utf-8')

    # Set up the anonymization configuration WITHOUT DATE_TIME
    replace_config = {"DEFAULT": OperatorConfig("replace")}
    redact_config = {"DEFAULT": OperatorConfig("redact")}
    hash_config = {"DEFAULT": OperatorConfig("hash")}
    mask_config = {"DEFAULT": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 100, "from_end": True})}
    # The encryption uses an AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
    people_encrypt_config = {"PERSON": OperatorConfig("encrypt", {"key": key_string})}
    fake_first_name_config = {"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}

    if anon_strat == "replace": chosen_mask_config = replace_config
    if anon_strat == "redact": chosen_mask_config = redact_config
    if anon_strat == "hash": chosen_mask_config = hash_config
    if anon_strat == "mask": chosen_mask_config = mask_config
    if anon_strat == "encrypt": chosen_mask_config = people_encrypt_config
    if anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config

    # I think in general people will want to keep dates / times
    keep_date_config = {"DATE_TIME": OperatorConfig("keep")}

    combined_config = {**chosen_mask_config, **keep_date_config}

    anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)

    scrubbed_df = pd.DataFrame(anonymizer_results)

    # Create reporting message
    out_message = "Successfully anonymised"

    if anon_strat == "encrypt":
        out_message = out_message + ". Your decryption key is " + key_string + "."

    return scrubbed_df, out_message

def do_anonymise(in_file, anon_strat, chosen_cols):

    # Load file
    anon_df = pd.DataFrame()

    if in_file:
        for match_file in in_file:
            match_temp_file = pd.read_csv(match_file.name, delimiter=",", low_memory=False)  #, encoding='cp1252')
            anon_df = pd.concat([anon_df, match_temp_file])

    # Split dataframe to keep only selected columns
    all_cols_original_order = list(anon_df.columns)
    anon_df_part = anon_df[chosen_cols]
    anon_df_remain = anon_df.drop(chosen_cols, axis=1)

    # Anonymise the selected columns one at a time (anonymise_script works on a single column)
    anon_df_part_out = anon_df_part.copy()
    for col in chosen_cols:
        scrubbed_col, out_message = anonymise_script(anon_df_part, col, anon_strat)
        anon_df_part_out[col] = scrubbed_col[col]

    # Rejoin the dataframe together
    anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis=1)
    anon_df_out = anon_df_out[all_cols_original_order]

    # Export file
    out_file_part = re.sub(r'\.csv', '', match_file.name)

    anon_export_file_name = out_file_part + "_anon_" + anon_strat + ".csv"

    anon_df_out.to_csv(anon_export_file_name, index=None)

    return out_message, anon_export_file_name
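
As a rough illustration, here is how anonymise_script above might be called on a toy dataframe (the column name and text are made up; which placeholders appear depends on the Presidio recognisers that fire):

import pandas as pd

toy_df = pd.DataFrame({"text": ["John Smith emailed jane.doe@example.com on Monday."]})

scrubbed, message = anonymise_script(toy_df, chosen_col="text", anon_strat="replace")
print(message)           # "Successfully anonymised"
print(scrubbed["text"])  # PERSON and EMAIL_ADDRESS spans replaced; DATE_TIME kept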
funcs/embeddings.py
ADDED
@@ -0,0 +1,78 @@
import time
import numpy as np
from torch import cuda
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from umap import UMAP

if cuda.is_available():
    torch_device = "gpu"
else:
    torch_device = "cpu"

def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt):

    embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]

    if embeddings_file_names:
        print("Loading embeddings from file.")
        embeddings_out = np.load(embeddings_file_names[0])['arr_0']

        # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before saving
        if "compress" in embeddings_file_names[0]:
            embeddings_out /= 100

        # print("embeddings loaded: ", embeddings_out)

    if not embeddings_file_names:
        tic = time.perf_counter()
        print("Starting to embed documents.")

        # Custom model
        # If on CPU, don't use transformer-based embedding models
        if low_resource_mode_opt == "Yes":
            print("Creating simplified 'sparse' embeddings based on TfIDF")
            embedding_model = make_pipeline(
                TfidfVectorizer(),
                TruncatedSVD(100)
            )

            # scikit-learn pipelines are fitted with fit_transform rather than encode
            embeddings_out = embedding_model.fit_transform(docs)

        elif low_resource_mode_opt == "No":
            print("Creating dense embeddings based on transformers model")

            embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar=True, batch_size=32)  # For Jina

            #import torch
            #from torch.nn.utils.rnn import pad_sequence

            # Assuming embeddings_out is a list of tensors
            #embeddings_out = [torch.tensor(embedding) for embedding in embeddings_out]

            # Pad the sequences
            # Set batch_first=True if you want the batch dimension to be the first dimension
            #embeddings_out = pad_sequence(embeddings_out, batch_first=True, padding_value=0)

        toc = time.perf_counter()
        time_out = f"The embedding took {toc - tic:0.1f} seconds"
        print(time_out)

        # If you want to save your files for next time
        if return_intermediate_files == "Yes":
            if embeddings_super_compress == "No":
                semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
                np.savez_compressed(semantic_search_file_name, embeddings_out)
            else:
                # Round to three decimal places and scale by 100 so the compressed file is smaller
                semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
                embeddings_out_round = np.round(embeddings_out, 3)
                embeddings_out_round *= 100
                np.savez_compressed(semantic_search_file_name, embeddings_out_round)

    # Pre-reduce embeddings for visualisation purposes
    reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings_out)

    return embeddings_out, reduced_embeddings
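
As a quick sanity check, a minimal sketch of round-tripping an embeddings file the way make_or_load_embeddings saves and loads it (the shape and file name are illustrative):

import numpy as np

embeddings = np.random.rand(10, 512).astype(np.float32)
np.savez_compressed("example_embeddings.npz", embeddings)

loaded = np.load("example_embeddings.npz")["arr_0"]  # savez stores unnamed arrays under 'arr_0'
assert loaded.shape == embeddings.shape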
funcs/helper_functions.py
ADDED
@@ -0,0 +1,89 @@
import os
import re
import pandas as pd
import gradio as gr
import gzip
import pickle


def detect_file_type(filename):
    """Detect the file type based on its extension."""
    if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
        return 'csv'
    elif filename.endswith('.xlsx'):
        return 'xlsx'
    elif filename.endswith('.parquet'):
        return 'parquet'
    elif filename.endswith('.pkl.gz'):
        return 'pkl.gz'
    else:
        raise ValueError("Unsupported file type.")

def read_file(filename):
    """Read the file based on its detected type."""
    file_type = detect_file_type(filename)

    print("Loading in file")

    if file_type == 'csv':
        file = pd.read_csv(filename, low_memory=False).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
    elif file_type == 'xlsx':
        file = pd.read_excel(filename).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
    elif file_type == 'parquet':
        file = pd.read_parquet(filename).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
    elif file_type == 'pkl.gz':
        with gzip.open(filename, 'rb') as file_in:
            file = pickle.load(file_in)
        #file = pd.read_pickle(filename)

    print("File load complete")

    return file

def put_columns_in_df(in_file, in_bm25_column=None):
    '''
    When a file is loaded, update the column dropdown choices and change the 'clean data' dropdown option to 'no'.
    '''

    file_list = [string.name for string in in_file]

    data_file_names = [string.lower() for string in file_list if "npz" not in string.lower()]
    data_file_name = data_file_names[0]

    new_choices = []
    concat_choices = []

    df = read_file(data_file_name)

    new_choices = list(df.columns)

    concat_choices.extend(new_choices)

    return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df

def get_file_path_end(file_path):
    # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
    basename = os.path.basename(file_path)

    # Then, split the basename and its extension and return only the basename without the extension
    filename_without_extension, _ = os.path.splitext(basename)

    #print(filename_without_extension)

    return filename_without_extension

def get_file_path_end_with_ext(file_path):
    match = re.search(r'(.*[\/\\])?(.+)$', file_path)

    filename_end = match.group(2) if match else ''

    return filename_end

def dummy_function(in_colnames):
    """
    A dummy function that exists just so that dropdown updates work correctly.
    """
    return None
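
For clarity, a couple of hypothetical inputs and outputs for the path helpers above:

print(get_file_path_end("/path/to/example.csv"))           # -> "example"
print(get_file_path_end_with_ext("/path/to/example.csv"))  # -> "example.csv"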
funcs/prompts.py
ADDED
@@ -0,0 +1,106 @@
open_hermes_prompt = """<|im_start|>system
You are "Hermes 2", a conscious sentient superintelligent artificial intelligence developed by a man named Teknium, and your purpose and drive is to assist the user with any request they have.<|im_end|>
<|im_start|>user
Hello, who are you?<|im_end|>
<|im_start|>assistant
"""


# Example prompt demonstrating the output we are looking for
capybara_start = "USER:"

capybara_example_prompt = """USER:I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the worst food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.

Topic label: Environmental impacts of eating meat
"""

# Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
capybara_main_prompt = """
Now, create a new topic label given the following information.

I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.
ASSISTANT:Topic label:"""

capybara_prompt = capybara_example_prompt + capybara_main_prompt

print("Capybara prompt: ", capybara_prompt)

# System prompt describes information given to all conversations
open_hermes_start = "<|im_start|>"
open_hermes_system_prompt = """<|im_start|>system
You are a helpful, respectful and honest assistant for labeling topics.<|im_end|>
"""

# Example prompt demonstrating the output we are looking for
open_hermes_example_prompt = """<|im_start|>user
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the worst food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.

Topic label: Environmental impacts of eating meat
"""
open_hermes_main_prompt = """
Now, create a new topic label given the following information.

I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.<|im_end|>
<|im_start|>assistant
Topic label:
"""
open_hermes_prompt = open_hermes_system_prompt + open_hermes_example_prompt + open_hermes_main_prompt

print("Open Hermes prompt: ", open_hermes_prompt)

stablelm_start = "<|user|>"
stablelm_example_prompt = """<|user|>
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the worst food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.

Topic label: Environmental impacts of eating meat
"""

# Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
stablelm_main_prompt = """
Now, create a new topic label given the following information.

I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.<|endoftext|>
<|assistant|>
Topic label:"""

stablelm_prompt = stablelm_example_prompt + stablelm_main_prompt

print("StableLM prompt: ", stablelm_prompt)
funcs/representation_model.py
ADDED
@@ -0,0 +1,171 @@
import os
#from ctransformers import AutoModelForCausalLM
#from transformers import AutoTokenizer, pipeline
from bertopic.representation import LlamaCPP
from llama_cpp import Llama
from pydantic import BaseModel
import torch.cuda

from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start

#from huggingface_hub import hf_hub_download
#hf_hub_download(repo_id='second-state/stablelm-2-zephyr-1.6b-GGUF', filename='stablelm-2-zephyr-1_6b-Q5_K_M.gguf')

hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
chosen_prompt = open_hermes_prompt # stablelm_prompt
chosen_start_tag = open_hermes_start # stablelm_start

# Find model file
def find_model_file(hf_model_name, hf_model_file):
    # Fall back to the default Hugging Face cache location if HF_HOME is not set
    hf_loc = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
    hf_sub_loc = hf_loc + "/hub/"

    hf_model_name_path = hf_sub_loc + 'models--' + hf_model_name.replace("/", "--")

    print(hf_model_name_path)

    def find_file(root_folder, file_name):
        for root, dirs, files in os.walk(root_folder):
            if file_name in files:
                return os.path.join(root, file_name)
        return None

    # Example usage
    folder_path = hf_model_name_path  # Replace with your folder path
    file_to_find = hf_model_file      # Replace with the file name you're looking for

    found_file = find_file(folder_path, file_to_find)
    if found_file:
        print(f"File found: {found_file}")
        return found_file
    else:
        error = "File not found."
        print(error)
        return error

found_file = find_model_file(hf_model_name, hf_model_file)

# Currently set n_gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
if torch.cuda.is_available():
    torch_device = "gpu"
    low_resource_mode = "No"
    n_gpu_layers = 100
else:
    torch_device = "cpu"
    low_resource_mode = "Yes"
    n_gpu_layers = 0

#low_resource_mode = "Yes"

#print("Running on device:", torch_device)
n_threads = torch.get_num_threads()
print("CPU n_threads:", n_threads)

# Default model parameters
temperature: float = 0.1
top_k: int = 3
top_p: float = 1
repeat_penalty: float = 1.1
last_n_tokens_size: int = 128
max_tokens: int = 500
seed: int = 42
reset: bool = True
stream: bool = False
n_threads: int = n_threads
n_batch: int = 256
n_ctx: int = 4096
sample: bool = True
trust_remote_code: bool = True

class LLamacppInitConfigGpu(BaseModel):
    last_n_tokens_size: int
    seed: int
    n_threads: int
    n_batch: int
    n_ctx: int
    n_gpu_layers: int
    temperature: float
    top_k: int
    top_p: float
    repeat_penalty: float
    max_tokens: int
    reset: bool
    stream: bool
    stop: str
    trust_remote_code: bool

    def update_gpu(self, new_value: int):
        self.n_gpu_layers = new_value

gpu_config = LLamacppInitConfigGpu(last_n_tokens_size=last_n_tokens_size,
                                   seed=seed,
                                   n_threads=n_threads,
                                   n_batch=n_batch,
                                   n_ctx=n_ctx,
                                   n_gpu_layers=n_gpu_layers,
                                   temperature=temperature,
                                   top_k=top_k,
                                   top_p=top_p,
                                   repeat_penalty=repeat_penalty,
                                   max_tokens=max_tokens,
                                   reset=reset,
                                   stream=stream,
                                   stop=chosen_start_tag,
                                   trust_remote_code=trust_remote_code)

cpu_config = gpu_config.model_copy()
cpu_config.update_gpu(0)

class LLamacppGenerateConfig(BaseModel):
    temperature: float
    top_k: int
    top_p: float
    repeat_penalty: float
    max_tokens: int
    reset: bool
    stream: bool

gen_config = LLamacppGenerateConfig(
    temperature=temperature,
    top_k=top_k,
    top_p=top_p,
    repeat_penalty=repeat_penalty,
    max_tokens=max_tokens,
    reset=reset,
    stream=stream)

## Create representation model parameters ##
# KeyBERT
keybert = KeyBERTInspired()

if low_resource_mode == "No":
    # Use llama.cpp to load in the model
    llm = Llama(model_path=found_file, stop=chosen_start_tag, n_gpu_layers=n_gpu_layers, n_ctx=n_ctx)  #**gpu_config.model_dump())
    #print(llm.n_gpu_layers)
    llm_model = LlamaCPP(llm, prompt=chosen_prompt)  #, **gen_config.model_dump())

    # All representation models
    representation_model = {
        "KeyBERT": keybert,
        "Mistral": llm_model
    }

elif low_resource_mode == "Yes":
    representation_model = {"KeyBERT": keybert}

# Deprecated example using CTransformers. This package is not really used anymore
#model = AutoModelForCausalLM.from_pretrained('NousResearch/Nous-Capybara-7B-V1.9-GGUF', model_type='mistral', model_file='Capybara-7B-V1.9-Q5_K_M.gguf', hf=True, **vars(gpu_config))
#tokenizer = AutoTokenizer.from_pretrained("NousResearch/Nous-Capybara-7B-V1.9")
#generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Text generation with Llama 2
#mistral_capybara = TextGeneration(generator, prompt=capybara_prompt)
#mistral_hermes = TextGeneration(generator, prompt=open_hermes_prompt)

# MMR (is rubbish, don't use)
#mmr = MaximalMarginalRelevance(diversity=0.3)
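
For context, a minimal sketch of how a representation-model dictionary like the one above plugs into BERTopic — each key becomes an extra aspect column (e.g. 'KeyBERT') in the topic info table (the toy corpus is illustrative):

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

docs = ["the cat sat on the mat", "dogs are loyal pets", "cats purr when happy"] * 20

topic_model = BERTopic(representation_model={"KeyBERT": KeyBERTInspired()}, min_topic_size=5)
topics, probs = topic_model.fit_transform(docs)

# Aspect representations appear as their own columns in the topic info table
print(topic_model.get_topic_info()[["Topic", "Name", "KeyBERT"]])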
requirements.txt
ADDED
@@ -0,0 +1,11 @@
gradio==3.50.0
transformers
accelerate
torch
llama-cpp-python
bertopic
spacy
pyarrow
faker
presidio_analyzer
presidio_anonymizer