Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- README.md +28 -12
- __pycache__/utils.cpython-312.pyc +0 -0
- data/hf_tokens.pkl +3 -0
- data/test.json +3 -0
- demo.py +72 -0
- requirements.txt +3 -0
- utils.py +180 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
data/test.json filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -1,12 +1,28 @@
|
|
1 |
-
---
|
2 |
-
title:
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: dialogue-summarization
|
3 |
+
app_file: demo.py
|
4 |
+
sdk: gradio
|
5 |
+
sdk_version: 4.36.1
|
6 |
+
---
|
7 |
+
# dialogue-summarization
|
8 |
+
A mini-project topic at Viettel Digital Talent 2024
|
9 |
+
|
10 |
+
## How to use:
|
11 |
+
To use this project with git, do the following.
|
12 |
+
* Firstly, clone the repository's main branch into your desired directory using your git command prompt.
|
13 |
+
|
14 |
+
```git clone https://github.com/king17pvp/dialogue-summarization.git```
|
15 |
+
* Secondly, you can access the directory by this command.
|
16 |
+
|
17 |
+
```cd dialogue-summarization```
|
18 |
+
|
19 |
+
* Thirdly, install required libraries via requirements.txt
|
20 |
+
|
21 |
+
```pip install -q -r requirements.txt```
|
22 |
+
* Finally, run the project by
|
23 |
+
|
24 |
+
```python demo.py```
|
25 |
+
* If you want to stop the program, press *Ctrl + C* in the terminal and the session will stop.
|
26 |
+
|
27 |
+
* **IMPORTANT NOTES:** If any errors occur when you run the project in step 3, reinstall Python (version 3.10 or newer) from the official website and restart from step 1. Reference download link: https://www.python.org/downloads/release/python-3123/ (make sure to add Python to PATH during installation).
|
28 |
+
|
__pycache__/utils.cpython-312.pyc
ADDED
Binary file (8.55 kB). View file
|
|
data/hf_tokens.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e0f9adedca61139d46f323d18a929f71424fa3931e3eee560c64a3a3484db290
|
3 |
+
size 218
|
data/test.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a26c61f1ac554e654bd4e439324a3a3caab5269050c8e378c1a4e615124e44bc
|
3 |
+
size 13393961
|
demo.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import requests
|
3 |
+
import nltk
|
4 |
+
import json
|
5 |
+
import random
|
6 |
+
from utils import *
|
7 |
+
|
8 |
+
# Function to handle summarization based on the selected method
|
9 |
+
# Function to handle summarization based on the selected method
def summarize(text: str, method: str) -> str:
    """Summarize *text* using the strategy selected in the UI.

    Parameters
    ----------
    text : raw transcript entered by the user.
    method : one of "Truncation", "Rewrite", "Accumulate", "Memory Tree".

    Returns the summary string for the chosen strategy.
    Raises ValueError for an unrecognized method (previously returned None,
    which rendered as an empty Gradio output with no explanation).
    """
    # Validate up front, before any expensive work.
    valid_methods = ("Truncation", "Rewrite", "Accumulate", "Memory Tree")
    if method not in valid_methods:
        raise ValueError(f"Unknown summarization method: {method!r}")

    chunks = split_chunk(text)

    if method == "Memory Tree":
        # Only this strategy needs the hierarchical tree build; the original
        # code built BOTH the tree and the three-way summaries on every call,
        # doubling the number of LLM requests.
        tree = MemWalker(chunks)
        tree.build_memory_tree()
        return tree.root.summary

    # The remaining three strategies share one chunk-by-chunk summarization pass.
    summary = summarize_three_ways(chunks)
    key = {
        "Truncation": "truncated",
        "Rewrite": "rewrite",
        "Accumulate": "accumulate",
    }[method]
    return summary[key]
|
26 |
+
|
27 |
+
# Loaded once at import time: four fixed example transcripts used to label
# and populate the example buttons below. NOTE(review): this reads
# data/test.json on startup — slow start if the file is large.
example_transcript = get_example()
|
28 |
+
|
29 |
+
def clear_input():
    """Reset the UI: empty the transcript box and restore the default method."""
    cleared_text = ""
    default_method = "Truncation"
    return cleared_text, default_method
|
31 |
+
|
32 |
+
def load_example(index):
    """Return the cached example transcript at position *index*.

    Reads the module-level ``example_transcript`` list populated at import time.
    """
    return example_transcript[index]
|
34 |
+
|
35 |
+
# Gradio button callbacks take no arguments, so each example button gets its
# own zero-argument wrapper around load_example.
def load_example_1():
    """Load example transcript #1 into the input box."""
    return load_example(0)


def load_example_2():
    """Load example transcript #2 into the input box."""
    return load_example(1)


def load_example_3():
    """Load example transcript #3 into the input box."""
    return load_example(2)


def load_example_4():
    """Load example transcript #4 into the input box."""
    return load_example(3)
|
43 |
+
|
44 |
+
# Create the interface
|
45 |
+
# Build the Gradio UI: input box, strategy selector, output box, action
# buttons, and one button per canned example transcript.
with gr.Blocks() as iface:
    # Transcript input, summarization-strategy radio, and summary output.
    text_input = gr.Textbox(lines=5, placeholder="Enter text to summarize here...", label="Input Transcript")
    method_input = gr.Radio(choices=["Truncation", "Rewrite", "Accumulate", "Memory Tree"], label="Summarization Method", value="Truncation")
    output_text = gr.Textbox(label="Summary")

    summarize_button = gr.Button("Summarize")
    clear_button = gr.Button("Clear Input")

    # Wire callbacks: Summarize fills the output; Clear resets input + method.
    summarize_button.click(summarize, inputs=[text_input, method_input], outputs=output_text)
    clear_button.click(clear_input, outputs=[text_input, method_input])
    #gr.Exa
    #gr.Examples([example_transcript[:100]], inputs=[text_input], label=f"Example", fn=load_example)

    # One button per example; the label previews the first 600 characters and
    # clicking loads the full transcript into the input box.
    example_button_1 = gr.Button(f"Example {1}: {example_transcript[0][:600]}...")
    example_button_1.click(load_example_1, inputs = None, outputs=[text_input])

    example_button_2 = gr.Button(f"Example {2}: {example_transcript[1][:600]}...")
    example_button_2.click(load_example_2, inputs = None, outputs=[text_input])

    example_button_3 = gr.Button(f"Example {3}: {example_transcript[2][:600]}...")
    example_button_3.click(load_example_3, inputs = None, outputs=[text_input])

    example_button_4 = gr.Button(f"Example {4}: {example_transcript[3][:600]}...")
    example_button_4.click(load_example_4, inputs = None, outputs=[text_input])


# Launch the interface
# NOTE(review): share=True opens a public tunnel and debug=True blocks the
# process — confirm both are intended for the deployed Space.
iface.launch(share = True, debug = True)
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
nltk
|
3 |
+
requests
|
utils.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import nltk
|
3 |
+
import random
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
import pickle
|
7 |
+
import re
|
8 |
+
# List of Hugging Face API tokens; gen_prompt probes them in order and uses
# the first one that works.
hf_tokens = []

# Directory containing this file, with forward slashes so the data paths work
# on both Windows and POSIX.
filepath = __file__.replace("\\", "/").replace("utils.py", "")

# SECURITY NOTE(review): pickle.load executes arbitrary code if the file is
# tampered with — confirm data/hf_tokens.pkl is trusted, and consider a plain
# text/JSON format for a list of token strings.
with open(filepath + "data/hf_tokens.pkl", "rb") as f:
    hf_tokens = pickle.load(f)


# Model context limit (tokens) — currently unused in this module.
MAX_TOKEN_LENGTH = 4096
# Maximum chunk size in CHARACTERS used by split_chunk.
MAX_CHUNK_SIZE = 16000
# Hosted inference endpoint for Llama-3-8B-Instruct.
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
|
19 |
+
|
20 |
+
def prompt_template(prompt, sys_prompt = ""):
    """Wrap *prompt* and *sys_prompt* in the Llama-3 chat template.

    Built by direct concatenation rather than sequential ``str.replace`` on a
    placeholder template: the old code replaced ``<user_prompt>`` first, so a
    user prompt that happened to contain the literal text ``<system_prompt>``
    was corrupted by the second replace.
    """
    return (
        '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n'
        + sys_prompt
        + '<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n'
        + prompt
        + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
    )
|
23 |
+
|
24 |
+
def query(payload: dict, hf_token: str):
    """POST *payload* to the HF inference endpoint and return the decoded JSON.

    payload : request body, e.g. {"inputs": ..., "parameters": {...}}.
    hf_token : bearer token for the Authorization header.
    """
    headers = {"Authorization": f"Bearer {hf_token}"}
    # A timeout prevents the Gradio worker from hanging forever on a stuck
    # request (requests.post blocks indefinitely without one).
    response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    return response.json()
|
28 |
+
|
29 |
+
def gen_prompt(prompt: str, sys_prompt: str = ""):
    """Generate a completion for *prompt* via the hosted LLM.

    Probes each token in ``hf_tokens`` with a cheap test query and uses the
    first one the API accepts. Raises RuntimeError when no token works — the
    original fell through and sent the real request with an empty Bearer
    token, producing a confusing downstream KeyError.
    """
    input_prompt = prompt_template(prompt, sys_prompt)
    selected_token = ''
    for token in hf_tokens:
        test_output = query({
            "inputs": prompt_template("Who are you?"),
            "parameters": {"max_new_tokens": 100}
        }, token)
        if 'error' not in test_output:
            selected_token = token
            break
    if not selected_token:
        # Fail fast with a clear message instead of querying unauthenticated.
        raise RuntimeError("No working Hugging Face API token available")
    output = query({
        "inputs": input_prompt,
        "parameters": {"max_new_tokens": 512},
    }, selected_token)
    # The API echoes the prompt; strip it to return only the completion.
    return output[0]['generated_text'][len(input_prompt):]
|
45 |
+
|
46 |
+
class Node:
    """One node of the memory tree: a summary plus parent/child links."""

    def __init__(self, summary=None):
        self.summary = summary  # summary text covering this node's subtree
        self.parent = None      # set when attached to a parent via add_child
        self.children = []      # ordered list of child nodes

    def add_child(self, child_node):
        """Attach *child_node* beneath this node and record the back-link."""
        self.children.append(child_node)
        child_node.parent = self
|
55 |
+
|
56 |
+
class MemWalker:
    """Builds a binary summary tree over transcript segments (MemWalker-style).

    Leaves summarize individual segments; each internal node summarizes the
    concatenation of its two children's summaries. After build_memory_tree,
    ``self.root.summary`` is the whole-transcript summary.
    """

    def __init__(self, segments):
        self.segments = segments
        # Was 0 (an int sentinel); None is the correct "not built yet" marker
        # and is falsy just the same for any truthiness checks.
        self.root = None

    def build_memory_tree(self):
        """Build the tree bottom-up; sets self.root to the top node.

        Raises ValueError on empty input (previously crashed with IndexError).
        """
        if not self.segments:
            raise ValueError("cannot build a memory tree from zero segments")

        # Step 1: Create leaf nodes for each segment (sum_type 0 = raw text).
        leaves = [Node(summarize(seg, 0)) for seg in self.segments]

        # Step 2: Repeatedly pair adjacent nodes; an odd trailing node is
        # promoted unchanged to the next level.
        while len(leaves) > 1:
            new_leaves = []
            for i in range(0, len(leaves), 2):
                if i + 1 < len(leaves):
                    # sum_type 1 = compress two existing summaries into one.
                    combined_summary = summarize(leaves[i].summary + ", " + leaves[i + 1].summary, 1)
                    parent_node = Node(combined_summary)
                    parent_node.add_child(leaves[i])
                    parent_node.add_child(leaves[i + 1])
                else:
                    parent_node = leaves[i]
                new_leaves.append(parent_node)
            leaves = new_leaves
        self.root = leaves[0]
|
79 |
+
|
80 |
+
# Placeholder functions for LLM operations
|
81 |
+
def summarize(text, sum_type: int = 1):
    """Summarize *text* with the hosted LLM.

    sum_type 0: summarize a raw transcript chunk (max 5 sentences).
    sum_type 1: compress already-produced summaries into a shorter one.

    Raises ValueError for any other sum_type — the original used ``assert``,
    which is silently stripped when Python runs with -O.
    """
    if sum_type not in (0, 1):
        raise ValueError("sum_type should be either 0 or 1")
    if sum_type == 0:
        USER_PROMPT = "Write a concise summary of the meeting transcript in maximum 5 sentences:" + "\n\n" + text
    else:
        USER_PROMPT = "Compress the following summaries into a much shorter summary: " + "\n\n" + text
    SYS_PROMPT = "Act as a professional technical meeting minutes writer."

    tmp = gen_prompt(USER_PROMPT, SYS_PROMPT)
    # The model often prefixes a header paragraph; when it does, keep only the
    # second paragraph (the actual summary).
    parts = tmp.split("\n\n")
    if len(parts) == 1:
        return tmp
    return parts[1]
|
94 |
+
|
95 |
+
|
96 |
+
#return output[0]['generated_text'][len(input_prompt):]
|
97 |
+
|
98 |
+
def split_chunk(transcript: str):
    """Split *transcript* into chunks of at most ~MAX_CHUNK_SIZE characters.

    Chunks are built sentence by sentence (nltk.sent_tokenize). When a chunk
    fills up, the next chunk starts with an overlap of the previous 10
    sentences plus the current one, so context carries across boundaries.

    Returns a list of chunk strings (always at least one, possibly empty for
    an empty transcript).
    """
    sentences = nltk.sent_tokenize(transcript)
    idx = 0
    chunks = []
    current_chunk = ""
    while idx < len(sentences):
        if len(current_chunk + sentences[idx]) < MAX_CHUNK_SIZE:
            current_chunk += sentences[idx] + " "
        else:
            chunks.append(current_chunk)
            current_chunk = ''
            # Seed the new chunk with the trailing 10 sentences + the current
            # one. Clamp the index at 0: the original `sentences[idx - i]`
            # went negative when a boundary occurred before sentence 10 and
            # silently pulled sentences from the END of the transcript.
            for i in range(10, -1, -1):
                current_chunk += sentences[max(idx - i, 0)] + " "
        idx += 1

    chunks.append(current_chunk)

    return chunks
|
116 |
+
|
117 |
+
def summarize_three_ways(chunks: list[str]):
    """Produce three summaries of *chunks* in one refine pass.

    Returns a dict with keys:
      'truncated'  — summary of the first chunk only,
      'accumulate' — summary iteratively refined over all chunks,
      'rewrite'    — the accumulated partial summaries rewritten coherently.
    """
    SYS_PROMPT = "Act as a professional technical meeting minutes writer."
    PROMPT_TEMPLATE = "Write a concise summary of the meeting transcript in maximum 5 sentences:" + "\n\n" + "{text}"
    REFINE_TEMPLATE = (
        "Your job is to produce a final summary\n"
        "We have provided an existing summary up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing summary"
        "(only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary in English within 5 sentences. If the context isn't useful, return the original summary."
    )

    partial_sum = []
    for position, piece in enumerate(chunks):
        # First chunk: fresh summary. Later chunks: refine the latest summary
        # with the new context.
        if position == 0:
            current_prompt = PROMPT_TEMPLATE.replace("{text}", piece)
        else:
            current_prompt = REFINE_TEMPLATE.replace("{existing_answer}", partial_sum[-1])
            current_prompt = current_prompt.replace("{text}", piece)
        draft = gen_prompt(current_prompt, SYS_PROMPT)
        # Strip a leading header paragraph when the model emits one.
        paragraphs = draft.split("\n\n")
        if len(paragraphs) > 1:
            draft = paragraphs[1]
        partial_sum.append(draft)

    # Rewrite all partial summaries into one coherent text.
    rewrite_prompt = "Rewrite the following text by maintaining coherency: " + "\n\n"
    rewrite_prompt += ' '.join(partial_sum)
    rewritten = gen_prompt(rewrite_prompt, SYS_PROMPT)
    rewritten_paragraphs = rewritten.split("\n\n")
    if len(rewritten_paragraphs) == 1:
        final_sum = rewritten
    else:
        final_sum = rewritten_paragraphs[1]

    return {
        'truncated': partial_sum[0],
        'accumulate': partial_sum[-1],
        'rewrite': final_sum,
    }
|
162 |
+
|
163 |
+
def get_example()->list[str]:
    """Load the bundled test set and return four fixed example transcripts.

    Each returned transcript has one sentence per line (via nltk sentence
    tokenization) so it renders readably in the Gradio textbox.
    """
    with open(filepath + "data/test.json", "r") as f:
        data = [json.loads(line) for line in f]

    # Fixed picks keep the demo's example buttons stable between launches.
    chosen = [1, 2, 9, 13]
    return ['\n'.join(nltk.sent_tokenize(data[i]['transcript'])) for i in chosen]
|
172 |
+
|
173 |
+
if __name__ == "__main__":
    # Disabled ad-hoc inspection script: printed transcript lengths for the
    # first 100 test examples. Left as an inert string literal, so running
    # this module directly is currently a no-op.
    '''data = []
    with open(filepath + "data/test.json", "r") as f:
        for line in f:
            data.append(json.loads(line))
    tmp = data[:100]
    for j, i in enumerate(tmp):
        print(j, len(i['transcript']))'''
|