king17pvp commited on
Commit
fb2d628
1 Parent(s): d7fccfd

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/test.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,28 @@
1
- ---
2
- title: Dialogue Summarization
3
- emoji: 📚
4
- colorFrom: red
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 4.36.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: dialogue-summarization
3
+ app_file: demo.py
4
+ sdk: gradio
5
+ sdk_version: 4.36.1
6
+ ---
7
+ # dialogue-summarization
8
+ A mini-project topic at Viettel Digital Talent 2024
9
+
10
+ ## How to use:
11
+ To use this project with Git, please do the following.
12
+ * Firstly, clone the repository's main branch into your desired directory using your git command prompt.
13
+
14
+ ```git clone https://github.com/king17pvp/dialogue-summarization.git```
15
+ * Secondly, you can access the directory by this command.
16
+
17
+ ```cd dialogue-summarization```
18
+
19
+ * Thirdly, install required libraries via requirements.txt
20
+
21
+ ```pip install -q -r requirements.txt```
22
+ * Finally, run the project by
23
+
24
+ ```python demo.py```
25
+ * If you want to stop the program, press *Ctrl + C* in the terminal and the session will stop.
26
+
27
+ * **IMPORTANT NOTE:** If there are any errors when you run the project in step 3, reinstall Python (version 3.10+) from the official website and restart from step 1. Download link: https://www.python.org/downloads/release/python-3123/ (make sure to add Python to PATH when installing).
28
+
__pycache__/utils.cpython-312.pyc ADDED
Binary file (8.55 kB). View file
 
data/hf_tokens.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0f9adedca61139d46f323d18a929f71424fa3931e3eee560c64a3a3484db290
3
+ size 218
data/test.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a26c61f1ac554e654bd4e439324a3a3caab5269050c8e378c1a4e615124e44bc
3
+ size 13393961
demo.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import nltk
4
+ import json
5
+ import random
6
+ from utils import *
7
+
8
# Function to handle summarization based on the selected method.
def summarize(text: str, method: str) -> str:
    """Summarize a dialogue transcript with the chosen strategy.

    Parameters
    ----------
    text : str
        Raw transcript text (possibly multi-line).
    method : str
        One of "Truncation", "Rewrite", "Accumulate", or "Memory Tree".

    Returns
    -------
    str
        The summary produced by the selected method.

    Raises
    ------
    ValueError
        If *method* is not one of the supported options (the original
        silently returned None in that case).
    """
    # NOTE(review): the original computed a whitespace-normalized copy of
    # the text (processed_text) but never used it; that dead code is removed.
    chunks = split_chunk(text)
    tree = MemWalker(chunks)
    tree.build_memory_tree()
    summary = summarize_three_ways(chunks)

    if method == "Truncation":
        return summary['truncated']
    elif method == "Rewrite":
        return summary['rewrite']
    elif method == "Accumulate":
        return summary['accumulate']
    elif method == "Memory Tree":
        return tree.root.summary
    raise ValueError(f"Unknown summarization method: {method!r}")
26
+
27
# Transcripts shown as clickable examples in the UI, loaded once at startup.
example_transcript = get_example()

def clear_input():
    """Reset the transcript box and the method selector to their defaults."""
    return "", "Truncation"

def load_example(index):
    """Return the pre-loaded example transcript at position *index*."""
    return example_transcript[index]
34
+
35
def _make_example_loader(position):
    """Return a zero-argument callback that loads the example at *position*.

    Gradio button callbacks take no arguments, so each example button needs
    its own wrapper; the factory binds the index once instead of repeating
    four near-identical function definitions.
    """
    def _loader():
        return load_example(position)
    return _loader

load_example_1 = _make_example_loader(0)
load_example_2 = _make_example_loader(1)
load_example_3 = _make_example_loader(2)
load_example_4 = _make_example_loader(3)
43
+
44
# Build the Gradio interface.
with gr.Blocks() as iface:
    text_input = gr.Textbox(
        lines=5,
        placeholder="Enter text to summarize here...",
        label="Input Transcript",
    )
    method_input = gr.Radio(
        choices=["Truncation", "Rewrite", "Accumulate", "Memory Tree"],
        label="Summarization Method",
        value="Truncation",
    )
    output_text = gr.Textbox(label="Summary")

    summarize_button = gr.Button("Summarize")
    clear_button = gr.Button("Clear Input")

    summarize_button.click(summarize, inputs=[text_input, method_input], outputs=output_text)
    clear_button.click(clear_input, outputs=[text_input, method_input])

    # One button per canned example; each label shows a 600-character preview
    # of the transcript it loads into the input box.
    loaders = [load_example_1, load_example_2, load_example_3, load_example_4]
    for position, loader in enumerate(loaders):
        preview = example_transcript[position][:600]
        example_button = gr.Button(f"Example {position + 1}: {preview}...")
        example_button.click(loader, inputs=None, outputs=[text_input])

# Launch the interface (public share link, debug logging enabled).
iface.launch(share=True, debug=True)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ nltk
3
+ requests
utils.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import nltk
3
+ import random
4
+ import json
5
+ import os
6
+ import pickle
7
+ import re
8
# Hugging Face API tokens, loaded below from a pickled list shipped in data/.
hf_tokens = []

# Directory containing this module, normalized to forward slashes and ending
# with "/", so data files resolve relative to the module regardless of the
# current working directory.  (The original used
# __file__.replace("utils.py", ""), which strips EVERY occurrence of the
# substring "utils.py" anywhere in the path and breaks if the file is
# renamed; os.path.dirname is the robust form.)
filepath = os.path.dirname(os.path.abspath(__file__)).replace("\\", "/") + "/"

with open(filepath + "data/hf_tokens.pkl", "rb") as f:
    hf_tokens = pickle.load(f)


# Generation limits and the hosted inference endpoint for Llama-3-8B-Instruct.
MAX_TOKEN_LENGTH = 4096
MAX_CHUNK_SIZE = 16000
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
19
+
20
def prompt_template(prompt, sys_prompt = ""):
    """Wrap *prompt* (and an optional system prompt) in the Llama-3 chat format.

    The template carries literal ``<system_prompt>`` and ``<user_prompt>``
    markers; the user marker is substituted first and the system marker
    second, preserving the original substitution order exactly.
    """
    template = (
        '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n'
        '<system_prompt><|eot_id|><|start_header_id|>user<|end_header_id|>\n\n'
        '<user_prompt><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
    )
    with_user = template.replace('<user_prompt>', prompt)
    return with_user.replace('<system_prompt>', sys_prompt)
23
+
24
def query(payload: dict, hf_token: str):
    """POST *payload* to the inference endpoint and return the decoded JSON reply.

    *hf_token* is sent as a Bearer token in the Authorization header.
    """
    auth_header = {"Authorization": f"Bearer {hf_token}"}
    return requests.post(API_URL, headers=auth_header, json=payload).json()
28
+
29
def gen_prompt(prompt: str, sys_prompt: str = ""):
    """Generate a completion for *prompt* via the hosted Llama-3 endpoint.

    Each configured token is probed with a tiny test request and the first
    one that works is used for the real call.

    Raises
    ------
    RuntimeError
        If no token in ``hf_tokens`` passes the probe.  (The original fell
        through with an empty token and crashed on the error-dict response.)
    """
    input_prompt = prompt_template(prompt, sys_prompt)

    selected_token = None
    for token in hf_tokens:
        # Cheap probe: any response without an "error" key means the token
        # is accepted and the model is reachable.
        test_output = query({
            "inputs": prompt_template("Who are you?"),
            "parameters": {"max_new_tokens": 100}
        }, token)
        if 'error' not in test_output:
            selected_token = token
            break
    if selected_token is None:
        raise RuntimeError("No usable Hugging Face API token found in hf_tokens")

    output = query({
        "inputs": input_prompt,
        "parameters": {"max_new_tokens": 512},
    }, selected_token)
    # The API echoes the prompt; strip it so only the completion remains.
    return output[0]['generated_text'][len(input_prompt):]
45
+
46
class Node:
    """A node in the memory tree: a summary string plus parent/child links."""

    def __init__(self, summary=None):
        # Summary text held at this node; None until one is assigned.
        self.summary = summary
        # Child nodes, kept in insertion order.
        self.children = []
        # Back-reference to the enclosing node; None for the root.
        self.parent = None

    def add_child(self, child_node):
        """Attach *child_node* beneath this node and set its parent pointer."""
        child_node.parent = self
        self.children.append(child_node)
55
+
56
class MemWalker:
    """Builds a hierarchical "memory tree" of summaries over text segments.

    Leaves summarize individual segments; each internal node summarizes the
    concatenation of its two children's summaries, pairing upward until a
    single root remains.
    """

    def __init__(self, segments):
        # Text chunks to be summarized into the tree.
        self.segments = segments
        # Root Node of the tree; None until build_memory_tree() runs.
        # (The original initialized this to 0, a misleading non-Node value.)
        self.root = None

    def build_memory_tree(self):
        """Summarize all segments and assemble them into a binary tree."""
        # Step 1: one leaf per segment, each holding a per-segment summary.
        leaves = [Node(summarize(seg, 0)) for seg in self.segments]
        if not leaves:
            # No segments -> no tree; the original raised IndexError here.
            return

        # Step 2: repeatedly pair adjacent nodes, summarizing each pair,
        # until one node remains.  An odd node out is promoted unchanged.
        while len(leaves) > 1:
            next_level = []
            for i in range(0, len(leaves), 2):
                if i + 1 < len(leaves):
                    combined_summary = summarize(
                        leaves[i].summary + ", " + leaves[i + 1].summary, 1)
                    parent_node = Node(combined_summary)
                    parent_node.add_child(leaves[i])
                    parent_node.add_child(leaves[i + 1])
                else:
                    parent_node = leaves[i]
                next_level.append(parent_node)
            leaves = next_level
        self.root = leaves[0]
79
+
80
# Placeholder functions for LLM operations
def summarize(text, sum_type: int = 1):
    """Summarize *text* with the hosted LLM.

    Parameters
    ----------
    text : str
        Transcript segment (sum_type 0) or concatenated summaries (sum_type 1).
    sum_type : int
        0 = summarize a raw meeting transcript;
        1 = compress several existing summaries into a shorter one.

    Raises
    ------
    ValueError
        If *sum_type* is not 0 or 1.  (The original used ``assert``, which
        is stripped under ``python -O`` and had an unprofessional message.)
    """
    if sum_type not in (0, 1):
        raise ValueError("sum_type should be either 0 or 1")
    if sum_type == 0:
        USER_PROMPT = "Write a concise summary of the meeting transcript in maximum 5 sentences:" + "\n\n" + text
    else:
        USER_PROMPT = "Compress the following summaries into a much shorter summary: " + "\n\n" + text
    SYS_PROMPT = "Act as a professional technical meeting minutes writer."

    tmp = gen_prompt(USER_PROMPT, SYS_PROMPT)
    # The model often prefixes a preamble paragraph; when the reply has
    # multiple paragraphs, keep the second one (the actual summary).
    parts = tmp.split("\n\n")
    return parts[0] if len(parts) == 1 else parts[1]
97
+
98
def split_chunk(transcript: str):
    """Split *transcript* into chunks of fewer than MAX_CHUNK_SIZE characters.

    Chunks are built from whole sentences (nltk sentence tokenizer).  When a
    chunk fills up, the next chunk is seeded with up to the last 11 sentences
    so consecutive chunks overlap and context carries over.

    Returns a list with at least one (possibly empty) chunk.
    """
    sentences = nltk.sent_tokenize(transcript)
    idx = 0
    chunk = []
    current_chunk = ""
    while idx < len(sentences):
        if len(current_chunk + sentences[idx]) < MAX_CHUNK_SIZE:
            current_chunk += sentences[idx] + " "
        else:
            chunk.append(current_chunk)
            # Seed the next chunk with an overlap window ending at the
            # current sentence.  Clamp the window start at 0: the original
            # indexed sentences[idx - i], which wrapped around to the END
            # of the list via negative indexing whenever idx < 10.
            current_chunk = ''
            for j in range(max(0, idx - 10), idx + 1):
                current_chunk += sentences[j] + " "
        idx += 1

    chunk.append(current_chunk)

    return chunk
116
+
117
def summarize_three_ways(chunks: list[str]):
    """Produce three summaries of *chunks* and return them as a dict.

    Keys of the returned dict:
    - 'truncated':  summary of the first chunk only.
    - 'accumulate': iteratively refined summary across all chunks.
    - 'rewrite':    the concatenated partial summaries rewritten coherently.

    Raises
    ------
    ValueError
        If *chunks* is empty (the original crashed with IndexError after
        making a wasted API call).
    """
    if not chunks:
        raise ValueError("chunks must contain at least one transcript chunk")

    SYS_PROMPT = "Act as a professional technical meeting minutes writer."
    PROMPT_TEMPLATE = "Write a concise summary of the meeting transcript in maximum 5 sentences:" + "\n\n" + "{text}"
    REFINE_TEMPLATE = (
        "Your job is to produce a final summary\n"
        "We have provided an existing summary up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing summary"
        "(only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary in English within 5 sentences. If the context isn't useful, return the original summary."
    )
    partial_sum = []
    return_dict = {}
    # First chunk gets a fresh summary; each later chunk refines the latest
    # partial summary with the new context (refine-style accumulation).
    for step, chunk in enumerate(chunks):
        if step == 0:
            CUR_PROMPT = PROMPT_TEMPLATE.replace("{text}", chunk)
        else:
            CUR_PROMPT = REFINE_TEMPLATE.replace("{existing_answer}", partial_sum[-1])
            CUR_PROMPT = CUR_PROMPT.replace("{text}", chunk)
        cur_sum = gen_prompt(CUR_PROMPT, SYS_PROMPT)
        # Drop any preamble paragraph the model prepends.
        if len(cur_sum.split("\n\n")) > 1:
            cur_sum = cur_sum.split("\n\n")[1]
        partial_sum.append(cur_sum)

    # Final pass: rewrite the concatenated partial summaries coherently.
    CUR_PROMPT = "Rewrite the following text by maintaining coherency: " + "\n\n"
    CUR_PROMPT += ' '.join(partial_sum)
    tmp = gen_prompt(CUR_PROMPT, SYS_PROMPT)
    parts = tmp.split("\n\n")
    final_sum = parts[0] if len(parts) == 1 else parts[1]

    return_dict['truncated'] = partial_sum[0]
    return_dict['accumulate'] = partial_sum[-1]
    return_dict['rewrite'] = final_sum

    return return_dict
162
+
163
def get_example()->list[str]:
    """Load example transcripts from data/test.json (one JSON object per line).

    Returns four hand-picked transcripts, each re-wrapped with one sentence
    per line for readability in the UI.
    """
    records = []
    with open(filepath + "data/test.json", "r") as f:
        for line in f:
            records.append(json.loads(line))

    # Hand-picked examples of reasonable length.
    chosen = [1, 2, 9, 13]
    return ['\n'.join(nltk.sent_tokenize(records[i]['transcript'])) for i in chosen]
172
+
173
if __name__ == "__main__":
    # The original guard held only a commented-out exploration snippet (a
    # bare string literal printing transcript lengths from data/test.json);
    # it was dead code and has been removed.
    pass