"""Utilities for summarizing meeting transcripts with Meta-Llama-3-8B-Instruct
via the Hugging Face Inference API.

Provides: chat-prompt templating, token-failover querying, a MemWalker-style
memory tree, sentence-aware chunking with overlap, and three summarization
strategies (truncated / accumulate / rewrite).
"""

import requests
import nltk
import random
import json
import os
import pickle
import re

nltk.download('punkt')

# Directory of this module; data files are expected under <here>/data/.
filepath = __file__.replace("\\", "/").replace("utils.py", "")

# NOTE(review): pickle.load is only safe because this file ships with the
# project — never point this at untrusted data.
hf_tokens = []
with open(filepath + "data/hf_tokens.pkl", "rb") as f:
    hf_tokens = pickle.load(f)

MAX_TOKEN_LENGTH = 4096
MAX_CHUNK_SIZE = 16000
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"


def prompt_template(prompt: str, sys_prompt: str = "") -> str:
    """Wrap *prompt* and *sys_prompt* in the Llama-3 instruct chat template.

    BUGFIX: the original called ``str.replace('', prompt)``, which inserts
    the replacement between EVERY character of the template (and then did it
    again with ``sys_prompt``) — the intended placeholders were evidently
    lost.  The content now goes where the Llama-3 format specifies: system
    text after the system header, user text after the user header.
    """
    return (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"{sys_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )


def query(payload: dict, hf_token: str) -> dict:
    """POST *payload* to the Inference API with *hf_token* and return parsed JSON."""
    headers = {"Authorization": f"Bearer {hf_token}"}
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


def gen_prompt(prompt: str, sys_prompt: str = "") -> str:
    """Generate a completion for *prompt*, probing stored tokens for a live one.

    Each token is tested with a tiny request; the first one that does not
    return an ``error`` payload is used for the real generation.

    Raises:
        RuntimeError: if none of the stored tokens can reach the model.
            (BUGFIX: previously an empty token fell through to the real
            request and produced a confusing downstream failure.)
    """
    input_prompt = prompt_template(prompt, sys_prompt)
    selected_token = ''
    for token in hf_tokens:
        test_output = query({
            "inputs": prompt_template("Who are you?"),
            "parameters": {"max_new_tokens": 100}
        }, token)
        if 'error' not in test_output:
            selected_token = token
            break
    if not selected_token:
        raise RuntimeError("No working Hugging Face token available")
    output = query({
        "inputs": input_prompt,
        "parameters": {"max_new_tokens": 512},
    }, selected_token)
    # The API echoes the prompt; strip it to return only new text.
    return output[0]['generated_text'][len(input_prompt):]


class Node:
    """A memory-tree node: a summary plus parent/children links."""

    def __init__(self, summary=None):
        self.summary = summary  # summary text for the subtree rooted here
        self.children = []      # child Nodes (0 or 2 in a binary build)
        self.parent = None      # assigned by the parent's add_child()

    def add_child(self, child_node):
        """Attach *child_node* under this node and set its parent pointer."""
        child_node.parent = self
        self.children.append(child_node)


class MemWalker:
    """Builds a MemWalker-style binary memory tree over transcript segments."""

    def __init__(self, segments):
        self.segments = segments
        self.root = 0  # replaced by the root Node after build_memory_tree()

    def build_memory_tree(self):
        """Summarize each segment, then pairwise merge summaries up to a root."""
        if not self.segments:
            # Guard: the original crashed with a bare IndexError on leaves[0].
            raise ValueError("build_memory_tree requires at least one segment")
        # Step 1: leaf summaries, one per raw segment (sum_type 0).
        level = [Node(summarize(seg, 0)) for seg in self.segments]
        # Step 2: repeatedly merge adjacent pairs until one node remains.
        while len(level) > 1:
            next_level = []
            for i in range(0, len(level), 2):
                if i + 1 < len(level):
                    combined = summarize(
                        level[i].summary + ", " + level[i + 1].summary, 1)
                    parent_node = Node(combined)
                    parent_node.add_child(level[i])
                    parent_node.add_child(level[i + 1])
                else:
                    # Odd node out: promote it unchanged to the next level.
                    parent_node = level[i]
                next_level.append(parent_node)
            level = next_level
        self.root = level[0]


def _second_paragraph(text: str) -> str:
    """Return the paragraph after the model's one-line preamble, or *text*
    unchanged when the reply has a single paragraph."""
    parts = text.split("\n\n")
    return parts[1] if len(parts) > 1 else text


def summarize(text, sum_type: int = 1):
    """Ask the LLM to summarize *text*.

    Args:
        text: transcript segment (sum_type 0) or concatenated summaries
            (sum_type 1).
        sum_type: 0 = summarize a raw segment, 1 = compress existing
            summaries.

    Raises:
        ValueError: for any other sum_type.  (BUGFIX: was an ``assert``,
        which disappears under ``python -O``.)
    """
    if sum_type not in (0, 1):
        raise ValueError("sum_type should be either 0 or 1")
    if sum_type == 0:
        USER_PROMPT = "Write a concise summary of the meeting transcript in maximum 5 sentences:" + "\n\n" + text
    else:
        USER_PROMPT = "Compress the following summaries into a much shorter summary: " + "\n\n" + text
    SYS_PROMPT = "Act as a professional technical meeting minutes writer."
    return _second_paragraph(gen_prompt(USER_PROMPT, SYS_PROMPT))


def split_chunk(transcript: str) -> list:
    """Split *transcript* into chunks under MAX_CHUNK_SIZE characters.

    Chunks are built sentence-by-sentence; each new chunk starts with up to
    10 trailing sentences of the previous chunk for context overlap.
    """
    sentences = nltk.sent_tokenize(transcript)
    chunks = []
    current_chunk = ""
    idx = 0
    while idx < len(sentences):
        if len(current_chunk + sentences[idx]) < MAX_CHUNK_SIZE:
            current_chunk += sentences[idx] + " "
        else:
            chunks.append(current_chunk)
            current_chunk = ''
            # BUGFIX: clamp the overlap window at 0 — the original indexed
            # sentences[idx - 10] even when idx < 10, silently wrapping to
            # the END of the list via Python's negative indexing.
            for j in range(max(0, idx - 10), idx + 1):
                current_chunk += sentences[j] + " "
        idx += 1
    chunks.append(current_chunk)
    return chunks


def summarize_three_ways(chunks: list[str]) -> dict:
    """Summarize *chunks* with three strategies.

    Returns a dict with keys:
        'truncated'  — summary of the first chunk only,
        'accumulate' — iteratively refined summary over all chunks,
        'rewrite'    — the partial summaries rewritten into coherent text.
    """
    SYS_PROMPT = "Act as a professional technical meeting minutes writer."
    PROMPT_TEMPLATE = "Write a concise summary of the meeting transcript in maximum 5 sentences:" + "\n\n" + "{text}"
    REFINE_TEMPLATE = (
        "Your job is to produce a final summary\n"
        "We have provided an existing summary up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing summary"
        "(only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary in English within 5 sentences. If the context isn't useful, return the original summary."
    )
    partial_sum = []
    for step, chunk in enumerate(chunks):
        if step == 0:
            # First chunk: plain summarization.
            cur_prompt = PROMPT_TEMPLATE.replace("{text}", chunk)
        else:
            # Later chunks: refine the running summary with new context.
            cur_prompt = REFINE_TEMPLATE.replace("{existing_answer}", partial_sum[-1])
            cur_prompt = cur_prompt.replace("{text}", chunk)
        cur_sum = gen_prompt(cur_prompt, SYS_PROMPT)
        partial_sum.append(_second_paragraph(cur_sum))
    CUR_PROMPT = "Rewrite the following text by maintaining coherency: " + "\n\n"
    CUR_PROMPT += ' '.join(partial_sum)
    final_sum = _second_paragraph(gen_prompt(CUR_PROMPT, SYS_PROMPT))
    return {
        'truncated': partial_sum[0],
        'accumulate': partial_sum[-1],
        'rewrite': final_sum,
    }


def get_example() -> list[str]:
    """Load the fixed demo transcripts from data/test.json (JSON Lines).

    Returns each selected transcript with one sentence per line.
    """
    data = []
    with open(filepath + "data/test.json", "r") as f:
        for line in f:
            data.append(json.loads(line))
    # Hand-picked demo indices; swap in random.sample(range(len(data)), k)
    # to vary the selection.
    random_idx = [2, 89, 94, 97]
    return ['\n'.join(nltk.sent_tokenize(data[i]['transcript'])) for i in random_idx]


if __name__ == "__main__":
    # Quick inspection: transcript lengths of the first 100 test records.
    data = []
    with open(filepath + "data/test.json", "r") as f:
        for line in f:
            data.append(json.loads(line))
    for j, record in enumerate(data[:100]):
        print(j, len(record['transcript']))