XINZHANG94 commited on
Commit
ae46fe0
·
1 Parent(s): 40235f1
app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import whisper
3
+ from langchain_openai import ChatOpenAI
4
+ from utils import RefineDataSummarizer
5
+ import os
6
+
7
def transcript(file_dir, model_type):
    """Transcribe an uploaded audio/video file to English text with Whisper.

    Args:
        file_dir: Path to the uploaded audio/video file.
        model_type: Whisper checkpoint filename (e.g. "medium.en.pt") located
            under the local "models" directory.

    Returns:
        The transcription as a single string, one segment per line.
    """
    # BUG FIX: the original line was `os.path('models', model_type)'` — it
    # "calls" the os.path module (TypeError at best) and carries a stray
    # trailing quote (SyntaxError). os.path.join builds the intended path.
    model_dir = os.path.join('models', model_type)
    model = whisper.load_model(model_dir)
    result = model.transcribe(file_dir, language='English', task='transcribe')

    # Join segments in one pass instead of quadratic `text += line` in a loop.
    return ''.join(f"{segment['text']}\n" for segment in result['segments'])
17
+
18
+
19
def upload_file(file_paths):
    """Relay the uploaded file path(s) to the file-preview component unchanged."""
    selected = file_paths
    return selected
21
+
22
+
23
def summary(text, chunk_num, chunk_overlap, api_key, llm_type):
    """Summarize transcript text with a LangChain refine chain over an OpenAI chat model.

    Args:
        text: Transcript text to summarize.
        chunk_num: Number of chunks the text is split into.
        chunk_overlap: Token overlap between consecutive chunks.
        api_key: OpenAI API key (surrounding whitespace is stripped).
        llm_type: OpenAI chat model name, e.g. "gpt-4-1106-preview".

    Returns:
        The final summary string produced by the refine chain.
    """
    # Removed a leftover debug `print(text)` that dumped the (potentially
    # very large) transcript to stdout on every call.
    llm = ChatOpenAI(
        temperature=1,
        openai_api_key=api_key.strip(),
        model_name=llm_type,
    )
    rds = RefineDataSummarizer(llm=llm)
    result = rds.get_summarization(text, chunk_num=chunk_num, chunk_overlap=chunk_overlap)
    return result["output_text"]
30
+
31
+
32
with gr.Blocks() as demo:
    with gr.Row(equal_height=False):
        with gr.Column():
            # Upload widget feeds the file preview, which in turn feeds transcription.
            file_output = gr.File()
            upload_button = gr.UploadButton(
                "Click to Upload a File",
                file_types=["audio", "video"],
                file_count="single",
            )
            upload_button.upload(upload_file, upload_button, file_output)
            model_type = gr.Dropdown(
                [
                    "tiny.en.pt",
                    "tiny.pt",
                    "small.en.pt",
                    "small.pt",
                    "base.en.pt",
                    "base.pt",
                    "medium.en.pt",
                    "medium.pt",
                    "large-v1.pt",
                    "large-v2.pt",
                ],
                label="Model Type",
                value="medium.en.pt",
            )
            transcript_button = gr.Button("Transcript", variant="primary")

        with gr.Column():
            transcript_text = gr.Textbox(placeholder="Transcript Result", label="Transcript")
            chunk_num = gr.Number(precision=0, minimum=1, maximum=9999, step=1, label="Chunk Number", value=1)
            chunk_overlap = gr.Number(precision=0, minimum=1, maximum=9999, step=1, label="Chunk Overlap", value=100)
            # SECURITY FIX: the original committed a real OpenAI API key as the
            # textbox default value. Never ship secrets in source — default to
            # the OPENAI_API_KEY environment variable (empty if unset) and mask
            # the field so the key is not echoed on screen.
            api_key = gr.Textbox(
                placeholder="key",
                label="Your API Key",
                value=os.getenv("OPENAI_API_KEY", ""),
                type="password",
            )
            llm_type = gr.Dropdown(
                [
                    "gpt-3.5-turbo",
                    "gpt-3.5-turbo-16k",
                    "gpt-4-1106-preview",
                ],
                label="LLM Type",
                value="gpt-4-1106-preview",
            )
            # Local-name typo fixed: "SunmmaryButton" -> summary_button.
            summary_button = gr.Button("Summary", variant="primary")
            summary_text = gr.Textbox(placeholder="Summary Result", label="Summary")

    transcript_button.click(
        fn=transcript,
        inputs=[file_output, model_type],
        outputs=[transcript_text],
    )
    summary_button.click(
        fn=summary,
        inputs=[transcript_text, chunk_num, chunk_overlap, api_key, llm_type],
        outputs=[summary_text],
    )

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ openai-whisper==20231117
2
+ langchain-openai
3
+ langchain-community
4
+ openai==1.13.3
5
+ torch
6
+ torchvision
7
+ torchaudio
utils/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from .refine_summary import RefineDataSummarizer
2
+
3
+ __all__ = [
4
+
5
+ "RefineDataSummarizer"
6
+ ]
utils/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (243 Bytes). View file
 
utils/__pycache__/llms.cpython-39.pyc ADDED
Binary file (1.39 kB). View file
 
utils/__pycache__/refine_summary.cpython-39.pyc ADDED
Binary file (3.43 kB). View file
 
utils/refine_summary.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Definitions for refine data summarizer."""
2
+ from typing import Any, List, Dict
3
+
4
+ from langchain.chat_models.base import BaseChatModel
5
+ from langchain.docstore.document import Document
6
+ from langchain.prompts import PromptTemplate
7
+ from langchain.document_loaders import TextLoader
8
+ from langchain.text_splitter import TokenTextSplitter, CharacterTextSplitter
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain.chains.summarize import load_summarize_chain
11
+
12
+
13
class RefineDataSummarizer:
    """Summarize long text with a LangChain "refine" summarization chain.

    The text is token-split into roughly `chunk_num` chunks sized from the
    model's context window, then summarized chunk-by-chunk, refining the
    running summary at each step.
    """

    # Context-window sizes (in tokens) per model name, used to derive the
    # per-chunk size in get_summarization.
    token_limit = {
        "gpt-3.5-turbo": 4096,
        "gpt-4": 8192,
        "gpt-3.5-turbo-16k": 16385,
        "gpt-3.5-turbo-1106": 16385,
        "gpt-4-1106-preview": 128000,
        "gemini-pro": 32768,
        "codechat-bison": 8192,
        "chat-bison": 8192,
    }

    def __init__(
        self,
        llm: BaseChatModel
    ):
        """Store the chat model and build the summarization prompts.

        Args:
            llm: Any LangChain chat model exposing a `model_name` attribute.
        """
        self.llm = llm
        self.llm_model = self.llm.model_name

        # Bullet-point variant: initial pass.
        prompt_template_bullet_point = (
            "Write a summary of the following text.\n"
            "TEXT: {text}\n"
            "SUMMARY:\n"
        )
        prompt_bullet_point = PromptTemplate(
            template=prompt_template_bullet_point, input_variables=["text"]
        )

        # Bullet-point variant: refine pass.
        refine_prompt_template_bullet_point = (
            "Write a concise summary of the following text delimited by triple backquotes.\n"
            "Return your response in bullet points which covers the key points of the text.\n"
            " ```{text}```\n"
            "BULLET POINT SUMMARY:\n"
        )
        refine_prompt_bullet_point = PromptTemplate(
            template=refine_prompt_template_bullet_point, input_variables=["text"]
        )

        # Plain-prose variant: initial pass.
        prompt_template = (
            "Write a concise summary of the following:\n"
            "{text}\n"
            "CONCISE SUMMARY:\n"
        )
        prompt = PromptTemplate.from_template(prompt_template)

        # BUG FIX: the original refine template (copied from the LangChain
        # docs example) instructed the model to "refine the original summary
        # in Italian" — wrong for this English-transcription app — and the
        # string-literal joins ran words together ("summary(only if needed)",
        # "ItalianIf"). Spacing and language corrected below.
        refine_template = (
            "Your job is to produce a final summary.\n"
            "We have provided an existing summary up to a certain point: {existing_answer}\n"
            "We have the opportunity to refine the existing summary "
            "(only if needed) with some more context below.\n"
            "------------\n"
            "{text}\n"
            "------------\n"
            "Given the new context, refine the original summary. "
            "If the context isn't useful, return the original summary."
        )
        refine_prompt = PromptTemplate.from_template(refine_template)

        self.prompt = prompt
        self.refine_prompt = refine_prompt

        self.prompt_bullet_point = prompt_bullet_point
        self.refine_prompt_bullet_point = refine_prompt_bullet_point

    def get_summarization(self,
                          text: str,
                          chunk_num: int = 5,
                          chunk_overlap: int = 30,
                          bullet_point: bool = True) -> Dict:
        """Run the refine summarization chain over `text`.

        Args:
            text: The full text to summarize.
            chunk_num: Target number of chunks to split the text into.
            chunk_overlap: Token overlap between consecutive chunks.
            bullet_point: Use the bullet-point prompts when True, plain
                prose prompts otherwise.

        Returns:
            The chain's output dict; the final summary is under "output_text"
            (intermediate refine steps are also included).
        """
        if bullet_point:
            prompt = self.prompt_bullet_point
            refine_prompt = self.refine_prompt_bullet_point
        else:
            prompt = self.prompt
            refine_prompt = self.refine_prompt

        # ROBUSTNESS: the original divided by chunk_num unguarded
        # (ZeroDivisionError for chunk_num=0) and indexed token_limit directly
        # (KeyError for any model name not in the table). Clamp the divisor
        # and fall back to the smallest known context window.
        window = self.token_limit.get(self.llm_model, 4096)
        text_splitter = TokenTextSplitter(
            chunk_size=window // max(1, chunk_num),
            chunk_overlap=chunk_overlap,
        )
        docs = [
            Document(page_content=t, metadata={"source": "local"})
            for t in text_splitter.split_text(text)
        ]
        chain = load_summarize_chain(
            llm=self.llm,
            chain_type="refine",
            question_prompt=prompt,
            refine_prompt=refine_prompt,
            return_intermediate_steps=True,
            input_key="input_documents",
            output_key="output_text",
            verbose=True,
        )
        result = chain({"input_documents": docs}, return_only_outputs=True)
        return result