Commit ae46fe0
Parent(s): 40235f1
initial

Files changed:
- app.py +87 -0
- requirements.txt +7 -0
- utils/__init__.py +6 -0
- utils/__pycache__/__init__.cpython-39.pyc +0 -0
- utils/__pycache__/llms.cpython-39.pyc +0 -0
- utils/__pycache__/refine_summary.cpython-39.pyc +0 -0
- utils/refine_summary.py +109 -0
app.py
ADDED
@@ -0,0 +1,87 @@
import gradio as gr
import whisper
from langchain_openai import ChatOpenAI
from utils import RefineDataSummarizer
import os

def transcript(file_dir, model_type):
    # Load the selected Whisper checkpoint from the local models/ directory.
    model_dir = os.path.join('models', model_type)
    model = whisper.load_model(model_dir)
    result = model.transcribe(file_dir, language='English', task='transcribe')

    lines = [s['text'] for s in result['segments']]
    text = ''
    for line in lines:
        text += f"{line}\n"
    return text


def upload_file(file_paths):
    return file_paths


def summary(text, chunk_num, chunk_overlap, api_key, llm_type):
    print(text)
    api_key = api_key.strip()
    llm = ChatOpenAI(temperature=1, openai_api_key=api_key, model_name=llm_type)
    rds = RefineDataSummarizer(llm=llm)
    result = rds.get_summarization(text, chunk_num=chunk_num, chunk_overlap=chunk_overlap)
    return result["output_text"]


with gr.Blocks() as demo:
    with gr.Row(equal_height=False):
        with gr.Column():
            file_output = gr.File()
            upload_button = gr.UploadButton("Click to Upload a File", file_types=["audio", "video"], file_count="single")
            upload_button.upload(upload_file, upload_button, file_output)
            model_type = gr.Dropdown(
                [
                    "tiny.en.pt",
                    "tiny.pt",
                    "small.en.pt",
                    "small.pt",
                    "base.en.pt",
                    "base.pt",
                    "medium.en.pt",
                    "medium.pt",
                    "large-v1.pt",
                    "large-v2.pt",
                ], label="Model Type", value="medium.en.pt")
            TranscriptButton = gr.Button("Transcript", variant="primary")

        with gr.Column():
            transcript_text = gr.Textbox(placeholder="Transcript Result", label="Transcript")
            chunk_num = gr.Number(precision=0, minimum=1, maximum=9999, step=1, label="Chunk Number", value=1)
            chunk_overlap = gr.Number(precision=0, minimum=1, maximum=9999, step=1, label="Chunk Overlap", value=100)
            # Never ship a real key as a default value; users paste their own at runtime.
            api_key = gr.Textbox(placeholder="key", label="Your API Key", value='')
            llm_type = gr.Dropdown(
                [
                    "gpt-3.5-turbo",
                    "gpt-3.5-turbo-16k",
                    "gpt-4-1106-preview"
                ], label="LLM Type", value="gpt-4-1106-preview")
            SummaryButton = gr.Button("Summary", variant="primary")
            summary_text = gr.Textbox(placeholder="Summary Result", label="Summary")


    TranscriptButton.click(
        fn=transcript,
        inputs=[
            file_output,
            model_type
        ],
        outputs=[transcript_text]
    )
    SummaryButton.click(
        fn=summary,
        inputs=[
            transcript_text,
            chunk_num,
            chunk_overlap,
            api_key,
            llm_type
        ],
        outputs=[summary_text]
    )

demo.launch()
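For quick local testing, a minimal sketch of the same pipeline without the Gradio UI; the checkpoint path, sample file name, and key placeholder are illustrative assumptions, not part of this commit:

import whisper
from langchain_openai import ChatOpenAI
from utils import RefineDataSummarizer

# Transcribe with a local Whisper checkpoint (assumed models/ layout).
model = whisper.load_model("models/medium.en.pt")
result = model.transcribe("sample.mp3", language="English", task="transcribe")
text = "\n".join(s["text"] for s in result["segments"])

# Summarize the transcript with the refine chain (hypothetical key placeholder).
llm = ChatOpenAI(temperature=1, openai_api_key="sk-...", model_name="gpt-4-1106-preview")
rds = RefineDataSummarizer(llm=llm)
print(rds.get_summarization(text, chunk_num=1, chunk_overlap=100)["output_text"])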
requirements.txt
ADDED
@@ -0,0 +1,7 @@
openai-whisper==20231117
langchain-openai
langchain-community
openai==1.13.3
torch
torchvision
torchaudio
utils/__init__.py
ADDED
@@ -0,0 +1,6 @@
from .refine_summary import RefineDataSummarizer

__all__ = [
    "RefineDataSummarizer"
]
utils/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (243 Bytes)

utils/__pycache__/llms.cpython-39.pyc
ADDED
Binary file (1.39 kB)

utils/__pycache__/refine_summary.cpython-39.pyc
ADDED
Binary file (3.43 kB)
utils/refine_summary.py
ADDED
@@ -0,0 +1,109 @@
"""Definitions for refine data summarizer."""
from typing import Dict

from langchain.chat_models.base import BaseChatModel
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import TokenTextSplitter
from langchain.chains.summarize import load_summarize_chain


class RefineDataSummarizer:
    """Refine data summarizer."""

    # Context-window sizes (in tokens) used to derive the chunk size.
    token_limit = {"gpt-3.5-turbo": 4096,
                   "gpt-4": 8192,
                   "gpt-3.5-turbo-16k": 16385,
                   "gpt-3.5-turbo-1106": 16385,
                   "gpt-4-1106-preview": 128000,
                   "gemini-pro": 32768,
                   "codechat-bison": 8192,
                   "chat-bison": 8192
                   }

    def __init__(
        self,
        llm: BaseChatModel
    ):
        """Initialize the data summarizer."""
        self.llm = llm
        self.llm_model = self.llm.model_name

        # Initial prompt for the bullet-point variant.
        prompt_template_bullet_point = (
            "Write a summary of the following text.\n"
            "TEXT: {text}\n"
            "SUMMARY:\n"
        )

        prompt_bullet_point = PromptTemplate(
            template=prompt_template_bullet_point, input_variables=["text"]
        )

        refine_prompt_template_bullet_point = (
            "Write a concise summary of the following text delimited by triple backquotes.\n"
            "Return your response in bullet points which cover the key points of the text.\n"
            "```{text}```\n"
            "BULLET POINT SUMMARY:\n"
        )

        refine_prompt_bullet_point = PromptTemplate(
            template=refine_prompt_template_bullet_point, input_variables=["text"]
        )

        prompt_template = (
            "Write a concise summary of the following:\n"
            "{text}\n"
            "CONCISE SUMMARY:\n"
        )

        prompt = PromptTemplate.from_template(prompt_template)

        refine_template = (
            "Your job is to produce a final summary.\n"
            "We have provided an existing summary up to a certain point: {existing_answer}\n"
            "We have the opportunity to refine the existing summary "
            "(only if needed) with some more context below.\n"
            "------------\n"
            "{text}\n"
            "------------\n"
            "Given the new context, refine the original summary. "
            "If the context isn't useful, return the original summary."
        )
        refine_prompt = PromptTemplate.from_template(refine_template)

        self.prompt = prompt
        self.refine_prompt = refine_prompt

        self.prompt_bullet_point = prompt_bullet_point
        self.refine_prompt_bullet_point = refine_prompt_bullet_point

    def get_summarization(self,
                          text: str,
                          chunk_num: int = 5,
                          chunk_overlap: int = 30,
                          bullet_point: bool = True) -> Dict:
        """Get summarization."""
        if bullet_point:
            prompt = self.prompt_bullet_point
            refine_prompt = self.refine_prompt_bullet_point
        else:
            prompt = self.prompt
            refine_prompt = self.refine_prompt

        # Size each chunk so that roughly chunk_num chunks fill the model's context window.
        text_splitter = TokenTextSplitter(
            chunk_size=self.token_limit[self.llm_model] // chunk_num,
            chunk_overlap=chunk_overlap,
        )
        docs = [Document(page_content=t, metadata={"source": "local"}) for t in text_splitter.split_text(text)]
        chain = load_summarize_chain(
            llm=self.llm,
            chain_type="refine",
            question_prompt=prompt,
            refine_prompt=refine_prompt,
            return_intermediate_steps=True,
            input_key="input_documents",
            output_key="output_text",
            verbose=True,
        )
        result = chain({"input_documents": docs}, return_only_outputs=True)
        return result
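For reference, a minimal standalone usage sketch of RefineDataSummarizer; the key placeholder and input text are illustrative assumptions, not part of this commit:

from langchain_openai import ChatOpenAI
from utils import RefineDataSummarizer

llm = ChatOpenAI(temperature=1, openai_api_key="sk-...", model_name="gpt-3.5-turbo-16k")
rds = RefineDataSummarizer(llm=llm)
result = rds.get_summarization("...long transcript text...", chunk_num=2, chunk_overlap=30, bullet_point=True)
print(result["output_text"])  # final refined summary
# result["intermediate_steps"] holds the per-chunk summaries,
# since the chain is built with return_intermediate_steps=True.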