Commit ae46fe0
Parent(s): 40235f1
initial

Files changed:
- app.py +87 -0
- requirements.txt +7 -0
- utils/__init__.py +6 -0
- utils/__pycache__/__init__.cpython-39.pyc +0 -0
- utils/__pycache__/llms.cpython-39.pyc +0 -0
- utils/__pycache__/refine_summary.cpython-39.pyc +0 -0
- utils/refine_summary.py +109 -0
app.py
ADDED
@@ -0,0 +1,87 @@
import gradio as gr
import whisper
from langchain_openai import ChatOpenAI
from utils import RefineDataSummarizer
import os

def transcript(file_dir, model_type):
    # Load the selected Whisper checkpoint from the local models/ directory.
    model_dir = os.path.join('models', model_type)
    model = whisper.load_model(model_dir)
    result = model.transcribe(file_dir, language='English', task='transcribe')

    lines = [s['text'] for s in result['segments']]
    text = ''
    for line in lines:
        text += f"{line}\n"
    return text


def upload_file(file_paths):
    return file_paths


def summary(text, chunk_num, chunk_overlap, api_key, llm_type):
    print(text)
    api_key = api_key.strip()
    llm = ChatOpenAI(temperature=1, openai_api_key=api_key, model_name=llm_type)
    rds = RefineDataSummarizer(llm=llm)
    result = rds.get_summarization(text, chunk_num=chunk_num, chunk_overlap=chunk_overlap)
    return result["output_text"]


with gr.Blocks() as demo:
    with gr.Row(equal_height=False):
        with gr.Column():
            file_output = gr.File()
            upload_button = gr.UploadButton("Click to Upload a File", file_types=["audio", "video"], file_count="single")
            upload_button.upload(upload_file, upload_button, file_output)
            model_type = gr.Dropdown(
                [
                    "tiny.en.pt",
                    "tiny.pt",
                    "small.en.pt",
                    "small.pt",
                    "base.en.pt",
                    "base.pt",
                    "medium.en.pt",
                    "medium.pt",
                    "large-v1.pt",
                    "large-v2.pt",
                ], label="Model Type", value="medium.en.pt")
            TranscriptButton = gr.Button("Transcript", variant="primary")

        with gr.Column():
            transcript_text = gr.Textbox(placeholder="Transcript Result", label="Transcript")
            chunk_num = gr.Number(precision=0, minimum=1, maximum=9999, step=1, label="Chunk Number", value=1)
            chunk_overlap = gr.Number(precision=0, minimum=1, maximum=9999, step=1, label="Chunk Overlap", value=100)
            # Never ship a real key as a default value; users paste their own at runtime.
            api_key = gr.Textbox(placeholder="key", label="Your API Key", value='')
            llm_type = gr.Dropdown(
                [
                    "gpt-3.5-turbo",
                    "gpt-3.5-turbo-16k",
                    "gpt-4-1106-preview"
                ], label="LLM Type", value="gpt-4-1106-preview")
            SummaryButton = gr.Button("Summary", variant="primary")
            summary_text = gr.Textbox(placeholder="Summary Result", label="Summary")


    TranscriptButton.click(
        fn=transcript,
        inputs=[
            file_output,
            model_type
        ],
        outputs=[transcript_text]
    )
    SummaryButton.click(
        fn=summary,
        inputs=[
            transcript_text,
            chunk_num,
            chunk_overlap,
            api_key,
            llm_type
        ],
        outputs=[summary_text]
    )

demo.launch()
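For quick local testing, a minimal sketch of the same pipeline without the Gradio UI; the checkpoint path, sample file name, and key placeholder are illustrative assumptions, not part of this commit:

import whisper
from langchain_openai import ChatOpenAI
from utils import RefineDataSummarizer

# Transcribe with a local Whisper checkpoint (assumed models/ layout).
model = whisper.load_model("models/medium.en.pt")
result = model.transcribe("sample.mp3", language="English", task="transcribe")
text = "\n".join(s["text"] for s in result["segments"])

# Summarize the transcript with the refine chain (hypothetical key placeholder).
llm = ChatOpenAI(temperature=1, openai_api_key="sk-...", model_name="gpt-4-1106-preview")
rds = RefineDataSummarizer(llm=llm)
print(rds.get_summarization(text, chunk_num=1, chunk_overlap=100)["output_text"])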
requirements.txt
ADDED
@@ -0,0 +1,7 @@
openai-whisper==20231117
langchain-openai
langchain-community
openai==1.13.3
torch
torchvision
torchaudio
utils/__init__.py
ADDED
@@ -0,0 +1,6 @@
from .refine_summary import RefineDataSummarizer

__all__ = [
    "RefineDataSummarizer"
]
utils/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (243 Bytes)

utils/__pycache__/llms.cpython-39.pyc
ADDED
Binary file (1.39 kB)

utils/__pycache__/refine_summary.cpython-39.pyc
ADDED
Binary file (3.43 kB)
utils/refine_summary.py
ADDED
@@ -0,0 +1,109 @@
"""Definitions for refine data summarizer."""
from typing import Dict

from langchain.chat_models.base import BaseChatModel
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import TokenTextSplitter
from langchain.chains.summarize import load_summarize_chain


class RefineDataSummarizer:
    """Refine data summarizer."""

    # Context-window sizes (in tokens) used to derive the chunk size.
    token_limit = {"gpt-3.5-turbo": 4096,
                   "gpt-4": 8192,
                   "gpt-3.5-turbo-16k": 16385,
                   "gpt-3.5-turbo-1106": 16385,
                   "gpt-4-1106-preview": 128000,
                   "gemini-pro": 32768,
                   "codechat-bison": 8192,
                   "chat-bison": 8192
                   }

    def __init__(
        self,
        llm: BaseChatModel
    ):
        """Initialize the data summarizer."""
        self.llm = llm
        self.llm_model = self.llm.model_name

        # Initial prompt for the bullet-point variant.
        prompt_template_bullet_point = (
            "Write a summary of the following text.\n"
            "TEXT: {text}\n"
            "SUMMARY:\n"
        )

        prompt_bullet_point = PromptTemplate(
            template=prompt_template_bullet_point, input_variables=["text"]
        )

        refine_prompt_template_bullet_point = (
            "Write a concise summary of the following text delimited by triple backquotes.\n"
            "Return your response in bullet points which cover the key points of the text.\n"
            "```{text}```\n"
            "BULLET POINT SUMMARY:\n"
        )

        refine_prompt_bullet_point = PromptTemplate(
            template=refine_prompt_template_bullet_point, input_variables=["text"]
        )

        prompt_template = (
            "Write a concise summary of the following:\n"
            "{text}\n"
            "CONCISE SUMMARY:\n"
        )

        prompt = PromptTemplate.from_template(prompt_template)

        refine_template = (
            "Your job is to produce a final summary.\n"
            "We have provided an existing summary up to a certain point: {existing_answer}\n"
            "We have the opportunity to refine the existing summary "
            "(only if needed) with some more context below.\n"
            "------------\n"
            "{text}\n"
            "------------\n"
            "Given the new context, refine the original summary. "
            "If the context isn't useful, return the original summary."
        )
        refine_prompt = PromptTemplate.from_template(refine_template)

        self.prompt = prompt
        self.refine_prompt = refine_prompt

        self.prompt_bullet_point = prompt_bullet_point
        self.refine_prompt_bullet_point = refine_prompt_bullet_point

    def get_summarization(self,
                          text: str,
                          chunk_num: int = 5,
                          chunk_overlap: int = 30,
                          bullet_point: bool = True) -> Dict:
        """Get summarization."""
        if bullet_point:
            prompt = self.prompt_bullet_point
            refine_prompt = self.refine_prompt_bullet_point
        else:
            prompt = self.prompt
            refine_prompt = self.refine_prompt

        # Size each chunk so that roughly chunk_num chunks fill the model's context window.
        text_splitter = TokenTextSplitter(
            chunk_size=self.token_limit[self.llm_model] // chunk_num,
            chunk_overlap=chunk_overlap,
        )
        docs = [Document(page_content=t, metadata={"source": "local"}) for t in text_splitter.split_text(text)]
        chain = load_summarize_chain(
            llm=self.llm,
            chain_type="refine",
            question_prompt=prompt,
            refine_prompt=refine_prompt,
            return_intermediate_steps=True,
            input_key="input_documents",
            output_key="output_text",
            verbose=True,
        )
        result = chain({"input_documents": docs}, return_only_outputs=True)
        return result
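For reference, a minimal standalone usage sketch of RefineDataSummarizer; the key placeholder and input text are illustrative assumptions, not part of this commit:

from langchain_openai import ChatOpenAI
from utils import RefineDataSummarizer

llm = ChatOpenAI(temperature=1, openai_api_key="sk-...", model_name="gpt-3.5-turbo-16k")
rds = RefineDataSummarizer(llm=llm)
result = rds.get_summarization("...long transcript text...", chunk_num=2, chunk_overlap=30, bullet_point=True)
print(result["output_text"])  # final refined summary
# result["intermediate_steps"] holds the per-chunk summaries,
# since the chain is built with return_intermediate_steps=True.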