# Spaces: Sleeping (hosting-page header residue captured with the file; not part of the module)
import json | |
from everything2text4prompt.everything2text4prompt import Everything2Text4Prompt | |
from everything2text4prompt.util import BaseData, YoutubeData, PodcastData | |
from digester.chatgpt_service import LLMService, ChatGPTService | |
from digester.util import Prompt, provide_text_with_css, GradioInputs | |
# Placeholder shown in the target-source output slot while no source has been processed yet
WAITING_FOR_TARGET_INPUT = "Waiting for target source input"
# Signature appended to the end of the full summary shown to the user
RESPONSE_SUFFIX = "⚡by DigestEverythingGPT"
class GradioMethodService:
    """
    GradioMethodService is defined as gradio functions
    Therefore all methods here will fulfill
    - gradio.inputs as signature
    - gradio.outputs as return
    Detailed-level methods called by methods in GradioMethodService will be in other classes (e.g. DigesterService)

    None of the methods use instance state, so they are declared as static methods
    and can be referenced directly on the class.
    """

    @staticmethod
    def write_results_to_file(history, file_name=None):
        """
        Writes the conversation history to a file in Markdown format.
        If no filename is specified, the filename is generated using the current time.

        :param history: iterable of conversation entries; entries at even positions
            are written as `## ` headings, the others as plain paragraphs
        :param file_name: name of the file created inside ./analyzer_logs/
        :return: message containing the absolute path of the written report
        """
        import os
        import time

        if file_name is None:
            file_name = 'chatGPT_report' + time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + '.md'
        os.makedirs('./analyzer_logs/', exist_ok=True)
        report_path = f'./analyzer_logs/{file_name}'
        with open(report_path, 'w', encoding='utf8') as f:
            f.write('# chatGPT report\n')
            for i, content in enumerate(history):
                if not isinstance(content, str):
                    try:
                        content = str(content)
                    except Exception:
                        # Best effort: skip entries that cannot be rendered as text
                        continue
                if i % 2 == 0:
                    f.write('## ')
                f.write(content)
                f.write('\n\n')
        res = 'The above material has been written in ' + os.path.abspath(report_path)
        print(res)
        return res

    @staticmethod
    def fetch_and_summarize(apikey_textbox, source_textbox, source_target_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history):
        """
        Fetch the text of the requested source and run the prompt chain on it.

        Generator: yields (chatbot, history, status, target) tuples. The incoming
        chatbot and history are discarded, so every run starts from an empty conversation.
        """
        g_inputs = GradioInputs(apikey_textbox, source_textbox, source_target_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history)
        g_inputs.history = []
        g_inputs.chatbot = []
        required_fields = (g_inputs.apikey_textbox, g_inputs.source_textbox, g_inputs.source_target_textbox)
        if any(field == "" for field in required_fields):
            LLMService.report_exception(g_inputs.chatbot, g_inputs.history,
                                        chat_input=f"Source target: [{g_inputs.source_textbox}] {g_inputs.source_target_textbox}",
                                        chat_output=f"{provide_text_with_css('ERROR', 'red')} Please provide api key, source and target source")
            yield g_inputs.chatbot, g_inputs.history, 'Error', WAITING_FOR_TARGET_INPUT
            return
        # TODO: invalid input checking
        is_success, text_data = yield from DigesterService.fetch_text(g_inputs)
        if not is_success:
            return  # TODO: error handling testing
        yield from PromptEngineeringStrategy.execute_prompt_chain(g_inputs, text_data)

    @staticmethod
    def ask_question(apikey_textbox, source_textbox, target_source_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history):
        """
        Placeholder for the question-answering feature: appends a "not supported"
        reply to the chatbot and yields (chatbot, history, status).
        """
        g_inputs = GradioInputs(apikey_textbox, source_textbox, target_source_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history)
        prompt = f"ask_question(`{qa_textbox}`)"
        g_inputs.chatbot.append((prompt, "Currently the ask feature is not supported yet"))
        yield g_inputs.chatbot, g_inputs.history, 'Normal'

    @staticmethod
    def test_formatting(apikey_textbox, source_textbox, target_source_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history):
        """
        Append a fixed Markdown sample (lists, tables, LaTeX, fenced code, Chinese text)
        to the chatbot and yield (chatbot, history, status).
        """
        g_inputs = GradioInputs(apikey_textbox, source_textbox, target_source_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history)
        # Raw string: the sample contains Windows paths and LaTeX backslashes that must stay literal
        msg = r"""
# ASCII, table, code test
Overall, this program consists of the following files:
- `main.py`: This is the primary script of the program which uses NLP to analyze and summarize Python code.
- `model.py`: This file defines the `CodeModel` class that is used by `main.py` to model the code as graphs and performs operations on them.
- `parser.py`: This file contains custom parsing functions used by `model.py`.
- `test/`: This directory contains test scripts for `model.py` and `util.py`
- `util.py`: This file provides utility functions for the program such as getting the root directory of the project and reading configuration files.
`util.py` specifically has two functions:
| Function | Input | Output | Functionality |
|----------|-------|--------|---------------|
| `get_project_root()` | None | String containing the path of the parent directory of the script itself | Finds the path of the parent directory of the script itself |
| `get_config()` | None | Dictionary containing the contents of `config.yaml` and `config_secret.yaml`, merged together (with `config_secret.yaml` overwriting any keys with the same name in `config.yaml`) | Reads and merges two YAML configuration files (`config.yaml` and `config_secret.yaml`) located in the `config` directory in the parent directory of the script. Returns the resulting dictionary. |The above material has been written in C:\github\!CodeAnalyzerGPT\CodeAnalyzerGPT\analyzer_logs\chatGPT_report2023-04-07-14-11-55.md
The Hessian matrix is a square matrix that contains information about the second-order partial derivatives of a function. Suppose we have a function $f(x_1,x_2,...,x_n)$ which is twice continuously differentiable. Then the Hessian matrix $H(f)$ of $f$ is defined as the $n\times n$ matrix:
$$H(f) = \begin{bmatrix} \frac{\partial^2 f}{\partial x_1^2} & \frac{\partial^2 f}{\partial x_1 \partial x_2} & \cdots & \frac{\partial^2 f}{\partial x_1 \partial x_n} \ \frac{\partial^2 f}{\partial x_2 \partial x_1} & \frac{\partial^2 f}{\partial x_2^2} & \cdots & \frac{\partial^2 f}{\partial x_2 \partial x_n} \ \vdots & \vdots & \ddots & \vdots \ \frac{\partial^2 f}{\partial x_n \partial x_1} & \frac{\partial^2 f}{\partial x_n \partial x_2} & \cdots & \frac{\partial^2 f}{\partial x_n^2} \ \end{bmatrix}$$
Each element in the Hessian matrix is the second-order partial derivative of the function with respect to a pair of variables, as shown in the matrix above
Here's an example Python code using SymPy module to get the derivative of a mathematical function:
```
import sympy as sp
x = sp.Symbol('x')
f = input('Enter a mathematical function in terms of x: ')
expr = sp.sympify(f)
dfdx = sp.diff(expr, x)
print('The derivative of', f, 'is:', dfdx)
```
This code will prompt the user to enter a mathematical function in terms of x and then use the `diff()` function from SymPy to calculate its derivative with respect to x. The result will be printed on the screen.
# Non-ASCII test
程序整体功能:CodeAnalyzerGPT工程是一个用于自动化代码分析和评审的工具。它使用了OpenAI的GPT模型对代码进行分析,然后根据一定的规则和标准来评价代码的质量和合规性。
程序的构架包含以下几个模块:
1. CodeAnalyzerGPT: 主程序模块,包含了代码分析和评审的主要逻辑。
2. analyzer: 包含了代码分析程序的具体实现。
每个文件的功能可以总结为下表:
| 文件名 | 功能描述 |
| --- | --- |
| C:\github\!CodeAnalyzerGPT\CodeAnalyzerGPT\CodeAnalyzerGPT.py | 主程序入口,调用各种处理逻辑和输出结果 |
| C:\github\!CodeAnalyzerGPT\CodeAnalyzerGPT\analyzer\code_analyzer.py | 代码分析器,包含了对代码文本的解析和分析逻辑 |
| C:\github\!CodeAnalyzerGPT\CodeAnalyzerGPT\analyzer\code_segment.py | 对代码文本进行语句和表达式的分段处理 |
"""
        g_inputs.chatbot.append(("test prompt query", msg))
        yield g_inputs.chatbot, g_inputs.history, 'Normal'

    @staticmethod
    def test_asking(apikey_textbox, source_textbox, target_source_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history):
        """
        Append a set of edge-case chat pairs (empty strings, None on either side,
        inline HTML) to the chatbot and yield (chatbot, history, status).
        """
        g_inputs = GradioInputs(apikey_textbox, source_textbox, target_source_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history)
        msg = f"test_ask(`{qa_textbox}`)"
        sample_pairs = [
            ("test prompt query", msg),
            ("test prompt query 2", msg),
            ("", "test empty message"),
            ("test empty message 2", ""),
            (None, "output msg, test no input msg"),
            ("input msg, , test no output msg", None),
            (None, '<span style="background-color: yellow; color: black; padding: 3px; border-radius: 8px;">WARN</span>'),
        ]
        g_inputs.chatbot.extend(sample_pairs)
        yield g_inputs.chatbot, g_inputs.history, 'Normal'
class DigesterService:
    """Helpers that fetch the source text and push the outcome to the Gradio UI."""

    @staticmethod
    def update_ui(chatbot_input, chatbot_output, status, target_md, chatbot, history, is_append=True):
        """
        For instant chatbot_input+output
        Not suitable if chatbot_output have delay / processing time

        Adds the (input, output) pair to `chatbot` (or overwrites the last pair when
        `is_append` is False), records both messages in `history`, then yields
        (chatbot, history, status, target_md) once.
        """
        # An empty chatbot has no last entry to overwrite, so fall back to appending
        if is_append or not chatbot:
            chatbot.append((chatbot_input, chatbot_output))
        else:
            chatbot[-1] = (chatbot_input, chatbot_output)
        history.append(chatbot_input)
        history.append(chatbot_output)
        yield chatbot, history, status, target_md

    @staticmethod
    def fetch_text(g_inputs: GradioInputs) -> (bool, BaseData):
        """
        Fetch text from source using everything2text4prompt. No OpenAI call here

        Generator: yields one UI update describing the outcome, then returns
        (is_success, text_data) to the caller that delegates with `yield from`.
        """
        converter = Everything2Text4Prompt(openai_api_key=g_inputs.apikey_textbox)
        text_data, is_success, error_msg = converter.convert_text(g_inputs.source_textbox, g_inputs.source_target_textbox)
        chatbot_input = f"Converting source to text for [{g_inputs.source_textbox}] {g_inputs.source_target_textbox} ..."
        target_md = f"[{g_inputs.source_textbox}] {g_inputs.source_target_textbox}"
        if is_success:
            # Only touch text_data on success; it may not carry content when conversion failed
            chatbot_output = f"""
Extracted text successfully:
{text_data.full_content}
"""
            status = "Success"
        else:
            chatbot_output = f"""
{provide_text_with_css("ERROR", "red")} Text extraction failed ({error_msg})
"""
            status = "Error"
        yield from DigesterService.update_ui(chatbot_input, chatbot_output, status, target_md, g_inputs.chatbot, g_inputs.history)
        return is_success, text_data
class PromptEngineeringStrategy:
    """Selects the prompt chain to run based on the source type in the Gradio inputs."""

    @staticmethod
    def execute_prompt_chain(g_inputs: GradioInputs, text_data: BaseData):
        """
        Delegate to the chain matching `g_inputs.source_textbox` ('youtube' or 'podcast').

        Generator: re-yields whatever the selected chain yields. Any other source
        value yields nothing.
        """
        if g_inputs.source_textbox == 'youtube':
            yield from PromptEngineeringStrategy.execute_prompt_chain_youtube(g_inputs, text_data)
        elif g_inputs.source_textbox == 'podcast':
            yield from PromptEngineeringStrategy.execute_prompt_chain_podcast(g_inputs, text_data)

    @staticmethod
    def execute_prompt_chain_youtube(g_inputs: GradioInputs, text_data: YoutubeData):
        """Run the YouTube chain and re-yield its UI updates."""
        yield from YoutubeChain.execute_chain(g_inputs, text_data)

    @staticmethod
    def execute_prompt_chain_podcast(g_inputs: GradioInputs, text_data: PodcastData):
        """Podcast chain is not implemented yet; yields no UI updates."""
        # Must be a generator: the dispatcher delegates with `yield from`, and a plain
        # function returning None would make that raise TypeError
        yield from ()
class Chain:
    """Base class for prompt chains; subclasses override `execute_chain`."""

    @staticmethod
    def execute_chain(g_inputs: GradioInputs, text_data: YoutubeData):
        """
        Run the chain for the given inputs and fetched text.

        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError
class YoutubeChain(Chain):
    """
    Prompt chain for YouTube videos: timestamped summary, video-type classification,
    then a final summary whose task and format depend on the classified type.
    """

    # Asks the model to classify the video; the suffix is sent as-is (it contains literal JSON braces)
    CLASSIFIER_PROMPT = Prompt(
        prompt_prefix="""
[Youtube Video types]
N things: The youtube will shows N items that will be described in the video. For example "17 cheap purchases that save me time", "10 AMAZING Ways AutoGPT Is Being Used RIGHT NOW". Usually the title starts with a number.
Tutorials: how to do or make something in order to teach a skill or how to use a product or software
How-to and DIY: People show how to make or do something yourself, like crafts, recipes, projects, etc
Interview: Interviewee shows their standpoint with a topic.
Others: If the video type is not listed above
[TITLE]
{title}
[TRANSCRIPT]
""",
        prompt_main="""
{transcript}
""",
        prompt_suffix="""
[TASK]
From the above title, transcript, classify the youtube video type listed above.
Give the video type with JSON format like {"type": "N things"}, and exclude other text.
""")
    # NOTE(review): {first_timestamp}, {second_minute} and {third_minute} are not substituted in
    # this class; presumably they are filled in downstream when is_timestamp=True - confirm
    TIMESTAMPED_SUMMARY_PROMPT = Prompt(
        prompt_prefix="""
[TITLE]
{title}
[Transcript with timestamp]
""",
        prompt_main="""
{transcript_with_ts}
""",
        prompt_suffix="""
[TASK]
Convert this into youtube summary.
Combine and merge timestamp to for 2-5 minutes chunk. Maximum {word_limit} using noun for one line. Must not exceed the limit
Start with the timestamp followed by the summarized text for that chunk.
Must use language: {language}
Strictly follow the task rules especially for language and character limit
Maximum {word_limit} using noun for one line. Using noun, not sentence
Example format:
{first_timestamp} - This is the first part
{second_minute}:44 - This is the second part
{third_minute}:02 - This is the third part
""")
    FINAL_SUMMARY_PROMPT = Prompt(
        prompt_prefix="""
[TITLE]
{title}
[TRANSCRIPT]
""",
        prompt_main="""
{transcript}
""",
        prompt_suffix="""
[TASK]
Summarize the above points under 30 words. Step by step showing points for the main concepts.
Use markdown format.
Must use language: {language}
Strictly follow the task rules and use {language} language
{task_constraint}
The format is like:
Summary: (content of summary)
{format_constraint}
""")
    # Extra task instructions per video type; its keys are also the set of accepted video types
    FINAL_SUMMARY_TASK_CONSTRAINTS = {
        "N things": """
Additionally, since it is a N things video, the summary should include the N items stated in the video.
""",
        "Tutorials": """
Additionally, since it is a Tutorial video, provide step by step instructions for the tutorial.
""",
        "Others": """
""",
    }
    # Output layout per video type; {char_limit} and {language} are substituted before use
    FINAL_SUMMARY_FORMAT_CONSTRAINTS = {
        "N things": """
Items mentioned in the video: (content of N things. Put different appropriate emoji in the beginning for each bullet point)
""",
        "Tutorials": """
Instructions: (step by step instructions, up to five concise bullet points, less than 20 words. Put different appropriate emoji for each bullet point)
""",
        "Others": """
Highlights:
- [Emoji] (content of highlights)
- [Emoji] (content of highlights)
- [Emoji] (content of highlights)
- [Emoji] (content of highlights)
- [Emoji] (content of highlights)
For highlight, up to five concise bullet points, less than {char_limit} for each bullet point. Put different appropriate emoji for each bullet point
Must use language {language} as output
""",
    }

    @staticmethod
    def _get_length_limit(language: str, char_limit: int, word_limit: int) -> str:
        """If Chinese/Japan/Korean, use character limit. Otherwise, use word limit"""
        if 'zh' in language or language in ("ja-JP", "ko-KR"):
            return f"{char_limit} {language} characters"
        return f"{word_limit} words"

    @staticmethod
    def execute_chain(g_inputs: GradioInputs, text_data: YoutubeData):
        """
        Run the three sub-chains in order and publish the combined summary.

        Generator: re-yields every UI update of the sub-chains, then overwrites the
        last chatbot entry with the full summary, appends it to the history and
        yields (chatbot, history, "Success", target).
        """
        timestamped_summary = yield from YoutubeChain.execute_timestamped_summary_chain(g_inputs, text_data)
        video_type = yield from YoutubeChain.execute_classifer_chain(g_inputs, text_data)
        final_summary = yield from YoutubeChain.execute_final_summary_chain(g_inputs, text_data, video_type)
        full_summary = f"""
{provide_text_with_css("✅DONE", "green")}
🎞️Video: {text_data.title}
📝Timestamped summary
{timestamped_summary}
📝Summary
{final_summary}
{RESPONSE_SUFFIX}
"""
        prompt_show_user = "Full summary"
        g_inputs.chatbot[-1] = (prompt_show_user, full_summary)
        g_inputs.history.append(prompt_show_user)
        g_inputs.history.append(full_summary)
        yield g_inputs.chatbot, g_inputs.history, "Success", f"[{g_inputs.source_textbox}] {g_inputs.source_target_textbox}"

    @classmethod
    def execute_classifer_chain(cls, g_inputs: GradioInputs, youtube_data: YoutubeData):
        """
        Ask the model for the video type and return it.

        Generator: re-yields the UI updates of the model call. Returns a key of
        FINAL_SUMMARY_TASK_CONSTRAINTS; when the response cannot be parsed or names
        an unknown type, a warning is shown and 'Others' is returned.
        """
        TRANSCRIPT_CHAR_LIMIT = 200  # Because the classifier doesn't need to see the whole transcript
        prompt = Prompt(cls.CLASSIFIER_PROMPT.prompt_prefix.format(title=youtube_data.title),
                        cls.CLASSIFIER_PROMPT.prompt_main.format(transcript=youtube_data.full_content[:TRANSCRIPT_CHAR_LIMIT]),
                        cls.CLASSIFIER_PROMPT.prompt_suffix
                        )
        prompt_show_user = "Classify the video type for me"
        response, _ = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs)
        try:
            video_type = json.loads(response)['type']
            if video_type not in cls.FINAL_SUMMARY_TASK_CONSTRAINTS:
                raise Exception(f"Video type is not valid: {video_type}. Use default: Others")
        except Exception as e:
            # Any failure (malformed JSON, missing key, unknown type) degrades to the default type
            yield from ChatGPTService.say_using_ginputs(None, f"{provide_text_with_css('WARN', 'yellow')} {e}", "Success", g_inputs)
            video_type = 'Others'
        return video_type

    @classmethod
    def execute_timestamped_summary_chain(cls, g_inputs: GradioInputs, youtube_data: YoutubeData):
        """
        Ask the model for a timestamped summary and return its response.

        Generator: re-yields the UI updates of the model call. Each transcript entry
        is rendered as "M:SS text" using its 'start' and 'text' fields.
        """
        transcript_with_ts = "".join(
            f"{int(entry['start'] // 60)}:{int(entry['start'] % 60):02d} {entry['text']}\n"
            for entry in youtube_data.ts_transcript_list
        )
        word_limit = cls._get_length_limit(g_inputs.language_textbox, 15, 8)
        # str.replace instead of str.format: the suffix keeps other {placeholders} that must survive untouched
        prompt_suffix = (cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_suffix
                         .replace("{language}", g_inputs.language_textbox)
                         .replace("{word_limit}", word_limit))
        prompt = Prompt(cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
                        cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_main.format(transcript_with_ts=transcript_with_ts),
                        prompt_suffix
                        )
        prompt_show_user = "Generate the timestamped summary"
        response, _ = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs, is_timestamp=True)
        return response

    @classmethod
    def execute_final_summary_chain(cls, g_inputs: GradioInputs, youtube_data: YoutubeData, video_type):
        """
        Ask the model for the final summary and return its response.

        Generator: re-yields the UI updates of the model call(s). When the first call
        reports more than one prompt, a second call summarizes the first response.
        Unknown video types fall back to the 'Others' format and an empty task constraint.
        """
        task_constraint = cls.FINAL_SUMMARY_TASK_CONSTRAINTS.get(video_type, "")
        format_template = cls.FINAL_SUMMARY_FORMAT_CONSTRAINTS.get(video_type, cls.FINAL_SUMMARY_FORMAT_CONSTRAINTS["Others"])
        # Resolve the placeholders once so both prompts below receive the same, fully substituted text
        format_constraint = (format_template
                             .replace("{char_limit}", cls._get_length_limit(g_inputs.language_textbox, 30, 15))
                             .replace("{language}", g_inputs.language_textbox))
        prompt_prefix = cls.FINAL_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title)
        prompt_suffix = cls.FINAL_SUMMARY_PROMPT.prompt_suffix.format(task_constraint=task_constraint,
                                                                      format_constraint=format_constraint,
                                                                      language=g_inputs.language_textbox)
        prompt = Prompt(
            prompt_prefix,
            cls.FINAL_SUMMARY_PROMPT.prompt_main.format(transcript=youtube_data.full_content),
            prompt_suffix
        )
        prompt_show_user = "Generate the final summary"
        response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs)
        if len_prompts > 1:
            # Give summary of summaries if the video is long
            prompt = Prompt(
                prompt_prefix,
                cls.FINAL_SUMMARY_PROMPT.prompt_main.format(transcript=response),
                prompt_suffix
            )
            prompt_show_user = "Since the video is long, generating the final summary of the summaries"
            response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs)
        return response
if __name__ == '__main__':
    # Manual smoke test: two REST calls to ChatGPT, the second one passing the
    # first exchange as history. API_KEY must be filled in before running.
    GPT_MODEL = "gpt-3.5-turbo-16k"
    API_KEY = ""
    input_1 = """Give me 2 ideas for the summer"""
    # input_1 = """Explain more on the first idea"""
    response_1 = ChatGPTService.single_rest_call_chatgpt(API_KEY, input_1, GPT_MODEL)
    print(response_1)
    input_2 = """
For the first idea, suggest some step by step planning for me
"""
    response_2 = ChatGPTService.single_rest_call_chatgpt(API_KEY, input_2, GPT_MODEL, history=[input_1, response_1])
    print(response_2)