import os import datetime import glob import shutil import requests import io import sys import re import boto3 from os import listdir from os.path import isfile, join import gradio from sqlitedict import SqliteDict import gradio as gr from langchain import PromptTemplate from langchain.agents import Tool from langchain.agents import load_tools from langchain.agents import initialize_agent from langchain.agents import AgentType from langchain.chains import LLMMathChain from langchain import SerpAPIWrapper from langchain.chains import ConversationalRetrievalChain from langchain.chains.summarize import load_summarize_chain from langchain.llms import AzureOpenAI from langchain.chat_models import AzureChatOpenAI from langchain.embeddings.openai import OpenAIEmbeddings from langchain.memory import ChatMessageHistory from langchain.memory import ConversationBufferMemory from langchain.vectorstores import Chroma from langchain.text_splitter import CharacterTextSplitter from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.document_loaders import DirectoryLoader from langchain.document_loaders import UnstructuredFileLoader import clickhouse_connect from pathlib import Path from langchain.document_loaders import YoutubeLoader from azure_utils import AzureVoiceData from polly_utils import PollyVoiceData, NEURAL_ENGINE from contextlib import closing from langchain.agents import create_pandas_dataframe_agent import pandas as pd #os env os.environ["OPENAI_API_TYPE"] = "azure" os.environ["OPENAI_API_VERSION"] = "2023-03-15-preview" os.environ["OPENAI_API_BASE"] = "https://civet-project-001.openai.azure.com/" os.environ["OPENAI_API_KEY"] = "0e3e5b666818488fa1b5cb4e4238ffa7" os.environ["SERPAPI_API_KEY"] = "a5b67b8805b4e12b0ae147c9c6b2a7dbf3ab84fca5f24e531b6963b1f7fc1ff7" global_deployment_id = "CivetGPT" global_model_name = "gpt-35-turbo" #chroma settings chroma_api_impl = "HH_Azure_Openai" root_file_path = "./data/" #其實是data 存放的位置 hr_source_path = "hr_source" ks_source_path = "ks_source" believe_source_path = 'be_source' sqlite_name = "cache.sqlite3" sqlite_key="stored_files" persist_db = "persist_db" hr_collection_name = "hr_db" chroma_db_impl="localdb+langchain" tmp_collection="tmp_collection" davinci = "text-davinci-003" #global text setting inputText = "問題(按q 或Ctrl + c跳出): " refuse_string="服務被拒. 內容可能涉及敏感字詞,政治,煽動他人或是其他不當言詞, 請改以其他內容嚐試" #video LOOPING_TALKING_HEAD = "./data/videos/Masahiro.mp4" TALKING_HEAD_WIDTH = "192" AZURE_VOICE_DATA = AzureVoiceData() POLLY_VOICE_DATA = PollyVoiceData() def save_sqlite(key,value): try: with SqliteDict(sqlite_name) as mydict: old_value = mydict[key] mydict[key] = value+old_value # Using dict[key] to store mydict.commit() # Need to commit() to actually flush the data except Exception as ex: print("Error during storing data (Possibly unsupported):", ex) def load_sqlite(key): try: with SqliteDict(sqlite_name) as mydict: value = mydict[key] # No need to use commit(), since we are only loading data! return value except Exception as ex: print("Error during loading data:", ex) def delete_sql(key): try: with SqliteDict(sqlite_name) as mydict: mydict[key] = [] # Using dict[key] to store mydict.commit() # Need to commit() to actually flush the data except Exception as ex: print("Error during storing data (Possibly unsupported):", ex) def ai_answer(answer): print('AI 回答: \033[32m' + answer +'\033[0m') def get_openaiembeddings(): return OpenAIEmbeddings( deployment="CivetGPT_embedding", model="text-embedding-ada-002", #embed_batch_size=1 chunk_size=1 ) """ def get_chroma_client(): chroma_client = chromadb.Client(Settings(chroma_api_impl=chroma_api_impl, chroma_server_host=chroma_db_ip, chroma_server_http_port=chroma_db_port )) return chroma_client """ def multidocs_loader(files_path, file_ext): full_files_pattern = "*." + file_ext loader = DirectoryLoader(files_path, glob=full_files_pattern, show_progress=True) data = loader.load() text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10) documents = text_splitter.split_documents(data) return documents def unstructure_file_loader(filename_path): loader = UnstructuredFileLoader(filename_path) data = loader.load() text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10) documents = text_splitter.split_documents(data) return documents def add_documents_into_cromadb(db_name, file_path, collection_name): _db_name = db_name documents = multidocs_loader(file_path,"*") embeddings = get_openaiembeddings() chroma_db = Chroma.from_documents( documents, embeddings, collection_name=collection_name, persist_directory=root_file_path+ persist_db, chroma_db_impl=chroma_db_impl ) chroma_db.persist() print('adding documents done!') def initial_croma_db(db_name, files_path, file_ext, collection_name): _db_name = db_name documents = multidocs_loader(files_path, file_ext) embeddings = get_openaiembeddings() chroma_db = Chroma.from_documents( documents, embeddings, collection_name = collection_name, persist_directory= root_file_path+ persist_db, chroma_db_impl=chroma_db_impl ) chroma_db.persist() print('vectorstore done!') def add_files_to_collection(input_file_path, collection_name): file_path=root_file_path+input_file_path add_documents_into_cromadb(persist_db, file_path, collection_name) def get_prompt_summary_string(): return """使用中文替下面內容做個精簡摘要: {text} 精簡摘要:""" def get_prompt_template_string(): today = datetime.date.today().strftime("%Y年%m月%d日") template_string = f"我是鴻海(等同Foxconn)的員工, 你是一個鴻海的人資專家. 今天是{today}".format(today=today)+""" 請根據歷史對話,針對這次的問題, 形成獨立問題. 請優先從提供的文件中尋找答案, 你被允許回答不知道, 但回答不知道時需要給中央人資的客服聯絡窗口資訊. 不論什麼問題, 都以中文回答 歷史對話: {chat_history} 這次的問題: {question} 人資專家: """ return template_string def get_default_template_prompt(): template = "你是個知識廣泛的超級助手, 以下所有問題請用中文回答, 並請在500個中文字以內來解釋 {concept} 概念" prompt = PromptTemplate( input_variables = ["concept"], template = template ) return prompt def fine_tuning_model_chat(my_deployment_id, my_model_name): _prompt = get_default_template_prompt() llm = AzureOpenAI(model_name=my_model_name, deployment_name = my_deployment_id) while 1: text = input(inputText) if text == 'q': break response = llm(_prompt.format(concept = text)) ai_answer(response) def chat_conversation(): print("resource: " + global_deployment_id + " / " + global_model_name) chat = AzureChatOpenAI( deployment_name = global_deployment_id, model_name = global_model_name, ) history = ChatMessageHistory() history.add_ai_message("你是一個超級助理, 以下問題都用中文回答") while 1: text = input(inputText) if text == 'q': break history.add_user_message(text) ai_response = chat(history.messages) ai_answer(ai_response.content) def local_vector_search(question_str,chat_history, collection_name = hr_collection_name): embedding = get_openaiembeddings() vectorstore = Chroma( embedding_function=embedding, collection_name=collection_name, persist_directory=root_file_path+persist_db, ) memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, ai_prefix = "AI超級助理") llm = AzureOpenAI( deployment_name = global_deployment_id, model_name= global_model_name, temperature = 0.0) chat_llm = AzureChatOpenAI( deployment_name = global_deployment_id, model_name= global_model_name, temperature = 0.2) prompt = PromptTemplate( template=get_prompt_template_string(), input_variables=["question","chat_history"] ) prompt.format(question=question_str,chat_history=chat_history) km_chain = ConversationalRetrievalChain.from_llm( llm=chat_llm, retriever=vectorstore.as_retriever(), memory=memory, condense_question_prompt=prompt, ) km_tool = Tool( name='Knowledge Base', func=km_chain.run, description='一個非常有用的工具, 當要查詢任何公司政策以及鴻海相關資料都使用這個工具' ) math_math = LLMMathChain(llm=llm,verbose=True) math_tool = Tool( name='Calculator', func=math_math.run, description='Useful for when you need to answer questions about math.' ) search = SerpAPIWrapper() search_tool = Tool( name="Search", func=search.run, description="當你需要回答一般問題時,非常有用; 不可以用來回答任何跟鴻海有關的問題.", ) tools=[math_tool,km_tool, search_tool] agent=initialize_agent( agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, tools=tools, llm=chat_llm, verbose=True, memory=memory, max_iterations=30, ) result=km_chain(question_str) #result=agent.run(question_str) return result["answer"] def make_markdown_table(array): nl = "\n" markdown = "" for entry in array: markdown += f"{entry} {nl}" return markdown def get_hr_files(): files = load_sqlite(sqlite_key) if files == None: return else: return make_markdown_table(files) def update_hr_km(files): file_paths = [file.name for file in files] dest_file_path=root_file_path+hr_source_path if not os.path.exists(dest_file_path): os.makedirs(dest_file_path) for file in file_paths: shutil.copy(file, dest_file_path) add_files_to_collection(hr_source_path, hr_collection_name) save_sqlite(sqlite_key, [Path(file_path).name for file_path in file_paths]) return get_hr_files() def clear_all_collection(collection_name): pass def all_files_under_diretory(path): files = glob.glob(path+'\*') for f in files: os.remove(f) def clear_hr_datas(): #remove hr collection client = get_chroma_client(hr_collection_name) client.delete_collection(name=hr_collection_name) print("Collection removed completely!") #remove files all_files_under_diretory(root_file_path+hr_source_path) delete_sql(sqlite_key) return get_hr_files() def num_of_collection(collection_name): client = get_chroma_client(collection_name) number = client.get_collection(collection_name).count() return f"目前知識卷裡有{number}卷項目" def clear_tmp_collection(): client = get_chroma_client(tmp_collection) client.delete_collection(name=tmp_collection) all_files_under_diretory(root_file_path+ks_source_path) return num_of_collection(tmp_collection) def content_summary(split_documents): llm = AzureChatOpenAI( deployment_name=global_deployment_id, model_name=global_model_name, temperature=0.2) map_prompt = get_prompt_summary_string() map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"]) chain = load_summarize_chain( llm=llm, chain_type="map_reduce", verbose=True, map_prompt=map_prompt_template, combine_prompt=map_prompt_template ) try: output = chain({"input_documents": split_documents}, return_only_outputs=True) return output except Exception as e: print(e) return {'output_text':refuse_string} def pdf_summary(file_name): print("file_name: "+file_name) loader = UnstructuredFileLoader(file_name) document = loader.load() text_splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=20 ) split_documents = text_splitter.split_documents(document) return content_summary(split_documents) def youtube_summary(youtube_url): loader=YoutubeLoader.from_youtube_url(youtube_url, add_video_info=True, language=['en','zh-TW'], translation='zh-TW') document=loader.load() text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=10) split_documents=text_splitter.split_documents(document) result = content_summary(split_documents) return result['output_text'] def summary_large_file(files): file_paths = [file.name for file in files] print(file_paths[0]) result = pdf_summary(file_paths[0]) return result["output_text"] def upload_large_file(files): file_paths = [file.name for file in files] return Path(file_paths[0]).stem def set_allow_lightweight_delete(): client = clickhouse_connect.get_client(host='127.0.0.1',port=8123) command = "SET allow_experimental_lightweight_delete = true;" #command = "show databases;" res=client.command(command) print(res) def get_chroma_client(collection_name): vectorstore = Chroma( embedding_function=get_openaiembeddings(), collection_name=collection_name, persist_directory= root_file_path+persist_db, ) return vectorstore._client def create_db(): files_path = root_file_path+hr_source_path file_ext = "pdf" initial_croma_db(persist_db, files_path, file_ext, hr_collection_name) def generate_iframe_for_youtube(youtube_link): regex = r"(?:https:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)\/(?:watch\?v=)?(.+)" _url=re.sub(regex, r"https://www.youtube.com/embed/\1", youtube_link) embed_html = f'' print(embed_html) return embed_html def create_html_video(file_name, width, temp_file_url): html_video = f'' return html_video def do_html_audio_speak(words_to_speak): polly_client = boto3.Session( aws_access_key_id="AKIAV7Q7AAGW54RBR6FZ", aws_secret_access_key="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", region_name='us-west-2' ).client('polly') language_code="cmn-CN" engine = NEURAL_ENGINE voice_id = "Zhiyu" print("voice_id: "+voice_id+"\nlanguage_code="+language_code) response = polly_client.synthesize_speech( Text=words_to_speak, OutputFormat='mp3', VoiceId=voice_id, LanguageCode=language_code, Engine=engine ) html_audio = '
no audio' # Save the audio stream returned by Amazon Polly on Lambda's temp directory if "AudioStream" in response: with closing(response["AudioStream"]) as stream: try: with open('./data/audios/tempfile.mp3', 'wb') as f: f.write(stream.read()) temp_aud_file = gr.File("./data/audios/tempfile.mp3") temp_aud_file_url = "/file=" + temp_aud_file.value['name'] html_audio = f'' except IOError as error: # Could not write to file, exit gracefully print(error) return None, None else: # The response didn't contain audio data, exit gracefully print("Could not stream audio") return None, None return html_audio, "./data/audios/tempfile.mp3" def do_html_video_speak(): key = "eyJhbGciOiJIUzUxMiJ9.eyJ1c2VybmFtZSI6ImNhdHNreXR3QGdtYWlsLmNvbSJ9.OypOUZF-xv4-b8i9F4_aaMQiJpxv0mXRT5kyuJwTMXVd4awV-O-Obntp--AqGghNNowzQ9oG7zArSnQjz2vQgg" url = "https://api.exh.ai/animations/v2/generate_lipsync_from_audio" files = {"audio_file": ("./data/audios/tempfile.mp3", open("./data/audios/tempfile.mp3", "rb"), "audio/mpeg")} payload = { "animation_pipeline": "high_quality", "idle_url": "https://ugc-idle.s3-us-west-2.amazonaws.com/5fd9ba1b1607b39a4d559300c1e35bee.mp4" } headers = { "accept": "application/json", "authorization": f"Bearer {key}" } res = requests.post(url, data=payload, files=files, headers=headers) print("res.status_code: ", res.status_code) html_video = '
no video' if isinstance(res.content, bytes): response_stream = io.BytesIO(res.content) print("len(res.content)): ", len(res.content)) with open('./data/videos/tempfile.mp4', 'wb') as f: f.write(response_stream.read()) temp_file = gr.File("./data/videos/tempfile.mp4") temp_file_url = "/file=" + temp_file.value['name'] html_video = f'' else: print('video url unknown') return res, html_video, "./data/videos/tempfile.mp4" def kh_update_km(files): file_paths = [file.name for file in files] dest_file_path = root_file_path + ks_source_path if not os.path.exists(dest_file_path): os.makedirs(dest_file_path) for file in file_paths: shutil.copy(file, dest_file_path) add_files_to_collection(ks_source_path, tmp_collection) return num_of_collection(tmp_collection) class Logger: def __init__(self, filename): self.terminal = sys.stdout self.log = open(filename, "w", encoding='UTF-8') def write(self, message): self.terminal.write(message) self.log.write(message) def flush(self): self.terminal.flush() self.log.flush() def isatty(self): return False def read_logs(): sys.stdout.flush() ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])') with open("output.log", "r", encoding='UTF-8') as f: return ansi_escape.sub('', f.read()) def pandas_analysis(prompt_str, message, chat_history): dir_path = f"{root_file_path}{believe_source_path}/*.csv" res = glob.glob(dir_path) df = pd.concat(map(pd.read_csv, res)) local_deploy_id= "text-davinci-003" local_model_name = "text-davinci-003" llm = AzureOpenAI( deployment_name=local_deploy_id, model_name=local_model_name, max_tokens=2000, temperature=0, ) be_agent = create_pandas_dataframe_agent( llm, df, prefix="Remove any ` from the Action Input", max_iterations=30, return_intermediate_steps=False, max_execution_time=60, handle_parsing_errors="Check your output and make sure it conforms!", verbose=True) new_str = prompt_str.format(message=message, chat_history=chat_history) print(new_str) answer = be_agent.run(new_str) chat_history.append((message, answer)) return '', chat_history def lunch_style(demo, logs=gr.Text()): sys.stdout = Logger("output.log") demo.load(read_logs, None, logs, every=1) if len(sys.argv)==1: print("running server as default value") demo.launch(allowed_paths=[root_file_path, root_file_path+hr_source_path]) elif len(sys.argv)==2 and sys.argv[1] == "server": local_ip = "10.40.23.232" local_port = 7788 print(f"running server on http://{local_ip}:{local_port}") demo.launch(allowed_paths=[root_file_path, root_file_path+hr_source_path],auth=("Foxconn", "Foxconn123!"),server_name=local_ip, server_port=local_port) elif len(sys.argv)==4: local_ip = sys.argv[2] local_port = sys.argv[3] print(f"running server on http://{local_ip}:{local_port}") demo.launch(allowed_paths=[root_file_path, root_file_path+hr_source_path],auth=("Foxconn", "Foxconn123!"),server_name=local_ip, server_port=local_port) else: print("syntax: pythong