|
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Type |
|
import logging |
|
import json |
|
import os |
|
from datetime import datetime |
|
import hashlib |
|
import csv |
|
import requests |
|
import re |
|
import html |
|
import markdown2 |
|
import torch |
|
import sys |
|
import gc |
|
from pygments.lexers import guess_lexer, ClassNotFound |
|
import time |
|
import json |
|
import operator |
|
from typing import Annotated, Sequence, TypedDict |
|
import pprint |
|
|
|
import gradio as gr |
|
from pypinyin import lazy_pinyin |
|
import tiktoken |
|
import mdtex2html |
|
from markdown import markdown |
|
from pygments import highlight |
|
from pygments.lexers import guess_lexer,get_lexer_by_name |
|
from pygments.formatters import HtmlFormatter |
|
|
|
from langchain.chains import LLMChain, RetrievalQA |
|
from langgraph.graph import END, StateGraph |
|
from langchain_openai import ChatOpenAI |
|
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader, UnstructuredWordDocumentLoader, DirectoryLoader |
|
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader |
|
from langchain.document_loaders.generic import GenericLoader |
|
from langchain.document_loaders.parsers import OpenAIWhisperParser |
|
from langchain.schema import AIMessage, HumanMessage |
|
from langchain_community.llms import HuggingFaceHub |
|
from langchain_community.llms import HuggingFaceTextGenInference |
|
from langchain_community.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings, HuggingFaceInferenceAPIEmbeddings |
|
from langchain_community.tools import DuckDuckGoSearchRun |
|
from langchain.retrievers.tavily_search_api import TavilySearchAPIRetriever |
|
from typing import Dict, TypedDict |
|
from langchain_core.messages import BaseMessage |
|
from langchain_openai import OpenAIEmbeddings |
|
from langchain.prompts import PromptTemplate |
|
|
|
|
|
from langchain import hub |
|
from langchain.output_parsers.openai_tools import PydanticToolsParser |
|
from langchain.prompts import PromptTemplate |
|
from langchain.schema import Document |
|
from langchain_community.tools.tavily_search import TavilySearchResults |
|
from langchain_community.vectorstores import Chroma |
|
from langchain_core.messages import BaseMessage, FunctionMessage |
|
from langchain_core.output_parsers import StrOutputParser |
|
from langchain_core.pydantic_v1 import BaseModel, Field |
|
from langchain_core.runnables import RunnablePassthrough |
|
from langchain_core.utils.function_calling import convert_to_openai_tool |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_community.vectorstores import Chroma |
|
from chromadb.errors import InvalidDimensionException |
|
import io |
|
from PIL import Image, ImageDraw, ImageOps, ImageFont |
|
import base64 |
|
from tempfile import NamedTemporaryFile |
|
|
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.tokenize import word_tokenize |
|
from nltk.stem import WordNetLemmatizer |
|
# word_tokenize() needs the Punkt tokenizer data. Recent NLTK releases load it
# from the pickle-free "punkt_tab" package, older ones from "punkt"; fetching
# both keeps tokenization working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import numpy as np |
|
|
|
|
|
|
|
|
|
|
|
# Base directory; CHROMA_DIR is appended to it wherever the vector store is
# opened or written (PATH_WORK + CHROMA_DIR).
PATH_WORK = "."
CHROMA_DIR = "/chroma/kkg"

# Source-document directories scanned by document_loading_splitting()
# (CHROMA_EXCEL is not referenced in the visible part of this file).
CHROMA_PDF = "./chroma/kkg/pdf"
CHROMA_WORD = "./chroma/kkg/word"
CHROMA_EXCEL = "./chroma/kkg/excel"

# Further path fragments (not referenced in the visible part of this file).
YOUTUBE_DIR = "/youtube"
HISTORY_PFAD = "/data/history"

# Sample resources (not referenced in the visible part of this file).
PDF_URL = "https://arxiv.org/pdf/2303.08774.pdf"
WEB_URL = "https://openai.com/research/gpt-4"
YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
|
|
|
|
|
# Pages of the kkg.hamburg.de site. Every entry needs its own trailing comma:
# adjacent string literals without a comma are silently concatenated into a
# single (invalid) URL.
urls = [
    "https://kkg.hamburg.de/unser-leitbild/",
    "https://kkg.hamburg.de/unsere-schulcharta/",
    "https://kkg.hamburg.de/koordination-unterrichtsentwicklung/",
    "https://kkg.hamburg.de/konzept-medien-und-it-am-kkg/",
]
|
|
|
|
|
|
|
|
|
|
|
def normalise_prompt(prompt):
    """Normalise a German user prompt into a space-separated token string.

    Processing steps, in order: lower-case, tokenize, keep only alphanumeric
    tokens, remove German stop words, lemmatize with WordNetLemmatizer, strip
    remaining non-word characters, spell-correct each token.

    Args:
        prompt: Raw prompt text.

    Returns:
        The normalised tokens joined by single spaces. The result is also
        printed to stdout.
    """
    # Function-scope import: pyspellchecker is only required when this
    # function is actually called.
    from spellchecker import SpellChecker

    tokens = word_tokenize(prompt.lower())

    # Drops punctuation-only tokens and anything containing symbols.
    tokens = [word for word in tokens if word.isalnum()]

    nltk.download('stopwords')
    # The NLTK stop-word corpus file is called "german"; asking for "deutsch"
    # fails because no such file exists in the corpus.
    stop_words = set(stopwords.words('german'))
    tokens = [word for word in tokens if word not in stop_words]

    nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    tokens = [re.sub(r'\W+', '', word) for word in tokens]

    # Use the German dictionary to match the German stop-word filtering above;
    # the default English dictionary would "correct" German words into
    # unrelated English ones.
    spell = SpellChecker(language='de')
    # correction() can return None when it finds no candidate; keep the
    # original token in that case so the join below never sees None.
    tokens = [spell.correction(word) or word for word in tokens]

    normalized_prompt = ' '.join(tokens)
    print("normaiserd prompt..................................")
    print(normalized_prompt)
    return normalized_prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_directory_loader(file_type, directory_path):
    """Build a DirectoryLoader for one supported document type.

    Args:
        file_type: Document type key. Supported: '.pdf', '.word' (alias for
            Word documents of either suffix), '.docx' and '.doc'.
        directory_path: Directory whose tree is searched for matching files.

    Returns:
        A DirectoryLoader configured with the matching glob pattern and
        loader class.

    Raises:
        KeyError: If file_type is not one of the supported keys.
    """
    # Type key -> (loader class, glob pattern). The pattern is kept separate
    # from the key because '.word' is not a real file suffix: a "**/*.word"
    # glob would match no Word document at all.
    loaders = {
        '.pdf': (PyPDFLoader, "**/*.pdf"),
        '.word': (UnstructuredWordDocumentLoader, "**/*.doc*"),
        '.docx': (UnstructuredWordDocumentLoader, "**/*.docx"),
        '.doc': (UnstructuredWordDocumentLoader, "**/*.doc"),
    }
    loader_cls, glob_pattern = loaders[file_type]
    return DirectoryLoader(
        path=directory_path,
        glob=glob_pattern,
        loader_cls=loader_cls,
    )
|
|
|
|
|
def document_loading_splitting():
    """Load the PDF and Word source documents and split them into chunks.

    Documents are read from CHROMA_PDF and CHROMA_WORD via
    create_directory_loader() and split with a RecursiveCharacterTextSplitter
    (chunk size 1500, overlap 150).

    Returns:
        The list of split documents, PDF documents first, then Word documents.
    """
    # Both loaders are created before the progress message is printed; the
    # files themselves are only read by the load() calls below.
    directory_loaders = [
        create_directory_loader('.pdf', CHROMA_PDF),
        create_directory_loader('.word', CHROMA_WORD),
    ]
    print("PDF Loader done............................")

    loaded_documents = [
        document
        for loader in directory_loaders
        for document in loader.load()
    ]

    splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
    return splitter.split_documents(loaded_documents)
|
|
|
|
|
|
|
def document_storage_chroma(splits):
    """Embed the given document splits and persist them in the Chroma store.

    Args:
        splits: Split documents to index.

    The store is written to PATH_WORK + CHROMA_DIR using OpenAI embeddings.
    Nothing is returned.
    """
    embedding_model = OpenAIEmbeddings(disallowed_special=())
    store_path = PATH_WORK + CHROMA_DIR
    Chroma.from_documents(
        documents=splits,
        embedding=embedding_model,
        persist_directory=store_path,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def document_retrieval_chroma(llm, prompt):
    """Open the persisted Chroma vector store for similarity queries.

    Args:
        llm: Not used by this function; kept so existing callers keep working.
        prompt: Not used by this function; kept so existing callers keep
            working.

    Returns:
        A Chroma instance backed by the directory PATH_WORK + CHROMA_DIR.
    """
    # Query vectors must come from the same embedding model that
    # document_storage_chroma() used to index the documents (OpenAIEmbeddings).
    # A different model produces vectors of another dimension / vector space,
    # so similarity search would fail or return meaningless neighbours.
    embeddings = OpenAIEmbeddings(disallowed_special=())

    db = Chroma(
        embedding_function=embeddings,
        persist_directory=PATH_WORK + CHROMA_DIR,
    )
    return db
|
|
|
|
|
|
|
|
|
|
|
|
|
def rag_chain(prompt, db, k=3):
    """Build a RAG prompt from the question and its k most similar chunks.

    Args:
        prompt: The user question; also used as the similarity-search query.
        db: Vector store offering similarity_search(query, k).
        k: Number of chunks to retrieve.

    Returns:
        The instruction header (including the module-level `template` and the
        question) followed by the retrieved chunks as a numbered list, one
        chunk per line.
    """
    # The header is assembled before the search runs, as in the original.
    header = "Nutze ausschließlich die folgenden Kontext Teile am Ende, um die Frage zu beantworten . " + template + "Frage: " + prompt + "Kontext Teile: "
    hits = db.similarity_search(prompt, k)

    # Each hit is rendered through its string form and numbered from 1.
    numbered_hits = [
        f"{position}. {hit}\n"
        for position, hit in enumerate(hits, start=1)
    ]
    return header + "".join(numbered_hits)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|