# Captured from a Hugging Face Spaces page (Space status at capture time: Sleeping).
import pdfplumber
import streamlit as st
import torch
from langchain import PromptTemplate, LLMChain
from langchain.chains import ConversationalRetrievalChain
# NOTE(review): HuggingFaceHub is normally exposed from `langchain.llms`;
# confirm this import path against the installed langchain version.
from langchain.chat_models import HuggingFaceHub
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
# Load the ThaiBERT (WangchanBERTa) tokenizer and question-answering model
# from the Hugging Face Hub. Both are fetched from the same repository id.
_QA_CHECKPOINT = "airesearch/wangchanberta-base-att-spm-uncased"

tokenizer = AutoTokenizer.from_pretrained(_QA_CHECKPOINT)
model = AutoModelForQuestionAnswering.from_pretrained(_QA_CHECKPOINT)
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts; this script passes
            the file object returned by ``st.file_uploader``.

    Returns:
        The text of all pages joined with newlines. Pages that yield no
        text are skipped, so the result is ``""`` when nothing could be
        extracted from the document.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # Depending on the pdfplumber version, extract_text() can return
        # None or "" for a page without a text layer (e.g. a scanned
        # image). Normalise to "" so one such page cannot raise TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join with a newline so the last word of one page does not fuse with
    # the first word of the next.
    return "\n".join(text for text in page_texts if text)
def answer_question(question, context):
    """Extract an answer span for *question* from *context* with ThaiBERT.

    Uses the module-level ``tokenizer`` and ``model``. The answer is the
    token span between the highest-scoring start position and the
    highest-scoring end position predicted by the model.

    Args:
        question: The question text.
        context: The passage in which to look for the answer.

    Returns:
        The decoded answer text, or ``""`` when either input is empty or
        the predicted end position falls before the start position.
    """
    if not question or not context:
        return ""

    # Truncate only the context (second sequence) so that an over-long
    # passage does not exceed the model's maximum input length.
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation="only_second",
    )

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    # Read the logits by attribute. Tuple-unpacking the model output
    # yields its field names, not the tensors.
    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits)) + 1
    if end_index <= start_index:
        # The two argmaxes are independent and can describe an empty span.
        return ""

    answer_ids = inputs["input_ids"][0][start_index:end_index]
    return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()
def _build_qa_chain(pdf_text):
    """Build a conversational retrieval chain over the text of one PDF."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = splitter.create_documents([pdf_text])
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-xlm-r-multilingual-v1"
    )
    vector_store = Chroma.from_documents(documents=docs, embedding=embeddings)
    memory = ConversationBufferMemory(
        memory_key="chat_history", return_messages=True
    )
    # NOTE(review): this repo id is the same encoder checkpoint loaded for
    # extractive QA above; HuggingFaceHub presumably needs a text-generation
    # model, and a temperature of 0 may be rejected by the inference API.
    # Confirm both before deploying.
    llm = HuggingFaceHub(
        repo_id="airesearch/wangchanberta-base-att-spm-uncased",
        model_kwargs={"temperature": 0},
    )
    # from_llm() builds the question-condensing and answer-combining
    # sub-chains; the bare constructor does not accept an `llm` argument.
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vector_store.as_retriever(),
        memory=memory,
    )


# Set up the web page interface with Streamlit.
st.title("ThaiBERT PDF QA System")
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    # Streamlit re-executes this script on every interaction. Keep the chain
    # (and its conversation memory) in session state and rebuild it only
    # when a different file is uploaded.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("qa_file_key") != file_key:
        # Read the PDF content and build the question-answering chain.
        pdf_text = extract_text_from_pdf(uploaded_file)
        st.session_state["qa_chain"] = (
            _build_qa_chain(pdf_text) if pdf_text.strip() else None
        )
        st.session_state["qa_file_key"] = file_key

    qa_chain = st.session_state["qa_chain"]
    if qa_chain is None:
        # Nothing to index: avoid building a vector store from empty text.
        st.warning("No extractable text was found in this PDF.")
    else:
        # Input box for the user's question.
        user_question = st.text_input("Ask a question about the PDF content")
        if user_question:
            response = qa_chain.run(user_question)
            st.write("Answer:", response)