Group5_Project / Sejarah.py
sengzi's picture
Update Sejarah.py
bdc51b5
raw
history blame
2.92 kB
import langid
import os
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor, BM25Retriever, FARMReader
from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import print_answers
from deep_translator import GoogleTranslator
class Sejarah:
    """Extractive question answering over the text files in a documents folder.

    On construction every file in the documents folder is converted,
    pre-processed and written to an in-memory BM25 document store. A
    retriever + reader pipeline is then kept on ``self.querying_pipeline``
    and used by :meth:`interface` to answer questions.

    Questions detected as English are translated to Malay before querying
    and the answer/context are translated back to English; input in any
    other detected language is passed through untranslated.
    """

    # Reader checkpoint handed to FARMReader.
    READER_MODEL = "primasr/malaybert-for-eqa-finetuned"

    def __init__(self, documents_dir="documents"):
        """Index ``documents_dir`` and build the querying pipeline.

        Args:
            documents_dir: Folder whose files are indexed. Defaults to
                ``"documents"`` (relative to the current working directory).

        Raises:
            OSError: If ``documents_dir`` cannot be listed.
        """
        document_store = InMemoryDocumentStore(use_bm25=True)

        # Indexing pipeline: File -> TextConverter -> PreProcessor -> store.
        indexing_pipeline = Pipeline()
        text_converter = TextConverter()
        preprocessor = PreProcessor(
            clean_whitespace=True,
            clean_header_footer=True,
            clean_empty_lines=True,
            split_by="word",
            split_length=200,
            split_overlap=20,
            split_respect_sentence_boundary=True,
        )
        indexing_pipeline.add_node(
            component=text_converter, name="TextConverter", inputs=["File"]
        )
        indexing_pipeline.add_node(
            component=preprocessor, name="PreProcessor", inputs=["TextConverter"]
        )
        indexing_pipeline.add_node(
            component=document_store, name="DocumentStore", inputs=["PreProcessor"]
        )

        # Only regular files are indexed; sub-directories would otherwise be
        # handed to the text converter as if they were documents. Sorting
        # keeps the indexing order independent of the filesystem.
        candidate_paths = (
            os.path.join(documents_dir, entry)
            for entry in sorted(os.listdir(documents_dir))
        )
        files_to_index = [path for path in candidate_paths if os.path.isfile(path)]
        if files_to_index:
            indexing_pipeline.run_batch(file_paths=files_to_index)

        retriever = BM25Retriever(document_store=document_store)
        reader = FARMReader(model_name_or_path=self.READER_MODEL, use_gpu=True)

        # Querying pipeline: Query -> Retriever -> Reader.
        self.querying_pipeline = Pipeline()
        self.querying_pipeline.add_node(
            component=retriever, name="Retriever", inputs=["Query"]
        )
        self.querying_pipeline.add_node(
            component=reader, name="Reader", inputs=["Retriever"]
        )

    def language_converter(self, content, lang: str, method: str):
        """Translate ``content`` between English and Malay when needed.

        Args:
            content: Text to convert. Empty or ``None`` content is returned
                unchanged without calling the translator.
            lang: Detected language code of the user's question. Only
                ``"en"`` triggers a translation.
            method: ``"question"`` translates English -> Malay; any other
                value (the callers in this class pass ``"answer"``)
                translates Malay -> English.

        Returns:
            The translated text, or ``content`` itself when ``lang`` is not
            ``"en"`` or there is nothing to translate.
        """
        if lang != "en" or not content:
            return content

        if method == "question":
            new_content = GoogleTranslator(source='en', target='ms').translate(content)
            # The translator renders "when" as "apabila"; the question form
            # wanted here is "bila". The check is case-insensitive so that a
            # question starting with "When" is covered, and the capitalised
            # form is replaced too for sentence-initial occurrences.
            if new_content and "when" in content.lower():
                new_content = new_content.replace("apabila", "bila")
                new_content = new_content.replace("Apabila", "Bila")
            return new_content

        return GoogleTranslator(source='ms', target='en').translate(content)

    def detect_language(self, content):
        """Return the first element of ``langid.classify(content)``.

        That element is the detected language code (e.g. ``"en"``).
        """
        lang = langid.classify(content)
        return lang[0]

    def interface(self, question):
        """Answer ``question`` and return ``(answer, context)``.

        The question's language is detected, the question is converted for
        querying, and the top-ranked answer and its context are converted
        back using the same detected language.

        Args:
            question: The user's question text.

        Returns:
            A 2-tuple ``(answer, context)`` taken from the first answer in
            the pipeline result, or ``("", "")`` when the pipeline returns
            no answers.
        """
        language = self.detect_language(question)
        converted_question = self.language_converter(question, language, "question")
        result = self.querying_pipeline.run(
            query=converted_question,
            params={
                "Retriever": {"top_k": 10},
                "Reader": {"top_k": 5},
            },
        )

        # An empty answer list (e.g. nothing was indexed) used to surface as
        # an IndexError on the [0] lookup.
        answers = result.get("answers") or []
        if not answers:
            return "", ""

        best = answers[0]
        answer = self.language_converter(best.answer, language, "answer")
        context = self.language_converter(best.context, language, "answer")
        return answer, context