import os

import langid
from haystack import Pipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever, FARMReader, PreProcessor, TextConverter


class Sejarah:
    """Question-answering helper over a Malay history corpus.

    The Haystack indexing/query pipelines are currently disabled (commented
    out below), so :meth:`interface` only echoes the detected language of the
    question rather than a retrieved answer.
    """

    def __init__(self):
        # Keep the store on the instance so it survives __init__; the
        # original code bound it to a local and discarded it immediately.
        self.document_store = InMemoryDocumentStore(use_bm25=True)

        # --- Disabled indexing pipeline (kept for reference) ---------------
        # NOTE(review): re-enabling this block requires the `os` import above
        # (the original file omitted it) and access to the shared-drive path.
        #
        # indexing_pipeline = Pipeline()
        # text_converter = TextConverter()
        # preprocessor = PreProcessor(
        #     clean_whitespace=True,
        #     clean_header_footer=True,
        #     clean_empty_lines=True,
        #     split_by="word",
        #     split_length=200,
        #     split_overlap=20,
        #     split_respect_sentence_boundary=True,
        # )
        # indexing_pipeline.add_node(component=text_converter, name="TextConverter", inputs=["File"])
        # indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
        # indexing_pipeline.add_node(component=self.document_store, name="DocumentStore", inputs=["PreProcessor"])
        #
        # doc_dir = "/content/drive/Shareddrives/Natural Language Processing/Dataset/txt files"
        # files_to_index = [os.path.join(doc_dir, f) for f in os.listdir(doc_dir)]
        # indexing_pipeline.run_batch(file_paths=files_to_index)

        # --- Disabled querying pipeline (kept for reference) ---------------
        # retriever = BM25Retriever(document_store=self.document_store)
        # reader = FARMReader(model_name_or_path="primasr/malaybert-for-eqa-finetuned", use_gpu=True)
        # self.querying_pipeline = Pipeline()
        # self.querying_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
        # self.querying_pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])

    def interface(self, question: str) -> str:
        """Return a placeholder answer string for *question*.

        Currently reports only the detected language code, since the
        querying pipeline is disabled.
        """
        language = self.detect_language(question)
        # Runtime string kept identical to the original implementation.
        return "Answer of " + question + " is: " + language

    def detect_language(self, content: str) -> str:
        """Return the ISO 639-1 language code langid assigns to *content*.

        ``langid.classify`` returns a ``(lang_code, score)`` tuple; only the
        code is returned here.
        """
        lang = langid.classify(content)
        print(lang)  # diagnostic output, preserved from original behavior
        return lang[0]