File size: 2,920 Bytes
5248357
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdc51b5
 
 
5248357
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import langid
import os
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor, BM25Retriever, FARMReader
from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import print_answers
from deep_translator import GoogleTranslator

class Sejarah:
    """Extractive question answering over the text files in a local folder.

    Construction indexes every file of the documents folder into an in-memory
    BM25 document store and builds a retriever + reader pipeline around the
    ``primasr/malaybert-for-eqa-finetuned`` model.  Questions detected as
    English are translated to Malay before querying and the results are
    translated back to English; text detected as any other language is passed
    through unchanged.
    """

    def __init__(self, documents_dir: str = "documents"):
        """Index ``documents_dir`` and build the querying pipeline.

        Args:
            documents_dir: Folder whose files are converted, split and
                indexed.  Defaults to ``"documents"`` (resolved against the
                current working directory).

        Raises:
            OSError: If ``documents_dir`` cannot be listed.
        """
        document_store = InMemoryDocumentStore(use_bm25=True)

        # Indexing pipeline: text file -> cleaned, overlapping chunks -> store.
        indexing_pipeline = Pipeline()
        text_converter = TextConverter()
        preprocessor = PreProcessor(
            clean_whitespace=True,
            clean_header_footer=True,
            clean_empty_lines=True,
            split_by="word",
            split_length=200,
            split_overlap=20,
            split_respect_sentence_boundary=True,
        )

        indexing_pipeline.add_node(component=text_converter, name="TextConverter", inputs=["File"])
        indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
        indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

        # Only regular files are handed to the converter; sub-directories
        # returned by os.listdir() are skipped.  Sorting keeps the indexing
        # order deterministic across runs.
        candidates = (
            os.path.join(documents_dir, name)
            for name in sorted(os.listdir(documents_dir))
        )
        files_to_index = [path for path in candidates if os.path.isfile(path)]
        indexing_pipeline.run_batch(file_paths=files_to_index)

        retriever = BM25Retriever(document_store=document_store)
        reader = FARMReader(model_name_or_path="primasr/malaybert-for-eqa-finetuned", use_gpu=True)

        # Querying pipeline: BM25 retrieval followed by the extractive reader.
        self.querying_pipeline = Pipeline()
        self.querying_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
        self.querying_pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])

    def language_converter(self, content, lang, method):
        """Translate ``content`` between English and Malay when needed.

        Args:
            content: Text to convert.
            lang: Language code detected for the user's question.  Only
                ``"en"`` triggers a translation.
            method: ``"question"`` translates English -> Malay (text going
                into the pipeline); any other value translates
                Malay -> English (text coming out of the pipeline).

        Returns:
            The translated text, or ``content`` unchanged when ``lang`` is
            not ``"en"``.
        """
        if lang != "en":
            return content

        if method != "question":
            return GoogleTranslator(source='ms', target='en').translate(content)

        new_content = GoogleTranslator(source='en', target='ms').translate(content)

        # "when" questions: swap the translator's "apabila" for "bila".
        # NOTE(review): presumably the interrogative form matches the indexed
        # texts better -- confirm with the original author.  The check is
        # case-insensitive so that questions starting with "When ..." are
        # covered, and the capitalised form is replaced as well.
        if "when" in content.lower():
            new_content = new_content.replace("apabila", "bila").replace("Apabila", "Bila")

        return new_content

    def detect_language(self, content):
        """Return the language code that ``langid`` assigns to ``content``."""
        # langid.classify() returns a (language, score) pair; only the
        # language code is needed here.
        lang = langid.classify(content)
        return lang[0]

    def interface(self, question):
        """Answer ``question`` and return the answer with its context.

        The question's language is detected first; English questions are
        translated to Malay for the pipeline and the top answer and its
        context are translated back to English.

        Args:
            question: The user's question.

        Returns:
            An ``(answer, context)`` tuple of strings taken from the
            highest-ranked answer.

        Raises:
            IndexError: If the pipeline returns an empty answer list.
        """
        language = self.detect_language(question)

        converted_question = self.language_converter(question, language, "question")

        result = self.querying_pipeline.run(
            query=converted_question,
            params={
                "Retriever": {"top_k": 10},
                "Reader": {"top_k": 5}
            }
        )

        # Only the first (best-ranked) answer is surfaced to the caller.
        best = result['answers'][0]
        answer = self.language_converter(best.answer, language, "answer")
        context = self.language_converter(best.context, language, "answer")

        return answer, context