Spaces:
Runtime error
Runtime error
Update Sejarah.py
Browse files- Sejarah.py +10 -23
Sejarah.py
CHANGED
@@ -28,7 +28,9 @@ class Sejarah:
|
|
28 |
indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
|
29 |
indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])
|
30 |
|
31 |
-
|
|
|
|
|
32 |
indexing_pipeline.run_batch(file_paths=files_to_index)
|
33 |
|
34 |
retriever = BM25Retriever(document_store=document_store)
|
@@ -50,26 +52,18 @@ class Sejarah:
|
|
50 |
def language_converter(self, content, lang, method):
|
51 |
|
52 |
content = content.lower()
|
53 |
-
|
54 |
-
print(lang)
|
55 |
-
print(method)
|
56 |
-
print(content)
|
57 |
-
|
58 |
if lang == "en":
|
59 |
if method == "question":
|
60 |
-
tokenized_text = self.id_en_tokenizer.prepare_seq2seq_batch([content], return_tensors='pt')
|
61 |
-
translation = self.id_en_model.generate(**tokenized_text)
|
62 |
-
content = self.id_en_tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
|
63 |
-
|
64 |
-
print(tokenized_text)
|
65 |
-
print(translation)
|
66 |
-
print(content)
|
67 |
-
|
68 |
-
else:
|
69 |
tokenized_text = self.en_id_tokenizer.prepare_seq2seq_batch([content], return_tensors='pt')
|
70 |
translation = self.en_id_model.generate(**tokenized_text)
|
71 |
content = self.en_id_tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
|
72 |
|
|
|
|
|
|
|
|
|
|
|
73 |
return content
|
74 |
|
75 |
def interface(self, question):
|
@@ -85,12 +79,6 @@ class Sejarah:
|
|
85 |
}
|
86 |
)
|
87 |
|
88 |
-
print("_____Result____")
|
89 |
-
print(language)
|
90 |
-
print(question)
|
91 |
-
print(converted_question)
|
92 |
-
print(result['answers'][0])
|
93 |
-
|
94 |
answer = self.language_converter(result['answers'][0].answer, language, "answer")
|
95 |
context = self.language_converter(result['answers'][0].context, language, "answer")
|
96 |
|
@@ -99,5 +87,4 @@ class Sejarah:
|
|
99 |
|
100 |
def detect_language(self, content):
|
101 |
lang = langid.classify(content)
|
102 |
-
return lang[0]
|
103 |
-
|
|
|
28 |
indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
|
29 |
indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])
|
30 |
|
31 |
+
dir = "/content/drive/Shareddrives/Natural Language Processing/Dataset/txt files"
|
32 |
+
|
33 |
+
files_to_index = [dir+"/" + f for f in os.listdir(dir)]
|
34 |
indexing_pipeline.run_batch(file_paths=files_to_index)
|
35 |
|
36 |
retriever = BM25Retriever(document_store=document_store)
|
|
|
52 |
def language_converter(self, content, lang, method):
|
53 |
|
54 |
content = content.lower()
|
55 |
+
|
|
|
|
|
|
|
|
|
56 |
if lang == "en":
|
57 |
if method == "question":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
tokenized_text = self.en_id_tokenizer.prepare_seq2seq_batch([content], return_tensors='pt')
|
59 |
translation = self.en_id_model.generate(**tokenized_text)
|
60 |
content = self.en_id_tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
|
61 |
|
62 |
+
else:
|
63 |
+
tokenized_text = self.id_en_tokenizer.prepare_seq2seq_batch([content], return_tensors='pt')
|
64 |
+
translation = self.id_en_model.generate(**tokenized_text)
|
65 |
+
content = self.id_en_tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
|
66 |
+
|
67 |
return content
|
68 |
|
69 |
def interface(self, question):
|
|
|
79 |
}
|
80 |
)
|
81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
answer = self.language_converter(result['answers'][0].answer, language, "answer")
|
83 |
context = self.language_converter(result['answers'][0].context, language, "answer")
|
84 |
|
|
|
87 |
|
88 |
def detect_language(self, content):
|
89 |
lang = langid.classify(content)
|
90 |
+
return lang[0]
|
|