sengzi committed on
Commit
81560a4
1 Parent(s): c1dfeeb

Update Sejarah.py

Browse files
Files changed (1) hide show
  1. Sejarah.py +10 -23
Sejarah.py CHANGED
@@ -28,7 +28,9 @@ class Sejarah:
28
  indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
29
  indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])
30
 
31
- files_to_index = ["documents/" + f for f in os.listdir("documents")]
 
 
32
  indexing_pipeline.run_batch(file_paths=files_to_index)
33
 
34
  retriever = BM25Retriever(document_store=document_store)
@@ -50,26 +52,18 @@ class Sejarah:
50
  def language_converter(self, content, lang, method):
51
 
52
  content = content.lower()
53
-
54
- print(lang)
55
- print(method)
56
- print(content)
57
-
58
  if lang == "en":
59
  if method == "question":
60
- tokenized_text = self.id_en_tokenizer.prepare_seq2seq_batch([content], return_tensors='pt')
61
- translation = self.id_en_model.generate(**tokenized_text)
62
- content = self.id_en_tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
63
-
64
- print(tokenized_text)
65
- print(translation)
66
- print(content)
67
-
68
- else:
69
  tokenized_text = self.en_id_tokenizer.prepare_seq2seq_batch([content], return_tensors='pt')
70
  translation = self.en_id_model.generate(**tokenized_text)
71
  content = self.en_id_tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
72
 
 
 
 
 
 
73
  return content
74
 
75
  def interface(self, question):
@@ -85,12 +79,6 @@ class Sejarah:
85
  }
86
  )
87
 
88
- print("_____Result____")
89
- print(language)
90
- print(question)
91
- print(converted_question)
92
- print(result['answers'][0])
93
-
94
  answer = self.language_converter(result['answers'][0].answer, language, "answer")
95
  context = self.language_converter(result['answers'][0].context, language, "answer")
96
 
@@ -99,5 +87,4 @@ class Sejarah:
99
 
100
  def detect_language(self, content):
101
  lang = langid.classify(content)
102
- return lang[0]
103
-
 
28
  indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
29
  indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])
30
 
31
+ dir = "/content/drive/Shareddrives/Natural Language Processing/Dataset/txt files"
32
+
33
+ files_to_index = [dir+"/" + f for f in os.listdir(dir)]
34
  indexing_pipeline.run_batch(file_paths=files_to_index)
35
 
36
  retriever = BM25Retriever(document_store=document_store)
 
52
  def language_converter(self, content, lang, method):
53
 
54
  content = content.lower()
55
+
 
 
 
 
56
  if lang == "en":
57
  if method == "question":
 
 
 
 
 
 
 
 
 
58
  tokenized_text = self.en_id_tokenizer.prepare_seq2seq_batch([content], return_tensors='pt')
59
  translation = self.en_id_model.generate(**tokenized_text)
60
  content = self.en_id_tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
61
 
62
+ else:
63
+ tokenized_text = self.id_en_tokenizer.prepare_seq2seq_batch([content], return_tensors='pt')
64
+ translation = self.id_en_model.generate(**tokenized_text)
65
+ content = self.id_en_tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
66
+
67
  return content
68
 
69
  def interface(self, question):
 
79
  }
80
  )
81
 
 
 
 
 
 
 
82
  answer = self.language_converter(result['answers'][0].answer, language, "answer")
83
  context = self.language_converter(result['answers'][0].context, language, "answer")
84
 
 
87
 
88
  def detect_language(self, content):
89
  lang = langid.classify(content)
90
+ return lang[0]