Pranjal12345 committed on
Commit
ce57726
·
1 Parent(s): 3c17b68

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -24
app.py CHANGED
@@ -1,47 +1,54 @@
1
  import gradio as gr
2
  import re
 
 
3
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
4
 
# Hub checkpoint name shared by the tokenizer and the seq2seq model below.
_CHECKPOINT = "potsawee/t5-large-generation-squad-QuestionAnswer"

# Both objects are loaded once at import time and reused for every request.
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)
7
 
8
def inference(input_text):
    """Generate question/answer pairs, one per sentence of ``input_text``.

    The text is split after every ``.``, ``!`` or ``?``. Each non-blank
    sentence is encoded with the module-level ``tokenizer``, passed to
    ``model.generate`` and decoded; the decoded string is then divided at its
    first ``?`` into a question and an answer.

    Args:
        input_text: Text entered by the user, or ``None`` when nothing was
            provided.

    Returns:
        A string made of "Question: ..." / "Answer: ..." entries, each
        followed by a blank line. Returns a prompt message when
        ``input_text`` is ``None``, and an empty string when no sentence
        produced an output containing ``?``.
    """
    if input_text is None:
        return "Please upload a text"

    entries = []
    # The zero-width look-behind keeps each terminator attached to its
    # sentence, but it also yields empty / whitespace-only fragments.
    for sentence in re.split(r'(?<=[.!?])', input_text):
        if not sentence.strip():
            # Nothing to ask a question about; skip the model call entirely.
            continue

        input_ids = tokenizer.encode(sentence, return_tensors="pt")
        outputs = model.generate(input_ids, max_length=100, num_return_sequences=1)
        question_answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        # partition() never raises: an output without "?" is skipped instead
        # of causing an IndexError, and it no longer aborts the remaining
        # sentences. Everything after the first "?" is kept as the answer.
        question_part, mark, answer_part = question_answer.partition("?")
        if not mark:
            continue

        entries.append(f"Question: {question_part}?\nAnswer: {answer_part.strip()}\n\n")

    return "".join(entries)
35
 
36
# Gradio front end: a four-line textbox as input, plain text as output.
title = "Question Answer Pairs Generator"
input_text = gr.Textbox(label="Text:", lines=4)

interface = gr.Interface(inference, [input_text], "text", title=title)

# Start the web UI.
interface.launch()
 
1
  import gradio as gr
2
  import re
3
+ import os
4
+ import fitz
5
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
 
# Hub checkpoint name shared by the tokenizer and the seq2seq model below.
_CHECKPOINT = "potsawee/t5-large-generation-squad-QuestionAnswer"

# Both objects are loaded once at import time and reused for every request.
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)
9
 
10
def extract_text_from_pdf(pdf_file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string built from ``page.get_text()`` of each page.
    """
    # Opening the document as a context manager closes it (and releases the
    # underlying file) even if text extraction fails part-way through.
    with fitz.open(pdf_file_path) as doc:
        return "".join(page.get_text() for page in doc)
17
 
18
def generate_question_answer_pairs(pdf_file):
    """Generate question/answer pairs from the text of an uploaded PDF.

    The PDF text is obtained through ``extract_text_from_pdf`` and split after
    every ``.``, ``!`` or ``?``. Each non-blank sentence is encoded with the
    module-level ``tokenizer``, passed to ``model.generate`` and decoded; the
    decoded string is divided at its first ``?`` into a question and an
    answer. Outputs without a ``?`` are skipped.

    Args:
        pdf_file: The value delivered by the Gradio file input: ``None`` when
            nothing was uploaded, a path string, or an object whose ``name``
            attribute is the path of the uploaded file.

    Returns:
        A string made of "Question: ..." / "Answer: ..." entries, each
        followed by a blank line, or a prompt message when ``pdf_file`` is
        ``None``.
    """
    if pdf_file is None:
        return "Please upload a PDF file"

    # NOTE(review): depending on the Gradio version / File `type` setting the
    # upload arrives either as a plain path or as a temp-file wrapper; accept
    # both rather than assuming `.name` exists.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    pdf_text = extract_text_from_pdf(pdf_path)

    entries = []
    # The zero-width look-behind keeps each terminator attached to its
    # sentence, but it also yields empty / whitespace-only fragments.
    for sentence in re.split(r'(?<=[.!?])', pdf_text):
        if not sentence.strip():
            # Avoid a pointless (and slow) generate() call on blank input.
            continue

        input_ids = tokenizer.encode(sentence, return_tensors="pt")
        outputs = model.generate(input_ids, max_length=100, num_return_sequences=1)
        question_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Split only at the first "?" so an answer that itself contains a
        # question mark is kept whole instead of being truncated.
        question_part, mark, answer_part = question_answer.partition("?")
        if not mark:
            continue

        entries.append(f"Question: {question_part}?\nAnswer: {answer_part.strip()}\n\n")

    return "".join(entries)
43
 
44
# Gradio front end: a PDF upload as input, a textbox as output.
title = "Question-Answer Pairs Generation"
input_file = gr.File(label="Upload a PDF file")
output_text = gr.Textbox()

interface = gr.Interface(
    generate_question_answer_pairs,
    input_file,
    output_text,
    title=title,
)

# Start the web UI.
interface.launch()