Ari committed on
Commit
abdc1ac
1 Parent(s): 99a5022

Update app.py

Files changed (1)
  1. app.py +12 -16
app.py CHANGED
@@ -11,22 +11,18 @@ from reportlab.pdfgen import canvas
 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
 
-# Function to split text into chunks based on paragraphs
-def split_text_by_paragraph(text, max_chunk_size=1024):
-    paragraphs = text.split("\n\n")  # Splitting by paragraphs
+# Function to split text into chunks based on token length
+def split_text_by_tokens(text, max_length=1024):
+    tokens = tokenizer.encode(text, return_tensors="pt", truncation=False)
+    total_length = tokens.shape[1]
     chunks = []
-    chunk = ""
-
-    for paragraph in paragraphs:
-        if len(chunk) + len(paragraph) <= max_chunk_size:
-            chunk += paragraph + "\n\n"
-        else:
-            chunks.append(chunk.strip())
-            chunk = paragraph + "\n\n"
 
-    if chunk:
-        chunks.append(chunk.strip())
-
+    # Loop through the text, grabbing chunks of tokens
+    for i in range(0, total_length, max_length):
+        chunk_tokens = tokens[:, i:i+max_length]
+        chunk_text = tokenizer.decode(chunk_tokens[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
+        chunks.append(chunk_text)
+
     return chunks
 
 def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
@@ -46,7 +42,7 @@ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     pdf.save()
    return output_pdf
 
-# Main processing function with paragraph-based text chunking
+# Main processing function with token-based text chunking
 def pdf_to_text(text, PDF, min_length=80):
     try:
         file_extension = os.path.splitext(PDF.name)[1].lower()
@@ -57,7 +53,7 @@ def pdf_to_text(text, PDF, min_length=80):
         elif file_extension == '.pdf' and text == "":
             text = extract_text(PDF.name)
 
-        chunks = split_text_by_paragraph(text)
+        chunks = split_text_by_tokens(text)
         summarized_text = ""
 
         for chunk in chunks:
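
For context, the sketch below shows how the token-based chunking introduced in this commit fits together end to end. Only split_text_by_tokens mirrors the code added in app.py; the summarize_chunk helper, its generate() settings, and the driver at the bottom are assumptions made for illustration, since the body of the summarization loop is not part of this diff.

```python
# Illustrative sketch of the token-based chunking added in this commit.
# Assumption: summarize_chunk() and its generation parameters are hypothetical;
# the diff only shows that pdf_to_text iterates over the chunks.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

def split_text_by_tokens(text, max_length=1024):
    # Tokenize the whole document once, then slice the token tensor into
    # windows of at most max_length tokens (BART's input limit).
    tokens = tokenizer.encode(text, return_tensors="pt", truncation=False)
    total_length = tokens.shape[1]
    chunks = []
    for i in range(0, total_length, max_length):
        chunk_tokens = tokens[:, i:i + max_length]
        chunks.append(
            tokenizer.decode(chunk_tokens[0], skip_special_tokens=True,
                             clean_up_tokenization_spaces=True)
        )
    return chunks

def summarize_chunk(chunk, min_length=80):
    # Hypothetical per-chunk summarization step, not shown in the diff.
    inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=1024)
    summary_ids = model.generate(inputs["input_ids"], min_length=min_length, max_length=256)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

if __name__ == "__main__":
    long_text = "Lorem ipsum dolor sit amet. " * 2000  # stand-in for extracted PDF text
    chunks = split_text_by_tokens(long_text)
    summarized_text = " ".join(summarize_chunk(c) for c in chunks)
    print(f"{len(chunks)} chunks -> {len(summarized_text)} characters of summary")
```

Compared with the previous paragraph-based splitter, which measured chunk size in characters, slicing the token tensor guarantees that no chunk exceeds BART's 1024-token input window, at the cost of occasionally splitting mid-sentence.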