not-lain committed on
Commit
579432a
1 Parent(s): fac9a75

add ppt text extraction support

Browse files
Files changed (1) hide show
  1. app.py +38 -5
app.py CHANGED
@@ -20,6 +20,39 @@ def extract_text_from_pptx(file_path):
20
 
21
  return "\n\n".join(text_content)
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  def convert_pdf_to_image(file):
25
  images = convert_from_path(file)
@@ -87,16 +120,16 @@ doc_or_docx_to_text = gr.Interface(
87
  api_name="doc_or_docx_to_text",
88
  )
89
 
90
- pptx_to_text = gr.Interface(
91
- extract_text_from_pptx,
92
  gr.File(),
93
  gr.Textbox(placeholder="Extracted text from PPTX will appear here"),
94
- api_name="pptx_to_text",
95
  )
96
 
97
  demo = gr.TabbedInterface(
98
- [pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_to_text],
99
- ["PDF to Image", "Extract PDF Text", "Extract DOC/DOCX Text", "Extract PPTX Text"],
100
  )
101
 
102
  demo.launch(server_name="0.0.0.0.", server_port=7860, debug=True)
 
20
 
21
  return "\n\n".join(text_content)
22
 
23
def extract_text_from_ppt(file_path):
    """Extract the text of a legacy ``.ppt`` presentation.

    The file is converted to ``.pptx`` with the external ``unoconv`` tool and
    the converted copy is then read with ``Presentation``. The conversion is
    written into a temporary directory, so the intermediate file is removed
    even when parsing fails and a ``.pptx`` that already sits next to the
    input is never overwritten or deleted.

    Args:
        file_path: Path to the ``.ppt`` file to read.

    Returns:
        The text of all slides. Text from the shapes of one slide is joined
        with a newline and slides are separated by a blank line. If the
        conversion or the parsing fails, the string
        ``"Error extracting text from PPT file"`` is returned instead.
    """
    import tempfile  # local import: only this function needs it

    try:
        with tempfile.TemporaryDirectory() as work_dir:
            # Passing an existing directory to -o makes unoconv place the
            # converted file there, named after the input file's stem.
            subprocess.run(
                ["unoconv", "-f", "pptx", "-o", work_dir, file_path],
                check=True,
            )
            stem = os.path.splitext(os.path.basename(file_path))[0]
            pptx_file_path = os.path.join(work_dir, stem + ".pptx")

            presentation = Presentation(pptx_file_path)
            # Not every shape carries text (e.g. pictures), hence the
            # hasattr guard.
            text_content = [
                "\n".join(
                    shape.text
                    for shape in slide.shapes
                    if hasattr(shape, "text")
                )
                for slide in presentation.slides
            ]
        return "\n\n".join(text_content)
    except Exception as e:
        # Deliberately broad: this function backs a UI endpoint, so any
        # failure (missing unoconv, failed conversion, unreadable file) is
        # reported as a message rather than raised.
        print(f"Error extracting text from PPT file: {e}")
        return "Error extracting text from PPT file"
47
+
48
def extract_text_from_ppt_or_pptx(file_path):
    """Extract text from a PowerPoint file, choosing the reader by extension.

    The extension is compared case-insensitively, so ``DECK.PPTX`` and
    ``Talk.Ppt`` are handled like their lower-case equivalents.

    Args:
        file_path: Path to a ``.ppt`` or ``.pptx`` file.

    Returns:
        The result of ``extract_text_from_pptx`` for ``.pptx`` files, the
        result of ``extract_text_from_ppt`` for ``.ppt`` files, and an
        "Unsupported file type" message for anything else.
    """
    lowered = file_path.lower()
    if lowered.endswith(".pptx"):
        return extract_text_from_pptx(file_path)
    if lowered.endswith(".ppt"):
        return extract_text_from_ppt(file_path)
    return "Unsupported file type. Please provide a .ppt or .pptx file."
55
+
56
 
57
  def convert_pdf_to_image(file):
58
  images = convert_from_path(file)
 
120
  api_name="doc_or_docx_to_text",
121
  )
122
 
123
# Tab for PowerPoint text extraction; the handler accepts both .ppt and .pptx
# uploads, so the placeholder names both formats.
pptx_or_ppt_to_text = gr.Interface(
    extract_text_from_ppt_or_pptx,
    gr.File(),
    gr.Textbox(placeholder="Extracted text from PPT/PPTX will appear here"),
    api_name="pptx_or_ppt_to_text",
)

# One tab per converter; the titles must stay in the same order as the
# interfaces they label.
demo = gr.TabbedInterface(
    [pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text],
    ["PDF to Image", "Extract PDF Text", "Extract DOC/DOCX Text", "Extract PPTX/PPT Text"],
)

# "0.0.0.0" (without a trailing dot) is the IPv4 wildcard address, which makes
# the server listen on all network interfaces.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)