mvansegbroeck committed on
Commit
1420dd0
1 Parent(s): c2923d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -1
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
  import requests
3
  import os
 
4
  import markdownify
5
  import fitz # PyMuPDF
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -32,6 +33,15 @@ def markdown_to_text(md_path):
32
  with open(md_path, 'r') as file:
33
  return file.read()
34
 
 
 
 
 
 
 
 
 
 
35
  # Function to split text into chunks
36
  def split_text_into_chunks(text, chunk_size=25, chunk_overlap=5, min_chunk_chars=50):
37
  text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
@@ -91,9 +101,10 @@ def process_files(uploaded_files, use_example, chunk_size, chunk_overlap, min_ch
91
  text = markdown_to_text(file_path)
92
  else:
93
  text = ""
94
-
95
  markdown_text = markdownify.markdownify(text)
96
  file_id = os.path.splitext(os.path.basename(file_path))[0]
 
97
  markdown_path = os.path.join(output_dir, f"{file_id}.md")
98
  with open(markdown_path, 'w') as file:
99
  file.write(markdown_text)
 
1
  import gradio as gr
2
  import requests
3
  import os
4
+ import re
5
  import markdownify
6
  import fitz # PyMuPDF
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
33
  with open(md_path, 'r') as file:
34
  return file.read()
35
 
36
def sanitize_key(filename):
    """Turn a file name into a key made only of ASCII letters, digits and underscores.

    Spaces become underscores, every other character outside
    ``[a-zA-Z0-9_]`` is dropped, and the result is truncated to 100
    characters.

    If nothing survives sanitization (for example a name written entirely
    in a non-Latin script, or an empty name), a deterministic
    ``file_<hash>`` key derived from the original name is returned instead,
    so callers never receive an empty key and distinct names do not all
    collapse onto the same one.

    Args:
        filename: The raw name (typically a file stem without extension).

    Returns:
        A non-empty string of at most 100 characters matching
        ``[a-zA-Z0-9_]+``.
    """
    # Local import keeps this block self-contained.
    import hashlib

    # Replace spaces with underscores, then remove special characters
    # except for underscores.
    key = re.sub(r'[^a-zA-Z0-9_]', '', filename.replace(" ", "_"))
    # Ensure the key is not too long.
    key = key[:100]

    if not key:
        # Everything was stripped: derive a stable, unique-ish key from the
        # original name rather than returning "" (which would yield ".md").
        digest = hashlib.sha256(filename.encode("utf-8", "replace")).hexdigest()
        key = f"file_{digest[:12]}"
    return key
44
+
45
  # Function to split text into chunks
46
  def split_text_into_chunks(text, chunk_size=25, chunk_overlap=5, min_chunk_chars=50):
47
  text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
 
101
  text = markdown_to_text(file_path)
102
  else:
103
  text = ""
104
+
105
  markdown_text = markdownify.markdownify(text)
106
  file_id = os.path.splitext(os.path.basename(file_path))[0]
107
+ file_id = sanitize_key(file_id)
108
  markdown_path = os.path.join(output_dir, f"{file_id}.md")
109
  with open(markdown_path, 'w') as file:
110
  file.write(markdown_text)