Spaces:
Runtime error
Runtime error
File size: 3,744 Bytes
14a6f5b c96166d 9f38a4d c96166d 14a6f5b 6cfb094 14a6f5b 6cfb094 c96166d 14a6f5b c96166d 14a6f5b c96166d 6cfb094 14a6f5b 6cfb094 14a6f5b 6cfb094 14a6f5b 6cfb094 14a6f5b 6cfb094 c96166d 14a6f5b 6cfb094 14a6f5b 6cfb094 14a6f5b fe257d3 14a6f5b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
import os
import zipfile
import gradio as gr
from PyPDF4 import PdfFileReader
import tiktoken
def extract_text_from_pdf(file_path):
    """Read the PDF at *file_path* and return the concatenated text of all pages."""
    pages = []
    with open(file_path, "rb") as handle:
        reader = PdfFileReader(handle)
        for page_index in range(reader.getNumPages()):
            pages.append(reader.getPage(page_index).extractText())
    return "".join(pages)
def tokenize(text, model="gpt-3.5-turbo"):
    """Encode *text* into a list of token ids using the tokenizer for *model*."""
    encoder = tiktoken.encoding_for_model(model)
    return encoder.encode(text, disallowed_special=())
def count_tokens(text):
    """Return how many tokens *text* encodes to."""
    token_ids = tokenize(text)
    return len(token_ids)
def analyse_text(text):
    """Summarise *text*: character count, token count, and chars-per-token ratio.

    Returns the summary as a newline-joined string, or the literal string
    'no text' when the input is empty (the ratio would divide by zero).

    Bug fixed: the original assigned the *string* 'no text' to ``result`` in the
    except branch and then returned ``'\n'.join(result)``, which joins the
    characters of the string and yields "n\no\n \nt\ne\nx\nt". It also used a
    bare ``except:``; only ZeroDivisionError can actually occur inside the try.
    """
    num_tokens = count_tokens(text)
    # Empty text (or a tokenizer returning zero tokens) has no meaningful ratio.
    if not text or num_tokens == 0:
        return 'no text'
    result = [
        f"Text length: {len(text)}",
        f"Token counts: {num_tokens}",
        f"Char per token: {'%.1f' % (len(text) / num_tokens)}",
    ]
    return '\n'.join(result)
def analyse_file(file):
    """Extract and return the raw text of an uploaded PDF (a Gradio file object)."""
    return extract_text_from_pdf(file.name)
def write_chunks_to_files(chunks):
    """Write each chunk to its own numbered text file in the working directory.

    Args:
        chunks: iterable of strings; chunk *i* (1-based) is written to
            ``chunk_i.txt``.

    Returns:
        List of the file paths written, in order.

    Fix: the files are now written with an explicit ``encoding="utf-8"``.
    The original relied on the platform default encoding, which can raise
    ``UnicodeEncodeError`` for non-ASCII PDF text (e.g. cp1252 on Windows).
    """
    file_paths = []
    for i, chunk in enumerate(chunks, start=1):
        file_path = f"chunk_{i}.txt"
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(chunk)
        file_paths.append(file_path)
    return file_paths
def write_chunks_to_zip(chunks):
    """Write each chunk to a numbered text file, bundle the files into
    ``chunks.zip``, delete the intermediate files, and return the zip name."""
    zip_file_name = "chunks.zip"
    temp_paths = write_chunks_to_files(chunks)
    with zipfile.ZipFile(zip_file_name, 'w') as archive:
        for path in temp_paths:
            archive.write(path)
            # The standalone text file is redundant once it is in the archive.
            os.remove(path)
    return zip_file_name
def chunk_text(text, max_char, overlap):
    """Split *text* into overlapping chunks of at most *max_char* characters.

    Consecutive chunks share *overlap* characters, i.e. each chunk starts
    ``max_char - overlap`` characters after the previous one.

    Args:
        text: the string to split.
        max_char: maximum characters per chunk (must exceed *overlap*).
        overlap: characters shared between consecutive chunks.

    Returns:
        List of ``(chunk, char_count, token_count)`` tuples.

    Raises:
        ValueError: if ``overlap >= max_char``. Bug fixed: the original
        advanced ``start`` by ``max_char - overlap`` with no guard, so a
        non-positive step (reachable from the UI sliders: overlap up to
        20000, max_char as low as 1000) looped forever.
    """
    step = max_char - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than max_char")
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + max_char, len(text))
        chunk = text[start:end]
        chunks.append((chunk, len(chunk), count_tokens(chunk)))
        start += step
    return chunks
def chunk_file(file, max_char, overlap):
    """Chunk the text of an uploaded PDF and zip the chunks.

    Returns a tuple of (human-readable chunk summary, path to the zip file).
    """
    text = extract_text_from_pdf(file.name)
    chunks = chunk_text(text, max_char, overlap)
    summary_lines = []
    for index, (body, _chars, tokens) in enumerate(chunks, start=1):
        summary_lines.append(f"Chunk[{index}]: Size: {len(body)} chars, {tokens} tokens")
    zip_file_path = write_chunks_to_zip([body for body, _chars, _tokens in chunks])
    return '\n'.join(summary_lines), zip_file_path
def chunk_and_zip_text(text, max_char, overlap):
    """Chunk raw *text* and zip the chunks.

    Returns a tuple of (human-readable chunk summary, path to the zip file).
    """
    chunks = chunk_text(text, max_char, overlap)
    summary_lines = []
    for index, (body, _chars, tokens) in enumerate(chunks, start=1):
        summary_lines.append(f"Chunk[{index}]: Size: {len(body)} chars, {tokens} tokens")
    zip_file_path = write_chunks_to_zip([body for body, _chars, _tokens in chunks])
    return '\n'.join(summary_lines), zip_file_path
# Gradio UI: upload a PDF (or paste text), analyse it, then chunk it into
# overlapping pieces and download the chunks as a zip archive.
with gr.Blocks() as demo:
    # Input widgets.
    docs_input = gr.File(file_count="single", file_types=[".pdf"])
    text_to_chunk = gr.Textbox(label='Text to chunk',show_copy_button=True)
    tb_analysis = gr.Textbox(label='Text Analysis')
    # Chunking parameters. NOTE(review): the overlap slider's maximum (20000)
    # can exceed the character slider's minimum (1000) — see chunk_text.
    sl_max_char_per_chunk = gr.Slider(1000, 300000, value=10000, label="Number of characters", info="Choose a number of characters per chunk")
    sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")
    btn_chunk = gr.Button("Chunk text")
    # Output widgets: per-chunk size/token summary and the downloadable zip.
    tb_chunked_text = gr.Textbox(label='Chunks Info')
    download_link = gr.File(label='Download Chunks')
    # Call analyse_file when a file is uploaded and display the results in tb_analysis
    docs_input.upload(analyse_file,inputs=[docs_input], outputs=[text_to_chunk])
    # Re-analyse whenever the text box changes (including after a PDF upload).
    text_to_chunk.change(analyse_text,inputs=[text_to_chunk],outputs=[tb_analysis])
    # Chunk + zip on button click; fills the summary box and the download link.
    btn_chunk.click(chunk_and_zip_text, inputs=[text_to_chunk, sl_max_char_per_chunk, sl_overlap], outputs=[tb_chunked_text, download_link])
demo.launch(debug=True, share=False)
|