import os
import zipfile

import gradio as gr
from PyPDF4 import PdfFileReader
import tiktoken


def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*."""
    with open(file_path, "rb") as file:
        pdf = PdfFileReader(file)
        # The pages are read while the file is still open; join once instead
        # of growing a string page by page.
        return "".join(
            pdf.getPage(page_num).extractText()
            for page_num in range(pdf.getNumPages())
        )


def tokenize(text, model="gpt-3.5-turbo"):
    """Encode *text* with the tiktoken encoding registered for *model*.

    ``disallowed_special=()`` makes special-token strings that appear in the
    text get encoded as ordinary text instead of raising.
    """
    tokenizer = tiktoken.encoding_for_model(model)
    return tokenizer.encode(text, disallowed_special=())


def count_tokens(text):
    """Return the number of tokens :func:`tokenize` produces for *text*."""
    return len(tokenize(text))


def analyse_text(text):
    """Return a short multi-line report about *text*.

    The report lists the character count, the token count and the average
    number of characters per token. For empty (or missing) text, or text that
    yields no tokens, the literal string ``'no text'`` is returned.
    """
    if not text:
        return 'no text'

    num_tokens = count_tokens(text)
    if num_tokens == 0:
        # Guard the division below instead of relying on a swallowed
        # ZeroDivisionError.
        return 'no text'

    return '\n'.join([
        f"Text length: {len(text)}",
        f"Token counts: {num_tokens}",
        f"Char per token: {len(text) / num_tokens:.1f}",
    ])


def analyse_file(file):
    """Return the text extracted from the uploaded PDF *file*.

    *file* is expected to expose the path of the upload as ``file.name``.
    """
    return extract_text_from_pdf(file.name)


def write_chunks_to_files(chunks):
    """Write each chunk to ``chunk_<i>.txt`` (1-based) in the working directory.

    Returns the list of file paths written, in chunk order.
    """
    file_paths = []
    for i, chunk in enumerate(chunks, start=1):
        file_path = f"chunk_{i}.txt"
        # Explicit UTF-8: PDF text often contains characters that the
        # platform default encoding cannot represent.
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(chunk)
        file_paths.append(file_path)
    return file_paths


def write_chunks_to_zip(chunks):
    """Bundle *chunks* into ``chunks.zip`` and return that file name.

    Each chunk is first written to its own text file, added to the archive,
    and the intermediate text files are removed afterwards, even when
    building the archive fails part-way.
    """
    file_paths = write_chunks_to_files(chunks)
    zip_file_name = "chunks.zip"
    try:
        with zipfile.ZipFile(zip_file_name, 'w') as zipf:
            for path in file_paths:
                zipf.write(path)
    finally:
        # Remove the intermediate files whether or not zipping succeeded.
        for path in file_paths:
            try:
                os.remove(path)
            except FileNotFoundError:
                pass
    return zip_file_name


def chunk_text(text, max_char, overlap):
    """Split *text* into chunks of at most *max_char* characters.

    Consecutive chunks share *overlap* characters. Each entry of the returned
    list is a tuple ``(chunk, number_of_characters, number_of_tokens)``.
    Empty text yields an empty list.

    Raises:
        ValueError: if *max_char* is not positive, or *overlap* is negative
            or not smaller than *max_char* (the window could never advance).
    """
    # UI sliders may deliver floats; slicing needs integers.
    max_char = int(max_char)
    overlap = int(overlap)
    if max_char <= 0:
        raise ValueError("max_char must be a positive number of characters")
    if not 0 <= overlap < max_char:
        raise ValueError(
            "overlap must be at least 0 and smaller than the chunk size"
        )

    step = max_char - overlap
    chunks = []
    start = 0
    while start < len(text):
        end = start + max_char
        chunk = text[start:end]
        chunks.append((chunk, len(chunk), count_tokens(chunk)))
        if end >= len(text):
            # This chunk reached the end of the text; a further chunk would
            # only repeat the overlap already contained in this one.
            break
        start += step
    return chunks


def chunk_and_zip_text(text, max_char, overlap):
    """Chunk *text*, zip the chunks and describe them.

    Returns a tuple ``(summary, zip_path)`` where *summary* has one line per
    chunk with its size in characters and tokens, and *zip_path* is the name
    of the archive holding the chunk files.

    Raises:
        gr.Error: if the chunk size / overlap combination is invalid, so the
            message is reported through the Gradio UI.
    """
    try:
        chunks = chunk_text(text, max_char, overlap)
    except ValueError as err:
        raise gr.Error(str(err)) from err

    formatted_chunks = [
        f"Chunk[{i}]: Size: {len(c[0])} chars, {c[2]} tokens"
        for i, c in enumerate(chunks, start=1)
    ]
    zip_file_path = write_chunks_to_zip([c[0] for c in chunks])
    return '\n'.join(formatted_chunks), zip_file_path


def chunk_file(file, max_char, overlap):
    """Extract the text of the uploaded PDF *file*, then chunk and zip it.

    Returns the same ``(summary, zip_path)`` tuple as
    :func:`chunk_and_zip_text`.
    """
    text = extract_text_from_pdf(file.name)
    return chunk_and_zip_text(text, max_char, overlap)


with gr.Blocks() as demo:
    docs_input = gr.File(file_count="single", file_types=[".pdf"])
    text_to_chunk = gr.Textbox(label='Text to chunk', show_copy_button=True)
    tb_analysis = gr.Textbox(label='Text Analysis')
    sl_max_char_per_chunk = gr.Slider(
        1000, 300000, value=10000,
        label="Number of characters",
        info="Choose a number of characters per chunk",
    )
    sl_overlap = gr.Slider(
        0, 20000, value=400,
        label="Overlap",
        info="Choose overlap size",
    )
    btn_chunk = gr.Button("Chunk text")
    tb_chunked_text = gr.Textbox(label='Chunks Info')
    download_link = gr.File(label='Download Chunks')

    # When a file is uploaded, put its extracted text into text_to_chunk.
    docs_input.upload(analyse_file, inputs=[docs_input], outputs=[text_to_chunk])
    # Whenever that text changes, refresh the statistics shown in tb_analysis.
    text_to_chunk.change(analyse_text, inputs=[text_to_chunk], outputs=[tb_analysis])
    btn_chunk.click(
        chunk_and_zip_text,
        inputs=[text_to_chunk, sl_max_char_per_chunk, sl_overlap],
        outputs=[tb_chunked_text, download_link],
    )

demo.launch(debug=True, share=False)