import os

from langchain.text_splitter import (
    CharacterTextSplitter,
    NLTKTextSplitter,
    RecursiveCharacterTextSplitter,
    SpacyTextSplitter,
)

# Source directory containing the raw .txt documents to be chunked.
docs_path = "/Users/egozlan/Projects/genai_course/week5/rag-gradio-sample-project/docs"

# Four alternative chunking strategies, written to separate output
# directories so their results can be compared side by side.
char_text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=256,
    chunk_overlap=20,
)
recursive_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=256,
    chunk_overlap=20,
)
nltk_text_splitter = NLTKTextSplitter()
spacy_text_splitter = SpacyTextSplitter()


def split_chunks(splitter, input_dir, output_dir):
    """Split every text file in *input_dir* and write the chunks to *output_dir*.

    Each file is read as UTF-8, split with *splitter*, and each resulting
    chunk is written to ``{output_dir}/{stem}_{index}.txt`` where *stem* is
    the source filename without its extension.

    Args:
        splitter: a LangChain text splitter exposing ``create_documents``.
        input_dir: directory of input text files (non-files are skipped).
        output_dir: destination directory; created if it does not exist.
    """
    files_split = {}
    for filename in os.listdir(input_dir):
        file_path = os.path.join(input_dir, filename)
        # Skip subdirectories and other non-regular entries.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        texts = splitter.create_documents([text])
        # Use splitext rather than filename[:-4] so extensions of any
        # length (or none) are handled correctly.
        files_split[os.path.splitext(filename)[0]] = texts

    os.makedirs(output_dir, exist_ok=True)
    for key, texts in files_split.items():
        for index, text in enumerate(texts):
            file_path = os.path.join(output_dir, f"{key}_{index}.txt")
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(text.page_content)
    print('Chunks written')


split_chunks(
    char_text_splitter,
    docs_path,
    '/Users/egozlan/Projects/genai_course/week5/rag-gradio-sample-project/char_split_docs',
)
split_chunks(
    recursive_text_splitter,
    docs_path,
    '/Users/egozlan/Projects/genai_course/week5/rag-gradio-sample-project/recursive_split_docs',
)
split_chunks(
    nltk_text_splitter,
    docs_path,
    '/Users/egozlan/Projects/genai_course/week5/rag-gradio-sample-project/nltk_split_docs',
)
# BUG FIX: the original passed nltk_text_splitter here, so the spaCy
# splitter was instantiated but never used and spacy_split_docs contained
# NLTK-produced chunks.
split_chunks(
    spacy_text_splitter,
    docs_path,
    '/Users/egozlan/Projects/genai_course/week5/rag-gradio-sample-project/spacy_split_docs',
)