{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the raw text file into `text`.\n",
    "# The encoding is explicit so decoding does not depend on the OS locale default.\n",
    "with open(\n",
    "    \"./raw_data/dale_carnegie/how_to_win_friends_and_influence_people.txt\",\n",
    "    \"r\",\n",
    "    encoding=\"utf-8\",\n",
    ") as f:\n",
    "    text = f.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "139"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain_text_splitters import RecursiveCharacterTextSplitter, Language\n",
    "\n",
    "# Baseline: character-based splitting using the Markdown separator set.\n",
    "# Kept under its own names so the semantic split in the next cell does not\n",
    "# silently overwrite it (and vice versa when cells are re-run out of order).\n",
    "recursive_splitter = RecursiveCharacterTextSplitter(\n",
    "    separators=RecursiveCharacterTextSplitter.get_separators_for_language(\n",
    "        Language.MARKDOWN\n",
    "    ),\n",
    "    chunk_size=4096,\n",
    "    keep_separator=True,\n",
    ")\n",
    "\n",
    "recursive_chunks = recursive_splitter.split_text(text)\n",
    "len(recursive_chunks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "207"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Semantic Splitting\n",
    "from langchain.storage import LocalFileStore\n",
    "from langchain.embeddings.cache import CacheBackedEmbeddings\n",
    "from langchain_experimental.text_splitter import SemanticChunker\n",
    "from langchain_openai.embeddings import OpenAIEmbeddings\n",
    "\n",
    "underlying_embedder = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n",
    "\n",
    "# File-backed store for embeddings, kept in a local cache directory.\n",
    "store = LocalFileStore(\"./.cache/embeddings\")\n",
    "\n",
    "# Wrap the embedder with the cache; entries are namespaced by the model name.\n",
    "embedder = CacheBackedEmbeddings.from_bytes_store(\n",
    "    underlying_embedder, store, namespace=underlying_embedder.model\n",
    ")\n",
    "\n",
    "# `splitted_text` holds the semantic chunks that are written to parquet below.\n",
    "text_splitter = SemanticChunker(embedder)\n",
    "splitted_text = text_splitter.split_text(text)\n",
    "len(splitted_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Persist the semantic chunks as a one-column (\"text\") parquet file.\n",
    "df = pd.DataFrame(splitted_text, columns=[\"text\"])\n",
    "df.to_parquet(\n",
    "    \"./raw_data/dale_carnegie/how_to_win_friends_and_influence_people.parquet\"\n",
    ")"
   ]
  },
  { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | text | \n", "
---|---|
0 | \n", "# PART ONE: Fundamental Techniques in Handling... | \n", "
1 | \n", "So they rationalize, they explain. They can te... | \n", "
2 | \n", "F. Skinner, the world-famous psychologist, pro... | \n", "
3 | \n", "Johnston of Enid, Oklahoma, is the safety coor... | \n", "
4 | \n", "Fall was condemned viciously - condemned as fe... | \n", "
... | \n", "... | \n", "
202 | \n", "Reminders, admonitions, confrontations, with h... | \n", "
203 | \n", "Perhaps. But that is what they said to Napoleo... | \n", "
204 | \n", "Ernest Gent of Scarsdale, New York, was troubl... | \n", "
205 | \n", "She tried coaxing. Neither worked. Then she tr... | \n", "
206 | \n", ". PRINCIPLE 9 Make the other person happy abou... | \n", "
207 rows × 1 columns
\n", "\n", " | text | \n", "
---|---|
0 | \n", "# PART ONE: Fundamental Techniques in Handling... | \n", "
1 | \n", "So they rationalize, they explain. They can te... | \n", "
2 | \n", "F. Skinner, the world-famous psychologist, pro... | \n", "
3 | \n", "Johnston of Enid, Oklahoma, is the safety coor... | \n", "
4 | \n", "Fall was condemned viciously - condemned as fe... | \n", "
... | \n", "... | \n", "
202 | \n", "Reminders, admonitions, confrontations, with h... | \n", "
203 | \n", "Perhaps. But that is what they said to Napoleo... | \n", "
204 | \n", "Ernest Gent of Scarsdale, New York, was troubl... | \n", "
205 | \n", "She tried coaxing. Neither worked. Then she tr... | \n", "
206 | \n", ". PRINCIPLE 9 Make the other person happy abou... | \n", "
180 rows × 1 columns
\n", "\n", " | text | \n", "
---|---|
0 | \n", "# PART ONE: Fundamental Techniques in Handling... | \n", "
1 | \n", "So they rationalize, they explain. They can te... | \n", "
2 | \n", "F. Skinner, the world-famous psychologist, pro... | \n", "
3 | \n", "Johnston of Enid, Oklahoma, is the safety coor... | \n", "
4 | \n", "Fall was condemned viciously - condemned as fe... | \n", "
... | \n", "... | \n", "
202 | \n", "Reminders, admonitions, confrontations, with h... | \n", "
203 | \n", "Perhaps. But that is what they said to Napoleo... | \n", "
204 | \n", "Ernest Gent of Scarsdale, New York, was troubl... | \n", "
205 | \n", "She tried coaxing. Neither worked. Then she tr... | \n", "
206 | \n", "PRINCIPLE 9 Make the other person happy about ... | \n", "
180 rows × 1 columns
\n", "