Chris4K committed on
Commit
c02fe70
1 Parent(s): 1c1f687

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -2
app.py CHANGED
@@ -46,14 +46,28 @@ from langchain_community.document_loaders import TextLoader
46
  def load_txt(path="./a.cv.ckaller.2024.txt"):
47
  loader = TextLoader(path)
48
  document = loader.load()
 
 
 
 
 
 
 
 
 
 
 
 
49
  # split the document into chunks
50
- text_splitter = RecursiveCharacterTextSplitter(
51
  chunk_size=1500,
52
  chunk_overlap=250,
53
  length_function=len,
54
  is_separator_regex=False,
55
  )
56
- document_chunks = text_splitter.split_documents(document)
 
 
57
  #######
58
  '''
59
  FAISS
@@ -66,6 +80,16 @@ def load_txt(path="./a.cv.ckaller.2024.txt"):
66
  embeddings = HuggingFaceBgeEmbeddings(
67
  model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
68
  )
 
 
 
 
 
 
 
 
 
 
69
  # load from disk
70
  vector_store = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)
71
 
 
46
  def load_txt(path="./a.cv.ckaller.2024.txt"):
47
  loader = TextLoader(path)
48
  document = loader.load()
49
+
50
+
51
+
52
+ ####
53
+
54
+ from langchain_experimental.text_splitter import SemanticChunker
55
+ with open(path) as f:
56
+ state_of_the_union = f.read()
57
+
58
+
59
+
60
+ ######
61
  # split the document into chunks
62
+ a_text_splitter = RecursiveCharacterTextSplitter(
63
  chunk_size=1500,
64
  chunk_overlap=250,
65
  length_function=len,
66
  is_separator_regex=False,
67
  )
68
+ a_document_chunks = text_splitter.split_documents(document)
69
+
70
+
71
  #######
72
  '''
73
  FAISS
 
80
  embeddings = HuggingFaceBgeEmbeddings(
81
  model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
82
  )
83
+
84
+
85
+ #####
86
+
87
+
88
+ text_splitter = SemanticChunker(embeddings)
89
+
90
+ document_chunks = text_splitter.create_documents([state_of_the_union])
91
+ print(document_chunks[0].page_content)
92
+
93
  # load from disk
94
  vector_store = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)
95