Abs6187 committed on
Commit
cf664b9
1 Parent(s): ffa232c

Update Ingest.py

Files changed (1)
  1. Ingest.py +13 -61
Ingest.py CHANGED
@@ -1,61 +1,13 @@
- import ray
- import logging
- from langchain_community.document_loaders import DirectoryLoader
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.vectorstores import FAISS
- from faiss import IndexFlatL2 # Assuming using L2 distance for simplicity
-
- # Initialize Ray
- ray.init()
-
- # Set up basic configuration for logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
- # Load documents with logging
- logging.info("Loading documents...")
- loader = DirectoryLoader('data', glob="./*.txt")
- documents = loader.load()
-
- # Extract text from documents and split into manageable texts with logging
- logging.info("Extracting and splitting texts from documents...")
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
- texts = []
- for document in documents:
-     if hasattr(document, 'get_text'):
-         text_content = document.get_text() # Adjust according to actual method
-     else:
-         text_content = "" # Default to empty string if no text method is available
-
-     texts.extend(text_splitter.split_text(text_content))
-
- # Define embedding function
- def embedding_function(text):
-     embeddings_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
-     return embeddings_model.embed_query(text)
-
- # Create FAISS index for embeddings
- index = IndexFlatL2(768) # Dimension of embeddings, adjust as needed
-
- # Assuming docstore as a simple dictionary to store document texts
- docstore = {i: text for i, text in enumerate(texts)}
- index_to_docstore_id = {i: i for i in range(len(texts))}
-
- # Initialize FAISS
- faiss_db = FAISS(embedding_function, index, docstore, index_to_docstore_id)
-
- # Process and store embeddings
- logging.info("Storing embeddings in FAISS...")
- for i, text in enumerate(texts):
-     embedding = embedding_function(text)
-     faiss_db.add_documents([embedding])
-
- # Exporting the vector embeddings database with logging
- logging.info("Exporting the vector embeddings database...")
- faiss_db.save_local("ipc_embed_db")
-
- # Log a message to indicate the completion of the process
- logging.info("Process completed successfully.")
-
- # Shutdown Ray after the process
- ray.shutdown()
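
Note: the deleted script would not have run as written. LangChain Document objects expose page_content rather than get_text, so every chunk would have come out empty; the FAISS constructor expects a Docstore instance, not a plain dict; and add_documents takes Document objects, not raw embedding vectors. For reference, a minimal corrected sketch of the same ingestion flow (illustrative only, not part of this commit) that keeps the original data/ directory, chunking parameters, law-ai/InLegalBERT embeddings, and ipc_embed_db output path, but swaps the manual index wiring for the standard FAISS.from_documents helper:

import logging

from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Load and chunk the source texts, as in the removed script.
logging.info("Loading documents...")
documents = DirectoryLoader("data", glob="./*.txt").load()

logging.info("Splitting documents...")
splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
chunks = splitter.split_documents(documents)  # Documents carry text in .page_content

# Build the index; from_documents handles the index/docstore wiring internally.
logging.info("Embedding and indexing with FAISS...")
embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
faiss_db = FAISS.from_documents(chunks, embeddings)

logging.info("Exporting the vector embeddings database...")
faiss_db.save_local("ipc_embed_db")
logging.info("Process completed successfully.")

The replacement below drops this ingestion pipeline entirely in favor of loading a local causal language model.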
 
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import os
+
+ def load_model():
+     """Loads tokenizer and model from local files."""
+     model_dir = "./" # Ensure all model files are in the root directory
+     tokenizer = AutoTokenizer.from_pretrained(model_dir, config="config.json")
+     model = AutoModelForCausalLM.from_pretrained(model_dir)
+     return tokenizer, model
+
+ if __name__ == "__main__":
+     tokenizer, model = load_model()
+     print("Model and tokenizer loaded successfully.")
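
The new load_model is a thin wrapper around the transformers Auto classes (the os import is unused, and from_pretrained would pick up config.json from model_dir on its own). A hedged smoke-test sketch, assuming compatible model weights and config.json sit next to Ingest.py; the prompt text is made up for illustration:

# Hypothetical usage; assumes Ingest.py is importable from the current
# working directory and the model files are present alongside it.
from Ingest import load_model

tokenizer, model = load_model()

# Tokenize a sample prompt and generate a short continuation.
inputs = tokenizer("The Indian Penal Code defines", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))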