gufett0 committed
Commit 2153a97
1 Parent(s): 532280e

HuggingFaceLLM

Files changed (3):
  1. .gitignore +1 -0
  2. backend2.py +0 -109
  3. requirements.txt +7 -10
.gitignore CHANGED
@@ -2,3 +2,4 @@
  __pycache__/
  appcompleta.py
  interface.py
+ backend2.py
backend2.py DELETED
@@ -1,109 +0,0 @@
- import os
- import logging
- from concurrent.futures import ThreadPoolExecutor
- from pypdf import PdfReader
- from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
- from langchain_community.vectorstores import FAISS
- #from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain_huggingface import HuggingFaceEmbeddings
- import time
- import torch
- from dotenv import load_dotenv
-
- logging.basicConfig(
-     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-     level=logging.DEBUG
- )
- logger = logging.getLogger(__name__)
- logging.getLogger('matplotlib').setLevel(logging.WARNING) # Suppress Matplotlib debug messages
-
- load_dotenv()
-
- logger.debug("Environment variables loaded.")
-
- def load_single_document(filepath):
-     if filepath.endswith('.pdf'):
-         with open(filepath, 'rb') as file:
-             pdf_reader = PdfReader(file)
-             text = " ".join([page.extract_text() for page in pdf_reader.pages])
-     elif filepath.endswith('.txt'):
-         with open(filepath, 'r', encoding='utf-8') as file:
-             text = file.read()
-     else:
-         logger.warning("Unsupported file type: %s", filepath)
-         return {"content": "", "source": filepath}
-
-     return {"content": text, "source": filepath}
-
- def load_documents(directory):
-     logger.debug("Loading documents from directory: %s", directory)
-     start_time = time.time()
-     filepaths = [os.path.join(directory, filename) for filename in os.listdir(directory) if filename.endswith('.pdf') or filename.endswith('.txt')]
-
-     if not filepaths:
-         logger.error("No documents found in the directory.")
-     else:
-         logger.debug("Found %d documents", len(filepaths))
-
-     documents = []
-     with ThreadPoolExecutor() as executor:
-         documents = list(executor.map(load_single_document, filepaths))
-
-     end_time = time.time()
-     logger.debug("Loaded %d documents in %.2f seconds.", len(documents), end_time - start_time)
-     return documents
-
- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
- def prepare_documents(documents):
-     logger.debug("Preparing documents for embedding.")
-     start_time = time.time()
-
-     if not documents:
-         logger.error("No documents to prepare.")
-         return None
-
-     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-     texts = text_splitter.create_documents([doc["content"] for doc in documents], metadatas=[{"source": os.path.basename(doc["source"])} for doc in documents])
-
-     if not texts:
-         logger.error("No texts to embed after splitting.")
-         return None
-
-     logger.debug(f"Created {len(texts)} text chunks.")
-
-     modelPath = "sentence-transformers/all-MiniLM-l6-v2"
-     model_kwargs = {'device': device}
-     encode_kwargs = {'normalize_embeddings': False}
-     embeddings = HuggingFaceEmbeddings(model_name=modelPath, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)
-
-     try:
-         db = FAISS.from_documents(texts, embeddings)
-         logger.debug("FAISS index created successfully.")
-     except Exception as e:
-         logger.error(f"Error creating FAISS index: {e}")
-         return None
-
-     end_time = time.time()
-     logger.debug(f"Documents prepared in {end_time - start_time:.2f} seconds.")
-     return db
-
- def get_context_sources(question, db):
-     start_time = time.time()
-
-     if db is None:
-         logger.error("Database is None. Cannot perform similarity search.")
-         return "", ""
-
-     try:
-         docs = db.similarity_search(question, k=3)
-         context = " ".join([doc.page_content for doc in docs])
-         sources = ", ".join(set([doc.metadata['source'] for doc in docs]))
-     except Exception as e:
-         logger.error(f"Error during similarity search: {e}")
-         return "", ""
-
-     end_time = time.time()
-     logger.debug(f"Similarity search done in {end_time - start_time:.2f} seconds.")
-
-     return context, sources
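Taken together, the deleted module was a small retrieval pipeline: parallel PDF/TXT loading, 1000-character chunking with MiniLM embeddings into a FAISS index, then top-3 similarity search returning a context string and de-duplicated source filenames. A minimal sketch of how its three functions chained together before this commit (the "docs" directory and the question string are assumptions for illustration, not taken from the repo):

    # Hypothetical pre-commit usage of the deleted backend2.py API.
    from backend2 import load_documents, prepare_documents, get_context_sources

    documents = load_documents("docs")   # ThreadPoolExecutor loads PDFs/TXTs in parallel
    db = prepare_documents(documents)    # chunk -> MiniLM embeddings -> FAISS index
    if db is not None:
        context, sources = get_context_sources("What is this corpus about?", db)
        print(sources)                   # comma-separated set of source filenames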
 
 
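With langchain-community, langchain-huggingface, and faiss-cpu dropped from requirements.txt below, the retained llama-index packages can cover the same load/embed/retrieve flow. A rough equivalent, offered only as a sketch (the directory, question, and top-k mirror the deleted code; nothing here is taken from the new app code):

    # Sketch: the removed LangChain/FAISS flow rebuilt on the llama-index stack
    # kept in requirements.txt (llama-index, llama-index-embeddings-huggingface,
    # llama-index-readers-file). Paths and the question are illustrative.
    from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
    from llama_index.embeddings.huggingface import HuggingFaceEmbedding

    # Same sentence-transformers model the deleted backend embedded with.
    Settings.embed_model = HuggingFaceEmbedding(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    documents = SimpleDirectoryReader("docs").load_data()  # reads PDF and TXT files
    index = VectorStoreIndex.from_documents(documents)     # in-memory vector index
    nodes = index.as_retriever(similarity_top_k=3).retrieve("What is this corpus about?")
    context = " ".join(n.node.get_content() for n in nodes)
    sources = ", ".join({n.node.metadata.get("file_name", "?") for n in nodes})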
requirements.txt CHANGED
@@ -1,4 +1,4 @@
- python-dotenv==0.21.0
+ python-dotenv
  llama-index
  llama-index-embeddings-huggingface
  llama-index-llms-huggingface
@@ -7,19 +7,16 @@ sentence-transformers==2.2.2
  llama-index-readers-web
  llama-index-readers-file

- accelerate==0.33.0
- gradio==4.39.0
- spaces==0.29.2
- torch==2.2.0
- transformers==4.43.3
- llama-cpp-agent>=0.2.25
+ accelerate
+ gradio
+ spaces
+ torch
+ transformers
+ llama-cpp-agent
  setuptools
- faiss-cpu

  pydantic
  ipython
  #keras
  #keras-nlp
  #tensorflow
- langchain-community
- langchain-huggingface
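
The commit title, HuggingFaceLLM, matches the llama-index-llms-huggingface package kept in the list above. A minimal construction sketch of that class, with placeholder model names (the commit does not show which model the app actually loads):

    # Sketch: llama-index's HuggingFaceLLM wrapper around a transformers model.
    # The Zephyr checkpoint is a placeholder, not taken from this commit.
    from llama_index.llms.huggingface import HuggingFaceLLM

    llm = HuggingFaceLLM(
        model_name="HuggingFaceH4/zephyr-7b-beta",
        tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
        context_window=2048,
        max_new_tokens=256,
        device_map="auto",  # uses accelerate, also kept in requirements.txt
    )
    print(llm.complete("Say hello.").text)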