Redmind committed
Commit 461c9df · verified · 1 Parent(s): c0b6a01

Upload file_upload.py

Files changed (1)
  1. file_upload.py +130 -0
file_upload.py ADDED
@@ -0,0 +1,130 @@
+ import os
+ import io
+ import hashlib
+ import json
+
+ import pandas as pd
+ from langchain_community.vectorstores import FAISS
+ from langchain_openai import OpenAIEmbeddings
+ from PyPDF2 import PdfReader
+ from docx import Document
+
+
+ class FileHandler:
+     def __init__(self, vector_db_path, open_api_key, grok_api_key):
+         self.vector_db_path = vector_db_path
+         self.openai_embeddings = OpenAIEmbeddings(api_key=open_api_key)
+         self.grok_api_key = grok_api_key
+
+     def handle_file_upload(self, file_name, file_content):
+         try:
+             # Normalize the file name: strip any directory part, replace
+             # spaces with underscores, and lowercase it.
+             base_file_name = os.path.basename(file_name)
+             formatted_file_name = base_file_name.replace(" ", "_").lower()
+
+             # Work with raw bytes throughout; the loaders below expect bytes.
+             if isinstance(file_content, bytes):
+                 file_bytes = file_content
+             else:
+                 file_bytes = file_content.encode("utf-8")
+
+             # MD5 serves only as a content fingerprint for deduplication.
+             file_hash = hashlib.md5(file_bytes).hexdigest()
+             file_key = f"{formatted_file_name}_{file_hash}"
+             vector_store_dir = os.path.join(self.vector_db_path, file_key)
+             os.makedirs(vector_store_dir, exist_ok=True)
+             vector_store_path = os.path.join(vector_store_dir, "index.faiss")
+
+             if os.path.exists(vector_store_path):
+                 return {"message": "File already processed."}
+
+             # Dispatch on file extension.
+             if file_name.endswith(".pdf"):
+                 texts, metadatas = self.load_and_split_pdf(file_bytes)
+             elif file_name.endswith(".docx"):
+                 texts, metadatas = self.load_and_split_docx(file_bytes)
+             elif file_name.endswith(".txt"):
+                 texts, metadatas = self.load_and_split_txt(file_bytes)
+             elif file_name.endswith(".xlsx"):
+                 texts, metadatas = self.load_and_split_table(file_bytes)
+             elif file_name.endswith(".csv"):
+                 texts, metadatas = self.load_and_split_csv(file_bytes)
+             else:
+                 raise ValueError("Unsupported file format.")
+
+             if not texts:
+                 return {"message": "No text extracted from the file. Check the file content."}
+
+             # Embed the chunks with OpenAI and persist the FAISS index locally.
+             vector_store = FAISS.from_texts(texts, self.openai_embeddings, metadatas=metadatas)
+             vector_store.save_local(vector_store_dir)
+
+             metadata = {
+                 "filename": file_name,
+                 "file_size": len(file_bytes),
+             }
+             metadata_path = os.path.join(vector_store_dir, "metadata.json")
+             with open(metadata_path, "w") as md_file:
+                 json.dump(metadata, md_file)
+
+             return {"message": "File processed successfully."}
+         except Exception as e:
+             return {"message": f"Error processing file: {e}"}
+
+     def load_and_split_pdf(self, content):
+         # One chunk per page, tagged with its 1-based page number.
+         reader = PdfReader(io.BytesIO(content))
+         texts = []
+         metadatas = []
+         for page_num, page in enumerate(reader.pages):
+             text = page.extract_text()
+             if text:
+                 texts.append(text)
+                 metadatas.append({"page_number": page_num + 1})
+         return texts, metadatas
+
+     def load_and_split_docx(self, content):
+         # One chunk per non-empty paragraph.
+         doc = Document(io.BytesIO(content))
+         texts = []
+         metadatas = []
+         for para_num, paragraph in enumerate(doc.paragraphs):
+             if paragraph.text:
+                 texts.append(paragraph.text)
+                 metadatas.append({"paragraph_number": para_num + 1})
+         return texts, metadatas
+
+     def load_and_split_txt(self, content):
+         # One chunk per non-blank line.
+         text = content.decode("utf-8")
+         texts = [line for line in text.split("\n") if line.strip()]
+         metadatas = [{} for _ in texts]  # fresh dict per chunk, not shared
+         return texts, metadatas
+
+     def load_and_split_table(self, content):
+         # One chunk per spreadsheet row, rendered as "column: value" pairs.
+         excel_data = pd.read_excel(io.BytesIO(content), sheet_name=None)
+         texts = []
+         metadatas = []
+         for sheet_name, df in excel_data.items():
+             df = df.dropna(how="all", axis=0).dropna(how="all", axis=1)
+             df = df.fillna("N/A")
+             for _, row in df.iterrows():
+                 row_text = ", ".join(f"{key}: {value}" for key, value in row.to_dict().items())
+                 texts.append(row_text)
+                 metadatas.append({"sheet_name": sheet_name})
+         return texts, metadatas
+
+     def load_and_split_csv(self, content):
+         # Same row-wise chunking as the Excel loader, tagged with the row index.
+         csv_data = pd.read_csv(io.BytesIO(content))
+         csv_data = csv_data.dropna(how="all", axis=0).dropna(how="all", axis=1)
+         csv_data = csv_data.fillna("N/A")
+         texts = []
+         metadatas = []
+         for row_index, row in csv_data.iterrows():
+             row_text = ", ".join(f"{key}: {value}" for key, value in row.to_dict().items())
+             texts.append(row_text)
+             metadatas.append({"row_index": row_index})
+         return texts, metadatas
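
For orientation, here is a minimal usage sketch of the class this commit adds. Everything in it is illustrative: the paths, API keys, file name, and query string are placeholders, and the retrieval step assumes the caller rebuilds the `<name>_<md5>` directory key the same way handle_file_upload derives it internally. None of this code is part of the commit.

import hashlib
import os

from file_upload import FileHandler
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Placeholder configuration -- not values from this commit.
VECTOR_DB_PATH = "./vector_dbs"
OPENAI_KEY = "sk-placeholder"
GROK_KEY = "xai-placeholder"  # accepted by __init__ but unused in this file

handler = FileHandler(VECTOR_DB_PATH, OPENAI_KEY, GROK_KEY)

# The handler takes the original file name plus the raw file content.
with open("report.pdf", "rb") as f:
    content = f.read()
print(handler.handle_file_upload("report.pdf", content))
# First upload: {'message': 'File processed successfully.'}
# Identical re-upload: {'message': 'File already processed.'}

# To query the saved index later, rebuild the directory key the same way
# handle_file_upload does, then load the FAISS store back from disk.
file_key = f"report.pdf_{hashlib.md5(content).hexdigest()}"
store = FAISS.load_local(
    os.path.join(VECTOR_DB_PATH, file_key),
    OpenAIEmbeddings(api_key=OPENAI_KEY),
    allow_dangerous_deserialization=True,  # required by recent langchain releases
)
print(store.similarity_search("total revenue", k=3))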