Add CSV data indexing
Browse files
main.py
CHANGED
@@ -6,6 +6,8 @@ import json
|
|
6 |
import os
|
7 |
import logging
|
8 |
from txtai.embeddings import Embeddings
|
|
|
|
|
9 |
|
10 |
# Set up logging
|
11 |
logging.basicConfig(level=logging.INFO)
|
@@ -105,6 +107,41 @@ async def query_index(request: QueryRequest):
|
|
105 |
logger.error(f"Error querying index: {str(e)}")
|
106 |
raise HTTPException(status_code=500, detail=f"Error querying index: {str(e)}")
|
107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
if __name__ == "__main__":
|
109 |
import uvicorn
|
110 |
uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
6 |
import os
|
7 |
import logging
|
8 |
from txtai.embeddings import Embeddings
|
9 |
+
import pandas as pd
|
10 |
+
import glob
|
11 |
|
12 |
# Set up logging
|
13 |
logging.basicConfig(level=logging.INFO)
|
|
|
107 |
logger.error(f"Error querying index: {str(e)}")
|
108 |
raise HTTPException(status_code=500, detail=f"Error querying index: {str(e)}")
|
109 |
|
110 |
+
def process_csv_file(file_path):
    """Turn each data row of a CSV file into one space-joined text document.

    Every cell is read as raw text, so the document contains the values
    exactly as they are written in the file: integers are not upcast to
    floats (``1`` stays ``1`` instead of becoming ``1.0``) and empty cells
    are skipped instead of being rendered as the literal string ``nan``.

    Args:
        file_path: Path of the CSV file to read. The first line is treated
            as the header and is not turned into a document.

    Returns:
        A ``(txtai_data, documents)`` tuple where ``documents`` is a list
        with one string per data row and ``txtai_data`` is the matching list
        of ``(row_number, text, None)`` tuples. If the file cannot be read
        or parsed, the error is logged and ``(None, None)`` is returned.
    """
    try:
        # dtype=str + keep_default_na=False: keep cells as the text found in
        # the file instead of letting pandas infer numbers / NaN markers.
        # fillna("") covers cells pandas still reports as missing (e.g. rows
        # shorter than the header).
        df = pd.read_csv(file_path, dtype=str, keep_default_na=False).fillna("")

        # Plain tuples avoid the per-row Series that DataFrame.apply(axis=1)
        # builds, which is what coerced mixed numeric rows to a single dtype.
        documents = [
            " ".join(cell for cell in row if cell)
            for row in df.itertuples(index=False, name=None)
        ]
        txtai_data = [(i, text, None) for i, text in enumerate(documents)]
        return txtai_data, documents
    except Exception as e:
        # Best-effort contract: callers test the result for truthiness, so
        # report the problem and signal failure instead of raising.
        logger.error(f"Error processing CSV file {file_path}: {str(e)}")
        return None, None
|
119 |
+
|
120 |
+
def check_and_index_csv_files():
    """Index every CSV file in ``/app/index_data`` that has no index yet.

    For each ``*.csv`` file the index id is the file name without its
    extension. A file is skipped when ``/app/indexes/<index id>`` already
    exists. Otherwise it is parsed with ``process_csv_file``, passed to the
    module-level ``embeddings.index`` and stored with ``save_embeddings``.
    Nothing is returned; progress and problems are only logged.
    """
    index_data_folder = "/app/index_data"
    if not os.path.exists(index_data_folder):
        logger.warning(f"index_data folder not found: {index_data_folder}")
        return

    csv_pattern = os.path.join(index_data_folder, "*.csv")
    for csv_file in glob.glob(csv_pattern):
        file_name = os.path.basename(csv_file)
        index_id = os.path.splitext(file_name)[0]

        # An existing path under /app/indexes means this file was handled
        # on a previous run.
        if os.path.exists(f"/app/indexes/{index_id}"):
            logger.info(f"Index already exists for: {csv_file}")
            continue

        logger.info(f"Processing CSV file: {csv_file}")
        txtai_data, documents = process_csv_file(csv_file)

        # Either value being None or empty counts as a failed parse.
        if not (txtai_data and documents):
            logger.warning(f"Failed to process CSV file: {csv_file}")
            continue

        # NOTE(review): every file goes through the same shared `embeddings`
        # object before being saved under its own id — confirm that
        # save_embeddings persists the state expected for each index_id.
        embeddings.index(txtai_data)
        save_embeddings(index_id, documents)
        logger.info(f"CSV file indexed successfully: {csv_file}")
|
140 |
+
|
141 |
+
@app.on_event("startup")
async def startup_event():
    """Run the CSV indexing pass once when the application starts."""
    # The call is synchronous, so startup completes only after it returns.
    check_and_index_csv_files()
|
144 |
+
|
145 |
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when the module is run
    # directly as a script.
    import uvicorn

    # 0.0.0.0 binds every network interface; the server listens on port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|