itsmeadarsh committed on
Commit
91d7875
1 Parent(s): e8603ed

Systems Ready!

Files changed (8)
  1. .gitignore +5 -0
  2. SOURCE_DOCUMENTS/constitution.pdf +0 -0
  3. app.py +304 -0
  4. constants.py +18 -0
  5. ingest.py +78 -0
  6. requirements.txt +23 -0
  7. run_localGPT.py +108 -0
  8. utils.py +37 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+ /db
+ /DB
+ /venv
+ /.idea
+ /__pycache__/
SOURCE_DOCUMENTS/constitution.pdf ADDED
Binary file (414 kB).
 
app.py ADDED
@@ -0,0 +1,304 @@
+ import argparse
+ from pathlib import Path
+ from textwrap import dedent
+ from types import SimpleNamespace
+
+ import gradio as gr
+ import torch
+ from charset_normalizer import detect
+ from chromadb.config import Settings
+ from epub2txt import epub2txt
+ from langchain.chains import RetrievalQA
+ from langchain.docstore.document import Document
+ from langchain.document_loaders import (
+     CSVLoader,
+     Docx2txtLoader,
+     PDFMinerLoader,
+     TextLoader,
+ )
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
+ from langchain.llms import HuggingFacePipeline
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.vectorstores import Chroma
+ from loguru import logger
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+
+ parser = argparse.ArgumentParser("LocalGPT falcon", add_help=False)
+ parser.add_argument(
+     "--device_type", type=str, default="cuda", choices=["cpu", "mps", "cuda"], help="device type"
+ )
+ args = parser.parse_args()
+
+ ROOT_DIRECTORY = Path(__file__).parent
+ PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/db"
+
+ # Define the Chroma settings locally
+ # (instead of: from constants import CHROMA_SETTINGS, SOURCE_DIRECTORY, PERSIST_DIRECTORY)
+ CHROMA_SETTINGS = Settings(
+     chroma_db_impl="duckdb+parquet",
+     persist_directory=PERSIST_DIRECTORY,
+     anonymized_telemetry=False,
+ )
+
+ # Shared state: the QA chain, set once documents have been ingested
+ ns = SimpleNamespace(qa=None)
+
+ # Alternatives: "hkunlp/instructor-xl" (4.96 GB), "hkunlp/instructor-base"
+ INSTRUCTORS_EMBEDDINGS_MODEL = "hkunlp/instructor-large"  # 3.5 GB
+
+
+ def load_single_document(file_path: str) -> Document:
+     """Load a single document from a file path (cf. ingest.py)."""
+     encoding = detect(Path(file_path).read_bytes()).get("encoding", "utf-8")
+     if file_path.endswith(".txt"):
+         if encoding is None:
+             logger.warning(f" {file_path}'s encoding is None, returning empty str")
+             return Document(page_content="", metadata={"source": file_path})
+         try:
+             loader = TextLoader(file_path, encoding=encoding)
+         except Exception as exc:
+             logger.warning(f" {exc}, returning empty str")
+             return Document(page_content="", metadata={"source": file_path})
+     elif file_path.endswith(".pdf"):
+         loader = PDFMinerLoader(file_path)
+     elif file_path.endswith(".csv"):
+         loader = CSVLoader(file_path)
+     elif Path(file_path).suffix in [".docx"]:
+         try:
+             loader = Docx2txtLoader(file_path)
+         except Exception as exc:
+             logger.error(f" {file_path} errors: {exc}")
+             return Document(page_content="", metadata={"source": file_path})
+     elif Path(file_path).suffix in [".epub"]:  # epub2txt instead of unstructured
+         try:
+             text = epub2txt(file_path)
+         except Exception as exc:
+             logger.error(f" {file_path} errors: {exc}")
+             return Document(page_content="", metadata={"source": file_path})
+         return Document(page_content=text, metadata={"source": file_path})
+     else:
+         if encoding is None:
+             logger.warning(f" {file_path}'s encoding is None, likely a binary file, returning empty str")
+             return Document(page_content="", metadata={"source": file_path})
+         try:
+             loader = TextLoader(file_path)
+         except Exception as exc:
+             logger.error(f" {exc}, returning empty str")
+             return Document(page_content="", metadata={"source": file_path})
+
+     return loader.load()[0]
+
+
+ def greet(name):
+     """Test."""
+     logger.debug(f" name: [{name}] ")
+     return "Hello " + name + "!!"
+
+
+ def upload_files(files):
+     """Upload files, ingest them and build the QA chain."""
+     try:
+         file_paths = [file.name for file in files]
+     except (TypeError, AttributeError):  # a single file, not a list
+         file_paths = [files]
+     logger.info(file_paths)
+
+     res = ingest(file_paths)
+     logger.info(f"Processed:\n{res}")
+     del res
+
+     ns.qa = load_qa()
+
+     return file_paths
+
+
+ def ingest(file_paths: list):
+     """Generate the Chroma db from a list of file paths."""
+     logger.info("Doing ingest...")
+
+     documents = []
+     for file_path in file_paths:
+         documents.append(load_single_document(f"{file_path}"))
+
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+     texts = text_splitter.split_documents(documents)
+
+     logger.info(f"Loaded {len(documents)} documents")
+     logger.info(f"Split into {len(texts)} chunks of text")
+
+     # Create embeddings
+     logger.info(f"Load InstructEmbeddings model: {INSTRUCTORS_EMBEDDINGS_MODEL}")
+     embeddings = HuggingFaceInstructEmbeddings(
+         model_name=INSTRUCTORS_EMBEDDINGS_MODEL, model_kwargs={"device": args.device_type}
+     )
+
+     db = Chroma.from_documents(
+         texts,
+         embeddings,
+         persist_directory=PERSIST_DIRECTORY,
+         client_settings=CHROMA_SETTINGS,
+     )
+     db.persist()
+     db = None
+     logger.info("Done ingest")
+
+     return [
+         [Path(doc.metadata.get("source")).name, len(doc.page_content)]
+         for doc in documents
+     ]
+
+
+ # https://huggingface.co/tiiuae/falcon-7b-instruct
+ def gen_local_llm():
+     """Generate a local llm (cf. run_localGPT.py)."""
+     model = "tiiuae/falcon-7b-instruct"
+
+     tokenizer = AutoTokenizer.from_pretrained(model)
+     if args.device_type != "cuda":  # cpu: load the model explicitly
+         model = AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True)
+
+     pipe = pipeline(
+         "text-generation",
+         model=model,
+         tokenizer=tokenizer,
+         torch_dtype=torch.float32 if args.device_type == "cpu" else torch.bfloat16,
+         trust_remote_code=True,
+         device_map="cpu" if args.device_type == "cpu" else "auto",
+         max_length=2048,
+         temperature=0,
+         top_p=0.95,
+         top_k=10,
+         repetition_penalty=1.15,
+         num_return_sequences=1,
+         pad_token_id=tokenizer.eos_token_id,
+     )
+
+     local_llm = HuggingFacePipeline(pipeline=pipe)
+
+     return local_llm
+
+
+ def load_qa():
+     """Generate the qa chain."""
+     logger.info("Doing qa")
+
+     embeddings = HuggingFaceInstructEmbeddings(
+         model_name=INSTRUCTORS_EMBEDDINGS_MODEL, model_kwargs={"device": args.device_type}
+     )
+     db = Chroma(
+         persist_directory=PERSIST_DIRECTORY,
+         embedding_function=embeddings,
+         client_settings=CHROMA_SETTINGS,
+     )
+     retriever = db.as_retriever()
+
+     llm = gen_local_llm()  # "tiiuae/falcon-7b-instruct"
+
+     qa = RetrievalQA.from_chain_type(
+         llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True
+     )
+
+     logger.info("Done qa")
+
+     return qa
+
+
+ def main1():
+     """Lump codes."""
+     with gr.Blocks() as demo:
+         iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+         iface.launch()
+
+     demo.launch()
+
+
+ def main():
+     """Do blocks."""
+     logger.info(f"ROOT_DIRECTORY: {ROOT_DIRECTORY}")
+
+     with gr.Blocks(theme=gr.themes.Soft()) as demo:
+         with gr.Accordion("Info", open=False):
+             _ = """
+                 Talk to your docs (.pdf, .docx, .csv, .txt, .md). It
+                 takes quite a while to ingest docs (10-30 min. depending
+                 on net, RAM, CPU etc.).
+             """
+             gr.Markdown(dedent(_))
+         title = """
+             <div style="text-align: center;">
+                 <h1>LocalGPT with Falcon</h1>
+                 <p style="text-align: center;">Upload your docs (.pdf, .docx, .csv, .txt, .md) by clicking "Load docs to LangChain" and wait until the upload is complete;<br />
+                 when everything is ready, you can start asking questions about the docs.</p>
+             </div>
+         """
+         gr.HTML(title)
+         with gr.Tab("Upload files"):
+             # Upload files and generate the embeddings database
+             file_output = gr.File()
+             upload_button = gr.UploadButton(
+                 "Load docs to LangChain",
+                 file_count="multiple",
+             )
+             upload_button.upload(upload_files, upload_button, file_output)
+
+         chatbot = gr.Chatbot()
+         msg = gr.Textbox(label="Query")
+         clear = gr.Button("Clear")
+
+         def respond(message, chat_history):
+             if ns.qa is None:  # no files processed yet
+                 bot_message = "Provide some file(s) for processing first."
+                 chat_history.append((message, bot_message))
+                 return "", chat_history
+             try:
+                 res = ns.qa(message)
+                 answer, docs = res["result"], res["source_documents"]
+                 bot_message = f"{answer}"
+             except Exception as exc:
+                 logger.error(exc)
+                 bot_message = f"bummer! {exc}"
+
+             chat_history.append((message, bot_message))
+
+             return "", chat_history
+
+         msg.submit(respond, [msg, chatbot], [msg, chatbot])
+         clear.click(lambda: None, None, chatbot, queue=False)
+
+     try:
+         from google import colab  # noqa: F401
+
+         share = True  # enable sharing when running in Colab
+     except Exception:
+         share = False
+
+     demo.launch(share=share)
+
+
+ if __name__ == "__main__":
+     main()
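
The same flow can be exercised without the Gradio UI. A minimal sketch, assuming app.py is importable with its dependencies installed (the document path and question are just examples):

    from app import ingest, load_qa

    stats = ingest(["SOURCE_DOCUMENTS/constitution.pdf"])  # [[file name, char count], ...]
    qa = load_qa()  # Chroma retriever + falcon-7b-instruct pipeline
    res = qa("What does the first article cover?")
    print(res["result"])
    for doc in res["source_documents"]:
        print(doc.metadata["source"])
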
constants.py ADDED
@@ -0,0 +1,18 @@
+ import os
+
+ # from dotenv import load_dotenv
+ from chromadb.config import Settings
+
+ # load_dotenv()
+ ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
+
+ # Define the folder for the source documents
+ SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/SOURCE_DOCUMENTS"
+
+ # Define the folder for storing the database
+ PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/DB"
+
+ # Define the Chroma settings
+ CHROMA_SETTINGS = Settings(
+     chroma_db_impl="duckdb+parquet",
+     persist_directory=PERSIST_DIRECTORY,
+     anonymized_telemetry=False,
+ )
ingest.py ADDED
@@ -0,0 +1,78 @@
+ import os
+ from typing import List
+
+ import click
+ from langchain.docstore.document import Document
+ from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.vectorstores import Chroma
+
+ from constants import CHROMA_SETTINGS, PERSIST_DIRECTORY, SOURCE_DIRECTORY
+ from utils import xlxs_to_csv
+
+
+ def load_single_document(file_path: str) -> Document:
+     # Load a single document from a file path
+     if file_path.endswith(".txt"):
+         loader = TextLoader(file_path, encoding="utf8")
+     elif file_path.endswith(".pdf"):
+         loader = PDFMinerLoader(file_path)
+     elif file_path.endswith(".csv"):
+         loader = CSVLoader(file_path)
+     else:
+         raise ValueError(f"Unsupported file type: {file_path}")
+     return loader.load()[0]
+
+
+ def load_documents(source_dir: str) -> List[Document]:
+     # Load all documents from the source documents directory
+     all_files = os.listdir(source_dir)
+     docs = []
+     for file_path in all_files:
+         if file_path.endswith("xlsx"):
+             # split the workbook into one temporary csv per sheet
+             for doc in xlxs_to_csv(f"{source_dir}/{file_path}"):
+                 docs.append(load_single_document(doc))
+         elif file_path[-4:] in [".txt", ".pdf", ".csv"]:
+             docs.append(load_single_document(f"{source_dir}/{file_path}"))
+     return docs
+
+
+ @click.command()
+ @click.option("--device_type", default="cuda", help="device to run on: cuda, cpu or mps")
+ def main(device_type):
+     # pick the device for the instructor embeddings
+     if device_type in ["cpu", "CPU"]:
+         device = "cpu"
+     elif device_type in ["mps", "MPS"]:  # for M1/M2 users
+         device = "mps"
+     else:
+         device = "cuda"
+
+     # Load documents and split them into chunks
+     print(f"Loading documents from {SOURCE_DIRECTORY}")
+     documents = load_documents(SOURCE_DIRECTORY)
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+     texts = text_splitter.split_documents(documents)
+     print(f"Loaded {len(documents)} documents from {SOURCE_DIRECTORY}")
+     print(f"Split into {len(texts)} chunks of text")
+
+     # Create embeddings and persist the vectorstore
+     embeddings = HuggingFaceInstructEmbeddings(
+         model_name="hkunlp/instructor-base", model_kwargs={"device": device}
+     )
+
+     db = Chroma.from_documents(texts, embeddings, persist_directory=PERSIST_DIRECTORY, client_settings=CHROMA_SETTINGS)
+     db.persist()
+     db = None
+
+
+ if __name__ == "__main__":
+     main()
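
A usage sketch for the ingestion script, assuming the documents sit in SOURCE_DOCUMENTS and the requirements are installed:

    python ingest.py --device_type cpu

This loads the documents, splits them into 1000-character chunks with 200 overlap, embeds them with hkunlp/instructor-base, and persists the Chroma db to the DB folder.
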
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ langchain==0.0.166
+ chromadb==0.3.22
+ llama-cpp-python==0.1.48
+ urllib3==1.26.6
+ pdfminer.six==20221105
+ InstructorEmbedding
+ sentence-transformers
+ faiss-cpu
+ huggingface_hub
+ transformers
+ protobuf==3.20.0
+ accelerate
+ bitsandbytes
+ click
+ openpyxl
+ einops
+ xformers
+ gradio
+ charset-normalizer
+ PyPDF2
+ epub2txt
+ docx2txt
+ loguru
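
To reproduce the environment (standard pip usage, not specific to this repo):

    pip install -r requirements.txt
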
run_localGPT.py ADDED
@@ -0,0 +1,108 @@
+ import click
+ import torch
+ from langchain.chains import RetrievalQA
+ # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
+ from langchain.llms import HuggingFacePipeline
+ from langchain.vectorstores import Chroma
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+
+ from constants import CHROMA_SETTINGS, PERSIST_DIRECTORY
+
+
+ def load_model(device):
+     """
+     Select a model on Hugging Face.
+     Running this for the first time downloads the model;
+     subsequent runs load it from disk.
+     """
+     model = "tiiuae/falcon-7b-instruct"
+
+     tokenizer = AutoTokenizer.from_pretrained(model)
+     if device != "cuda":  # cpu: load the model explicitly
+         model = AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True)
+
+     pipe = pipeline(
+         "text-generation",
+         model=model,
+         tokenizer=tokenizer,
+         torch_dtype=torch.float32 if device == "cpu" else torch.bfloat16,
+         trust_remote_code=True,
+         device_map=device if device == "cpu" else "auto",
+         max_length=2048,
+         temperature=0,
+         top_p=0.95,
+         top_k=10,
+         repetition_penalty=1.15,
+         num_return_sequences=1,
+         pad_token_id=tokenizer.eos_token_id,
+     )
+
+     local_llm = HuggingFacePipeline(pipeline=pipe)
+
+     return local_llm
+
+
+ @click.command()
+ @click.option("--device_type", default="cuda", help="device to run on: cuda, cpu or mps")
+ def main(device_type):
+     # pick the device for the instructor embeddings
+     if device_type in ["cpu", "CPU"]:
+         device = "cpu"
+     elif device_type in ["mps", "MPS"]:  # for M1/M2 users
+         device = "mps"
+     else:
+         device = "cuda"
+
+     print(f"Running on: {device}")
+
+     embeddings = HuggingFaceInstructEmbeddings(
+         model_name="hkunlp/instructor-base", model_kwargs={"device": device}
+     )
+     # load the vectorstore
+     db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
+     retriever = db.as_retriever()
+     # Prepare the LLM
+     # callbacks = [StreamingStdOutCallbackHandler()]
+     # load the LLM for generating natural-language responses
+     llm = load_model(device)
+     qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
+     # Interactive questions and answers
+     while True:
+         query = input("\nEnter a query: ")
+         if query == "exit":
+             break
+
+         # Get the answer from the chain
+         res = qa(query)
+         answer, docs = res["result"], res["source_documents"]
+
+         # Print the result
+         print("\n\n> Question:")
+         print(query)
+         print("\n> Answer:")
+         print(answer)
+
+         # Print the relevant sources used for the answer
+         print("----------------------------------SOURCE DOCUMENTS---------------------------")
+         for document in docs:
+             print("\n> " + document.metadata["source"] + ":")
+             print(document.page_content)
+         print("----------------------------------SOURCE DOCUMENTS---------------------------")
+
+
+ if __name__ == "__main__":
+     main()
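
A usage sketch for the interactive CLI, assuming ingest.py has already populated the DB folder:

    python run_localGPT.py --device_type cpu

Type a question at the prompt, or "exit" to quit; each answer is printed together with the source chunks the retriever returned.
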
utils.py ADDED
@@ -0,0 +1,37 @@
+ import csv
+ import tempfile
+
+ import openpyxl
+
+
+ def xlxs_to_csv(file_path: str, sheet_name: str = None) -> list[str]:
+     """
+     Convert a workbook into a list of csv files.
+     :param file_path: the path to the workbook
+     :param sheet_name: the name of the sheet to convert; all sheets if None
+     :return: a list of temporary file names
+     """
+     # Load the workbook; iterate over all sheets unless one is named
+     wb = openpyxl.load_workbook(file_path)
+     sheets = wb if sheet_name is None else [wb[sheet_name]]
+     temp_file_names = []
+     for ws in sheets:
+         # Write the contents of the worksheet to a new temporary csv file
+         with tempfile.NamedTemporaryFile(mode="w+", newline="", suffix=".csv", delete=False) as f:
+             c = csv.writer(f)
+             for r in ws.rows:
+                 c.writerow([cell.value for cell in r])
+             temp_file_names.append(f.name)
+     return temp_file_names
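
A minimal sketch of the helper ("workbook.xlsx" is a hypothetical path): every sheet is written to its own temporary .csv, and the temp-file paths are returned for ingestion.

    from utils import xlxs_to_csv

    csv_paths = xlxs_to_csv("workbook.xlsx")  # one temporary .csv per sheet
    for path in csv_paths:
        print(path)
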