Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files - split_files_to_excel.py +14 -8
split_files_to_excel.py
CHANGED
@@ -20,7 +20,9 @@ import unstructured
|
|
20 |
from unstructured.partition.docx import partition_docx
|
21 |
from unstructured.partition.auto import partition
|
22 |
|
23 |
-
|
|
|
|
|
24 |
|
25 |
from pypdf import PdfReader
|
26 |
|
@@ -40,14 +42,18 @@ embeddings = HuggingFaceEmbeddings(
|
|
40 |
|
41 |
|
42 |
|
43 |
-
model_id = "mistralai/Mistral-7B-Instruct-v0.1"
|
44 |
-
access_token = os.getenv("HUGGINGFACE_SPLITFILES_API_KEY")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
47 |
-
model_id,
|
48 |
-
padding_side="left",
|
49 |
-
token = access_token
|
50 |
-
)
|
51 |
|
52 |
text_splitter = CharacterTextSplitter(
|
53 |
separator = "\n",
|
|
|
20 |
from unstructured.partition.docx import partition_docx
|
21 |
from unstructured.partition.auto import partition
|
22 |
|
23 |
+
|
24 |
+
import tiktoken
|
25 |
+
#from transformers import AutoTokenizer
|
26 |
|
27 |
from pypdf import PdfReader
|
28 |
|
|
|
42 |
|
43 |
|
44 |
|
45 |
+
# model_id = "mistralai/Mistral-7B-Instruct-v0.1"
|
46 |
+
# access_token = os.getenv("HUGGINGFACE_SPLITFILES_API_KEY")
|
47 |
+
|
48 |
+
# tokenizer = AutoTokenizer.from_pretrained(
|
49 |
+
# model_id,
|
50 |
+
# padding_side="left",
|
51 |
+
# token = access_token
|
52 |
+
# )
|
53 |
+
|
54 |
+
|
55 |
+
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
56 |
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
text_splitter = CharacterTextSplitter(
|
59 |
separator = "\n",
|