Spaces:
Sleeping
Sleeping
Carlos Salgado
committed on
Commit
·
e39bb0b
1
Parent(s):
c40d04b
fallback on pypdf, trim flake, minor ux
Browse files
- app.py +15 -15
- flake.nix +4 -18
- requirements.txt +2 -2
- scripts.py +33 -29
app.py
CHANGED
@@ -13,17 +13,17 @@ def suggest_metadata(file_upload):
|
|
13 |
|
14 |
with tempfile.NamedTemporaryFile(delete=False) as tmp:
|
15 |
tmp.write(uploaded_file.read())
|
16 |
-
|
17 |
-
st.write(f'Created temporary file {file_path}')
|
18 |
|
19 |
-
st.write('##
|
20 |
-
|
21 |
-
|
|
|
22 |
|
|
|
23 |
st.write('## Querying Together.ai API')
|
24 |
-
|
25 |
-
st.write(f'
|
26 |
-
st.write(f'### {metadata}')
|
27 |
|
28 |
with st.form('analyze_form'):
|
29 |
st.write('Enter your file metadata in the following schema:')
|
@@ -38,14 +38,14 @@ with st.form('analyze_form'):
|
|
38 |
analysis = analyze_metadata(filename, description, discipline)
|
39 |
|
40 |
st.write(analysis)
|
|
|
41 |
|
42 |
st.write('## Generate metadata?')
|
43 |
-
uploaded_file = st.file_uploader("Choose a PDF file", type=
|
44 |
|
45 |
-
if uploaded_file is not None:
|
46 |
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
os.remove(file_path)
|
|
|
13 |
|
14 |
with tempfile.NamedTemporaryFile(delete=False) as tmp:
|
15 |
tmp.write(uploaded_file.read())
|
16 |
+
st.write(f'Created temporary file {tmp.name}')
|
|
|
17 |
|
18 |
+
st.write('## Ingesting Unstructured file')
|
19 |
+
|
20 |
+
docs = ingest(tmp.name)
|
21 |
+
print(f'Ingested {tmp.name}')
|
22 |
|
23 |
+
metadata = generate_metadata(docs)
|
24 |
st.write('## Querying Together.ai API')
|
25 |
+
st.write(f'### Suggested Metadata Generated by {MODEL_NAME}')
|
26 |
+
st.write(f'#### {metadata}')
|
|
|
27 |
|
28 |
with st.form('analyze_form'):
|
29 |
st.write('Enter your file metadata in the following schema:')
|
|
|
38 |
analysis = analyze_metadata(filename, description, discipline)
|
39 |
|
40 |
st.write(analysis)
|
41 |
+
submitted = None
|
42 |
|
43 |
st.write('## Generate metadata?')
|
44 |
+
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
|
45 |
|
46 |
+
if uploaded_file is not None:
|
47 |
|
48 |
+
query_api = st.button('Query API')
|
49 |
+
if query_api:
|
50 |
+
suggest_metadata(uploaded_file)
|
51 |
+
query_api = None
|
|
flake.nix
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
{
|
2 |
description = "A LLM backend development flake powered by unstructured and langchain";
|
3 |
-
|
4 |
inputs = {
|
5 |
nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable";
|
6 |
};
|
@@ -9,6 +9,7 @@
|
|
9 |
system = "x86_64-linux";
|
10 |
# ↑ Swap it for your system if needed
|
11 |
# "aarch64-linux" / "x86_64-darwin" / "aarch64-darwin"
|
|
|
12 |
pkgs = nixpkgs.legacyPackages.${system};
|
13 |
in {
|
14 |
devShells.${system}.default = pkgs.mkShell {
|
@@ -17,33 +18,18 @@
|
|
17 |
python-pkgs.pip # VsCode starts
|
18 |
python-pkgs.jupyter
|
19 |
python-pkgs.notebook # VsCode ends
|
20 |
-
python-pkgs.numpy
|
21 |
python-pkgs.pandas
|
22 |
-
python-pkgs.scipy
|
23 |
-
python-pkgs.matplotlib
|
24 |
python-pkgs.requests
|
25 |
python-pkgs.langchain-community
|
26 |
python-pkgs.langchain
|
27 |
python-pkgs.langchain-text-splitters
|
28 |
-
python-pkgs.
|
29 |
-
python-pkgs.wrapt # unstructured[local-inference] starts
|
30 |
-
python-pkgs.iso-639
|
31 |
-
python-pkgs.emoji
|
32 |
-
python-pkgs.pillow-heif
|
33 |
-
python-pkgs.magic
|
34 |
-
python-pkgs.poppler-qt5
|
35 |
-
python-pkgs.pytesseract
|
36 |
-
python-pkgs.langdetect # unstructured[local-inference] ends
|
37 |
python-pkgs.openai
|
38 |
-
python-pkgs.pydantic
|
39 |
python-pkgs.python-dotenv
|
40 |
python-pkgs.configargparse
|
41 |
python-pkgs.streamlit
|
42 |
-
python-pkgs.lark
|
43 |
python-pkgs.sentence-transformers
|
44 |
-
pkgs.unstructured
|
45 |
-
pkgs.poppler
|
46 |
-
pkgs.haskellPackages.iso639
|
47 |
]))
|
48 |
];
|
49 |
|
|
|
1 |
{
|
2 |
description = "A LLM backend development flake powered by unstructured and langchain";
|
3 |
+
|
4 |
inputs = {
|
5 |
nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable";
|
6 |
};
|
|
|
9 |
system = "x86_64-linux";
|
10 |
# ↑ Swap it for your system if needed
|
11 |
# "aarch64-linux" / "x86_64-darwin" / "aarch64-darwin"
|
12 |
+
debug = true;
|
13 |
pkgs = nixpkgs.legacyPackages.${system};
|
14 |
in {
|
15 |
devShells.${system}.default = pkgs.mkShell {
|
|
|
18 |
python-pkgs.pip # VsCode starts
|
19 |
python-pkgs.jupyter
|
20 |
python-pkgs.notebook # VsCode ends
|
|
|
21 |
python-pkgs.pandas
|
|
|
|
|
22 |
python-pkgs.requests
|
23 |
python-pkgs.langchain-community
|
24 |
python-pkgs.langchain
|
25 |
python-pkgs.langchain-text-splitters
|
26 |
+
python-pkgs.pypdf
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
python-pkgs.openai
|
|
|
28 |
python-pkgs.python-dotenv
|
29 |
python-pkgs.configargparse
|
30 |
python-pkgs.streamlit
|
|
|
31 |
python-pkgs.sentence-transformers
|
32 |
+
python-pkgs.unstructured
|
|
|
|
|
33 |
]))
|
34 |
];
|
35 |
|
requirements.txt
CHANGED
@@ -7,5 +7,5 @@ streamlit
|
|
7 |
python-dotenv
|
8 |
sentence-transformers
|
9 |
iso639-lang
|
10 |
-
|
11 |
-
|
|
|
7 |
python-dotenv
|
8 |
sentence-transformers
|
9 |
iso639-lang
|
10 |
+
unstructured[pdf]
|
11 |
+
pypdf
|
scripts.py
CHANGED
@@ -5,8 +5,11 @@ import json
|
|
5 |
import openai
|
6 |
import sys
|
7 |
from dotenv import load_dotenv
|
|
|
8 |
from langchain_community.document_loaders import TextLoader
|
|
|
9 |
from langchain_community.document_loaders import UnstructuredPDFLoader
|
|
|
10 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
11 |
from langchain_community.vectorstores import Vectara
|
12 |
from langchain_core.output_parsers import StrOutputParser
|
@@ -56,35 +59,35 @@ def get_sources(documents):
|
|
56 |
def get_summary(documents):
|
57 |
return documents[-1].page_content
|
58 |
|
59 |
-
def ingest(file_path):
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
|
|
|
|
63 |
loader = UnstructuredPDFLoader(file_path)
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
return docs
|
87 |
-
|
88 |
|
89 |
|
90 |
def generate_metadata(docs):
|
@@ -126,8 +129,9 @@ def generate_metadata(docs):
|
|
126 |
}
|
127 |
]
|
128 |
)
|
|
|
129 |
|
130 |
-
return json.loads(chat_completion.choices[0].message.content)
|
131 |
|
132 |
|
133 |
def analyze_metadata(filename, description, discipline):
|
|
|
5 |
import openai
|
6 |
import sys
|
7 |
from dotenv import load_dotenv
|
8 |
+
|
9 |
from langchain_community.document_loaders import TextLoader
|
10 |
+
from langchain_community.document_loaders import PyPDFLoader
|
11 |
from langchain_community.document_loaders import UnstructuredPDFLoader
|
12 |
+
|
13 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
14 |
from langchain_community.vectorstores import Vectara
|
15 |
from langchain_core.output_parsers import StrOutputParser
|
|
|
59 |
def get_summary(documents):
|
60 |
return documents[-1].page_content
|
61 |
|
62 |
+
def ingest(file_path):
|
63 |
+
try:
|
64 |
+
loader = PyPDFLoader(file_path)
|
65 |
+
documents = loader.load()
|
66 |
+
print('Loaded PyPDFLoader')
|
67 |
+
except Exception as e:
|
68 |
+
print(f'{e}')
|
69 |
loader = UnstructuredPDFLoader(file_path)
|
70 |
+
documents = loader.load()
|
71 |
+
print('Loaded UnstructuredPDFLoader')
|
72 |
+
finally:
|
73 |
+
# transform locally
|
74 |
+
documents = loader.load()
|
75 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0,
|
76 |
+
separators=[
|
77 |
+
"\n\n",
|
78 |
+
"\n",
|
79 |
+
" ",
|
80 |
+
",",
|
81 |
+
"\uff0c", # Fullwidth comma
|
82 |
+
"\u3001", # Ideographic comma
|
83 |
+
"\uff0e", # Fullwidth full stop
|
84 |
+
# "\u200B", # Zero-width space (Asian languages)
|
85 |
+
# "\u3002", # Ideographic full stop (Asian languages)
|
86 |
+
"",
|
87 |
+
])
|
88 |
+
docs = text_splitter.split_documents(documents)
|
89 |
+
|
90 |
+
return docs
|
|
|
|
|
|
|
91 |
|
92 |
|
93 |
def generate_metadata(docs):
|
|
|
129 |
}
|
130 |
]
|
131 |
)
|
132 |
+
return chat_completion.choices[0].message.content
|
133 |
|
134 |
+
#return json.loads(chat_completion.choices[0].message.content)
|
135 |
|
136 |
|
137 |
def analyze_metadata(filename, description, discipline):
|