Spaces:
Sleeping
Sleeping
added main file and requirements.txt
Browse files- requirements.txt +85 -0
- strm.py +141 -0
requirements.txt
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiohttp==3.9.1
|
2 |
+
aiosignal==1.3.1
|
3 |
+
altair==5.2.0
|
4 |
+
annotated-types==0.6.0
|
5 |
+
anyio==4.1.0
|
6 |
+
arxiv==2.0.0
|
7 |
+
attrs==23.1.0
|
8 |
+
blinker==1.7.0
|
9 |
+
cachetools==5.3.2
|
10 |
+
certifi==2023.11.17
|
11 |
+
charset-normalizer==3.3.2
|
12 |
+
click==8.1.7
|
13 |
+
colorama==0.4.6
|
14 |
+
dataclasses-json==0.6.3
|
15 |
+
feedparser==6.0.10
|
16 |
+
filelock==3.13.1
|
17 |
+
frozenlist==1.4.0
|
18 |
+
fsspec==2023.12.1
|
19 |
+
gitdb==4.0.11
|
20 |
+
GitPython==3.1.40
|
21 |
+
greenlet==3.0.2
|
22 |
+
huggingface-hub==0.19.4
|
23 |
+
idna==3.6
|
24 |
+
importlib-metadata==6.11.0
|
25 |
+
Jinja2==3.1.2
|
26 |
+
jsonpatch==1.33
|
27 |
+
jsonpointer==2.4
|
28 |
+
jsonschema==4.20.0
|
29 |
+
jsonschema-specifications==2023.11.2
|
30 |
+
langchain==0.0.348
|
31 |
+
langchain-core==0.0.12
|
32 |
+
langsmith==0.0.69
|
33 |
+
markdown-it-py==3.0.0
|
34 |
+
MarkupSafe==2.1.3
|
35 |
+
marshmallow==3.20.1
|
36 |
+
mdurl==0.1.2
|
37 |
+
mpmath==1.3.0
|
38 |
+
multidict==6.0.4
|
39 |
+
mypy-extensions==1.0.0
|
40 |
+
networkx==3.2.1
|
41 |
+
numpy==1.26.2
|
42 |
+
packaging==23.2
|
43 |
+
pandas==2.1.4
|
44 |
+
Pillow==10.1.0
|
45 |
+
protobuf==4.25.1
|
46 |
+
pyarrow==14.0.1
|
47 |
+
pydantic==2.5.2
|
48 |
+
pydantic_core==2.14.5
|
49 |
+
pydeck==0.8.1b0
|
50 |
+
Pygments==2.17.2
|
51 |
+
PyMuPDF==1.23.7
|
52 |
+
PyMuPDFb==1.23.7
|
53 |
+
python-dateutil==2.8.2
|
54 |
+
pytz==2023.3.post1
|
55 |
+
PyYAML==6.0.1
|
56 |
+
referencing==0.32.0
|
57 |
+
regex==2023.10.3
|
58 |
+
requests==2.31.0
|
59 |
+
rich==13.7.0
|
60 |
+
rpds-py==0.13.2
|
61 |
+
safetensors==0.4.1
|
62 |
+
sgmllib3k==1.0.0
|
63 |
+
six==1.16.0
|
64 |
+
smmap==5.0.1
|
65 |
+
sniffio==1.3.0
|
66 |
+
SQLAlchemy==2.0.23
|
67 |
+
streamlit==1.29.0
|
68 |
+
sympy==1.12
|
69 |
+
tenacity==8.2.3
|
70 |
+
tokenizers==0.15.0
|
71 |
+
toml==0.10.2
|
72 |
+
toolz==0.12.0
|
73 |
+
torch==2.1.1
|
74 |
+
tornado==6.4
|
75 |
+
tqdm==4.66.1
|
76 |
+
transformers==4.36.0
|
77 |
+
typing-inspect==0.9.0
|
78 |
+
typing_extensions==4.9.0
|
79 |
+
tzdata==2023.3
|
80 |
+
tzlocal==5.2
|
81 |
+
urllib3==2.1.0
|
82 |
+
validators==0.22.0
|
83 |
+
watchdog==3.0.0
|
84 |
+
yarl==1.9.4
|
85 |
+
zipp==3.17.0
|
strm.py
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from langchain.document_loaders import ArxivLoader
|
3 |
+
from transformers import pipeline
|
4 |
+
|
5 |
+
|
6 |
+
|
7 |
+
def strip(content):
    """Return ``content`` flattened onto a single line.

    The argument is coerced with ``str()`` and every newline character is
    replaced by one space, so consecutive newlines become consecutive spaces.
    """
    return str(content).replace("\n", " ")
|
15 |
+
|
16 |
+
def clip(content):
    """Trim paper text to the span between its introduction and references.

    The span starts at the first occurrence of "Introduction" and ends just
    before the last occurrence of "Reference".

    Args:
        content: Full text of the paper as a single string.

    Returns:
        The trimmed text. If "Introduction" is not found the text is returned
        unchanged; if no "Reference" marker follows the introduction, the text
        from the introduction to the end is returned. A warning is printed in
        both of those cases.
    """
    intro_at = content.find("Introduction")
    if intro_at == -1:
        print("Warning: Paper Doesn't Have an Introduction Title, these may lead to overlap of summarization")
        return content

    refs_at = content.rfind("Reference")
    # Only cut at the marker when it comes after the introduction. A marker
    # located earlier (or missing, -1) would make the slice empty and throw
    # away the entire paper body.
    if refs_at > intro_at:
        return content[intro_at:refs_at]

    print("Warning: Paper Doesn't have a References Title, may lead to overlap of references in summary")
    return content[intro_at:]
|
29 |
+
|
30 |
+
|
31 |
+
def chunk(content):
    """Break paper text into pieces of at most ten sentences.

    The text is first narrowed with ``clip`` and then split on the literal
    ". " separator. Every fragment gets ". " appended again and fragments are
    concatenated in groups of ten; the final group may be shorter.

    Returns:
        A list of strings, one per group, in document order.
    """
    print("-----Clipping content between Intro and References--------")

    fragments = clip(content).split(". ")
    group_size = 10

    return [
        "".join(fragment + ". " for fragment in fragments[start:start + group_size])
        for start in range(0, len(fragments), group_size)
    ]
|
52 |
+
|
53 |
+
|
54 |
+
def summarize(sent):
    """Summarize each text chunk and return the summaries as one string.

    A "summarization" pipeline for the "Falconsai/text_summarization"
    checkpoint (used as both model and tokenizer) is created on every call.
    Each element of ``sent`` is passed through it with ``max_length=256``,
    ``min_length=64`` and ``do_sample=False``.

    Args:
        sent: Iterable of text chunks to summarize.

    Returns:
        The ``summary_text`` of every chunk, each followed by a newline, in
        input order. An empty iterable yields an empty string.
    """
    checkpoint = "Falconsai/text_summarization"
    summarizer = pipeline("summarization", model=checkpoint, tokenizer=checkpoint)

    return "".join(
        summarizer(piece, max_length=256, min_length=64, do_sample=False)[0]['summary_text'] + "\n"
        for piece in sent
    )
|
67 |
+
|
68 |
+
def doc_load(search_query="default", n_docs=1):
    """Search arXiv and return the matching papers.

    Args:
        search_query: Paper ID or free-text query passed to ``ArxivLoader``.
            The sentinel value "default" skips the search entirely.
        n_docs: Maximum number of documents to load (``load_max_docs``).

    Returns:
        A ``(titles, docs, n_pairs)`` tuple where ``titles`` lists the paper
        titles in result order, ``docs`` is the list of loaded documents and
        ``n_pairs`` maps each title to its index in ``docs``. For the
        "default" sentinel three placeholder lists are returned. If the
        search fails for any reason the error is printed and
        ``([], [], {})`` is returned, so callers can always unpack the result.
    """
    if search_query == "default":
        return [" ", " "], [" ", " "], [" ", " "]

    try:
        print("-------searching Paper----------")
        docs = ArxivLoader(query=search_query, load_max_docs=n_docs).load()
        # Walk the documents that actually came back instead of assuming
        # exactly n_docs of them: the loader may return fewer results than
        # requested, and indexing past the end would discard a good search.
        titles = [doc.metadata['Title'] for doc in docs[:n_docs]]
        n_pairs = {title: index for index, title in enumerate(titles)}
        return titles, docs, n_pairs
    except Exception as e:
        # Boundary handler: loader/network errors are reported, not raised.
        print("--------ERROR while Trying to Search Paper-------------")
        print(e)
        return [], [], {}
|
84 |
+
|
85 |
+
|
86 |
+
def run(choice, docs, n_pairs):
    """Summarize the selected paper and format the result for display.

    Args:
        choice: Title of the selected paper; must be a key of ``n_pairs``.
        docs: Sequence of loaded documents exposing ``metadata`` and
            ``page_content``.
        n_pairs: Mapping from paper title to its index in ``docs``.

    Returns:
        A text block with the paper's published date, title and authors
        followed by the generated summary.
    """
    index = n_pairs[choice]
    st.text("Fetching Metadata")
    print("-----fetching metadata-------------")
    paper = docs[index]
    metadata = paper.metadata
    body = paper.page_content

    print("----stripping new lines----------")
    body = strip(body)
    print("-----------chunking content--------------")
    pieces = chunk(body)
    st.text("Chunking Text....")
    st.text("🤔 Shortening text...")
    print("----summarizing content---------")
    summary = summarize(pieces)

    # Joined (rather than formatted) so that non-string metadata still raises,
    # exactly as plain string concatenation would.
    return "".join([
        "Date: ", str(metadata['Published']),
        "\n\n Title: ", metadata['Title'],
        "\n\n Authors: ", metadata['Authors'],
        "\n\n Summary: \n", summary,
    ])
|
105 |
+
|
106 |
+
st.title("ArXiV Summarizer")

# Streamlit re-executes this script on every interaction, and a form's submit
# flag is True only on the run triggered by that click. The search results are
# therefore kept in session state so they are still available on the later run
# in which the user submits the paper-selection form.
if "search_results" not in st.session_state:
    st.session_state["search_results"] = {
        "query": "",
        "titles": [],
        "docs": [],
        "n_pairs": {},
    }

with st.form(key="search_form"):
    col1, col2 = st.columns(2)
    with col1:
        search_query = st.text_input("Search Using Paper ID or Name*")
    with col2:
        n_docs = st.selectbox(label="Number of Documents to Load", options=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    submit = st.form_submit_button(label="Search")

if submit:
    st.write("Fetching Papers 🤔 ")
    try:
        found = doc_load(search_query=search_query, n_docs=n_docs)
    except Exception as e:
        print(e)
        found = None
    # A failed lookup may yield no result at all; treat it as an empty search
    # so the names below are always bound.
    titles, docs, n_pairs = found if found is not None else ([], [], {})
    st.session_state["search_results"] = {
        "query": search_query,
        "titles": titles,
        "docs": docs,
        "n_pairs": n_pairs,
    }
    if titles:
        st.write("Papers Fetched 🤩 ")
    else:
        st.write("Error while Fetching Papers 😥 Please Check ID or Name")

results = st.session_state["search_results"]
titles = results["titles"]
docs = results["docs"]
n_pairs = results["n_pairs"]

# Only offer the selection form when there is something to choose from.
if titles:
    label = "Papers for " + results["query"]
    with st.form(key="paper_form"):
        paper_name = st.selectbox(label=label, options=titles)
        submit_paper = st.form_submit_button(label="Fetch Paper & Summarize")
    print(submit_paper)
    if submit_paper:
        st.text("Reading Document.... 📄 ")
        output = run(paper_name, docs, n_pairs)
        st.text_area(label="Summary", value=output, height=650)

st.text('''* - Please Use Paper ID (Example : 2301.10172) as it will give accurate results.
Free text search can give errors sometimes''')
st.text("While using Paper ID no need to change Number of Documents to load")
|