Spaces:

Archan
/

Arxiv-Summarizer

Sleeping

App Files Files Community

Archan committed on Dec 11, 2023

Commit

d8f23da

1 Parent(s): 46c7287

added main file and requirements.txt

Browse files

Files changed (2) hide show

requirements.txt +85 -0
strm.py +141 -0

requirements.txt ADDED Viewed

	@@ -0,0 +1,85 @@

+aiohttp==3.9.1
+aiosignal==1.3.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.1.0
+arxiv==2.0.0
+attrs==23.1.0
+blinker==1.7.0
+cachetools==5.3.2
+certifi==2023.11.17
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+dataclasses-json==0.6.3
+feedparser==6.0.10
+filelock==3.13.1
+frozenlist==1.4.0
+fsspec==2023.12.1
+gitdb==4.0.11
+GitPython==3.1.40
+greenlet==3.0.2
+huggingface-hub==0.19.4
+idna==3.6
+importlib-metadata==6.11.0
+Jinja2==3.1.2
+jsonpatch==1.33
+jsonpointer==2.4
+jsonschema==4.20.0
+jsonschema-specifications==2023.11.2
+langchain==0.0.348
+langchain-core==0.0.12
+langsmith==0.0.69
+markdown-it-py==3.0.0
+MarkupSafe==2.1.3
+marshmallow==3.20.1
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.0.4
+mypy-extensions==1.0.0
+networkx==3.2.1
+numpy==1.26.2
+packaging==23.2
+pandas==2.1.4
+Pillow==10.1.0
+protobuf==4.25.1
+pyarrow==14.0.1
+pydantic==2.5.2
+pydantic_core==2.14.5
+pydeck==0.8.1b0
+Pygments==2.17.2
+PyMuPDF==1.23.7
+PyMuPDFb==1.23.7
+python-dateutil==2.8.2
+pytz==2023.3.post1
+PyYAML==6.0.1
+referencing==0.32.0
+regex==2023.10.3
+requests==2.31.0
+rich==13.7.0
+rpds-py==0.13.2
+safetensors==0.4.1
+sgmllib3k==1.0.0
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.0
+SQLAlchemy==2.0.23
+streamlit==1.29.0
+sympy==1.12
+tenacity==8.2.3
+tokenizers==0.15.0
+toml==0.10.2
+toolz==0.12.0
+torch==2.1.1
+tornado==6.4
+tqdm==4.66.1
+transformers==4.36.0
+typing-inspect==0.9.0
+typing_extensions==4.9.0
+tzdata==2023.3
+tzlocal==5.2
+urllib3==2.1.0
+validators==0.22.0
+watchdog==3.0.0
+yarl==1.9.4
+zipp==3.17.0

strm.py ADDED Viewed

	@@ -0,0 +1,141 @@

+import streamlit as st
+from langchain.document_loaders import ArxivLoader
+from transformers import pipeline
def strip(content):
    """Flatten *content* into a single line of space-separated words.

    The argument is converted with ``str()`` first, so any object is
    accepted. Every run of whitespace (newlines, carriage returns, tabs,
    form feeds, repeated spaces) becomes exactly one space, and leading and
    trailing whitespace is dropped.

    Args:
        content: Text (or any object) to flatten.

    Returns:
        The flattened text as a ``str``.
    """
    # str.split() with no separator splits on any whitespace run. Replacing
    # only "\n" would turn "end.\r\nNext" into "end.\r Next", which a later
    # split on ". " can no longer recognise as a sentence boundary.
    return " ".join(str(content).split())
def clip(content):
    """Trim *content* to the span between "Introduction" and the references.

    The result starts at the first occurrence of ``"Introduction"`` and stops
    just before the last occurrence of ``"Reference"`` that follows it. When a
    marker is missing a warning is printed and that side is left untrimmed.

    Args:
        content: Full paper text as a single string.

    Returns:
        The trimmed text. If "Introduction" is not found, *content* is
        returned unchanged.
    """
    loc_intro = content.find("Introduction")
    if loc_intro == -1:
        print("Warning: Paper Doesn't Have an Introduction Title, these may lead to overlap of summarization")
        return content

    # Only look for the references marker after the introduction. A
    # "Reference" that appears earlier (e.g. in the abstract) would otherwise
    # give an inverted slice and silently return an empty string.
    loc_refer = content.rfind("Reference", loc_intro)
    if loc_refer == -1:
        print("Warning: Paper Doesn't have a References Title, may lead to overlap of references in summary")
        return content[loc_intro:]

    return content[loc_intro:loc_refer]
def chunk(content, sentences_per_chunk=10):
    """Split *content* into chunks of consecutive sentences.

    The text is first trimmed with ``clip`` and then split on ``". "``.
    Fragments that are empty or whitespace-only are discarded, so empty input
    produces no chunks instead of a lone ``". "``.

    Args:
        content: Paper text as a single string.
        sentences_per_chunk: Maximum number of sentences per chunk.

    Returns:
        A list of strings. Each sentence in a chunk is terminated with
        ``". "``; the last chunk may hold fewer sentences than the others.

    Raises:
        ValueError: If *sentences_per_chunk* is smaller than 1.
    """
    if sentences_per_chunk < 1:
        raise ValueError("sentences_per_chunk must be at least 1")

    print("-----Clipping content between Intro and References--------")
    body = clip(content)

    # Text that ends in ". " (or is empty) yields blank fragments from the
    # split; keeping them would emit chunks made only of punctuation.
    sentences = [piece for piece in body.split(". ") if piece.strip()]

    chunks = []
    for start in range(0, len(sentences), sentences_per_chunk):
        group = sentences[start:start + sentences_per_chunk]
        # The final fragment usually still carries its own full stop because
        # nothing followed it; avoid doubling it to ".. ".
        chunks.append(
            "".join(s + " " if s.endswith(".") else s + ". " for s in group)
        )
    return chunks
def _load_summarizer():
    """Build the Hugging Face summarization pipeline used by ``summarize``."""
    model_str = "Falconsai/text_summarization"
    return pipeline("summarization", model=model_str, tokenizer=model_str)


def summarize(sent):
    """Summarize each text chunk and join the results, one per line.

    Args:
        sent: Iterable of text chunks (strings).

    Returns:
        A string with the summary of every non-blank chunk, each followed by
        a newline. Returns ``""`` when there is nothing to summarize.
    """
    chunks = [text for text in sent if text and text.strip()]
    if not chunks:
        # Nothing to do: skip loading the model entirely.
        return ""

    # Building the pipeline loads the model, and Streamlit re-executes the
    # script on every interaction, so reuse one instance via cache_resource.
    # NOTE(review): relies on Streamlit keying the resource cache on the
    # wrapped function rather than on the wrapper object - confirm against
    # the pinned streamlit version.
    summarizer = st.cache_resource(_load_summarizer)()

    return "".join(
        summarizer(text, max_length=256, min_length=64, do_sample=False)[0]["summary_text"] + "\n"
        for text in chunks
    )
def doc_load(search_query="default", n_docs=1):
    """Search arXiv and return the matching papers.

    Args:
        search_query: Paper ID or free-text query. The literal ``"default"``
            returns a placeholder triple without searching.
        n_docs: Maximum number of documents to request from arXiv.

    Returns:
        A ``(titles, docs, n_pairs)`` tuple: the list of paper titles, the
        list of loaded documents, and a dict mapping each title to its index
        in ``docs``. A blank query or a failed search yields ``([], [], {})``
        so callers can always unpack the result.
    """
    if search_query == "default":
        # Placeholder kept unchanged for callers that pass no query.
        return [" ", " "], [" ", " "], [" ", " "]
    if not str(search_query).strip():
        # Nothing to search for; avoid a pointless network round trip.
        return [], [], {}

    try:
        print("-------searching Paper----------")
        docs = ArxivLoader(query=search_query, load_max_docs=n_docs).load()
        # Iterate over what was actually returned: an ID search typically
        # yields one paper even when more were requested, so indexing up to
        # n_docs would run past the end of the list.
        titles = [doc.metadata['Title'] for doc in docs]
    except Exception as e:
        # Boundary with the network / PDF parsing: report and degrade to an
        # empty result rather than returning None.
        print("--------ERROR while Trying to Search Paper-------------")
        print(e)
        return [], [], {}

    n_pairs = {title: index for index, title in enumerate(titles)}
    return titles, docs, n_pairs
def run(choice, docs, n_pairs):
    """Summarize the selected paper and format the result for display.

    Args:
        choice: Title of the paper to summarize; must be a key of *n_pairs*.
        docs: Loaded documents, each exposing ``metadata`` and
            ``page_content``.
        n_pairs: Mapping from paper title to its index in *docs*.

    Returns:
        A string containing the publication date, title, authors and the
        generated summary.
    """
    index = n_pairs[choice]
    st.text("Fetching Metadata")
    print("-----fetching metadata-------------")
    paper = docs[index]
    info = paper.metadata
    raw_text = paper.page_content

    print("----stripping new lines----------")
    flat_text = strip(raw_text)

    print("-----------chunking content--------------")
    pieces = chunk(flat_text)
    st.text("Chunking Text....")
    st.text("🤔 Shortening text...")

    print("----summarizing content---------")
    summary = summarize(pieces)

    # Assemble the report section by section.
    report = "Date: " + str(info['Published']) + "\n"
    report += "\n Title: " + info['Title'] + "\n"
    report += "\n Authors: " + info['Authors'] + "\n"
    report += "\n Summary: \n" + summary
    return report
# ---- Streamlit page: search form, paper picker, summary output ----
st.title("ArXiV Summarizer")

with st.form(key="search_form"):
    col1, col2 = st.columns(2)
    with col1:
        search_query = st.text_input("Search Using Paper ID or Name*")
    with col2:
        n_docs = st.selectbox(label="Number of Documents to Load", options=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    submit = st.form_submit_button(label="Search")

if submit:
    st.write("Fetching Papers 🤔 ")

# Defaults so the names below exist even when nothing was loaded.
titles, docs, n_pairs = [], [], {}
has_query = bool(search_query.strip())

# The search runs on every rerun that has a query, not only when the search
# button was pressed: submitting the second form reruns the script with
# `submit` False, and the documents are still needed for summarizing.
if has_query:
    try:
        titles, docs, n_pairs = doc_load(search_query=search_query, n_docs=n_docs)
    except Exception as e:
        # Also covers doc_load returning a value that cannot be unpacked.
        print(e)
        titles, docs, n_pairs = [], [], {}

if titles:
    st.write("Papers Fetched 🤩 ")
elif has_query:
    # Only report a failure when a search was actually attempted.
    st.write("Error while Fetching Papers 😥 Please Check ID or Name")

label = "Papers for " + search_query
with st.form(key="paper_form"):
    paper_name = st.selectbox(label=label, options=titles)
    submit_paper = st.form_submit_button(label="Fetch Paper & Summarize")

if submit_paper:
    if paper_name in n_pairs:
        st.text("Reading Document.... 📄 ")
        output = run(paper_name, docs, n_pairs)
        st.text_area(label="Summary", value=output, height=650)
    else:
        # No paper is selected (nothing searched yet, or the search failed).
        st.write("Please search for a paper before summarizing")

st.text('''* - Please Use Paper ID (Example : 2301.10172) as it will give accurate results.
          Free text search can give errors sometimes''')
st.text("While using Paper ID no need to change Number of Documents to load")