coderpotter committed on
Commit
7b2e5db
·
verified ·
1 Parent(s): d9cd6b8

Upload folder using huggingface_hub

Browse files
.github/workflows/lint_pytest.yml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Lint (black, ruff, mypy) and test (pytest + coverage) on every push.
name: lint_pytest
on: push
jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # black runs in check mode and fails the job on formatting violations.
      - uses: psf/black@stable
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.9.19"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Ruff
        run: |
          ruff check --output-format=github .
      - name: Mypy
        run: |
          mypy . --install-types --non-interactive
      - name: Test with pytest
        # Fix: the previous "--cov=com" was the starter-template default and
        # measured coverage of a nonexistent "com" package; measure this
        # project's package instead.
        run: |
          pytest -s tests.py --doctest-modules --junitxml=junit/test-results.xml --cov=research_assistant --cov-report=xml --cov-report=html
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Deploy the Gradio app to Hugging Face Spaces on every push to main.
name: Run Python script

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        # Bumped from @v2: the v2 actions run on the deprecated Node 12
        # runtime; v4/v5 match the versions used in lint_pytest.yml.
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'

      - name: Install Gradio
        run: python -m pip install gradio

      - name: Log in to Hugging Face
        # NOTE(review): secret names are conventionally upper-case (HF_TOKEN);
        # confirm the repository secret is actually stored as "hf_token".
        run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'

      - name: Deploy to Spaces
        run: gradio deploy
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Caches and local scratch areas
**cache**
**playground**
# OS / editor artifacts
.DS_Store
.env
settings.json
# Runtime data and logs
data
logs
# Secrets (see KEYS_TEMPLATE.py); never commit real keys
keys.yaml
*.egg-info
research_trails
KEYS.py
# Gradio runtime files and downloaded papers
.gradio/
**.pdf
KEYS_TEMPLATE.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# Template for the local KEYS.py module imported by
# research_assistant.components.agent. Copy this file to KEYS.py (which is
# git-ignored) and fill in your real API keys.
ANTHROPIC = "sk-xxx"
FIREWORKS_AI = "xxx"
OPENAI = "sk-proj-xxx"
# NOTE(review): agent.py also checks for an optional VERTEX_AI key — confirm
# whether it should be added to this template as well.
README.md CHANGED
@@ -1,12 +1,68 @@
1
  ---
2
- title: Research Assistant
3
- emoji: 🚀
4
- colorFrom: indigo
5
- colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.6.0
8
- app_file: app.py
9
- pinned: false
10
  ---
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: research-assistant
3
+ app_file: src/research_assistant/web/app.py
 
 
4
  sdk: gradio
5
  sdk_version: 5.6.0
 
 
6
  ---
7
+ # Research Assistant
8
 
9
+ This is a Research Assistant that helps in analyzing and simplifying the content present in a research article, such that you don't have to read the whole thing to understand what knowledge is being presented inside the article. This tool takes care of it and provides you the understanding you would need.
10
+
11
+ # How to run?
12
+ ## Step 1:
13
+ First install the requirements
14
+ ```bash
15
+ pip install -r requirements.txt
16
+ pip install -e .
17
+ ```
18
+
19
+ ## Step 2:
20
+ create a file named keys.yaml following the template present in keys_template.yaml
21
+ Input your api keys inside keys.yaml and save them
22
+
23
+ ## Step 3:
24
+ Update the contents in config/config.yaml file. The path for your file name, and the search parameters for articles are present inside the config.yaml file. Before every run, if you want to change the search configuration or summarization parameters, you need to update the config.yaml file.
25
+
26
+ ## Step 4:
27
+ The Summarization pipeline can be run in 2 ways:
28
+
29
+ ### From Command Line Interface as Pip Package:
30
+ Step 1 installs the whole repo as a pip installable package in editable mode in your pip.
31
+ To access the package and get the summary of the file, run the following command:
32
+
33
+
34
+ ```bash
35
+ research --pipeline_name
36
+ ```
37
+ ### From running the Main File:
38
+
39
+ ##### Confirm the Arguments:
40
+ To change the model names, filepath to get the summary, filepath to save the summary, make changes in the config/config.yaml file.
41
+
42
+ ##### Run the following command
43
+ ```bash
44
+ python src/research_assistant/main.py --pipeline_name
45
+ ```
46
+
47
+ # Different Pipelines:
48
+ There are two pipelines available here:
49
+ ## Summarization Pipeline:
50
+ Given a paper, this pipeline gives out the summarization. The paper filepath can be set in config/config.yaml. To activate this pipeline, run the following command:
51
+
52
+ ```bash
53
+ # If you want to use the package version, run the following command
54
+ research --summarize_article
55
+
56
+ # If you want to run this directly from the CLI, run the following command
57
+ python src/research_assistant/main.py --summarize_article
58
+ ```
59
+ ## Searching for Articles:
60
+ Given a few keywords, and other parameters, this pipeline will crawl Arxiv and get you the articles that are relevant for you. To activate this pipeline, run the following command
61
+
62
+ ```bash
63
+ # If you want to use the package version, run the following command
64
+ research --search_articles
65
+
66
+ # If you want to run this directly from the CLI, run the following command
67
+ python src/research_assistant/main.py --search_articles
68
+ ```
config/config.yaml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# RENAME THIS TO YOUR CONFIGURATIONS

# Input article and output location for the summarization pipeline.
article_details:
  file_path: 'data/2106.07691v1.pdf'
  summary_save_dir: 'data/summary_results'

# TODO: Remove this when TECH-67 is implemented
# Parameters for the Arxiv article-search pipeline.
article_search_details:
  search_terms: ['Hallucination', 'Natural Language Inference']
  num_results: 5
  date_range:
    start_date: '2023-01-01'
    end_date: '2024-12-31'
  sort_by: 'submittedDate'
  sort_order: 'descending'

#################################################################################################################
# MAKE CHANGES FROM HERE ONLY IF YOU ARE SURE OF WHAT YOU ARE DOING, ELSE DO NOT EDIT
#################################################################################################################

# Model used by the planner that drafts the step-by-step plan.
planner:
  model_name: 'claude-3-5-sonnet-20241022'

# Models used to parse the plan string into tools / arguments / dependencies.
planner_parser:
  tool_list_model: 'claude-3-5-sonnet-20241022'
  argument_list_model: 'claude-3-5-sonnet-20241022'
  dependency_list_model: 'claude-3-5-sonnet-20241022'

# Model used by the question-answering tool.
qa_tool:
  model_name: 'claude-3-5-sonnet-20241022'

# Model used by the final solver step.
solver:
  model_name: 'claude-3-5-sonnet-20241022'
mypy.ini ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [mypy]
2
+ ignore_missing_imports = True
requirements.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
python-dotenv
langsmith
langgraph
langchain
numexpr
langchain-anthropic
langchain-community
langchain-core
# pdfminer.six supersedes the unmaintained "pdfminer" (both install the same
# top-level "pdfminer" package, so listing both caused a conflicting install);
# the code imports from pdfminer.high_level, which pdfminer.six provides.
pdfminer.six
pytest
arxiv
python-box
langchain-fireworks
langchain-google_vertexai
langchain-openai
types-PyYAML
gradio
markdown2
fpdf2
mistletoe
pydantic==2.9.0
xmltodict
mypy
types-requests
ruff
# (duplicate "pytest" entry removed — it was listed twice)
pytest-cov
setup.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import setuptools

# The PyPI long description is taken verbatim from the project README.
with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

__version__ = "0.0.0"
REPO_NAME = "research-assistant"
AUTHOR_USER_NAME = "Actualization-AI"
SRC_REPO = "research-assistant"
AUTHOR_EMAIL = "manas@actualization.ai"
setuptools.setup(
    name=SRC_REPO,
    version=__version__,
    author=AUTHOR_USER_NAME,
    author_email=AUTHOR_EMAIL,
    description="A Research Assistant which can scrape and summarize research articles for easier understanding",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url=f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}",
    project_urls={
        "Bug Tracker": f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}/issues",
    },
    entry_points={  # Entry point for CLI
        "console_scripts": [
            "research=research_assistant.main:main",  # Command to run main function
        ]
    },
    # src-layout: importable packages live under src/.
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src"),
)
src/research_assistant/__init__.py ADDED
File without changes
src/research_assistant/app_logging/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
import os
import sys

# All application logs go both to logs/running_logs.log and to stdout.
log_dir = "logs"
log_filepath = os.path.join(log_dir, "running_logs.log")
os.makedirs(log_dir, exist_ok=True)  # create the log directory on first import

logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s: %(levelname)s: %(module)s: %(message)s]",
    handlers=[logging.FileHandler(log_filepath), logging.StreamHandler(sys.stdout)],
)
# Shared logger used across the research_assistant package.
app_logger = logging.getLogger("ResearchAssistantLogger")
src/research_assistant/components/__init__.py ADDED
File without changes
src/research_assistant/components/agent.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ from langchain_anthropic import ChatAnthropic
5
+ from langchain_fireworks import ChatFireworks
6
+ from langchain_google_vertexai import ChatVertexAI
7
+ from langchain_openai import ChatOpenAI
8
+
9
+ sys.path.append(os.getcwd())
10
+ import KEYS
11
+ from research_assistant.app_logging import app_logger
12
+
13
+
14
def set_api_key(env_var: str, api_key: str) -> None:
    """Expose *api_key* to downstream SDKs via the *env_var* environment variable."""
    os.environ.update({env_var: api_key})
16
+
17
+
18
class Agent:
    """Factory around the provider-specific LangChain chat models.

    Picks a provider by substring-matching the model name ("gpt", "claude",
    "gemini", "fireworks"), exports that provider's API key from the local
    KEYS module into the environment, and instantiates the chat model.
    """

    def __init__(self, model_name: str):
        # Map a model-name substring to (chat class, env var name, API key).
        # A provider is only usable when its key exists in KEYS; otherwise its
        # entry degrades to (None, None, None) and is skipped in the loop below.
        model_classes = {
            "gpt": (
                (ChatOpenAI, "OPENAI_API_KEY", KEYS.OPENAI)  # type: ignore
                if "OPENAI" in KEYS.__dict__
                else (None, None, None)
            ),
            "claude": (
                (ChatAnthropic, "ANTHROPIC_API_KEY", KEYS.ANTHROPIC)  # type: ignore
                if "ANTHROPIC" in KEYS.__dict__
                else (None, None, None)
            ),
            "gemini": (
                (ChatVertexAI, "GOOGLE_API_KEY", KEYS.VERTEX_AI)  # type: ignore
                if "VERTEX_AI" in KEYS.__dict__
                else (None, None, None)
            ),
            "fireworks": (
                (ChatFireworks, "FIREWORKS_API_KEY", KEYS.FIREWORKS_AI)  # type: ignore
                if "FIREWORKS_AI" in KEYS.__dict__
                else (None, None, None)
            ),
        }
        # Context-window sizes (tokens) per known model name; anything not
        # listed falls back to 128000 below.
        max_tokens_map = {
            "gpt-3.5": 16000,
            "gpt-4": 8000,
            "gpt-4o-mini": 8000,
            "llama-v3p2-1b-instruct": 128000,
            "llama-v3p2-3b-instruct": 128000,
            "llama-v3p1-8b-instruct": 128000,
            "llama-v3p1-70b-instruct": 128000,
            "llama-v3p1-405b-instruct": 128000,
            "mixtral-8x22b-instruct": 64000,
            "mixtral-8x7b-instruct": 32000,
            "mixtral-8x7b-instruct-hf": 32000,
            "qwen2p5-72b-instruct": 32000,
            "gemma2-9b-it": 8000,
            "llama-v3-8b-instruct": 8000,
            "llama-v3-70b-instruct": 8000,
            "llama-v3-70b-instruct-hf": 8000,
        }
        # First provider whose key substring appears in model_name wins.
        for key, (model_class, env_var, api_key) in model_classes.items():
            if model_class is not None and key in model_name:
                set_api_key(env_var, api_key)  # type: ignore
                model = model_class(model=model_name, temperature=0.5)  # type: ignore
                max_tokens = max_tokens_map.get(model_name, 128000)
                break
        else:
            # for/else: no provider entry matched (or its key was missing).
            raise ValueError(f"Model {model_name} not supported")

        app_logger.info(f"Model {model_name} is initialized successfully")
        self.model = model
        self.max_tokens = max_tokens

    def get_model(self):
        """Return the instantiated LangChain chat model."""
        return self.model
src/research_assistant/components/agent_tools.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional
2
+
3
+ from langchain_anthropic.chat_models import ChatAnthropic
4
+ from langchain_community.utilities.arxiv import ArxivAPIWrapper
5
+ from langchain_core.messages import HumanMessage, SystemMessage
6
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
7
+ from langchain_core.runnables import RunnableConfig
8
+ from langchain_core.tools import StructuredTool
9
+ from pydantic import BaseModel, Field
10
+
11
+
12
# Structured-output schema bound to the QA LLM in get_qa_tool(). The class
# docstring and Field descriptions below are sent to the model as part of the
# tool schema, so they are left exactly as written.
class ExecuteCode(BaseModel):
    """The input to the summarizer tool function."""

    # Free-text chain-of-thought the model must emit before the answer.
    reasoning: str = Field(
        description="The reasoning behind the code expression, including how context is included, if applicable.",
    )
    # The final answer returned to the caller (required field).
    answer: str = Field(
        ...,
        description="The answer to the question about the research article.",
    )
22
+
23
+
24
def get_qa_tool(llm: ChatAnthropic):
    """Build a StructuredTool ("qa_agent") that answers questions about an article.

    Wraps *llm* with a system prompt and structured output (ExecuteCode), so
    each invocation returns reasoning plus an answer.

    Args:
        llm: Chat model used to generate the answers.

    Returns:
        StructuredTool: callable as qa_agent(question, context=None, config=None).
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessage(
                (
                    "You are an advanced research assistant answering questions about a specific research article. The question may require external information beyond the research article itself. This external information, along with the parsed content from the research article, will be provided as 'Additional Context'.\n\n"
                    # comment for readability
                    "You must:\n"
                    "1. Thoroughly analyze the research article to understand its key objectives, methods, findings, and implications.\n"
                    "2. Use the research article and any additional context to construct a comprehensive, well-informed answer to the given question.\n"
                    "3. Explicitly reference and combine information from both the research article and the additional context when needed, ensuring that the response is relevant, accurate, and complete.\n\n"
                    # comment for readability
                    "Follow these steps when answering:\n"
                    "- If the question can be answered using information from the research article alone, do so.\n"
                    "- If additional context is needed to supplement or clarify the answer, carefully integrate it with the information from the article.\n"
                    "- Ensure the response is precise, concise, and clear, citing the research article and additional context appropriately."
                )
            ),
            MessagesPlaceholder(variable_name="context", optional=True),
            MessagesPlaceholder(variable_name="question"),
        ]
    )
    # Pipe the prompt into the model with the ExecuteCode output schema.
    summarizer = prompt | llm.with_structured_output(ExecuteCode)

    def get_answer(
        question: str,
        context: Optional[List[str]] = None,
        config: Optional[RunnableConfig] = None,
    ):
        # Collapse context fragments into one string; falsy/empty -> None,
        # which selects the "no additional context" message below.
        context_str = "\n".join(context).strip() if context else None
        chain_input = {
            "question": [HumanMessage(question)],
            "context": [
                (
                    HumanMessage(
                        (
                            f"Additional context has been provided from other tools (such as parsed PDF content or information retrieved from internet searches). Use it to substitute into any {{#}} variables or other words in the question. Do not directly substitute the value. Rather, extract information in the best suitable format and then substitute. Use this context to enrich your answer by integrating it with the information from the research article. Context:\n{context_str}\n\n"
                            # comment for readability
                            "Instructions:\n"
                            "- Identify where the additional context is necessary to supplement or clarify the research article's information.\n"
                            "- Replace any placeholders or variable information (e.g., {{#}}) with appropriate details from the context.\n"
                            "- Make sure the final answer blends the research article content with the additional context in a cohesive and accurate manner.\n\n"
                            # comment for readability
                            "Once done, output the updated, comprehensive answer."
                        )
                    )
                    if context_str
                    else HumanMessage("No Additional Context is Provided")
                )
            ],
        }
        return summarizer.invoke(chain_input, config)

    return StructuredTool.from_function(
        name="qa_agent",
        func=get_answer,
        description="This tool is designed to answer specific questions about a research article, rather than simply providing a full summary. It offers a well-rounded and accurate response to your inquiry, allowing you to focus on the exact information you need without having to go through the entire article.",
    )
82
+
83
+
84
def get_arxiv_tool(
    k_results: int = 3,
    max_query_length: int = 300,
    max_docs: int = 3,
    doc_content_chars_max: int = 40000,
):
    """Build a configured LangChain ArxivAPIWrapper.

    Args:
        k_results: Number of top search results to return.
        max_query_length: Maximum length of the query string sent to arXiv.
        max_docs: Maximum number of documents to load.
        doc_content_chars_max: Per-document content truncation limit (chars).

    Returns:
        ArxivAPIWrapper configured with the given limits (full metadata off).
    """
    return ArxivAPIWrapper(  # type: ignore
        top_k_results=k_results,
        ARXIV_MAX_QUERY_LENGTH=max_query_length,
        load_max_docs=max_docs,
        load_all_available_meta=False,
        doc_content_chars_max=doc_content_chars_max,
    )
src/research_assistant/components/arxiv_search_api.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+
3
+ import requests
4
+ import xmltodict
5
+
6
+ from research_assistant.app_logging import app_logger
7
+ from research_assistant.constants import ARXIV_API_ACCESS_POINT
8
+ from research_assistant.entity import ArticleSearchConfig
9
+
10
+
11
class ArxivApiWrap:
    """Minimal client for the arXiv Atom API, driven by an ArticleSearchConfig."""

    def __init__(self, config: ArticleSearchConfig):
        self.config = config

    def convert_link_to_pdflink(self, link):
        """Turn an arXiv abstract URL (.../abs/<id>) into its PDF URL."""
        return link.replace("/abs/", "/pdf/") + ".pdf"

    def convert_date(self, date):
        """Convert a 'YYYY-MM-DD' string to the 'YYYYMMDD' form arXiv expects."""
        return datetime.strptime(date, "%Y-%m-%d").strftime("%Y%m%d")

    # The two docstrings below were previously floating class-level string
    # literals placed between methods; they have been moved into the methods
    # they describe.

    def get_arxiv_api_response(self):
        """Fetch the response from the arXiv API for the configured search.

        Args used by the arXiv API (all taken from ``self.config``):
            search_terms (list of str): Contains the search terms
            num_results (int): Maximum number of articles to retrieve
            date_range: Contains start and end dates for the search (optional)
            sort_by: Sorts the results by a specific field (e.g., submittedDate)
            sort_order (str): Sort order for the results (e.g., descending)

        Returns:
            requests.Response: The HTTP response object returned by the arXiv API.
        """
        keyword_query = " AND all:".join([f"'{kw}'" for kw in self.config.search_terms])
        if self.config.date_range.start_date:
            query = f" all:{keyword_query} AND submittedDate:[{self.convert_date(self.config.date_range.start_date)} TO {self.convert_date(self.config.date_range.end_date)}]"
        else:
            query = f" all:{keyword_query}"
        params = {
            "search_query": query,
            "start": 0,  # Starts from page 1 of the results obtained
            "max_results": self.config.num_results,  # Adjust the number of results as needed
            "sortBy": self.config.sort_by,  # Sort by submission date
            "sortOrder": self.config.sort_order,  # Sort in descending order (latest first)
        }
        return requests.get(ARXIV_API_ACCESS_POINT, params=params)

    def get_article_search_result(self):
        """Retrieve article search results from the arXiv API and log details.

        Fetches the API response, parses the XML content, and extracts the
        title, summary, link, and authors for each article.

        Returns:
            list of str: A list of article links retrieved from the arXiv API.
        """
        response = self.get_arxiv_api_response()  # Fetch the API response
        article_links = []
        if response.status_code == 200:  # Check if the request was successful
            # Parse the response (arXiv API returns XML)
            data = xmltodict.parse(response.content)
            # NOTE(review): xmltodict returns a single dict (not a list) when
            # the feed contains exactly one entry — confirm num_results > 1 is
            # always the case, or guard for it.
            for entry in data["feed"]["entry"]:
                title, summary, link, authors = (
                    entry["title"],
                    entry["summary"],
                    entry["id"],
                    [author["name"] for author in entry["author"]],
                )
                app_logger.info(
                    f"Title: {title}\n Authors: {authors} \n,Abstract: {summary}\n Page Link: {link}\n PDF Link: {self.convert_link_to_pdflink(link)}\n Paper Id: {link.split('/')[-1]}\n {'-'*80}"
                )
                article_links.append(link)
        else:
            app_logger.info(f"Failed to retrieve papers: {response.status_code}")
        return article_links

    def download_pdf(self, pdf_url):
        """Download the PDF at *pdf_url* into the local ``data/`` directory."""
        response = requests.get(pdf_url)
        # Fail loudly on HTTP errors instead of writing an error page to disk.
        response.raise_for_status()
        # Bug fix: the previous code used pdf_url.split("/")[0], which yields
        # the URL scheme ("https:") and saved every paper as "data/https:.pdf".
        # Use the last path segment (e.g. "2106.07691v1.pdf") instead.
        title = pdf_url.split("/")[-1].removesuffix(".pdf")
        with open(f"data/{title}.pdf", "wb") as f:
            f.write(response.content)
        # Use the shared logger (the rest of the class logs via app_logger).
        app_logger.info(f"Downloaded: {title}.pdf")
src/research_assistant/components/pdfParser.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ from pdfminer.high_level import extract_pages
4
+ from pdfminer.layout import LTTextContainer
5
+
6
+ from research_assistant.app_logging import app_logger
7
+
8
+
9
+ def pdf_parser(pdf_path):
10
+ """
11
+ Extracts text from a PDF file, removing headers, footers, and page numbers.
12
+ Args:
13
+ pdf_path (str): The file path to the PDF.
14
+ Returns:
15
+ str: The extracted text suitable for LLM input.
16
+ """
17
+ extracted_text = []
18
+ header_counter, footer_counter = {}, {}
19
+ header_patterns, footer_patterns = set(), set()
20
+ # Matches lines with page numbers
21
+ page_number_pattern = re.compile(r"^(Page\s+)?\d+(/\d+)?$")
22
+
23
+ try:
24
+ # First pass: identify headers and footers by tracking recurring lines
25
+ total_pages = 0
26
+ for page_layout in extract_pages(pdf_path):
27
+ total_pages += 1
28
+ page_text = [
29
+ element.get_text().strip()
30
+ for element in page_layout
31
+ if isinstance(element, LTTextContainer) and element.get_text().strip()
32
+ ]
33
+
34
+ if len(page_text) >= 2:
35
+ header, footer = page_text[0], page_text[-1]
36
+ header_counter[header] = header_counter.get(header, 0) + 1
37
+ footer_counter[footer] = footer_counter.get(footer, 0) + 1
38
+
39
+ # Determine most common headers and footers
40
+ header_patterns = {
41
+ k for k, v in header_counter.items() if v > total_pages * 0.5
42
+ }
43
+ footer_patterns = {
44
+ k for k, v in footer_counter.items() if v > total_pages * 0.5
45
+ }
46
+
47
+ # Compile regex patterns
48
+ header_regexes = [re.compile(re.escape(header)) for header in header_patterns]
49
+ footer_regexes = [re.compile(re.escape(footer)) for footer in footer_patterns]
50
+
51
+ # Second pass: extract and clean text
52
+ for page_layout in extract_pages(pdf_path):
53
+ page_text = [
54
+ element.get_text().strip()
55
+ for element in page_layout
56
+ if isinstance(element, LTTextContainer) and element.get_text().strip()
57
+ ]
58
+ extracted_text.extend(
59
+ line
60
+ for line in page_text
61
+ if not any(regex.match(line) for regex in header_regexes)
62
+ and not any(regex.match(line) for regex in footer_regexes)
63
+ and not page_number_pattern.match(line)
64
+ )
65
+ return " ".join(extracted_text).replace("\n", " ").strip()
66
+ except Exception as e:
67
+ app_logger.error(f"Failed to parse PDF {pdf_path}: {e}")
68
+ return ""
src/research_assistant/components/planner.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Optional
2
+
3
+ from langchain_anthropic.chat_models import ChatAnthropic
4
+ from langchain_core.messages import HumanMessage, SystemMessage
5
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
6
+ from langchain_core.runnables import RunnableConfig
7
+ from langchain_core.tools import StructuredTool
8
+ from pydantic import BaseModel, Field
9
+
10
+ from research_assistant.components.plannerParser import PlannerParser
11
+ from research_assistant.constants import HEILMEIER_CATECHISM
12
+
13
+
14
+ class PlannerOutput(BaseModel):
15
+ plan_str: str = Field(
16
+ ...,
17
+ description=(
18
+ "This plan includes a detailed breakdown of each step, specifying the task, the tool used, the arguments provided, and any dependencies (outputs from previous steps) required as inputs for that step. An example of a single step would be:\n"
19
+ 'Plan_step: "Using the different shots from #E2, analyze their impact on the game. #E3: LLM [What impact do the shots mentioned in #E2 have on the game?]"'
20
+ ),
21
+ )
22
+ tools: list[str] = Field(
23
+ ..., description="The Tool each step of the plan needs to use."
24
+ )
25
+ dependencies: Dict[int, list[int]] = Field(
26
+ ...,
27
+ description=(
28
+ "A dictionary of dependencies, which elaborates on what outputs are needed for each step in the plan to execute. So that the output of those steps is retrieved and added inside the prompt for the present step. An example of a dependency would be: {2 : [1]}"
29
+ ),
30
+ )
31
+ arguments: list[str] = Field(
32
+ ...,
33
+ description="The arguments that the tool needs to be given. These arguments will be used in the prompt to get the output.",
34
+ )
35
+
36
+
37
+ def get_planner(llm: ChatAnthropic):
38
+ prompt_template = ChatPromptTemplate.from_messages(
39
+ [
40
+ SystemMessage(
41
+ (
42
+ f"You are a research assistant whose primary job is to explain a research article in a clear and accessible way. Your goal is to read the entire article and provide an explanation that allows other researchers to understand its content without having to read it themselves. Additionally, you should be able to answer any questions they might have. The most efficient way to accomplish this is by answering the following Heilmeier catechism questions in detail:\n{HEILMEIER_CATECHISM}\n\n"
43
+ # comment for readability
44
+ "You are a planner of the research assistant agent architecture. You need to generate a step-by-step plan process such that you can get all the answers to the given questions using the provided tools:\n"
45
+ "(1) Arxiv [input]: A tool that searches for results from the Arxiv website. It is useful for finding information on specific topics. The input should be a concise text string, similar to a search query used in a Google search. This tool searches published articles and provides details about the article and a summary of its content. The information obtained is reliable, so if you need information not covered in the research article or require external information, use this tool.\n"
46
+ "(2) LLM [input]: A pretrained language model that can answer any questions. You provide the query and additional context, and it generates a relevant, summarized answer. The additional context may include the output from previous steps or evidence gathered using the Arxiv tool.\n\n"
47
+ # comment for readability
48
+ "For example,\n"
49
+ "Task: Explain different kinds of cricket shots.\n"
50
+ "plan_str:\n"
51
+ '1. Start by finding different kinds of cricket shots. #E1 = Arxiv["Different kinds of cricket shots"].\n'
52
+ '2. Given the result of the search query, find different types of cricket shots. #E2 = LLM ["Find the different types of cricket shots given the result of search query #E1."]\n'
53
+ '3. Now, let us find out about different types of cricket shots and their impact on the game. #E3 = LLM ["Given the different types of cricket shots from the step #E1 till step #E2, how does their impact on the game look like?"]\n\n'
54
+ # comment for readability
55
+ "Describe the steps of your plan with rich details. Each step of the plan should contain #E as shown in the example. DO NOT write a step at the end to summarize the plan."
56
+ )
57
+ ),
58
+ MessagesPlaceholder(variable_name="context", optional=True),
59
+ MessagesPlaceholder(variable_name="article_text"),
60
+ ]
61
+ )
62
+ planner = prompt_template | llm
63
+
64
+ # parse the response to get the plan, tasks, tools, dependencies, and arguments
65
+ def parse_plan(plan_string: str):
66
+ parser = PlannerParser(plan_string=plan_string)
67
+ return PlannerOutput(
68
+ plan_str=plan_string,
69
+ tools=parser.get_tool_list(),
70
+ dependencies=parser.get_dependency_list(),
71
+ arguments=parser.get_argument_list(),
72
+ )
73
+
74
+ def get_plan(
75
+ article_text: str,
76
+ _context: Optional[List[str]] = None, # TODO: rename when context is used
77
+ _config: Optional[RunnableConfig] = None, # TODO: rename when config is used
78
+ ):
79
+ response = planner.invoke(
80
+ {
81
+ "article_text": [
82
+ HumanMessage(
83
+ f"You are given a research document with the following content:\n{article_text}.\n\n"
84
+ "Read the research document thoroughly. Using the tools provided to you, generate a step-by-step plan that would use these tools in the specified step-wise manner to get a detailed summary for all the questions."
85
+ )
86
+ ]
87
+ }
88
+ )
89
+ if isinstance(response.content, str):
90
+ return parse_plan(response.content)
91
+ else:
92
+ raise TypeError(
93
+ "Response.Content i.e the plan given out from the llm must be a string"
94
+ )
95
+
96
+ return StructuredTool.from_function(
97
+ name="planner",
98
+ func=get_plan,
99
+ description=(
100
+ (
101
+ 'This tool is used to generate a plan for obtaining a summary of research articles. Rather than providing the entire summary, it focuses on creating a step-by-step plan that guides the agent in producing a detailed, accurate summary of a research article. This tool can be considered the "brain" that designs the agent\'s workflow.\n'
102
+ "For Example:\n"
103
+ "Input: The parsed pdf string of the article\n"
104
+ "Answer: An object consisting of the following fields:\n"
105
+ "plan_string: str\n"
106
+ "steps : List[str]\n"
107
+ "tools : List[str]\n"
108
+ "dependencies : dict\n"
109
+ "arguments : List[str]\n"
110
+ )
111
+ ),
112
+ )
src/research_assistant/components/plannerParser.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.messages import HumanMessage, SystemMessage
2
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
3
+
4
+ from research_assistant.components.agent import Agent
5
+ from research_assistant.config.configuration import ConfigurationManager
6
+
7
+
8
class PlannerParser:
    """Parses a planner-generated plan string into structured pieces.

    Runs three independent LLM extraction passes over the free-text plan:
    one for the per-step tool names, one for the per-step tool arguments,
    and one for the inter-step dependencies.  Each pass constrains the
    model with ``with_structured_output`` and a JSON schema so the answer
    comes back as a typed dict.
    """

    # JSON schema for the tool-extraction pass: one tool name per plan step.
    tool_list_schema = {
        "title": "ToolSchema",
        "description": "This is the schema used to get tools list, after parsing the plan string, which will be used to do tool or fucntion calling for the further parts of the agentic framework",
        "type": "object",
        "properties": {
            "tools": {
                "type": "array",
                "items": {"type": "string"},
                "title": "Tools",
                "description": "The Tool each step in the plan needs to access. If we have seven steps, this has a list of 7 values depicting the tool name for each step that needs to be used",
                "default": [],
            },
        },
        "required": ["tools"],
    }
    # JSON schema for the argument-extraction pass: one argument string per step.
    argument_list_schema = {
        "title": "ArgumentSchema",
        "description": "This is the schema used to get argument list used for each tool call, after parsing the plan string, which will be used to do tool or fucntion calling for the further parts of the agentic framework",
        "type": "object",
        "properties": {
            "arguments": {
                "type": "array",
                "items": {"type": "string"},
                "title": "Arguments",
                "description": "The Arugment for tool call for each step in the plan needs to access. If we have seven steps, this has a list of 7 values depicting the argument value for each step that needs to be used",
                "default": [],
            },
        },
        "required": ["arguments"],
    }
    # JSON schema for the dependency pass: step id -> list of earlier step ids.
    dependency_list_schema = {
        "title": "DependentSchema",
        "description": "This schema defines dependencies for each tool call to fetch additional context after parsing the plan string for the agentic framework.",
        "type": "object",
        "properties": {
            "dependencies": {
                "type": "object",
                "additionalProperties": {
                    "type": "array",
                    "items": {"type": "integer"},
                },
                "title": "Dependencies",
                "description": "A dictionary where each key is a step identifier, and the value is an array of integers, representing the step dependencies.",
                "default": {},
            }
        },
        "required": ["dependencies"],
    }

    def __init__(self, plan_string: str):
        """Store the raw plan and load the parser's model configuration.

        Args:
            plan_string: the free-text plan emitted by the planner.
        """
        self.plan_string = plan_string
        self.config = ConfigurationManager().get_planner_parser_config()

    def get_tool_list(self):
        """Return the tool name used by each step of the plan.

        Returns:
            list[str]: one tool name per plan step, in step order.
        """
        llm = Agent(self.config.tool_list_model).get_model()
        prompt_template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(
                    (
                        "You are a planner parser. You will be given a plan consisting of a series of steps, and you need to give me a list consisting of what tool is being used for each step.\n"
                        "So if there are 7 steps, I need a list of length 7, where each value is the tool that is being used for the corresponding step.\n"
                        "For example:\n"
                        '1. Find out the temperature right now. #E1 = Google["What is the temperature near me?"]\n'
                        '2. Check the weather forecast for tomorrow. #E2: WeatherAPI["What will be the weather like tomorrow?"]\n'
                        '3. Get the top 5 news articles related to the topic of climate change. !3 = NewsAPI["What are the top 5 news articles about climate change?"]\n'
                        "If this is the given plan, the output should be: ['Google', 'WeatherAPI', 'NewsAPI'].\n"
                        "Be very careful that you don't miss any step or tool. The number of steps and tools should be the same. Check your output thoroughly. Write only one tool on each line."
                    ),
                ),
                MessagesPlaceholder(variable_name="plan"),
                MessagesPlaceholder(variable_name="context", optional=True),
            ]
        )
        # Structured output forces the model to fill the tool_list_schema.
        tool_parser = prompt_template | llm.with_structured_output(
            self.tool_list_schema
        )
        return tool_parser.invoke(
            {
                "plan": [
                    HumanMessage(
                        f"This is the generated plan:\n{self.plan_string}\n\n"
                        "Now parse this content and give me the list of tools for each step."
                    )
                ]
            }
        )["tools"]

    def get_argument_list(self):
        """Return the argument passed to the tool call of each step.

        Returns:
            list[str]: one argument string per plan step, in step order.
        """
        llm = Agent(self.config.argument_list_model).get_model()
        prompt_template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(
                    (
                        "You are a planner parser. You take in a plan consisting of a series of steps, and you need to give me a list consisting of what argument is being used for each tool call in each step.\n"
                        "So if there are 7 steps, I need a list of length 7, where each value is the argument that is being called inside the tool for each step. A step is defined as #Ex where x is the step number. An argument will always be of the format #Ex = tool_name['argument']\n"
                        "For example:\n"
                        '1. Find out the temperature right now. #E1 = Google["What is the temperature near me?"]\n'
                        '2. Check the weather forecast for tomorrow. #E2 : WeatherAPI["What will be the weather like tomorrow?"]\n'
                        '3. Get the top 5 news articles related to the topic of climate change. !3 = NewsAPI["What are the top 5 news articles about climate change?"]\n'
                        "If this is the given plan, the output should be: ['What is the temperature near me?', 'What will be the weather like tomorrow?', 'What are the top 5 news articles about climate change?']\n"
                        "Be very careful that you don't miss any step or argument. The number of steps and arguments should be the same. Check your output thoroughly."
                    ),
                ),
                MessagesPlaceholder(variable_name="plan"),
                MessagesPlaceholder(variable_name="context", optional=True),
            ]
        )
        # Structured output forces the model to fill the argument_list_schema.
        argument_parser = prompt_template | llm.with_structured_output(
            self.argument_list_schema
        )
        return argument_parser.invoke(
            {
                "plan": [
                    HumanMessage(
                        f"This is the generated plan:\n{self.plan_string}\n\n"
                        "Give me the list of arguments of each tool call for each step. For each step I need to know what is the argument that is being passed inside the tool. If I have 10 steps, I need a list of length 10, consisting of the query or argument for each tool call in each step."
                    )
                ]
            }
        )["arguments"]

    def get_dependency_list(self):
        """Return which earlier steps each step's argument depends on.

        Returns:
            dict: step identifier -> list of earlier step numbers it needs.
        """
        llm = Agent(self.config.dependency_list_model).get_model()
        prompt_template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(
                    (
                        "You are a planner parser. You get a plan consisting of a series of steps, and you need to give me a dictionary consisting of what step results each step argument is dependent upon. If there are 7 steps and 5 steps require the results of previous steps, I need a dictionary containing those 5 keys, where each value is a list of step numbers that the key step is dependent upon. For example:\n"
                        '1. Find out the temperature right now. #E1 = Location["What is my current location?"]\n'
                        '2. Check the weather forecast for tomorrow. #E2 : WeatherAPI["What will be the weather like tomorrow at #E1?"]\n'
                        '3. Get the top 5 news articles related to the topic of climate change. !3 = LLM["What are major tourist things to do based on the information of #E1 and #E2?"]\n'
                        "If this is the given plan, the output should be:\n"
                        "{'2': [1], '3': [1, 2]}\n"
                        "This is because step 2 is dependent on the value of #E1, and step 3 is dependent on the value of #E2 and #E1.\n"
                        "A step cannot depend on itself. A step cannot depend on any step that comes after it. A dependency only exists if a step's content has 'from #Ex' where x is the step number.\n"
                        "For example:\n"
                        '#E8 = LLM["What were the main results and findings from their experiments? Explain the results in simple terms without technical jargon"]\n'
                        '#E9 = Arxiv["UNLI limitations natural language inference"]\n'
                        "Under no circumstances can the output be: {'8': [9]}\n"
                        "This is because 8 comes before 9.\n\n"
                        "Be very careful that you don't miss any step or dependency. The number of steps and dependencies should be the same. Check your output thoroughly."
                    ),
                ),
                MessagesPlaceholder(variable_name="plan"),
                MessagesPlaceholder(variable_name="context", optional=True),
            ]
        )
        # Structured output forces the model to fill the dependency_list_schema.
        dependency_parser = prompt_template | llm.with_structured_output(
            self.dependency_list_schema
        )
        return dependency_parser.invoke(
            {
                "plan": [
                    HumanMessage(
                        f"This is the generated plan:\n{self.plan_string}\n\n"
                        "Parse this content and give me the dictionary of dependencies. Look at each step and see on what steps each argument of the step is dependent upon, and give me the values in the form of a key value pair. Key being the step number and value being the list of step numbers that the key is dependent upon."
                    )
                ]
            }
        )["dependencies"]
src/research_assistant/components/solver.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional
2
+
3
+ from langchain_anthropic.chat_models import ChatAnthropic
4
+ from langchain_core.messages import HumanMessage, SystemMessage
5
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
6
+ from langchain_core.runnables import RunnableConfig
7
+ from langchain_core.tools import StructuredTool
8
+ from pydantic import BaseModel, Field
9
+
10
+ from research_assistant.app_logging import app_logger
11
+ from research_assistant.constants import HEILMEIER_CATECHISM
12
+
13
+
14
class SolverResponse(BaseModel):
    """Structured response returned by the solver tool (the final summary)."""

    answer: str = Field(
        ...,
        description="The summary of the research article ",
    )
21
+
22
+
23
def get_solver(llm: ChatAnthropic):
    """Build the solver tool that turns plan Q&A evidence into a summary.

    Args:
        llm: the chat model used to synthesize the final answer.

    Returns:
        StructuredTool: a tool whose function takes the joined plan/evidence
        string and returns a :class:`SolverResponse` with the summary.
    """
    prompt_template = ChatPromptTemplate.from_messages(
        [
            SystemMessage(
                (
                    "You are a research assistant responsible for simplifying and explaining the core concepts of a research article to a user who may not be familiar with technical terms. You will be given a plan created by a planner tool, which breaks down the main ideas of the research article through a series of questions. Each question is paired with an answer, providing insights into the purpose, methodology, and key findings of the article. Your task is to synthesize these questions and answers to produce a clear, concise summary that captures the main message of the research article. The summary should enable the user to understand the article's significance, its contributions, and whether it contains information relevant to their needs or goals. Your summary should be:\n"
                    "1. Simple and accessible, avoiding technical jargon.\n"
                    "2. Comprehensive enough to convey the article's goals and key insights.\n"
                    "3. Informative so that the user can decide if reading the full article is worth their time.\n"
                    "Please proceed by summarizing based on these questions and answers. Make sure to respond in the markdown format."
                ),
            ),
            MessagesPlaceholder(variable_name="context", optional=True),
            MessagesPlaceholder(variable_name="text"),
        ]
    )
    solver = prompt_template | llm

    def get_joined_answer(
        input: str,
        _context: Optional[List[str]] = None,  # TODO: rename when context is used
        _config: Optional[RunnableConfig] = None,  # TODO: rename when config is used
    ):
        """Invoke the solver chain on the plan+evidence string.

        Args:
            input: the joined "Plan: ... Answer = ..." lines for every step.
            _context: unused placeholder for future context injection.
            _config: unused placeholder runnable configuration.

        Returns:
            SolverResponse: the synthesized summary answering the
            Heilmeier catechism questions.
        """
        response = solver.invoke(
            {
                "text": [
                    HumanMessage(
                        (
                            f"Solve the following task or question. To solve the question, we have made a step-by-step plan and retrieved corresponding evidence for each plan. Use them with caution since long evidence might contain irrelevant information. Here's the plan with the evidence:\n{input}\n\n"
                            f"Now solve the question or task according to the provided evidence above. Respond with:\n{HEILMEIER_CATECHISM}\n\n"
                            "Since you are a research assistant, you need to be as detailed as possible to help me understand your breakdown of the research document. Assume I have no prior knowledge of the document’s content. Make it clear, comprehensive, and easy to understand. Avoid complex language and technical jargon. The more in-depth the explanation, the better\n"
                            "The output should be answering all the Heilmeier catechism questions using the obtained evidence information. Answer all the questions.\n"
                        )
                    )
                ]
            }
        )
        # Anthropic responses can carry structured (non-string) content;
        # coerce to str so SolverResponse.answer validation never fails.
        if not isinstance(response.content, str):
            app_logger.info(
                "The response from solver is not a string. It is %s",
                type(response.content),
            )
            response.content = str(response.content)
        return SolverResponse(answer=response.content)

    return StructuredTool.from_function(
        name="solver",
        func=get_joined_answer,
        description="This is a tool which takes in a list of questions and the answers provide to those questions, and generates a summary by using all of this information.",
    )
src/research_assistant/components/state.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from typing_extensions import TypedDict
4
+
5
+
6
class ResearchSummary(TypedDict):
    """Shared state flowing between the plan/tool/solve graph nodes."""

    # Full extracted text of the article being summarized.
    article_text: str
    # Raw free-text plan produced by the planner.
    plan_string: str
    # Step number -> list of earlier step numbers whose results it needs.
    dependencies: dict
    # Tool name for each plan step (parallel to `arguments`).
    tools: List[str]
    # Tool-call argument for each plan step (parallel to `tools`).
    arguments: List[str]
    # Step number -> stringified tool result, filled in as steps execute.
    results: dict
    # Final synthesized summary from the solver node.
    result: str
src/research_assistant/config/__init__.py ADDED
File without changes
src/research_assistant/config/configuration.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from research_assistant.constants import CONFIG_FILE_PATH
2
+ from research_assistant.entity import (
3
+ ArticleSearchConfig,
4
+ PlannerConfig,
5
+ PlannerParserConfig,
6
+ QaToolConfig,
7
+ SolverConfig,
8
+ SubmittedDateConfig,
9
+ articleLoaderConfig,
10
+ )
11
+ from research_assistant.utils.common import read_yaml
12
+
13
+
14
class ConfigurationManager:
    """Loads the YAML config file and exposes typed per-component configs."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Read the YAML configuration once and keep it for all getters."""
        self.config = read_yaml(config_filepath)

    def get_article_details_config(self) -> articleLoaderConfig:
        """Config for loading an article and where to save its summary."""
        details = self.config.article_details
        return articleLoaderConfig(
            file_path=details.file_path,
            summary_save_dir=details.summary_save_dir,
        )

    def get_article_search_params(self) -> ArticleSearchConfig:
        """Search terms, result count, date window and ordering for arXiv."""
        details = self.config.article_search_details
        window = SubmittedDateConfig(
            start_date=details.date_range.start_date,
            end_date=details.date_range.end_date,
        )
        return ArticleSearchConfig(
            search_terms=details.search_terms,
            num_results=details.num_results,
            date_range=window,
            sort_by=details.sort_by,
            sort_order=details.sort_order,
        )

    def get_planner_config(self) -> PlannerConfig:
        """Model selection for the planner component."""
        return PlannerConfig(model_name=self.config.planner.model_name)

    def get_qa_tool_config(self) -> QaToolConfig:
        """Model selection for the QA tool component."""
        return QaToolConfig(model_name=self.config.qa_tool.model_name)

    def get_solver_config(self) -> SolverConfig:
        """Model selection for the solver component."""
        return SolverConfig(model_name=self.config.solver.model_name)

    def get_planner_parser_config(self) -> PlannerParserConfig:
        """Model selections for the three planner-parser extraction passes."""
        parser_cfg = self.config.planner_parser
        return PlannerParserConfig(
            tool_list_model=parser_cfg.tool_list_model,
            argument_list_model=parser_cfg.argument_list_model,
            dependency_list_model=parser_cfg.dependency_list_model,
        )
src/research_assistant/constants/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path

# Location of the project's YAML configuration, relative to the working dir.
CONFIG_FILE_PATH = Path("config/config.yaml")
# Question framework injected into the solver prompt; the final summary must
# answer each of these items in order.
HEILMEIER_CATECHISM = (
    "1. What are they trying to do? Articulate the objectives using absolutely no jargon.\n"
    "2. How was it done before this article, and what are the limitations of those practices?\n"
    "3. What is new in their approach, and why do they think it will be successful?\n"
    "4. Who cares? If they are successful, what difference will it make?\n"
    "5. What experiment do they design to show their approach works? What dataset or question set did they use? What LLMs or other AI systems did they work? How did they measure effectiveness?\n"
    "6. What were the results? What do they show? Again, articulate this using absolutely no jargon.\n"
)
# Base URL of the public arXiv query API.
ARXIV_API_ACCESS_POINT = "http://export.arxiv.org/api/query"
src/research_assistant/entity/__init__.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass
from pathlib import Path
from typing import List


# NOTE(review): name breaks PascalCase convention; renaming would break
# existing callers, so it is left as-is.
@dataclass(frozen=True)
class articleLoaderConfig:
    """Where to read the article from and where to write its summary."""

    file_path: Path
    summary_save_dir: Path

@dataclass(frozen=True)
class SubmittedDateConfig:
    """Inclusive submission-date window for the arXiv search."""

    start_date: str
    end_date: str

@dataclass(frozen=True)
class ArticleSearchConfig:
    """All parameters driving an arXiv article search."""

    search_terms: List[str]
    num_results: int
    date_range: SubmittedDateConfig
    sort_by: str
    sort_order: str

@dataclass(frozen=True)
class PlannerConfig:
    """Model selection for the planner component."""

    model_name: str

@dataclass(frozen=True)
class QaToolConfig:
    """Model selection for the QA tool component."""

    model_name: str

@dataclass(frozen=True)
class SolverConfig:
    """Model selection for the solver component."""

    model_name: str

@dataclass(frozen=True)
class PlannerParserConfig:
    """Model selections for the three planner-parser extraction passes."""

    tool_list_model: str
    argument_list_model: str
    dependency_list_model: str
src/research_assistant/main.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from pathlib import Path
3
+
4
+ from dotenv import load_dotenv
5
+
6
+ from research_assistant.app_logging import app_logger
7
+ from research_assistant.config.configuration import ConfigurationManager
8
+ from research_assistant.pipeline.articleSearch import ArticleSearchPipeline
9
+ from research_assistant.pipeline.articleSummarization import ArticleSummarization
10
+ from research_assistant.utils.common import write_summary_to_file
11
+
12
+
13
def article_summarization():
    """Run the end-to-end summarization pipeline for the configured article.

    Loads environment variables from `.env` (API keys), reads the article
    path from configuration, produces the summary via the langgraph
    pipeline, and writes it to the configured summary directory.
    """
    app_logger.info("Starting the Summarization Pipeline")
    load_dotenv(Path(".env"))
    article_config = ConfigurationManager().get_article_details_config()
    filepath = article_config.file_path
    app_logger.info(f"Processing file: {filepath}")
    summary = ArticleSummarization(filepath).get_summary()
    app_logger.info(f"Summary: {summary}")
    app_logger.info("Completed Summarizing the article")
    write_summary_to_file(article_config, summary)
    app_logger.info("Summarization Pipeline completed successfully")
24
+
25
+
26
def article_search():
    """Run the arXiv article-search pipeline with the configured parameters.

    Results are only counted and logged here; callers needing the list
    should use ArticleSearchPipeline directly.
    """
    app_logger.info("Starting the article search pipeline")
    arxiv_search_details = ConfigurationManager().get_article_search_params()
    article_search = ArticleSearchPipeline(arxiv_search_details)
    article_list = article_search.get_article_list()
    app_logger.info(
        f"Completed searching for articles. We found a total of {len(article_list)} articles"
    )
34
+
35
+
36
def main():
    """CLI entry point: dispatch to a pipeline based on the flag provided.

    Flags are mutually exclusive in effect: --summarize_article wins if
    both are passed; with neither, a hint is logged and nothing runs.
    """
    parser = argparse.ArgumentParser(description="Research Assistant CLI")
    parser.add_argument(
        "--summarize_article",
        action="store_true",
        # Fixed user-facing typo: "summmarization" -> "summarization".
        help="Runs the article summarization pipeline",
    )
    parser.add_argument(
        "--search_articles",
        action="store_true",
        help="Run the article search pipeline",
    )
    args = parser.parse_args()
    if args.summarize_article:
        article_summarization()
    elif args.search_articles:
        article_search()
    else:
        app_logger.info("No valid arguments provided. Use --help for options.")


if __name__ == "__main__":
    main()
src/research_assistant/pipeline/__init__.py ADDED
File without changes
src/research_assistant/pipeline/articleSearch.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from research_assistant.app_logging import app_logger
2
+ from research_assistant.components.arxiv_search_api import ArxivApiWrap
3
+ from research_assistant.entity import ArticleSearchConfig
4
+
5
+
6
class ArticleSearchPipeline:
    """Thin pipeline that queries arXiv with the configured search params."""

    def __init__(self, config: ArticleSearchConfig):
        """Keep the search configuration for later execution."""
        self.config = config

    def get_article_list(self):
        """Run the search and return the article list (may be empty).

        An empty result is logged but still returned so callers can
        decide how to react.
        """
        articles = ArxivApiWrap(self.config).get_article_search_result()
        if not articles:
            app_logger.info("No articles were found for the given parameters.")
        return articles
src/research_assistant/pipeline/articleSummarization.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.messages import HumanMessage
2
+ from langgraph.graph import END, START, StateGraph
3
+
4
+ from research_assistant.app_logging import app_logger
5
+ from research_assistant.components.agent import Agent
6
+ from research_assistant.components.agent_tools import get_arxiv_tool, get_qa_tool
7
+ from research_assistant.components.pdfParser import pdf_parser
8
+ from research_assistant.components.planner import get_planner
9
+ from research_assistant.components.solver import get_solver
10
+ from research_assistant.components.state import ResearchSummary
11
+ from research_assistant.config.configuration import ConfigurationManager
12
+ from research_assistant.utils.state_utils import SummaryStateUtils
13
+
14
+
15
class ArticleSummarization:
    """Plan -> execute-tools -> solve pipeline summarizing one PDF article.

    Builds a langgraph StateGraph whose shared state is ResearchSummary:
    the planner node produces a step plan, the tool node executes one step
    per visit until all steps have results, and the solver node synthesizes
    the final summary.
    """

    def __init__(self, file_path):
        """Remember the article path and set up config/state helpers."""
        self.article_path = file_path
        self.config = ConfigurationManager()
        self.summary_utils = SummaryStateUtils()

    # Instantiate and return the chat model configured for the named
    # workflow component ("planner", "qa_tool" or "solver").
    def get_model(self, component: str):
        """Return the chat model configured for *component*.

        Raises:
            ValueError: if *component* is not a known workflow component.
        """
        if component == "planner":
            config = self.config.get_planner_config()
        elif component == "qa_tool":
            config = self.config.get_qa_tool_config()
        elif component == "solver":
            config = self.config.get_solver_config()
        else:
            raise ValueError("Invalid component name for getting the Model")
        agent = Agent(config.model_name)
        return agent.get_model()

    # Planner node: generate the plan and its parsed pieces for the article.
    def get_plan(self, state: ResearchSummary):
        """Produce the plan string plus parsed tools/arguments/dependencies.

        Raises:
            ValueError: if the parsed tool and argument lists disagree in
                length (the plan was not parsed consistently).
        """
        response = get_planner(llm=self.get_model("planner")).invoke(
            {"article_text": state["article_text"]}
        )
        if len(response.tools) != len(response.arguments):
            raise ValueError("The Plan string is not parsed properly")
        app_logger.info(f"The plan produced is: {response.plan_str}")
        return {
            "plan_string": response.plan_str,
            "dependencies": response.dependencies,
            "arguments": response.arguments,
            "tools": response.tools,
        }

    # Tool node: execute exactly one pending plan step per invocation.
    def tool_execution(self, state: ResearchSummary):
        """Worker node that executes the tools of a given plan.

        Runs the tool for the next unfinished step and stores its result,
        keyed by step number, in the state's results dict.

        Raises:
            ValueError: if the step's tool is neither "Arxiv" nor "LLM".
        """
        current_step = self.summary_utils.get_current_task(state)
        arg, tools = state["arguments"], state["tools"]
        results_dict = (state["results"] or {}) if "results" in state else {}
        # Tool calling for each step.
        if tools[current_step - 1] == "Arxiv":
            result = get_arxiv_tool().run(arg[current_step - 1])
        elif tools[current_step - 1] == "LLM":
            result = get_qa_tool(llm=self.get_model("qa_tool")).invoke(
                {
                    "question": arg[current_step - 1],
                    "context": self.summary_utils.get_current_dependencies(
                        state, current_step
                    ),
                }
            )
        else:
            raise ValueError
        # Store the result in the results dictionary with the step number as key.
        results_dict[current_step] = str(result)
        return {"results": results_dict}

    # Solver node: synthesize the final answer from all collected results.
    def solve(self, state: ResearchSummary):
        """Return the final summary built from the plan/evidence string."""
        return {
            "result": get_solver(llm=self.get_model("solver"))
            .invoke(self.summary_utils.get_plan_results(state))
            .answer
        }

    # Wire plan/tool/solve into the langgraph execution graph.
    def get_graph(self):
        """Compile the StateGraph: START -> plan -> tool (loop) -> solve -> END."""
        graph = StateGraph(ResearchSummary)
        graph.add_node("plan", self.get_plan)
        graph.add_node("tool", self.tool_execution)
        graph.add_node("solve", self.solve)
        graph.add_edge("plan", "tool")
        graph.add_edge("solve", END)
        # route() sends control back to "tool" until every step has a result.
        graph.add_conditional_edges("tool", self.summary_utils.route)
        graph.add_edge(START, "plan")
        return graph.compile()

    # Run the whole workflow for the article at self.article_path.
    def get_summary(self):
        """Parse the PDF, stream the graph to completion, return the summary."""
        app = self.get_graph()
        for s in app.stream({"article_text": pdf_parser(self.article_path)}):
            final_output = s
        # The last streamed update comes from the "solve" node.
        return final_output["solve"]["result"]
src/research_assistant/utils/__init__.py ADDED
File without changes
src/research_assistant/utils/common.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import yaml
4
+ from box import ConfigBox
5
+ from box.exceptions import BoxValueError
6
+
7
+ from research_assistant.app_logging import app_logger
8
+ from research_assistant.entity import articleLoaderConfig
9
+
10
+
11
def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """Read a YAML file and return its contents wrapped in a ConfigBox.

    Args:
        path_to_yaml (Path): path of the YAML file to load.

    Raises:
        ValueError: if the YAML file is empty (Box rejects a None payload).

    Returns:
        ConfigBox: the parsed YAML with attribute-style access.
    """
    try:
        with open(path_to_yaml) as yaml_file:
            app_logger.info(f"yaml file: {path_to_yaml} loaded successfully")
            return ConfigBox(yaml.safe_load(yaml_file))
    except BoxValueError as e:
        raise ValueError("yaml file is empty") from e
27
+
28
+
29
def create_directories(path_to_directories: list, verbose=True):
    """Create each directory in the list, including missing parents.

    Existing directories are left untouched (idempotent).

    Args:
        path_to_directories (list): directory paths to create.
        verbose (bool, optional): log each creation when True. Defaults to True.
    """
    for path in path_to_directories:
        Path(path).mkdir(parents=True, exist_ok=True)
        if not verbose:
            continue
        app_logger.info(f"created directory at: {path}")
39
+
40
+
41
def write_to_file(filename, text):
    """Write *text* to *filename*, replacing any existing content.

    Args:
        filename (str | Path): destination file path.
        text (str): text to write.
    """
    # utf-8 pinned explicitly: the platform default encoding can mangle
    # summaries containing non-ASCII characters (e.g. on Windows).
    with open(filename, "w", encoding="utf-8") as file:
        file.write(text)
    # Bug fix: the log line was a placeholder-less f-string, so the actual
    # filename was never reported.
    app_logger.info(f"wrote text to file: {filename}")
50
+
51
+
52
def write_summary_to_file(config: articleLoaderConfig, text: str):
    """Save *text* as ``summary_<article-stem>.md`` in the configured dir.

    Args:
        config: carries the source article path and the summary directory.
        text: the summary markdown to persist.
    """
    create_directories([config.summary_save_dir])
    summary_name = f"summary_{Path(config.file_path).stem}.md"
    write_to_file(Path(config.summary_save_dir) / summary_name, text)
src/research_assistant/utils/state_utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from research_assistant.components.state import ResearchSummary
2
+
3
+
4
class SummaryStateUtils:
    """Helpers for inspecting and routing the ResearchSummary graph state."""

    def get_current_task(self, state: ResearchSummary):
        """Return the 1-based index of the next step, or None when all done."""
        results = state.get("results")
        if not results:
            return 1
        if len(results) == len(state["arguments"]):
            return None
        return len(results) + 1

    def get_current_dependencies(self, state: ResearchSummary, step: int):
        """Collect the results *step* depends on; fall back to the article text."""
        dep_steps = state["dependencies"].get(step, [])
        gathered = [state["results"].get(i, "") for i in dep_steps]
        if gathered:
            return gathered
        return [state["article_text"]]

    def route(self, state: ResearchSummary):
        """Send control to 'solve' once every step has a result, else 'tool'."""
        if self.get_current_task(state) is None:
            return "solve"
        return "tool"

    def get_plan_results(self, state: ResearchSummary) -> str:
        """Render each plan argument with its collected answer, one per line."""
        results = state.get("results", {})
        rendered = []
        for idx, plan in enumerate(state["arguments"]):
            rendered.append(f"Plan: {plan}\n Answer = {results.get(idx + 1, '')}")
        return "\n".join(rendered)
src/research_assistant/web/app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from fpdf import FPDF
3
+ from mistletoe import markdown
4
+
5
+ from research_assistant.app_logging import app_logger
6
+ from research_assistant.pipeline.articleSummarization import ArticleSummarization
7
+
8
+
9
def process_file(file):
    """Summarize the uploaded PDF and count the words in the summary.

    Args:
        file: filesystem path of the uploaded PDF (gradio `filepath` type).

    Returns:
        tuple[str, int]: (summary text, word count). On any failure the
        summary is an error message and the count is 0 — the broad except
        is deliberate so the UI always gets something to display.
    """
    try:
        app_logger.info(f"Processing file: {file}")
        summary_pipeline = ArticleSummarization(file)
        summary = summary_pipeline.get_summary()
        word_count = len(summary.split())
    except Exception as e:
        summary = f"An error occurred: {e}"
        word_count = 0
    return summary, word_count
19
+
20
+
21
def generate_pdf(summary):
    """Render the markdown summary into a PDF file on disk.

    Args:
        summary: markdown text to render.

    Returns:
        str: path of the written PDF ("summary.pdf" in the working dir;
        note: overwritten on every call and shared across sessions).
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Helvetica", size=12)
    try:
        # Convert markdown -> HTML first; fpdf can render simple HTML.
        html_content = markdown(summary)
        pdf.write_html(html_content)
    except Exception as e:
        # Fall back to a stub page so the download link still works.
        app_logger.error(f"Error generating PDF: {e}")
        pdf.write(5, "Error generating PDF content.")

    pdf_output_path = "summary.pdf"
    pdf.output(name=pdf_output_path)
    return pdf_output_path
36
+
37
+
38
def process_and_generate_pdf(file):
    """Summarize *file* and also render the summary to a downloadable PDF.

    Returns:
        tuple: (summary text, word count, path to the generated PDF).
    """
    summary, wordcount = process_file(file)
    return summary, wordcount, generate_pdf(summary)
42
+
43
+
44
# Gradio UI: upload a PDF, get the summary text, its word count, and a
# downloadable PDF rendering of the summary.
iface = gr.Interface(
    fn=process_and_generate_pdf,
    inputs=gr.File(label="Upload PDF", type="filepath"),
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Number(label="Word Count"),
        gr.File(label="Download PDF"),
    ],
    title="Research Assistant Summarizer",
    description="Summarize your research paper.",
    theme=gr.themes.Default(),
)

# share=True exposes a temporary public gradio link when run directly.
if __name__ == "__main__":
    iface.launch(share=True)
tests.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
def test_dummy():
    """Placeholder so the CI pytest step always has a test to collect."""
    pass