wis-k committed on
Commit
6370672
1 Parent(s): 95eaf35

Upload folder using huggingface_hub

Browse files
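For context, a folder upload like this is typically produced with `huggingface_hub`'s `upload_folder` helper. The sketch below is a minimal, hypothetical invocation; the repo id, folder path, and token source are assumptions, not values taken from this commit.

```python
# Minimal sketch (not from this commit): pushing a local project folder to a Space
# with huggingface_hub. repo_id and folder_path are placeholder values.
from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` or HF_TOKEN
api.upload_folder(
    folder_path=".",              # local project directory (placeholder)
    repo_id="<user>/thread-gpt",  # target Space id (placeholder)
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)
```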
.env.template ADDED
@@ -0,0 +1,7 @@
1
+ OPENAI_API_KEY=
2
+
3
+ # Only if you want to share the threads on X/Twitter
4
+ CONSUMER_KEY=
5
+ CONSUMER_SECRET=
6
+ ACCESS_KEY=
7
+ ACCESS_SECRET=
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ images/examples.png filter=lfs diff=lfs merge=lfs -text
37
+ images/gradio.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,162 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+
162
+ data/
.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.10.0
README.md CHANGED
@@ -1,12 +1,117 @@
1
  ---
2
- title: Thread Gpt
3
- emoji: 🏃
4
- colorFrom: gray
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 4.7.1
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: thread-gpt
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 4.4.1
6
  ---
7
+ <h1 align="center">ThreadGPT</h1>
8
+ <p align="center">
9
+ <img src="images/logo.png" alt="ThreadGPT Logo" style="height: 150px">
10
+ </p>
11
+
12
+ Struggling to keep up with the latest AI research papers? **ThreadGPT** is here to help. It seamlessly transforms complex academic papers into concise, easy-to-understand threads. Not only does it summarize the text, but it also embeds relevant figures, tables, and visuals from the papers directly in the threads. 🧵✨📄
13
+
14
+ <p align="center">
15
+ <img src="./images/gradio.png" alt="Gradio UI" width="800">
16
+ <br>
17
+ <i>Gradio App UI</i>
18
+ </p>
19
+
20
+ <p align="center">
21
+ <img src="./images/examples.png" alt="Example Threads" width="1200">
22
+ <br>
23
+ <i>Examples of threads generated by ThreadGPT (<a href="https://twitter.com/paper_threadoor">@paper_threadoor</a>)</i>
24
+ </p>
25
+
26
+ ## 🛠️ Installation
27
+
28
+ ### Clone the repo
29
+
30
+ ```bash
31
+ git clone https://github.com/wiskojo/thread-gpt
32
+ ```
33
+
34
+ ### Install dependencies
35
+
36
+ ```bash
37
+ # Install PyTorch, torchvision, and torchaudio
38
+ # Please refer to the official PyTorch website (https://pytorch.org) for the installation command that matches your system. Example:
39
+ pip install torch==2.0.0 torchvision==0.15.1
40
+
41
+ # Install all other dependencies
42
+ pip install -r requirements.txt
43
+ ```
44
+
45
+ ### Configure environment variables
46
+
47
+ Copy the `.env.template` file to `.env` and fill in your `OPENAI_API_KEY`.
48
+
49
+ ```bash
50
+ cp .env.template .env
51
+ ```
52
+
53
+ ## 🚀 Getting Started
54
+
55
+ Before proceeding, please ensure that all the installation steps have been successfully completed.
56
+
57
+ ### 🚨 Cost Warning
58
+
59
+ Please be aware that using GPT-4 with the Assistants API can incur high costs. Make sure to monitor your usage and understand the pricing details provided by OpenAI before proceeding.
60
+
61
+ ### Gradio
62
+
63
+ ```bash
64
+ python app.py
65
+ ```
66
+
67
+ ### CLI
68
+
69
+ #### 🧵 Create Thread
70
+
71
+ To create a thread, provide either a URL to a PDF or a local path to one. Use the following commands:
72
+
73
+ ```bash
74
+ # For a URL
75
+ python thread.py <URL_TO_PDF>
76
+
77
+ # For a local file
78
+ python thread.py <LOCAL_PATH_TO_PDF>
79
+ ```
80
+
81
+ By default, you will find all outputs under `./data/<PDF_NAME>`, which will have the following structure:
82
+
83
+ ```
84
+ ./data/<PDF_NAME>/
85
+ ├── figures/
86
+ │ ├── <figure_1_name>.jpg
87
+ │ ├── <figure_2_name>.png
88
+ │ └── ...
89
+ ├── <PDF_NAME>.pdf
90
+ ├── results.json
91
+ ├── thread.json
92
+ ├── processed_thread.json
93
+ └── processed_thread.md
94
+ ```
95
+
96
+ The final output for user consumption is located at `./data/<PDF_NAME>/processed_thread.md`. This file is formatted in Markdown and can be conveniently viewed using any Markdown editor.
97
+
98
+ #### All Contents
99
+
100
+ 1. `figures/`: This directory contains all the figures, tables, and visuals that have been extracted from the paper.
101
+ 2. `<PDF_NAME>.pdf`: This is the original PDF file.
102
+ 3. `results.json`: This file contains the results of the layout parsing. It includes an index of all figures, their paths, and captions that were passed to OpenAI.
103
+ 4. `thread.json`: This file contains the raw thread that was generated by OpenAI before any post-processing was done.
104
+ 5. `processed_thread.json`: This file is a post-processed version of `thread.json`. The post-processing includes steps such as removing source annotations and duplicate figures (a minimal sketch of its shape follows this list).
105
+ 6. `processed_thread.md`: This is a markdown version of `processed_thread.json`. It is the final output provided for user consumption.
106
+
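For orientation, here is a minimal, hypothetical sketch of the shape of `processed_thread.json`. The field names (`content`, `media`, `path`, `explain`) match what `app.py` and `tweet.py` read; the tweet text and file names themselves are invented for illustration.

```python
# Hypothetical example of ./data/<PDF_NAME>/processed_thread.json after post-processing.
# Field names mirror what app.py / tweet.py consume; the values are invented.
import json

example_thread = [
    {
        "content": "(1/5) 🚀 New paper: a simple trick makes attention 2x faster ...",
        "media": [
            {
                "explain": "Overview figure of the proposed method",
                "path": "figures/Figure_1_Overview_of_the_.jpg",
            }
        ],
    },
    {"content": "(2/5) 🔍 The key idea is ...", "media": []},
]

print(json.dumps(example_thread, indent=2, ensure_ascii=False))
```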
107
+ #### 📨 Share Thread
108
+
109
+ To actually share the thread on X/Twitter, you need to set up the credentials in the `.env` file. This requires creating a [developer account](https://developer.twitter.com/) and filling in your `CONSUMER_KEY`, `CONSUMER_SECRET`, `ACCESS_KEY`, and `ACCESS_SECRET`. Then run this command on the created JSON file:
110
+
111
+ ```bash
112
+ python tweet.py ./data/<PDF_NAME>/processed_thread.json
113
+ ```
114
+
115
+ #### 🔧 Customize Assistant
116
 
117
+ ThreadGPT uses OpenAI's Assistants API. To customize the assistant's behavior, modify `create_assistant.py`, which defines defaults for the prompt, name, tools, and model (`gpt-4-1106-preview`). You can adjust these parameters to your liking.
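As a rough illustration, overriding those defaults could look like the sketch below; the alternative name and instructions are illustrative assumptions, not recommendations. The Gradio app forwards the same `instructions`/`model` overrides through `assistant_kwargs` in `create_assistant_then_thread`.

```python
# Hypothetical sketch: creating an assistant with overridden defaults using the
# create_assistant helper from this repo. Name/instructions below are illustrative.
import os

import openai

from create_assistant import create_assistant

client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

assistant = create_assistant(
    client,
    name="ThreadGPT (casual tone)",  # illustrative override
    instructions="Write short, casual threads with at most one figure per tweet.",
    model="gpt-4-1106-preview",      # default model from create_assistant.py
)
print(f"Created assistant: {assistant.id}")
```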
app.py ADDED
@@ -0,0 +1,156 @@
1
+ import copy
2
+ import json
3
+ import os
4
+
5
+ import gradio as gr
6
+ import openai
7
+ from dotenv import load_dotenv
8
+ from gradio_pdf import PDF
9
+
10
+ from create_assistant import INSTRUCTIONS, MODEL
11
+ from thread import create_assistant_then_thread, render_markdown
12
+
13
+ load_dotenv()
14
+
15
+
16
+ OUTPUT_PATH = "data"
17
+ IMAGES_PATH = "images"
18
+
19
+
20
+ def fix_image_paths_in_thread(thread, base_path):
21
+ for tweet in thread:
22
+ for media in tweet.get("media", []):
23
+ media["path"] = os.path.join(
24
+ "file", OUTPUT_PATH, os.path.basename(base_path), media["path"]
25
+ )
26
+ return thread
27
+
28
+
29
+ def run_create_thread(
30
+ url_or_path, openai_api_key, assistant_instructions, assistant_model
31
+ ):
32
+ if not openai_api_key:
33
+ raise gr.Error("No OpenAI API Key provided.")
34
+
35
+ client = openai.OpenAI(api_key=openai_api_key)
36
+
37
+ try:
38
+ saved_path = create_assistant_then_thread(
39
+ url_or_path,
40
+ OUTPUT_PATH,
41
+ client,
42
+ assistant_kwargs={
43
+ "instructions": assistant_instructions,
44
+ "model": assistant_model,
45
+ },
46
+ )
47
+ except Exception as e:
48
+ raise gr.Error(e)
49
+
50
+ with open(os.path.join(saved_path, "processed_thread.json"), "r") as f:
51
+ thread = json.load(f)
52
+
53
+ fixed_thread = fix_image_paths_in_thread(copy.deepcopy(thread), saved_path)
54
+ thread_md = render_markdown(fixed_thread)
55
+
56
+ return (
57
+ thread_md,
58
+ json.dumps(thread, indent=2),
59
+ )
60
+
61
+
62
+ with gr.Blocks() as demo:
63
+ banner = gr.Markdown(
64
+ """<div style="display: flex; align-items: center; justify-content: center; margin-top: 20px;">
65
+ <img src="file/images/logo.png" alt="ThreadGPT Logo" style="height: 60px; margin-right: 12px; margin-top: -12px;">
66
+ <h1 style="font-size: 48px">ThreadGPT</h1>
67
+ </div>
68
+
69
+ <p align="center" style="font-size: 12px;">🚨 Please be aware that usage of GPT-4 with the assistant API can incur high costs. Make sure to monitor your usage and understand the pricing details provided by OpenAI before proceeding. 🚨
70
+ <br>
71
+ ❗ There currently seems to be a bug with the Assistants API where a completed run returns no new messages from the assistant. If you encounter this, please click "Retry 🔄". ❗</p>"""
72
+ )
73
+
74
+ with gr.Accordion("Configuration"):
75
+ with gr.Row():
76
+ api_key = gr.Textbox(
77
+ value=os.getenv("OPENAI_API_KEY"),
78
+ placeholder="sk-**************",
79
+ label="OpenAI API Key",
80
+ type="password",
81
+ interactive=True,
82
+ )
83
+ with gr.Column():
84
+ assistant_instr = gr.Textbox(
85
+ value=INSTRUCTIONS,
86
+ placeholder="Enter system instructions",
87
+ label="System Instructions",
88
+ interactive=True,
89
+ )
90
+ assistant_model = gr.Textbox(
91
+ value=MODEL,
92
+ placeholder="Enter model",
93
+ label="Model",
94
+ interactive=True,
95
+ )
96
+
97
+ with gr.Row():
98
+ url_or_path_state = gr.State("")
99
+ txt = gr.Textbox(
100
+ scale=6,
101
+ show_label=False,
102
+ placeholder="https://arxiv.org/pdf/1706.03762.pdf",
103
+ container=False,
104
+ )
105
+ upload_btn = gr.UploadButton("Upload PDF 📄", file_types=[".pdf"])
106
+ retry_btn = gr.Button("Retry 🔄")
107
+
108
+ with gr.Row(visible=False) as output_row:
109
+ with gr.Column():
110
+ pdf = PDF(height=900)
111
+ with gr.Column():
112
+ with gr.Tab("Markdown"):
113
+ md_viewer = gr.Markdown()
114
+ with gr.Tab("JSON"):
115
+ json_viewer = gr.Textbox(lines=44)
116
+
117
+ txt.submit(
118
+ lambda url_or_path: ("", url_or_path, gr.Row(visible=True), "", ""),
119
+ [txt],
120
+ [txt, url_or_path_state, output_row, md_viewer, json_viewer],
121
+ ).then(
122
+ lambda url_or_path: url_or_path,
123
+ [url_or_path_state],
124
+ [pdf],
125
+ ).then(
126
+ run_create_thread,
127
+ [url_or_path_state, api_key, assistant_instr, assistant_model],
128
+ [md_viewer, json_viewer],
129
+ )
130
+
131
+ upload_btn.upload(
132
+ lambda path: (path, gr.Row(visible=True), "", ""),
133
+ [upload_btn],
134
+ [url_or_path_state, output_row, md_viewer, json_viewer],
135
+ ).then(
136
+ lambda url_or_path: url_or_path,
137
+ [url_or_path_state],
138
+ [pdf],
139
+ ).then(
140
+ run_create_thread,
141
+ [url_or_path_state, api_key, assistant_instr, assistant_model],
142
+ [md_viewer, json_viewer],
143
+ )
144
+
145
+ retry_btn.click(
146
+ lambda url_or_path: url_or_path,
147
+ [url_or_path_state],
148
+ [pdf],
149
+ ).then(
150
+ run_create_thread,
151
+ [url_or_path_state, api_key, assistant_instr, assistant_model],
152
+ [md_viewer, json_viewer],
153
+ )
154
+
155
+ if __name__ == "__main__":
156
+ demo.launch(allowed_paths=[OUTPUT_PATH, IMAGES_PATH])
create_assistant.py ADDED
@@ -0,0 +1,101 @@
1
+ from openai import OpenAI
2
+
3
+ NAME = "ThreadGPT"
4
+ INSTRUCTIONS = """Paper Threadoor 📄🧳 specializes in transforming academic papers into engaging Twitter threads. The threads are formatted in a distinct style for clarity and engagement:
5
+
6
+ 1. Each tweet starts with a numbering in parentheses out of the total number of tweets in the thread and an emoji, e.g., "([number]/[total_number_of_tweets]) [emoji]".
7
+ 2. The tweet content follows, focusing on key insights or information.
8
+
9
+ # Guidelines
10
+
11
+ Your threads should begin with a captivating hook and sequentially explore the methodology, results, and implications of the research, highlighted by the included visual elements. The final tweet offers a conclusion and broader impact statement. Follow this general structure, in the order they are presented, when writing your threads:
12
+
13
+ ## 1. Hook
14
+ * Include something eye catching from the main results (e.g. 2-3x faster, 60% better, 12% higher score on [dataset], etc.).
15
+ * In 1 - 3 sentences, explain intuitively the methodology/approach or what is unique to this paper. From reading just this, the user should be able to fully understand what the approach is, and how it works, but where the details are abstracted and will follow.
16
+ * You should include the main/overview figure of the paper when possible. Most of the time this is "Figure 1", however, pick whichever is most appropriate. Keep in mind, the image(s) you pick should be visually engaging so e.g. tables are generally not recommended.
17
+
18
+ ## 2. Methodology
19
+ * Follow up on the hook's explanation by providing more context, details, and motivation around the methodology. Include relevant figures and tables that can be used to explain the approach.
20
+ * Your explanation should be sufficient for readers who have never read the paper to understand how it works at a conceptual level.
21
+ * Instead of describing surface concepts, actually explain the essential details of how things work so the readers will understand without having to read the full paper (what is special about their approach vs. prior art?).
22
+
23
+ ## 3. Main Results
24
+ * Highlight the main results from the paper
25
+
26
+ ## 4. Supplemental Results and Other Details
27
+ * Supplement the main results with other reported results that provide more insights.
28
+
29
+ ## 5. Conclusion, Discussion, Broader Impact
30
+ * Conclude by explaining the application and broader implication of the work.
31
+ * Generally this tweet should not have any figures/tables.
32
+
33
+ ## Note for all Sections
34
+ * A PDF processing tool is used for extracting figures, tables, and their captions, but it may not be 100% accurate. This tool names the files using the closest text block to the figure or table, assuming it to be the caption. However, this method can lead to errors. For instance, not all captions may be labeled as "Figure N" or "Table N", which might result in misidentifying a non-figure element as a figure, or mismatching the captions. Therefore, when selecting figures for inclusion, it's crucial to refer back to the original document for verification, rather than relying solely on the file's caption or name.
35
+ * Do not reuse the same figures/tables on multiple tweets in the same thread.
36
+ * Provide citations to material referenced from the `retrieval` tool in the form of "【\d+†source】" in your tweet content.
37
+
38
+ # Steps
39
+
40
+ Follow the following steps when writing your threads:
41
+ 1. A PDF processor is used to extract all figures and tables from the PDF, which will be provided to you. The results from the processing will include paths and captions of each figure/table for you to reference in your thread.
42
+ 2. Use `retrieval` tool to actually read and understand the contents of the paper beyond just the figures and tables from step 1.
43
+ 3. Combine your results from step 1 and 2 and write your thread, adding figures/tables using markdown syntax when relevant.
44
+
45
+ # Output Format
46
+
47
+ Make sure that your output format is JSON (within a ```json\n``` markdown block) so that each object is a tweet and the list is a thread of tweets. The image paths should come directly from paths extracted from the PDF processing results:
48
+
49
+ ```json
50
+ [
51
+ {
52
+ "content": "Content of the first tweet (includes "【\d+†source】" citations from the `retrieval` tool)",
53
+ "media": [
54
+ {
55
+ "explain": "Explanation for including Image 1",
56
+ "path": "Path to image 1"
57
+ },
58
+ ...
59
+ {
60
+ "explain": "Explanation for including Image n",
61
+ "path": "Path to image n"
62
+ }
63
+ // Note: A maximum of 4 images can be included in each tweet
64
+ ]
65
+ },
66
+ ...
67
+ {
68
+ "content": "Content of the last tweet in the thread (includes "【\d+†source】" citations from the `retrieval` tool)",
69
+ "media": [
70
+ {
71
+ "explain": "Explanation for including Image 1",
72
+ "path": "Path to image 1"
73
+ },
74
+ ...
75
+ {
76
+ "explain": "Explanation for including Image n",
77
+ "path": "Path to image n"
78
+ }
79
+ // Note: A maximum of 4 images can be included in each tweet
80
+ ]
81
+ }
82
+ ]
83
+ ```"""
84
+ TOOLS = [{"type": "retrieval"}]
85
+ MODEL = "gpt-4-1106-preview"
86
+
87
+
88
+ def create_assistant(
89
+ client: OpenAI,
90
+ name: str = NAME,
91
+ instructions: str = INSTRUCTIONS,
92
+ tools: dict = TOOLS,
93
+ model: str = MODEL,
94
+ ):
95
+ assistant = client.beta.assistants.create(
96
+ name=name,
97
+ instructions=instructions,
98
+ tools=tools,
99
+ model=model,
100
+ )
101
+ return assistant
images/examples.png ADDED

Git LFS Details

  • SHA256: fb7b4173224cd7f457f609cd3bc52c5358c8bbae7ec1e38207ffc4d86388d067
  • Pointer size: 132 Bytes
  • Size of remote file: 1.68 MB
images/gradio.png ADDED

Git LFS Details

  • SHA256: e25df26bcfc113e7c58bb04d0f0a2ce9cc9befc10953aacec9095abb35dda6f9
  • Pointer size: 132 Bytes
  • Size of remote file: 1.27 MB
images/logo.png ADDED
packages.txt ADDED
@@ -0,0 +1,3 @@
1
+ libgl1-mesa-glx
2
+ poppler-utils
3
+ tesseract-ocr
pre-requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ torch==2.0.0
2
+ torchvision==0.15.1
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ gradio==4.4.1
2
+ gradio-pdf==0.0.3
3
+ layoutparser==0.3.4
4
+ openai==1.2.4
5
+ pdf2image==1.16.3
6
+ pydantic==2.4.2
7
+ pytesseract==0.3.10
8
+ python_dotenv==1.0.0
9
+ Pillow==9.5.0
10
+ requests==2.31.0
11
+ tweepy==4.14.0
12
+ tweet-counter==0.1.0
13
+ git+https://github.com/facebookresearch/detectron2.git@v0.4
thread.py ADDED
@@ -0,0 +1,406 @@
1
+ import argparse
2
+ import json
3
+ import logging
4
+ import os
5
+ import re
6
+ import shutil
7
+ import time
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from io import BytesIO
10
+ from typing import Optional
11
+ from urllib.parse import urlparse
12
+
13
+ import layoutparser as lp
14
+ import openai
15
+ import pytesseract
16
+ import requests
17
+ from dotenv import load_dotenv
18
+ from pdf2image import convert_from_bytes
19
+ from pydantic import BaseModel, ConfigDict
20
+
21
+ from create_assistant import create_assistant
22
+
23
+ load_dotenv()
24
+
25
+
26
+ logging.basicConfig(handlers=[logging.StreamHandler()], level=logging.INFO)
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ class Block(BaseModel):
31
+ model_config = ConfigDict(arbitrary_types_allowed=True)
32
+ block: lp.elements.base.BaseLayoutElement
33
+ page_index: int
34
+
35
+
36
+ class CaptionedBlock(Block):
37
+ model_config = ConfigDict(arbitrary_types_allowed=True)
38
+ caption: lp.elements.base.BaseLayoutElement
39
+
40
+
41
+ def get_blocks_and_texts(layouts: list[lp.Layout]) -> tuple[list[Block], list[Block]]:
42
+ blocks = []
43
+ texts = []
44
+ for i, layout in enumerate(layouts):
45
+ for block in layout:
46
+ if block.type in ["Table", "Figure"]:
47
+ # Check if the current block overlaps with any existing block
48
+ for existing_block in blocks:
49
+ if existing_block.page_index != i:
50
+ # If the blocks are not on the same page, skip the overlap check
51
+ continue
52
+ overlap_area = existing_block.block.intersect(block).area
53
+ overlap_ratio = overlap_area / block.area
54
+ if overlap_ratio > 0.5:
55
+ # If the current block overlaps with an existing block by more than 50%
56
+ # Check which block is the "superset" block
57
+ if block.area > existing_block.block.area:
58
+ # If the current block is larger, replace the existing block with the current block
59
+ blocks.remove(existing_block)
60
+ blocks.append(Block(block=block, page_index=i))
61
+ # If the existing block is larger or equal, skip the current block
62
+ break
63
+ else:
64
+ # If the current block does not overlap significantly with any existing block, add it to the list
65
+ blocks.append(Block(block=block, page_index=i))
66
+ elif block.type == "Text":
67
+ texts.append(Block(block=block, page_index=i))
68
+ return blocks, texts
69
+
70
+
71
+ def caption_blocks(blocks: list[Block], texts: list[Block]) -> list[CaptionedBlock]:
72
+ captioned_blocks = []
73
+ # Find the closest text block to the top and bottom of the figure/table block
74
+ for block in blocks:
75
+ block_bottom_center = (
76
+ (block.block.block.x_1 + block.block.block.x_2) / 2,
77
+ block.block.block.y_2,
78
+ )
79
+ block_top_center = (
80
+ (block.block.block.x_1 + block.block.block.x_2) / 2,
81
+ block.block.block.y_1,
82
+ )
83
+ closest_text = None
84
+ closest_distance = float("inf")
85
+ for text in texts:
86
+ if text.page_index != block.page_index:
87
+ continue
88
+ text_top_center = (
89
+ (text.block.block.x_1 + text.block.block.x_2) / 2,
90
+ text.block.block.y_1,
91
+ )
92
+ text_bottom_center = (
93
+ (text.block.block.x_1 + text.block.block.x_2) / 2,
94
+ text.block.block.y_2,
95
+ )
96
+ distance_to_top = (
97
+ (block_bottom_center[0] - text_top_center[0]) ** 2
98
+ + (block_bottom_center[1] - text_top_center[1]) ** 2
99
+ ) ** 0.5
100
+ distance_to_bottom = (
101
+ (block_top_center[0] - text_bottom_center[0]) ** 2
102
+ + (block_top_center[1] - text_bottom_center[1]) ** 2
103
+ ) ** 0.5
104
+ # Reduce `distance_to_top` by 25% to bias towards picking bottom captions
105
+ distance = min(distance_to_top * 0.75, distance_to_bottom)
106
+ if distance < closest_distance:
107
+ closest_distance = distance
108
+ closest_text = text
109
+ if closest_text is not None:
110
+ captioned_blocks.append(
111
+ CaptionedBlock(
112
+ block=block.block,
113
+ caption=closest_text.block,
114
+ page_index=block.page_index,
115
+ )
116
+ )
117
+ return captioned_blocks
118
+
119
+
120
+ def combine_blocks(captioned_block, pages):
121
+ # Combine block and caption together
122
+ x_1 = min(captioned_block.block.block.x_1, captioned_block.caption.block.x_1)
123
+ y_1 = min(captioned_block.block.block.y_1, captioned_block.caption.block.y_1)
124
+ x_2 = max(captioned_block.block.block.x_2, captioned_block.caption.block.x_2)
125
+ y_2 = max(captioned_block.block.block.y_2, captioned_block.caption.block.y_2)
126
+ return pages[captioned_block.page_index].crop((x_1, y_1, x_2, y_2))
127
+
128
+
129
+ def process_captioned_block(captioned_block, pages, base_path):
130
+ combined_image = combine_blocks(captioned_block, pages)
131
+
132
+ # Convert the PIL Image object to base64
133
+ buffered = BytesIO()
134
+ combined_image.save(buffered, format="JPEG")
135
+
136
+ # Convert the PIL Image object to a string for caption
137
+ caption_image = pages[captioned_block.page_index].crop(
138
+ (
139
+ captioned_block.caption.block.x_1,
140
+ captioned_block.caption.block.y_1,
141
+ captioned_block.caption.block.x_2,
142
+ captioned_block.caption.block.y_2,
143
+ )
144
+ )
145
+ caption_text = pytesseract.image_to_string(caption_image)
146
+
147
+ figures_path = os.path.join(base_path, "figures")
148
+ os.makedirs(figures_path, exist_ok=True)
149
+
150
+ # Convert the caption text to snake case alpha numeric and truncate, then add .jpg to it
151
+ img_name = re.sub("[^0-9a-zA-Z]+", "_", caption_text)[:30] + ".jpg"
152
+ img_path = os.path.join(figures_path, img_name)
153
+
154
+ with open(img_path, "wb") as f:
155
+ f.write(buffered.getvalue())
156
+
157
+ return {"image": f"figures/{img_name}", "caption": caption_text}
158
+
159
+
160
+ def process_pdf(content: bytes, model: lp.models.Detectron2LayoutModel, base_path: str):
161
+ pages = convert_from_bytes(content)
162
+ logger.info("PDF converted to images")
163
+
164
+ with ThreadPoolExecutor(max_workers=16) as executor:
165
+ layouts = list(executor.map(model.detect, pages))
166
+ logger.info("Layout detection completed")
167
+
168
+ blocks, texts = get_blocks_and_texts(layouts)
169
+ logger.info("Blocks and texts extracted")
170
+
171
+ captioned_blocks = caption_blocks(blocks, texts)
172
+ logger.info("Captioning completed")
173
+
174
+ with ThreadPoolExecutor(max_workers=16) as executor:
175
+ results = list(
176
+ executor.map(
177
+ lambda captioned_block: process_captioned_block(
178
+ captioned_block, pages, base_path
179
+ ),
180
+ captioned_blocks,
181
+ )
182
+ )
183
+
184
+ return results
185
+
186
+
187
+ def wait_on_run(run, thread, client: openai.OpenAI):
188
+ while run.status == "queued" or run.status == "in_progress":
189
+ run = client.beta.threads.runs.retrieve(
190
+ thread_id=thread.id,
191
+ run_id=run.id,
192
+ )
193
+ time.sleep(0.5)
194
+ return run
195
+
196
+
197
+ def generate_thread_content(
198
+ pdf_path: str, results: dict, client: openai.OpenAI, assistant_id: str
199
+ ):
200
+ with open(pdf_path, "rb") as f:
201
+ pdf_file = client.files.create(file=f, purpose="assistants")
202
+
203
+ try:
204
+ thread = client.beta.threads.create()
205
+
206
+ message = client.beta.threads.messages.create(
207
+ thread_id=thread.id,
208
+ role="user",
209
+ content=f"{json.dumps(results)}\n\nCreate a thread for this. Your answer must be in JSON, media links should be from the local paths above.",
210
+ file_ids=[pdf_file.id],
211
+ )
212
+
213
+ run = client.beta.threads.runs.create(
214
+ thread_id=thread.id, assistant_id=assistant_id
215
+ )
216
+
217
+ run = wait_on_run(run, thread, client)
218
+
219
+ messages = client.beta.threads.messages.list(
220
+ thread_id=thread.id, order="asc", after=message.id
221
+ )
222
+
223
+ # TODO: OpenAI can return no new messages somehow (might be a bug, the run completes successfully but no new messages are listed in the thread), catch this and throw error
224
+ if not messages.data or not messages.data[0].content:
225
+ raise ValueError("Unexpected empty response from OpenAI. Please try again.")
226
+
227
+ except Exception as e:
228
+ logger.error(f"Failed to generate thread content: {e}")
229
+ raise
230
+ finally:
231
+ # Delete uploaded PDF file
232
+ try:
233
+ client.files.delete(file_id=pdf_file.id)
234
+ except Exception as e:
235
+ logger.error(f"Failed to delete file: {e}")
236
+
237
+ # Extract JSON content from the message
238
+ message_content = messages.data[0].content[0].text.value
239
+ json_content = re.search(r"(```json\n)(.*?)(\n```)", message_content, re.DOTALL)
240
+ if json_content is None:
241
+ json_content = re.search(r"(```\n)(.*?)(\n```)", message_content, re.DOTALL)
242
+ if json_content is not None:
243
+ json_content = json_content.group(2)
244
+
245
+ try:
246
+ paper_thread = json.loads(json_content)
247
+ except (json.JSONDecodeError, TypeError):
248
+ raise ValueError(
249
+ "The thread generated by OpenAI was not in the expected JSON format."
250
+ )
251
+
252
+ return paper_thread
253
+
254
+
255
+ def process_thread(thread_data, base_path):
256
+ processed_data = []
257
+ media_set = set()
258
+ for data in thread_data:
259
+ cleaned_content = re.sub(
260
+ r"【\d+†source】", "", data["content"]
261
+ ) # Remove all source annotations
262
+ media_list = []
263
+ for media in data.get("media", []):
264
+ if media["path"] and media["path"] not in media_set:
265
+ media_file_path = os.path.join(base_path, media["path"])
266
+ if os.path.isfile(media_file_path):
267
+ media_list.append(media)
268
+ media_set.add(media["path"])
269
+ processed_data.append({"content": cleaned_content, "media": media_list})
270
+ return processed_data
271
+
272
+
273
+ def render_markdown(processed_thread):
274
+ markdown_content = ""
275
+ for data in processed_thread:
276
+ markdown_content += data["content"] + "\n"
277
+ for media in data["media"]:
278
+ markdown_content += f'\n<div align="center">\n'
279
+ markdown_content += f' <img src="{media["path"]}" alt="{media.get("explain", "")}" style="max-width: 75%;">\n'
280
+ markdown_content += "</div>\n"
281
+ markdown_content += "\n---\n\n"
282
+ return markdown_content
283
+
284
+
285
+ def uri_validator(x):
286
+ try:
287
+ result = urlparse(x)
288
+ return all([result.scheme, result.netloc])
289
+ except Exception:
290
+ return False
291
+
292
+
293
+ def create_thread(
294
+ pdf_url_or_path: str, output_path: str, client: openai.OpenAI, assistant_id: str
295
+ ):
296
+ # Extract the PDF name from the URL and remove any file extension at the end
297
+ pdf_name = os.path.splitext(pdf_url_or_path.split("/")[-1])[0]
298
+ base_path = os.path.join(output_path, pdf_name)
299
+ results_path = os.path.join(base_path, "results.json")
300
+ pdf_path = os.path.join(base_path, f"{pdf_name}.pdf")
301
+ thread_path = os.path.join(base_path, "thread.json")
302
+ processed_thread_path = os.path.join(base_path, "processed_thread.json")
303
+ markdown_path = os.path.join(base_path, "processed_thread.md")
304
+
305
+ # Check if base path already exists and there is a results.json
306
+ # If so, assume we've run this before and just return results
307
+ if os.path.exists(base_path) and os.path.isfile(results_path):
308
+ with open(results_path, "r") as f:
309
+ results = json.load(f)
310
+ else:
311
+ os.makedirs(base_path, exist_ok=True)
312
+
313
+ if uri_validator(pdf_url_or_path):
314
+ pdf_content = requests.get(pdf_url_or_path).content
315
+ with open(pdf_path, "wb") as f:
316
+ f.write(pdf_content)
317
+ elif os.path.isfile(pdf_url_or_path):
318
+ shutil.copy(pdf_url_or_path, pdf_path)
319
+ with open(pdf_path, "rb") as f:
320
+ pdf_content = f.read()
321
+ else:
322
+ raise ValueError(
323
+ f"Invalid input: {pdf_url_or_path}. It should be a valid URL or a file path."
324
+ )
325
+
326
+ model = lp.models.Detectron2LayoutModel(
327
+ config_path="lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
328
+ extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
329
+ label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
330
+ )
331
+
332
+ results = process_pdf(pdf_content, model, base_path)
333
+ # Remove duplicates from results
334
+ results = [dict(t) for t in set(tuple(d.items()) for d in results)]
335
+ with open(results_path, "w") as f:
336
+ json.dump(results, f, indent=2)
337
+
338
+ paper_thread = generate_thread_content(pdf_path, results, client, assistant_id)
339
+ with open(thread_path, "w") as f:
340
+ json.dump(paper_thread, f, indent=2)
341
+
342
+ # Process the thread
343
+ processed_thread = process_thread(paper_thread, base_path)
344
+ with open(processed_thread_path, "w") as f:
345
+ json.dump(processed_thread, f, indent=2)
346
+
347
+ # Save processed thread as a markdown file
348
+ markdown_content = render_markdown(processed_thread)
349
+ with open(markdown_path, "w") as f:
350
+ f.write(markdown_content)
351
+
352
+ logger.info(f"Saved all outputs to: {os.path.abspath(base_path)}")
353
+
354
+ return base_path
355
+
356
+
357
+ def create_assistant_then_thread(
358
+ pdf_url_or_path: str,
359
+ output_path: str,
360
+ client: openai.OpenAI,
361
+ assistant_kwargs: Optional[dict] = None,
362
+ ):
363
+ if assistant_kwargs is None:
364
+ assistant_kwargs = {}
365
+ try:
366
+ assistant = create_assistant(client, **assistant_kwargs)
367
+ except Exception:
368
+ logger.error("Failed to create assistant", exc_info=True)
369
+ raise
370
+ try:
371
+ saved_path = create_thread(
372
+ pdf_url_or_path,
373
+ output_path,
374
+ client,
375
+ assistant.id,
376
+ )
377
+ except Exception:
378
+ logger.error("Failed to create thread", exc_info=True)
379
+ raise
380
+ finally:
381
+ try:
382
+ client.beta.assistants.delete(assistant.id)
383
+ except Exception:
384
+ logger.error("Failed to delete assistant", exc_info=True)
385
+ raise
386
+ return saved_path
387
+
388
+
389
+ if __name__ == "__main__":
390
+ parser = argparse.ArgumentParser(
391
+ description="Process a PDF from a URL or a local path."
392
+ )
393
+ parser.add_argument(
394
+ "url_or_path", type=str, help="The URL or local path of the PDF to process."
395
+ )
396
+ parser.add_argument(
397
+ "-o",
398
+ "--output",
399
+ default="data",
400
+ help="The output directory to store the results.",
401
+ )
402
+ args = parser.parse_args()
403
+
404
+ client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
405
+
406
+ create_assistant_then_thread(args.url_or_path, args.output, client)
tweet.py ADDED
@@ -0,0 +1,89 @@
1
+ import argparse
2
+ import json
3
+ import logging
4
+ import os
5
+
6
+ import tweepy
7
+ from dotenv import load_dotenv
8
+ from tweet_counter import count_tweet
9
+
10
+ load_dotenv()
11
+
12
+
13
+ CONSUMER_KEY = os.environ["CONSUMER_KEY"]
14
+ CONSUMER_SECRET = os.environ["CONSUMER_SECRET"]
15
+ ACCESS_KEY = os.environ["ACCESS_KEY"]
16
+ ACCESS_SECRET = os.environ["ACCESS_SECRET"]
17
+
18
+
19
+ # Authenticate to Twitter
20
+ client = tweepy.Client(
21
+ consumer_key=CONSUMER_KEY,
22
+ consumer_secret=CONSUMER_SECRET,
23
+ access_token=ACCESS_KEY,
24
+ access_token_secret=ACCESS_SECRET,
25
+ )
26
+ auth = tweepy.OAuth1UserHandler(
27
+ CONSUMER_KEY,
28
+ CONSUMER_SECRET,
29
+ ACCESS_KEY,
30
+ ACCESS_SECRET,
31
+ )
32
+
33
+ # Create API object
34
+ api = tweepy.API(auth, wait_on_rate_limit=True)
35
+
36
+
37
+ logging.basicConfig(handlers=[logging.StreamHandler()], level=logging.INFO)
38
+ logger = logging.getLogger(__name__)
39
+
40
+
41
+ def tweet_thread(thread_data, base_path):
42
+ for index, tweet in enumerate(thread_data, start=1):
43
+ tweet_length = count_tweet(tweet["content"])
44
+ if tweet_length > 280:
45
+ raise ValueError(
46
+ f"Tweet number {index} exceeds 280 characters by {tweet_length - 280}. Content: {tweet['content']}"
47
+ )
48
+
49
+ # Posting the thread
50
+ previous_tweet_id = None
51
+ for tweet_data in thread_data:
52
+ if "media" in tweet_data and tweet_data["media"]:
53
+ media_ids = [
54
+ api.media_upload(os.path.join(base_path, media["path"])).media_id
55
+ for media in tweet_data["media"]
56
+ ]
57
+ else:
58
+ media_ids = None
59
+
60
+ # Post tweet
61
+ if previous_tweet_id is None:
62
+ # First tweet of the thread
63
+ tweet = client.create_tweet(text=tweet_data["content"], media_ids=media_ids)
64
+ else:
65
+ # Subsequent tweets in the thread
66
+ tweet = client.create_tweet(
67
+ text=tweet_data["content"],
68
+ in_reply_to_tweet_id=previous_tweet_id,
69
+ media_ids=media_ids,
70
+ )
71
+
72
+ previous_tweet_id = tweet.data["id"]
73
+ logger.info(f"Tweeted: {tweet_data['content']}")
74
+
75
+ logger.info("Thread posted!")
76
+
77
+
78
+ if __name__ == "__main__":
79
+ parser = argparse.ArgumentParser(description="Tweet a thread from a json file.")
80
+ parser.add_argument(
81
+ "file", type=str, help="Path to the json file containing the thread data."
82
+ )
83
+ args = parser.parse_args()
84
+
85
+ with open(args.file, "r") as f:
86
+ thread_data = json.load(f)
87
+
88
+ base_path = os.path.dirname(os.path.abspath(args.file))
89
+ tweet_thread(thread_data, base_path)