wis-k committed on
Commit
6370672
1 Parent(s): 95eaf35

Upload folder using huggingface_hub

Browse files
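For context, a folder upload like this is typically produced with `huggingface_hub`'s `upload_folder` helper. The sketch below is a minimal, hypothetical invocation; the repo id, folder path, and token source are assumptions, not values taken from this commit.

```python
# Minimal sketch (not from this commit): pushing a local project folder to a Space
# with huggingface_hub. repo_id and folder_path are placeholder values.
from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` or HF_TOKEN
api.upload_folder(
    folder_path=".",              # local project directory (placeholder)
    repo_id="<user>/thread-gpt",  # target Space id (placeholder)
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)
```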
.env.template ADDED
@@ -0,0 +1,7 @@
1
+ OPENAI_API_KEY=
2
+
3
+ # Only if you want to share the threads on X/Twitter
4
+ CONSUMER_KEY=
5
+ CONSUMER_SECRET=
6
+ ACCESS_KEY=
7
+ ACCESS_SECRET=
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ images/examples.png filter=lfs diff=lfs merge=lfs -text
37
+ images/gradio.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,162 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+
162
+ data/
.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.10.0
README.md CHANGED
@@ -1,12 +1,117 @@
1
  ---
2
- title: Thread Gpt
3
- emoji: 🏃
4
- colorFrom: gray
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 4.7.1
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: thread-gpt
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 4.4.1
6
  ---
7
+ <h1 align="center">ThreadGPT</h1>
8
+ <p align="center">
9
+ <img src="images/logo.png" alt="ThreadGPT Logo" style="height: 150px">
10
+ </p>
11
+
12
+ Struggling to keep up with the latest AI research papers? **ThreadGPT** is here to help. It seamlessly transforms complex academic papers into concise, easy-to-understand threads. Not only does it summarize the text, but it also embeds relevant figures, tables, and visuals from the papers directly in the threads. 🧵✨📄
13
+
14
+ <p align="center">
15
+ <img src="./images/gradio.png" alt="Gradio UI" width="800">
16
+ <br>
17
+ <i>Gradio App UI</i>
18
+ </p>
19
+
20
+ <p align="center">
21
+ <img src="./images/examples.png" alt="Example Threads" width="1200">
22
+ <br>
23
+ <i>Examples of threads generated by ThreadGPT (<a href="https://twitter.com/paper_threadoor">@paper_threadoor</a>)</i>
24
+ </p>
25
+
26
+ ## 🛠️ Installation
27
+
28
+ ### Clone the repo
29
+
30
+ ```bash
31
+ git clone https://github.com/wiskojo/thread-gpt
32
+ ```
33
+
34
+ ### Install dependencies
35
+
36
+ ```bash
37
+ # Install PyTorch, torchvision, and torchaudio
38
+ # Please refer to the official PyTorch website (https://pytorch.org) for the installation command that matches your system. Example:
39
+ pip install torch==2.0.0 torchvision==0.15.1
40
+
41
+ # Install all other dependencies
42
+ pip install -r requirements.txt
43
+ ```
44
+
45
+ ### Configure environment variables
46
+
47
+ Copy the `.env.template` file to `.env` and fill in your `OPENAI_API_KEY`.
48
+
49
+ ```bash
50
+ cp .env.template .env
51
+ ```
52
+
53
+ ## 🚀 Getting Started
54
+
55
+ Before proceeding, please ensure that all the installation steps have been successfully completed.
56
+
57
+ ### 🚨 Cost Warning
58
+
59
+ Please be aware that using GPT-4 with the Assistants API can incur high costs. Make sure to monitor your usage and understand the pricing details provided by OpenAI before proceeding.
60
+
61
+ ### Gradio
62
+
63
+ ```bash
64
+ python app.py
65
+ ```
66
+
67
+ ### CLI
68
+
69
+ #### 🧵 Create Thread
70
+
71
+ To create a thread, provide either a URL to a PDF or a local path to one. Use the following commands:
72
+
73
+ ```bash
74
+ # For a URL
75
+ python thread.py <URL_TO_PDF>
76
+
77
+ # For a local file
78
+ python thread.py <LOCAL_PATH_TO_PDF>
79
+ ```
80
+
81
+ By default, you will find all outputs under `./data/<PDF_NAME>`, which will have the following structure:
82
+
83
+ ```
84
+ ./data/<PDF_NAME>/
85
+ ├── figures/
86
+ │ ├── <figure_1_name>.jpg
87
+ │ ├── <figure_2_name>.png
88
+ │ └── ...
89
+ ├── <PDF_NAME>.pdf
90
+ ├── results.json
91
+ ├── thread.json
92
+ ├── processed_thread.json
93
+ └── processed_thread.md
94
+ ```
95
+
96
+ The final output for user consumption is located at `./data/<PDF_NAME>/processed_thread.md`. This file is formatted in Markdown and can be conveniently viewed using any Markdown editor.
97
+
98
+ #### All Contents
99
+
100
+ 1. `figures/`: This directory contains all the figures, tables, and visuals that have been extracted from the paper.
101
+ 2. `<PDF_NAME>.pdf`: This is the original PDF file.
102
+ 3. `results.json`: This file contains the results of the layout parsing. It includes an index of all figures, their paths, and captions that were passed to OpenAI.
103
+ 4. `thread.json`: This file contains the raw thread that was generated by OpenAI before any post-processing was done.
104
+ 5. `processed_thread.json`: This file is a post-processed version of `thread.json`. The post-processing includes steps such as removing source annotations and duplicate figures (a minimal sketch of its shape follows this list).
105
+ 6. `processed_thread.md`: This is a markdown version of `processed_thread.json`. It is the final output provided for user consumption.
106
+
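For orientation, here is a minimal, hypothetical sketch of the shape of `processed_thread.json`. The field names (`content`, `media`, `path`, `explain`) match what `app.py` and `tweet.py` read; the tweet text and file names themselves are invented for illustration.

```python
# Hypothetical example of ./data/<PDF_NAME>/processed_thread.json after post-processing.
# Field names mirror what app.py / tweet.py consume; the values are invented.
import json

example_thread = [
    {
        "content": "(1/5) 🚀 New paper: a simple trick makes attention 2x faster ...",
        "media": [
            {
                "explain": "Overview figure of the proposed method",
                "path": "figures/Figure_1_Overview_of_the_.jpg",
            }
        ],
    },
    {"content": "(2/5) 🔍 The key idea is ...", "media": []},
]

print(json.dumps(example_thread, indent=2, ensure_ascii=False))
```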
107
+ #### 📨 Share Thread
108
+
109
+ To actually share the thread on X/Twitter, you need to set up the credentials in the `.env` file. This requires creating a [developer account](https://developer.twitter.com/) and filling in your `CONSUMER_KEY`, `CONSUMER_SECRET`, `ACCESS_KEY`, and `ACCESS_SECRET`. Then run this command on the created JSON file:
110
+
111
+ ```bash
112
+ python tweet.py ./data/<PDF_NAME>/processed_thread.json
113
+ ```
114
+
115
+ #### 🔧 Customize Assistant
116
 
117
+ ThreadGPT uses OpenAI's Assistants API. To customize the assistant's behavior, modify `create_assistant.py`, which defines defaults for the prompt, name, tools, and model (`gpt-4-1106-preview`). You can adjust these parameters to your liking.
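As a rough illustration, overriding those defaults could look like the sketch below; the alternative name and instructions are illustrative assumptions, not recommendations. The Gradio app forwards the same `instructions`/`model` overrides through `assistant_kwargs` in `create_assistant_then_thread`.

```python
# Hypothetical sketch: creating an assistant with overridden defaults using the
# create_assistant helper from this repo. Name/instructions below are illustrative.
import os

import openai

from create_assistant import create_assistant

client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

assistant = create_assistant(
    client,
    name="ThreadGPT (casual tone)",  # illustrative override
    instructions="Write short, casual threads with at most one figure per tweet.",
    model="gpt-4-1106-preview",      # default model from create_assistant.py
)
print(f"Created assistant: {assistant.id}")
```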
app.py ADDED
@@ -0,0 +1,156 @@
1
+ import copy
2
+ import json
3
+ import os
4
+
5
+ import gradio as gr
6
+ import openai
7
+ from dotenv import load_dotenv
8
+ from gradio_pdf import PDF
9
+
10
+ from create_assistant import INSTRUCTIONS, MODEL
11
+ from thread import create_assistant_then_thread, render_markdown
12
+
13
+ load_dotenv()
14
+
15
+
16
+ OUTPUT_PATH = "data"
17
+ IMAGES_PATH = "images"
18
+
19
+
20
+ def fix_image_paths_in_thread(thread, base_path):
21
+ for tweet in thread:
22
+ for media in tweet.get("media", []):
23
+ media["path"] = os.path.join(
24
+ "file", OUTPUT_PATH, os.path.basename(base_path), media["path"]
25
+ )
26
+ return thread
27
+
28
+
29
+ def run_create_thread(
30
+ url_or_path, openai_api_key, assistant_instructions, assistant_model
31
+ ):
32
+ if not openai_api_key:
33
+ raise gr.Error("No OpenAI API Key provided.")
34
+
35
+ client = openai.OpenAI(api_key=openai_api_key)
36
+
37
+ try:
38
+ saved_path = create_assistant_then_thread(
39
+ url_or_path,
40
+ OUTPUT_PATH,
41
+ client,
42
+ assistant_kwargs={
43
+ "instructions": assistant_instructions,
44
+ "model": assistant_model,
45
+ },
46
+ )
47
+ except Exception as e:
48
+ raise gr.Error(e)
49
+
50
+ with open(os.path.join(saved_path, "processed_thread.json"), "r") as f:
51
+ thread = json.load(f)
52
+
53
+ fixed_thread = fix_image_paths_in_thread(copy.deepcopy(thread), saved_path)
54
+ thread_md = render_markdown(fixed_thread)
55
+
56
+ return (
57
+ thread_md,
58
+ json.dumps(thread, indent=2),
59
+ )
60
+
61
+
62
+ with gr.Blocks() as demo:
63
+ banner = gr.Markdown(
64
+ """<div style="display: flex; align-items: center; justify-content: center; margin-top: 20px;">
65
+ <img src="file/images/logo.png" alt="ThreadGPT Logo" style="height: 60px; margin-right: 12px; margin-top: -12px;">
66
+ <h1 style="font-size: 48px">ThreadGPT</h1>
67
+ </div>
68
+
69
+ <p align="center" style="font-size: 12px;">🚨 Please be aware that usage of GPT-4 with the assistant API can incur high costs. Make sure to monitor your usage and understand the pricing details provided by OpenAI before proceeding. 🚨
70
+ <br>
71
+ ❗ There currently seems to be a bug with the Assistants API where a completed run returns no new messages from the assistant. If you encounter this, please click "Retry 🔄". ❗</p>"""
72
+ )
73
+
74
+ with gr.Accordion("Configuration"):
75
+ with gr.Row():
76
+ api_key = gr.Textbox(
77
+ value=os.getenv("OPENAI_API_KEY"),
78
+ placeholder="sk-**************",
79
+ label="OpenAI API Key",
80
+ type="password",
81
+ interactive=True,
82
+ )
83
+ with gr.Column():
84
+ assistant_instr = gr.Textbox(
85
+ value=INSTRUCTIONS,
86
+ placeholder="Enter system instructions",
87
+ label="System Instructions",
88
+ interactive=True,
89
+ )
90
+ assistant_model = gr.Textbox(
91
+ value=MODEL,
92
+ placeholder="Enter model",
93
+ label="Model",
94
+ interactive=True,
95
+ )
96
+
97
+ with gr.Row():
98
+ url_or_path_state = gr.State("")
99
+ txt = gr.Textbox(
100
+ scale=6,
101
+ show_label=False,
102
+ placeholder="https://arxiv.org/pdf/1706.03762.pdf",
103
+ container=False,
104
+ )
105
+ upload_btn = gr.UploadButton("Upload PDF 📄", file_types=[".pdf"])
106
+ retry_btn = gr.Button("Retry 🔄")
107
+
108
+ with gr.Row(visible=False) as output_row:
109
+ with gr.Column():
110
+ pdf = PDF(height=900)
111
+ with gr.Column():
112
+ with gr.Tab("Markdown"):
113
+ md_viewer = gr.Markdown()
114
+ with gr.Tab("JSON"):
115
+ json_viewer = gr.Textbox(lines=44)
116
+
117
+ txt.submit(
118
+ lambda url_or_path: ("", url_or_path, gr.Row(visible=True), "", ""),
119
+ [txt],
120
+ [txt, url_or_path_state, output_row, md_viewer, json_viewer],
121
+ ).then(
122
+ lambda url_or_path: url_or_path,
123
+ [url_or_path_state],
124
+ [pdf],
125
+ ).then(
126
+ run_create_thread,
127
+ [url_or_path_state, api_key, assistant_instr, assistant_model],
128
+ [md_viewer, json_viewer],
129
+ )
130
+
131
+ upload_btn.upload(
132
+ lambda path: (path, gr.Row(visible=True), "", ""),
133
+ [upload_btn],
134
+ [url_or_path_state, output_row, md_viewer, json_viewer],
135
+ ).then(
136
+ lambda url_or_path: url_or_path,
137
+ [url_or_path_state],
138
+ [pdf],
139
+ ).then(
140
+ run_create_thread,
141
+ [url_or_path_state, api_key, assistant_instr, assistant_model],
142
+ [md_viewer, json_viewer],
143
+ )
144
+
145
+ retry_btn.click(
146
+ lambda url_or_path: url_or_path,
147
+ [url_or_path_state],
148
+ [pdf],
149
+ ).then(
150
+ run_create_thread,
151
+ [url_or_path_state, api_key, assistant_instr, assistant_model],
152
+ [md_viewer, json_viewer],
153
+ )
154
+
155
+ if __name__ == "__main__":
156
+ demo.launch(allowed_paths=[OUTPUT_PATH, IMAGES_PATH])
create_assistant.py ADDED
@@ -0,0 +1,101 @@
1
+ from openai import OpenAI
2
+
3
+ NAME = "ThreadGPT"
4
+ INSTRUCTIONS = """Paper Threadoor 📄🧳 specializes in transforming academic papers into engaging Twitter threads. The threads are formatted in a distinct style for clarity and engagement:
5
+
6
+ 1. Each tweet starts with a numbering in parentheses out of the total number of tweets in the thread and an emoji, e.g., "([number]/[total_number_of_tweets]) [emoji]".
7
+ 2. The tweet content follows, focusing on key insights or information.
8
+
9
+ # Guidelines
10
+
11
+ Your threads should begin with a captivating hook and sequentially explore the methodology, results, and implications of the research, highlighted by the included visual elements. The final tweet offers a conclusion and broader impact statement. Follow this general structure, in the order they are presented, when writing your threads:
12
+
13
+ ## 1. Hook
14
+ * Include something eye catching from the main results (e.g. 2-3x faster, 60% better, 12% higher score on [dataset], etc.).
15
+ * In 1 - 3 sentences, explain intuitively the methodology/approach or what is unique to this paper. From reading just this, the user should be able to fully understand what the approach is, and how it works, but where the details are abstracted and will follow.
16
+ * You should include the main/overview figure of the paper when possible. Most of the time this is "Figure 1", however, pick whichever is most appropriate. Keep in mind, the image(s) you pick should be visually engaging so e.g. tables are generally not recommended.
17
+
18
+ ## 2. Methodology
19
+ * Follow up on the hook's explanation by providing more context, details, and motivation around the methodology. Include relevant figures and tables that can be used to explain the approach.
20
+ * Your explanation should be sufficient for readers who have never read the paper to understand how it works at a conceptual level.
21
+ * Instead of describing surface concepts, actually explain the essential details of how things work so the readers will understand without having to read the full paper (what is special about their approach vs. prior art?).
22
+
23
+ ## 3. Main Results
24
+ * Highlight the main results from the paper
25
+
26
+ ## 4. Supplemental Results and Other Details
27
+ * Supplement the main results with other reported results that provide more insights.
28
+
29
+ ## 5. Conclusion, Discussion, Broader Impact
30
+ * Conclude by explaining the application and broader implication of the work.
31
+ * Generally this tweet should not have any figures/tables.
32
+
33
+ ## Note for all Sections
34
+ * A PDF processing tool is used for extracting figures, tables, and their captions, but it may not be 100% accurate. This tool names the files using the closest text block to the figure or table, assuming it to be the caption. However, this method can lead to errors. For instance, not all captions may be labeled as "Figure N" or "Table N", which might result in misidentifying a non-figure element as a figure, or mismatching the captions. Therefore, when selecting figures for inclusion, it's crucial to refer back to the original document for verification, rather than relying solely on the file's caption or name.
35
+ * Do not reuse the same figures/tables on multiple tweets in the same thread.
36
+ * Provide citations to material referenced from the `retrieval` tool in the form of "【\d+†source】" in your tweet content.
37
+
38
+ # Steps
39
+
40
+ Follow the following steps when writing your threads:
41
+ 1. A PDF processor is used to extract all figures and tables from the PDF, which will be provided to you. The results from the processing will include paths and captions of each figure/table for you to reference in your thread.
42
+ 2. Use `retrieval` tool to actually read and understand the contents of the paper beyond just the figures and tables from step 1.
43
+ 3. Combine your results from step 1 and 2 and write your thread, adding figures/tables using markdown syntax when relevant.
44
+
45
+ # Output Format
46
+
47
+ Make sure that your output format is JSON (within a ```json\n``` markdown block) so that each object is a tweet and the list is a thread of tweets. The image paths should come directly from paths extracted from the PDF processing results:
48
+
49
+ ```json
50
+ [
51
+ {
52
+ "content": "Content of the first tweet (includes "【\d+†source】" citations from the `retrieval` tool)",
53
+ "media": [
54
+ {
55
+ "explain": "Explanation for including Image 1",
56
+ "path": "Path to image 1"
57
+ },
58
+ ...
59
+ {
60
+ "explain": "Explanation for including Image n",
61
+ "path": "Path to image n"
62
+ }
63
+ // Note: A maximum of 4 images can be included in each tweet
64
+ ]
65
+ },
66
+ ...
67
+ {
68
+ "content": "Content of the last tweet in the thread (includes "【\d+†source】" citations from the `retrieval` tool)",
69
+ "media": [
70
+ {
71
+ "explain": "Explanation for including Image 1",
72
+ "path": "Path to image 1"
73
+ },
74
+ ...
75
+ {
76
+ "explain": "Explanation for including Image n",
77
+ "path": "Path to image n"
78
+ }
79
+ // Note: A maximum of 4 images can be included in each tweet
80
+ ]
81
+ }
82
+ ]
83
+ ```"""
84
+ TOOLS = [{"type": "retrieval"}]
85
+ MODEL = "gpt-4-1106-preview"
86
+
87
+
88
+ def create_assistant(
89
+ client: OpenAI,
90
+ name: str = NAME,
91
+ instructions: str = INSTRUCTIONS,
92
+ tools: dict = TOOLS,
93
+ model: str = MODEL,
94
+ ):
95
+ assistant = client.beta.assistants.create(
96
+ name=name,
97
+ instructions=instructions,
98
+ tools=tools,
99
+ model=model,
100
+ )
101
+ return assistant
images/examples.png ADDED

Git LFS Details

  • SHA256: fb7b4173224cd7f457f609cd3bc52c5358c8bbae7ec1e38207ffc4d86388d067
  • Pointer size: 132 Bytes
  • Size of remote file: 1.68 MB
images/gradio.png ADDED

Git LFS Details

  • SHA256: e25df26bcfc113e7c58bb04d0f0a2ce9cc9befc10953aacec9095abb35dda6f9
  • Pointer size: 132 Bytes
  • Size of remote file: 1.27 MB
images/logo.png ADDED
packages.txt ADDED
@@ -0,0 +1,3 @@
1
+ libgl1-mesa-glx
2
+ poppler-utils
3
+ tesseract-ocr
pre-requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ torch==2.0.0
2
+ torchvision==0.15.1
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ gradio==4.4.1
2
+ gradio-pdf==0.0.3
3
+ layoutparser==0.3.4
4
+ openai==1.2.4
5
+ pdf2image==1.16.3
6
+ pydantic==2.4.2
7
+ pytesseract==0.3.10
8
+ python_dotenv==1.0.0
9
+ Pillow==9.5.0
10
+ requests==2.31.0
11
+ tweepy==4.14.0
12
+ tweet-counter==0.1.0
13
+ git+https://github.com/facebookresearch/detectron2.git@v0.4
thread.py ADDED
@@ -0,0 +1,406 @@
1
+ import argparse
2
+ import json
3
+ import logging
4
+ import os
5
+ import re
6
+ import shutil
7
+ import time
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from io import BytesIO
10
+ from typing import Optional
11
+ from urllib.parse import urlparse
12
+
13
+ import layoutparser as lp
14
+ import openai
15
+ import pytesseract
16
+ import requests
17
+ from dotenv import load_dotenv
18
+ from pdf2image import convert_from_bytes
19
+ from pydantic import BaseModel, ConfigDict
20
+
21
+ from create_assistant import create_assistant
22
+
23
+ load_dotenv()
24
+
25
+
26
+ logging.basicConfig(handlers=[logging.StreamHandler()], level=logging.INFO)
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ class Block(BaseModel):
31
+ model_config = ConfigDict(arbitrary_types_allowed=True)
32
+ block: lp.elements.base.BaseLayoutElement
33
+ page_index: int
34
+
35
+
36
+ class CaptionedBlock(Block):
37
+ model_config = ConfigDict(arbitrary_types_allowed=True)
38
+ caption: lp.elements.base.BaseLayoutElement
39
+
40
+
41
+ def get_blocks_and_texts(layouts: list[lp.Layout]) -> tuple[list[Block], list[Block]]:
42
+ blocks = []
43
+ texts = []
44
+ for i, layout in enumerate(layouts):
45
+ for block in layout:
46
+ if block.type in ["Table", "Figure"]:
47
+ # Check if the current block overlaps with any existing block
48
+ for existing_block in blocks:
49
+ if existing_block.page_index != i:
50
+ # If the blocks are not on the same page, skip the overlap check
51
+ continue
52
+ overlap_area = existing_block.block.intersect(block).area
53
+ overlap_ratio = overlap_area / block.area
54
+ if overlap_ratio > 0.5:
55
+ # If the current block overlaps with an existing block by more than 50%
56
+ # Check which block is the "superset" block
57
+ if block.area > existing_block.block.area:
58
+ # If the current block is larger, replace the existing block with the current block
59
+ blocks.remove(existing_block)
60
+ blocks.append(Block(block=block, page_index=i))
61
+ # If the existing block is larger or equal, skip the current block
62
+ break
63
+ else:
64
+ # If the current block does not overlap significantly with any existing block, add it to the list
65
+ blocks.append(Block(block=block, page_index=i))
66
+ elif block.type == "Text":
67
+ texts.append(Block(block=block, page_index=i))
68
+ return blocks, texts
69
+
70
+
71
+ def caption_blocks(blocks: list[Block], texts: list[Block]) -> list[CaptionedBlock]:
72
+ captioned_blocks = []
73
+ # Find the closest text block to the top and bottom of the figure/table block
74
+ for block in blocks:
75
+ block_bottom_center = (
76
+ (block.block.block.x_1 + block.block.block.x_2) / 2,
77
+ block.block.block.y_2,
78
+ )
79
+ block_top_center = (
80
+ (block.block.block.x_1 + block.block.block.x_2) / 2,
81
+ block.block.block.y_1,
82
+ )
83
+ closest_text = None
84
+ closest_distance = float("inf")
85
+ for text in texts:
86
+ if text.page_index != block.page_index:
87
+ continue
88
+ text_top_center = (
89
+ (text.block.block.x_1 + text.block.block.x_2) / 2,
90
+ text.block.block.y_1,
91
+ )
92
+ text_bottom_center = (
93
+ (text.block.block.x_1 + text.block.block.x_2) / 2,
94
+ text.block.block.y_2,
95
+ )
96
+ distance_to_top = (
97
+ (block_bottom_center[0] - text_top_center[0]) ** 2
98
+ + (block_bottom_center[1] - text_top_center[1]) ** 2
99
+ ) ** 0.5
100
+ distance_to_bottom = (
101
+ (block_top_center[0] - text_bottom_center[0]) ** 2
102
+ + (block_top_center[1] - text_bottom_center[1]) ** 2
103
+ ) ** 0.5
104
+ # Reduce `distance_to_top` by 25% to bias towards picking bottom captions
105
+ distance = min(distance_to_top * 0.75, distance_to_bottom)
106
+ if distance < closest_distance:
107
+ closest_distance = distance
108
+ closest_text = text
109
+ if closest_text is not None:
110
+ captioned_blocks.append(
111
+ CaptionedBlock(
112
+ block=block.block,
113
+ caption=closest_text.block,
114
+ page_index=block.page_index,
115
+ )
116
+ )
117
+ return captioned_blocks
118
+
119
+
120
+ def combine_blocks(captioned_block, pages):
121
+ # Combine block and caption together
122
+ x_1 = min(captioned_block.block.block.x_1, captioned_block.caption.block.x_1)
123
+ y_1 = min(captioned_block.block.block.y_1, captioned_block.caption.block.y_1)
124
+ x_2 = max(captioned_block.block.block.x_2, captioned_block.caption.block.x_2)
125
+ y_2 = max(captioned_block.block.block.y_2, captioned_block.caption.block.y_2)
126
+ return pages[captioned_block.page_index].crop((x_1, y_1, x_2, y_2))
127
+
128
+
129
+ def process_captioned_block(captioned_block, pages, base_path):
130
+ combined_image = combine_blocks(captioned_block, pages)
131
+
132
+ # Convert the PIL Image object to base64
133
+ buffered = BytesIO()
134
+ combined_image.save(buffered, format="JPEG")
135
+
136
+ # Convert the PIL Image object to a string for caption
137
+ caption_image = pages[captioned_block.page_index].crop(
138
+ (
139
+ captioned_block.caption.block.x_1,
140
+ captioned_block.caption.block.y_1,
141
+ captioned_block.caption.block.x_2,
142
+ captioned_block.caption.block.y_2,
143
+ )
144
+ )
145
+ caption_text = pytesseract.image_to_string(caption_image)
146
+
147
+ figures_path = os.path.join(base_path, "figures")
148
+ os.makedirs(figures_path, exist_ok=True)
149
+
150
+ # Convert the caption text to snake case alpha numeric and truncate, then add .jpg to it
151
+ img_name = re.sub("[^0-9a-zA-Z]+", "_", caption_text)[:30] + ".jpg"
152
+ img_path = os.path.join(figures_path, img_name)
153
+
154
+ with open(img_path, "wb") as f:
155
+ f.write(buffered.getvalue())
156
+
157
+ return {"image": f"figures/{img_name}", "caption": caption_text}
158
+
159
+
160
+ def process_pdf(content: bytes, model: lp.models.Detectron2LayoutModel, base_path: str):
161
+ pages = convert_from_bytes(content)
162
+ logger.info("PDF converted to images")
163
+
164
+ with ThreadPoolExecutor(max_workers=16) as executor:
165
+ layouts = list(executor.map(model.detect, pages))
166
+ logger.info("Layout detection completed")
167
+
168
+ blocks, texts = get_blocks_and_texts(layouts)
169
+ logger.info("Blocks and texts extracted")
170
+
171
+ captioned_blocks = caption_blocks(blocks, texts)
172
+ logger.info("Captioning completed")
173
+
174
+ with ThreadPoolExecutor(max_workers=16) as executor:
175
+ results = list(
176
+ executor.map(
177
+ lambda captioned_block: process_captioned_block(
178
+ captioned_block, pages, base_path
179
+ ),
180
+ captioned_blocks,
181
+ )
182
+ )
183
+
184
+ return results
185
+
186
+
187
+ def wait_on_run(run, thread, client: openai.OpenAI):
188
+ while run.status == "queued" or run.status == "in_progress":
189
+ run = client.beta.threads.runs.retrieve(
190
+ thread_id=thread.id,
191
+ run_id=run.id,
192
+ )
193
+ time.sleep(0.5)
194
+ return run
195
+
196
+
197
+ def generate_thread_content(
198
+ pdf_path: str, results: dict, client: openai.OpenAI, assistant_id: str
199
+ ):
200
+ with open(pdf_path, "rb") as f:
201
+ pdf_file = client.files.create(file=f, purpose="assistants")
202
+
203
+ try:
204
+ thread = client.beta.threads.create()
205
+
206
+ message = client.beta.threads.messages.create(
207
+ thread_id=thread.id,
208
+ role="user",
209
+ content=f"{json.dumps(results)}\n\nCreate a thread for this. Your answer must be in JSON, media links should be from the local paths above.",
210
+ file_ids=[pdf_file.id],
211
+ )
212
+
213
+ run = client.beta.threads.runs.create(
214
+ thread_id=thread.id, assistant_id=assistant_id
215
+ )
216
+
217
+ run = wait_on_run(run, thread, client)
218
+
219
+ messages = client.beta.threads.messages.list(
220
+ thread_id=thread.id, order="asc", after=message.id
221
+ )
222
+
223
+ # TODO: OpenAI can return no new messages somehow (might be a bug, the run completes successfully but no new messages are listed in the thread), catch this and throw error
224
+ if not messages.data or not messages.data[0].content:
225
+ raise ValueError("Unexpected empty response from OpenAI. Please try again.")
226
+
227
+ except Exception as e:
228
+ logger.error(f"Failed to generate thread content: {e}")
229
+ raise
230
+ finally:
231
+ # Delete uploaded PDF file
232
+ try:
233
+ client.files.delete(file_id=pdf_file.id)
234
+ except Exception as e:
235
+ logger.error(f"Failed to delete file: {e}")
236
+
237
+ # Extract JSON content from the message
238
+ message_content = messages.data[0].content[0].text.value
239
+ json_content = re.search(r"(```json\n)(.*?)(\n```)", message_content, re.DOTALL)
240
+ if json_content is None:
241
+ json_content = re.search(r"(```\n)(.*?)(\n```)", message_content, re.DOTALL)
242
+ if json_content is not None:
243
+ json_content = json_content.group(2)
244
+
245
+ try:
246
+ paper_thread = json.loads(json_content)
247
+ except (json.JSONDecodeError, TypeError):
248
+ raise ValueError(
249
+ "The thread generated by OpenAI was not in the expected JSON format."
250
+ )
251
+
252
+ return paper_thread
253
+
254
+
255
+ def process_thread(thread_data, base_path):
256
+ processed_data = []
257
+ media_set = set()
258
+ for data in thread_data:
259
+ cleaned_content = re.sub(
260
+ r"【\d+†source】", "", data["content"]
261
+ ) # Remove all source annotations
262
+ media_list = []
263
+ for media in data.get("media", []):
264
+ if media["path"] and media["path"] not in media_set:
265
+ media_file_path = os.path.join(base_path, media["path"])
266
+ if os.path.isfile(media_file_path):
267
+ media_list.append(media)
268
+ media_set.add(media["path"])
269
+ processed_data.append({"content": cleaned_content, "media": media_list})
270
+ return processed_data
271
+
272
+
273
+ def render_markdown(processed_thread):
274
+ markdown_content = ""
275
+ for data in processed_thread:
276
+ markdown_content += data["content"] + "\n"
277
+ for media in data["media"]:
278
+ markdown_content += f'\n<div align="center">\n'
279
+ markdown_content += f' <img src="{media["path"]}" alt="{media.get("explain", "")}" style="max-width: 75%;">\n'
280
+ markdown_content += "</div>\n"
281
+ markdown_content += "\n---\n\n"
282
+ return markdown_content
283
+
284
+
285
+ def uri_validator(x):
286
+ try:
287
+ result = urlparse(x)
288
+ return all([result.scheme, result.netloc])
289
+ except Exception:
290
+ return False
291
+
292
+
293
+ def create_thread(
294
+ pdf_url_or_path: str, output_path: str, client: openai.OpenAI, assistant_id: str
295
+ ):
296
+ # Extract the PDF name from the URL and remove any file extension at the end
297
+ pdf_name = os.path.splitext(pdf_url_or_path.split("/")[-1])[0]
298
+ base_path = os.path.join(output_path, pdf_name)
299
+ results_path = os.path.join(base_path, "results.json")
300
+ pdf_path = os.path.join(base_path, f"{pdf_name}.pdf")
301
+ thread_path = os.path.join(base_path, "thread.json")
302
+ processed_thread_path = os.path.join(base_path, "processed_thread.json")
303
+ markdown_path = os.path.join(base_path, "processed_thread.md")
304
+
305
+ # Check if base path already exists and there is a results.json
306
+ # If so, assume we've run this before and just return results
307
+ if os.path.exists(base_path) and os.path.isfile(results_path):
308
+ with open(results_path, "r") as f:
309
+ results = json.load(f)
310
+ else:
311
+ os.makedirs(base_path, exist_ok=True)
312
+
313
+ if uri_validator(pdf_url_or_path):
314
+ pdf_content = requests.get(pdf_url_or_path).content
315
+ with open(pdf_path, "wb") as f:
316
+ f.write(pdf_content)
317
+ elif os.path.isfile(pdf_url_or_path):
318
+ shutil.copy(pdf_url_or_path, pdf_path)
319
+ with open(pdf_path, "rb") as f:
320
+ pdf_content = f.read()
321
+ else:
322
+ raise ValueError(
323
+ f"Invalid input: {pdf_url_or_path}. It should be a valid URL or a file path."
324
+ )
325
+
326
+ model = lp.models.Detectron2LayoutModel(
327
+ config_path="lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
328
+ extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
329
+ label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
330
+ )
331
+
332
+ results = process_pdf(pdf_content, model, base_path)
333
+ # Remove duplicates from results
334
+ results = [dict(t) for t in set(tuple(d.items()) for d in results)]
335
+ with open(results_path, "w") as f:
336
+ json.dump(results, f, indent=2)
337
+
338
+ paper_thread = generate_thread_content(pdf_path, results, client, assistant_id)
339
+ with open(thread_path, "w") as f:
340
+ json.dump(paper_thread, f, indent=2)
341
+
342
+ # Process the thread
343
+ processed_thread = process_thread(paper_thread, base_path)
344
+ with open(processed_thread_path, "w") as f:
345
+ json.dump(processed_thread, f, indent=2)
346
+
347
+ # Save processed thread as a markdown file
348
+ markdown_content = render_markdown(processed_thread)
349
+ with open(markdown_path, "w") as f:
350
+ f.write(markdown_content)
351
+
352
+ logger.info(f"Saved all outputs to: {os.path.abspath(base_path)}")
353
+
354
+ return base_path
355
+
356
+
357
+ def create_assistant_then_thread(
358
+ pdf_url_or_path: str,
359
+ output_path: str,
360
+ client: openai.OpenAI,
361
+ assistant_kwargs: Optional[dict] = None,
362
+ ):
363
+ if assistant_kwargs is None:
364
+ assistant_kwargs = {}
365
+ try:
366
+ assistant = create_assistant(client, **assistant_kwargs)
367
+ except Exception:
368
+ logger.error("Failed to create assistant", exc_info=True)
369
+ raise
370
+ try:
371
+ saved_path = create_thread(
372
+ pdf_url_or_path,
373
+ output_path,
374
+ client,
375
+ assistant.id,
376
+ )
377
+ except Exception:
378
+ logger.error("Failed to create thread", exc_info=True)
379
+ raise
380
+ finally:
381
+ try:
382
+ client.beta.assistants.delete(assistant.id)
383
+ except Exception:
384
+ logger.error("Failed to delete assistant", exc_info=True)
385
+ raise
386
+ return saved_path
387
+
388
+
389
+ if __name__ == "__main__":
390
+ parser = argparse.ArgumentParser(
391
+ description="Process a PDF from a URL or a local path."
392
+ )
393
+ parser.add_argument(
394
+ "url_or_path", type=str, help="The URL or local path of the PDF to process."
395
+ )
396
+ parser.add_argument(
397
+ "-o",
398
+ "--output",
399
+ default="data",
400
+ help="The output directory to store the results.",
401
+ )
402
+ args = parser.parse_args()
403
+
404
+ client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
405
+
406
+ create_assistant_then_thread(args.url_or_path, args.output, client)
tweet.py ADDED
@@ -0,0 +1,89 @@
1
+ import argparse
2
+ import json
3
+ import logging
4
+ import os
5
+
6
+ import tweepy
7
+ from dotenv import load_dotenv
8
+ from tweet_counter import count_tweet
9
+
10
+ load_dotenv()
11
+
12
+
13
+ CONSUMER_KEY = os.environ["CONSUMER_KEY"]
14
+ CONSUMER_SECRET = os.environ["CONSUMER_SECRET"]
15
+ ACCESS_KEY = os.environ["ACCESS_KEY"]
16
+ ACCESS_SECRET = os.environ["ACCESS_SECRET"]
17
+
18
+
19
+ # Authenticate to Twitter
20
+ client = tweepy.Client(
21
+ consumer_key=CONSUMER_KEY,
22
+ consumer_secret=CONSUMER_SECRET,
23
+ access_token=ACCESS_KEY,
24
+ access_token_secret=ACCESS_SECRET,
25
+ )
26
+ auth = tweepy.OAuth1UserHandler(
27
+ CONSUMER_KEY,
28
+ CONSUMER_SECRET,
29
+ ACCESS_KEY,
30
+ ACCESS_SECRET,
31
+ )
32
+
33
+ # Create API object
34
+ api = tweepy.API(auth, wait_on_rate_limit=True)
35
+
36
+
37
+ logging.basicConfig(handlers=[logging.StreamHandler()], level=logging.INFO)
38
+ logger = logging.getLogger(__name__)
39
+
40
+
41
+ def tweet_thread(thread_data, base_path):
42
+ for index, tweet in enumerate(thread_data, start=1):
43
+ tweet_length = count_tweet(tweet["content"])
44
+ if tweet_length > 280:
45
+ raise ValueError(
46
+ f"Tweet number {index} exceeds 280 characters by {tweet_length - 280}. Content: {tweet['content']}"
47
+ )
48
+
49
+ # Posting the thread
50
+ previous_tweet_id = None
51
+ for tweet_data in thread_data:
52
+ if "media" in tweet_data and tweet_data["media"]:
53
+ media_ids = [
54
+ api.media_upload(os.path.join(base_path, media["path"])).media_id
55
+ for media in tweet_data["media"]
56
+ ]
57
+ else:
58
+ media_ids = None
59
+
60
+ # Post tweet
61
+ if previous_tweet_id is None:
62
+ # First tweet of the thread
63
+ tweet = client.create_tweet(text=tweet_data["content"], media_ids=media_ids)
64
+ else:
65
+ # Subsequent tweets in the thread
66
+ tweet = client.create_tweet(
67
+ text=tweet_data["content"],
68
+ in_reply_to_tweet_id=previous_tweet_id,
69
+ media_ids=media_ids,
70
+ )
71
+
72
+ previous_tweet_id = tweet.data["id"]
73
+ logger.info(f"Tweeted: {tweet_data['content']}")
74
+
75
+ logger.info("Thread posted!")
76
+
77
+
78
+ if __name__ == "__main__":
79
+ parser = argparse.ArgumentParser(description="Tweet a thread from a json file.")
80
+ parser.add_argument(
81
+ "file", type=str, help="Path to the json file containing the thread data."
82
+ )
83
+ args = parser.parse_args()
84
+
85
+ with open(args.file, "r") as f:
86
+ thread_data = json.load(f)
87
+
88
+ base_path = os.path.dirname(os.path.abspath(args.file))
89
+ tweet_thread(thread_data, base_path)