Spaces:

rbiswasfc
/

zotero-refresh-pipeline

Sleeping

File size: 17,414 Bytes

import os
import re
import time

import dotenv
import fitz  # PyMuPDF
import pandas as pd
import requests
import schedule
import srsly
from bs4 import BeautifulSoup
from datasets import Dataset, Image, load_dataset
from huggingface_hub import create_repo, login, whoami
from PIL import Image as PILImage
from retry import retry
from tqdm.auto import tqdm

dotenv.load_dotenv()
login(token=os.environ.get("HF_TOKEN"))

hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
HF_REPO_ID = f"{hf_user}/zotero-articles"


########################################################
### GET ZOTERO ITEMS
########################################################


@retry(tries=3, delay=8)
def _fetch_one_zotero_batch(url, headers, params):
    """
    Fetch articles from Zotero API
    """
    response = requests.get(url, headers=headers, params=params)
    response.raise_for_status()
    return response.json()


def get_zotero_items(debug=False):
    """
    fetch items from zotero library
    """

    GROUP_ID = os.getenv("GROUP_ID")
    API_KEY = os.getenv("API_KEY")
    BASE_URL = f"https://api.zotero.org/groups/{GROUP_ID}/items"
    LIMIT = 100

    headers = {"Zotero-API-Key": API_KEY, "Content-Type": "application/json"}

    items = []
    start = 0

    i = 1
    while True:
        i += 1
        params = {"limit": LIMIT, "start": start}
        page_items = _fetch_one_zotero_batch(BASE_URL, headers, params)

        if not page_items:
            break

        items.extend(page_items)
        start += LIMIT
        print(f"# items fetched {len(items)}")

        if debug:
            if len(items) > 200:
                break

    return items


########################################################
### EXTRACT ARXIV LINKS AND PDFs
########################################################


def get_arxiv_items(items):
    visited = set()

    arxiv_items = []
    arxiv_pattern = re.compile(r"arxiv.org/abs/(\d+\.\d+)")

    for item in items:
        data = item.get("data", {})
        attachments = item.get("links", {}).get("attachment", {})

        arxiv_url = None
        pdf_url = None

        if "url" in data and "arxiv.org" in data["url"]:
            arxiv_match = arxiv_pattern.search(data["url"])
            if arxiv_match:
                arxiv_url = data["url"]

        if attachments:
            pdf_url = attachments["href"]

        if arxiv_url:
            arxiv_id = arxiv_url.split("/")[-1]
            if arxiv_id in visited:
                continue

            arxiv_items.append(
                {
                    "arxiv_id": arxiv_id,
                    "arxiv_url": arxiv_url,
                    "pdf_url": pdf_url,
                    "added_by": item["meta"]["createdByUser"]["username"],
                    "date_added": data.get("dateAdded", ""),
                }
            )

            visited.add(arxiv_id)

    return arxiv_items


@retry(tries=3, delay=15, backoff=2)
def fetch_arxiv_html(arxiv_id):
    url = f"https://ar5iv.labs.arxiv.org/html/{arxiv_id.split('v')[0]}"
    response = requests.get(url)
    return response.text if response.status_code == 200 else None


def fetch_arxiv_htmls(arxiv_items):
    for item in tqdm(arxiv_items):
        html = fetch_arxiv_html(item["arxiv_id"])
        if html:
            item["raw_html"] = html
        else:
            print(f"failed to fetch html for {item['arxiv_id']}")
            item["raw_html"] = "Error"

    return arxiv_items


########################################################
### PARSE CONTENT FROM ARXIV HTML #
########################################################


def parse_html_content(html):
    """
    Parse content from arxiv html
    """
    arxiv_id_match = re.search(r"\[(\d+\.\d+(v\d+)?)\]", html)
    arxiv_id = arxiv_id_match.group(1) if arxiv_id_match else None
    soup = BeautifulSoup(html, "html.parser")
    result = []

    # Extract paper title
    try:
        paper_title = soup.find("h1", class_="ltx_title ltx_title_document").get_text(strip=True)
    except Exception:
        paper_title = soup.find("title").get_text(strip=True)
        paper_title = re.sub(r"^\[\d+\.\d+(v\d+)?\]\s*", "", paper_title)

    for math in soup.find_all("math"):
        math.decompose()
    for cite in soup.find_all("cite"):
        cite.decompose()

    # Extract abstract
    abstract = soup.find("div", class_="ltx_abstract")
    if abstract:
        result.append(
            {
                "content": " ".join(p.get_text(strip=True) for p in abstract.find_all("p")).replace(")", ") "),
                "title": "Abstract",
                "paper_title": paper_title,
                "content_type": "abstract",
            }
        )
    # Extract sections
    sections = soup.find_all("section", class_="ltx_section")
    for index, section in enumerate(sections):
        section_title = section.find("h2", class_="ltx_title ltx_title_section")
        section_title = section_title.get_text(strip=True) if section_title else f"Section {index + 1}"
        section_content = section.get_text(strip=True).replace(")", ") ")

        content_type = "body"
        if index == 0:
            content_type = "introduction"
        elif index == len(sections) - 1:
            content_type = "conclusion"

        result.append(
            {
                "content": section_content,
                "title": section_title,
                "paper_title": paper_title,
                "content_type": content_type,
            }
        )

    for c in result:
        c["arxiv_id"] = arxiv_id

    return result


########################################################
### GET TEXTS FROM PDF & PARSE
########################################################


def get_pdf_text(arxiv_id):
    url = "http://147.189.194.113:80/extract"  # fix: currently down

    try:
        response = requests.get(url, params={"arxiv_id": arxiv_id})
        response = response.json()
        if "text" in response:
            return response["text"]
        return None
    except Exception as e:
        print(e)
        return None


def get_content_type(section_type, section_count):
    """Determine the content type based on the section type and count"""
    if section_type == "abstract":
        return "abstract"
    elif section_type == "introduction" or section_count == 1:
        return "introduction"
    elif section_type == "conclusion" or section_type == "references":
        return section_type
    else:
        return "body"


def get_section_type(title):
    """Determine the section type based on the title"""
    title_lower = title.lower()
    if "abstract" in title_lower:
        return "abstract"
    elif "introduction" in title_lower:
        return "introduction"
    elif "conclusion" in title_lower:
        return "conclusion"
    elif "reference" in title_lower:
        return "references"
    else:
        return "body"


def parse_markdown_content(md_content, arxiv_id):
    """
    Parses markdown content to identify and extract sections based on headers.
    """

    lines = md_content.split("\n")
    parsed = []
    current_section = None
    content = []
    paper_title = None
    current_title = None

    # identify sections based on headers
    for line in lines:
        if line.startswith("#"):
            if paper_title is None:
                paper_title = line.lstrip("#").strip()
                continue
            if content:
                if current_title:
                    parsed.append(
                        {
                            "content": " ".join(content),
                            "title": current_title,
                            "paper_title": paper_title,
                            "content_type": get_content_type(current_section, len(parsed)),
                            "arxiv_id": arxiv_id,
                        }
                    )
                content = []

            current_title = line.lstrip("#").lstrip("#").lstrip()
            if "bit" not in current_title:
                current_title = (
                    current_title.lstrip("123456789")
                    .lstrip()
                    .lstrip(".")
                    .lstrip()
                    .lstrip("123456789")
                    .lstrip()
                    .lstrip(".")
                    .lstrip()
                )
            current_section = get_section_type(current_title)

        else:
            content.append(line)

    # Add the last section
    if content and current_title:
        parsed.append(
            {
                "content": " ".join(content).replace(")", ") "),
                "title": current_title,
                "paper_title": paper_title,
                "content_type": get_content_type(current_section, len(parsed)),
                "arxiv_id": arxiv_id,
            }
        )

    return parsed


########################################################
### Image Dataset
########################################################


def download_arxiv_pdf(arxiv_id):
    arxiv_id = arxiv_id.split("v")[0]
    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    else:
        raise Exception(f"Failed to download PDF. Status code: {response.status_code}")


def pdf_to_jpegs(pdf_content, output_folder):
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Open the PDF
    doc = fitz.open(stream=pdf_content, filetype="pdf")

    # Iterate through pages
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)

        # Convert page to image
        pix = page.get_pixmap()

        # Save image as JPEG
        image_path = os.path.join(output_folder, f"page_{page_num + 1}.jpg")
        pix.save(image_path)
        # print(f"Saved {image_path}")

    doc.close()


def save_arxiv_article_images(arxiv_id):
    output_folder = os.path.join("data", "arxiv_images", arxiv_id)
    try:
        pdf_content = download_arxiv_pdf(arxiv_id)
        pdf_to_jpegs(pdf_content, output_folder)
    except Exception as e:
        print(f"An error occurred: {str(e)}")


def create_hf_image_dataset(base_dir):
    data = []

    # Walk through the directory
    for root, dirs, files in os.walk(base_dir):
        for file in files:
            if file.endswith(".jpg"):
                # Extract arxiv_id from the path
                arxiv_id = os.path.basename(root)

                # Extract page number from the filename
                match = re.search(r"page_(\d+)", file)
                if match:
                    page_number = int(match.group(1))
                else:
                    continue  # Skip if page number can't be extracted

                # Full path to the image
                image_path = os.path.join(root, file)

                # Open the image to get its size
                with PILImage.open(image_path) as img:
                    width, height = img.size

                # Add the data
                data.append(
                    {"image": image_path, "arxiv_id": arxiv_id, "page_number": page_number, "width": width, "height": height}
                )

    # Create the dataset
    dataset = Dataset.from_dict(
        {
            "image": [d["image"] for d in data],
            "arxiv_id": [d["arxiv_id"] for d in data],
            "page_number": [d["page_number"] for d in data],
            "width": [d["width"] for d in data],
            "height": [d["height"] for d in data],
        }
    )

    # Cast the image column to Image
    dataset = dataset.cast_column("image", Image())

    return dataset


########################################################
### HF UPLOAD
########################################################


def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
    repo_id = HF_REPO_ID
    create_repo(
        repo_id=repo_id,
        token=os.environ.get("HF_TOKEN"),
        private=True,
        repo_type="dataset",
        exist_ok=True,
    )

    # upload image dataset
    img_ds = create_hf_image_dataset("data/arxiv_images")
    img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))

    # push id_to_abstract
    abstract_ds = Dataset.from_pandas(abstract_df)
    abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))

    # push arxiv_items
    arxiv_ds = Dataset.from_pandas(contents_df)
    arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))

    # push processed_arxiv_ids
    processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
    processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
    processed_arxiv_ids_ds.push_to_hub(repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))


########################################################
### MAIN
########################################################


def main():
    items = get_zotero_items(debug=True)
    print(f"# of items fetched from zotero: {len(items)}")
    arxiv_items = get_arxiv_items(items)
    print(f"# of arxiv papers: {len(arxiv_items)}")

    # get already processed arxiv ids from HF
    try:
        existing_arxiv_ids = load_dataset(HF_REPO_ID, "processed_arxiv_ids")["train"]["arxiv_id"]
    except Exception as e:
        print(e)
        try:
            existing_arxiv_ids = srsly.read_json("data/processed_arxiv_ids.json")
        except Exception as e:
            print(e)
            existing_arxiv_ids = []
    existing_arxiv_ids = set(existing_arxiv_ids)
    print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")

    # new arxiv items
    arxiv_items = [item for item in arxiv_items if item["arxiv_id"] not in existing_arxiv_ids]
    arxiv_items = fetch_arxiv_htmls(arxiv_items)
    print(f"# of new arxiv items: {len(arxiv_items)}")

    processed_arxiv_ids = set()
    for item in arxiv_items:
        # download images --
        save_arxiv_article_images(item["arxiv_id"])

        # parse html
        try:
            item["contents"] = parse_html_content(item["raw_html"])
            processed_arxiv_ids.add(item["arxiv_id"])
        except Exception as e:
            print(f"Failed to parse html for {item['arxiv_id']}: {e}")
            item["contents"] = []

        if len(item["contents"]) == 0:
            print("Extracting from pdf...")
            md_content = get_pdf_text(item["arxiv_id"])  # fix this
            if md_content:
                item["contents"] = parse_markdown_content(md_content, item["arxiv_id"])
                processed_arxiv_ids.add(item["arxiv_id"])
            else:
                item["contents"] = []

    # save contents ---
    processed_arxiv_ids = list(processed_arxiv_ids)
    print(f"# of processed arxiv ids: {len(processed_arxiv_ids)}")

    # save abstracts ---
    id_to_abstract = {}
    for item in arxiv_items:
        for entry in item["contents"]:
            if entry["content_type"] == "abstract":
                id_to_abstract[item["arxiv_id"]] = entry["content"]
                break
    print(f"# of abstracts: {len(id_to_abstract)}")
    abstract_df = pd.Series(id_to_abstract).reset_index().rename(columns={"index": "arxiv_id", 0: "abstract"})
    print(abstract_df.head())

    # add to existing dataset
    try:
        old_abstract_df = load_dataset(HF_REPO_ID, "abstracts")["train"].to_pandas()
    except Exception as e:
        print(e)
        old_abstract_df = pd.DataFrame(columns=abstract_df.columns)
    print(old_abstract_df.head())

    abstract_df = pd.concat([old_abstract_df, abstract_df]).reset_index(drop=True)
    abstract_df = abstract_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)

    # contents
    contents_df = pd.DataFrame(arxiv_items)
    print(contents_df.head())
    try:
        old_contents_df = load_dataset(HF_REPO_ID, "articles")["train"].to_pandas()
    except Exception as e:
        print(e)
        old_contents_df = pd.DataFrame(columns=contents_df.columns)
    if len(old_contents_df) > 0:
        print(old_contents_df.sample().T)

    contents_df = pd.concat([old_contents_df, contents_df]).reset_index(drop=True)
    contents_df = contents_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)

    # upload to hf
    processed_arxiv_ids = list(set(processed_arxiv_ids + list(processed_arxiv_ids)))
    upload_to_hf(abstract_df, contents_df, processed_arxiv_ids)

    # save as local copy
    os.makedirs("data", exist_ok=True)
    abstract_df.to_parquet("data/abstracts.parquet")
    contents_df.to_parquet("data/contents.parquet")
    srsly.write_json("data/processed_arxiv_ids.json", processed_arxiv_ids)


def schedule_periodic_task():
    """
    Schedule the main task to run at the user-defined frequency
    """
    main()  # run once initially

    frequency = "daily"  # TODO: env
    if frequency == "hourly":
        print("Scheduling tasks to run every hour at the top of the hour")
        schedule.every().hour.at(":00").do(main)
    elif frequency == "daily":
        start_time = "10:00"
        print("Scheduling tasks to run every day at: {start_time} UTC+00")
        schedule.every().day.at(start_time).do(main)

    while True:
        schedule.run_pending()
        time.sleep(1)


if __name__ == "__main__":
    schedule_periodic_task()