davidberenstein1957 HF staff committed on
Commit
08fabf7
1 Parent(s): f79ba24

feat: add docling support

Browse files
Files changed (4) hide show
  1. app.py +72 -19
  2. demo.py +8 -0
  3. requirements.in +4 -2
  4. requirements.txt +6 -438
app.py CHANGED
@@ -2,12 +2,24 @@ import logging
2
  from pathlib import Path
3
 
4
  import gradio as gr
 
5
  from datasets import Dataset
6
  from gradio_log import Log
7
  from huggingface_hub import DatasetCard
8
- from llama_index.core import SimpleDirectoryReader
9
  from llama_index.core.node_parser import SentenceSplitter
 
10
  from llama_index.core.schema import MetadataMode
 
 
 
 
 
 
 
 
 
 
 
11
  from tqdm.auto import tqdm
12
 
13
  log_file = "logs.txt"
@@ -22,8 +34,40 @@ def load_corpus(
22
  ):
23
  if verbose:
24
  gr.Info("Loading files...")
25
- reader = SimpleDirectoryReader(input_files=files)
26
- docs = reader.load_data()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  if split_sentences is False:
28
  gr.Info(
29
  "Skipping sentence splitting. Each file will be a single row in the dataset."
@@ -61,7 +105,10 @@ def upload_and_preview(
61
  split_sentences: bool = True,
62
  ):
63
  print("loading files")
64
- file_paths = [file.name for file in files]
 
 
 
65
 
66
  print("parsing into sentences")
67
  corpus = load_corpus(
@@ -159,17 +206,18 @@ def update_dataset_card(
159
  description = """Corpus Creator is a tool for transforming a collection of text files into a Hugging Face dataset, perfect for various natural language processing (NLP) tasks. Whether you're preparing data for synthetic generation, building pipelines, or setting up annotation tasks, this app simplifies the process.
160
 
161
  Key features:
162
- - 📁 Easy text file upload
163
  - ✂️ Customizable text chunking
164
  - 👁️ Instant dataset preview
165
- - 🚀 One-click upload to Hugging Face Hubub
166
 
167
- #### Powered by Llama Index
168
 
169
- Corpus Creator leverages the power of Llama Index, a data framework for LLM-based applications. Specifically, we use Llama Index's `SentenceSplitter` class to intelligently chunk your text. This ensures that your dataset is split in a way that preserves semantic meaning, making it ideal for downstream NLP tasks. [Learn more about Llama Index](https://www.llamaindex.ai/)
170
 
 
171
 
172
- Get started by uploading your files and see your corpus take shape!
173
 
174
  [View an example dataset](https://huggingface.co/datasets/davanstrien/MOH-Bethnal-Green) created with Corpus Creator.
175
  """
@@ -189,14 +237,19 @@ with gr.Blocks() as demo:
189
  gr.Markdown(
190
  "### 1. Upload Files\nClick 'Upload Files' to select text file(s). A preview will generate automatically"
191
  )
192
- with gr.Row():
193
- upload_button = gr.File(
194
- file_types=["text"],
195
- file_count="multiple",
196
- height=50,
197
- interactive=True,
198
- label="Upload Files",
199
- )
 
 
 
 
 
200
  gr.Markdown("""
201
  ### 2. Adjust Parameters for Chunking Text (Optional)
202
  Customize the chunk size, overlap, and sentence splitting option according to your requirements.
@@ -238,8 +291,8 @@ with gr.Blocks() as demo:
238
  with gr.Accordion("detailed logs", open=False):
239
  Log(log_file, dark=True, xterm_font_size=12)
240
 
241
- upload_button.upload(
242
- upload_and_preview,
243
  inputs=[upload_button, chunk_size, chunk_overlap, split_sentences],
244
  outputs=[state, corpus_preview_df, preview_summary],
245
  )
 
2
  from pathlib import Path
3
 
4
  import gradio as gr
5
+ import pandas as pd
6
  from datasets import Dataset
7
  from gradio_log import Log
8
  from huggingface_hub import DatasetCard
 
9
  from llama_index.core.node_parser import SentenceSplitter
10
+ from llama_index.core.readers import SimpleDirectoryReader
11
  from llama_index.core.schema import MetadataMode
12
+ from llama_index.readers.docling import DoclingReader
13
+ from llama_index.readers.file import (
14
+ EpubReader,
15
+ HWPReader,
16
+ ImageReader,
17
+ IPYNBReader,
18
+ MboxReader,
19
+ PandasCSVReader,
20
+ PandasExcelReader,
21
+ VideoAudioReader,
22
+ )
23
  from tqdm.auto import tqdm
24
 
25
  log_file = "logs.txt"
 
34
  ):
35
  if verbose:
36
  gr.Info("Loading files...")
37
+
38
+ docling_reader = DoclingReader()
39
+ try:
40
+ docs = []
41
+ for file in files:
42
+ docs.extend(docling_reader.load_data(file))
43
+ except Exception:
44
+ reader = SimpleDirectoryReader(
45
+ input_files=files,
46
+ file_extractor={
47
+ ".hwp": HWPReader,
48
+ ".pdf": docling_reader,
49
+ ".docx": docling_reader,
50
+ ".pptx": docling_reader,
51
+ ".ppt": docling_reader,
52
+ ".pptm": docling_reader,
53
+ ".gif": ImageReader,
54
+ ".jpg": ImageReader,
55
+ ".png": ImageReader,
56
+ ".jpeg": ImageReader,
57
+ ".webp": ImageReader,
58
+ ".mp3": VideoAudioReader,
59
+ ".mp4": VideoAudioReader,
60
+ ".csv": PandasCSVReader,
61
+ ".epub": EpubReader,
62
+ ".md": docling_reader,
63
+ ".mbox": MboxReader,
64
+ ".ipynb": IPYNBReader,
65
+ ".xls": PandasExcelReader,
66
+ ".xlsx": PandasExcelReader,
67
+ },
68
+ )
69
+ docs = reader.load_data()
70
+
71
  if split_sentences is False:
72
  gr.Info(
73
  "Skipping sentence splitting. Each file will be a single row in the dataset."
 
105
  split_sentences: bool = True,
106
  ):
107
  print("loading files")
108
+ if isinstance(files, pd.DataFrame):
109
+ file_paths = files["urls"].tolist()
110
+ else:
111
+ file_paths = [file.name for file in files]
112
 
113
  print("parsing into sentences")
114
  corpus = load_corpus(
 
206
  description = """Corpus Creator is a tool for transforming a collection of text files into a Hugging Face dataset, perfect for various natural language processing (NLP) tasks. Whether you're preparing data for synthetic generation, building pipelines, or setting up annotation tasks, this app simplifies the process.
207
 
208
  Key features:
209
+ - 🗂️ Reads popular document formats (PDF, DOCX, PPTX, HTML, AsciiDoc, Markdown)
210
  - ✂️ Customizable text chunking
211
  - 👁️ Instant dataset preview
212
+ - 🚀 One-click upload to Hugging Face Hub
213
 
214
+ #### Powered by Llama Index and Docling
215
 
216
+ Corpus Creator leverages the power of Llama Index, a data framework for LLM-based applications. Specifically, we use Llama Index's `SentenceSplitter` class to intelligently chunk your text. This ensures that your dataset is split in a way that preserves semantic meaning, making it ideal for downstream NLP tasks. [Learn more about Llama Index](https://www.llamaindex.ai/).
217
 
218
+ Docling is a tool for converting documents to text. It supports a wide range of document formats, including PDF, DOCX, PPTX, Images, HTML, AsciiDoc, and Markdown. [Learn more about Docling](https://ds4sd.github.io/docling/).
219
 
220
+ Get started by uploading your files and see your corpus take shape!
221
 
222
  [View an example dataset](https://huggingface.co/datasets/davanstrien/MOH-Bethnal-Green) created with Corpus Creator.
223
  """
 
237
  gr.Markdown(
238
  "### 1. Upload Files\nClick 'Upload Files' to select text file(s). A preview will generate automatically"
239
  )
240
+ with gr.Tab():
241
+ with gr.Row():
242
+ upload_button = gr.File(
243
+ file_types=["text"],
244
+ file_count="multiple",
245
+ height=50,
246
+ interactive=True,
247
+ label="Upload Files",
248
+ )
249
+ with gr.Tab():
250
+ with gr.Row():
251
+ urls = gr.Dataframe(label="URL", headers=["urls"], interactive=True)
252
+ upload_button_files = gr.Button("Upload URLs")
253
  gr.Markdown("""
254
  ### 2. Adjust Parameters for Chunking Text (Optional)
255
  Customize the chunk size, overlap, and sentence splitting option according to your requirements.
 
291
  with gr.Accordion("detailed logs", open=False):
292
  Log(log_file, dark=True, xterm_font_size=12)
293
 
294
+ gr.on(
295
+ triggers=[upload_button.upload, upload_button_files.click],
296
  inputs=[upload_button, chunk_size, chunk_overlap, split_sentences],
297
  outputs=[state, corpus_preview_df, preview_summary],
298
  )
demo.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from docling.document_converter import DocumentConverter
2
+
3
+ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
4
+ converter = DocumentConverter()
5
+ result = converter.convert(source)
6
+ print(
7
+ result.document.export_to_markdown()
8
+ ) # output: "### Docling Technical Report[...]"
requirements.in CHANGED
@@ -1,4 +1,6 @@
1
  datasets
2
- gradio[oauth]==4.36.1
3
  gradio_log
4
- llama_index
 
 
 
1
  datasets
2
+ gradio[oauth]<5
3
  gradio_log
4
+ llama_index==0.11.22
5
+ docling
6
+ llama-index-readers-docling
requirements.txt CHANGED
@@ -1,438 +1,6 @@
1
- # This file was autogenerated by uv via the following command:
2
- # uv pip compile requirements.in -o requirements.txt
3
- aiofiles==23.2.1
4
- # via gradio
5
- aiohttp==3.9.5
6
- # via
7
- # datasets
8
- # fsspec
9
- # llama-index-core
10
- # llama-index-legacy
11
- aiosignal==1.3.1
12
- # via aiohttp
13
- altair==5.3.0
14
- # via gradio
15
- annotated-types==0.7.0
16
- # via pydantic
17
- anyio==4.4.0
18
- # via
19
- # httpx
20
- # openai
21
- # starlette
22
- # watchfiles
23
- attrs==23.2.0
24
- # via
25
- # aiohttp
26
- # jsonschema
27
- # referencing
28
- authlib==1.3.1
29
- # via gradio
30
- beautifulsoup4==4.12.3
31
- # via llama-index-readers-file
32
- certifi==2024.6.2
33
- # via
34
- # httpcore
35
- # httpx
36
- # requests
37
- cffi==1.16.0
38
- # via cryptography
39
- charset-normalizer==3.3.2
40
- # via requests
41
- click==8.1.7
42
- # via
43
- # nltk
44
- # typer
45
- # uvicorn
46
- contourpy==1.2.1
47
- # via matplotlib
48
- cryptography==42.0.8
49
- # via authlib
50
- cycler==0.12.1
51
- # via matplotlib
52
- dataclasses-json==0.6.7
53
- # via
54
- # llama-index-core
55
- # llama-index-legacy
56
- datasets==2.20.0
57
- # via -r requirements.in
58
- deprecated==1.2.14
59
- # via
60
- # llama-index-core
61
- # llama-index-legacy
62
- dill==0.3.8
63
- # via
64
- # datasets
65
- # multiprocess
66
- dirtyjson==1.0.8
67
- # via
68
- # llama-index-core
69
- # llama-index-legacy
70
- distro==1.9.0
71
- # via openai
72
- dnspython==2.6.1
73
- # via email-validator
74
- email-validator==2.1.2
75
- # via fastapi
76
- fastapi==0.111.0
77
- # via gradio
78
- fastapi-cli==0.0.4
79
- # via fastapi
80
- ffmpy==0.3.2
81
- # via gradio
82
- filelock==3.15.1
83
- # via
84
- # datasets
85
- # huggingface-hub
86
- fonttools==4.53.0
87
- # via matplotlib
88
- frozenlist==1.4.1
89
- # via
90
- # aiohttp
91
- # aiosignal
92
- fsspec==2024.5.0
93
- # via
94
- # datasets
95
- # gradio-client
96
- # huggingface-hub
97
- # llama-index-core
98
- # llama-index-legacy
99
- gradio==4.36.1
100
- # via
101
- # -r requirements.in
102
- # gradio-log
103
- gradio-client==1.0.1
104
- # via gradio
105
- gradio-log==0.0.4
106
- # via -r requirements.in
107
- greenlet==3.0.3
108
- # via sqlalchemy
109
- h11==0.14.0
110
- # via
111
- # httpcore
112
- # uvicorn
113
- httpcore==1.0.5
114
- # via httpx
115
- httptools==0.6.1
116
- # via uvicorn
117
- httpx==0.27.0
118
- # via
119
- # fastapi
120
- # gradio
121
- # gradio-client
122
- # llama-index-core
123
- # llama-index-legacy
124
- # llamaindex-py-client
125
- # openai
126
- huggingface-hub==0.23.4
127
- # via
128
- # datasets
129
- # gradio
130
- # gradio-client
131
- idna==3.7
132
- # via
133
- # anyio
134
- # email-validator
135
- # httpx
136
- # requests
137
- # yarl
138
- importlib-resources==6.4.0
139
- # via gradio
140
- itsdangerous==2.2.0
141
- # via gradio
142
- jinja2==3.1.4
143
- # via
144
- # altair
145
- # fastapi
146
- # gradio
147
- joblib==1.4.2
148
- # via nltk
149
- jsonschema==4.22.0
150
- # via altair
151
- jsonschema-specifications==2023.12.1
152
- # via jsonschema
153
- kiwisolver==1.4.5
154
- # via matplotlib
155
- llama-index==0.10.45
156
- # via -r requirements.in
157
- llama-index-agent-openai==0.2.7
158
- # via
159
- # llama-index
160
- # llama-index-program-openai
161
- llama-index-cli==0.1.12
162
- # via llama-index
163
- llama-index-core==0.10.44
164
- # via
165
- # llama-index
166
- # llama-index-agent-openai
167
- # llama-index-cli
168
- # llama-index-embeddings-openai
169
- # llama-index-indices-managed-llama-cloud
170
- # llama-index-llms-openai
171
- # llama-index-multi-modal-llms-openai
172
- # llama-index-program-openai
173
- # llama-index-question-gen-openai
174
- # llama-index-readers-file
175
- # llama-index-readers-llama-parse
176
- # llama-parse
177
- llama-index-embeddings-openai==0.1.10
178
- # via
179
- # llama-index
180
- # llama-index-cli
181
- llama-index-indices-managed-llama-cloud==0.1.6
182
- # via llama-index
183
- llama-index-legacy==0.9.48
184
- # via llama-index
185
- llama-index-llms-openai==0.1.22
186
- # via
187
- # llama-index
188
- # llama-index-agent-openai
189
- # llama-index-cli
190
- # llama-index-multi-modal-llms-openai
191
- # llama-index-program-openai
192
- # llama-index-question-gen-openai
193
- llama-index-multi-modal-llms-openai==0.1.6
194
- # via llama-index
195
- llama-index-program-openai==0.1.6
196
- # via
197
- # llama-index
198
- # llama-index-question-gen-openai
199
- llama-index-question-gen-openai==0.1.3
200
- # via llama-index
201
- llama-index-readers-file==0.1.25
202
- # via llama-index
203
- llama-index-readers-llama-parse==0.1.4
204
- # via llama-index
205
- llama-parse==0.4.4
206
- # via llama-index-readers-llama-parse
207
- llamaindex-py-client==0.1.19
208
- # via
209
- # llama-index-core
210
- # llama-index-indices-managed-llama-cloud
211
- markdown-it-py==3.0.0
212
- # via rich
213
- markupsafe==2.1.5
214
- # via
215
- # gradio
216
- # jinja2
217
- marshmallow==3.21.3
218
- # via dataclasses-json
219
- matplotlib==3.9.0
220
- # via gradio
221
- mdurl==0.1.2
222
- # via markdown-it-py
223
- multidict==6.0.5
224
- # via
225
- # aiohttp
226
- # yarl
227
- multiprocess==0.70.16
228
- # via datasets
229
- mypy-extensions==1.0.0
230
- # via typing-inspect
231
- nest-asyncio==1.6.0
232
- # via
233
- # llama-index-core
234
- # llama-index-legacy
235
- networkx==3.3
236
- # via
237
- # llama-index-core
238
- # llama-index-legacy
239
- nltk==3.8.1
240
- # via
241
- # llama-index-core
242
- # llama-index-legacy
243
- numpy==2.0.0
244
- # via
245
- # altair
246
- # contourpy
247
- # datasets
248
- # gradio
249
- # llama-index-core
250
- # llama-index-legacy
251
- # matplotlib
252
- # pandas
253
- # pyarrow
254
- openai==1.34.0
255
- # via
256
- # llama-index-agent-openai
257
- # llama-index-core
258
- # llama-index-legacy
259
- orjson==3.10.5
260
- # via
261
- # fastapi
262
- # gradio
263
- packaging==24.1
264
- # via
265
- # altair
266
- # datasets
267
- # gradio
268
- # gradio-client
269
- # huggingface-hub
270
- # marshmallow
271
- # matplotlib
272
- pandas==2.2.2
273
- # via
274
- # altair
275
- # datasets
276
- # gradio
277
- # llama-index-core
278
- # llama-index-legacy
279
- pillow==10.3.0
280
- # via
281
- # gradio
282
- # llama-index-core
283
- # matplotlib
284
- pyarrow==16.1.0
285
- # via datasets
286
- pyarrow-hotfix==0.6
287
- # via datasets
288
- pycparser==2.22
289
- # via cffi
290
- pydantic==2.7.4
291
- # via
292
- # fastapi
293
- # gradio
294
- # llamaindex-py-client
295
- # openai
296
- pydantic-core==2.18.4
297
- # via pydantic
298
- pydub==0.25.1
299
- # via gradio
300
- pygments==2.18.0
301
- # via rich
302
- pyparsing==3.1.2
303
- # via matplotlib
304
- pypdf==4.2.0
305
- # via llama-index-readers-file
306
- python-dateutil==2.9.0.post0
307
- # via
308
- # matplotlib
309
- # pandas
310
- python-dotenv==1.0.1
311
- # via uvicorn
312
- python-multipart==0.0.9
313
- # via
314
- # fastapi
315
- # gradio
316
- pytz==2024.1
317
- # via pandas
318
- pyyaml==6.0.1
319
- # via
320
- # datasets
321
- # gradio
322
- # huggingface-hub
323
- # llama-index-core
324
- # uvicorn
325
- referencing==0.35.1
326
- # via
327
- # jsonschema
328
- # jsonschema-specifications
329
- regex==2024.5.15
330
- # via
331
- # nltk
332
- # tiktoken
333
- requests==2.32.3
334
- # via
335
- # datasets
336
- # huggingface-hub
337
- # llama-index-core
338
- # llama-index-legacy
339
- # tiktoken
340
- rich==13.7.1
341
- # via typer
342
- rpds-py==0.18.1
343
- # via
344
- # jsonschema
345
- # referencing
346
- ruff==0.4.9
347
- # via gradio
348
- semantic-version==2.10.0
349
- # via gradio
350
- shellingham==1.5.4
351
- # via typer
352
- six==1.16.0
353
- # via python-dateutil
354
- sniffio==1.3.1
355
- # via
356
- # anyio
357
- # httpx
358
- # openai
359
- soupsieve==2.5
360
- # via beautifulsoup4
361
- sqlalchemy==2.0.30
362
- # via
363
- # llama-index-core
364
- # llama-index-legacy
365
- starlette==0.37.2
366
- # via fastapi
367
- striprtf==0.0.26
368
- # via llama-index-readers-file
369
- tenacity==8.4.1
370
- # via
371
- # llama-index-core
372
- # llama-index-legacy
373
- tiktoken==0.7.0
374
- # via
375
- # llama-index-core
376
- # llama-index-legacy
377
- tomlkit==0.12.0
378
- # via gradio
379
- toolz==0.12.1
380
- # via altair
381
- tqdm==4.66.4
382
- # via
383
- # datasets
384
- # huggingface-hub
385
- # llama-index-core
386
- # nltk
387
- # openai
388
- typer==0.12.3
389
- # via
390
- # fastapi-cli
391
- # gradio
392
- typing-extensions==4.12.2
393
- # via
394
- # fastapi
395
- # gradio
396
- # gradio-client
397
- # huggingface-hub
398
- # llama-index-core
399
- # llama-index-legacy
400
- # openai
401
- # pydantic
402
- # pydantic-core
403
- # sqlalchemy
404
- # typer
405
- # typing-inspect
406
- typing-inspect==0.9.0
407
- # via
408
- # dataclasses-json
409
- # llama-index-core
410
- # llama-index-legacy
411
- tzdata==2024.1
412
- # via pandas
413
- ujson==5.10.0
414
- # via fastapi
415
- urllib3==2.2.2
416
- # via
417
- # gradio
418
- # requests
419
- uvicorn==0.30.1
420
- # via
421
- # fastapi
422
- # gradio
423
- uvloop==0.19.0
424
- # via uvicorn
425
- watchfiles==0.22.0
426
- # via uvicorn
427
- websockets==11.0.3
428
- # via
429
- # gradio-client
430
- # uvicorn
431
- wrapt==1.16.0
432
- # via
433
- # deprecated
434
- # llama-index-core
435
- xxhash==3.4.1
436
- # via datasets
437
- yarl==1.9.4
438
- # via aiohttp
 
1
+ datasets
2
+ gradio[oauth]<5
3
+ gradio_log
4
+ llama_index==0.11.22
5
+ docling
6
+ llama-index-readers-docling