davidberenstein1957 HF staff committed on
Commit
08fabf7
1 Parent(s): f79ba24

feat: add docling support

Browse files
Files changed (4) hide show
  1. app.py +72 -19
  2. demo.py +8 -0
  3. requirements.in +4 -2
  4. requirements.txt +6 -438
app.py CHANGED
@@ -2,12 +2,24 @@ import logging
2
  from pathlib import Path
3
 
4
  import gradio as gr
 
5
  from datasets import Dataset
6
  from gradio_log import Log
7
  from huggingface_hub import DatasetCard
8
- from llama_index.core import SimpleDirectoryReader
9
  from llama_index.core.node_parser import SentenceSplitter
 
10
  from llama_index.core.schema import MetadataMode
 
 
 
 
 
 
 
 
 
 
 
11
  from tqdm.auto import tqdm
12
 
13
  log_file = "logs.txt"
@@ -22,8 +34,40 @@ def load_corpus(
22
  ):
23
  if verbose:
24
  gr.Info("Loading files...")
25
- reader = SimpleDirectoryReader(input_files=files)
26
- docs = reader.load_data()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  if split_sentences is False:
28
  gr.Info(
29
  "Skipping sentence splitting. Each file will be a single row in the dataset."
@@ -61,7 +105,10 @@ def upload_and_preview(
61
  split_sentences: bool = True,
62
  ):
63
  print("loading files")
64
- file_paths = [file.name for file in files]
 
 
 
65
 
66
  print("parsing into sentences")
67
  corpus = load_corpus(
@@ -159,17 +206,18 @@ def update_dataset_card(
159
  description = """Corpus Creator is a tool for transforming a collection of text files into a Hugging Face dataset, perfect for various natural language processing (NLP) tasks. Whether you're preparing data for synthetic generation, building pipelines, or setting up annotation tasks, this app simplifies the process.
160
 
161
  Key features:
162
- - 📁 Easy text file upload
163
  - ✂️ Customizable text chunking
164
  - 👁️ Instant dataset preview
165
- - 🚀 One-click upload to Hugging Face Hubub
166
 
167
- #### Powered by Llama Index
168
 
169
- Corpus Creator leverages the power of Llama Index, a data framework for LLM-based applications. Specifically, we use Llama Index's `SentenceSplitter` class to intelligently chunk your text. This ensures that your dataset is split in a way that preserves semantic meaning, making it ideal for downstream NLP tasks. [Learn more about Llama Index](https://www.llamaindex.ai/)
170
 
 
171
 
172
- Get started by uploading your files and see your corpus take shape!
173
 
174
  [View an example dataset](https://huggingface.co/datasets/davanstrien/MOH-Bethnal-Green) created with Corpus Creator.
175
  """
@@ -189,14 +237,19 @@ with gr.Blocks() as demo:
189
  gr.Markdown(
190
  "### 1. Upload Files\nClick 'Upload Files' to select text file(s). A preview will generate automatically"
191
  )
192
- with gr.Row():
193
- upload_button = gr.File(
194
- file_types=["text"],
195
- file_count="multiple",
196
- height=50,
197
- interactive=True,
198
- label="Upload Files",
199
- )
 
 
 
 
 
200
  gr.Markdown("""
201
  ### 2. Adjust Parameters for Chunking Text (Optional)
202
  Customize the chunk size, overlap, and sentence splitting option according to your requirements.
@@ -238,8 +291,8 @@ with gr.Blocks() as demo:
238
  with gr.Accordion("detailed logs", open=False):
239
  Log(log_file, dark=True, xterm_font_size=12)
240
 
241
- upload_button.upload(
242
- upload_and_preview,
243
  inputs=[upload_button, chunk_size, chunk_overlap, split_sentences],
244
  outputs=[state, corpus_preview_df, preview_summary],
245
  )
 
2
  from pathlib import Path
3
 
4
  import gradio as gr
5
+ import pandas as pd
6
  from datasets import Dataset
7
  from gradio_log import Log
8
  from huggingface_hub import DatasetCard
 
9
  from llama_index.core.node_parser import SentenceSplitter
10
+ from llama_index.core.readers import SimpleDirectoryReader
11
  from llama_index.core.schema import MetadataMode
12
+ from llama_index.readers.docling import DoclingReader
13
+ from llama_index.readers.file import (
14
+ EpubReader,
15
+ HWPReader,
16
+ ImageReader,
17
+ IPYNBReader,
18
+ MboxReader,
19
+ PandasCSVReader,
20
+ PandasExcelReader,
21
+ VideoAudioReader,
22
+ )
23
  from tqdm.auto import tqdm
24
 
25
  log_file = "logs.txt"
 
34
  ):
35
  if verbose:
36
  gr.Info("Loading files...")
37
+
38
+ docling_reader = DoclingReader()
39
+ try:
40
+ docs = []
41
+ for file in files:
42
+ docs.extend(docling_reader.load_data(file))
43
+ except Exception:
44
+ reader = SimpleDirectoryReader(
45
+ input_files=files,
46
+ file_extractor={
47
+ ".hwp": HWPReader,
48
+ ".pdf": docling_reader,
49
+ ".docx": docling_reader,
50
+ ".pptx": docling_reader,
51
+ ".ppt": docling_reader,
52
+ ".pptm": docling_reader,
53
+ ".gif": ImageReader,
54
+ ".jpg": ImageReader,
55
+ ".png": ImageReader,
56
+ ".jpeg": ImageReader,
57
+ ".webp": ImageReader,
58
+ ".mp3": VideoAudioReader,
59
+ ".mp4": VideoAudioReader,
60
+ ".csv": PandasCSVReader,
61
+ ".epub": EpubReader,
62
+ ".md": docling_reader,
63
+ ".mbox": MboxReader,
64
+ ".ipynb": IPYNBReader,
65
+ ".xls": PandasExcelReader,
66
+ ".xlsx": PandasExcelReader,
67
+ },
68
+ )
69
+ docs = reader.load_data()
70
+
71
  if split_sentences is False:
72
  gr.Info(
73
  "Skipping sentence splitting. Each file will be a single row in the dataset."
 
105
  split_sentences: bool = True,
106
  ):
107
  print("loading files")
108
+ if isinstance(files, pd.DataFrame):
109
+ file_paths = files["urls"].tolist()
110
+ else:
111
+ file_paths = [file.name for file in files]
112
 
113
  print("parsing into sentences")
114
  corpus = load_corpus(
 
206
  description = """Corpus Creator is a tool for transforming a collection of text files into a Hugging Face dataset, perfect for various natural language processing (NLP) tasks. Whether you're preparing data for synthetic generation, building pipelines, or setting up annotation tasks, this app simplifies the process.
207
 
208
  Key features:
209
+ - 🗂️ Reads popular document formats (PDF, DOCX, PPTX, HTML, AsciiDoc, Markdown)
210
  - ✂️ Customizable text chunking
211
  - 👁️ Instant dataset preview
212
+ - 🚀 One-click upload to Hugging Face Hub
213
 
214
+ #### Powered by Llama Index and Docling
215
 
216
+ Corpus Creator leverages the power of Llama Index, a data framework for LLM-based applications. Specifically, we use Llama Index's `SentenceSplitter` class to intelligently chunk your text. This ensures that your dataset is split in a way that preserves semantic meaning, making it ideal for downstream NLP tasks. [Learn more about Llama Index](https://www.llamaindex.ai/).
217
 
218
+ Docling is a tool for converting documents to text. It supports a wide range of document formats, including PDF, DOCX, PPTX, Images, HTML, AsciiDoc, and Markdown. [Learn more about Docling](https://ds4sd.github.io/docling/).
219
 
220
+ Get started by uploading your files and see your corpus take shape!
221
 
222
  [View an example dataset](https://huggingface.co/datasets/davanstrien/MOH-Bethnal-Green) created with Corpus Creator.
223
  """
 
237
  gr.Markdown(
238
  "### 1. Upload Files\nClick 'Upload Files' to select text file(s). A preview will generate automatically"
239
  )
240
+ with gr.Tab():
241
+ with gr.Row():
242
+ upload_button = gr.File(
243
+ file_types=["text"],
244
+ file_count="multiple",
245
+ height=50,
246
+ interactive=True,
247
+ label="Upload Files",
248
+ )
249
+ with gr.Tab():
250
+ with gr.Row():
251
+ urls = gr.Dataframe(label="URL", headers=["urls"], interactive=True)
252
+ upload_button_files = gr.Button("Upload URLs")
253
  gr.Markdown("""
254
  ### 2. Adjust Parameters for Chunking Text (Optional)
255
  Customize the chunk size, overlap, and sentence splitting option according to your requirements.
 
291
  with gr.Accordion("detailed logs", open=False):
292
  Log(log_file, dark=True, xterm_font_size=12)
293
 
294
+ gr.on(
295
+ triggers=[upload_button.upload, upload_button_files.click],
296
  inputs=[upload_button, chunk_size, chunk_overlap, split_sentences],
297
  outputs=[state, corpus_preview_df, preview_summary],
298
  )
demo.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from docling.document_converter import DocumentConverter
2
+
3
+ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
4
+ converter = DocumentConverter()
5
+ result = converter.convert(source)
6
+ print(
7
+ result.document.export_to_markdown()
8
+ ) # output: "### Docling Technical Report[...]"
requirements.in CHANGED
@@ -1,4 +1,6 @@
1
  datasets
2
- gradio[oauth]==4.36.1
3
  gradio_log
4
- llama_index
 
 
 
1
  datasets
2
+ gradio[oauth]<5
3
  gradio_log
4
+ llama_index==0.11.22
5
+ docling
6
+ llama-index-readers-docling
requirements.txt CHANGED
@@ -1,438 +1,6 @@
1
- # This file was autogenerated by uv via the following command:
2
- # uv pip compile requirements.in -o requirements.txt
3
- aiofiles==23.2.1
4
- # via gradio
5
- aiohttp==3.9.5
6
- # via
7
- # datasets
8
- # fsspec
9
- # llama-index-core
10
- # llama-index-legacy
11
- aiosignal==1.3.1
12
- # via aiohttp
13
- altair==5.3.0
14
- # via gradio
15
- annotated-types==0.7.0
16
- # via pydantic
17
- anyio==4.4.0
18
- # via
19
- # httpx
20
- # openai
21
- # starlette
22
- # watchfiles
23
- attrs==23.2.0
24
- # via
25
- # aiohttp
26
- # jsonschema
27
- # referencing
28
- authlib==1.3.1
29
- # via gradio
30
- beautifulsoup4==4.12.3
31
- # via llama-index-readers-file
32
- certifi==2024.6.2
33
- # via
34
- # httpcore
35
- # httpx
36
- # requests
37
- cffi==1.16.0
38
- # via cryptography
39
- charset-normalizer==3.3.2
40
- # via requests
41
- click==8.1.7
42
- # via
43
- # nltk
44
- # typer
45
- # uvicorn
46
- contourpy==1.2.1
47
- # via matplotlib
48
- cryptography==42.0.8
49
- # via authlib
50
- cycler==0.12.1
51
- # via matplotlib
52
- dataclasses-json==0.6.7
53
- # via
54
- # llama-index-core
55
- # llama-index-legacy
56
- datasets==2.20.0
57
- # via -r requirements.in
58
- deprecated==1.2.14
59
- # via
60
- # llama-index-core
61
- # llama-index-legacy
62
- dill==0.3.8
63
- # via
64
- # datasets
65
- # multiprocess
66
- dirtyjson==1.0.8
67
- # via
68
- # llama-index-core
69
- # llama-index-legacy
70
- distro==1.9.0
71
- # via openai
72
- dnspython==2.6.1
73
- # via email-validator
74
- email-validator==2.1.2
75
- # via fastapi
76
- fastapi==0.111.0
77
- # via gradio
78
- fastapi-cli==0.0.4
79
- # via fastapi
80
- ffmpy==0.3.2
81
- # via gradio
82
- filelock==3.15.1
83
- # via
84
- # datasets
85
- # huggingface-hub
86
- fonttools==4.53.0
87
- # via matplotlib
88
- frozenlist==1.4.1
89
- # via
90
- # aiohttp
91
- # aiosignal
92
- fsspec==2024.5.0
93
- # via
94
- # datasets
95
- # gradio-client
96
- # huggingface-hub
97
- # llama-index-core
98
- # llama-index-legacy
99
- gradio==4.36.1
100
- # via
101
- # -r requirements.in
102
- # gradio-log
103
- gradio-client==1.0.1
104
- # via gradio
105
- gradio-log==0.0.4
106
- # via -r requirements.in
107
- greenlet==3.0.3
108
- # via sqlalchemy
109
- h11==0.14.0
110
- # via
111
- # httpcore
112
- # uvicorn
113
- httpcore==1.0.5
114
- # via httpx
115
- httptools==0.6.1
116
- # via uvicorn
117
- httpx==0.27.0
118
- # via
119
- # fastapi
120
- # gradio
121
- # gradio-client
122
- # llama-index-core
123
- # llama-index-legacy
124
- # llamaindex-py-client
125
- # openai
126
- huggingface-hub==0.23.4
127
- # via
128
- # datasets
129
- # gradio
130
- # gradio-client
131
- idna==3.7
132
- # via
133
- # anyio
134
- # email-validator
135
- # httpx
136
- # requests
137
- # yarl
138
- importlib-resources==6.4.0
139
- # via gradio
140
- itsdangerous==2.2.0
141
- # via gradio
142
- jinja2==3.1.4
143
- # via
144
- # altair
145
- # fastapi
146
- # gradio
147
- joblib==1.4.2
148
- # via nltk
149
- jsonschema==4.22.0
150
- # via altair
151
- jsonschema-specifications==2023.12.1
152
- # via jsonschema
153
- kiwisolver==1.4.5
154
- # via matplotlib
155
- llama-index==0.10.45
156
- # via -r requirements.in
157
- llama-index-agent-openai==0.2.7
158
- # via
159
- # llama-index
160
- # llama-index-program-openai
161
- llama-index-cli==0.1.12
162
- # via llama-index
163
- llama-index-core==0.10.44
164
- # via
165
- # llama-index
166
- # llama-index-agent-openai
167
- # llama-index-cli
168
- # llama-index-embeddings-openai
169
- # llama-index-indices-managed-llama-cloud
170
- # llama-index-llms-openai
171
- # llama-index-multi-modal-llms-openai
172
- # llama-index-program-openai
173
- # llama-index-question-gen-openai
174
- # llama-index-readers-file
175
- # llama-index-readers-llama-parse
176
- # llama-parse
177
- llama-index-embeddings-openai==0.1.10
178
- # via
179
- # llama-index
180
- # llama-index-cli
181
- llama-index-indices-managed-llama-cloud==0.1.6
182
- # via llama-index
183
- llama-index-legacy==0.9.48
184
- # via llama-index
185
- llama-index-llms-openai==0.1.22
186
- # via
187
- # llama-index
188
- # llama-index-agent-openai
189
- # llama-index-cli
190
- # llama-index-multi-modal-llms-openai
191
- # llama-index-program-openai
192
- # llama-index-question-gen-openai
193
- llama-index-multi-modal-llms-openai==0.1.6
194
- # via llama-index
195
- llama-index-program-openai==0.1.6
196
- # via
197
- # llama-index
198
- # llama-index-question-gen-openai
199
- llama-index-question-gen-openai==0.1.3
200
- # via llama-index
201
- llama-index-readers-file==0.1.25
202
- # via llama-index
203
- llama-index-readers-llama-parse==0.1.4
204
- # via llama-index
205
- llama-parse==0.4.4
206
- # via llama-index-readers-llama-parse
207
- llamaindex-py-client==0.1.19
208
- # via
209
- # llama-index-core
210
- # llama-index-indices-managed-llama-cloud
211
- markdown-it-py==3.0.0
212
- # via rich
213
- markupsafe==2.1.5
214
- # via
215
- # gradio
216
- # jinja2
217
- marshmallow==3.21.3
218
- # via dataclasses-json
219
- matplotlib==3.9.0
220
- # via gradio
221
- mdurl==0.1.2
222
- # via markdown-it-py
223
- multidict==6.0.5
224
- # via
225
- # aiohttp
226
- # yarl
227
- multiprocess==0.70.16
228
- # via datasets
229
- mypy-extensions==1.0.0
230
- # via typing-inspect
231
- nest-asyncio==1.6.0
232
- # via
233
- # llama-index-core
234
- # llama-index-legacy
235
- networkx==3.3
236
- # via
237
- # llama-index-core
238
- # llama-index-legacy
239
- nltk==3.8.1
240
- # via
241
- # llama-index-core
242
- # llama-index-legacy
243
- numpy==2.0.0
244
- # via
245
- # altair
246
- # contourpy
247
- # datasets
248
- # gradio
249
- # llama-index-core
250
- # llama-index-legacy
251
- # matplotlib
252
- # pandas
253
- # pyarrow
254
- openai==1.34.0
255
- # via
256
- # llama-index-agent-openai
257
- # llama-index-core
258
- # llama-index-legacy
259
- orjson==3.10.5
260
- # via
261
- # fastapi
262
- # gradio
263
- packaging==24.1
264
- # via
265
- # altair
266
- # datasets
267
- # gradio
268
- # gradio-client
269
- # huggingface-hub
270
- # marshmallow
271
- # matplotlib
272
- pandas==2.2.2
273
- # via
274
- # altair
275
- # datasets
276
- # gradio
277
- # llama-index-core
278
- # llama-index-legacy
279
- pillow==10.3.0
280
- # via
281
- # gradio
282
- # llama-index-core
283
- # matplotlib
284
- pyarrow==16.1.0
285
- # via datasets
286
- pyarrow-hotfix==0.6
287
- # via datasets
288
- pycparser==2.22
289
- # via cffi
290
- pydantic==2.7.4
291
- # via
292
- # fastapi
293
- # gradio
294
- # llamaindex-py-client
295
- # openai
296
- pydantic-core==2.18.4
297
- # via pydantic
298
- pydub==0.25.1
299
- # via gradio
300
- pygments==2.18.0
301
- # via rich
302
- pyparsing==3.1.2
303
- # via matplotlib
304
- pypdf==4.2.0
305
- # via llama-index-readers-file
306
- python-dateutil==2.9.0.post0
307
- # via
308
- # matplotlib
309
- # pandas
310
- python-dotenv==1.0.1
311
- # via uvicorn
312
- python-multipart==0.0.9
313
- # via
314
- # fastapi
315
- # gradio
316
- pytz==2024.1
317
- # via pandas
318
- pyyaml==6.0.1
319
- # via
320
- # datasets
321
- # gradio
322
- # huggingface-hub
323
- # llama-index-core
324
- # uvicorn
325
- referencing==0.35.1
326
- # via
327
- # jsonschema
328
- # jsonschema-specifications
329
- regex==2024.5.15
330
- # via
331
- # nltk
332
- # tiktoken
333
- requests==2.32.3
334
- # via
335
- # datasets
336
- # huggingface-hub
337
- # llama-index-core
338
- # llama-index-legacy
339
- # tiktoken
340
- rich==13.7.1
341
- # via typer
342
- rpds-py==0.18.1
343
- # via
344
- # jsonschema
345
- # referencing
346
- ruff==0.4.9
347
- # via gradio
348
- semantic-version==2.10.0
349
- # via gradio
350
- shellingham==1.5.4
351
- # via typer
352
- six==1.16.0
353
- # via python-dateutil
354
- sniffio==1.3.1
355
- # via
356
- # anyio
357
- # httpx
358
- # openai
359
- soupsieve==2.5
360
- # via beautifulsoup4
361
- sqlalchemy==2.0.30
362
- # via
363
- # llama-index-core
364
- # llama-index-legacy
365
- starlette==0.37.2
366
- # via fastapi
367
- striprtf==0.0.26
368
- # via llama-index-readers-file
369
- tenacity==8.4.1
370
- # via
371
- # llama-index-core
372
- # llama-index-legacy
373
- tiktoken==0.7.0
374
- # via
375
- # llama-index-core
376
- # llama-index-legacy
377
- tomlkit==0.12.0
378
- # via gradio
379
- toolz==0.12.1
380
- # via altair
381
- tqdm==4.66.4
382
- # via
383
- # datasets
384
- # huggingface-hub
385
- # llama-index-core
386
- # nltk
387
- # openai
388
- typer==0.12.3
389
- # via
390
- # fastapi-cli
391
- # gradio
392
- typing-extensions==4.12.2
393
- # via
394
- # fastapi
395
- # gradio
396
- # gradio-client
397
- # huggingface-hub
398
- # llama-index-core
399
- # llama-index-legacy
400
- # openai
401
- # pydantic
402
- # pydantic-core
403
- # sqlalchemy
404
- # typer
405
- # typing-inspect
406
- typing-inspect==0.9.0
407
- # via
408
- # dataclasses-json
409
- # llama-index-core
410
- # llama-index-legacy
411
- tzdata==2024.1
412
- # via pandas
413
- ujson==5.10.0
414
- # via fastapi
415
- urllib3==2.2.2
416
- # via
417
- # gradio
418
- # requests
419
- uvicorn==0.30.1
420
- # via
421
- # fastapi
422
- # gradio
423
- uvloop==0.19.0
424
- # via uvicorn
425
- watchfiles==0.22.0
426
- # via uvicorn
427
- websockets==11.0.3
428
- # via
429
- # gradio-client
430
- # uvicorn
431
- wrapt==1.16.0
432
- # via
433
- # deprecated
434
- # llama-index-core
435
- xxhash==3.4.1
436
- # via datasets
437
- yarl==1.9.4
438
- # via aiohttp
 
1
+ datasets
2
+ gradio[oauth]<5
3
+ gradio_log
4
+ llama_index==0.11.22
5
+ docling
6
+ llama-index-readers-docling