|
from itertools import islice |
|
|
|
import gradio as gr |
|
from datatrove.executor.local import LocalPipelineExecutor |
|
from datatrove.pipeline.base import PipelineStep |
|
from datatrove.pipeline.extractors import Trafilatura |
|
from datatrove.pipeline.filters import ( |
|
C4QualityFilter, |
|
FineWebQualityFilter, |
|
GopherQualityFilter, |
|
GopherRepetitionFilter, |
|
LanguageFilter, |
|
URLFilter, |
|
) |
|
from datatrove.pipeline.readers import WarcReader |
|
from datatrove.pipeline.writers.jsonl import JsonlWriter |
|
|
|
|
|
def run(input):  # NOTE: `input` shadows the builtin; kept so callers are unaffected.
    """Placeholder callback that ignores its argument.

    Args:
        input: Value supplied by the caller; currently unused.

    Returns:
        The literal string ``"wip"`` for every input.
    """
    return "wip"
|
|
|
|
|
# Build the UI: `run` is the handler, with one Textbox input and one Textbox output.
demo = gr.Interface(run, inputs=[gr.Textbox()], outputs=[gr.Textbox()])

# Launch happens at module level, so it runs whenever this file is executed or imported.
demo.launch()
|
|