lhoestq's picture
lhoestq HF staff
add cc file
f721373
raw
history blame
620 Bytes
from itertools import islice
import gradio as gr
from datatrove.executor.local import LocalPipelineExecutor
from datatrove.pipeline.base import PipelineStep
from datatrove.pipeline.extractors import Trafilatura
from datatrove.pipeline.filters import (
C4QualityFilter,
FineWebQualityFilter,
GopherQualityFilter,
GopherRepetitionFilter,
LanguageFilter,
URLFilter,
)
from datatrove.pipeline.readers import WarcReader
from datatrove.pipeline.writers.jsonl import JsonlWriter
def run(input: str) -> str:
    """Return the placeholder response shown in the output textbox.

    Args:
        input: Value of the input component. It is currently ignored.
            NOTE: the name shadows the ``input`` builtin; it is kept so
            that any keyword-style caller keeps working.

    Returns:
        The fixed string ``"wip"``.
    """
    return "wip"
# Expose `run` through a single-textbox-in / single-textbox-out Gradio
# interface, then start serving it.
demo = gr.Interface(
    fn=run,
    inputs=[gr.Textbox()],
    outputs=[gr.Textbox()],
)
demo.launch()