lhoestq HF staff committed on
Commit
f721373
1 Parent(s): c70e770

add cc file

Browse files
CC-MAIN-20231128083443-20231128113443-00000.warc.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a754b9a848ffaf00ba1621ad4577a31ce3176b058c3ed21cb3665954b7ae7cf
3
+ size 1208969113
app.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from itertools import islice
2
+
3
+ import gradio as gr
4
+ from datatrove.executor.local import LocalPipelineExecutor
5
+ from datatrove.pipeline.base import PipelineStep
6
+ from datatrove.pipeline.extractors import Trafilatura
7
+ from datatrove.pipeline.filters import (
8
+ C4QualityFilter,
9
+ FineWebQualityFilter,
10
+ GopherQualityFilter,
11
+ GopherRepetitionFilter,
12
+ LanguageFilter,
13
+ URLFilter,
14
+ )
15
+ from datatrove.pipeline.readers import WarcReader
16
+ from datatrove.pipeline.writers.jsonl import JsonlWriter
17
+
18
+
19
+ def run(input):
20
+ return "wip"
21
+
22
+
23
+ demo = gr.Interface(run, inputs=[gr.Textbox()], outputs=[gr.Textbox()])
24
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ datatrove[all]