polinaeterna HF staff committed on
Commit
12a4d67
•
1 Parent(s): 456fc29

add checker

Files changed (2)
  1. app.py +86 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,86 @@
+ import gradio as gr
+ import polars as pl
+ from gradio_huggingfacehub_search import HuggingfaceHubSearch
+ import torch
+ import spaces
+ from torch import nn
+ from transformers import AutoModel, AutoTokenizer, AutoConfig
+ from huggingface_hub import PyTorchModelHubMixin
+ import pandas as pd
+
+
+ class QualityModel(nn.Module, PyTorchModelHubMixin):
+     def __init__(self, config):
+         super().__init__()
+         self.model = AutoModel.from_pretrained(config["base_model"])
+         self.dropout = nn.Dropout(config["fc_dropout"])
+         self.fc = nn.Linear(self.model.config.hidden_size, len(config["id2label"]))
+
+     def forward(self, input_ids, attention_mask):
+         features = self.model(
+             input_ids=input_ids, attention_mask=attention_mask
+         ).last_hidden_state
+         dropped = self.dropout(features)
+         outputs = self.fc(dropped)
+         # classify from the first ([CLS]) token representation
+         return torch.softmax(outputs[:, 0, :], dim=1)
+
+
+ device = "cuda"
+ config = AutoConfig.from_pretrained("nvidia/quality-classifier-deberta")
+ tokenizer = AutoTokenizer.from_pretrained("nvidia/quality-classifier-deberta")
+ model = QualityModel.from_pretrained("nvidia/quality-classifier-deberta").to(device)
+ model.eval()
+
+
+ @spaces.GPU
+ def predict(texts: list[str]):
+     inputs = tokenizer(
+         texts, return_tensors="pt", padding="longest", truncation=True
+     ).to(device)
+     outputs = model(inputs["input_ids"], inputs["attention_mask"])
+     predicted_classes = torch.argmax(outputs, dim=1)
+     predicted_domains = [
+         config.id2label[class_idx.item()] for class_idx in predicted_classes.cpu().numpy()
+     ]
+     return predicted_domains
+
+
+ def run_quality_check(dataset, config, column, n_samples):
+     # "~parquet" is the revision alias for the Hub's auto-converted refs/convert/parquet branch
+     data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
+     texts = data[column].to_list()
+     predictions = predict(texts[:int(n_samples)])
+     return pd.DataFrame({"quality": predictions}).value_counts().reset_index()
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# 💫 Dataset Quality Checker 💫")
+     dataset_name = HuggingfaceHubSearch(
+         label="Hub Dataset ID",
+         placeholder="Search for dataset id on Huggingface",
+         search_type="dataset",
+         value="HuggingFaceFW/fineweb",
+     )
+     config_name = "default"
+
+     @gr.render(inputs=dataset_name)
+     def embed(name):
+         html_code = f"""
+         <iframe
+             src="https://huggingface.co/datasets/{name}/embed/viewer/{config_name}/train"
+             frameborder="0"
+             width="100%"
+             height="700px"
+         ></iframe>
+         """
+         gr.HTML(value=html_code)
+
+     text_column = gr.Textbox(placeholder="text", label="Text column name to check (data must be non-nested, raw texts!)")
+     n_samples = gr.Number(value=20, precision=0, label="Number of first samples to check")
+     gr_check_btn = gr.Button("Check Dataset")
+     plot = gr.BarPlot(x="quality", y="count")
+     # config_name is a plain string, so pass it through gr.State to use it as an event input
+     gr_check_btn.click(run_quality_check, inputs=[dataset_name, gr.State(config_name), text_column, n_samples], outputs=plot)
+
+ demo.launch()
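
For reference, run_quality_check reads the Hub's auto-converted parquet export of the dataset. A minimal standalone sketch of that read path, assuming the dataset has a default config with a train split (the "~parquet" revision alias and the 0000.parquet shard name are the parquet converter's conventions; hf:// paths are resolved by huggingface_hub's HfFileSystem via fsspec):

import polars as pl

# "~parquet" resolves to the auto-converted refs/convert/parquet branch;
# shards under {config}/{split}/ are numbered 0000.parquet, 0001.parquet, ...
data = pl.read_parquet(
    "hf://datasets/HuggingFaceFW/fineweb@~parquet/default/train/0000.parquet",
    columns=["text"],
)
print(data.head())
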
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio_huggingfacehub_search==0.0.7
+ transformers
+ polars
+ torch
+ huggingface_hub