Spaces:
Runtime error
Runtime error
File size: 1,269 Bytes
bfc0ec6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
"""Compute near duplicates for a dataset."""
from typing import Iterable, Optional, cast
from pydantic import Field as PydanticField
from typing_extensions import override
from ..schema import Field, Item, RichData, SignalInputType, field
from ..signal import TextSignal
from .minhash_dup import find_clusters
CLUSTER_KEY = 'cluster_id'
class NearDuplicateSignal(TextSignal):
"""Find near duplicate documents in a dataset using n-grams.
<br/>
Documents are fingerprinted using n-grams with
[minhash LSH](https://en.wikipedia.org/wiki/MinHash). Documents are assigned the same cluster id
if their Jaccard similarity is above the provided threshold.
"""
name = 'near_dup'
display_name = 'Near duplicate documents'
input_type = SignalInputType.TEXT
threshold: float = PydanticField(
default=0.85,
description='The similarity threshold for detecting a near duplicate.',
)
@override
def fields(self) -> Field:
return field(fields={CLUSTER_KEY: field('uint32', categorical=True)})
@override
def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
cluster_ids = find_clusters(cast(Iterable[str], data), threshold=self.threshold)
for cluster_id in cluster_ids:
yield {CLUSTER_KEY: cluster_id}
|