nsthorat-lilac's picture
Duplicate from lilacai/nikhil_staging
bfc0ec6
raw
history blame
1.27 kB
"""Compute near duplicates for a dataset."""
from typing import Iterable, Optional, cast
from pydantic import Field as PydanticField
from typing_extensions import override
from ..schema import Field, Item, RichData, SignalInputType, field
from ..signal import TextSignal
from .minhash_dup import find_clusters
# Key under which the near-duplicate signal stores a document's cluster id, both in the
# declared output schema and in every item it emits.
CLUSTER_KEY = 'cluster_id'
class NearDuplicateSignal(TextSignal):
  """Find near duplicate documents in a dataset using n-grams.

  <br/>

  Documents are fingerprinted using n-grams with
  [minhash LSH](https://en.wikipedia.org/wiki/MinHash). Documents are assigned the same cluster id
  if their Jaccard similarity is above the provided threshold.
  """

  # Registry / UI metadata for this signal.
  name = 'near_dup'
  display_name = 'Near duplicate documents'

  input_type = SignalInputType.TEXT

  # User-configurable similarity cutoff, forwarded unchanged to `find_clusters`.
  threshold: float = PydanticField(
    default=0.85,
    description='The similarity threshold for detecting a near duplicate.',
  )

  @override
  def fields(self) -> Field:
    """Describe the output schema: a single categorical `uint32` field under `CLUSTER_KEY`."""
    cluster_id_field = field('uint32', categorical=True)
    return field(fields={CLUSTER_KEY: cluster_id_field})

  @override
  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
    """Lazily yield one `{CLUSTER_KEY: cluster_id}` item per id produced by `find_clusters`.

    Args:
      data: The documents to cluster. They are passed to `find_clusters` as strings; the cast
        is for the type checker only and performs no runtime conversion.

    Yields:
      A fresh dict for each cluster id returned by `find_clusters`, in the order it returns them.
    """
    documents = cast(Iterable[str], data)
    # This stays a generator function, so `find_clusters` is not invoked until the caller
    # starts iterating the result.
    yield from (
      {CLUSTER_KEY: assigned_id}
      for assigned_id in find_clusters(documents, threshold=self.threshold)
    )