File size: 1,076 Bytes
c80917c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from random import uniform
import numpy as np
from collections import OrderedDict, defaultdict
from itertools import tee
import time

# -----------------------------------------------
def find_ngrams(input_list, n):
    """Return the n-grams of ``input_list`` as tuples of consecutive items.

    The sequence is sliced at offsets ``0 .. n-1`` and the slices are zipped
    together, so iteration stops with the shortest slice: a sequence with
    fewer than ``n`` items (or ``n == 0``) produces no n-grams.
    """
    shifted_views = (input_list[start:] for start in range(n))
    return zip(*shifted_views)

def compute_div_n(caps, n=1):
  """Per-key n-gram diversity of whitespace-tokenised captions.

  For every key of ``caps``, each caption in ``caps[key]`` is split on
  whitespace, the distinct n-grams over all of that key's captions are
  collected, and their count is divided by the key's total token count
  (plus 1e-6 so a key without tokens does not divide by zero).

  Returns a tuple ``(mean_ratio, ratios)`` where ``ratios`` is a NumPy array
  with one entry per key, in iteration order of ``caps``.
  """
  ratios = []
  for key in caps:
    token_lists = [caption.split() for caption in caps[key]]
    token_total = float(sum(len(tokens) for tokens in token_lists))
    distinct = set()
    for tokens in token_lists:
      distinct.update(find_ngrams(tokens, n))
    ratios.append(float(len(distinct)) / (1e-6 + token_total))
  ratios = np.array(ratios)
  return ratios.mean(), ratios

def compute_global_div_n(caps, n=1):
  """Corpus-level n-gram diversity over all captions of all keys.

  Every caption in every ``caps[key]`` is split on whitespace and its
  n-grams are pooled into one set. For ``n == 1`` the score is the raw
  number of distinct unigrams; for any other ``n`` it is the number of
  distinct n-grams divided by the total token count (plus 1e-6).

  Returns a tuple ``(score, repeated)`` where ``repeated`` is a NumPy array
  holding ``score`` once per key of ``caps``.
  """
  distinct = set()
  token_total = 0.
  for key in caps:
    for caption in caps[key]:
      words = caption.split()
      token_total += len(words)
      distinct.update(find_ngrams(words, n))
  n_distinct = float(len(distinct))
  # Unigrams report the distinct count itself; higher orders are normalised.
  score = n_distinct if n == 1 else n_distinct / (1e-6 + float(token_total))
  return score, np.repeat(np.array([score]), len(caps))