const clustering = (function() { var clusteringCount = 0; return { nClusters: 5, createClustering: function () { var clusterKeywords = $('#keywords_checkbox').is(':checked'); var clusterAuthor = $('#author_checkbox').is(':checked'); if (!clusterKeywords && !clusterAuthor) { page.notify('Please select at least one of the clustering criteria checkboxes.', 'error'); return; } this.nClusters = Math.min(Object.keys(bib.filteredEntries).length, this.nClusters); var clusteringName = String.fromCharCode(65 + clusteringCount); if (clusteringCount == 0) { bib.clusters = {}; bib.clusterAssignment = {}; bib.clusteringEntityTerms = {}; } var allTerms = {}; var docTerms = {}; $.each(bib.filteredEntries, function (id, entry) { docTerms[id] = []; var terms = []; if (clusterAuthor && bib.parsedEntries[id].author) { terms = terms.concat(bib.parsedEntries[id].author); } if (clusterKeywords && bib.parsedEntries[id].keywords) { terms = terms.concat(bib.parsedEntries[id].keywords) } //if (terms.length > 0) { //var splitTerms = entry.keywords.split(/(,\s)+/) $.each(terms, function (i, term) { //if (term.indexOf(',') < 0) { //term = term.replace(/\\/g, '').toLowerCase().trim(); var category = term.substr(0, term.indexOf(':')); if (!category || !bib.tagCategories[category] || !bib.tagCategories[category]['excludeForSimilarity']) { docTerms[id].push(term); allTerms[term] = true; } //} }); //} }); bib.clusteringEntityTerms[clusteringName] = docTerms; var labels = []; var vectors = []; var i = 0; $.each(docTerms, function (id, terms) { labels[i] = id; vectors[i] = []; var j = 0; for (var term in allTerms) { vectors[i][j] = terms.indexOf(term) >= 0 ? 1 : 0; j++; } i++; }); var clusters = window.figue.kmeans(this.nClusters, vectors); bib.clusters[clusteringName] = {}; $.each(clusters.centroids, function (i, centroid) { var cluster = {}; var maxTerms = []; while (maxTerms.length < centroid.length / 20) { var max = 0; var currentMaxTerms = []; $.each(centroid, function (j, value) { if (maxTerms.indexOf(j) < 0) { if (value > max) { max = value; currentMaxTerms = []; } if (value >= max) { currentMaxTerms.push(j); } } }); $.merge(maxTerms, currentMaxTerms); } var terms = []; $.each(maxTerms, function (i, j) { terms.push(Object.keys(allTerms)[j] + ' (' + centroid[j].toFixed(2) + ')'); }); clustering.terms = terms; bib.clusters[clusteringName][i + 1] = cluster; }); $.each(clusters.assignments, function (i, clusterID) { var id = labels[i]; if (!bib.clusterAssignment[id]) { bib.clusterAssignment[id] = []; } bib.clusterAssignment[id].push(clusteringName + '.' + (clusterID + 1)); }); clusteringCount++; }, updateClusters: function () { if (!bib.clusterAssignment) { return } var clusterSelectorSimilarities = {}; var clusterSizes = {}; var clusterTermFrequencies = {}; var termFrequencies = {}; var clusterTotalTermFrequency = {}; $.each(bib.filteredEntries, function (id, field) { if (bib.clusterAssignment[id]) { $.each(bib.clusterAssignment[id], function (j, clusterName) { var clusteringName = clusterName.substring(0, 1); if (!clusterSizes[clusterName]) { clusterSizes[clusterName] = 0; } clusterSizes[clusterName]++; if (!clusterSelectorSimilarities[clusterName]) { clusterSelectorSimilarities[clusterName] = []; } $.each(selectors.getSelectors(), function (i, selector) { if (selector && !selector['lock']) { if (!clusterSelectorSimilarities[clusterName][i]) { clusterSelectorSimilarities[clusterName][i] = 0.0; } clusterSelectorSimilarities[clusterName][i] += bib.entrySelectorSimilarities[id][i]; } }); $.each(bib.clusteringEntityTerms[clusteringName][id], function (i, term) { if (!clusterTermFrequencies[clusterName]) { clusterTermFrequencies[clusterName] = {}; } if (!clusterTermFrequencies[clusterName][term]) { clusterTermFrequencies[clusterName][term] = 0; } clusterTermFrequencies[clusterName][term]++; if (!termFrequencies[term]) { termFrequencies[term] = 0; } termFrequencies[term]++; if (!clusterTotalTermFrequency[clusterName]) { clusterTotalTermFrequency[clusterName] = 0; } clusterTotalTermFrequency[clusterName]++; }); }); } }); $.each(clusterSizes, function (clusterName, size) { $.each(selectors.getSelectors(), function (i, selector) { if (selector && !selector['lock']) { clusterSelectorSimilarities[clusterName][i] /= size; } }); }); var clusteringsDiv = $('#clusterings'); clusteringsDiv.empty(); if (bib.clusters) { $.each(bib.clusters, function (clusteringName, clustering) { var clusteringDiv = $('
', { class: 'clustering' }).appendTo(clusteringsDiv); var clusteringLabel = $('Clustering ' + clusteringName + ':').appendTo(clusteringDiv); var closeButton = $('
', { class: 'button tooltip small', title: 'discard clustering' }).appendTo(clusteringLabel); $('', { class: 'symbol', text: 'X' }).appendTo(closeButton); closeButton.tooltipster({ theme: 'tooltipster-survis' }); closeButton.click(function () { delete bib.clusters[clusteringName]; page.update(); }); var termDocumentFrequency = {}; $.each(clustering, function (clusterId, cluster) { var clusterName = clusteringName + '.' + clusterId; if (clusterTermFrequencies[clusterName]) { $.each(clusterTermFrequencies[clusterName], function (term, frequency) { if (!termDocumentFrequency[term]) { termDocumentFrequency[term] = 0; } termDocumentFrequency[term]++; }); } }); $.each(clustering, function (clusterId, cluster) { var clusterName = clusteringName + '.' + clusterId; if (!clusterSizes[clusterName]) { return; } var tfidf = []; var i = 0; $.each(clusterTermFrequencies[clusterName], function (term, frequency) { var tf = frequency / clusterTotalTermFrequency[clusterName]; var idf = Math.log(Object.keys(clustering).length / termDocumentFrequency[term]); tfidf[i] = { name: term, value: tf * idf }; i++; }); tfidf.sort(function (a, b) { return b.value - a.value; }); var clusterDiv = $('
', { class: 'tooltip tag authorized ' + tagUtil.getFrequencyClass(clusterSizes[clusterName]) }).appendTo(clusteringDiv); var sparklineDiv = $('
', { class: 'vis sparkline' }).appendTo(clusterDiv); selectors.vis(sparklineDiv, clusterSelectorSimilarities[clusterName]); $('', { class: 'text', text: clusterName }).appendTo(clusterDiv); $('', { class: 'tag_frequency', text: clusterSizes[clusterName] }).appendTo(clusterDiv); var termsDiv = $('
', { class: 'terms' }).appendTo(clusterDiv); for (i = 0; i < Math.min(3, tfidf.length); i++) { $('
', { class: 'term', html: latexUtil.latexToHtml(tfidf[i].name) }).appendTo(termsDiv); } clusterDiv.click(function (event) { selectors.toggleSelector('cluster', clusterName, event); }); var tooltipDiv = $('
'); $('

cluster: ' + clusterName + '

').appendTo(tooltipDiv); $('
# publications: ' + clusterSizes[clusterName] + '
').appendTo(tooltipDiv); var termDetailsDiv = $('
related terms:
', { class: 'terms' }).appendTo(tooltipDiv); for (i = 0; i < Math.min(10, tfidf.length); i++) { $('
', { class: 'term', html: latexUtil.latexToHtml(tfidf[i].name)//+'('+tfidf[i].value.toFixed(3)+')' }).appendTo(termDetailsDiv); } var totalSimilarity = selectors.computeTotalSimilarity(clusterSelectorSimilarities[clusterName]); if (selectors.getNActiveSelectors() > 0) { $('
selector agreement: ' + totalSimilarity.toFixed(2) + '
').appendTo(tooltipDiv); if (totalSimilarity > 0) { var visDiv = $('
', { class: 'vis' }).appendTo(tooltipDiv); selectors.vis(visDiv, clusterSelectorSimilarities[clusterName]); } } clusterDiv.tooltipster({ content: $(tooltipDiv), theme: 'tooltipster-survis' }); }); }); } } } })();