#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html

"""This module implements the concept of a Dictionary -- a mapping between words and their integer ids."""

from collections import defaultdict
from collections.abc import Mapping
import logging
import itertools
from typing import Optional, List, Tuple

from gensim import utils

logger = logging.getLogger(__name__)


class Dictionary(utils.SaveLoad, Mapping):
    """Dictionary encapsulates the mapping between normalized words and their integer ids.

    Notable instance attributes:

    Attributes
    ----------
    token2id : dict of (str, int)
        token -> token_id. I.e. the reverse mapping to `self[token_id]`.
    cfs : dict of (int, int)
        Collection frequencies: token_id -> how many instances of this token are contained in the documents.
    dfs : dict of (int, int)
        Document frequencies: token_id -> how many documents contain this token.
    num_docs : int
        Number of documents processed.
    num_pos : int
        Total number of corpus positions (number of processed words).
    num_nnz : int
        Total number of non-zeroes in the BOW matrix (sum of the number of unique
        words per document over the entire corpus).

    """
    def __init__(self, documents=None, prune_at=2000000):
        """

        Parameters
        ----------
        documents : iterable of iterable of str, optional
            Documents to be used to initialize the mapping and collect corpus statistics.
        prune_at : int, optional
            Dictionary will try to keep no more than `prune_at` words in its mapping, to limit its RAM
            footprint; the correctness is not guaranteed.
            Use :meth:`~gensim.corpora.dictionary.Dictionary.filter_extremes` to perform proper filtering.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> texts = [['human', 'interface', 'computer']]
            >>> dct = Dictionary(texts)  # initialize a Dictionary
            >>> dct.add_documents([["cat", "say", "meow"], ["dog"]])  # add more documents (extend the vocabulary)
            >>> dct.doc2bow(["dog", "computer", "non_existent_word"])
            [(0, 1), (6, 1)]

        """
        self.token2id = {}
        self.id2token = {}
        self.cfs = {}
        self.dfs = {}

        self.num_docs = 0
        self.num_pos = 0
        self.num_nnz = 0

        if documents is not None:
            self.add_documents(documents, prune_at=prune_at)
            self.add_lifecycle_event(
                "created",
                msg=f"built {self} from {self.num_docs} documents (total {self.num_pos} corpus positions)",
            )
    def __getitem__(self, tokenid):
        """Get the string token that corresponds to `tokenid`.

        Parameters
        ----------
        tokenid : int
            Id of token.

        Returns
        -------
        str
            Token corresponding to `tokenid`.

        Raises
        ------
        KeyError
            If this Dictionary doesn't contain such `tokenid`.
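
        Examples
        --------
        A small usage sketch; the id below follows from the sorted order in which new
        tokens of this toy corpus are registered.

        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> dct = Dictionary([["máma", "mele", "maso"]])
            >>> dct[0]
            'maso'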
""" | |
if len(self.id2token) != len(self.token2id): | |
# the word->id mapping has changed (presumably via add_documents); | |
# recompute id->word accordingly | |
self.id2token = utils.revdict(self.token2id) | |
return self.id2token[tokenid] # will throw for non-existent ids | |
    def __iter__(self):
        """Iterate over all tokens."""
        return iter(self.keys())

    # restore Py2-style dict API
    iterkeys = __iter__

    def iteritems(self):
        return self.items()

    def itervalues(self):
        return self.values()
    def keys(self):
        """Get all stored ids.

        Returns
        -------
        list of int
            List of all token ids.
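
        Examples
        --------
        A minimal sketch of the return value on a toy corpus:

        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> dct = Dictionary([["a", "b"], ["c"]])
            >>> sorted(dct.keys())
            [0, 1, 2]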
""" | |
return list(self.token2id.values()) | |
    def __len__(self):
        """Get number of stored tokens.

        Returns
        -------
        int
            Number of stored tokens.
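
        Examples
        --------
        A minimal sketch; repeated tokens are counted once:

        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> len(Dictionary([["a", "b", "a"]]))
            2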
""" | |
return len(self.token2id) | |
    def __str__(self):
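        """Return a human-readable representation: the class name, the vocabulary size and up to five sample tokens."""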
        some_keys = list(itertools.islice(self.token2id.keys(), 5))
        return "%s<%i unique tokens: %s%s>" % (
            self.__class__.__name__, len(self), some_keys, '...' if len(self) > 5 else ''
        )
    @staticmethod
    def from_documents(documents):
        """Create :class:`~gensim.corpora.dictionary.Dictionary` from `documents`.

        Equivalent to `Dictionary(documents=documents)`.

        Parameters
        ----------
        documents : iterable of iterable of str
            Input corpus.

        Returns
        -------
        :class:`~gensim.corpora.dictionary.Dictionary`
            Dictionary initialized from `documents`.
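
        Examples
        --------
        A minimal sketch, mirroring the constructor example above:

        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> dct = Dictionary.from_documents([["human", "interface", "computer"]])
            >>> len(dct)
            3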
""" | |
return Dictionary(documents=documents) | |
    def add_documents(self, documents, prune_at=2000000):
        """Update dictionary from a collection of `documents`.

        Parameters
        ----------
        documents : iterable of iterable of str
            Input corpus. All tokens should be already **tokenized and normalized**.
        prune_at : int, optional
            Dictionary will try to keep no more than `prune_at` words in its mapping, to limit its RAM
            footprint; the correctness is not guaranteed.
            Use :meth:`~gensim.corpora.dictionary.Dictionary.filter_extremes` to perform proper filtering.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = ["máma mele maso".split(), "ema má máma".split()]
            >>> dct = Dictionary(corpus)
            >>> len(dct)
            5
            >>> dct.add_documents([["this", "is", "sparta"], ["just", "joking"]])
            >>> len(dct)
            10

        """
        for docno, document in enumerate(documents):
            # log progress & run a regular check for pruning, once every 10k docs
            if docno % 10000 == 0:
                if prune_at is not None and len(self) > prune_at:
                    self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
                logger.info("adding document #%i to %s", docno, self)

            # update Dictionary with the document
            self.doc2bow(document, allow_update=True)  # ignore the result, here we only care about updating token ids

        logger.info("built %s from %i documents (total %i corpus positions)", self, self.num_docs, self.num_pos)
    def doc2bow(self, document, allow_update=False, return_missing=False):
        """Convert `document` into the bag-of-words (BoW) format = list of `(token_id, token_count)` tuples.

        Parameters
        ----------
        document : list of str
            Input document.
        allow_update : bool, optional
            Update self, by adding new tokens from `document` and updating internal corpus statistics.
        return_missing : bool, optional
            Also return missing tokens (tokens present in `document` but not in self) with their frequencies?

        Returns
        -------
        list of (int, int)
            BoW representation of `document`.
        list of (int, int), dict of (str, int)
            If `return_missing` is True, return BoW representation of `document` + dictionary with missing
            tokens and their frequencies.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>> dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
            >>> dct.doc2bow(["this", "is", "máma"])
            [(2, 1)]
            >>> dct.doc2bow(["this", "is", "máma"], return_missing=True)
            ([(2, 1)], {'is': 1, 'this': 1})

        """
        if isinstance(document, str):
            raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string")

        # Construct (word, frequency) mapping.
        counter = defaultdict(int)
        for w in document:
            counter[w if isinstance(w, str) else str(w, 'utf-8')] += 1

        token2id = self.token2id
        if allow_update or return_missing:
            missing = sorted(x for x in counter.items() if x[0] not in token2id)
            if allow_update:
                for w, _ in missing:
                    # new id = number of ids made so far;
                    # NOTE this assumes there are no gaps in the id sequence!
                    token2id[w] = len(token2id)
        result = {token2id[w]: freq for w, freq in counter.items() if w in token2id}

        if allow_update:
            self.num_docs += 1
            self.num_pos += sum(counter.values())
            self.num_nnz += len(result)
            # keep track of document and collection frequencies
            for tokenid, freq in result.items():
                self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
                self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

        # return tokenids, in ascending id order
        result = sorted(result.items())
        if return_missing:
            return result, dict(missing)
        else:
            return result
    def doc2idx(self, document, unknown_word_index=-1):
        """Convert `document` (a list of words) into a list of indexes = list of `token_id`.

        Replace all unknown words, i.e. words not in the dictionary, with the index set via `unknown_word_index`.

        Parameters
        ----------
        document : list of str
            Input document.
        unknown_word_index : int, optional
            Index to use for words not in the dictionary.

        Returns
        -------
        list of int
            Token ids for tokens in `document`, in the same order.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [["a", "a", "b"], ["a", "c"]]
            >>> dct = Dictionary(corpus)
            >>> dct.doc2idx(["a", "a", "c", "not_in_dictionary", "c"])
            [0, 0, 2, -1, 2]

        """
        if isinstance(document, str):
            raise TypeError("doc2idx expects an array of unicode tokens on input, not a single string")

        document = [word if isinstance(word, str) else str(word, 'utf-8') for word in document]
        return [self.token2id.get(word, unknown_word_index) for word in document]
    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
        """Filter out tokens in the dictionary by their frequency.

        Parameters
        ----------
        no_below : int, optional
            Keep tokens which are contained in at least `no_below` documents.
        no_above : float, optional
            Keep tokens which are contained in no more than `no_above` documents
            (fraction of the total corpus size, not an absolute number).
        keep_n : int, optional
            Keep only the first `keep_n` most frequent tokens.
        keep_tokens : iterable of str
            Iterable of tokens that **must** stay in dictionary after filtering.

        Notes
        -----
        This removes all tokens in the dictionary that are:

        #. Less frequent than `no_below` documents (absolute number, e.g. `5`) or
        #. More frequent than `no_above` documents (fraction of the total corpus size, e.g. `0.3`).
        #. After (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `keep_n=None`).

        After the pruning, the resulting gaps in word ids are shrunk.
        Due to this gap shrinking, **the same word may have a different word id before and after the call
        to this function!** See :class:`gensim.models.VocabTransform` and the
        `dedicated FAQ entry <https://github.com/RaRe-Technologies/gensim/wiki/Recipes-&-FAQ#q8-how-can-i-filter-a-saved-corpus-and-its-corresponding-dictionary>`_ on how  # noqa
        to transform a corpus built with a dictionary before pruning.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>> dct = Dictionary(corpus)
            >>> len(dct)
            5
            >>> dct.filter_extremes(no_below=1, no_above=0.5, keep_n=1)
            >>> len(dct)
            1

        """
        no_above_abs = int(no_above * self.num_docs)  # convert fractional threshold to absolute threshold

        # determine which tokens to keep
        if keep_tokens:
            keep_ids = {self.token2id[v] for v in keep_tokens if v in self.token2id}
            good_ids = [
                v for v in self.token2id.values()
                if no_below <= self.dfs.get(v, 0) <= no_above_abs or v in keep_ids
            ]
            good_ids.sort(key=lambda x: self.num_docs if x in keep_ids else self.dfs.get(x, 0), reverse=True)
        else:
            good_ids = [
                v for v in self.token2id.values()
                if no_below <= self.dfs.get(v, 0) <= no_above_abs
            ]
            good_ids.sort(key=self.dfs.get, reverse=True)
        if keep_n is not None:
            good_ids = good_ids[:keep_n]
        bad_words = [(self[idx], self.dfs.get(idx, 0)) for idx in set(self).difference(good_ids)]
        logger.info("discarding %i tokens: %s...", len(self) - len(good_ids), bad_words[:10])
        logger.info(
            "keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents",
            len(good_ids), no_below, no_above_abs, 100.0 * no_above
        )

        # do the actual filtering, then rebuild dictionary to remove gaps in ids
        self.filter_tokens(good_ids=good_ids)
        logger.info("resulting dictionary: %s", self)
    def filter_n_most_frequent(self, remove_n):
        """Filter out the `remove_n` most frequent tokens that appear in the documents.

        Parameters
        ----------
        remove_n : int
            Number of the most frequent tokens that will be removed.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>> dct = Dictionary(corpus)
            >>> len(dct)
            5
            >>> dct.filter_n_most_frequent(2)
            >>> len(dct)
            3

        """
        # determine which tokens to keep
        most_frequent_ids = (v for v in self.token2id.values())
        most_frequent_ids = sorted(most_frequent_ids, key=self.dfs.get, reverse=True)
        most_frequent_ids = most_frequent_ids[:remove_n]
        # do the actual filtering, then rebuild dictionary to remove gaps in ids
        most_frequent_words = [(self[idx], self.dfs.get(idx, 0)) for idx in most_frequent_ids]
        logger.info("discarding %i tokens: %s...", len(most_frequent_ids), most_frequent_words[:10])

        self.filter_tokens(bad_ids=most_frequent_ids)
        logger.info("resulting dictionary: %s", self)
    def filter_tokens(self, bad_ids=None, good_ids=None):
        """Remove the selected `bad_ids` tokens from :class:`~gensim.corpora.dictionary.Dictionary`.

        Alternatively, keep selected `good_ids` in :class:`~gensim.corpora.dictionary.Dictionary` and remove the rest.

        Parameters
        ----------
        bad_ids : iterable of int, optional
            Collection of word ids to be removed.
        good_ids : collection of int, optional
            Keep selected collection of word ids and remove the rest.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>> dct = Dictionary(corpus)
            >>> 'ema' in dct.token2id
            True
            >>> dct.filter_tokens(bad_ids=[dct.token2id['ema']])
            >>> 'ema' in dct.token2id
            False
            >>> len(dct)
            4
            >>> dct.filter_tokens(good_ids=[dct.token2id['maso']])
            >>> len(dct)
            1

        """
        if bad_ids is not None:
            bad_ids = set(bad_ids)
            self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if tokenid not in bad_ids}
            self.cfs = {tokenid: freq for tokenid, freq in self.cfs.items() if tokenid not in bad_ids}
            self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if tokenid not in bad_ids}
        if good_ids is not None:
            good_ids = set(good_ids)
            self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if tokenid in good_ids}
            self.cfs = {tokenid: freq for tokenid, freq in self.cfs.items() if tokenid in good_ids}
            self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if tokenid in good_ids}
        self.compactify()
    def compactify(self):
        """Assign new word ids to all words, shrinking any gaps.
        logger.debug("rebuilding dictionary, shrinking gaps")

        # build mapping from old id -> new id
        idmap = dict(zip(sorted(self.token2id.values()), range(len(self.token2id))))

        # reassign mappings to new ids
        self.token2id = {token: idmap[tokenid] for token, tokenid in self.token2id.items()}
        self.id2token = {}
        self.dfs = {idmap[tokenid]: freq for tokenid, freq in self.dfs.items()}
        self.cfs = {idmap[tokenid]: freq for tokenid, freq in self.cfs.items()}
    def save_as_text(self, fname, sort_by_word=True):
        """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.

        Parameters
        ----------
        fname : str
            Path to output file.
        sort_by_word : bool, optional
            Sort words in lexicographical order before writing them out?

        Notes
        -----
        Format::

            num_docs
            id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE]
            id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE]
            ....
            id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE]

        This text format is great for corpus inspection and debugging. As plaintext, it's also easily portable
        to other tools and frameworks. For better performance and to store the entire object state,
        including collected corpus statistics, use :meth:`~gensim.corpora.dictionary.Dictionary.save` and
        :meth:`~gensim.corpora.dictionary.Dictionary.load` instead.

        See Also
        --------
        :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`
            Load :class:`~gensim.corpora.dictionary.Dictionary` from text file.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>> from gensim.test.utils import get_tmpfile
            >>>
            >>> tmp_fname = get_tmpfile("dictionary")
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>>
            >>> dct = Dictionary(corpus)
            >>> dct.save_as_text(tmp_fname)
            >>>
            >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
            >>> assert dct.token2id == loaded_dct.token2id

        """
        logger.info("saving dictionary mapping to %s", fname)
        with utils.open(fname, 'wb') as fout:
            numdocs_line = "%d\n" % self.num_docs
            fout.write(utils.to_utf8(numdocs_line))
            if sort_by_word:
                for token, tokenid in sorted(self.token2id.items()):
                    line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                    fout.write(utils.to_utf8(line))
            else:
                for tokenid, freq in sorted(self.dfs.items(), key=lambda item: -item[1]):
                    line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                    fout.write(utils.to_utf8(line))
    def merge_with(self, other):
        """Merge another dictionary into this dictionary, mapping the same tokens to the same ids
        and new tokens to new ids.

        Notes
        -----
        The purpose is to merge two corpora created using two different dictionaries: `self` and `other`.
        `other` can be any id => word mapping (a dict, a Dictionary object, ...).

        Return a transformation object which, when accessed as `result[doc_from_other_corpus]`, will convert documents
        from a corpus built using the `other` dictionary into a document using the new, merged dictionary.

        Parameters
        ----------
        other : {dict, :class:`~gensim.corpora.dictionary.Dictionary`}
            Other dictionary.

        Returns
        -------
        :class:`gensim.models.VocabTransform`
            Transformation object.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus_1, corpus_2 = [["a", "b", "c"]], [["a", "f", "f"]]
            >>> dct_1, dct_2 = Dictionary(corpus_1), Dictionary(corpus_2)
            >>> dct_1.doc2bow(corpus_2[0])
            [(0, 1)]
            >>> transformer = dct_1.merge_with(dct_2)
            >>> dct_1.doc2bow(corpus_2[0])
            [(0, 1), (3, 2)]

        """
        old2new = {}
        for other_id, other_token in other.items():
            if other_token in self.token2id:
                new_id = self.token2id[other_token]
            else:
                new_id = len(self.token2id)
                self.token2id[other_token] = new_id
                self.dfs[new_id] = 0
            old2new[other_id] = new_id
            try:
                self.dfs[new_id] += other.dfs[other_id]
            except Exception:
                # `other` isn't a Dictionary (probably just a dict) => ignore dfs, keep going
                pass
        try:
            self.num_docs += other.num_docs
            self.num_nnz += other.num_nnz
            self.num_pos += other.num_pos
        except Exception:
            pass

        import gensim.models
        return gensim.models.VocabTransform(old2new)
    def patch_with_special_tokens(self, special_token_dict):
        """Patch token2id and id2token using a dictionary of special tokens.

        **Use case:** when doing sequence modeling (e.g. named entity recognition), one may want to specify
        special tokens that behave differently than others.
        One example is the "unknown" token, and another is the padding token.
        It is usual to set the padding token to have index `0`, and patching the dictionary with `{'<PAD>': 0}`
        would be one way to specify this.

        Parameters
        ----------
        special_token_dict : dict of (str, int)
            dict containing the special tokens as keys and their wanted indices as values.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>> dct = Dictionary(corpus)
            >>>
            >>> special_tokens = {'pad': 0, 'space': 1}
            >>> print(dct.token2id)
            {'maso': 0, 'mele': 1, 'máma': 2, 'ema': 3, 'má': 4}
            >>>
            >>> dct.patch_with_special_tokens(special_tokens)
            >>> print(dct.token2id)
            {'maso': 6, 'mele': 7, 'máma': 2, 'ema': 3, 'má': 4, 'pad': 0, 'space': 1}

        """
        possible_ids = []
        for token, idx in special_token_dict.items():
            if token in self.token2id and self.token2id[token] == idx:
                continue
            if token in self.token2id and self.token2id[token] != idx:
                possible_ids.append(self.token2id[token])
                del self.token2id[token]
            old_token = self[idx]
            self.token2id[token] = idx
            self.token2id[old_token] = possible_ids.pop() if \
                len(possible_ids) > 0 else len(self.token2id) - 1
        self.id2token = {}  # Make sure that id2token is updated according to special tokens.
    @staticmethod
    def load_from_text(fname):
        """Load a previously stored :class:`~gensim.corpora.dictionary.Dictionary` from a text file.

        Mirror function to :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.

        Parameters
        ----------
        fname: str
            Path to a file produced by :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.

        See Also
        --------
        :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`
            Save :class:`~gensim.corpora.dictionary.Dictionary` to text file.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>> from gensim.test.utils import get_tmpfile
            >>>
            >>> tmp_fname = get_tmpfile("dictionary")
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>>
            >>> dct = Dictionary(corpus)
            >>> dct.save_as_text(tmp_fname)
            >>>
            >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
            >>> assert dct.token2id == loaded_dct.token2id

        """
        result = Dictionary()
        with utils.open(fname, 'rb') as f:
            for lineno, line in enumerate(f):
                line = utils.to_unicode(line)
                if lineno == 0:
                    if line.strip().isdigit():
                        # Older versions of save_as_text may not write num_docs on first line.
                        result.num_docs = int(line.strip())
                        continue
                    else:
                        logger.warning("Text does not contain num_docs on the first line.")
                try:
                    wordid, word, docfreq = line[:-1].split('\t')
                except Exception:
                    raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
                wordid = int(wordid)
                if word in result.token2id:
                    raise KeyError('token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
                result.token2id[word] = wordid
                result.dfs[wordid] = int(docfreq)
        return result
    def most_common(self, n: Optional[int] = None) -> List[Tuple[str, int]]:
        """Return a list of the n most common words and their counts from the most common to the least.

        Words with equal counts are ordered in the increasing order of their ids.

        Parameters
        ----------
        n : int or None, optional
            The number of most common words to be returned. If `None`, all words in the dictionary
            will be returned. Default is `None`.

        Returns
        -------
        most_common : list of (str, int)
            The n most common words and their counts from the most common to the least.
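
        Examples
        --------
        A small sketch on a toy corpus; the counts come from the collection frequencies `cfs`:

        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> dct = Dictionary([["máma", "mele", "maso"], ["ema", "má", "máma"]])
            >>> dct.most_common(2)
            [('máma', 2), ('maso', 1)]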
""" | |
most_common = [ | |
(self[word], count) | |
for word, count | |
in sorted(self.cfs.items(), key=lambda x: (-x[1], x[0]))[:n] | |
] | |
return most_common | |
    @staticmethod
    def from_corpus(corpus, id2word=None):
        """Create :class:`~gensim.corpora.dictionary.Dictionary` from an existing corpus.

        Parameters
        ----------
        corpus : iterable of iterable of (int, number)
            Corpus in BoW format.
        id2word : dict of (int, object)
            Mapping id -> word. If None, the mapping `id2word[word_id] = str(word_id)` will be used.

        Notes
        -----
        This can be useful if you only have a term-document BOW matrix (represented by `corpus`), but not the
        original text corpus. This method will scan the term-document count matrix for all word ids that appear
        in it, then construct :class:`~gensim.corpora.dictionary.Dictionary` which maps each
        `word_id -> id2word[word_id]`. `id2word` is an optional dictionary that maps the `word_id` to a token.
        In case `id2word` isn't specified, the mapping `id2word[word_id] = str(word_id)` will be used.

        Returns
        -------
        :class:`~gensim.corpora.dictionary.Dictionary`
            Inferred dictionary from corpus.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []]
            >>> dct = Dictionary.from_corpus(corpus)
            >>> len(dct)
            3

        """
        result = Dictionary()
        max_id = -1
        for docno, document in enumerate(corpus):
            if docno % 10000 == 0:
                logger.info("adding document #%i to %s", docno, result)
            result.num_docs += 1
            result.num_nnz += len(document)
            for wordid, word_freq in document:
                max_id = max(wordid, max_id)
                result.num_pos += word_freq
                result.dfs[wordid] = result.dfs.get(wordid, 0) + 1

        if id2word is None:
            # make sure length(result) == get_max_id(corpus) + 1
            result.token2id = {str(i): i for i in range(max_id + 1)}
        else:
            # id => word mapping given: simply copy it
            result.token2id = {utils.to_unicode(token): idx for idx, token in id2word.items()}
        for idx in result.token2id.values():
            # make sure all token ids have a valid `dfs` entry
            result.dfs[idx] = result.dfs.get(idx, 0)

        logger.info(
            "built %s from %i documents (total %i corpus positions)",
            result, result.num_docs, result.num_pos
        )
        return result