# linguask/src/feature_extractors/term_frequency_feature_extractor.py
from collections import defaultdict
from typing import Dict, List
import nltk
import numpy as np
import pandas as pd
from src.feature_extractors.base_extractor import BaseExtractor
class TermFrequencyFeatureExtractor(BaseExtractor):
"""Build a dataframe with a distribution of term frequencies
Usage:
>>> data = pd.read_csv("data/raw/train.csv").set_index("text_id")
>>> featurizer = TermFrequencyFeaturizer()
>>> X = featurizer.featurize(data.full_text)
>>> y = data["vocabulary"]
>>> model = catboost.CatBoostRegressor()
>>> model.fit(x_train, y_train)
Possible improvements:
- Add word corrections: triying -> trying
- Count not only word frequencies, but number of unique words in each hist bin
"""
    MAX_TERM_FREQUENCY = 23135751162  # the largest count in the term frequency dataset
def __init__(self, n_bins: int = 40):
self.term2freq: Dict[str, int] = self._load_term2freq_dict()
self.bins = self._make_bins(n_bins)
self.feature_names = [
f"bin_{round(self.bins[i], 1)}_{round(self.bins[i+1], 1)}"
for i in range(len(self.bins) - 1)
]
nltk.download("punkt")
def _make_bins(self, n_bins: int) -> np.ndarray:
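        """Create equally spaced bin edges on a log1p scale.

        Note that np.linspace returns n_bins edge values, which define
        n_bins - 1 histogram bins.
        """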
min_bin = 0
max_bin = np.log1p(self.MAX_TERM_FREQUENCY)
bins = np.linspace(min_bin, max_bin, n_bins)
return bins
def _load_term2freq_dict(self) -> Dict[str, int]:
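        """Load the word -> count mapping from the unigram frequency CSV.

        Words absent from the dataset get a default frequency of 0.
        """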
term_frequencies = pd.read_csv("data/word_frequencies/unigram_freq.csv")
        term2freq: Dict[str, int] = defaultdict(int)
        term2freq.update(term_frequencies.set_index("word")["count"].to_dict())
return term2freq
def generate_features(self, data: pd.Series) -> pd.DataFrame:
"""Extracts features from the text in the form of histogram of word frequencies
Logarithm operation is applied to the frequencies for the sake of distribution
normality.
"""
feature_df = data.apply(self._compute_word_frequency_histogram)
feature_df.columns = self.feature_names
return feature_df
def _compute_word_frequency_histogram(self, text: str) -> pd.Series:
term_frequencies: List[int] = self._compute_term_frequencies_from_text(text)
histogram_values: np.ndarray = self._build_histogram(term_frequencies)
return pd.Series(histogram_values)
def _compute_term_frequencies_from_text(self, text: str) -> List[int]:
tokens = nltk.tokenize.word_tokenize(text)
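        # Keep alphabetic tokens only and lowercase them so they match the
        # keys of the frequency dictionary.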
words = [token.lower() for token in tokens if token.isalpha()]
word_frequencies = [self.term2freq[word] for word in words]
return word_frequencies
    def _build_histogram(self, values: List[int]) -> np.ndarray:
        # log1p maps the zero frequency of unknown words to 0 instead of -inf
        values_log = np.log1p(values)
        histogram, __ = np.histogram(values_log, bins=self.bins)
        # Guard against division by zero for texts with no alphabetic tokens
        normalized_histogram = histogram / max(len(values), 1)
        return normalized_histogram
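

if __name__ == "__main__":
    # Minimal smoke-test sketch: featurize a couple of illustrative sentences.
    # Assumptions: the unigram frequency CSV exists at the path hard-coded in
    # _load_term2freq_dict, and BaseExtractor imposes no further abstract
    # methods; the sample texts below are hypothetical.
    sample_texts = pd.Series(
        [
            "The quick brown fox jumps over the lazy dog.",
            "Lexical sophistication correlates with vocabulary scores.",
        ]
    )
    extractor = TermFrequencyFeatureExtractor(n_bins=10)
    features = extractor.generate_features(sample_texts)
    print(features.head())  # one row per text, one column per frequency bin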