import argparse
import json
import logging
import os
import pathlib
import random
import shutil
import sys
import time
from ast import literal_eval
from shutil import rmtree
from typing import Any, Dict, List, Union

import numpy as np
import pandas as pd
import ray
from sklearn.feature_extraction.text import (CountVectorizer,
                                             HashingVectorizer,
                                             TfidfVectorizer)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from lr.hyperparameters import (SEARCH_SPACE, HyperparameterSearch,
                                RandomSearch)
from lr.util import jackknife, replace_bool, stratified_sample

# Create a custom logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def train_lr(train, dev, test, search_space):
    # Fit the vectorizer on the union of train and dev text.
    master = pd.concat([train, dev], axis=0)

    # Draw one hyperparameter assignment from the search space.
    space = HyperparameterSearch(**search_space)
    sample = space.sample()

    if sample.pop('stopwords') == 1:
        stop_words = 'english'
    else:
        stop_words = None
    weight = sample.pop('weight')
    binary = weight == 'binary'
    ngram_range = sample.pop('ngram_range')
    ngram_range = sorted([int(x) for x in ngram_range.split()])

    if weight == 'tf-idf':
        vect = TfidfVectorizer(stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    elif weight == 'hash':
        vect = HashingVectorizer(stop_words=stop_words,
                                 lowercase=True,
                                 ngram_range=ngram_range)
    else:
        vect = CountVectorizer(binary=binary,
                               stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)

    start = time.time()
    vect.fit(tqdm(master.text, desc="fitting data", leave=False))
    X_train = vect.transform(tqdm(train.text, desc="transforming training data", leave=False))
    X_dev = vect.transform(tqdm(dev.text, desc="transforming dev data", leave=False))
    if test is not None:
        X_test = vect.transform(tqdm(test.text, desc="transforming test data", leave=False))

    sample['C'] = float(sample['C'])
    sample['tol'] = float(sample['tol'])
    classifier = LogisticRegression(**sample, verbose=True)
    classifier.fit(X_train, train.label)
    end = time.time()

    # Wrap each hyperparameter value in a list so it becomes a one-row DataFrame.
    for k, v in sample.items():
        if not v:
            v = str(v)
        sample[k] = [v]
    res = pd.DataFrame(sample)

    preds = classifier.predict(X_dev)
    if test is not None:
        test_preds = classifier.predict(X_test)
    res['dev_f1'] = f1_score(dev.label, preds, average='macro')
    if test is not None:
        res['test_f1'] = f1_score(test.label, test_preds, average='macro')
    res['dev_accuracy'] = classifier.score(X_dev, dev.label)
    if test is not None:
        res['test_accuracy'] = classifier.score(X_test, test.label)
    res['training_duration'] = end - start
    res['ngram_range'] = str(ngram_range)
    res['weight'] = weight
    res['stopwords'] = stop_words
    return classifier, vect, res


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_file', type=str)
    parser.add_argument('--dev_file', type=str, required=False)
    parser.add_argument('--test_file', type=str, required=False)
    parser.add_argument('--search_trials', type=int, default=5)
    parser.add_argument('--train_subsample', type=int, required=False)
    parser.add_argument('--stratified', action='store_true')
    parser.add_argument('--jackknife_partitions', type=int, default=5, required=False)
    parser.add_argument('--save_jackknife_partitions', action='store_true')
    parser.add_argument('--serialization_dir', '-s', type=str)
    parser.add_argument('--override', '-o', action='store_true')
    parser.add_argument('--evaluate_on_test', '-t', action='store_true')
    args = parser.parse_args()

    if not os.path.isdir(args.serialization_dir):
        os.makedirs(args.serialization_dir)
    else:
        if args.override:
            rmtree(args.serialization_dir)
            os.makedirs(args.serialization_dir)
        else:
            print(f"serialization directory {args.serialization_dir} exists. Aborting!")
            sys.exit(1)

    print(f"reading training data at {args.train_file}...")
    train = pd.read_json(args.train_file, lines=True)
    if args.train_subsample:
        if args.stratified:
            train = stratified_sample(train, "label", args.train_subsample)
        else:
            train = train.sample(n=args.train_subsample)

    if args.dev_file:
        print(f"reading dev data at {args.dev_file}...")
        dev = pd.read_json(args.dev_file, lines=True)
    else:
        print("Dev file not provided, will jackknife training data...")

    if args.evaluate_on_test:
        if args.test_file:
            print(f"reading test data at {args.test_file}...")
            test = pd.read_json(args.test_file, lines=True)
        else:
            print("Test file not provided.")
            sys.exit(1)
    else:
        test = None

    num_assignments = args.search_trials
    num_partitions = args.jackknife_partitions
    df = pd.DataFrame()
    current_f1 = 0.0
    best_classifier = None
    best_vect = None

    if args.dev_file:
        # Random search against the provided dev set.
        pbar = tqdm(range(num_assignments), desc="search trials", leave=False)
        for i in pbar:
            try:
                classifier, vect, res = train_lr(train, dev, test, SEARCH_SPACE)
                df = pd.concat([df, res], axis=0, sort=True)
                best_f1 = df.dev_f1.max()
                if res.dev_f1[0] > current_f1:
                    current_f1 = res.dev_f1[0]
                    best_classifier = classifier
                    best_vect = vect
                pbar.set_description(f"mean +- std dev F1: {df.dev_f1.mean()} +- {df.dev_f1.std()}, max F1: {df.dev_f1.max()}")
            except KeyboardInterrupt:
                break
    else:
        # No dev set provided: jackknife the training data and search on each partition.
        if args.save_jackknife_partitions:
            if not os.path.isdir(os.path.join(args.serialization_dir, "jackknife")):
                os.mkdir(os.path.join(args.serialization_dir, "jackknife"))
        for ix, (train, dev) in tqdm(enumerate(jackknife(train, num_partitions=num_partitions)),
                                     total=num_partitions,
                                     leave=False,
                                     desc="jackknife partitions"):
            for i in tqdm(range(num_assignments), desc="search trials", leave=False):
                classifier, vect, res = train_lr(train, dev, test, SEARCH_SPACE)
                df = pd.concat([df, res], axis=0, sort=True)
                best_f1 = df.dev_f1.max()
                if res.dev_f1[0] > current_f1:
                    current_f1 = res.dev_f1[0]
                    best_classifier = classifier
                    best_vect = vect
            df['dataset_reader.sample'] = train.shape[0]
            df['model.encoder.architecture.type'] = 'logistic regression'
            if args.save_jackknife_partitions:
                train.to_json(os.path.join(args.serialization_dir,
                                           "jackknife",
                                           f"train.{ix}"),
                              lines=True,
                              orient="records")
                dev.to_json(os.path.join(args.serialization_dir,
                                         "jackknife",
                                         f"dev.{ix}"),
                            lines=True,
                            orient='records')

    print("DEV STATISTICS")
    print("================")
    print(f"mean +- std F1: {df.dev_f1.mean()} +- {df.dev_f1.std()}")
    print(f"max F1: {df.dev_f1.max()}")
    print(f"min F1: {df.dev_f1.min()}")
    print(f"mean +- std accuracy: {df.dev_accuracy.mean()} +- {df.dev_accuracy.std()}")
    print(f"max accuracy: {df.dev_accuracy.max()}")
    print(f"min accuracy: {df.dev_accuracy.min()}")
    print("")
    print("BEST HYPERPARAMETERS")
    print("=====================")
    best_hp = df.reset_index().iloc[df.reset_index().dev_f1.idxmax()].to_dict()
    print(df.reset_index().iloc[df.reset_index().dev_f1.idxmax()])

    if test is not None:
        print("TEST STATISTICS")
        print("================")
        print(f"mean +- std F1: {df.test_f1.mean()} +- {df.test_f1.std()}")
        print(f"max F1: {df.test_f1.max()}")
        print(f"min F1: {df.test_f1.min()}")
        print(f"mean +- std accuracy: {df.test_accuracy.mean()} +- {df.test_accuracy.std()}")
        print(f"max accuracy: {df.test_accuracy.max()}")
        print(f"min accuracy: {df.test_accuracy.min()}")

    df.to_json(os.path.join(args.serialization_dir,
"results.jsonl"), lines=True, orient='records') with open(os.path.join(args.serialization_dir, "best_hyperparameters.json"), "w+") as f: best_hp = df.reset_index().iloc[df.reset_index().dev_f1.idxmax()].to_dict() for k,v in best_hp.items(): if isinstance(v, np.int64): best_hp[k] = int(v) if isinstance(v, str) and "[" in v: v = literal_eval(v) best_hp[k] = f"{v[0]} {v[1]}" best_hp.pop("index") best_hp.pop("dev_accuracy") best_hp.pop("dev_f1") if test is not None: best_hp.pop("test_accuracy") best_hp.pop("test_f1") best_hp.pop("training_duration") json.dump(best_hp, f) with open(os.path.join(args.serialization_dir, "vocab.json"), 'w+') as f: for k,v in best_vect.__dict__['vocabulary_'].items(): best_vect.__dict__['vocabulary_'][k] = int(v) json.dump(best_vect.__dict__['vocabulary_'], f) os.mkdir(os.path.join(args.serialization_dir, "archive")) try: np.save(os.path.join(args.serialization_dir, "archive", "idf.npy"), best_vect.idf_) except: pass np.save(os.path.join(args.serialization_dir, "archive", "classes.npy"),best_classifier.classes_) np.save(os.path.join(args.serialization_dir, "archive", "coef.npy"),best_classifier.coef_) np.save(os.path.join(args.serialization_dir, "archive", "intercept.npy"), best_classifier.intercept_)