diff --git "a/data/hfid_to_pwcinfo.json" "b/data/hfid_to_pwcinfo.json" deleted file mode 100644--- "a/data/hfid_to_pwcinfo.json" +++ /dev/null @@ -1,3502 +0,0 @@ -{ - "glue": { - "pwc_id": "glue", - "dataset_name": "GLUE Dataset", - "dataset_abstract": "General Language Understanding Evaluation (GLUE) benchmark is a collection of nine natural language understanding tasks, including single-sentence tasks CoLA and SST-2, similarity and paraphrasing tasks MRPC, STS-B and QQP, and natural language inference tasks MNLI, QNLI, RTE and WNLI.", - "paper_name": "GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding", - "paper_abstract": "For natural language understanding (NLU) technology to be maximally useful,\nboth practically and as a scientific object of study, it must be general: it\nmust be able to process language in a way that is not exclusively tailored to\nany one specific task or dataset. In pursuit of this objective, we introduce\nthe General Language Understanding Evaluation benchmark (GLUE), a tool for\nevaluating and analyzing the performance of models across a diverse range of\nexisting NLU tasks. GLUE is model-agnostic, but it incentivizes sharing\nknowledge across tasks because certain tasks have very limited training data.\nWe further provide a hand-crafted diagnostic test suite that enables detailed\nlinguistic analysis of NLU models. We evaluate baselines based on current\nmethods for multi-task and transfer learning and find that they do not\nimmediately give substantial improvements over the aggregate performance of\ntraining a separate model per task, indicating room for improvement in\ndeveloping general and robust NLU systems." - }, - "super_glue": { - "pwc_id": "superglue", - "dataset_name": "SuperGLUE Dataset", - "dataset_abstract": "SuperGLUE is a benchmark dataset designed to pose a more rigorous test of language understanding than GLUE. SuperGLUE has the same high-level motivation as GLUE: to provide a simple, hard-to-game measure of progress toward general-purpose language understanding technologies for English. SuperGLUE follows the basic design of GLUE: It consists of a public leaderboard built around eight language understanding tasks, drawing on existing data, accompanied by a single-number\nperformance metric, and an analysis toolkit. However, it improves upon GLUE in several ways:\n\n\nMore challenging tasks: SuperGLUE retains the two hardest tasks in GLUE. The remaining tasks were identified from those submitted to an open call for task proposals and were selected based on difficulty for current NLP approaches.\nMore diverse task formats: The task formats in GLUE are limited to sentence- and sentence-pair classification. 
The authors expand the set of task formats in SuperGLUE to include\ncoreference resolution and question answering (QA).\nComprehensive human baselines: the authors include human performance estimates for all benchmark tasks, which verify that substantial headroom exists between a strong BERT-based baseline and human performance.\nImproved code support: SuperGLUE is distributed with a new, modular toolkit for work on pretraining, multi-task learning, and transfer learning in NLP, built around standard tools including PyTorch (Paszke et al., 2017) and AllenNLP (Gardner et al., 2017).\nRefined usage rules: The conditions for inclusion on the SuperGLUE leaderboard were revamped to ensure fair competition, an informative leaderboard, and full credit\nassignment to data and task creators.", - "paper_name": "SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems", - "paper_abstract": "In the last year, new models and methods for pretraining and transfer learning have driven striking performance improvements across a range of language understanding tasks. The GLUE benchmark, introduced a little over one year ago, offers a single-number metric that summarizes progress on a diverse set of such tasks, but performance on the benchmark has recently surpassed the level of non-expert humans, suggesting limited headroom for further research. In this paper we present SuperGLUE, a new benchmark styled after GLUE with a new set of more difficult language understanding tasks, a software toolkit, and a public leaderboard. SuperGLUE is available at super.gluebenchmark.com." - }, - "wikitext": { - "pwc_id": "wikitext-2", - "dataset_name": "WikiText-2 Dataset", - "dataset_abstract": "The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike License.\n\nCompared to the preprocessed version of Penn Treebank (PTB), WikiText-2 is over 2 times larger and WikiText-103 is over 110 times larger. The WikiText dataset also features a far larger vocabulary and retains the original case, punctuation and numbers - all of which are removed in PTB. As it is composed of full articles, the dataset is well suited for models that can take advantage of long term dependencies.", - "paper_name": "Pointer Sentinel Mixture Models", - "paper_abstract": "Recent neural network sequence models with softmax classifiers have achieved\ntheir best language modeling performance only with very large hidden states and\nlarge vocabularies. Even then they struggle to predict rare or unseen words\neven if the context makes the prediction unambiguous. We introduce the pointer\nsentinel mixture architecture for neural sequence models which has the ability\nto either reproduce a word from the recent context or produce a word from a\nstandard softmax classifier. Our pointer sentinel-LSTM model achieves state of\nthe art language modeling performance on the Penn Treebank (70.9 perplexity)\nwhile using far fewer parameters than a standard softmax LSTM. In order to\nevaluate how well language models can exploit longer contexts and deal with\nmore realistic vocabularies and larger corpora we also introduce the freely\navailable WikiText corpus." 
- }, - "squad": { - "pwc_id": "squad", - "dataset_name": "SQuAD Dataset", - "dataset_abstract": "The Stanford Question Answering Dataset (SQuAD) is a collection of question-answer pairs derived from Wikipedia articles. In SQuAD, the correct answers of questions can be any sequence of tokens in the given text. Because the questions and answers are produced by humans through crowdsourcing, it is more diverse than some other question-answering datasets. SQuAD 1.1 contains 107,785 question-answer pairs on 536 articles. SQuAD2.0 (open-domain SQuAD, SQuAD-Open), the latest version, combines the 100,000 questions in SQuAD1.1 with over 50,000 un-answerable questions written adversarially by crowdworkers in forms that are similar to the answerable ones.", - "paper_name": "SQuAD: 100,000+ Questions for Machine Comprehension of Text", - "paper_abstract": "We present the Stanford Question Answering Dataset (SQuAD), a new reading\ncomprehension dataset consisting of 100,000+ questions posed by crowdworkers on\na set of Wikipedia articles, where the answer to each question is a segment of\ntext from the corresponding reading passage. We analyze the dataset to\nunderstand the types of reasoning required to answer the questions, leaning\nheavily on dependency and constituency trees. We build a strong logistic\nregression model, which achieves an F1 score of 51.0%, a significant\nimprovement over a simple baseline (20%). However, human performance (86.8%) is\nmuch higher, indicating that the dataset presents a good challenge problem for\nfuture research.\n The dataset is freely available at https://stanford-qa.com" - }, - "red_caps": { - "pwc_id": "redcaps", - "dataset_name": "RedCaps Dataset", - "dataset_abstract": "RedCaps is a large-scale dataset of 12M image-text pairs collected from Reddit. Images and captions from Reddit depict and describe a wide variety of objects and scenes. The data is collected from a manually curated set of subreddits (350 total), which give coarse image labels and allow steering of the dataset composition without labeling individual instances.\n\nTerms of use: Uses of RedCaps are subject to Reddit API terms. Users must comply with Reddit User Agreeement, Content Policy, and Privacy Policy.\n\nUsage Restrictions: RedCaps should only be used for non-commercial research. RedCaps should not be used for any tasks that involve identifying features related to people (facial recognition, gender, age, ethnicity identification, etc.) or make decisions that impact people (mortgages, job applications, criminal sentences; or moderation decisions about user-uploaded data that could result in bans from a website). Any commercial and for-profit uses of RedCaps are restricted \u2013 it should not be used to train models that will be deployed in production systems as part of a product offered by businesses or government agencies\n\nRefer to the datasheet in the paper more details.", - "paper_name": "RedCaps: web-curated image-text data created by the people, for the people", - "paper_abstract": "Large datasets of paired images and text have become increasingly popular for learning generic representations for vision and vision-and-language tasks. Such datasets have been built by querying search engines or collecting HTML alt-text -- since web data is noisy, they require complex filtering pipelines to maintain quality. We explore alternate data sources to collect high quality data with minimal filtering. We introduce RedCaps -- a large-scale dataset of 12M image-text pairs collected from Reddit. 
Images and captions from Reddit depict and describe a wide variety of objects and scenes. We collect data from a manually curated set of subreddits, which give coarse image labels and allow us to steer the dataset composition without labeling individual instances. We show that captioning models trained on RedCaps produce rich and varied captions preferred by humans, and learn visual representations that transfer to many downstream tasks." - }, - "imdb": { - "pwc_id": "imdb-movie-reviews", - "dataset_name": "IMDb Movie Reviews Dataset", - "dataset_abstract": "The IMDb Movie Reviews dataset is a binary sentiment analysis dataset consisting of 50,000 reviews from the Internet Movie Database (IMDb) labeled as positive or negative. The dataset contains an even number of positive and negative reviews. Only highly polarizing reviews are considered. A negative review has a score \u2264 4 out of 10, and a positive review has a score \u2265 7 out of 10. No more than 30 reviews are included per movie. The dataset contains additional unlabeled data.", - "paper_name": "", - "paper_abstract": "" - }, - "tweet_eval": { - "pwc_id": "tweeteval", - "dataset_name": "TweetEval Dataset", - "dataset_abstract": "TweetEval introduces an evaluation framework consisting of seven heterogeneous Twitter-specific classification tasks.", - "paper_name": "TweetEval: Unified Benchmark and Comparative Evaluation for Tweet Classification", - "paper_abstract": "The experimental landscape in natural language processing for social media is too fragmented. Each year, new shared tasks and datasets are proposed, ranging from classics like sentiment analysis to irony detection or emoji prediction. Therefore, it is unclear what the current state of the art is, as there is no standardized evaluation protocol, neither a strong set of baselines trained on such domain-specific data. In this paper, we propose a new evaluation framework (TweetEval) consisting of seven heterogeneous Twitter-specific classification tasks. We also provide a strong set of baselines as starting point, and compare different language modeling pre-training strategies. Our initial experiments show the effectiveness of starting off with existing pre-trained generic language models, and continue training them on Twitter corpora." - }, - "wmt16": { - "pwc_id": "wmt-2016", - "dataset_name": "WMT 2016 Dataset", - "dataset_abstract": "WMT 2016 is a collection of datasets used in shared tasks of the First Conference on Machine Translation. The conference builds on ten previous Workshops on statistical Machine Translation.\n\nThe conference featured ten shared tasks:\n\n\na news translation task,\nan IT domain translation task,\na biomedical translation task,\nan automatic post-editing task,\na metrics task (assess MT quality given reference translation).\na quality estimation task (assess MT quality without access to any reference),\na tuning task (optimize a given MT system),\na pronoun translation task,\na bilingual document alignment task,\na multimodal translation task.", - "paper_name": "", - "paper_abstract": "" - }, - "emotion": { - "pwc_id": "emotion", - "dataset_name": "CARER Dataset", - "dataset_abstract": "CARER is an emotion dataset collected through noisy labels, annotated via distant supervision as in (Go et al., 2009). \n\nThe subset of data provided here corresponds to the six emotions variant described in the paper. 
The six emotions are anger, fear, joy, love, sadness, and surprise.", - "paper_name": "CARER: Contextualized Affect Representations for Emotion Recognition", - "paper_abstract": "Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks." - }, - "wikiann": { - "pwc_id": "wikiann-1", - "dataset_name": "WikiAnn Dataset", - "dataset_abstract": "WikiAnn is a dataset for cross-lingual name tagging and linking based on Wikipedia articles in 295 languages.", - "paper_name": "Cross-lingual Name Tagging and Linking for 282 Languages", - "paper_abstract": "The ambitious goal of this work is to develop a cross-lingual name tagging and linking framework for 282 languages that exist in Wikipedia. Given a document in any of these languages, our framework is able to identify name mentions, assign a coarse-grained or fine-grained type to each mention, and link it to an English Knowledge Base (KB) if it is linkable. We achieve this goal by performing a series of new KB mining methods: generating {``}silver-standard{''} annotations by transferring annotations from English to other languages through cross-lingual links and KB properties, refining annotations through self-training and topic selection, deriving language-specific morphology features from anchor links, and mining word translation pairs from cross-lingual links. Both name tagging and linking results for 282 languages are promising on Wikipedia data and on-Wikipedia data." - }, - "blimp": { - "pwc_id": "blimp", - "dataset_name": "BLiMP Dataset", - "dataset_abstract": "BLiMP is a challenge set for evaluating what language models (LMs) know about major grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each containing 1000 minimal pairs isolating specific contrasts in syntax, morphology, or semantics. The data is automatically generated according to expert-crafted grammars. Aggregate human agreement with the labels is 96.4%.", - "paper_name": "BLiMP: The Benchmark of Linguistic Minimal Pairs for English", - "paper_abstract": "We introduce The Benchmark of Linguistic Minimal Pairs (shortened to BLiMP), a challenge set for evaluating what language models (LMs) know about major grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each containing 1000 minimal pairs isolating specific contrasts in syntax, morphology, or semantics. The data is automatically generated according to expert-crafted grammars, and aggregate human agreement with the labels is 96.4%. We use it to evaluate n-gram, LSTM, and Transformer (GPT-2 and Transformer-XL) LMs. We find that state-of-the-art models identify morphological contrasts reliably, but they struggle with semantic restrictions on the distribution of quantifiers and negative polarity items and subtle syntactic phenomena such as extraction islands." 
- }, - "xnli": { - "pwc_id": "xnli", - "dataset_name": "XNLI Dataset", - "dataset_abstract": "The Cross-lingual Natural Language Inference (XNLI) corpus is the extension of the Multi-Genre NLI (MultiNLI) corpus to 15 languages. The dataset was created by manually translating the validation and test sets of MultiNLI into each of those 15 languages. The English training set was machine translated for all languages. The dataset is composed of 122k train, 2490 validation and 5010 test examples.", - "paper_name": "XNLI: Evaluating Cross-lingual Sentence Representations", - "paper_abstract": "State-of-the-art natural language processing systems rely on supervision in\nthe form of annotated data to learn competent models. These models are\ngenerally trained on data in a single language (usually English), and cannot be\ndirectly used beyond that language. Since collecting data in every language is\nnot realistic, there has been a growing interest in cross-lingual language\nunderstanding (XLU) and low-resource cross-language transfer. In this work, we\nconstruct an evaluation set for XLU by extending the development and test sets\nof the Multi-Genre Natural Language Inference Corpus (MultiNLI) to 15\nlanguages, including low-resource languages such as Swahili and Urdu. We hope\nthat our dataset, dubbed XNLI, will catalyze research in cross-lingual sentence\nunderstanding by providing an informative standard evaluation task. In\naddition, we provide several baselines for multilingual sentence understanding,\nincluding two based on machine translation systems, and two that use parallel\ndata to train aligned multilingual bag-of-words and LSTM encoders. We find that\nXNLI represents a practical and challenging evaluation suite, and that directly\ntranslating the test data yields the best performance among available\nbaselines." - }, - "ag_news": { - "pwc_id": "ag-news", - "dataset_name": "AG News Dataset", - "dataset_abstract": "AG News (AG\u2019s News Corpus) is a subdataset of AG's corpus of news articles constructed by assembling titles and description fields of articles from the 4 largest classes (\u201cWorld\u201d, \u201cSports\u201d, \u201cBusiness\u201d, \u201cSci/Tech\u201d) of AG\u2019s Corpus. The AG News contains 30,000 training and 1,900 test samples per class.", - "paper_name": "Character-level Convolutional Networks for Text Classification", - "paper_abstract": "This article offers an empirical exploration on the use of character-level\nconvolutional networks (ConvNets) for text classification. We constructed\nseveral large-scale datasets to show that character-level convolutional\nnetworks could achieve state-of-the-art or competitive results. Comparisons are\noffered against traditional models such as bag of words, n-grams and their\nTFIDF variants, and deep learning models such as word-based ConvNets and\nrecurrent neural networks." - }, - "squad_v2": { - "pwc_id": "squad", - "dataset_name": "SQuAD Dataset", - "dataset_abstract": "The Stanford Question Answering Dataset (SQuAD) is a collection of question-answer pairs derived from Wikipedia articles. In SQuAD, the correct answers of questions can be any sequence of tokens in the given text. Because the questions and answers are produced by humans through crowdsourcing, it is more diverse than some other question-answering datasets. SQuAD 1.1 contains 107,785 question-answer pairs on 536 articles. 
SQuAD2.0 (open-domain SQuAD, SQuAD-Open), the latest version, combines the 100,000 questions in SQuAD1.1 with over 50,000 un-answerable questions written adversarially by crowdworkers in forms that are similar to the answerable ones.", - "paper_name": "SQuAD: 100,000+ Questions for Machine Comprehension of Text", - "paper_abstract": "We present the Stanford Question Answering Dataset (SQuAD), a new reading\ncomprehension dataset consisting of 100,000+ questions posed by crowdworkers on\na set of Wikipedia articles, where the answer to each question is a segment of\ntext from the corresponding reading passage. We analyze the dataset to\nunderstand the types of reasoning required to answer the questions, leaning\nheavily on dependency and constituency trees. We build a strong logistic\nregression model, which achieves an F1 score of 51.0%, a significant\nimprovement over a simple baseline (20%). However, human performance (86.8%) is\nmuch higher, indicating that the dataset presents a good challenge problem for\nfuture research.\n The dataset is freely available at https://stanford-qa.com" - }, - "anli": { - "pwc_id": "anli", - "dataset_name": "ANLI Dataset", - "dataset_abstract": "The Adversarial Natural Language Inference (ANLI, Nie et al.) is a new large-scale NLI benchmark dataset, collected via an iterative, adversarial human-and-model-in-the-loop procedure. Particular, the data is selected to be difficult to the state-of-the-art models, including BERT and RoBERTa.", - "paper_name": "Adversarial NLI: A New Benchmark for Natural Language Understanding", - "paper_abstract": "We introduce a new large-scale NLI benchmark dataset, collected via an iterative, adversarial human-and-model-in-the-loop procedure. We show that training models on this new dataset leads to state-of-the-art performance on a variety of popular NLI benchmarks, while posing a more difficult challenge with its new test set. Our analysis sheds light on the shortcomings of current state-of-the-art models, and shows that non-expert annotators are successful at finding their weaknesses. The data collection method can be applied in a never-ending learning scenario, becoming a moving target for NLU, rather than a static benchmark that will quickly saturate." - }, - "xsum": { - "pwc_id": "xsum", - "dataset_name": "XSum Dataset", - "dataset_abstract": "The Extreme Summarization (XSum) dataset is a dataset for evaluation of abstractive single-document summarization systems. The goal is to create a short, one-sentence new summary answering the question \u201cWhat is the article about?\u201d. The dataset consists of 226,711 news articles accompanied with a one-sentence summary. The articles are collected from BBC articles (2010 to 2017) and cover a wide variety of domains (e.g., News, Politics, Sports, Weather, Business, Technology, Science, Health, Family, Education, Entertainment and Arts). The official random split contains 204,045 (90%), 11,332 (5%) and 11,334 (5) documents in training, validation and test sets, respectively.", - "paper_name": "Don't Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization", - "paper_abstract": "We introduce extreme summarization, a new single-document summarization task\nwhich does not favor extractive strategies and calls for an abstractive\nmodeling approach. The idea is to create a short, one-sentence news summary\nanswering the question \"What is the article about?\". 
We collect a real-world,\nlarge-scale dataset for this task by harvesting online articles from the\nBritish Broadcasting Corporation (BBC). We propose a novel abstractive model\nwhich is conditioned on the article's topics and based entirely on\nconvolutional neural networks. We demonstrate experimentally that this\narchitecture captures long-range dependencies in a document and recognizes\npertinent content, outperforming an oracle extractive system and\nstate-of-the-art abstractive approaches when evaluated automatically and by\nhumans." - }, - "librispeech_asr": { - "pwc_id": "librispeech-1", - "dataset_name": "LibriSpeech Dataset", - "dataset_abstract": "The LibriSpeech corpus is a collection of approximately 1,000 hours of audiobooks that are a part of the LibriVox project. Most of the audiobooks come from the Project Gutenberg. The training data is split into 3 partitions of 100hr, 360hr, and 500hr sets while the dev and test data are split into the \u2019clean\u2019 and \u2019other\u2019 categories, respectively, depending upon how well or challening Automatic Speech Recognition systems would perform against. Each of the dev and test sets is around 5hr in audio length. This corpus also provides the n-gram language models and the corresponding texts excerpted from the Project Gutenberg books, which contain 803M tokens and 977K unique words.", - "paper_name": "", - "paper_abstract": "" - }, - "math_dataset": { - "pwc_id": "mathematics", - "dataset_name": "Mathematics Dataset Dataset", - "dataset_abstract": "This dataset code generates mathematical question and answer pairs, from a range of question types at roughly school-level difficulty. This is designed to test the mathematical learning and algebraic reasoning skills of learning models.", - "paper_name": "Analysing Mathematical Reasoning Abilities of Neural Models", - "paper_abstract": "Mathematical reasoning---a core ability within human intelligence---presents\nsome unique challenges as a domain: we do not come to understand and solve\nmathematical problems primarily on the back of experience and evidence, but on\nthe basis of inferring, learning, and exploiting laws, axioms, and symbol\nmanipulation rules. In this paper, we present a new challenge for the\nevaluation (and eventually the design) of neural architectures and similar\nsystem, developing a task suite of mathematics problems involving sequential\nquestions and answers in a free-form textual input/output format. The\nstructured nature of the mathematics domain, covering arithmetic, algebra,\nprobability and calculus, enables the construction of training and test splits\ndesigned to clearly illuminate the capabilities and failure-modes of different\narchitectures, as well as evaluate their ability to compose and relate\nknowledge and learned processes. Having described the data generation process\nand its potential future expansions, we conduct a comprehensive analysis of\nmodels from two broad classes of the most powerful sequence-to-sequence\narchitectures and find notable differences in their ability to resolve\nmathematical problems and generalize their knowledge." - }, - "xtreme": { - "pwc_id": "xtreme", - "dataset_name": "XTREME Dataset", - "dataset_abstract": "The Cross-lingual TRansfer Evaluation of Multilingual Encoders (XTREME) benchmark was introduced to encourage more research on multilingual transfer learning,. 
XTREME covers 40 typologically diverse languages spanning 12 language families and includes 9 tasks that require reasoning about different levels of syntax or semantics.\n\nThe languages in XTREME are selected to maximize language diversity, coverage in existing tasks, and availability of training data. The languages in XTREME are selected to maximize language diversity, coverage in existing tasks, and availability of training data. Among these are many under-studied languages, such as the Dravidian languages Tamil (spoken in southern India, Sri Lanka, and Singapore), Telugu and Malayalam (spoken mainly in southern India), and the Niger-Congo languages Swahili and Yoruba, spoken in Africa.", - "paper_name": "XTREME: A Massively Multilingual Multi-task Benchmark for Evaluating Cross-lingual Generalisation", - "paper_abstract": "Much recent progress in applications of machine learning models to NLP has been driven by benchmarks that evaluate models across a wide variety of tasks. However, these broad-coverage benchmarks have been mostly limited to English, and despite an increasing interest in multilingual models, a benchmark that enables the comprehensive evaluation of such methods on a diverse range of languages and tasks is still missing. To this end, we introduce the Cross-lingual TRansfer Evaluation of Multilingual Encoders (XTREME) benchmark, a multi-task benchmark for evaluating the cross-lingual generalization capabilities of multilingual representations across 40 languages and 9 tasks. We demonstrate that while models tested on English reach human performance on many tasks, there is still a sizable gap in the performance of cross-lingually transferred models, particularly on syntactic and sentence retrieval tasks. There is also a wide spread of results across languages. We will release the benchmark to encourage research on cross-lingual learning methods that transfer linguistic knowledge across a diverse and representative set of languages and tasks." - }, - "cnn_dailymail": { - "pwc_id": "cnn-daily-mail-1", - "dataset_name": "CNN/Daily Mail Dataset", - "dataset_abstract": "CNN/Daily Mail is a dataset for text summarization. Human generated abstractive summary bullets were generated from news stories in CNN and Daily Mail websites as questions (with one of the entities hidden), and stories as the corresponding passages from which the system is expected to answer the fill-in the-blank question. The authors released the scripts that crawl, extract and generate pairs of passages and questions from these websites.\n\nIn all, the corpus has 286,817 training pairs, 13,368 validation pairs and 11,487 test pairs, as defined by their scripts. The source documents in the training set have 766 words spanning 29.74 sentences on an average while the summaries consist of 53 words and 3.72 sentences.", - "paper_name": "Abstractive Text Summarization Using Sequence-to-Sequence RNNs and Beyond", - "paper_abstract": "In this work, we model abstractive text summarization using Attentional\nEncoder-Decoder Recurrent Neural Networks, and show that they achieve\nstate-of-the-art performance on two different corpora. We propose several novel\nmodels that address critical problems in summarization that are not adequately\nmodeled by the basic architecture, such as modeling key-words, capturing the\nhierarchy of sentence-to-word structure, and emitting words that are rare or\nunseen at training time. Our work shows that many of our proposed models\ncontribute to further improvement in performance. 
We also propose a new dataset\nconsisting of multi-sentence summaries, and establish performance benchmarks\nfor further research." - }, - "conll2003": { - "pwc_id": "conll-2003", - "dataset_name": "CoNLL-2003 Dataset", - "dataset_abstract": "CoNLL-2003 is a named entity recognition dataset released as a part of CoNLL-2003 shared task: language-independent named entity recognition.\nThe data consists of eight files covering two languages: English and German.\nFor each of the languages there is a training file, a development file, a test file and a large file with unannotated data.\n\nThe English data was taken from the Reuters Corpus. This corpus consists of Reuters news stories between August 1996 and August 1997.\nFor the training and development set, ten days worth of data were taken from the files representing the end of August 1996.\nFor the test set, the texts were from December 1996. The preprocessed raw data covers the month of September 1996.\n\nThe text for the German data was taken from the ECI Multilingual Text Corpus. This corpus consists of texts in many languages. The portion of data that\nwas used for this task, was extracted from the German newspaper Frankfurter Rundshau. All three of the training, development and test sets were taken\nfrom articles written in one week at the end of August 1992.\nThe raw data were taken from the months of September to December 1992.\n\n| English data | Articles | Sentences | Tokens | LOC | MISC | ORG | PER |\n|-------------------|----------|-----------|---------|------|------|------|------|\n| Training set | 946 | 14,987 | 203,621 | 7140 | 3438 | 6321 | 6600 |\n| Development set | 216 | 3,466 | 51,362 | 1837 | 922 | 1341 | 1842 |\n| Test set | 231 | 3,684 | 46,435 | 1668 | 702 | 1661 | 1617 |\n\nNumber of articles, sentences, tokens and entities (locations, miscellaneous, organizations, and persons) in English data files.\n\n| German data | Articles | Sentences | Tokens | LOC | MISC | ORG | PER |\n|-------------------|----------|-----------|---------|------|------|------|------|\n| Training set | 553 | 12,705 | 206,931 | 4363 | 2288 | 2427 | 2773 |\n| Development set | 201 | 3,068 | 51,444 | 1181 | 1010 | 1241 | 1401 |\n| Test set | 155 | 3,160 | 51,943 | 1035 | 670 | 773 | 1195 |\n\nNumber of articles, sentences, tokens and entities (locations, miscellaneous, organizations, and persons) in German data files.", - "paper_name": "Introduction to the CoNLL-2003 Shared Task: Language-Independent Named Entity Recognition", - "paper_abstract": "We describe the CoNLL-2003 shared task: language-independent named entity recognition. We give background information on the data sets (English and German) and the evaluation method, present a general overview of the systems that have taken part in the task and discuss their performance." 
- }, - "kilt_tasks": { - "pwc_id": "kilt", - "dataset_name": "KILT Dataset", - "dataset_abstract": "KILT (Knowledge Intensive Language Tasks) is a benchmark consisting of 11 datasets representing 5 types of tasks:\n\n\nFact-checking (FEVER),\nEntity linking (AIDA CoNLL-YAGO, WNED-WIKI, WNED-CWEB),\nSlot filling (T-Rex, Zero Shot RE),\nOpen domain QA (Natural Questions, HotpotQA, TriviaQA, ELI5),\nDialog generation (Wizard of Wikipedia).\n\nAll these datasets have been grounded in a single pre-processed wikipedia snapshot, allowing for fairer and more consistent evaluation as well as enabling new task setups such as multitask and transfer learning.", - "paper_name": "KILT: a Benchmark for Knowledge Intensive Language Tasks", - "paper_abstract": "Challenging problems such as open-domain question answering, fact checking, slot filling and entity linking require access to large, external knowledge sources. While some models do well on individual tasks, developing general models is difficult as each task might require computationally expensive indexing of custom knowledge sources, in addition to dedicated infrastructure. To catalyze research on models that condition on specific information in large textual resources, we present a benchmark for knowledge-intensive language tasks (KILT). All tasks in KILT are grounded in the same snapshot of Wikipedia, reducing engineering turnaround through the re-use of components, as well as accelerating research into task-agnostic memory architectures. We test both task-specific and general baselines, evaluating downstream performance in addition to the ability of the models to provide provenance. We find that a shared dense vector index coupled with a seq2seq model is a strong baseline, outperforming more tailor-made approaches for fact checking, open-domain question answering and dialogue, and yielding competitive results on entity linking and slot filling, by generating disambiguated text. KILT data and code are available at https://github.com/facebookresearch/KILT." - }, - "adversarial_qa": { - "pwc_id": "adversarialqa", - "dataset_name": "AdversarialQA Dataset", - "dataset_abstract": "We have created three new Reading Comprehension datasets constructed using an adversarial model-in-the-loop.\n\nWe use three different models; BiDAF (Seo et al., 2016), BERTLarge (Devlin et al., 2018), and RoBERTaLarge (Liu et al., 2019) in the annotation loop and construct three datasets; D(BiDAF), D(BERT), and D(RoBERTa), each with 10,000 training examples, 1,000 validation, and 1,000 test examples.\n\nThe adversarial human annotation paradigm ensures that these datasets consist of questions that current state-of-the-art models (at least the ones used as adversaries in the annotation loop) find challenging. The three AdversarialQA round 1 datasets provide a training and evaluation resource for such methods.", - "paper_name": "Beat the AI: Investigating Adversarial Human Annotation for Reading Comprehension", - "paper_abstract": "Innovations in annotation methodology have been a catalyst for Reading Comprehension (RC) datasets and models. One recent trend to challenge current RC models is to involve a model in the annotation process: humans create questions adversarially, such that the model fails to answer them correctly. In this work we investigate this annotation methodology and apply it in three different settings, collecting a total of 36,000 samples with progressively stronger models in the annotation loop. 
This allows us to explore questions such as the reproducibility of the adversarial effect, transfer from data collected with varying model-in-the-loop strengths, and generalisation to data collected without a model. We find that training on adversarially collected samples leads to strong generalisation to non-adversarially collected datasets, yet with progressive performance deterioration with increasingly stronger models-in-the-loop. Furthermore, we find that stronger models can still learn from datasets collected with substantially weaker models-in-the-loop. When trained on data collected with a BiDAF model in the loop, RoBERTa achieves 39.9F1 on questions that it cannot answer when trained on SQuAD - only marginally lower than when trained on data collected using RoBERTa itself (41.0F1)." - }, - "common_voice": { - "pwc_id": "common-voice", - "dataset_name": "Common Voice Dataset", - "dataset_abstract": "Common Voice is an audio dataset that consists of a unique MP3 and corresponding text file. There are 9,283 recorded hours in the dataset. The dataset also includes demographic metadata like age, sex, and accent. The dataset consists of 7,335 validated hours in 60 languages.", - "paper_name": "Common Voice: A Massively-Multilingual Speech Corpus", - "paper_abstract": "The Common Voice corpus is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Common Voice is designed for Automatic Speech Recognition purposes but can be useful in other domains (e.g. language identification). To achieve scale and sustainability, the Common Voice project employs crowdsourcing for both data collection and data validation. The most recent release includes 29 languages, and as of November 2019 there are a total of 38 languages collecting data. Over 50,000 individuals have participated so far, resulting in 2,500 hours of collected audio. To our knowledge this is the largest audio corpus in the public domain for speech recognition, both in terms of number of hours and number of languages. As an example use case for Common Voice, we present speech recognition experiments using Mozilla's DeepSpeech Speech-to-Text toolkit. By applying transfer learning from a source English model, we find an average Character Error Rate improvement of 5.99 +/- 5.48 for twelve target languages (German, French, Italian, Turkish, Catalan, Slovenian, Welsh, Irish, Breton, Tatar, Chuvash, and Kabyle). For most of these languages, these are the first ever published results on end-to-end Automatic Speech Recognition." - }, - "oscar": { - "pwc_id": "oscar", - "dataset_name": "OSCAR Dataset", - "dataset_abstract": "OSCAR or Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture. The dataset used for training multilingual models such as BART incorporates 138 GB of text.", - "paper_name": "A Monolingual Approach to Contextualized Word Embeddings for Mid-Resource Languages", - "paper_abstract": "We use the multilingual OSCAR corpus, extracted from Common Crawl via language classification, filtering and cleaning, to train monolingual contextualized word embeddings (ELMo) for five mid-resource languages. We then compare the performance of OSCAR-based and Wikipedia-based ELMo embeddings for these languages on the part-of-speech tagging and parsing tasks. 
We show that, despite the noise in the Common-Crawl-based OSCAR data, embeddings trained on OSCAR perform much better than monolingual embeddings trained on Wikipedia. They actually equal or improve the current state of the art in tagging and parsing for all five languages. In particular, they also improve over multilingual Wikipedia-based contextual embeddings (multilingual BERT), which almost always constitutes the previous state of the art, thereby showing that the benefit of a larger, more diverse corpus surpasses the cross-lingual benefit of multilingual embedding architectures." - }, - "trec": { - "pwc_id": "trecqa", - "dataset_name": "TrecQA Dataset", - "dataset_abstract": "Text Retrieval Conference Question Answering (TrecQA) is a dataset created from the TREC-8 (1999) to TREC-13 (2004) Question Answering tracks. There are two versions of TrecQA: raw and clean. Both versions have the same training set but their development and test sets differ. The commonly used clean version of the dataset excludes questions in development and test sets with no answers or only positive/negative answers. The clean version has 1,229/65/68 questions and 53,417/1,117/1,442 question-answer pairs for the train/dev/test split.", - "paper_name": "", - "paper_abstract": "" - }, - "rotten_tomatoes": { - "pwc_id": "mr", - "dataset_name": "MR Dataset", - "dataset_abstract": "MR Movie Reviews is a dataset for use in sentiment-analysis experiments. Available are collections of movie-review documents labeled with respect to their overall sentiment polarity (positive or negative) or subjective rating (e.g., \"two and a half stars\") and sentences labeled with respect to their subjectivity status (subjective or objective) or polarity.", - "paper_name": "", - "paper_abstract": "" - }, - "race": { - "pwc_id": "race", - "dataset_name": "RACE Dataset", - "dataset_abstract": "The ReAding Comprehension dataset from Examinations (RACE) dataset is a machine reading comprehension dataset consisting of 27,933 passages and 97,867 questions from English exams, targeting Chinese students aged 12-18. RACE consists of two subsets, RACE-M and RACE-H, from middle school and high school exams, respectively. RACE-M has 28,293 questions and RACE-H has 69,574. Each question is associated with 4 candidate answers, one of which is correct. The data generation process of RACE differs from most machine reading comprehension datasets - instead of generating questions and answers by heuristics or crowd-sourcing, questions in RACE are specifically designed for testing human reading skills, and are created by domain experts.", - "paper_name": "RACE: Large-scale ReAding Comprehension Dataset From Examinations", - "paper_abstract": "We present RACE, a new dataset for benchmark evaluation of methods in the\nreading comprehension task. Collected from the English exams for middle and\nhigh school Chinese students in the age range between 12 to 18, RACE consists\nof near 28,000 passages and near 100,000 questions generated by human experts\n(English instructors), and covers a variety of topics which are carefully\ndesigned for evaluating the students' ability in understanding and reasoning.\nIn particular, the proportion of questions that requires reasoning is much\nlarger in RACE than that in other benchmark datasets for reading comprehension,\nand there is a significant gap between the performance of the state-of-the-art\nmodels (43%) and the ceiling human performance (95%). 
We hope this new dataset\ncan serve as a valuable resource for research and evaluation in machine\ncomprehension. The dataset is freely available at\nhttp://www.cs.cmu.edu/~glai1/data/race/ and the code is available at\nhttps://github.com/qizhex/RACE_AR_baselines." - }, - "c4": { - "pwc_id": "c4", - "dataset_name": "C4 Dataset", - "dataset_abstract": "C4 is a colossal, cleaned version of Common Crawl's web crawl corpus. It was based on Common Crawl dataset: https://commoncrawl.org. It was used to train the T5 text-to-text Transformer models.\n\nThe dataset can be downloaded in a pre-processed form from allennlp.", - "paper_name": "Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer", - "paper_abstract": "Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts all text-based language problems into a text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled data sets, transfer approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration with scale and our new ``Colossal Clean Crawled Corpus'', we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. To facilitate future work on transfer learning for NLP, we release our data set, pre-trained models, and code." - }, - "paws": { - "pwc_id": "paws", - "dataset_name": "PAWS Dataset", - "dataset_abstract": "Paraphrase Adversaries from Word Scrambling (PAWS) is a dataset contains 108,463 human-labeled and 656k noisily labeled pairs that feature the importance of modeling structure, context, and word order information for the problem of paraphrase identification. The dataset has two subsets, one based on Wikipedia and the other one based on the Quora Question Pairs (QQP) dataset.", - "paper_name": "PAWS: Paraphrase Adversaries from Word Scrambling", - "paper_abstract": "Existing paraphrase identification datasets lack sentence pairs that have\nhigh lexical overlap without being paraphrases. Models trained on such data\nfail to distinguish pairs like flights from New York to Florida and flights\nfrom Florida to New York. This paper introduces PAWS (Paraphrase Adversaries\nfrom Word Scrambling), a new dataset with 108,463 well-formed paraphrase and\nnon-paraphrase pairs with high lexical overlap. Challenging pairs are generated\nby controlled word swapping and back translation, followed by fluency and\nparaphrase judgments by human raters. State-of-the-art models trained on\nexisting datasets have dismal performance on PAWS (<40% accuracy); however,\nincluding PAWS training data for these models improves their accuracy to 85%\nwhile maintaining performance on existing tasks. In contrast, models that do\nnot capture non-local contextual information fail even with PAWS training\nexamples. As such, PAWS provides an effective instrument for driving further\nprogress on models that better exploit structure, context, and pairwise\ncomparisons." 
- }, - "klue": { - "pwc_id": "klue", - "dataset_name": "KLUE Dataset", - "dataset_abstract": "Korean Language Understanding Evaluation (KLUE) benchmark is a series of datasets to evaluate natural language understanding capability of Korean language models. KLUE consists of 8 diverse and representative tasks, which are accessible to anyone without any restrictions. With ethical considerations in mind, we deliberately design annotation guidelines to obtain unambiguous annotations for all datasets. Furthermore, we build an evaluation system and carefully choose evaluations metrics for every task, thus establishing fair comparison across Korean language models.\n\nKLUE benchmark is composed of 8 tasks:\n\n\nTopic Classification (TC)\nSentence Textual Similarity (STS)\nNatural Language Inference (NLI)\nNamed Entity Recognition (NER)\nRelation Extraction (RE)\n(Part-Of-Speech) + Dependency Parsing (DP)\nMachine Reading Comprehension (MRC)\nDialogue State Tracking (DST)", - "paper_name": "KLUE: Korean Language Understanding Evaluation", - "paper_abstract": "We introduce Korean Language Understanding Evaluation (KLUE) benchmark. KLUE is a collection of 8 Korean natural language understanding (NLU) tasks, including Topic Classification, SemanticTextual Similarity, Natural Language Inference, Named Entity Recognition, Relation Extraction, Dependency Parsing, Machine Reading Comprehension, and Dialogue State Tracking. We build all of the tasks from scratch from diverse source corpora while respecting copyrights, to ensure accessibility for anyone without any restrictions. With ethical considerations in mind, we carefully design annotation protocols. Along with the benchmark tasks and data, we provide suitable evaluation metrics and fine-tuning recipes for pretrained language models for each task. We furthermore release the pretrained language models (PLM), KLUE-BERT and KLUE-RoBERTa, to help reproducing baseline models on KLUE and thereby facilitate future research. We make a few interesting observations from the preliminary experiments using the proposed KLUE benchmark suite, already demonstrating the usefulness of this new benchmark suite. First, we find KLUE-RoBERTa-large outperforms other baselines, including multilingual PLMs and existing open-source Korean PLMs. Second, we see minimal degradation in performance even when we replace personally identifiable information from the pretraining corpus, suggesting that privacy and NLU capability are not at odds with each other. Lastly, we find that using BPE tokenization in combination with morpheme-level pre-tokenization is effective in tasks involving morpheme-level tagging, detection and generation. In addition to accelerating Korean NLP research, our comprehensive documentation on creating KLUE will facilitate creating similar resources for other languages in the future. KLUE is available at https://klue-benchmark.com." - }, - "snli": { - "pwc_id": "snli", - "dataset_name": "SNLI Dataset", - "dataset_abstract": "The SNLI dataset (Stanford Natural Language Inference) consists of 570k sentence-pairs manually labeled as entailment, contradiction, and neutral. Premises are image captions from Flickr30k, while hypotheses were generated by crowd-sourced annotators who were shown a premise and asked to generate entailing, contradicting, and neutral sentences. Annotators were instructed to judge the relation between sentences given that they describe the same event. 
Each pair is labeled as \u201centailment\u201d, \u201cneutral\u201d, \u201ccontradiction\u201d or \u201c-\u201d, where \u201c-\u201d indicates that an agreement could not be reached.", - "paper_name": "A large annotated corpus for learning natural language inference", - "paper_abstract": "Understanding entailment and contradiction is fundamental to understanding\nnatural language, and inference about entailment and contradiction is a\nvaluable testing ground for the development of semantic representations.\nHowever, machine learning research in this area has been dramatically limited\nby the lack of large-scale resources. To address this, we introduce the\nStanford Natural Language Inference corpus, a new, freely available collection\nof labeled sentence pairs, written by humans doing a novel grounded task based\non image captioning. At 570K pairs, it is two orders of magnitude larger than\nall other resources of its type. This increase in scale allows lexicalized\nclassifiers to outperform some sophisticated existing entailment models, and it\nallows a neural network-based model to perform competitively on natural\nlanguage inference benchmarks for the first time." - }, - "winogrande": { - "pwc_id": "winogrande", - "dataset_name": "WinoGrande Dataset", - "dataset_abstract": "WinoGrande is a large-scale dataset of 44k problems, inspired by the original WSC design, but adjusted to improve both the scale and the hardness of the dataset. The key steps of the dataset construction consist of (1) a carefully designed crowdsourcing procedure, followed by (2) systematic bias reduction using a novel AfLite algorithm that generalizes human-detectable word associations to machine-detectable embedding associations.", - "paper_name": "WinoGrande: An Adversarial Winograd Schema Challenge at Scale", - "paper_abstract": "The Winograd Schema Challenge (WSC) (Levesque, Davis, and Morgenstern 2011), a benchmark for commonsense reasoning, is a set of 273 expert-crafted pronoun resolution problems originally designed to be unsolvable for statistical models that rely on selectional preferences or word associations. However, recent advances in neural language models have already reached around 90% accuracy on variants of WSC. This raises an important question whether these models have truly acquired robust commonsense capabilities or whether they rely on spurious biases in the datasets that lead to an overestimation of the true capabilities of machine commonsense. To investigate this question, we introduce WinoGrande, a large-scale dataset of 44k problems, inspired by the original WSC design, but adjusted to improve both the scale and the hardness of the dataset. The key steps of the dataset construction consist of (1) a carefully designed crowdsourcing procedure, followed by (2) systematic bias reduction using a novel AfLite algorithm that generalizes human-detectable word associations to machine-detectable embedding associations. The best state-of-the-art methods on WinoGrande achieve 59.4-79.1%, which are 15-35% below human performance of 94.0%, depending on the amount of the training data allowed. Furthermore, we establish new state-of-the-art results on five related benchmarks - WSC (90.1%), DPR (93.1%), COPA (90.6%), KnowRef (85.6%), and Winogender (97.1%). These results have dual implications: on one hand, they demonstrate the effectiveness of WinoGrande when used as a resource for transfer learning. 
On the other hand, they raise a concern that we are likely to be overestimating the true capabilities of machine commonsense across all these benchmarks. We emphasize the importance of algorithmic bias reduction in existing and future benchmarks to mitigate such overestimation." - }, - "cosmos_qa": { - "pwc_id": "cosmosqa", - "dataset_name": "CosmosQA Dataset", - "dataset_abstract": "CosmosQA is a large-scale dataset of 35.6K problems that require commonsense-based reading comprehension, formulated as multiple-choice questions. It focuses on reading between the lines over a diverse collection of people\u2019s everyday narratives, asking questions concerning on the likely causes or effects of events that require reasoning beyond the exact text spans in the context.", - "paper_name": "Cosmos QA: Machine Reading Comprehension with Contextual Commonsense Reasoning", - "paper_abstract": "Understanding narratives requires reading between the lines, which in turn, requires interpreting the likely causes and effects of events, even when they are not mentioned explicitly. In this paper, we introduce Cosmos QA, a large-scale dataset of 35,600 problems that require commonsense-based reading comprehension, formulated as multiple-choice questions. In stark contrast to most existing reading comprehension datasets where the questions focus on factual and literal understanding of the context paragraph, our dataset focuses on reading between the lines over a diverse collection of people's everyday narratives, asking such questions as \"what might be the possible reason of ...?\", or \"what would have happened if ...\" that require reasoning beyond the exact text spans in the context. To establish baseline performances on Cosmos QA, we experiment with several state-of-the-art neural architectures for reading comprehension, and also propose a new architecture that improves over the competitive baselines. Experimental results demonstrate a significant gap between machine (68.4%) and human performance (94%), pointing to avenues for future research on commonsense machine comprehension. Dataset, code and leaderboard is publicly available at https://wilburone.github.io/cosmos." - }, - "wino_bias": { - "pwc_id": "winobias", - "dataset_name": "WinoBias Dataset", - "dataset_abstract": "WinoBias contains 3,160 sentences, split equally for development and test, created by researchers familiar with the project. Sentences were created to follow two prototypical templates but annotators were encouraged to come up with scenarios where entities could be interacting in plausible ways. Templates were selected to be challenging and designed to cover cases requiring semantics and syntax separately.", - "paper_name": "Gender Bias in Coreference Resolution: Evaluation and Debiasing Methods", - "paper_abstract": "We introduce a new benchmark, WinoBias, for coreference resolution focused on\ngender bias. Our corpus contains Winograd-schema style sentences with entities\ncorresponding to people referred by their occupation (e.g. the nurse, the\ndoctor, the carpenter). We demonstrate that a rule-based, a feature-rich, and a\nneural coreference system all link gendered pronouns to pro-stereotypical\nentities with higher accuracy than anti-stereotypical entities, by an average\ndifference of 21.1 in F1 score. 
Finally, we demonstrate a data-augmentation\napproach that, in combination with existing word-embedding debiasing\ntechniques, removes the bias demonstrated by these systems in WinoBias without\nsignificantly affecting their performance on existing coreference benchmark\ndatasets. Our dataset and code are available at http://winobias.org." - }, - "sst": { - "pwc_id": "sst", - "dataset_name": "SST Dataset", - "dataset_abstract": "The Stanford Sentiment Treebank is a corpus with fully labeled parse trees that allows for a\ncomplete analysis of the compositional effects of\nsentiment in language. The corpus is based on\nthe dataset introduced by Pang and Lee (2005) and\nconsists of 11,855 single sentences extracted from\nmovie reviews. It was parsed with the Stanford\nparser and includes a total of 215,154 unique phrases\nfrom those parse trees, each annotated by 3 human judges.\n\nEach phrase is labelled as either negative, somewhat negative, neutral, somewhat positive or positive.\nThe corpus with all 5 labels is referred to as SST-5 or SST fine-grained. Binary classification experiments on full sentences (negative or somewhat negative vs somewhat positive or positive with neutral sentences discarded) refer to the dataset as SST-2 or SST binary.", - "paper_name": "", - "paper_abstract": "" - }, - "duorc": { - "pwc_id": "duorc", - "dataset_name": "DuoRC Dataset", - "dataset_abstract": "DuoRC contains 186,089 unique question-answer pairs created from a collection of 7680 pairs of movie plots where each pair in the collection reflects two versions of the same movie.\n\nWhy another RC dataset?\n\nDuoRC pushes the NLP community to address challenges on incorporating knowledge and reasoning in neural architectures for reading comprehension. It poses several interesting challenges such as:\n\n\nDuoRC using parallel plots is especially designed to contain a large number of questions with low lexical overlap between questions and their corresponding passages\nIt requires models to go beyond the content of the given passage itself and incorporate world-knowledge, background knowledge, and common-sense knowledge to arrive at the answer\nIt revolves around narrative passages from movie plots describing complex events and therefore naturally requires complex reasoning (e.g. temporal reasoning, entailment, long-distance anaphoras, etc.) across multiple sentences to infer the answer to questions\nSeveral of the questions in DuoRC, while seeming relevant, cannot actually be answered from the given passage. This requires the model to detect the unanswerability of questions. This aspect is important for machines to achieve in industrial settings in particular", - "paper_name": "DuoRC: Towards Complex Language Understanding with Paraphrased Reading Comprehension", - "paper_abstract": "We propose DuoRC, a novel dataset for Reading Comprehension (RC) that\nmotivates several new challenges for neural approaches in language\nunderstanding beyond those offered by existing RC datasets. DuoRC contains\n186,089 unique question-answer pairs created from a collection of 7680 pairs of\nmovie plots where each pair in the collection reflects two versions of the same\nmovie - one from Wikipedia and the other from IMDb - written by two different\nauthors. We asked crowdsourced workers to create questions from one version of\nthe plot and a different set of workers to extract or synthesize answers from\nthe other version.
This unique characteristic of DuoRC where questions and\nanswers are created from different versions of a document narrating the same\nunderlying story, ensures by design, that there is very little lexical overlap\nbetween the questions created from one version and the segments containing the\nanswer in the other version. Further, since the two versions have different\nlevels of plot detail, narration style, vocabulary, etc., answering questions\nfrom the second version requires deeper language understanding and\nincorporating external background knowledge. Additionally, the narrative style\nof passages arising from movie plots (as opposed to typical descriptive\npassages in existing datasets) exhibits the need to perform complex reasoning\nover events across multiple sentences. Indeed, we observe that state-of-the-art\nneural RC models which have achieved near human performance on the SQuAD\ndataset, even when coupled with traditional NLP techniques to address the\nchallenges presented in DuoRC exhibit very poor performance (F1 score of 37.42%\non DuoRC v/s 86% on SQuAD dataset). This opens up several interesting research\navenues wherein DuoRC could complement other RC datasets to explore novel\nneural approaches for studying language understanding." - }, - "quoref": { - "pwc_id": "quoref", - "dataset_name": "Quoref Dataset", - "dataset_abstract": "Quoref is a QA dataset which tests the coreferential reasoning capability of reading comprehension systems. In this span-selection benchmark containing 24K questions over 4.7K paragraphs from Wikipedia, a system must resolve hard coreferences before selecting the appropriate span(s) in the paragraphs for answering questions.", - "paper_name": "Quoref: A Reading Comprehension Dataset with Questions Requiring Coreferential Reasoning", - "paper_abstract": "Machine comprehension of texts longer than a single sentence often requires coreference resolution. However, most current reading comprehension benchmarks do not contain complex coreferential phenomena and hence fail to evaluate the ability of models to resolve coreference. We present a new crowdsourced dataset containing more than 24K span-selection questions that require resolving coreference among entities in over 4.7K English paragraphs from Wikipedia. Obtaining questions focused on such phenomena is challenging, because it is hard to avoid lexical cues that shortcut complex reasoning. We deal with this issue by using a strong baseline model as an adversary in the crowdsourcing loop, which helps crowdworkers avoid writing questions with exploitable surface cues. We show that state-of-the-art reading comprehension models perform significantly worse than humans on this benchmark---the best model performance is 70.5 F1, while the estimated human performance is 93.4 F1." - }, - "esnli": { - "pwc_id": "e-snli", - "dataset_name": "e-SNLI Dataset", - "dataset_abstract": "e-SNLI is used for various goals, such as obtaining full sentence justifications of a model's decisions, improving universal sentence representations and transferring to out-of-domain NLI datasets.", - "paper_name": "e-SNLI: Natural Language Inference with Natural Language Explanations", - "paper_abstract": "In order for machine learning to garner widespread public adoption, models\nmust be able to provide interpretable and robust explanations for their\ndecisions, as well as learn from human-provided explanations at train time. 
In\nthis work, we extend the Stanford Natural Language Inference dataset with an\nadditional layer of human-annotated natural language explanations of the\nentailment relations. We further implement models that incorporate these\nexplanations into their training process and output them at test time. We show\nhow our corpus of explanations, which we call e-SNLI, can be used for various\ngoals, such as obtaining full sentence justifications of a model's decisions,\nimproving universal sentence representations and transferring to out-of-domain\nNLI datasets. Our dataset thus opens up a range of research directions for\nusing natural language explanations, both for improving models and for\nasserting their trust." - }, - "hellaswag": { - "pwc_id": "hellaswag", - "dataset_name": "HellaSwag Dataset", - "dataset_abstract": "HellaSwag is a challenge dataset for evaluating commonsense NLI that is specially hard for state-of-the-art models, though its questions are trivial for humans (>95% accuracy).", - "paper_name": "HellaSwag: Can a Machine Really Finish Your Sentence?", - "paper_abstract": "Recent work by Zellers et al. (2018) introduced a new task of commonsense natural language inference: given an event description such as \"A woman sits at a piano,\" a machine must select the most likely followup: \"She sets her fingers on the keys.\" With the introduction of BERT, near human-level performance was reached. Does this mean that machines can perform human level commonsense inference? In this paper, we show that commonsense inference still proves difficult for even state-of-the-art models, by presenting HellaSwag, a new challenge dataset. Though its questions are trivial for humans (>95% accuracy), state-of-the-art models struggle (<48%). We achieve this via Adversarial Filtering (AF), a data collection paradigm wherein a series of discriminators iteratively select an adversarial set of machine-generated wrong answers. AF proves to be surprisingly robust. The key insight is to scale up the length and complexity of the dataset examples towards a critical 'Goldilocks' zone wherein generated text is ridiculous to humans, yet often misclassified by state-of-the-art models. Our construction of HellaSwag, and its resulting difficulty, sheds light on the inner workings of deep pretrained models. More broadly, it suggests a new path forward for NLP research, in which benchmarks co-evolve with the evolving state-of-the-art in an adversarial way, so as to present ever-harder challenges." - }, - "piqa": { - "pwc_id": "piqa", - "dataset_name": "PIQA Dataset", - "dataset_abstract": "PIQA is a dataset for commonsense reasoning, and was created to investigate the physical knowledge of existing models in NLP.", - "paper_name": "PIQA: Reasoning about Physical Commonsense in Natural Language", - "paper_abstract": "To apply eyeshadow without a brush, should I use a cotton swab or a toothpick? Questions requiring this kind of physical commonsense pose a challenge to today's natural language understanding systems. While recent pretrained models (such as BERT) have made progress on question answering over more abstract domains - such as news articles and encyclopedia entries, where text is plentiful - in more physical domains, text is inherently limited due to reporting bias. Can AI systems learn to reliably answer physical common-sense questions without experiencing the physical world? 
In this paper, we introduce the task of physical commonsense reasoning and a corresponding benchmark dataset Physical Interaction: Question Answering or PIQA. Though humans find the dataset easy (95% accuracy), large pretrained models struggle (77%). We provide analysis about the dimensions of knowledge that existing models lack, which offers significant opportunities for future research." - }, - "samsum": { - "pwc_id": "samsum-corpus", - "dataset_name": "SAMSum Corpus Dataset", - "dataset_abstract": "A new dataset with abstractive dialogue summaries.", - "paper_name": "SAMSum Corpus: A Human-annotated Dialogue Dataset for Abstractive Summarization", - "paper_abstract": "This paper introduces the SAMSum Corpus, a new dataset with abstractive dialogue summaries. We investigate the challenges it poses for automated summarization by testing several models and comparing their results with those obtained on a corpus of news articles. We show that model-generated summaries of dialogues achieve higher ROUGE scores than the model-generated summaries of news -- in contrast with human evaluators' judgement. This suggests that a challenging task of abstractive dialogue summarization requires dedicated models and non-standard quality measures. To our knowledge, our study is the first attempt to introduce a high-quality chat-dialogues corpus, manually annotated with abstractive summarizations, which can be used by the research community for further studies." - }, - "scan": { - "pwc_id": "scan", - "dataset_name": "SCAN Dataset", - "dataset_abstract": "SCAN is a dataset for grounded navigation which consists of a set of simple compositional navigation commands paired with the corresponding action sequences.", - "paper_name": "Generalization without systematicity: On the compositional skills of sequence-to-sequence recurrent networks", - "paper_abstract": "Humans can understand and produce new utterances effortlessly, thanks to\ntheir compositional skills. Once a person learns the meaning of a new verb\n\"dax,\" he or she can immediately understand the meaning of \"dax twice\" or \"sing\nand dax.\" In this paper, we introduce the SCAN domain, consisting of a set of\nsimple compositional navigation commands paired with the corresponding action\nsequences. We then test the zero-shot generalization capabilities of a variety\nof recurrent neural networks (RNNs) trained on SCAN with sequence-to-sequence\nmethods. We find that RNNs can make successful zero-shot generalizations when\nthe differences between training and test commands are small, so that they can\napply \"mix-and-match\" strategies to solve the task. However, when\ngeneralization requires systematic compositional skills (as in the \"dax\"\nexample above), RNNs fail spectacularly. We conclude with a proof-of-concept\nexperiment in neural machine translation, suggesting that lack of systematicity\nmight be partially responsible for neural networks' notorious training data\nthirst." - }, - "trivia_qa": { - "pwc_id": "triviaqa", - "dataset_name": "TriviaQA Dataset", - "dataset_abstract": "TriviaQA is a realistic text-based question answering dataset which includes 950K question-answer pairs from 662K documents collected from Wikipedia and the web. This dataset is more challenging than standard QA benchmark datasets such as Stanford Question Answering Dataset (SQuAD), as the answers for a question may not be directly obtained by span prediction and the context is very long. 
TriviaQA dataset consists of both human-verified and machine-generated QA subsets.", - "paper_name": "TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension", - "paper_abstract": "We present TriviaQA, a challenging reading comprehension dataset containing\nover 650K question-answer-evidence triples. TriviaQA includes 95K\nquestion-answer pairs authored by trivia enthusiasts and independently gathered\nevidence documents, six per question on average, that provide high quality\ndistant supervision for answering the questions. We show that, in comparison to\nother recently introduced large-scale datasets, TriviaQA (1) has relatively\ncomplex, compositional questions, (2) has considerable syntactic and lexical\nvariability between questions and corresponding answer-evidence sentences, and\n(3) requires more cross sentence reasoning to find answers. We also present two\nbaseline algorithms: a feature-based classifier and a state-of-the-art neural\nnetwork, that performs well on SQuAD reading comprehension. Neither approach\ncomes close to human performance (23% and 40% vs. 80%), suggesting that\nTriviaQA is a challenging testbed that is worth significant future study. Data\nand code available at -- http://nlp.cs.washington.edu/triviaqa/" - }, - "wiki_bio": { - "pwc_id": "wikibio", - "dataset_name": "WikiBio Dataset", - "dataset_abstract": "WikiBio is a dataset for generating text from structured data in the biography domain. It contains over 700k biographies from English Wikipedia, where each example pairs the infobox of a biography article with the first sentence of that article. With a vocabulary of roughly 400k words, it is far larger and more diverse than earlier concept-to-text resources such as Weathergov or Robocup.", - "paper_name": "Neural Text Generation from Structured Data with Application to the Biography Domain", - "paper_abstract": "This paper introduces a neural model for concept-to-text generation that\nscales to large, rich domains. We experiment with a new dataset of biographies\nfrom Wikipedia that is an order of magnitude larger than existing resources\nwith over 700k samples. The dataset is also vastly more diverse with a 400k\nvocabulary, compared to a few hundred words for Weathergov or Robocup. Our\nmodel builds upon recent work on conditional neural language model for text\ngeneration. To deal with the large vocabulary, we extend these models to mix a\nfixed vocabulary with copy actions that transfer sample-specific words from the\ninput database to the generated output sentence. Our neural model significantly\nout-performs a classical Kneser-Ney language model adapted to this task by\nnearly 15 BLEU." - }, - "cos_e": { - "pwc_id": "cos-e", - "dataset_name": "CoS-E Dataset", - "dataset_abstract": "CoS-E consists of human explanations for commonsense reasoning in the form of natural language sequences and highlighted annotations", - "paper_name": "Explain Yourself! Leveraging Language Models for Commonsense Reasoning", - "paper_abstract": "Deep learning models perform poorly on tasks that require commonsense reasoning, which often necessitates some form of world-knowledge or reasoning over information not immediately present in the input. We collect human explanations for commonsense reasoning in the form of natural language sequences and highlighted annotations in a new dataset called Common Sense Explanations (CoS-E).
We use CoS-E to train language models to automatically generate explanations that can be used during training and inference in a novel Commonsense Auto-Generated Explanation (CAGE) framework. CAGE improves the state-of-the-art by 10% on the challenging CommonsenseQA task. We further study commonsense reasoning in DNNs using both human and auto-generated explanations including transfer to out-of-domain tasks. Empirical results indicate that we can effectively leverage language models for commonsense reasoning." - }, - "universal_dependencies": { - "pwc_id": "universal-dependencies", - "dataset_name": "Universal Dependencies Dataset", - "dataset_abstract": "The Universal Dependencies (UD) project seeks to develop cross-linguistically consistent treebank annotation of morphology and syntax for multiple languages. The first version of the dataset was released in 2015 and consisted of 10 treebanks over 10 languages. Version 2.7 released in 2020 consists of 183 treebanks over 104 languages. The annotation consists of UPOS (universal part-of-speech tags), XPOS (language-specific part-of-speech tags), Feats (universal morphological features), Lemmas, dependency heads and universal dependency labels.", - "paper_name": "Universal Dependencies v1: A Multilingual Treebank Collection", - "paper_abstract": "Cross-linguistically consistent annotation is necessary for sound comparative evaluation and cross-lingual learning experiments. It is also useful for multilingual system development and comparative linguistic studies. Universal Dependencies is an open community effort to create cross-linguistically consistent treebank annotation for many languages within a dependency-based lexicalist framework. In this paper, we describe v1 of the universal guidelines, the underlying design principles, and the currently available treebanks for 33 languages." - }, - "quail": { - "pwc_id": "quail", - "dataset_name": "QuAIL Dataset", - "dataset_abstract": "A new kind of question-answering dataset that combines commonsense, text-based, and unanswerable questions, balanced for different genres and reasoning types. Reasoning type annotation for 9 types of reasoning: temporal, causality, factoid, coreference, character properties, their belief states, subsequent entity states, event durations, and unanswerable. Genres: CC license fiction, Voice of America news, blogs, user stories from Quora 800 texts, 18 questions for each (~14K questions).", - "paper_name": "", - "paper_abstract": "" - }, - "swag": { - "pwc_id": "swag", - "dataset_name": "SWAG Dataset", - "dataset_abstract": "Given a partial description like \"she opened the hood of the car,\" humans can reason about the situation and anticipate what might come next (\"then, she examined the engine\"). SWAG (Situations With Adversarial Generations) is a large-scale dataset for this task of grounded commonsense inference, unifying natural language inference and physically grounded reasoning.\n\nThe dataset consists of 113k multiple choice questions about grounded situations. Each question is a video caption from LSMDC or ActivityNet Captions, with four answer choices about what might happen next in the scene. The correct answer is the (real) video caption for the next event in the video; the three incorrect answers are adversarially generated and human verified, so as to fool machines but not humans. 
The authors aim for SWAG to be a benchmark for evaluating grounded commonsense NLI and for learning representations.", - "paper_name": "SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference", - "paper_abstract": "Given a partial description like \"she opened the hood of the car,\" humans can\nreason about the situation and anticipate what might come next (\"then, she\nexamined the engine\"). In this paper, we introduce the task of grounded\ncommonsense inference, unifying natural language inference and commonsense\nreasoning.\n We present SWAG, a new dataset with 113k multiple choice questions about a\nrich spectrum of grounded situations. To address the recurring challenges of\nthe annotation artifacts and human biases found in many existing datasets, we\npropose Adversarial Filtering (AF), a novel procedure that constructs a\nde-biased dataset by iteratively training an ensemble of stylistic classifiers,\nand using them to filter the data. To account for the aggressive adversarial\nfiltering, we use state-of-the-art language models to massively oversample a\ndiverse set of potential counterfactuals. Empirical results demonstrate that\nwhile humans can solve the resulting inference problems with high accuracy\n(88%), various competitive models struggle on our task. We provide\ncomprehensive analysis that indicates significant opportunities for future\nresearch." - }, - "common_gen": { - "pwc_id": "commongen", - "dataset_name": "CommonGen Dataset", - "dataset_abstract": "CommonGen is constructed through a combination of crowdsourced and existing caption corpora, and consists of 79k commonsense descriptions over 35k unique concept-sets.", - "paper_name": "CommonGen: A Constrained Text Generation Challenge for Generative Commonsense Reasoning", - "paper_abstract": "Recently, large-scale pre-trained language models have demonstrated impressive performance on several commonsense-reasoning benchmark datasets. However, building machines with commonsense to compose realistically plausible sentences remains challenging. In this paper, we present a constrained text generation task, CommonGen associated with a benchmark dataset, to explicitly test machines for the ability of generative commonsense reasoning. Given a set of common concepts (e.g., {dog, frisbee, catch, throw}); the task is to generate a coherent sentence describing an everyday scenario using these concepts (e.g., \"a man throws a frisbee and his dog catches it\"). The CommonGen task is challenging because it inherently requires 1) relational reasoning with background commonsense knowledge, and 2) compositional generalization ability to work on unseen concept combinations. Our dataset, constructed through a combination of crowdsourced and existing caption corpora, consists of 79k commonsense descriptions over 35k unique concept-sets. Experiments show that there is a large gap between state-of-the-art text generation models (e.g., T5) and human performance. Furthermore, we demonstrate that the learned generative commonsense reasoning capability can be transferred to improve downstream tasks such as CommonsenseQA by generating additional context." - }, - "hate_speech18": { - "pwc_id": "hate-speech", - "dataset_name": "Hate Speech Dataset", - "dataset_abstract": "Dataset of hate speech annotated on Internet forum posts in English at sentence-level. The source forum is Stormfront, a large online community of white nationalists.
A total of 10,568 sentences have been extracted from Stormfront and classified as conveying hate speech or not.", - "paper_name": "Hate Speech Dataset from a White Supremacy Forum", - "paper_abstract": "Hate speech is commonly defined as any communication that disparages a target\ngroup of people based on some characteristic such as race, colour, ethnicity,\ngender, sexual orientation, nationality, religion, or other characteristic. Due\nto the massive rise of user-generated web content on social media, the amount\nof hate speech is also steadily increasing. Over the past years, interest in\nonline hate speech detection and, particularly, the automation of this task has\ncontinuously grown, along with the societal impact of the phenomenon. This\npaper describes a hate speech dataset composed of thousands of sentences\nmanually labelled as containing hate speech or not. The sentences have been\nextracted from Stormfront, a white supremacist forum. A custom annotation tool\nhas been developed to carry out the manual labelling task which, among other\nthings, allows the annotators to choose whether to read the context of a\nsentence before labelling it. The paper also provides a thoughtful qualitative\nand quantitative study of the resulting dataset and several baseline\nexperiments with different classification models. The dataset is publicly\navailable." - }, - "paws-x": { - "pwc_id": "paws-x", - "dataset_name": "PAWS-X Dataset", - "dataset_abstract": "PAWS-X contains 23,659 human translated PAWS evaluation pairs and 296,406 machine translated training pairs in six typologically distinct languages: French, Spanish, German, Chinese, Japanese, and Korean. All translated pairs are sourced from examples in PAWS-Wiki.", - "paper_name": "PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification", - "paper_abstract": "Most existing work on adversarial data generation focuses on English. For example, PAWS (Paraphrase Adversaries from Word Scrambling) consists of challenging English paraphrase identification pairs from Wikipedia and Quora. We remedy this gap with PAWS-X, a new dataset of 23,659 human translated PAWS evaluation pairs in six typologically distinct languages: French, Spanish, German, Chinese, Japanese, and Korean. We provide baseline numbers for three models with different capacity to capture non-local context and sentence structure, and using different multilingual training and evaluation regimes. Multilingual BERT fine-tuned on PAWS English plus machine-translated data performs the best, with a range of 83.1-90.8 accuracy across the non-English languages and an average accuracy gain of 23% over the next best model. PAWS-X shows the effectiveness of deep, multilingual pre-training while also leaving considerable headroom as a new challenge to drive multilingual research that better captures structure and contextual information." - }, - "wiki_qa": { - "pwc_id": "wikiqa", - "dataset_name": "WikiQA Dataset", - "dataset_abstract": "The WikiQA corpus is a publicly available set of question and sentence pairs, collected and annotated for research on open-domain question answering. In order to reflect the true information need of general users, Bing query logs were used as the question source. Each question is linked to a Wikipedia page that potentially has the answer. Because the summary section of a Wikipedia page provides the basic and usually most important information about the topic, sentences in this section were used as the candidate answers.
The corpus includes 3,047 questions and 29,258 sentences, where 1,473 sentences were labeled as answer sentences to their corresponding questions.", - "paper_name": "", - "paper_abstract": "" - }, - "xquad": { - "pwc_id": "xquad", - "dataset_name": "XQuAD Dataset", - "dataset_abstract": "XQuAD (Cross-lingual Question Answering Dataset) is a benchmark dataset for evaluating cross-lingual question answering performance. The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their professional translations into ten languages: Spanish, German, Greek, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, and Hindi. Consequently, the dataset is entirely parallel across 11 languages.", - "paper_name": "On the Cross-lingual Transferability of Monolingual Representations", - "paper_abstract": "State-of-the-art unsupervised multilingual models (e.g., multilingual BERT) have been shown to generalize in a zero-shot cross-lingual setting. This generalization ability has been attributed to the use of a shared subword vocabulary and joint training across multiple languages giving rise to deep multilingual abstractions. We evaluate this hypothesis by designing an alternative approach that transfers a monolingual model to new languages at the lexical level. More concretely, we first train a transformer-based masked language model on one language, and transfer it to a new language by learning a new embedding matrix with the same masked language modeling objective, freezing parameters of all other layers. This approach does not rely on a shared vocabulary or joint training. However, we show that it is competitive with multilingual BERT on standard cross-lingual classification benchmarks and on a new Cross-lingual Question Answering Dataset (XQuAD). Our results contradict common beliefs of the basis of the generalization ability of multilingual models and suggest that deep monolingual models learn some abstractions that generalize across languages. We also release XQuAD as a more comprehensive cross-lingual benchmark, which comprises 240 paragraphs and 1190 question-answer pairs from SQuAD v1.1 translated into ten languages by professional translators." - }, - "hans": { - "pwc_id": "hans", - "dataset_name": "HANS Dataset", - "dataset_abstract": "The HANS (Heuristic Analysis for NLI Systems) dataset contains many examples where the syntactic heuristics commonly learned by NLI models fail.", - "paper_name": "", - "paper_abstract": "" - }, - "dbpedia_14": { - "pwc_id": "dbpedia", - "dataset_name": "DBpedia Dataset", - "dataset_abstract": "DBpedia (from \"DB\" for \"database\") is a project aiming to extract structured content from the information created in the Wikipedia project. DBpedia allows users to semantically query relationships and properties of Wikipedia resources, including links to other related datasets.", - "paper_name": "", - "paper_abstract": "" - }, - "ropes": { - "pwc_id": "ropes", - "dataset_name": "ROPES Dataset", - "dataset_abstract": "ROPES is a QA dataset which tests a system's ability to apply knowledge from a passage of text to a new situation.
A system is presented a background passage containing a causal or qualitative relation(s), a novel situation that uses this background, and questions that require reasoning about effects of the relationships in the background passage in the context of the situation.", - "paper_name": "Reasoning Over Paragraph Effects in Situations", - "paper_abstract": "A key component of successfully reading a passage of text is the ability to apply knowledge gained from the passage to a new situation. In order to facilitate progress on this kind of reading, we present ROPES, a challenging benchmark for reading comprehension targeting Reasoning Over Paragraph Effects in Situations. We target expository language describing causes and effects (e.g., \"animal pollinators increase efficiency of fertilization in flowers\"), as they have clear implications for new situations. A system is presented a background passage containing at least one of these relations, a novel situation that uses this background, and questions that require reasoning about effects of the relationships in the background passage in the context of the situation. We collect background passages from science textbooks and Wikipedia that contain such phenomena, and ask crowd workers to author situations, questions, and answers, resulting in a 14,322 question dataset. We analyze the challenges of this task and evaluate the performance of state-of-the-art reading comprehension models. The best model performs only slightly better than randomly guessing an answer of the correct type, at 61.6% F1, well below the human performance of 89.0%." - }, - "go_emotions": { - "pwc_id": "goemotions", - "dataset_name": "GoEmotions Dataset", - "dataset_abstract": "GoEmotions is a corpus of 58k carefully curated comments extracted from Reddit, with human annotations to 27 emotion categories or Neutral.\n\n\nNumber of examples: 58,009.\nNumber of labels: 27 + Neutral.\nMaximum sequence length in training and evaluation datasets: 30.\n\nOn top of the raw data, the dataset also includes a version filtered based on rater-agreement, which contains a train/test/validation split:\n\n\nSize of training dataset: 43,410.\nSize of test dataset: 5,427.\nSize of validation dataset: 5,426.\n\nThe emotion categories are: admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire, disappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness, optimism, pride, realization, relief, remorse, sadness, surprise.", - "paper_name": "GoEmotions: A Dataset of Fine-Grained Emotions", - "paper_abstract": "Understanding emotion expressed in language has a wide range of applications, from building empathetic chatbots to detecting harmful online behavior. Advancement in this area can be improved using large-scale datasets with a fine-grained typology, adaptable to multiple downstream tasks. We introduce GoEmotions, the largest manually annotated dataset of 58k English Reddit comments, labeled for 27 emotion categories or Neutral. We demonstrate the high quality of the annotations via Principal Preserved Component Analysis. We conduct transfer learning experiments with existing emotion benchmarks to show that our dataset generalizes well to other domains and different emotion taxonomies. Our BERT-based model achieves an average F1-score of .46 across our proposed taxonomy, leaving much room for improvement."
- }, - "commonsense_qa": { - "pwc_id": "commonsenseqa", - "dataset_name": "CommonsenseQA Dataset", - "dataset_abstract": "The CommonsenseQA is a dataset for commonsense question answering task. The dataset consists of 12,247 questions with 5 choices each.\nThe dataset was generated by Amazon Mechanical Turk workers in the following process (an example is provided in parentheses):\n\n\na crowd worker observes a source concept from ConceptNet (\u201cRiver\u201d) and three target concepts (\u201cWaterfall\u201d, \u201cBridge\u201d, \u201cValley\u201d) that are all related by the same ConceptNet relation (\u201cAtLocation\u201d),\nthe worker authors three questions, one per target concept, such that only that particular target concept is the answer, while the other two distractor concepts are not, (\u201cWhere on a river can you hold a cup upright to catch water on a sunny day?\u201d, \u201cWhere can I stand on a river to see water falling without getting wet?\u201d, \u201cI\u2019m crossing the river, my feet are wet but my body is dry, where am I?\u201d)\nfor each question, another worker chooses one additional distractor from Concept Net (\u201cpebble\u201d, \u201cstream\u201d, \u201cbank\u201d), and the author another distractor (\u201cmountain\u201d, \u201cbottom\u201d, \u201cisland\u201d) manually.", - "paper_name": "CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge", - "paper_abstract": "When answering a question, people often draw upon their rich world knowledge\nin addition to the particular context. Recent work has focused primarily on\nanswering questions given some relevant document or context, and required very\nlittle general background. To investigate question answering with prior\nknowledge, we present CommonsenseQA: a challenging new dataset for commonsense\nquestion answering. To capture common sense beyond associations, we extract\nfrom ConceptNet (Speer et al., 2017) multiple target concepts that have the\nsame semantic relation to a single source concept. Crowd-workers are asked to\nauthor multiple-choice questions that mention the source concept and\ndiscriminate in turn between each of the target concepts. This encourages\nworkers to create questions with complex semantics that often require prior\nknowledge. We create 12,247 questions through this procedure and demonstrate\nthe difficulty of our task with a large number of strong baselines. Our best\nbaseline is based on BERT-large (Devlin et al., 2018) and obtains 56% accuracy,\nwell below human performance, which is 89%." - }, - "xcopa": { - "pwc_id": "xcopa", - "dataset_name": "XCOPA Dataset", - "dataset_abstract": "The Cross-lingual Choice of Plausible Alternatives (XCOPA) dataset is a benchmark to evaluate the ability of machine learning models to transfer commonsense reasoning across languages. The dataset is the translation and reannotation of the English COPA (Roemmele et al. 2011) and covers 11 languages from 11 families and several areas around the globe. The dataset is challenging as it requires both the command of world knowledge and the ability to generalise to new languages.", - "paper_name": "XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning", - "paper_abstract": "In order to simulate human language capacity, natural language processing systems must be able to reason about the dynamics of everyday situations, including their possible causes and effects. 
Moreover, they should be able to generalise the acquired world knowledge to new languages, modulo cultural differences. Advances in machine reasoning and cross-lingual transfer depend on the availability of challenging evaluation benchmarks. Motivated by both demands, we introduce Cross-lingual Choice of Plausible Alternatives (XCOPA), a typologically diverse multilingual dataset for causal commonsense reasoning in 11 languages, which includes resource-poor languages like Eastern Apur\\'imac Quechua and Haitian Creole. We evaluate a range of state-of-the-art models on this novel dataset, revealing that the performance of current methods based on multilingual pretraining and zero-shot fine-tuning falls short compared to translation-based transfer. Finally, we propose strategies to adapt multilingual models to out-of-sample resource-lean languages where only a small corpus or a bilingual dictionary is available, and report substantial improvements over the random baseline. The XCOPA dataset is freely available at github.com/cambridgeltl/xcopa." - }, - "dream": { - "pwc_id": "dream", - "dataset_name": "DREAM Dataset", - "dataset_abstract": "DREAM is a multiple-choice Dialogue-based REAding comprehension exaMination dataset. In contrast to existing reading comprehension datasets, DREAM is the first to focus on in-depth multi-turn multi-party dialogue understanding.\n\nDREAM contains 10,197 multiple choice questions for 6,444 dialogues, collected from English-as-a-foreign-language examinations designed by human experts. DREAM is likely to present significant challenges for existing reading comprehension systems: 84% of answers are non-extractive, 85% of questions require reasoning beyond a single sentence, and 34% of questions also involve commonsense knowledge.", - "paper_name": "DREAM: A Challenge Data Set and Models for Dialogue-Based Reading Comprehension", - "paper_abstract": "We present DREAM, the first dialogue-based multiple-choice reading comprehension data set. Collected from English as a Foreign Language examinations designed by human experts to evaluate the comprehension level of Chinese learners of English, our data set contains 10,197 multiple-choice questions for 6,444 dialogues. In contrast to existing reading comprehension data sets, DREAM is the first to focus on in-depth multi-turn multi-party dialogue understanding. DREAM is likely to present significant challenges for existing reading comprehension systems: 84{\\%} of answers are non-extractive, 85{\\%} of questions require reasoning beyond a single sentence, and 34{\\%} of questions also involve commonsense knowledge. We apply several popular neural reading comprehension models that primarily exploit surface information within the text and find them to, at best, just barely outperform a rule-based approach. We next investigate the effects of incorporating dialogue structure and different kinds of general world knowledge into both rule-based and (neural and non-neural) machine learning-based reading comprehension models. Experimental results on the DREAM data set show the effectiveness of dialogue structure and general world knowledge. DREAM is available at https://dataset.org/dream/." - }, - "quartz": { - "pwc_id": "quartz", - "dataset_name": "QuaRTz Dataset", - "dataset_abstract": "QuaRTz is a crowdsourced dataset of 3864 multiple-choice questions about open domain qualitative relationships. 
Each question is paired with one of 405 different background sentences (sometimes short paragraphs).\n\nThe QuaRTz dataset V1 contains 3864 questions about open domain qualitative relationships. Each question is paired with one of 405 different background sentences (sometimes short paragraphs).\n\nThe dataset is split into train (2696), dev (384) and test (784). A background sentence will only appear in a single split.\n\nEach line in a dataset file is a question specified as a json object, e.g., (with extra whitespace for readability).", - "paper_name": "QuaRTz: An Open-Domain Dataset of Qualitative Relationship Questions", - "paper_abstract": "We introduce the first open-domain dataset, called QuaRTz, for reasoning about textual qualitative relationships. QuaRTz contains general qualitative statements, e.g., \"A sunscreen with a higher SPF protects the skin longer.\", twinned with 3864 crowdsourced situated questions, e.g., \"Billy is wearing sunscreen with a lower SPF than Lucy. Who will be best protected from the sun?\", plus annotations of the properties being compared. Unlike previous datasets, the general knowledge is textual and not tied to a fixed set of relationships, and tests a system's ability to comprehend and apply textual qualitative knowledge in a novel setting. We find state-of-the-art results are substantially (20%) below human performance, presenting an open challenge to the NLP community." - }, - "bookcorpus": { - "pwc_id": "bookcorpus", - "dataset_name": "BookCorpus Dataset", - "dataset_abstract": "BookCorpus is a large collection of free novel books written by unpublished authors, which contains 11,038 books (around 74M sentences and 1G words) of 16 different sub-genres (e.g., Romance, Historical, Adventure, etc.).", - "paper_name": "Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books", - "paper_abstract": "Books are a rich source of both fine-grained information, how a character, an\nobject or a scene looks like, as well as high-level semantics, what someone is\nthinking, feeling and how these states evolve through a story. This paper aims\nto align books to their movie releases in order to provide rich descriptive\nexplanations for visual content that go semantically far beyond the captions\navailable in current datasets. To align movies and books we exploit a neural\nsentence embedding that is trained in an unsupervised way from a large corpus\nof books, as well as a video-text neural embedding for computing similarities\nbetween movie clips and sentences in the book. We propose a context-aware CNN\nto combine information from multiple sources. We demonstrate good quantitative\nperformance for movie/book alignment and show several qualitative examples that\nshowcase the diversity of tasks our model can be used for." - }, - "openbookqa": { - "pwc_id": "openbookqa", - "dataset_name": "OpenBookQA Dataset", - "dataset_abstract": "OpenBookQA is a new kind of question-answering dataset modeled after open book exams for assessing human understanding of a subject. It consists of 5,957 multiple-choice elementary-level science questions (4,957 train, 500 dev, 500 test), which probe the understanding of a small \u201cbook\u201d of 1,326 core science facts and the application of these facts to novel situations. For training, the dataset includes a mapping from each question to the core science fact it was designed to probe. Answering OpenBookQA questions requires additional broad common knowledge, not contained in the book. 
The questions, by design, are answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm.\nAdditionally, the dataset includes a collection of 5,167 crowd-sourced common knowledge facts, and an expanded version of the train/dev/test questions where each question is associated with its originating core fact, a human accuracy score, a clarity score, and an anonymized crowd-worker ID.", - "paper_name": "Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering", - "paper_abstract": "We present a new kind of question answering dataset, OpenBookQA, modeled\nafter open book exams for assessing human understanding of a subject. The open\nbook that comes with our questions is a set of 1329 elementary level science\nfacts. Roughly 6000 questions probe an understanding of these facts and their\napplication to novel situations. This requires combining an open book fact\n(e.g., metals conduct electricity) with broad common knowledge (e.g., a suit of\narmor is made of metal) obtained from other sources. While existing QA datasets\nover documents or knowledge bases, being generally self-contained, focus on\nlinguistic understanding, OpenBookQA probes a deeper understanding of both the\ntopic---in the context of common knowledge---and the language it is expressed\nin. Human performance on OpenBookQA is close to 92%, but many state-of-the-art\npre-trained QA methods perform surprisingly poorly, worse than several simple\nneural baselines we develop. Our oracle experiments designed to circumvent the\nknowledge retrieval bottleneck demonstrate the value of both the open book and\nadditional facts. We leave it as a challenge to solve the retrieval problem in\nthis multi-hop setting and to close the large gap to human performance." - }, - "qasc": { - "pwc_id": "qasc", - "dataset_name": "QASC Dataset", - "dataset_abstract": "QASC is a question-answering dataset with a focus on sentence composition. It consists of 9,980 8-way multiple-choice questions about grade school science (8,134 train, 926 dev, 920 test), and comes with a corpus of 17M sentences.", - "paper_name": "QASC: A Dataset for Question Answering via Sentence Composition", - "paper_abstract": "Composing knowledge from multiple pieces of texts is a key challenge in multi-hop question answering. We present a multi-hop reasoning dataset, Question Answering via Sentence Composition(QASC), that requires retrieving facts from a large corpus and composing them to answer a multiple-choice question. QASC is the first dataset to offer two desirable properties: (a) the facts to be composed are annotated in a large corpus, and (b) the decomposition into these facts is not evident from the question itself. The latter makes retrieval challenging as the system must introduce new concepts or relations in order to discover potential decompositions. Further, the reasoning model must then learn to identify valid compositions of these retrieved facts using common-sense reasoning. To help address these challenges, we provide annotation for supporting facts as well as their composition. Guided by these annotations, we present a two-step approach to mitigate the retrieval challenges. We use other multiple-choice datasets as additional training data to strengthen the reasoning model. Our proposed approach improves over current state-of-the-art language models by 11% (absolute). The reasoning and retrieval problems, however, remain unsolved as this model still lags by 20% behind human performance." 
- }, - "social_i_qa": { - "pwc_id": "social-iqa", - "dataset_name": "Social IQA Dataset", - "dataset_abstract": "Social Interaction QA, a new question-answering benchmark for testing social commonsense intelligence. Contrary to many prior benchmarks that focus on physical or taxonomic knowledge, Social IQa focuses on reasoning about people\u2019s actions and their social implications. For example, given an action like \"Jesse saw a concert\" and a question like \"Why did Jesse do this?\", humans can easily infer that Jesse wanted \"to see their favorite performer\" or \"to enjoy the music\", and not \"to see what's happening inside\" or \"to see if it works\". The actions in Social IQa span a wide variety of social situations, and answer candidates contain both human-curated answers and adversarially-filtered machine-generated candidates. Social IQa contains over 37,000 QA pairs for evaluating models\u2019 abilities to reason about the social implications of everyday events and situations.", - "paper_name": "Social IQa: Commonsense Reasoning about Social Interactions", - "paper_abstract": "We introduce Social IQa, the first large-scale benchmark for commonsense reasoning about social situations. Social IQa contains 38,000 multiple choice questions for probing emotional and social intelligence in a variety of everyday situations (e.g., Q: {``}Jordan wanted to tell Tracy a secret, so Jordan leaned towards Tracy. Why did Jordan do this?{''} A: {``}Make sure no one else could hear{''}). Through crowdsourcing, we collect commonsense questions along with correct and incorrect answers about social interactions, using a new framework that mitigates stylistic artifacts in incorrect answers by asking workers to provide the right answer to a different but related question. Empirical results show that our benchmark is challenging for existing question-answering models based on pretrained language models, compared to human performance ({\\textgreater}20{\\%} gap). Notably, we further establish Social IQa as a resource for transfer learning of commonsense knowledge, achieving state-of-the-art performance on multiple commonsense reasoning tasks (Winograd Schemas, COPA)." - }, - "multi_news": { - "pwc_id": "multi-news", - "dataset_name": "Multi-News Dataset", - "dataset_abstract": "Multi-News, consists of news articles and human-written summaries of these articles from the site newser.com. Each summary is professionally written by editors and includes links to the original articles cited.", - "paper_name": "Multi-News: a Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model", - "paper_abstract": "Automatic generation of summaries from multiple news articles is a valuable tool as the number of online publications grows rapidly. Single document summarization (SDS) systems have benefited from advances in neural encoder-decoder model thanks to the availability of large datasets. However, multi-document summarization (MDS) of news articles has been limited to datasets of a couple of hundred examples. In this paper, we introduce Multi-News, the first large-scale MDS news dataset. Additionally, we propose an end-to-end model which incorporates a traditional extractive summarization model with a standard SDS model and achieves competitive results on MDS datasets. We benchmark several methods on Multi-News and release our data and code in hope that this work will promote advances in summarization in the multi-document setting." 
- }, - "wiki_hop": { - "pwc_id": "wikihop", - "dataset_name": "WikiHop Dataset", - "dataset_abstract": "WikiHop is a multi-hop question-answering dataset. The query of WikiHop is constructed with entities and relations from WikiData, while supporting documents are from WikiReading. A bipartite graph connecting entities and documents is first built and the answer for each query is located by traversal on this graph. Candidates that are type-consistent with the answer and share the same relation in query with the answer are included, resulting in a set of candidates. Thus, WikiHop is a multi-choice style reading comprehension data set. There are totally about 43K samples in training set, 5K samples in development set and 2.5K samples in test set. The test set is not provided. The task is to predict the correct answer given a query and multiple supporting documents.\n\nThe dataset includes a masked variant, where all candidates and their mentions in the supporting documents are replaced by random but consistent placeholder tokens.", - "paper_name": "Constructing Datasets for Multi-hop Reading Comprehension Across Documents", - "paper_abstract": "Most Reading Comprehension methods limit themselves to queries which can be\nanswered using a single sentence, paragraph, or document. Enabling models to\ncombine disjoint pieces of textual evidence would extend the scope of machine\ncomprehension methods, but currently there exist no resources to train and test\nthis capability. We propose a novel task to encourage the development of models\nfor text understanding across multiple documents and to investigate the limits\nof existing methods. In our task, a model learns to seek and combine evidence -\neffectively performing multi-hop (alias multi-step) inference. We devise a\nmethodology to produce datasets for this task, given a collection of\nquery-answer pairs and thematically linked documents. Two datasets from\ndifferent domains are induced, and we identify potential pitfalls and devise\ncircumvention strategies. We evaluate two previously proposed competitive\nmodels and find that one can integrate information across documents. However,\nboth models struggle to select relevant information, as providing documents\nguaranteed to be relevant greatly improves their performance. While the models\noutperform several strong baselines, their best accuracy reaches 42.9% compared\nto human performance at 74.0% - leaving ample room for improvement." - }, - "wiqa": { - "pwc_id": "wiqa", - "dataset_name": "WIQA Dataset", - "dataset_abstract": "The WIQA dataset V1 has 39705 questions containing a perturbation and a possible effect in the context of a paragraph. The dataset is split into 29808 train questions, 6894 dev questions and 3003 test questions.", - "paper_name": "WIQA: A dataset for \"What if...\" reasoning over procedural text", - "paper_abstract": "We introduce WIQA, the first large-scale dataset of \"What if...\" questions over procedural text. WIQA contains three parts: a collection of paragraphs each describing a process, e.g., beach erosion; a set of crowdsourced influence graphs for each paragraph, describing how one change affects another; and a large (40k) collection of \"What if...?\" multiple-choice questions derived from the graphs. For example, given a paragraph about beach erosion, would stormy weather result in more or less erosion (or have no effect)? The task is to answer the questions, given their associated paragraph. 
WIQA contains three kinds of questions: perturbations to steps mentioned in the paragraph; external (out-of-paragraph) perturbations requiring commonsense knowledge; and irrelevant (no effect) perturbations. We find that state-of-the-art models achieve 73.8% accuracy, well below the human performance of 96.3%. We analyze the challenges, in particular tracking chains of influences, and present the dataset as an open challenge to the community." - }, - "xquad_r": { - "pwc_id": "xquad-r", - "dataset_name": "LAReQA Dataset", - "dataset_abstract": "A challenging new benchmark for language-agnostic answer retrieval from a multilingual candidate pool.", - "paper_name": "LAReQA: Language-agnostic answer retrieval from a multilingual pool", - "paper_abstract": "We present LAReQA, a challenging new benchmark for language-agnostic answer retrieval from a multilingual candidate pool. Unlike previous cross-lingual tasks, LAReQA tests for \"strong\" cross-lingual alignment, requiring semantically related cross-language pairs to be closer in representation space than unrelated same-language pairs. Building on multilingual BERT (mBERT), we study different strategies for achieving strong alignment. We find that augmenting training data via machine translation is effective, and improves significantly over using mBERT out-of-the-box. Interestingly, the embedding baseline that performs the best on LAReQA falls short of competing baselines on zero-shot variants of our task that only target \"weak\" alignment. This finding underscores our claim that languageagnostic retrieval is a substantively new kind of cross-lingual evaluation." - }, - "opus100": { - "pwc_id": "opus-100", - "dataset_name": "OPUS-100 Dataset", - "dataset_abstract": "A novel multilingual dataset with 100 languages.", - "paper_name": "Improving Massively Multilingual Neural Machine Translation and Zero-Shot Translation", - "paper_abstract": "Massively multilingual models for neural machine translation (NMT) are theoretically attractive, but often underperform bilingual models and deliver poor zero-shot translations. In this paper, we explore ways to improve them. We argue that multilingual NMT requires stronger modeling capacity to support language pairs with varying typological characteristics, and overcome this bottleneck via language-specific components and deepening NMT architectures. We identify the off-target translation issue (i.e. translating into a wrong target language) as the major source of the inferior zero-shot performance, and propose random online backtranslation to enforce the translation of unseen training language pairs. Experiments on OPUS-100 (a novel multilingual dataset with 100 languages) show that our approach substantially narrows the performance gap with bilingual models in both one-to-many and many-to-many settings, and improves zero-shot performance by ~10 BLEU, approaching conventional pivot-based methods." - }, - "tydiqa": { - "pwc_id": "tydi-qa", - "dataset_name": "TyDi QA Dataset", - "dataset_abstract": "TyDi QA is a question answering dataset covering 11 typologically diverse languages with 200K question-answer pairs. 
The languages of TyDi QA are diverse with regard to their typology \u2014 the set of linguistic features that each language expresses \u2014 such that the authors expect models performing well on this set to generalize across a large number of the languages in the world.", - "paper_name": "TyDi QA: A Benchmark for Information-Seeking Question Answering in Typologically Diverse Languages", - "paper_abstract": "Confidently making progress on multilingual modeling requires challenging, trustworthy evaluations. We present TyDi QA---a question answering dataset covering 11 typologically diverse languages with 204K question-answer pairs. The languages of TyDi QA are diverse with regard to their typology---the set of linguistic features each language expresses---such that we expect models performing well on this set to generalize across a large number of the world's languages. We present a quantitative analysis of the data quality and example-level qualitative linguistic analyses of observed language phenomena that would not be found in English-only corpora. To provide a realistic information-seeking task and avoid priming effects, questions are written by people who want to know the answer, but don't know the answer yet, and the data is collected directly in each language without the use of translation." - }, - "codah": { - "pwc_id": "codah", - "dataset_name": "CODAH Dataset", - "dataset_abstract": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. It contains 2801 questions in total, and uses 5-fold cross validation for evaluation.", - "paper_name": "CODAH: An Adversarially-Authored Question Answering Dataset for Common Sense", - "paper_abstract": "Commonsense reasoning is a critical AI capability, but it is difficult to construct challenging datasets that test common sense. Recent neural question answering systems, based on large pre-trained models of language, have already achieved near-human-level performance on commonsense knowledge benchmarks. These systems do not possess human-level common sense, but are able to exploit limitations of the datasets to achieve human-level scores. We introduce the CODAH dataset, an adversarially-constructed evaluation dataset for testing common sense. CODAH forms a challenging extension to the recently-proposed SWAG dataset, which tests commonsense knowledge using sentence-completion questions that describe situations observed in video. To produce a more difficult dataset, we introduce a novel procedure for question acquisition in which workers author questions designed to target weaknesses of state-of-the-art neural question answering systems. Workers are rewarded for submissions that models fail to answer correctly both before and after fine-tuning (in cross-validation). We create 2.8k questions via this procedure and evaluate the performance of multiple state-of-the-art question answering systems on our dataset. We observe a significant gap between human performance, which is 95.3{\\%}, and the performance of the best baseline accuracy of 65.3{\\%} by the OpenAI GPT model." 
- }, - "head_qa": { - "pwc_id": "headqa", - "dataset_name": "HeadQA Dataset", - "dataset_abstract": "HeadQA is a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans.", - "paper_name": "HEAD-QA: A Healthcare Dataset for Complex Reasoning", - "paper_abstract": "We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work." - }, - "subjqa": { - "pwc_id": "subjqa", - "dataset_name": "SubjQA Dataset", - "dataset_abstract": "SubjQA is a question answering dataset that focuses on subjective (as opposed to factual) questions and answers. The dataset consists of roughly 10,000 questions over reviews from 6 different domains: books, movies, grocery, electronics, TripAdvisor (i.e. hotels), and restaurants. Each question is paired with a review and a span is highlighted as the answer to the question (with some questions having no answer). Moreover, both questions and answer spans are assigned a subjectivity label by annotators. Questions such as \"How much does this product weigh?\" is a factual question (i.e., low subjectivity), while \"Is this easy to use?\" is a subjective question (i.e., high subjectivity).", - "paper_name": "SubjQA: A Dataset for Subjectivity and Review Comprehension", - "paper_abstract": "Subjectivity is the expression of internal opinions or beliefs which cannot be objectively observed or verified, and has been shown to be important for sentiment analysis and word-sense disambiguation. Furthermore, subjectivity is an important aspect of user-generated data. In spite of this, subjectivity has not been investigated in contexts where such data is widespread, such as in question answering (QA). We therefore investigate the relationship between subjectivity and QA, while developing a new dataset. We compare and contrast with analyses from previous work, and verify that findings regarding subjectivity still hold when using recently developed NLP architectures. We find that subjectivity is also an important feature in the case of QA, albeit with more intricate interactions between subjectivity and QA performance. For instance, a subjective question may or may not be associated with a subjective answer. We release an English QA dataset (SubjQA) based on customer reviews, containing subjectivity annotations for questions and answer spans across 6 distinct domains." - }, - "mc4": { - "pwc_id": "mc4", - "dataset_name": "mC4 Dataset", - "dataset_abstract": "mC4 is a multilingual variant of the C4 dataset called mC4. mC4 comprises natural text in 101 languages drawn from the public Common Crawl web scrape.", - "paper_name": "mT5: A massively multilingual pre-trained text-to-text transformer", - "paper_abstract": "The recent \"Text-to-Text Transfer Transformer\" (T5) leveraged a unified text-to-text format and scale to attain state-of-the-art results on a wide variety of English-language NLP tasks. 
In this paper, we introduce mT5, a multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual benchmarks. We also describe a simple technique to prevent \"accidental translation\" in the zero-shot setting, where a generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model checkpoints used in this work are publicly available." - }, - "web_questions": { - "pwc_id": "webquestions", - "dataset_name": "WebQuestions Dataset", - "dataset_abstract": "The WebQuestions dataset is a question answering dataset using Freebase as the knowledge base and contains 6,642 question-answer pairs. It was created by crawling questions through the Google Suggest API, and then obtaining answers using Amazon Mechanical Turk. The original split uses 3,778 examples for training and 2,032 for testing. All answers are defined as Freebase entities.\n\nExample questions (answers) in the dataset include \u201cWhere did Edgar Allan Poe died?\u201d (baltimore) or \u201cWhat degrees did Barack Obama get?\u201d (bachelor_of_arts, juris_doctor).", - "paper_name": "", - "paper_abstract": "" - }, - "pubmed_qa": { - "pwc_id": "pubmedqa", - "dataset_name": "PubMedQA Dataset", - "dataset_abstract": "The task of PubMedQA is to answer research questions with yes/no/maybe (e.g.: Do preoperative statins reduce atrial fibrillation after coronary artery bypass grafting?) using the corresponding abstracts.\n\nPubMedQA has 1k expert labeled, 61.2k unlabeled and 211.3k artificially generated QA instances.", - "paper_name": "PubMedQA: A Dataset for Biomedical Research Question Answering", - "paper_abstract": "We introduce PubMedQA, a novel biomedical question answering (QA) dataset collected from PubMed abstracts. The task of PubMedQA is to answer research questions with yes/no/maybe (e.g.: Do preoperative statins reduce atrial fibrillation after coronary artery bypass grafting?) using the corresponding abstracts. PubMedQA has 1k expert-annotated, 61.2k unlabeled and 211.3k artificially generated QA instances. Each PubMedQA instance is composed of (1) a question which is either an existing research article title or derived from one, (2) a context which is the corresponding abstract without its conclusion, (3) a long answer, which is the conclusion of the abstract and, presumably, answers the research question, and (4) a yes/no/maybe answer which summarizes the conclusion. PubMedQA is the first QA dataset where reasoning over biomedical research texts, especially their quantitative contents, is required to answer the questions. Our best performing model, multi-phase fine-tuning of BioBERT with long answer bag-of-word statistics as additional supervision, achieves 68.1% accuracy, compared to single human performance of 78.0% accuracy and majority-baseline of 55.2% accuracy, leaving much room for improvement. PubMedQA is publicly available at https://pubmedqa.github.io." - }, - "sciq": { - "pwc_id": "sciq", - "dataset_name": "SciQ Dataset", - "dataset_abstract": "The SciQ dataset contains 13,679 crowdsourced science exam questions about Physics, Chemistry and Biology, among others. The questions are in multiple-choice format with 4 answer options each. 
For the majority of the questions, an additional paragraph with supporting evidence for the correct answer is provided.", - "paper_name": "Crowdsourcing Multiple Choice Science Questions", - "paper_abstract": "We present a novel method for obtaining high-quality, domain-targeted\nmultiple choice questions from crowd workers. Generating these questions can be\ndifficult without trading away originality, relevance or diversity in the\nanswer options. Our method addresses these problems by leveraging a large\ncorpus of domain-specific text and a small set of existing questions. It\nproduces model suggestions for document selection and answer distractor choice\nwhich aid the human question generation process. With this method we have\nassembled SciQ, a dataset of 13.7K multiple choice science exam questions\n(Dataset available at http://allenai.org/data.html). We demonstrate that the\nmethod produces in-domain questions by providing an analysis of this new\ndataset and by showing that humans cannot distinguish the crowdsourced\nquestions from original questions. When using SciQ as additional training data\nto existing questions, we observe accuracy improvements on real science exams." - }, - "multi_nli": { - "pwc_id": "multinli", - "dataset_name": "MultiNLI Dataset", - "dataset_abstract": "The Multi-Genre Natural Language Inference (MultiNLI) dataset has 433K sentence pairs. Its size and mode of collection are modeled closely like SNLI. MultiNLI offers ten distinct genres (Face-to-face, Telephone, 9/11, Travel, Letters, Oxford University Press, Slate, Verbatim, Goverment and Fiction) of written and spoken English data. There are matched dev/test sets which are derived from the same sources as those in the training set, and mismatched sets which do not closely resemble any seen at training time.", - "paper_name": "A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference", - "paper_abstract": "This paper introduces the Multi-Genre Natural Language Inference (MultiNLI)\ncorpus, a dataset designed for use in the development and evaluation of machine\nlearning models for sentence understanding. In addition to being one of the\nlargest corpora available for the task of NLI, at 433k examples, this corpus\nimproves upon available resources in its coverage: it offers data from ten\ndistinct genres of written and spoken English--making it possible to evaluate\nsystems on nearly the full complexity of the language--and it offers an\nexplicit setting for the evaluation of cross-genre domain adaptation." - }, - "quarel": { - "pwc_id": "quarel", - "dataset_name": "QuaRel Dataset", - "dataset_abstract": "QuaRel is a crowdsourced dataset of 2771 multiple-choice story questions, including their logical forms.", - "paper_name": "QuaRel: A Dataset and Models for Answering Questions about Qualitative Relationships", - "paper_abstract": "Many natural language questions require recognizing and reasoning with\nqualitative relationships (e.g., in science, economics, and medicine), but are\nchallenging to answer with corpus-based methods. Qualitative modeling provides\ntools that support such reasoning, but the semantic parsing task of mapping\nquestions into those models has formidable challenges. We present QuaRel, a\ndataset of diverse story questions involving qualitative relationships that\ncharacterize these challenges, and techniques that begin to address them. The\ndataset has 2771 questions relating 19 different types of quantities. 
For\nexample, \"Jenny observes that the robot vacuum cleaner moves slower on the\nliving room carpet than on the bedroom carpet. Which carpet has more friction?\"\nWe contribute (1) a simple and flexible conceptual framework for representing\nthese kinds of questions; (2) the QuaRel dataset, including logical forms,\nexemplifying the parsing challenges; and (3) two novel models for this task,\nbuilt as extensions of type-constrained semantic parsing. The first of these\nmodels (called QuaSP+) significantly outperforms off-the-shelf tools on QuaRel.\nThe second (QuaSP+Zero) demonstrates zero-shot capability, i.e., the ability to\nhandle new qualitative relationships without requiring additional training\ndata, something not possible with previous models. This work thus makes inroads\ninto answering complex, qualitative questions that require reasoning, and\nscaling to new relationships at low cost. The dataset and models are available\nat http://data.allenai.org/quarel." - }, - "lama": { - "pwc_id": "lama", - "dataset_name": "LAMA Dataset", - "dataset_abstract": "LAnguage Model Analysis (LAMA) consists of a set of knowledge sources, each comprised of a set of facts. LAMA is a probe for analyzing the factual and commonsense knowledge contained in pretrained language models.", - "paper_name": "Language Models as Knowledge Bases?", - "paper_abstract": "Recent progress in pretraining language models on large textual corpora led to a surge of improvements for downstream NLP tasks. Whilst learning linguistic knowledge, these models may also be storing relational knowledge present in the training data, and may be able to answer queries structured as \"fill-in-the-blank\" cloze statements. Language models have many advantages over structured knowledge bases: they require no schema engineering, allow practitioners to query about an open class of relations, are easy to extend to more data, and require no human supervision to train. We present an in-depth analysis of the relational knowledge already present (without fine-tuning) in a wide range of state-of-the-art pretrained language models. We find that (i) without fine-tuning, BERT contains relational knowledge competitive with traditional NLP methods that have some access to oracle knowledge, (ii) BERT also does remarkably well on open-domain question answering against a supervised baseline, and (iii) certain types of factual knowledge are learned much more readily than others by standard language model pretraining approaches. The surprisingly strong ability of these models to recall factual knowledge without any fine-tuning demonstrates their potential as unsupervised open-domain QA systems. The code to reproduce our analysis is available at https://github.com/facebookresearch/LAMA." - }, - "babi_qa": { - "pwc_id": "babi-1", - "dataset_name": "bAbI Dataset", - "dataset_abstract": "The bAbI dataset is a textual QA benchmark composed of 20 different tasks. Each task is designed to test a different reasoning skill, such as deduction, induction, and coreference resolution. Some of the tasks need relational reasoning, for instance, to compare the size of different entities. Each sample is composed of a question, an answer, and a set of facts. There are two versions of the dataset, referring to different dataset sizes: bAbI-1k and bAbI-10k. 
The bAbI-10k version of the dataset consists of 10,000 training samples per task.", - "paper_name": "Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks", - "paper_abstract": "One long-term goal of machine learning research is to produce methods that\nare applicable to reasoning and natural language, in particular building an\nintelligent dialogue agent. To measure progress towards that goal, we argue for\nthe usefulness of a set of proxy tasks that evaluate reading comprehension via\nquestion answering. Our tasks measure understanding in several ways: whether a\nsystem is able to answer questions via chaining facts, simple induction,\ndeduction and many more. The tasks are designed to be prerequisites for any\nsystem that aims to be capable of conversing with a human. We believe many\nexisting learning systems can currently not solve them, and hence our aim is to\nclassify these tasks into skill sets, so that researchers can identify (and\nthen rectify) the failings of their systems. We also extend and improve the\nrecently introduced Memory Networks model, and show it is able to solve some,\nbut not all, of the tasks." - }, - "scitail": { - "pwc_id": "scitail", - "dataset_name": "SciTail Dataset", - "dataset_abstract": "The SciTail dataset is an entailment dataset created from multiple-choice science exams and web sentences. Each question and the correct answer choice are converted into an assertive statement to form the hypothesis. We use information retrieval to obtain relevant text from a large text corpus of web sentences, and use these sentences as a premise P. We crowdsource the annotation of such premise-hypothesis pair as supports (entails) or not (neutral), in order to create the SciTail dataset. The dataset contains 27,026 examples with 10,101 examples with entails label and 16,925 examples with neutral label.", - "paper_name": "", - "paper_abstract": "" - }, - "math_qa": { - "pwc_id": "mathqa", - "dataset_name": "MathQA Dataset", - "dataset_abstract": "MathQA significantly enhances the AQuA dataset with fully-specified operational programs.", - "paper_name": "MathQA: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms", - "paper_abstract": "We introduce a large-scale dataset of math word problems and an interpretable neural math problem solver that learns to map problems to operation programs. Due to annotation challenges, current datasets in this domain have been either relatively small in scale or did not offer precise operational annotations over diverse problem types. We introduce a new representation language to model precise operation programs corresponding to each math problem that aim to improve both the performance and the interpretability of the learned models. Using this representation language, our new dataset, MathQA, significantly enhances the AQuA dataset with fully-specified operational programs. We additionally introduce a neural sequence-to-program model enhanced with automatic problem categorization. Our experiments show improvements over competitive baselines in our MathQA as well as the AQuA dataset. The results are still significantly lower than human performance indicating that the dataset poses new challenges for future research. Our dataset is available at: https://math-qa.github.io/math-QA/" - }, - "mc_taco": { - "pwc_id": "mc-taco", - "dataset_name": "MC-TACO Dataset", - "dataset_abstract": "MC-TACO is a dataset of 13k question-answer pairs that require temporal commonsense comprehension. 
The dataset contains five temporal properties, (1) duration (how long an event takes), (2) temporal ordering (typical order of events), (3) typical time (when an event occurs), (4) frequency (how often an event occurs), and (5) stationarity (whether a state is maintained for a very long time or indefinitely).", - "paper_name": "\"Going on a vacation\" takes longer than \"Going for a walk\": A Study of Temporal Commonsense Understanding", - "paper_abstract": "Understanding time is crucial for understanding events expressed in natural language. Because people rarely say the obvious, it is often necessary to have commonsense knowledge about various temporal aspects of events, such as duration, frequency, and temporal order. However, this important problem has so far received limited attention. This paper systematically studies this temporal commonsense problem. Specifically, we define five classes of temporal commonsense, and use crowdsourcing to develop a new dataset, MCTACO, that serves as a test set for this task. We find that the best current methods used on MCTACO are still far behind human performance, by about 20%, and discuss several directions for improvement. We hope that the new dataset and our study here can foster more future research on this topic." - }, - "squadshifts": { - "pwc_id": "squad-shifts", - "dataset_name": "SQuAD-shifts Dataset", - "dataset_abstract": "Provides four new test sets for the Stanford Question Answering Dataset (SQuAD) and evaluate the ability of question-answering systems to generalize to new data.", - "paper_name": "The Effect of Natural Distribution Shift on Question Answering Models", - "paper_abstract": "We build four new test sets for the Stanford Question Answering Dataset (SQuAD) and evaluate the ability of question-answering systems to generalize to new data. Our first test set is from the original Wikipedia domain and measures the extent to which existing systems overfit the original test set. Despite several years of heavy test set re-use, we find no evidence of adaptive overfitting. The remaining three test sets are constructed from New York Times articles, Reddit posts, and Amazon product reviews and measure robustness to natural distribution shifts. Across a broad range of models, we observe average performance drops of 3.8, 14.0, and 17.4 F1 points, respectively. In contrast, a strong human baseline matches or exceeds the performance of SQuAD models on the original domain and exhibits little to no drop in new domains. Taken together, our results confirm the surprising resilience of the holdout method and emphasize the need to move towards evaluation metrics that incorporate robustness to natural distribution shifts." - }, - "cbt": { - "pwc_id": "cbt", - "dataset_name": "CBT Dataset", - "dataset_abstract": "Children\u2019s Book Test (CBT) is designed to measure directly how well language models can exploit wider linguistic context. The CBT is built from books that are freely available thanks to Project Gutenberg.", - "paper_name": "The Goldilocks Principle: Reading Children's Books with Explicit Memory Representations", - "paper_abstract": "We introduce a new test of how well language models capture meaning in\nchildren's books. Unlike standard language modelling benchmarks, it\ndistinguishes the task of predicting syntactic function words from that of\npredicting lower-frequency words, which carry greater semantic content. We\ncompare a range of state-of-the-art models, each with a different way of\nencoding what has been previously read. 
We show that models which store\nexplicit representations of long-term contexts outperform state-of-the-art\nneural language models at predicting semantic content words, although this\nadvantage is not observed for syntactic function words. Interestingly, we find\nthat the amount of text encoded in a single memory representation is highly\ninfluential to the performance: there is a sweet-spot, not too big and not too\nsmall, between single words and full sentences that allows the most meaningful\ninformation in a text to be effectively retained and recalled. Further, the\nattention over such window-based memories can be trained effectively through\nself-supervision. We then assess the generality of this principle by applying\nit to the CNN QA benchmark, which involves identifying named entities in\nparaphrased summaries of news articles, and achieve state-of-the-art\nperformance." - }, - "sms_spam": { - "pwc_id": "sms-spam-collection-data-set", - "dataset_name": "SMS Spam Collection Data Set Dataset", - "dataset_abstract": "This corpus has been collected from free or free for research sources at the Internet:\n\n\nA collection of 425 SMS spam messages was manually extracted from the Grumbletext Web site. This is a UK forum in which cell phone users make public claims about SMS spam messages, most of them without reporting the very spam message received. The identification of the text of spam messages in the claims is a very hard and time-consuming task, and it involved carefully scanning hundreds of web pages.\nA subset of 3,375 SMS randomly chosen ham messages of the NUS SMS Corpus (NSC), which is a dataset of about 10,000 legitimate messages collected for research at the Department of Computer Science at the National University of Singapore. The messages largely originate from Singaporeans and mostly from students attending the University. These messages were collected from volunteers who were made aware that their contributions were going to be made publicly available.\nA list of 450 SMS ham messages collected from Caroline Tag's PhD Thesis.\nthe SMS Spam Corpus v.0.1 Big. It has 1,002 SMS ham messages and 322 spam messages.", - "paper_name": "", - "paper_abstract": "" - }, - "winograd_wsc": { - "pwc_id": "wsc", - "dataset_name": "WSC Dataset", - "dataset_abstract": "The Winograd Schema Challenge was introduced both as an alternative to the Turing Test and as a test of a system\u2019s ability to do commonsense reasoning. A Winograd schema is a pair of sentences differing in one or two words with a highly ambiguous pronoun, resolved differently in the two sentences, that appears to require commonsense knowledge to be resolved correctly. The examples were designed to be easily solvable by humans but difficult for machines, in principle requiring a deep understanding of the content of the text and the situation it describes.\n\nThe original Winograd Schema Challenge dataset consisted of 100 Winograd schemas constructed manually by AI experts. As of 2020 there are 285 examples available; however, the last 12 examples were only added recently. To ensure consistency with earlier models, several authors often prefer to report the performance on the first 273 examples only. 
These datasets are usually referred to as WSC285 and WSC273, respectively.", - "paper_name": "", - "paper_abstract": "" - }, - "acronym_identification": { - "pwc_id": "acronym-identification", - "dataset_name": "Acronym Identification Dataset", - "dataset_abstract": "Is an acronym disambiguation (AD) dataset for scientific domain with 62,441 samples which is significantly larger than the previous scientific AD dataset.", - "paper_name": "What Does This Acronym Mean? Introducing a New Dataset for Acronym Identification and Disambiguation", - "paper_abstract": "Acronyms are the short forms of phrases that facilitate conveying lengthy sentences in documents and serve as one of the mainstays of writing. Due to their importance, identifying acronyms and corresponding phrases (i.e., acronym identification (AI)) and finding the correct meaning of each acronym (i.e., acronym disambiguation (AD)) are crucial for text understanding. Despite the recent progress on this task, there are some limitations in the existing datasets which hinder further improvement. More specifically, limited size of manually annotated AI datasets or noises in the automatically created acronym identification datasets obstruct designing advanced high-performing acronym identification models. Moreover, the existing datasets are mostly limited to the medical domain and ignore other domains. In order to address these two limitations, we first create a manually annotated large AI dataset for scientific domain. This dataset contains 17,506 sentences which is substantially larger than previous scientific AI datasets. Next, we prepare an AD dataset for scientific domain with 62,441 samples which is significantly larger than the previous scientific AD dataset. Our experiments show that the existing state-of-the-art models fall far behind human-level performance on both datasets proposed by this work. In addition, we propose a new deep learning model that utilizes the syntactical structure of the sentence to expand an ambiguous acronym in a sentence. The proposed model outperforms the state-of-the-art models on the new AD dataset, providing a strong baseline for future research on this dataset." - }, - "wmt14": { - "pwc_id": "wmt-2014", - "dataset_name": "WMT 2014 Dataset", - "dataset_abstract": "WMT 2014 is a collection of datasets used in shared tasks of the Ninth Workshop on Statistical Machine Translation. The workshop featured four tasks:\n\n\na news translation task,\na quality estimation task,\na metrics task,\na medical text translation task.", - "paper_name": "", - "paper_abstract": "" - }, - "ncbi_disease": { - "pwc_id": "ncbi-disease-1", - "dataset_name": "NCBI Disease Dataset", - "dataset_abstract": "The NCBI Disease corpus consists of 793 PubMed abstracts, which are separated into training (593), development (100) and test (100) subsets. The NCBI Disease corpus is annotated with disease mentions, using concept identifiers from either MeSH or OMIM.", - "paper_name": "", - "paper_abstract": "" - }, - "hate_speech_offensive": { - "pwc_id": "hate-speech-and-offensive-language", - "dataset_name": "Hate Speech and Offensive Language Dataset", - "dataset_abstract": "HSOL is a dataset for hate speech detection. The authors begun with a hate speech lexicon containing words and\nphrases identified by internet users as hate speech, compiled by Hatebase.org. Using the Twitter API they searched\nfor tweets containing terms from the lexicon, resulting in a sample of tweets from 33,458 Twitter users. 
They extracted\nthe time-line for each user, resulting in a set of 85.4 million tweets. From this corpus they took a random sample of 25k tweets containing terms from the lexicon and had them manually coded by CrowdFlower (CF) workers. Workers were asked to label each tweet as one of three categories: hate speech, offensive but not hate speech, or neither offensive nor hate speech.", - "paper_name": "Automated Hate Speech Detection and the Problem of Offensive Language", - "paper_abstract": "A key challenge for automatic hate-speech detection on social media is the\nseparation of hate speech from other instances of offensive language. Lexical\ndetection methods tend to have low precision because they classify all messages\ncontaining particular terms as hate speech and previous work using supervised\nlearning has failed to distinguish between the two categories. We used a\ncrowd-sourced hate speech lexicon to collect tweets containing hate speech\nkeywords. We use crowd-sourcing to label a sample of these tweets into three\ncategories: those containing hate speech, only offensive language, and those\nwith neither. We train a multi-class classifier to distinguish between these\ndifferent categories. Close analysis of the predictions and the errors shows\nwhen we can reliably separate hate speech from other offensive language and\nwhen this differentiation is more difficult. We find that racist and homophobic\ntweets are more likely to be classified as hate speech but that sexist tweets\nare generally classified as offensive. Tweets without explicit hate keywords\nare also more difficult to classify." - }, - "boolq": { - "pwc_id": "boolq", - "dataset_name": "BoolQ Dataset", - "dataset_abstract": "BoolQ is a question answering dataset for yes/no questions containing 15942 examples. These questions are naturally occurring \u2013 they are generated in unprompted and unconstrained settings.\nEach example is a triplet of (question, passage, answer), with the title of the page as optional additional context.\n\nQuestions are gathered from anonymized, aggregated queries to the Google search engine. Queries that are likely to be yes/no questions are heuristically identified and questions are only kept if a Wikipedia page is returned as one of the first five results, in which case the question and Wikipedia page are given to a human annotator for further processing. Annotators label question/article pairs in a three-step process. First, they decide if the question is good, meaning it is comprehensible, unambiguous, and requesting factual information. This judgment is made before the annotator sees the Wikipedia page. Next, for good questions, annotators find a passage within the document that contains enough information to answer the question. Annotators can mark questions as \u201cnot answerable\u201d if the Wikipedia article does not contain the requested information. Finally, annotators mark whether the question\u2019s answer is \u201cyes\u201d or \u201cno\u201d. Only questions that were marked as having a yes/no answer are used, and each question is paired with the selected passage instead of the entire document.", - "paper_name": "BoolQ: Exploring the Surprising Difficulty of Natural Yes/No Questions", - "paper_abstract": "In this paper we study yes/no questions that are naturally occurring --- meaning that they are generated in unprompted and unconstrained settings. We build a reading comprehension dataset, BoolQ, of such questions, and show that they are unexpectedly challenging. 
They often query for complex, non-factoid information, and require difficult entailment-like inference to solve. We also explore the effectiveness of a range of transfer learning baselines. We find that transferring from entailment data is more effective than transferring from paraphrase or extractive QA data, and that it, surprisingly, continues to be very beneficial even when starting from massive pre-trained language models such as BERT. Our best method trains BERT on MultiNLI and then re-trains it on our train set. It achieves 80.4% accuracy compared to 90% accuracy of human annotators (and 62% majority-baseline), leaving a significant gap for future work." - }, - "billsum": { - "pwc_id": "billsum", - "dataset_name": "BillSum Dataset", - "dataset_abstract": "BillSum is the first dataset for summarization of US Congressional and California state bills.\n\nThe BillSum dataset consists of three parts: US training bills, US test bills and California test bills. The US bills were collected from the Govinfo service provided by the United States Government Publishing Office (GPO). The corpus consists of bills from the 103rd-115th (1993-2018) sessions of Congress. The data was split into 18,949 train bills and 3,269 test bills. For California, bills from the 2015-2016 session were scraped directly from the legislature\u2019s website; the summaries were written by their Legislative Counsel.\n\nThe BillSum corpus focuses on mid-length legislation from 5,000 to 20,000 character in length. The authors chose to measure the text length in characters, instead of words or sentences, because the texts have complex structure that makes it difficult to consistently measure words. The range was chosen because on one side, short bills introduce minor changes and do not require summaries. While the CRS produces summaries for them, they often contain most of the text of the bill. On the\nother side, very long legislation is often composed of several large sections.", - "paper_name": "BillSum: A Corpus for Automatic Summarization of US Legislation", - "paper_abstract": "Automatic summarization methods have been studied on a variety of domains, including news and scientific articles. Yet, legislation has not previously been considered for this task, despite US Congress and state governments releasing tens of thousands of bills every year. In this paper, we introduce BillSum, the first dataset for summarization of US Congressional and California state bills (https://github.com/FiscalNote/BillSum). We explain the properties of the dataset that make it more challenging to process than other domains. Then, we benchmark extractive methods that consider neural sentence representations and traditional contextual features. Finally, we demonstrate that models built on Congressional bills can be used to summarize California bills, thus, showing that methods developed on this dataset can transfer to states without human-written summaries." - }, - "mdd": { - "pwc_id": "mdd", - "dataset_name": "MDD Dataset", - "dataset_abstract": "Movie Dialog dataset (MDD) is designed to measure how well models can perform at goal and non-goal orientated dialog centered around the topic of movies (question answering, recommendation and discussion).", - "paper_name": "Evaluating Prerequisite Qualities for Learning End-to-End Dialog Systems", - "paper_abstract": "A long-term goal of machine learning is to build intelligent conversational\nagents. 
One recent popular approach is to train end-to-end models on a large\namount of real dialog transcripts between humans (Sordoni et al., 2015; Vinyals\n& Le, 2015; Shang et al., 2015). However, this approach leaves many questions\nunanswered as an understanding of the precise successes and shortcomings of\neach model is hard to assess. A contrasting recent proposal are the bAbI tasks\n(Weston et al., 2015b) which are synthetic data that measure the ability of\nlearning machines at various reasoning tasks over toy language. Unfortunately,\nthose tests are very small and hence may encourage methods that do not scale.\nIn this work, we propose a suite of new tasks of a much larger scale that\nattempt to bridge the gap between the two regimes. Choosing the domain of\nmovies, we provide tasks that test the ability of models to answer factual\nquestions (utilizing OMDB), provide personalization (utilizing MovieLens),\ncarry short conversations about the two, and finally to perform on natural\ndialogs from Reddit. We provide a dataset covering 75k movie entities and with\n3.5M training examples. We present results of various models on these tasks,\nand evaluate their performance." - }, - "coqa": { - "pwc_id": "coqa", - "dataset_name": "CoQA Dataset", - "dataset_abstract": "CoQA is a large-scale dataset for building Conversational Question Answering systems. The goal of the CoQA challenge is to measure the ability of machines to understand a text passage and answer a series of interconnected questions that appear in a conversation.\n\nCoQA contains 127,000+ questions with answers collected from 8000+ conversations. Each conversation is collected by pairing two crowdworkers to chat about a passage in the form of questions and answers. The unique features of CoQA include 1) the questions are conversational; 2) the answers can be free-form text; 3) each answer also comes with an evidence subsequence highlighted in the passage; and 4) the passages are collected from seven diverse domains. CoQA has a lot of challenging phenomena not present in existing reading comprehension datasets, e.g., coreference and pragmatic reasoning.", - "paper_name": "CoQA: A Conversational Question Answering Challenge", - "paper_abstract": "Humans gather information by engaging in conversations involving a series of\ninterconnected questions and answers. For machines to assist in information\ngathering, it is therefore essential to enable them to answer conversational\nquestions. We introduce CoQA, a novel dataset for building Conversational\nQuestion Answering systems. Our dataset contains 127k questions with answers,\nobtained from 8k conversations about text passages from seven diverse domains.\nThe questions are conversational, and the answers are free-form text with their\ncorresponding evidence highlighted in the passage. We analyze CoQA in depth and\nshow that conversational questions have challenging phenomena not present in\nexisting reading comprehension datasets, e.g., coreference and pragmatic\nreasoning. We evaluate strong conversational and reading comprehension models\non CoQA. The best system obtains an F1 score of 65.4%, which is 23.4 points\nbehind human performance (88.8%), indicating there is ample room for\nimprovement. 
We launch CoQA as a challenge to the community at\nhttp://stanfordnlp.github.io/coqa/" - }, - "hotpot_qa": { - "pwc_id": "hotpotqa", - "dataset_name": "HotpotQA Dataset", - "dataset_abstract": "HotpotQA is a question answering dataset collected on the English Wikipedia, containing about 113K crowd-sourced questions that are constructed to require the introduction paragraphs of two Wikipedia articles to answer. Each question in the dataset comes with the two gold paragraphs, as well as a list of sentences in these paragraphs that crowdworkers identify as supporting facts necessary to answer the question. \n\nA diverse range of reasoning strategies are featured in HotpotQA, including questions involving missing entities in the question, intersection questions (What satisfies property A and property B?), and comparison questions, where two entities are compared by a common attribute, among others. In the few-document distractor setting, the QA models are given ten paragraphs in which the gold paragraphs are guaranteed to be found; in the open-domain fullwiki setting, the models are only given the question and the entire Wikipedia. Models are evaluated on their answer accuracy and explainability, where the former is measured as overlap between the predicted and gold answers with exact match (EM) and unigram F1, and the latter concerns how well the predicted supporting fact sentences match human annotation (Supporting Fact EM/F1). A joint metric is also reported on this dataset, which encourages systems to perform well on both tasks simultaneously.", - "paper_name": "HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering", - "paper_abstract": "Existing question answering (QA) datasets fail to train QA systems to perform\ncomplex reasoning and provide explanations for answers. We introduce HotpotQA,\na new dataset with 113k Wikipedia-based question-answer pairs with four key\nfeatures: (1) the questions require finding and reasoning over multiple\nsupporting documents to answer; (2) the questions are diverse and not\nconstrained to any pre-existing knowledge bases or knowledge schemas; (3) we\nprovide sentence-level supporting facts required for reasoning, allowing QA\nsystems to reason with strong supervision and explain the predictions; (4) we\noffer a new type of factoid comparison questions to test QA systems' ability to\nextract relevant facts and perform necessary comparison. We show that HotpotQA\nis challenging for the latest QA systems, and the supporting facts enable\nmodels to improve performance and make explainable predictions." - }, - "cc_news": { - "pwc_id": "cc-news", - "dataset_name": "CC-News Dataset", - "dataset_abstract": "CommonCrawl News is a dataset containing news articles from news sites all over the world. The dataset is available in form of Web ARChive (WARC) files that are released on a daily basis.", - "paper_name": "", - "paper_abstract": "" - }, - "biosses": { - "pwc_id": "biosses", - "dataset_name": "BIOSSES Dataset", - "dataset_abstract": "The BIOSSES data set comprises total 100 sentence pairs all of which were selected from the \"TAC2 Biomedical Summarization Track Training Data Set\" .\n\nThe sentence pairs were evaluated by five different human experts that judged their similarity and gave scores in a range [0-4]. 
Our guideline was prepared based on SemEval 2012 Task 6 Guideline.", - "paper_name": "BIOSSES: A Semantic Sentence Similarity Estimation System for the Biomedical Domain", - "paper_abstract": "Motivation: The amount of information available in textual format is rapidly increasing in the biomedical domain. Therefore, natural language processing (NLP) applications are becoming increasingly important to facilitate the retrieval and analysis of these data. Computing the semantic similarity between sentences is an important component in many NLP tasks including text retrieval and summarization. A number of approaches have been proposed for semantic sentence similarity estimation for generic English. However, our experiments showed that such approaches do not effectively cover biomedical knowledge and produce poor results for biomedical text.\r\n\r\nMethods: We propose several approaches for sentence-level semantic similarity computation in the biomedical domain, including string similarity measures and measures based on the distributed vector representations of sentences learned in an unsupervised manner from a large biomedical corpus. In addition, ontology-based approaches are presented that utilize general and domain-specific ontologies. Finally, a supervised regression based model is developed that effectively combines the different similarity computation metrics. A benchmark data set consisting of 100 sentence pairs from the biomedical literature is manually annotated by five human experts and used for evaluating the proposed methods.\r\n\r\nResults: The experiments showed that the supervised semantic sentence similarity computation approach obtained the best performance (0.836 correlation with gold standard human annotations) and improved over the state-of-the-art domain-independent systems up to 42.6% in terms of the Pearson correlation metric." - }, - "crows_pairs": { - "pwc_id": "crows-pairs", - "dataset_name": "CrowS-Pairs Dataset", - "dataset_abstract": "CrowS-Pairs has 1508 examples that cover stereotypes dealing with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups.", - "paper_name": "CrowS-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models", - "paper_abstract": "Pretrained language models, especially masked language models (MLMs) have seen success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To measure some forms of social bias in language models against protected demographic groups in the US, we introduce the Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. 
As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress." - }, - "sem_eval_2010_task_8": { - "pwc_id": "semeval-2010-task-8", - "dataset_name": "SemEval-2010 Task 8 Dataset", - "dataset_abstract": "The dataset for the SemEval-2010 Task 8 is a dataset for multi-way classification of mutually exclusive semantic relations between pairs of nominals.", - "paper_name": "SemEval-2010 Task 8: Multi-Way Classification of Semantic Relations Between Pairs of Nominals", - "paper_abstract": "In response to the continuing research interest in computational semantic analysis, we have proposed a new task for SemEval-2010: multi-way classification of mutually exclusive semantic relations between pairs of nominals. The task is designed to compare different approaches to the problem and to provide a standard testbed for future research. In this paper, we define the task, describe the creation of the datasets, and discuss the results of the participating 28 systems submitted by 10 teams." - }, - "wnut_17": { - "pwc_id": "wnut-2017-emerging-and-rare-entity", - "dataset_name": "WNUT 2017 Dataset", - "dataset_abstract": "This shared task focuses on identifying unusual, previously-unseen entities in the context of emerging discussions. Named entities form the basis of many modern approaches to other tasks (like event clustering and summarisation), but recall on them is a real problem in noisy text - even among annotators. This drop tends to be due to novel entities and surface forms. Take for example the tweet \u201cso.. kktny in 30 mins?\u201d - even human experts find entity kktny hard to detect and resolve. This task will evaluate the ability to detect and classify novel, emerging, singleton named entities in noisy text.\n\nThe goal of this task is to provide a definition of emerging and of rare entities, and based on that, also datasets for detecting these entities.", - "paper_name": "Results of the WNUT2017 Shared Task on Novel and Emerging Entity Recognition", - "paper_abstract": "This shared task focuses on identifying unusual, previously-unseen entities in the context of emerging discussions. Named entities form the basis of many modern approaches to other tasks (like event clustering and summarization), but recall on them is a real problem in noisy text - even among annotators. This drop tends to be due to novel entities and surface forms. Take for example the tweet {``}so.. kktny in 30 mins?!{''} {--} even human experts find the entity {`}kktny{'} hard to detect and resolve. The goal of this task is to provide a definition of emerging and of rare entities, and based on that, also datasets for detecting these entities. The task as described in this paper evaluated the ability of participating entries to detect and classify novel and emerging named entities in noisy text." - }, - "narrativeqa": { - "pwc_id": "narrativeqa", - "dataset_name": "NarrativeQA Dataset", - "dataset_abstract": "The NarrativeQA dataset includes a list of documents with Wikipedia summaries, links to full stories, and questions and answers.", - "paper_name": "The NarrativeQA Reading Comprehension Challenge", - "paper_abstract": "Reading comprehension (RC)---in contrast to information retrieval---requires\nintegrating information and reasoning about events, entities, and their\nrelations across a full document. 
Question answering is conventionally used to\nassess RC ability, in both artificial agents and children learning to read.\nHowever, existing RC datasets and tasks are dominated by questions that can be\nsolved by selecting answers using superficial information (e.g., local context\nsimilarity or global term frequency); they thus fail to test for the essential\nintegrative aspect of RC. To encourage progress on deeper comprehension of\nlanguage, we present a new dataset and set of tasks in which the reader must\nanswer questions about stories by reading entire books or movie scripts. These\ntasks are designed so that successfully answering their questions requires\nunderstanding the underlying narrative rather than relying on shallow pattern\nmatching or salience. We show that although humans solve the tasks easily,\nstandard RC models struggle on the tasks presented here. We provide an analysis\nof the dataset and the challenges it presents." - }, - "discovery": { - "pwc_id": "discovery", - "dataset_name": "Discovery Dataset Dataset", - "dataset_abstract": "The Discovery datasets consists of adjacent sentence pairs (s1,s2) with a discourse marker (y) that occurred at the beginning of s2. They were extracted from the depcc web corpus.\n\nMarkers prediction can be used in order to train a sentence encoders. Discourse markers can be considered as noisy labels for various semantic tasks, such as entailment (y=therefore), subjectivity analysis (y=personally) or sentiment analysis (y=sadly), similarity (y=similarly), typicality, (y=curiously) ...\n\nThe specificity of this dataset is the diversity of the markers, since previously used data used only ~10 imbalanced classes. The author of the dataset provide:\n\n\na list of the 174 discourse markers\na Base version of the dataset with 1.74 million pairs (10k examples per marker)\na Big version with 3.4 million pairs\na Hard version with 1.74 million pairs where the connective couldn't be predicted with a fastText linear model", - "paper_name": "Mining Discourse Markers for Unsupervised Sentence Representation Learning", - "paper_abstract": "Current state of the art systems in NLP heavily rely on manually annotated\ndatasets, which are expensive to construct. Very little work adequately\nexploits unannotated data -- such as discourse markers between sentences --\nmainly because of data sparseness and ineffective extraction methods. In the\npresent work, we propose a method to automatically discover sentence pairs with\nrelevant discourse markers, and apply it to massive amounts of data. Our\nresulting dataset contains 174 discourse markers with at least 10k examples\neach, even for rare markers such as coincidentally or amazingly We use the\nresulting data as supervision for learning transferable sentence embeddings. In\naddition, we show that even though sentence representation learning through\nprediction of discourse markers yields state of the art results across\ndifferent transfer tasks, it is not clear that our models made use of the\nsemantic relation between sentences, thus leaving room for further\nimprovements. 
Our datasets are publicly available\n(https://github.com/synapse-developpement/Discovery)" - }, - "lambada": { - "pwc_id": "lambada", - "dataset_name": "LAMBADA Dataset", - "dataset_abstract": "The LAMBADA (LAnguage Modeling Broadened to Account for Discourse Aspects) benchmark is an open-ended cloze task which consists of about 10,000 passages from BooksCorpus where a missing target word is predicted in the last sentence of each passage. The missing word is constrained to always be the last word of the last sentence and there are no candidate words to choose from. Examples were filtered by humans to ensure they were possible to guess given the context, i.e., the sentences in the passage leading up to the last sentence. Examples were further filtered to ensure that missing words could not be guessed without the context, ensuring that models attempting the dataset would need to reason over the entire paragraph to answer questions.", - "paper_name": "The LAMBADA dataset: Word prediction requiring a broad discourse context", - "paper_abstract": "We introduce LAMBADA, a dataset to evaluate the capabilities of computational\nmodels for text understanding by means of a word prediction task. LAMBADA is a\ncollection of narrative passages sharing the characteristic that human subjects\nare able to guess their last word if they are exposed to the whole passage, but\nnot if they only see the last sentence preceding the target word. To succeed on\nLAMBADA, computational models cannot simply rely on local context, but must be\nable to keep track of information in the broader discourse. We show that\nLAMBADA exemplifies a wide range of linguistic phenomena, and that none of\nseveral state-of-the-art language models reaches accuracy above 1% on this\nnovel benchmark. We thus propose LAMBADA as a challenging test set, meant to\nencourage the development of new models capable of genuine understanding of\nbroad context in natural language text." - }, - "selqa": { - "pwc_id": "selqa", - "dataset_name": "SelQA Dataset", - "dataset_abstract": "SelQA is a dataset that consists of questions generated through crowdsourcing and sentence length answers that are drawn from the ten most prevalent topics in the English Wikipedia.", - "paper_name": "SelQA: A New Benchmark for Selection-based Question Answering", - "paper_abstract": "This paper presents a new selection-based question answering dataset, SelQA.\nThe dataset consists of questions generated through crowdsourcing and sentence\nlength answers that are drawn from the ten most prevalent topics in the English\nWikipedia. We introduce a corpus annotation scheme that enhances the generation\nof large, diverse, and challenging datasets by explicitly aiming to reduce word\nco-occurrences between the question and answers. Our annotation scheme is\ncomposed of a series of crowdsourcing tasks with a view to more effectively\nutilize crowdsourcing in the creation of question answering datasets in various\ndomains. Several systems are compared on the tasks of answer sentence selection\nand answer triggering, providing strong baseline results for future work to\nimprove upon." - }, - "sick": { - "pwc_id": "sick", - "dataset_name": "SICK Dataset", - "dataset_abstract": "The Sentences Involving Compositional Knowledge (SICK) dataset is a dataset for compositional distributional semantics. It includes a large number of sentence pairs that are rich in the lexical, syntactic and semantic phenomena. 
Each pair of sentences is annotated in two dimensions: relatedness and entailment. The relatedness score ranges from 1 to 5, and Pearson\u2019s r is used for evaluation; the entailment relation is categorical, consisting of entailment, contradiction, and neutral. There are 4439 pairs in the train split, 495 in the trial split used for development and 4906 in the test split. The sentence pairs are generated from image and video caption datasets and then paired automatically.", - "paper_name": "A SICK cure for the evaluation of compositional distributional semantic models", - "paper_abstract": "Shared and internationally recognized benchmarks are fundamental for the development of any computational system. We aim to help the research community working on compositional distributional semantic models (CDSMs) by providing SICK (Sentences Involving Compositional Knowledge), a large size English benchmark tailored for them. SICK consists of about 10,000 English sentence pairs that include many examples of the lexical, syntactic and semantic phenomena that CDSMs are expected to account for, but do not require dealing with other aspects of existing sentential data sets (idiomatic multiword expressions, named entities, telegraphic language) that are not within the scope of CDSMs. By means of crowdsourcing techniques, each pair was annotated for two crucial semantic tasks: relatedness in meaning (with a 5-point rating scale as gold score) and entailment relation between the two elements (with three possible gold labels: entailment, contradiction, and neutral). The SICK data set was used in SemEval-2014 Task 1, and it is freely available for research purposes." - }, - "fever": { - "pwc_id": "fever", - "dataset_name": "FEVER Dataset", - "dataset_abstract": "FEVER is a publicly available dataset for fact extraction and verification against textual sources.\n\nIt consists of 185,445 claims manually verified against the introductory sections of Wikipedia pages and classified as SUPPORTED, REFUTED or NOTENOUGHINFO. For the first two classes, systems and annotators need to also return the combination of sentences forming the necessary evidence supporting or refuting the claim.\n\nThe claims were generated by human annotators extracting claims from Wikipedia and mutating them in a variety of ways, some of which were meaning-altering. The verification of each claim was conducted in a separate annotation process by annotators who were aware of the page but not the sentence from which the original claim was\nextracted, and thus in 31.75% of the claims more than one sentence was considered appropriate evidence. Claims require composition of evidence from multiple sentences in 16.82% of cases. Furthermore, in 12.15% of the claims, this evidence was taken from multiple pages.", - "paper_name": "FEVER: a large-scale dataset for Fact Extraction and VERification", - "paper_abstract": "In this paper we introduce a new publicly available dataset for verification\nagainst textual sources, FEVER: Fact Extraction and VERification. It consists\nof 185,445 claims generated by altering sentences extracted from Wikipedia and\nsubsequently verified without knowledge of the sentence they were derived from.\nThe claims are classified as Supported, Refuted or NotEnoughInfo by annotators\nachieving 0.6841 in Fleiss $\\kappa$. For the first two classes, the annotators\nalso recorded the sentence(s) forming the necessary evidence for their\njudgment. 
To characterize the challenge of the dataset presented, we develop a\npipeline approach and compare it to suitably designed oracles. The best\naccuracy we achieve on labeling a claim accompanied by the correct evidence is\n31.87%, while if we ignore the evidence we achieve 50.91%. Thus we believe that\nFEVER is a challenging testbed that will help stimulate progress on claim\nverification against textual sources." - }, - "scicite": { - "pwc_id": "scicite", - "dataset_name": "SciCite Dataset", - "dataset_abstract": "SciCite is a dataset of citation intents that addresses multiple scientific domains and is more than five times larger than ACL-ARC.", - "paper_name": "Structural Scaffolds for Citation Intent Classification in Scientific Publications", - "paper_abstract": "Identifying the intent of a citation in scientific papers (e.g., background information, use of methods, comparing results) is critical for machine reading of individual publications and automated analysis of the scientific literature. We propose structural scaffolds, a multitask model to incorporate structural information of scientific papers into citations for effective classification of citation intents. Our model achieves a new state-of-the-art on an existing ACL anthology dataset (ACL-ARC) with a 13.3% absolute increase in F1 score, without relying on external linguistic resources or hand-engineered features as done in existing methods. In addition, we introduce a new dataset of citation intents (SciCite) which is more than five times larger and covers multiple scientific domains compared with existing datasets. Our code and data are available at: https://github.com/allenai/scicite." - }, - "mlqa": { - "pwc_id": "mlqa", - "dataset_name": "MLQA Dataset", - "dataset_abstract": "MLQA (MultiLingual Question Answering) is a benchmark dataset for evaluating cross-lingual question answering performance. MLQA consists of over 5K extractive QA instances (12K in English) in SQuAD format in seven languages - English, Arabic, German, Spanish, Hindi, Vietnamese and Simplified Chinese. MLQA is highly parallel, with QA instances parallel between 4 different languages on average.", - "paper_name": "MLQA: Evaluating Cross-lingual Extractive Question Answering", - "paper_abstract": "Question answering (QA) models have shown rapid progress enabled by the availability of large, high-quality benchmark datasets. Such annotated datasets are difficult and costly to collect, and rarely exist in languages other than English, making training QA systems in other languages challenging. An alternative to building large monolingual training datasets is to develop cross-lingual systems which can transfer to a target language without requiring training data in that language. In order to develop such systems, it is crucial to invest in high quality multilingual evaluation benchmarks to measure progress. We present MLQA, a multi-way aligned extractive QA evaluation benchmark intended to spur research in this area. MLQA contains QA instances in 7 languages, namely English, Arabic, German, Spanish, Hindi, Vietnamese and Simplified Chinese. It consists of over 12K QA instances in English and 5K in each other language, with each QA instance being parallel between 4 languages on average. MLQA is built using a novel alignment context strategy on Wikipedia articles, and serves as a cross-lingual extension to existing extractive QA datasets. 
We evaluate current state-of-the-art cross-lingual representations on MLQA, and also provide machine-translation-based baselines. In all cases, transfer results are shown to be significantly behind training-language performance." - }, - "clinc_oos": { - "pwc_id": "clinc150", - "dataset_name": "CLINC150 Dataset", - "dataset_abstract": "This dataset is for evaluating the performance of intent classification systems in the presence of \"out-of-scope\" queries, i.e., queries that do not fall into any of the system-supported intent classes. The dataset includes both in-scope and out-of-scope data.", - "paper_name": "An Evaluation Dataset for Intent Classification and Out-of-Scope Prediction", - "paper_abstract": "Task-oriented dialog systems need to know when a query falls outside their range of supported intents, but current text classification corpora only define label sets that cover every example. We introduce a new dataset that includes queries that are out-of-scope---i.e., queries that do not fall into any of the system's supported intents. This poses a new challenge because models cannot assume that every query at inference time belongs to a system-supported intent class. Our dataset also covers 150 intent classes over 10 domains, capturing the breadth that a production task-oriented agent must handle. We evaluate a range of benchmark classifiers on our dataset along with several different out-of-scope identification schemes. We find that while the classifiers perform well on in-scope intent classification, they struggle to identify out-of-scope queries. Our dataset and evaluation fill an important gap in the field, offering a way of more rigorously and realistically benchmarking text classification in task-driven dialog systems." - }, - "tab_fact": { - "pwc_id": "tabfact", - "dataset_name": "TabFact Dataset", - "dataset_abstract": "TabFact is a large-scale dataset which consists of 117,854 manually annotated statements with regard to 16,573 Wikipedia tables, their relations are classified as ENTAILED and REFUTED. TabFact is the first dataset to evaluate language inference on structured data, which involves mixed reasoning skills in both symbolic and linguistic aspects.", - "paper_name": "TabFact: A Large-scale Dataset for Table-based Fact Verification", - "paper_abstract": "The problem of verifying whether a textual hypothesis holds based on the given evidence, also known as fact verification, plays an important role in the study of natural language understanding and semantic representation. However, existing studies are mainly restricted to dealing with unstructured evidence (e.g., natural language sentences and documents, news, etc), while verification under structured evidence, such as tables, graphs, and databases, remains under-explored. This paper specifically aims to study the fact verification given semi-structured data as evidence. To this end, we construct a large-scale dataset called TabFact with 16k Wikipedia tables as the evidence for 118k human-annotated natural language statements, which are labeled as either ENTAILED or REFUTED. TabFact is challenging since it involves both soft linguistic reasoning and hard symbolic reasoning. To address these reasoning challenges, we design two different models: Table-BERT and Latent Program Algorithm (LPA). Table-BERT leverages the state-of-the-art pre-trained language model to encode the linearized tables and statements into continuous vectors for verification. 
LPA parses statements into programs and executes them against the tables to obtain the returned binary value for verification. Both methods achieve similar accuracy but still lag far behind human performance. We also perform a comprehensive analysis to demonstrate great future opportunities. The data and code of the dataset are provided in \\url{https://github.com/wenhuchen/Table-Fact-Checking}." - }, - "poem_sentiment": { - "pwc_id": "gutenberg-poem-dataset", - "dataset_name": "Gutenberg Poem Dataset Dataset", - "dataset_abstract": "Gutenberg Poem Dataset is used for the next verse prediction component.", - "paper_name": "Investigating Societal Biases in a Poetry Composition System", - "paper_abstract": "There is a growing collection of work analyzing and mitigating societal biases in language understanding, generation, and retrieval tasks, though examining biases in creative tasks remains underexplored. Creative language applications are meant for direct interaction with users, so it is important to quantify and mitigate societal biases in these applications. We introduce a novel study on a pipeline to mitigate societal biases when retrieving next verse suggestions in a poetry composition system. Our results suggest that data augmentation through sentiment style transfer has potential for mitigating societal biases." - }, - "health_fact": { - "pwc_id": "pubhealth", - "dataset_name": "PUBHEALTH Dataset", - "dataset_abstract": "PUBHEALTH is a comprehensive dataset for explainable automated fact-checking of public health claims. Each instance in the PUBHEALTH dataset has an associated veracity label (true, false, unproven, mixture). Furthermore each instance in the dataset has an explanation text field. The explanation is a justification for which the claim has been assigned a particular veracity label.", - "paper_name": "Explainable Automated Fact-Checking for Public Health Claims", - "paper_abstract": "Fact-checking is the task of verifying the veracity of claims by assessing their assertions against credible evidence. The vast majority of fact-checking studies focus exclusively on political claims. Very little research explores fact-checking for other topics, specifically subject matters for which expertise is required. We present the first study of explainable fact-checking for claims which require specific expertise. For our case study we choose the setting of public health. To support this case study we construct a new dataset PUBHEALTH of 11.8K claims accompanied by journalist crafted, gold standard explanations (i.e., judgments) to support the fact-check labels for claims. We explore two tasks: veracity prediction and explanation generation. We also define and evaluate, with humans and computationally, three coherence properties of explanation quality. Our results indicate that, by training on in-domain data, gains can be made in explainable, automated fact-checking for claims which require specific expertise." - }, - "scitldr": { - "pwc_id": "scitldr", - "dataset_name": "SciTLDR Dataset", - "dataset_abstract": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers. SciTLDR contains both author-written and expert-derived TLDRs, where the latter are collected using a novel annotation protocol that produces high-quality summaries while minimizing annotation burden.", - "paper_name": "TLDR: Extreme Summarization of Scientific Documents", - "paper_abstract": "We introduce TLDR generation, a new form of extreme summarization, for scientific papers. 
TLDR generation involves high source compression and requires expert background knowledge and understanding of complex domain-specific language. To facilitate study on this task, we introduce SciTLDR, a new multi-target dataset of 5.4K TLDRs over 3.2K papers. SciTLDR contains both author-written and expert-derived TLDRs, where the latter are collected using a novel annotation protocol that produces high-quality summaries while minimizing annotation burden. We propose CATTS, a simple yet effective learning strategy for generating TLDRs that exploits titles as an auxiliary training signal. CATTS improves upon strong baselines under both automated metrics and human evaluations. Data and code are publicly available at https://github.com/allenai/scitldr." - }, - "emo": { - "pwc_id": "emocontext", - "dataset_name": "EmoContext Dataset", - "dataset_abstract": "EmoContext consists of three-turn English Tweets. The emotion labels include happiness, sadness, anger and other.", - "paper_name": "SemEval-2019 Task 3: EmoContext Contextual Emotion Detection in Text", - "paper_abstract": "In this paper, we present the SemEval-2019 Task 3 - EmoContext: Contextual Emotion Detection in Text. Lack of facial expressions and voice modulations make detecting emotions in text a challenging problem. For instance, as humans, on reading {``}Why don{'}t you ever text me!{''} we can either interpret it as a sad or angry emotion and the same ambiguity exists for machines. However, the context of dialogue can prove helpful in detection of the emotion. In this task, given a textual dialogue i.e. an utterance along with two previous turns of context, the goal was to infer the underlying emotion of the utterance by choosing from four emotion classes - Happy, Sad, Angry and Others. To facilitate the participation in this task, textual dialogues from user interaction with a conversational agent were taken and annotated for emotion classes after several data processing steps. A training data set of 30160 dialogues, and two evaluation data sets, Test1 and Test2, containing 2755 and 5509 dialogues respectively were released to the participants. A total of 311 teams made submissions to this task. The final leader-board was evaluated on Test2 data set, and the highest ranked submission achieved 79.59 micro-averaged F1 score. Our analysis of systems submitted to the task indicate that Bi-directional LSTM was the most common choice of neural architecture used, and most of the systems had the best performance for the Sad emotion class, and the worst for the Happy emotion class." - }, - "eli5": { - "pwc_id": "eli5", - "dataset_name": "ELI5 Dataset", - "dataset_abstract": "ELI5 is a dataset for long-form question answering. It contains 270K complex, diverse questions that require explanatory multi-sentence answers. Web search results are used as evidence documents to answer each question.\n\nELI5 is also a task in Dodecadialogue.", - "paper_name": "ELI5: Long Form Question Answering", - "paper_abstract": "We introduce the first large-scale corpus for long-form question answering, a task requiring elaborate and in-depth answers to open-ended questions. The dataset comprises 270K threads from the Reddit forum ``Explain Like I'm Five'' (ELI5) where an online community provides answers to questions which are comprehensible by five year olds. Compared to existing datasets, ELI5 comprises diverse questions requiring multi-sentence answers. We provide a large set of web documents to help answer the question. 
Automatic and human evaluations show that an abstractive model trained with a multi-task objective outperforms conventional Seq2Seq, language modeling, as well as a strong extractive baseline. However, our best model is still far from human performance since raters prefer gold responses in over 86% of cases, leaving ample opportunity for future improvement." - }, - "cord19": { - "pwc_id": "cord-19", - "dataset_name": "CORD-19 Dataset", - "dataset_abstract": "CORD-19 is a free resource of tens of thousands of scholarly articles about COVID-19, SARS-CoV-2, and related coronaviruses for use by the global research community.", - "paper_name": "CORD-19: The COVID-19 Open Research Dataset", - "paper_abstract": "The COVID-19 Open Research Dataset (CORD-19) is a growing resource of scientific papers on COVID-19 and related historical coronavirus research. CORD-19 is designed to facilitate the development of text mining and information retrieval systems over its rich collection of metadata and structured full text papers. Since its release, CORD-19 has been downloaded over 200K times and has served as the basis of many COVID-19 text mining and discovery systems. In this article, we describe the mechanics of dataset construction, highlighting challenges and key design decisions, provide an overview of how CORD-19 has been used, and describe several shared tasks built around the dataset. We hope this resource will continue to bring together the computing community, biomedical experts, and policy makers in the search for effective treatments and management policies for COVID-19." - }, - "timit_asr": { - "pwc_id": "timit", - "dataset_name": "TIMIT Dataset", - "dataset_abstract": "The TIMIT Acoustic-Phonetic Continuous Speech Corpus is a standard dataset used for evaluation of automatic speech recognition systems. It consists of recordings of 630 speakers of 8 dialects of American English each reading 10 phonetically-rich sentences. It also comes with the word and phone-level transcriptions of the speech.", - "paper_name": "", - "paper_abstract": "" - }, - "aeslc": { - "pwc_id": "aeslc", - "dataset_name": "AESLC Dataset", - "dataset_abstract": "To study the task of email subject line generation: automatically generating an email subject line from the email body.", - "paper_name": "This Email Could Save Your Life: Introducing the Task of Email Subject Line Generation", - "paper_abstract": "Given the overwhelming number of emails, an effective subject line becomes essential to better inform the recipient of the email's content. In this paper, we propose and study the task of email subject line generation: automatically generating an email subject line from the email body. We create the first dataset for this task and find that email subject line generation favor extremely abstractive summary which differentiates it from news headline generation or news single document summarization. We then develop a novel deep learning method and compare it to several baselines as well as recent state-of-the-art text summarization systems. We also investigate the efficacy of several automatic metrics based on correlations with human judgments and propose a new automatic evaluation metric. Our system outperforms competitive baselines given both automatic and human evaluations. To our knowledge, this is the first work to tackle the problem of effective email subject line generation." 
- }, - "ecthr_cases": { - "pwc_id": "ecthr", - "dataset_name": "ECtHR Dataset", - "dataset_abstract": "ECtHR is a dataset comprising European Court of Human Rights cases, including annotations for paragraph-level rationales. This dataset comprises 11k ECtHR cases and can be viewed as an enriched version of the ECtHR dataset of Chalkidis et al. (2019), which did not provide ground truth for alleged article violations (articles discussed) and rationales. It is released with silver rationales obtained from references in court decisions, and gold rationales provided by ECHR-experienced lawyers", - "paper_name": "Paragraph-level Rationale Extraction through Regularization: A case study on European Court of Human Rights Cases", - "paper_abstract": "Interpretability or explainability is an emerging research field in NLP. From a user-centric point of view, the goal is to build models that provide proper justification for their decisions, similar to those of humans, by requiring the models to satisfy additional constraints. To this end, we introduce a new application on legal text where, contrary to mainstream literature targeting word-level rationales, we conceive rationales as selected paragraphs in multi-paragraph structured court cases. We also release a new dataset comprising European Court of Human Rights cases, including annotations for paragraph-level rationales. We use this dataset to study the effect of already proposed rationale constraints, i.e., sparsity, continuity, and comprehensiveness, formulated as regularizers. Our findings indicate that some of these constraints are not beneficial in paragraph-level rationale extraction, while others need re-formulation to better handle the multi-label nature of the task we consider. We also introduce a new constraint, singularity, which further improves the quality of rationales, even compared with noisy rationale supervision. Experimental results indicate that the newly introduced task is very challenging and there is a large scope for further research." - }, - "art": { - "pwc_id": "art-dataset", - "dataset_name": "ART Dataset Dataset", - "dataset_abstract": "ART consists of over 20k commonsense narrative contexts and 200k explanations.", - "paper_name": "Abductive Commonsense Reasoning", - "paper_abstract": "Abductive reasoning is inference to the most plausible explanation. For example, if Jenny finds her house in a mess when she returns from work, and remembers that she left a window open, she can hypothesize that a thief broke into her house and caused the mess, as the most plausible explanation. While abduction has long been considered to be at the core of how people interpret and read between the lines in natural language (Hobbs et al., 1988), there has been relatively little research in support of abductive natural language inference and generation. We present the first study that investigates the viability of language-based abductive reasoning. We introduce a challenge dataset, ART, that consists of over 20k commonsense narrative contexts and 200k explanations. Based on this dataset, we conceptualize two new tasks -- (i) Abductive NLI: a multiple-choice question answering task for choosing the more likely explanation, and (ii) Abductive NLG: a conditional generation task for explaining given observations in natural language. On Abductive NLI, the best model achieves 68.9% accuracy, well below human performance of 91.4%. 
On Abductive NLG, the current best language generators struggle even more, as they lack reasoning capabilities that are trivial for humans. Our analysis leads to new insights into the types of reasoning that deep pre-trained language models fail to perform--despite their strong performance on the related but more narrowly defined task of entailment NLI--pointing to interesting avenues for future research." - }, - "liar": { - "pwc_id": "liar", - "dataset_name": "LIAR Dataset", - "dataset_abstract": "LIAR is a publicly available dataset for fake news detection. Its 12.8K manually labeled short statements were collected over a decade in various contexts from POLITIFACT.COM, which provides a detailed analysis report and links to source documents for each case. This dataset can be used for fact-checking research as well. Notably, this new dataset is an order of magnitude larger than the previously largest public fake news datasets of similar type. The LIAR dataset includes 12.8K human labeled short statements from POLITIFACT.COM\u2019s API, and each statement is evaluated by a POLITIFACT.COM editor for its truthfulness.", - "paper_name": "\"Liar, Liar Pants on Fire\": A New Benchmark Dataset for Fake News Detection", - "paper_abstract": "Automatic fake news detection is a challenging problem in deception\ndetection, and it has tremendous real-world political and social impacts.\nHowever, statistical approaches to combating fake news has been dramatically\nlimited by the lack of labeled benchmark datasets. In this paper, we present\nliar: a new, publicly available dataset for fake news detection. We collected a\ndecade-long, 12.8K manually labeled short statements in various contexts from\nPolitiFact.com, which provides detailed analysis report and links to source\ndocuments for each case. This dataset can be used for fact-checking research as\nwell. Notably, this new dataset is an order of magnitude larger than previously\nlargest public fake news datasets of similar type. Empirically, we investigate\nautomatic fake news detection based on surface-level linguistic patterns. We\nhave designed a novel, hybrid convolutional neural network to integrate\nmeta-data with text. We show that this hybrid approach can improve a text-only\ndeep learning model." - }, - "gem": { - "pwc_id": "gem", - "dataset_name": "GEM Dataset", - "dataset_abstract": "Generation, Evaluation, and Metrics (GEM) is a benchmark environment for Natural Language Generation with a focus on its Evaluation, both through human annotations and automated Metrics.\n\nGEM aims to:\n\n\nmeasure NLG progress across 13 datasets spanning many NLG tasks and languages.\nprovide an in-depth analysis of data and models presented via data statements and challenge sets.\ndevelop standards for evaluation of generated text using both automated and human metrics.\n\nIt is our goal to regularly update GEM and to encourage more inclusive practices in dataset development by extending existing data or developing datasets for additional languages.", - "paper_name": "The GEM Benchmark: Natural Language Generation, its Evaluation and Metrics", - "paper_abstract": "We introduce GEM, a living benchmark for natural language Generation (NLG), its Evaluation, and Metrics. Measuring progress in NLG relies on a constantly evolving ecosystem of automated metrics, datasets, and human evaluation standards. Due to this moving target, new models often still evaluate on divergent anglo-centric corpora with well-established, but flawed, metrics. 
This disconnect makes it challenging to identify the limitations of current models and opportunities for progress. Addressing this limitation, GEM provides an environment in which models can easily be applied to a wide set of tasks and in which evaluation strategies can be tested. Regular updates to the benchmark will help NLG research become more multilingual and evolve the challenge alongside models. This paper serves as the description of the data for which we are organizing a shared task at our ACL 2021 Workshop and to which we invite the entire NLG community to participate." - }, - "quac": { - "pwc_id": "quac", - "dataset_name": "QuAC Dataset", - "dataset_abstract": "Question Answering in Context is a large-scale dataset that consists of around 14K crowdsourced Question Answering dialogs with 98K question-answer pairs in total. Data instances consist of an interactive dialog between two crowd workers: (1) a student who poses a sequence of freeform questions to learn as much as possible about a hidden Wikipedia text, and (2) a teacher who answers the questions by providing short excerpts (spans) from the text.", - "paper_name": "QuAC: Question Answering in Context", - "paper_abstract": "We present QuAC, a dataset for Question Answering in Context that contains 14K information-seeking QA dialogs (100K questions in total). The dialogs involve two crowd workers: (1) a student who poses a sequence of freeform questions to learn as much as possible about a hidden Wikipedia text, and (2) a teacher who answers the questions by providing short excerpts from the text. QuAC introduces challenges not found in existing machine comprehension datasets: its questions are often more open-ended, unanswerable, or only meaningful within the dialog context, as we show in a detailed qualitative evaluation. We also report results for a number of reference models, including a recently state-of-the-art reading comprehension architecture extended to model dialog context. Our best model underperforms humans by 20 F1, suggesting that there is significant room for future work on this data. Dataset, baseline, and leaderboard available at \\url{http://quac.ai}." - }, - "asset": { - "pwc_id": "asset", - "dataset_name": "ASSET Dataset", - "dataset_abstract": "ASSET is a new dataset for assessing sentence simplification in English. ASSET is a crowdsourced multi-reference corpus where each simplification was produced by executing several rewriting transformations.", - "paper_name": "ASSET: A Dataset for Tuning and Evaluation of Sentence Simplification Models with Multiple Rewriting Transformations", - "paper_abstract": "In order to simplify a sentence, human editors perform multiple rewriting transformations: they split it into several shorter sentences, paraphrase words (i.e. replacing complex words or phrases by simpler synonyms), reorder components, and/or delete information deemed unnecessary. Despite these varied range of possible text alterations, current models for automatic sentence simplification are evaluated using datasets that are focused on a single transformation, such as lexical paraphrasing or splitting. This makes it impossible to understand the ability of simplification models in more realistic settings. To alleviate this limitation, this paper introduces ASSET, a new dataset for assessing sentence simplification in English. ASSET is a crowdsourced multi-reference corpus where each simplification was produced by executing several rewriting transformations. 
Through quantitative and qualitative experiments, we show that simplifications in ASSET are better at capturing characteristics of simplicity when compared to other standard evaluation datasets for the task. Furthermore, we motivate the need for developing better methods for automatic evaluation using ASSET, since we show that current popular metrics may not be suitable when multiple simplification transformations are performed." - }, - "circa": { - "pwc_id": "circa", - "dataset_name": "Circa Dataset", - "dataset_abstract": "The Circa (meaning \u2018approximately\u2019) dataset aims to help machine learning systems to solve the problem of interpreting indirect answers to polar questions.\n\nThe dataset contains pairs of yes/no questions and indirect answers, together with annotations for the interpretation of the answer. The data is collected in 10 different social conversational situations (eg. food preferences of a friend). Examples:\n\n```\nQ: Are you vegan?\nA: I love burgers too much. [No]\n\nQ: Do you like spicy food?\nA: I put hot sauce on everything. [Yes] \n\nQ: Would you like to go see live music?\nA: If it\u2019s not too crowded. [Yes, upon a condition]\n```\n\nCurrently, the Circa annotations focus on a few classes such as \u2018yes\u2019, \u2018no\u2019 and \u2018yes, upon condition\u2019. The data can be used to build machine learning models which can replicate these classes on new question-answer pairs, and allow evaluation of methods for doing so.", - "paper_name": "\"I'd rather just go to bed\": Understanding Indirect Answers", - "paper_abstract": "We revisit a pragmatic inference problem in dialog: understanding indirect responses to questions. Humans can interpret 'I'm starving.' in response to 'Hungry?', even without direct cue words such as 'yes' and 'no'. In dialog systems, allowing natural responses rather than closed vocabularies would be similarly beneficial. However, today's systems are only as sensitive to these pragmatic moves as their language model allows. We create and release the first large-scale English language corpus 'Circa' with 34,268 (polar question, indirect answer) pairs to enable progress on this task. The data was collected via elaborate crowdsourcing, and contains utterances with yes/no meaning, as well as uncertain, middle-ground, and conditional responses. We also present BERT-based neural models to predict such categories for a question-answer pair. We find that while transfer learning from entailment works reasonably, performance is not yet sufficient for robust dialog. Our models reach 82-88% accuracy for a 4-class distinction, and 74-85% for 6 classes." - }, - "aqua_rat": { - "pwc_id": "aqua-rat", - "dataset_name": "AQUA-RAT Dataset", - "dataset_abstract": "Algebra Question Answering with Rationales (AQUA-RAT) is a dataset that contains algebraic word problems with rationales. The dataset consists of about 100,000 algebraic word problems with natural language rationales. Each problem is a json object consisting of four parts:\n* question - A natural language definition of the problem to solve\n* options - 5 possible options (A, B, C, D and E), among which one is correct\n* rationale - A natural language description of the solution to the problem\n* correct - The correct option", - "paper_name": "Program Induction by Rationale Generation : Learning to Solve and Explain Algebraic Word Problems", - "paper_abstract": "Solving algebraic word problems requires executing a series of arithmetic\noperations---a program---to obtain a final answer. 
However, since programs can\nbe arbitrarily complicated, inducing them directly from question-answer pairs\nis a formidable challenge. To make this task more feasible, we solve these\nproblems by generating answer rationales, sequences of natural language and\nhuman-readable mathematical expressions that derive the final answer through a\nseries of small steps. Although rationales do not explicitly specify programs,\nthey provide a scaffolding for their structure via intermediate milestones. To\nevaluate our approach, we have created a new 100,000-sample dataset of\nquestions, answers and rationales. Experimental results show that indirect\nsupervision of program learning via answer rationales is a promising strategy\nfor inducing arithmetic programs." - }, - "blended_skill_talk": { - "pwc_id": "blended-skill-talk", - "dataset_name": "Blended Skill Talk Dataset", - "dataset_abstract": "To analyze how these capabilities would mesh together in a natural conversation, and compare the performance of different architectures and training schemes.", - "paper_name": "Can You Put it All Together: Evaluating Conversational Agents' Ability to Blend Skills", - "paper_abstract": "Being engaging, knowledgeable, and empathetic are all desirable general qualities in a conversational agent. Previous work has introduced tasks and datasets that aim to help agents to learn those qualities in isolation and gauge how well they can express them. But rather than being specialized in one single quality, a good open-domain conversational agent should be able to seamlessly blend them all into one cohesive conversational flow. In this work, we investigate several ways to combine models trained towards isolated capabilities, ranging from simple model aggregation schemes that require minimal additional training, to various forms of multi-task training that encompass several skills at all training stages. We further propose a new dataset, BlendedSkillTalk, to analyze how these capabilities would mesh together in a natural conversation, and compare the performance of different architectures and training schemes. Our experiments show that multi-tasking over several tasks that focus on particular capabilities results in better blended conversation performance compared to models trained on a single skill, and that both unified or two-stage approaches perform well if they are constructed to avoid unwanted bias in skill selection or are fine-tuned on our new task." - }, - "qa_srl": { - "pwc_id": "qa-srl", - "dataset_name": "QA-SRL Dataset", - "dataset_abstract": "QA-SRL was proposed as an open schema for semantic roles, in which the relation between an argument and a predicate is expressed as a natural-language question containing the predicate (\u201cWhere was someone educated?\u201d) whose answer is the argument (\u201cPrinceton\u201d). The authors collected about 19,000 question-answer pairs from 3,200 sentences.", - "paper_name": "", - "paper_abstract": "" - }, - "climate_fever": { - "pwc_id": "climate-fever", - "dataset_name": "CLIMATE-FEVER Dataset", - "dataset_abstract": "A new publicly available dataset for verification of climate change-related claims.", - "paper_name": "CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims", - "paper_abstract": "We introduce CLIMATE-FEVER, a new publicly available dataset for verification of climate change-related claims. 
By providing a dataset for the research community, we aim to facilitate and encourage work on improving algorithms for retrieving evidential support for climate-specific claims, addressing the underlying language understanding challenges, and ultimately help alleviate the impact of misinformation on climate change. We adapt the methodology of FEVER [1], the largest dataset of artificially designed claims, to real-life claims collected from the Internet. While during this process, we could rely on the expertise of renowned climate scientists, it turned out to be no easy task. We discuss the surprising, subtle complexity of modeling real-world climate-related claims within the \\textsc{fever} framework, which we believe provides a valuable challenge for general natural language understanding. We hope that our work will mark the beginning of a new exciting long-term joint effort by the climate science and AI community." - }, - "humicroedit": { - "pwc_id": "humicroedit", - "dataset_name": "Humicroedit Dataset", - "dataset_abstract": "Humicroedit is a humorous headline dataset. The data consists of regular English news headlines paired with versions of the same headlines that contain simple replacement edits designed to make them funny. The authors carefully curated crowdsourced editors to create funny headlines and judges to score a total of 15,095 edited headlines, with five judges per headline.", - "paper_name": "\"President Vows to Cut Hair\": Dataset and Analysis of Creative Text Editing for Humorous Headlines", - "paper_abstract": "We introduce, release, and analyze a new dataset, called Humicroedit, for research in computational humor. Our publicly available data consists of regular English news headlines paired with versions of the same headlines that contain simple replacement edits designed to make them funny. We carefully curated crowdsourced editors to create funny headlines and judges to score a total of 15,095 edited headlines, with five judges per headline. The simple edits, usually just a single word replacement, mean we can apply straightforward analysis techniques to determine what makes our edited headlines humorous. We show how the data support classic theories of humor, such as incongruity, superiority, and setup/punchline. Finally, we develop baseline classifiers that can predict whether or not an edited headline is funny, which is a first step toward automatically generating humorous headlines as an approach to creating topical humor." - }, - "discofuse": { - "pwc_id": "discofuse", - "dataset_name": "DiscoFuse Dataset", - "dataset_abstract": "DiscoFuse was created by applying a rule-based splitting method on two corpora -\nsports articles crawled from the Web, and Wikipedia. See the paper for a detailed\ndescription of the dataset generation process and evaluation.\n\nDiscoFuse has two parts with 44,177,443 and 16,642,323 examples sourced from Sports articles and Wikipedia, respectively.\n\nFor each part, a random split is provided to train (98% of the examples), development (1%) and test (1%) sets. In addition, as the original data distribution is highly skewed (see details in the paper), a balanced version for each part is also provided.", - "paper_name": "DiscoFuse: A Large-Scale Dataset for Discourse-Based Sentence Fusion", - "paper_abstract": "Sentence fusion is the task of joining several independent sentences into a\nsingle coherent text. Current datasets for sentence fusion are small and\ninsufficient for training modern neural models. 
In this paper, we propose a\nmethod for automatically-generating fusion examples from raw text and present\nDiscoFuse, a large scale dataset for discourse-based sentence fusion. We author\na set of rules for identifying a diverse set of discourse phenomena in raw\ntext, and decomposing the text into two independent sentences. We apply our\napproach on two document collections: Wikipedia and Sports articles, yielding\n60 million fusion examples annotated with discourse information required to\nreconstruct the fused text. We develop a sequence-to-sequence model on\nDiscoFuse and thoroughly analyze its strengths and weaknesses with respect to\nthe various discourse phenomena, using both automatic as well as human\nevaluation. Finally, we conduct transfer learning experiments with WebSplit, a\nrecent dataset for text simplification. We show that pretraining on DiscoFuse\nsubstantially improves performance on WebSplit when viewed as a sentence fusion\ntask." - }, - "ambig_qa": { - "pwc_id": "ambigqa", - "dataset_name": "AmbigQA Dataset", - "dataset_abstract": "Is a new open-domain question answering task which involves predicting a set of question-answer pairs, where every plausible answer is paired with a disambiguated rewrite of the original question. A dataset covering 14,042 questions from NQ-open, an existing open-domain QA benchmark.", - "paper_name": "", - "paper_abstract": "" - }, - "ethos": { - "pwc_id": "ethos", - "dataset_name": "ETHOS Dataset", - "dataset_abstract": "ETHOS is a hate speech detection dataset. It is built from YouTube and Reddit comments validated through a crowdsourcing platform. It has two subsets, one for binary classification and the other for multi-label classification. The former contains 998 comments, while the latter contains fine-grained hate-speech annotations for 433 comments.", - "paper_name": "ETHOS: an Online Hate Speech Detection Dataset", - "paper_abstract": "Online hate speech is a recent problem in our society that is rising at a steady pace by leveraging the vulnerabilities of the corresponding regimes that characterise most social media platforms. This phenomenon is primarily fostered by offensive comments, either during user interaction or in the form of a posted multimedia context. Nowadays, giant corporations own platforms where millions of users log in every day, and protection from exposure to similar phenomena appears to be necessary in order to comply with the corresponding legislation and maintain a high level of service quality. A robust and reliable system for detecting and preventing the uploading of relevant content will have a significant impact on our digitally interconnected society. Several aspects of our daily lives are undeniably linked to our social profiles, making us vulnerable to abusive behaviours. As a result, the lack of accurate hate speech detection mechanisms would severely degrade the overall user experience, although its erroneous operation would pose many ethical concerns. In this paper, we present 'ETHOS', a textual dataset with two variants: binary and multi-label, based on YouTube and Reddit comments validated using the Figure-Eight crowdsourcing platform. Furthermore, we present the annotation protocol used to create this dataset: an active sampling procedure for balancing our data in relation to the various aspects defined. Our key assumption is that, even gaining a small amount of labelled data from such a time-consuming process, we can guarantee hate speech occurrences in the examined material." 
- }, - "multi_x_science_sum": { - "pwc_id": "multi-xscience", - "dataset_name": "Multi-XScience Dataset", - "dataset_abstract": "Multi-XScience is a large-scale dataset for multi-document summarization of scientific articles. It has 30,369, 5,066 and 5,093 samples for the train, validation and test split respectively. The average document length is 778.08 words and the average summary length is 116.44 words.", - "paper_name": "Multi-XScience: A Large-scale Dataset for Extreme Multi-document Summarization of Scientific Articles", - "paper_abstract": "Multi-document summarization is a challenging task for which there exists little large-scale datasets. We propose Multi-XScience, a large-scale multi-document summarization dataset created from scientific articles. Multi-XScience introduces a challenging multi-document summarization task: writing the related-work section of a paper based on its abstract and the articles it references. Our work is inspired by extreme summarization, a dataset construction protocol that favours abstractive modeling approaches. Descriptive statistics and empirical results---using several state-of-the-art models trained on the Multi-XScience dataset---reveal that Multi-XScience is well suited for abstractive models." - }, - "freebase_qa": { - "pwc_id": "freebaseqa", - "dataset_name": "FreebaseQA Dataset", - "dataset_abstract": "FreebaseQA is a data set for open-domain QA over the Freebase knowledge graph. The question-answer pairs in this data set are collected from various sources, including the TriviaQA data set and other trivia websites (QuizBalls, QuizZone, KnowQuiz), and are matched against Freebase to generate relevant subject-predicate-object triples that were further verified by human annotators. As all questions in FreebaseQA are composed independently for human contestants in various trivia-like competitions, this data set shows richer linguistic variation and complexity than existing QA data sets, making it a good test-bed for emerging KB-QA systems.", - "paper_name": "FreebaseQA: A New Factoid QA Data Set Matching Trivia-Style Question-Answer Pairs with Freebase", - "paper_abstract": "In this paper, we present a new data set, named FreebaseQA, for open-domain factoid question answering (QA) tasks over structured knowledge bases, like Freebase. The data set is generated by matching trivia-type question-answer pairs with subject-predicate-object triples in Freebase. For each collected question-answer pair, we first tag all entities in each question and search for relevant predicates that bridge a tagged entity with the answer in Freebase. Finally, human annotation is used to remove any false positive in these matched triples. Using this method, we are able to efficiently generate over 54K matches from about 28K unique questions with minimal cost. Our analysis shows that this data set is suitable for model training in factoid QA tasks beyond simpler questions since FreebaseQA provides more linguistically sophisticated questions than other existing data sets." - }, - "onestop_english": { - "pwc_id": "onestopenglish", - "dataset_name": "OneStopEnglish Dataset", - "dataset_abstract": "Useful for through two applications - automatic readability assessment and automatic text simplification. 
The corpus consists of 189 texts, each in three versions (567 in total).", - "paper_name": "OneStopEnglish corpus: A new corpus for automatic readability assessment and text simplification", - "paper_abstract": "This paper describes the collection and compilation of the OneStopEnglish corpus of texts written at three reading levels, and demonstrates its usefulness for through two applications - automatic readability assessment and automatic text simplification. The corpus consists of 189 texts, each in three versions (567 in total). The corpus is now freely available under a CC by-SA 4.0 license and we hope that it would foster further research on the topics of readability assessment and text simplification." - }, - "meta_woz": { - "pwc_id": "metalwoz", - "dataset_name": "MetaLWOz Dataset", - "dataset_abstract": "Collected by leveraging background knowledge from a larger, more highly represented dialogue source.", - "paper_name": "Few-Shot Dialogue Generation Without Annotated Data: A Transfer Learning Approach", - "paper_abstract": "Learning with minimal data is one of the key challenges in the development of practical, production-ready goal-oriented dialogue systems. In a real-world enterprise setting where dialogue systems are developed rapidly and are expected to work robustly for an ever-growing variety of domains, products, and scenarios, efficient learning from a limited number of examples becomes indispensable. In this paper, we introduce a technique to achieve state-of-the-art dialogue generation performance in a few-shot setup, without using any annotated data. We do this by leveraging background knowledge from a larger, more highly represented dialogue source --- namely, the MetaLWOz dataset. We evaluate our model on the Stanford Multi-Domain Dialogue Dataset, consisting of human-human goal-oriented dialogues in in-car navigation, appointment scheduling, and weather information domains. We show that our few-shot approach achieves state-of-the art results on that dataset by consistently outperforming the previous best model in terms of BLEU and Entity F1 scores, while being more data-efficient by not requiring any data annotation." - }, - "jfleg": { - "pwc_id": "jfleg", - "dataset_name": "JFLEG Dataset", - "dataset_abstract": "JFLEG is for developing and evaluating grammatical error correction (GEC). Unlike other corpora, it represents a broad range of language proficiency levels and uses holistic fluency edits to not only correct grammatical errors but also make the original text more native sounding.", - "paper_name": "JFLEG: A Fluency Corpus and Benchmark for Grammatical Error Correction", - "paper_abstract": "We present a new parallel corpus, JHU FLuency-Extended GUG corpus (JFLEG) for\ndeveloping and evaluating grammatical error correction (GEC). Unlike other\ncorpora, it represents a broad range of language proficiency levels and uses\nholistic fluency edits to not only correct grammatical errors but also make the\noriginal text more native sounding. We describe the types of corrections made\nand benchmark four leading GEC systems on this corpus, identifying specific\nareas in which they do well and how they can improve. JFLEG fulfills the need\nfor a new gold standard to properly assess the current state of GEC." - }, - "numer_sense": { - "pwc_id": "numersense", - "dataset_name": "NumerSense Dataset", - "dataset_abstract": "Contains 13.6k masked-word-prediction probes, 10.5k for fine-tuning and 3.1k for testing.", - "paper_name": "Birds have four legs?! 
NumerSense: Probing Numerical Commonsense Knowledge of Pre-trained Language Models", - "paper_abstract": "Recent works show that pre-trained language models (PTLMs), such as BERT, possess certain commonsense and factual knowledge. They suggest that it is promising to use PTLMs as \"neural knowledge bases\" via predicting masked words. Surprisingly, we find that this may not work for numerical commonsense knowledge (e.g., a bird usually has two legs). In this paper, we investigate whether and to what extent we can induce numerical commonsense knowledge from PTLMs as well as the robustness of this process. To study this, we introduce a novel probing task with a diagnostic dataset, NumerSense, containing 13.6k masked-word-prediction probes (10.5k for fine-tuning and 3.1k for testing). Our analysis reveals that: (1) BERT and its stronger variant RoBERTa perform poorly on the diagnostic dataset prior to any fine-tuning; (2) fine-tuning with distant supervision brings some improvement; (3) the best supervised model still performs poorly as compared to human performance (54.06% vs 96.3% in accuracy)." - }, - "neural_code_search": { - "pwc_id": "neural-code-search-evaluation-dataset", - "dataset_name": "Neural Code Search Evaluation Dataset Dataset", - "dataset_abstract": "Neural-Code-Search-Evaluation-Dataset presents an evaluation dataset consisting of natural language query and code snippet pairs, with the hope that future work in this area can use this dataset as a common benchmark.", - "paper_name": "", - "paper_abstract": "" - }, - "mrqa": { - "pwc_id": "mrqa-2019", - "dataset_name": "MRQA Dataset", - "dataset_abstract": "The MRQA (Machine Reading for Question Answering) dataset is a dataset for evaluating the generalization capabilities of reading comprehension systems.", - "paper_name": "MRQA 2019 Shared Task: Evaluating Generalization in Reading Comprehension", - "paper_abstract": "We present the results of the Machine Reading for Question Answering (MRQA) 2019 shared task on evaluating the generalization capabilities of reading comprehension systems. In this task, we adapted and unified 18 distinct question answering datasets into the same format. Among them, six datasets were made available for training, six datasets were made available for development, and the final six were hidden for final evaluation. Ten teams submitted systems, which explored various ideas including data sampling, multi-task learning, adversarial training and ensembling. The best system achieved an average F1 score of 72.5 on the 12 held-out datasets, 10.7 absolute points higher than our initial baseline based on BERT." - }, - "drop": { - "pwc_id": "drop", - "dataset_name": "DROP Dataset", - "dataset_abstract": "Discrete Reasoning Over Paragraphs DROP is a crowdsourced, adversarially-created, 96k-question benchmark, in which a system must resolve references in a question, perhaps to multiple input positions, and perform discrete operations over them (such as addition, counting, or sorting). These operations require a much more comprehensive understanding of the content of paragraphs than what was necessary for prior datasets. The questions consist of passages extracted from Wikipedia articles. 
The dataset is split into a training set of about 77,000 questions, a development set of around 9,500 questions and a hidden test set similar in size to the development set.", - "paper_name": "DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs", - "paper_abstract": "Reading comprehension has recently seen rapid progress, with systems matching\nhumans on the most popular datasets for the task. However, a large body of work\nhas highlighted the brittleness of these systems, showing that there is much\nwork left to be done. We introduce a new English reading comprehension\nbenchmark, DROP, which requires Discrete Reasoning Over the content of\nParagraphs. In this crowdsourced, adversarially-created, 96k-question\nbenchmark, a system must resolve references in a question, perhaps to multiple\ninput positions, and perform discrete operations over them (such as addition,\ncounting, or sorting). These operations require a much more comprehensive\nunderstanding of the content of paragraphs than what was necessary for prior\ndatasets. We apply state-of-the-art methods from both the reading comprehension\nand semantic parsing literature on this dataset and show that the best systems\nonly achieve 32.7% F1 on our generalized accuracy metric, while expert human\nperformance is 96.0%. We additionally present a new model that combines reading\ncomprehension methods with simple numerical reasoning to achieve 47.0% F1." - }, - "openwebtext": { - "pwc_id": "openwebtext", - "dataset_name": "OpenWebText Dataset", - "dataset_abstract": "OpenWebText is an open-source recreation of the WebText corpus. The text is web content extracted from URLs shared on Reddit with at least three upvotes. (38GB).", - "paper_name": "", - "paper_abstract": "" - }, - "snips_built_in_intents": { - "pwc_id": "snips", - "dataset_name": "SNIPS Dataset", - "dataset_abstract": "The SNIPS Natural Language Understanding benchmark is a dataset of over 16,000 crowdsourced queries distributed among 7 user intents of various complexity:\n\n\nSearchCreativeWork (e.g. Find me the I, Robot television show),\nGetWeather (e.g. Is it windy in Boston, MA right now?),\nBookRestaurant (e.g. I want to book a highly rated restaurant in Paris tomorrow night),\nPlayMusic (e.g. Play the last track from Beyonc\u00e9 off Spotify),\nAddToPlaylist (e.g. Add Diamonds to my roadtrip playlist),\nRateBook (e.g. Give 6 stars to Of Mice and Men),\nSearchScreeningEvent (e.g. Check the showtimes for Wonder Woman in Paris).\nThe training set contains of 13,084 utterances, the validation set and the test set contain 700 utterances each, with 100 queries per intent.", - "paper_name": "Snips Voice Platform: an embedded Spoken Language Understanding system for private-by-design voice interfaces", - "paper_abstract": "This paper presents the machine learning architecture of the Snips Voice\nPlatform, a software solution to perform Spoken Language Understanding on\nmicroprocessors typical of IoT devices. The embedded inference is fast and\naccurate while enforcing privacy by design, as no personal user data is ever\ncollected. Focusing on Automatic Speech Recognition and Natural Language\nUnderstanding, we detail our approach to training high-performance Machine\nLearning models that are small enough to run in real-time on small devices.\nAdditionally, we describe a data generation procedure that provides sufficient,\nhigh-quality training data without compromising user privacy." 
- }, - "conv_ai_2": { - "pwc_id": "convai2", - "dataset_name": "ConvAI2 Dataset", - "dataset_abstract": "The ConvAI2 NeurIPS competition aimed at finding approaches to creating high-quality dialogue agents capable of meaningful open domain conversation. The ConvAI2 dataset for training models is based on the PERSONA-CHAT dataset. The speaker pairs each have assigned profiles coming from a set of 1155 possible personas (at training time), each consisting of at least 5 profile sentences, setting aside 100 never seen before personas for validation. As the original PERSONA-CHAT test set was released, a new hidden test set consisted of 100 new personas and over 1,015 dialogs was created by crowdsourced workers.\n\nTo avoid modeling that takes advantage of trivial word overlap, additional rewritten sets of the same train and test personas were crowdsourced, with related sentences that are rephrases, generalizations or specializations, rendering the task much more challenging. For example \u201cI just got my nails done\u201d is revised as \u201cI love to pamper myself on a regular basis\u201d and \u201cI am on a diet now\u201d is revised as \u201cI need to lose weight.\u201d\n\nThe training, validation and hidden test sets consists of 17,878, 1,000 and 1,015 dialogues, respectively.", - "paper_name": "The Second Conversational Intelligence Challenge (ConvAI2)", - "paper_abstract": "We describe the setting and results of the ConvAI2 NeurIPS competition that\naims to further the state-of-the-art in open-domain chatbots. Some key\ntakeaways from the competition are: (i) pretrained Transformer variants are\ncurrently the best performing models on this task, (ii) but to improve\nperformance on multi-turn conversations with humans, future systems must go\nbeyond single word metrics like perplexity to measure the performance across\nsequences of utterances (conversations) -- in terms of repetition, consistency\nand balance of dialogue acts (e.g. how many questions asked vs. answered)." - }, - "mocha": { - "pwc_id": "mocha", - "dataset_name": "MOCHA Dataset", - "dataset_abstract": "Contains 40K human judgement scores on model outputs from 6 diverse question answering datasets and an additional set of minimal pairs for evaluation.", - "paper_name": "MOCHA: A Dataset for Training and Evaluating Generative Reading Comprehension Metrics", - "paper_abstract": "Posing reading comprehension as a generation problem provides a great deal of flexibility, allowing for open-ended questions with few restrictions on possible answers. However, progress is impeded by existing generation metrics, which rely on token overlap and are agnostic to the nuances of reading comprehension. To address this, we introduce a benchmark for training and evaluating generative reading comprehension metrics: MOdeling Correctness with Human Annotations. MOCHA contains 40K human judgement scores on model outputs from 6 diverse question answering datasets and an additional set of minimal pairs for evaluation. Using MOCHA, we train a Learned Evaluation metric for Reading Comprehension, LERC, to mimic human judgement scores. LERC outperforms baseline metrics by 10 to 36 absolute Pearson points on held-out annotations. When we evaluate robustness on minimal pairs, LERC achieves 80% accuracy, outperforming baselines by 14 to 26 absolute percentage points while leaving significant room for improvement. MOCHA presents a challenging problem for developing accurate and robust generative reading comprehension metrics." 
- }, - "covid_qa_castorini": { - "pwc_id": "covidqa", - "dataset_name": "CovidQA Dataset", - "dataset_abstract": "The beginnings of a question answering dataset specifically designed for COVID-19, built by hand from knowledge gathered from Kaggle's COVID-19 Open Research Dataset Challenge.", - "paper_name": "Rapidly Bootstrapping a Question Answering Dataset for COVID-19", - "paper_abstract": "We present CovidQA, the beginnings of a question answering dataset specifically designed for COVID-19, built by hand from knowledge gathered from Kaggle's COVID-19 Open Research Dataset Challenge. To our knowledge, this is the first publicly available resource of its type, and intended as a stopgap measure for guiding research until more substantial evaluation resources become available. While this dataset, comprising 124 question-article pairs as of the present version 0.1 release, does not have sufficient examples for supervised machine learning, we believe that it can be helpful for evaluating the zero-shot or transfer capabilities of existing models on topics specifically related to COVID-19. This paper describes our methodology for constructing the dataset and presents the effectiveness of a number of baselines, including term-based techniques and various transformer-based models. The dataset is available at http://covidqa.ai/" - }, - "wiki40b": { - "pwc_id": "wiki-40b", - "dataset_name": "Wiki-40B Dataset", - "dataset_abstract": "A new multilingual language model benchmark that is composed of 40+ languages spanning several scripts and linguistic families containing round 40 billion characters and aimed to accelerate the research of multilingual modeling.", - "paper_name": "Wiki-40B: Multilingual Language Model Dataset", - "paper_abstract": "We propose a new multilingual language model benchmark that is composed of 40+ languages spanning several scripts and linguistic families. With around 40 billion characters, we hope this new resource will accelerate the research of multilingual modeling. We train monolingual causal language models using a state-of-the-art model (Transformer-XL) establishing baselines for many languages. We also introduce the task of multilingual causal language modeling where we train our model on the combined text of 40+ languages from Wikipedia with different vocabulary sizes and evaluate on the languages individually. We released the cleaned-up text of 40+ Wikipedia language editions, the corresponding trained monolingual language models, and several multilingual language models with different fixed vocabulary sizes." - }, - "docred": { - "pwc_id": "docred", - "dataset_name": "DocRED Dataset", - "dataset_abstract": "DocRED (Document-Level Relation Extraction Dataset) is a relation extraction dataset constructed from Wikipedia and Wikidata. Each document in the dataset is human-annotated with named entity mentions, coreference information, intra- and inter-sentence relations, and supporting evidence. DocRED requires reading multiple sentences in a document to extract entities and infer their relations by synthesizing all information of the document. Along with the human-annotated data, the dataset provides large-scale distantly supervised data.\n\nDocRED contains 132,375 entities and 56,354 relational facts annotated on 5,053 Wikipedia documents. 
In addition to the human-annotated data, the dataset provides large-scale distantly supervised data over 101,873 documents.", - "paper_name": "DocRED: A Large-Scale Document-Level Relation Extraction Dataset", - "paper_abstract": "Multiple entities in a document generally exhibit complex inter-sentence relations, and cannot be well handled by existing relation extraction (RE) methods that typically focus on extracting intra-sentence relations for single entity pairs. In order to accelerate the research on document-level RE, we introduce DocRED, a new dataset constructed from Wikipedia and Wikidata with three features: (1) DocRED annotates both named entities and relations, and is the largest human-annotated dataset for document-level RE from plain text; (2) DocRED requires reading multiple sentences in a document to extract entities and infer their relations by synthesizing all information of the document; (3) along with the human-annotated data, we also offer large-scale distantly supervised data, which enables DocRED to be adopted for both supervised and weakly supervised scenarios. In order to verify the challenges of document-level RE, we implement recent state-of-the-art methods for RE and conduct a thorough evaluation of these methods on DocRED. Empirical results show that DocRED is challenging for existing RE methods, which indicates that document-level RE remains an open problem and requires further efforts. Based on the detailed analysis on the experiments, we discuss multiple promising directions for future research." - }, - "wiki_split": { - "pwc_id": "wikisplit", - "dataset_name": "WikiSplit Dataset", - "dataset_abstract": "Contains one million naturally occurring sentence rewrites, providing sixty times more distinct split examples and a ninety times larger vocabulary than the WebSplit corpus introduced by Narayan et al. (2017) as a benchmark for this task.", - "paper_name": "Learning To Split and Rephrase From Wikipedia Edit History", - "paper_abstract": "Split and rephrase is the task of breaking down a sentence into shorter ones\nthat together convey the same meaning. We extract a rich new dataset for this\ntask by mining Wikipedia's edit history: WikiSplit contains one million\nnaturally occurring sentence rewrites, providing sixty times more distinct\nsplit examples and a ninety times larger vocabulary than the WebSplit corpus\nintroduced by Narayan et al. (2017) as a benchmark for this task. Incorporating\nWikiSplit as training data produces a model with qualitatively better\npredictions that score 32 BLEU points above the prior best result on the\nWebSplit benchmark." - }, - "craigslist_bargains": { - "pwc_id": "craigslistbargains", - "dataset_name": "CraigslistBargains Dataset", - "dataset_abstract": "A richer dataset based on real items on Craigslist.", - "paper_name": "Decoupling Strategy and Generation in Negotiation Dialogues", - "paper_abstract": "We consider negotiation settings in which two agents use natural language to\nbargain on goods. Agents need to decide on both high-level strategy (e.g.,\nproposing \\$50) and the execution of that strategy (e.g., generating \"The bike\nis brand new. Selling for just \\$50.\"). Recent work on negotiation trains\nneural models, but their end-to-end nature makes it hard to control their\nstrategy, and reinforcement learning tends to lead to degenerate solutions. In\nthis paper, we propose a modular approach based on coarse dialogue acts\n(e.g., propose(price=50)) that decouples strategy and generation. 
We show that\nwe can flexibly set the strategy using supervised learning, reinforcement\nlearning, or domain-specific knowledge without degeneracy, while our\nretrieval-based generation can maintain context-awareness and produce diverse\nutterances. We test our approach on the recently proposed DEALORNODEAL game,\nand we also collect a richer dataset based on real items on Craigslist. Human\nevaluation shows that our systems achieve higher task success rate and more\nhuman-like negotiation behavior than previous approaches." - }, - "asnq": { - "pwc_id": "asnq", - "dataset_name": "ASNQ Dataset", - "dataset_abstract": "A large scale dataset to enable the transfer step, exploiting the Natural Questions dataset.", - "paper_name": "TANDA: Transfer and Adapt Pre-Trained Transformer Models for Answer Sentence Selection", - "paper_abstract": "We propose TANDA, an effective technique for fine-tuning pre-trained Transformer models for natural language tasks. Specifically, we first transfer a pre-trained model into a model for a general task by fine-tuning it with a large and high-quality dataset. We then perform a second fine-tuning step to adapt the transferred model to the target domain. We demonstrate the benefits of our approach for answer sentence selection, which is a well-known inference task in Question Answering. We built a large scale dataset to enable the transfer step, exploiting the Natural Questions dataset. Our approach establishes the state of the art on two well-known benchmarks, WikiQA and TREC-QA, achieving MAP scores of 92% and 94.3%, respectively, which largely outperform the previous highest scores of 83.4% and 87.5%, obtained in very recent work. We empirically show that TANDA generates more stable and robust models reducing the effort required for selecting optimal hyper-parameters. Additionally, we show that the transfer step of TANDA makes the adaptation step more robust to noise. This enables a more effective use of noisy datasets for fine-tuning. Finally, we also confirm the positive impact of TANDA in an industrial setting, using domain specific datasets subject to different types of noise." - }, - "limit": { - "pwc_id": "limit", - "dataset_name": "LiMiT Dataset", - "dataset_abstract": "The LiMiT dataset contains ~24K sentences that describe literal motion (~14K sentences) and sentences that do not describe motion or describe another type of motion (e.g. fictive motion). Sentences were extracted from electronic books categorized as fiction or novels, and a portion from the NetActivity Captions Dataset.", - "paper_name": "", - "paper_abstract": "" - }, - "kelm": { - "pwc_id": "kelm", - "dataset_name": "KELM Dataset", - "dataset_abstract": "KELM is a large-scale synthetic corpus of Wikidata KG as natural text.", - "paper_name": "Knowledge Graph Based Synthetic Corpus Generation for Knowledge-Enhanced Language Model Pre-training", - "paper_abstract": "Prior work on Data-To-Text Generation, the task of converting knowledge graph (KG) triples into natural text, focused on domain-specific benchmark datasets. In this paper, however, we verbalize the entire English Wikidata KG, and discuss the unique challenges associated with a broad, open-domain, large-scale verbalization. We further show that verbalizing a comprehensive, encyclopedic KG like Wikidata can be used to integrate structured KGs and natural language corpora. 
In contrast to the many architectures that have been developed to integrate these two sources, our approach converts the KG into natural text, allowing it to be seamlessly integrated into existing language models. It carries the further advantages of improved factual accuracy and reduced toxicity in the resulting language model. We evaluate this approach by augmenting the retrieval corpus in a retrieval language model and showing significant improvements on the knowledge intensive tasks of open domain QA and the LAMA knowledge probe." - }, - "zest": { - "pwc_id": "zest", - "dataset_name": "ZEST Dataset", - "dataset_abstract": "A new English language dataset structured for task-oriented evaluation on unseen tasks.", - "paper_name": "Learning from Task Descriptions", - "paper_abstract": "Typically, machine learning systems solve new tasks by training on thousands of examples. In contrast, humans can solve new tasks by reading some instructions, with perhaps an example or two. To take a step toward closing this gap, we introduce a framework for developing NLP systems that solve new tasks after reading their descriptions, synthesizing prior work in this area. We instantiate this framework with a new English language dataset, ZEST, structured for task-oriented evaluation on unseen tasks. Formulating task descriptions as questions, we ensure each is general enough to apply to many possible inputs, thus comprehensively evaluating a model's ability to solve each task. Moreover, the dataset's structure tests specific types of systematic generalization. We find that the state-of-the-art T5 model achieves a score of 12% on ZEST, leaving a significant challenge for NLP researchers." - }, - "gutenberg_time": { - "pwc_id": "gutenberg-time-dataset", - "dataset_name": "Gutenberg Time Dataset Dataset", - "dataset_abstract": "A data set of hourly time phrases from 52,183 fictional books.", - "paper_name": "What time is it? Temporal Analysis of Novels", - "paper_abstract": "Recognizing the flow of time in a story is a crucial aspect of understanding it. Prior work related to time has primarily focused on identifying temporal expressions or relative sequencing of events, but here we propose computationally annotating each line of a book with wall clock times, even in the absence of explicit time-descriptive phrases. To do so, we construct a data set of hourly time phrases from 52,183 fictional books. We then construct a time-of-day classification model that achieves an average error of 2.27 hours. Furthermore, we show that by analyzing a book in whole using dynamic programming of breakpoints, we can roughly partition a book into segments that each correspond to a particular time-of-day. This approach improves upon baselines by over two hours. Finally, we apply our model to a corpus of literature categorized by different periods in history, to show interesting trends of hourly activity throughout the past. Among several observations we find that the fraction of events taking place past 10 P.M jumps past 1880 - coincident with the advent of the electric light bulb and city lights." 
- }, - "sent_comp": { - "pwc_id": "sentence-compression", - "dataset_name": "Sentence Compression Dataset", - "dataset_abstract": "Sentence Compression is a dataset where the syntactic trees of the compressions are subtrees of their uncompressed counterparts, and hence where supervised systems which require a structural alignment between the input and output can be successfully trained.", - "paper_name": "", - "paper_abstract": "" - }, - "qed": { - "pwc_id": "qed", - "dataset_name": "QED Dataset", - "dataset_abstract": "QED is a linguistically principled framework for explanations in question answering. Given a question and a passage, QED represents an explanation of the answer as a combination of discrete, human-interpretable steps:\nsentence selection := identification of a sentence implying an answer to the question\nreferential equality := identification of noun phrases in the question and the answer sentence that refer to the same thing\npredicate entailment := confirmation that the predicate in the sentence entails the predicate in the question once referential equalities are abstracted away.\nThe QED dataset is an expert-annotated dataset of QED explanations build upon a subset of the Google Natural Questions dataset.", - "paper_name": "QED: A Framework and Dataset for Explanations in Question Answering", - "paper_abstract": "A question answering system that in addition to providing an answer provides an explanation of the reasoning that leads to that answer has potential advantages in terms of debuggability, extensibility and trust. To this end, we propose QED, a linguistically informed, extensible framework for explanations in question answering. A QED explanation specifies the relationship between a question and answer according to formal semantic notions such as referential equality, sentencehood, and entailment. We describe and publicly release an expert-annotated dataset of QED explanations built upon a subset of the Google Natural Questions dataset, and report baseline models on two tasks -- post-hoc explanation generation given an answer, and joint question answering and explanation generation. In the joint setting, a promising result suggests that training on a relatively small amount of QED data can improve question answering. In addition to describing the formal, language-theoretic motivations for the QED approach, we describe a large user study showing that the presence of QED explanations significantly improves the ability of untrained raters to spot errors made by a strong neural QA baseline." - }, - "code_search_net": { - "pwc_id": "codesearchnet", - "dataset_name": "CodeSearchNet Dataset", - "dataset_abstract": "The CodeSearchNet Corpus is a large dataset of functions with associated documentation written in Go, Java, JavaScript, PHP, Python, and Ruby from open source projects on GitHub. The CodeSearchNet Corpus includes:\n* Six million methods overall\n* Two million of which have associated documentation (docstrings, JavaDoc, and more)\n* Metadata that indicates the original location (repository or line number, for example) where the data was found", - "paper_name": "CodeSearchNet Challenge: Evaluating the State of Semantic Code Search", - "paper_abstract": "Semantic code search is the task of retrieving relevant code given a natural language query. 
While related to other information retrieval tasks, it requires bridging the gap between the language used in code (often abbreviated and highly technical) and natural language more suitable to describe vague concepts and ideas. To enable evaluation of progress on code search, we are releasing the CodeSearchNet Corpus and are presenting the CodeSearchNet Challenge, which consists of 99 natural language queries with about 4k expert relevance annotations of likely results from CodeSearchNet Corpus. The corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation. In this article, we describe the methodology used to obtain the corpus and expert labels, as well as a number of simple baseline solutions for the task. We hope that CodeSearchNet Challenge encourages researchers and practitioners to study this interesting task further and will host a competition and leaderboard to track the progress on the challenge. We are also keen on extending CodeSearchNet Challenge to more queries and programming languages in the future." - }, - "wikihow": { - "pwc_id": "wikihow", - "dataset_name": "WikiHow Dataset", - "dataset_abstract": "WikiHow is a dataset of more than 230,000 article and summary pairs extracted and constructed from an online knowledge base written by different human authors. The articles span a wide range of topics and represent high diversity styles.", - "paper_name": "WikiHow: A Large Scale Text Summarization Dataset", - "paper_abstract": "Sequence-to-sequence models have recently gained the state of the art\nperformance in summarization. However, not too many large-scale high-quality\ndatasets are available and almost all the available ones are mainly news\narticles with specific writing style. Moreover, abstractive human-style systems\ninvolving description of the content at a deeper level require data with higher\nlevels of abstraction. In this paper, we present WikiHow, a dataset of more\nthan 230,000 article and summary pairs extracted and constructed from an online\nknowledge base written by different human authors. The articles span a wide\nrange of topics and therefore represent high diversity styles. We evaluate the\nperformance of the existing methods on WikiHow to present its challenges and\nset some baselines to further improve it." - }, - "tapaco": { - "pwc_id": "tapaco", - "dataset_name": "TaPaCo Dataset", - "dataset_abstract": "TaPaCo is a freely available paraphrase corpus for 73 languages extracted from the Tatoeba database.", - "paper_name": "TaPaCo: A Corpus of Sentential Paraphrases for 73 Languages", - "paper_abstract": "This paper presents TaPaCo, a freely available paraphrase corpus for 73 languages extracted from the Tatoeba database. Tatoeba is a crowdsourcing project mainly geared towards language learners. Its aim is to provide example sentences and translations for particular linguistic constructions and words. The paraphrase corpus is created by populating a graph with Tatoeba sentences and equivalence links between sentences {``}meaning the same thing{''}. This graph is then traversed to extract sets of paraphrases. Several language-independent filters and pruning steps are applied to remove uninteresting sentences. 
A manual evaluation performed on three languages shows that between half and three quarters of inferred paraphrases are correct and that most remaining ones are either correct but trivial, or near-paraphrases that neutralize a morphological distinction. The corpus contains a total of 1.9 million sentences, with 200 - 250 000 sentences per language. It covers a range of languages for which, to our knowledge, no other paraphrase dataset exists. The dataset is available at https://doi.org/10.5281/zenodo.3707949." - }, - "exams": { - "pwc_id": "exams", - "dataset_name": "EXAMS Dataset", - "dataset_abstract": "A new benchmark dataset for cross-lingual and multilingual question answering for high school examinations. Collects more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others. EXAMS offers a fine-grained evaluation framework across multiple languages and subjects, which allows precise analysis and comparison of various models.", - "paper_name": "EXAMS: A Multi-Subject High School Examinations Dataset for Cross-Lingual and Multilingual Question Answering", - "paper_abstract": "We propose EXAMS -- a new benchmark dataset for cross-lingual and multilingual question answering for high school examinations. We collected more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others. EXAMS offers a fine-grained evaluation framework across multiple languages and subjects, which allows precise analysis and comparison of various models. We perform various experiments with existing top-performing multilingual pre-trained models and we show that EXAMS offers multiple challenges that require multilingual knowledge and reasoning in multiple domains. We hope that EXAMS will enable researchers to explore challenging reasoning and knowledge transfer methods and pre-trained models for school question answering in various languages which was not possible before. The data, code, pre-trained models, and evaluation are available at https://github.com/mhardalov/exams-qa." - }, - "mnist": { - "pwc_id": "mnist", - "dataset_name": "MNIST Dataset", - "dataset_abstract": "The MNIST database (Modified National Institute of Standards and Technology database) is a large collection of handwritten digits. It has a training set of 60,000 examples, and a test set of 10,000 examples. It is a subset of a larger NIST Special Database 3 (digits written by employees of the United States Census Bureau) and Special Database 1 (digits written by high school students) which contain monochrome images of handwritten digits. The digits have been size-normalized and centered in a fixed-size image. The original black and white (bilevel) images from NIST were size normalized to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting images contain grey levels as a result of the anti-aliasing technique used by the normalization algorithm. the images were centered in a 28x28 image by computing the center of mass of the pixels, and translating the image so as to position this point at the center of the 28x28 field.", - "paper_name": "", - "paper_abstract": "" - }, - "mlsum": { - "pwc_id": "mlsum", - "dataset_name": "MLSUM Dataset", - "dataset_abstract": "A large-scale MultiLingual SUMmarization dataset. 
Obtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish. Together with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.", - "paper_name": "MLSUM: The Multilingual Summarization Corpus", - "paper_abstract": "We present MLSUM, the first large-scale MultiLingual SUMmarization dataset. Obtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish. Together with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community. We report cross-lingual comparative analyses based on state-of-the-art systems. These highlight existing biases which motivate the use of a multi-lingual dataset." - }, - "ccdv/cnn_dailymail": { - "pwc_id": "cnn-daily-mail-1", - "dataset_name": "CNN/Daily Mail Dataset", - "dataset_abstract": "CNN/Daily Mail is a dataset for text summarization. Human generated abstractive summary bullets were generated from news stories in CNN and Daily Mail websites as questions (with one of the entities hidden), and stories as the corresponding passages from which the system is expected to answer the fill-in the-blank question. The authors released the scripts that crawl, extract and generate pairs of passages and questions from these websites.\n\nIn all, the corpus has 286,817 training pairs, 13,368 validation pairs and 11,487 test pairs, as defined by their scripts. The source documents in the training set have 766 words spanning 29.74 sentences on an average while the summaries consist of 53 words and 3.72 sentences.", - "paper_name": "Abstractive Text Summarization Using Sequence-to-Sequence RNNs and Beyond", - "paper_abstract": "In this work, we model abstractive text summarization using Attentional\nEncoder-Decoder Recurrent Neural Networks, and show that they achieve\nstate-of-the-art performance on two different corpora. We propose several novel\nmodels that address critical problems in summarization that are not adequately\nmodeled by the basic architecture, such as modeling key-words, capturing the\nhierarchy of sentence-to-word structure, and emitting words that are rare or\nunseen at training time. Our work shows that many of our proposed models\ncontribute to further improvement in performance. We also propose a new dataset\nconsisting of multi-sentence summaries, and establish performance benchmarks\nfor further research." - }, - "e2e_nlg": { - "pwc_id": "e2e", - "dataset_name": "E2E Dataset", - "dataset_abstract": "End-to-End NLG Challenge (E2E) aims to assess whether recent end-to-end NLG systems can generate more complex output by learning from datasets containing higher lexical richness, syntactic complexity and diverse discourse phenomena.", - "paper_name": "The E2E Dataset: New Challenges For End-to-End Generation", - "paper_abstract": "This paper describes the E2E data, a new dataset for training end-to-end,\ndata-driven natural language generation systems in the restaurant domain, which\nis ten times bigger than existing, frequently used datasets in this area. 
The\nE2E dataset poses new challenges: (1) its human reference texts show more\nlexical richness and syntactic variation, including discourse phenomena; (2)\ngenerating from this set requires content selection. As such, learning from\nthis dataset promises more natural, varied and less template-like system\nutterances. We also establish a baseline on this dataset, which illustrates\nsome of the difficulties associated with this data." - }, - "medal": { - "pwc_id": "medal", - "dataset_name": "MeDAL Dataset", - "dataset_abstract": "The Medical Dataset for Abbreviation Disambiguation for Natural Language Understanding (MeDAL) is a large medical text dataset curated for abbreviation disambiguation, designed for natural language understanding pre-training in the medical domain. It was published at the ClinicalNLP workshop at EMNLP.", - "paper_name": "MeDAL: Medical Abbreviation Disambiguation Dataset for Natural Language Understanding Pretraining", - "paper_abstract": "One of the biggest challenges that prohibit the use of many current NLP methods in clinical settings is the availability of public datasets. In this work, we present MeDAL, a large medical text dataset curated for abbreviation disambiguation, designed for natural language understanding pre-training in the medical domain. We pre-trained several models of common architectures on this dataset and empirically showed that such pre-training leads to improved performance and convergence speed when fine-tuning on downstream medical tasks." - }, - "tatoeba": { - "pwc_id": "tatoeba", - "dataset_name": "Tatoeba Dataset", - "dataset_abstract": "The Tatoeba dataset consists of up to 1,000 English-aligned sentence pairs covering 122 languages.", - "paper_name": "Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond", - "paper_abstract": "We introduce an architecture to learn joint multilingual sentence representations for 93 languages, belonging to more than 30 different families and written in 28 different scripts. Our system uses a single BiLSTM encoder with a shared BPE vocabulary for all languages, which is coupled with an auxiliary decoder and trained on publicly available parallel corpora. This enables us to learn a classifier on top of the resulting embeddings using English annotated data only, and transfer it to any of the 93 languages without any modification. Our experiments in cross-lingual natural language inference (XNLI dataset), cross-lingual document classification (MLDoc dataset) and parallel corpus mining (BUCC dataset) show the effectiveness of our approach. We also introduce a new test set of aligned sentences in 112 languages, and show that our sentence embeddings obtain strong results in multilingual similarity search even for low-resource languages. Our implementation, the pre-trained encoder and the multilingual test set are available at https://github.com/facebookresearch/LASER" - }, - "clue": { - "pwc_id": "clue", - "dataset_name": "CLUE Dataset", - "dataset_abstract": "CLUE is a Chinese Language Understanding Evaluation benchmark. It consists of different NLU datasets. 
It is a community-driven project that brings together 9 tasks spanning several well-established single-sentence/sentence-pair classification tasks, as well as machine reading comprehension, all on original Chinese text.", - "paper_name": "CLUE: A Chinese Language Understanding Evaluation Benchmark", - "paper_abstract": "The advent of natural language understanding (NLU) benchmarks for English, such as GLUE and SuperGLUE allows new NLU models to be evaluated across a diverse set of tasks. These comprehensive benchmarks have facilitated a broad range of research and applications in natural language processing (NLP). The problem, however, is that most such benchmarks are limited to English, which has made it difficult to replicate many of the successes in English NLU for other languages. To help remedy this issue, we introduce the first large-scale Chinese Language Understanding Evaluation (CLUE) benchmark. CLUE is an open-ended, community-driven project that brings together 9 tasks spanning several well-established single-sentence/sentence-pair classification tasks, as well as machine reading comprehension, all on original Chinese text. To establish results on these tasks, we report scores using an exhaustive set of current state-of-the-art pre-trained Chinese models (9 in total). We also introduce a number of supplementary datasets and additional tools to help facilitate further progress on Chinese NLU. Our benchmark is released at https://www.CLUEbenchmarks.com" - }, - "gsm8k": { - "pwc_id": "gsm8k", - "dataset_name": "GSM8K Dataset", - "dataset_abstract": "GSM8K is a dataset of 8.5K high quality linguistically diverse grade school math word problems created by human problem writers. The dataset is segmented into 7.5K training problems and 1K test problems. These problems take between 2 and 8 steps to solve, and solutions primarily involve performing a sequence of elementary calculations using basic arithmetic operations (+ \u2212 \u00d7\u00f7) to reach the final answer. A bright middle school student should be able to solve every problem. It can be used for multi-step mathematical reasoning.", - "paper_name": "Training Verifiers to Solve Math Word Problems", - "paper_abstract": "State-of-the-art language models can match human performance on many tasks, but they still struggle to robustly perform multi-step mathematical reasoning. To diagnose the failures of current models and support research, we introduce GSM8K, a dataset of 8.5K high quality linguistically diverse grade school math word problems. We find that even the largest transformer models fail to achieve high test performance, despite the conceptual simplicity of this problem distribution. To increase performance, we propose training verifiers to judge the correctness of model completions. At test time, we generate many candidate solutions and select the one ranked highest by the verifier. We demonstrate that verification significantly improves performance on GSM8K, and we provide strong empirical evidence that verification scales more effectively with increased data than a finetuning baseline." - }, - "squad_kor_v1": { - "pwc_id": "korquad", - "dataset_name": "KorQuAD Dataset", - "dataset_abstract": "KorQuAD is a large-scale question-and-answer dataset constructed for Korean machine reading comprehension, and investigate the dataset to understand the distribution of answers and the types of reasoning required to answer the question. 
This dataset benchmarks the data generating process of SQuAD to meet the standard.", - "paper_name": "KorQuAD1.0: Korean QA Dataset for Machine Reading Comprehension", - "paper_abstract": "Machine Reading Comprehension (MRC) is a task that requires machine to understand natural language and answer questions by reading a document. It is the core of automatic response technology such as chatbots and automatized customer supporting systems. We present Korean Question Answering Dataset(KorQuAD), a large-scale Korean dataset for extractive machine reading comprehension task. It consists of 70,000+ human generated question-answer pairs on Korean Wikipedia articles. We release KorQuAD1.0 and launch a challenge at https://KorQuAD.github.io to encourage the development of multilingual natural language processing research." - }, - "cifar10": { - "pwc_id": "cifar-10", - "dataset_name": "CIFAR-10 Dataset", - "dataset_abstract": "The CIFAR-10 dataset (Canadian Institute for Advanced Research, 10 classes) is a subset of the Tiny Images dataset and consists of 60000 32x32 color images. The images are labelled with one of 10 mutually exclusive classes: airplane, automobile (but not truck or pickup truck), bird, cat, deer, dog, frog, horse, ship, and truck (but not pickup truck). There are 6000 images per class with 5000 training and 1000 testing images per class.\n\nThe criteria for deciding whether an image belongs to a class were as follows:\n\n\nThe class name should be high on the list of likely answers to the question \u201cWhat is in this picture?\u201d\nThe image should be photo-realistic. Labelers were instructed to reject line drawings.\nThe image should contain only one prominent instance of the object to which the class refers.\nThe object may be partially occluded or seen from an unusual viewpoint as long as its identity is still clear to the labeler.", - "paper_name": "", - "paper_abstract": "" - }, - "multi_woz_v22": { - "pwc_id": "multiwoz", - "dataset_name": "MultiWOZ Dataset", - "dataset_abstract": "The Multi-domain Wizard-of-Oz (MultiWOZ) dataset is a large-scale human-human conversational corpus spanning over seven domains, containing 8438 multi-turn dialogues, with each dialogue averaging 14 turns. Different from existing standard datasets like WOZ and DSTC2, which contain less than 10 slots and only a few hundred values, MultiWOZ has 30 (domain, slot) pairs and over 4,500 possible values. The dialogues span seven domains: restaurant, hotel, attraction, taxi, train, hospital and police.", - "paper_name": "MultiWOZ -- A Large-Scale Multi-Domain Wizard-of-Oz Dataset for Task-Oriented Dialogue Modelling", - "paper_abstract": "Even though machine learning has become the major scene in dialogue research community, the real breakthrough has been blocked by the scale of data available. To address this fundamental obstacle, we introduce the Multi-Domain Wizard-of-Oz dataset (MultiWOZ), a fully-labeled collection of human-human written conversations spanning over multiple domains and topics. At a size of $10$k dialogues, it is at least one order of magnitude larger than all previous annotated task-oriented corpora. The contribution of this work apart from the open-sourced dataset labelled with dialogue belief states and dialogue actions is two-fold: firstly, a detailed description of the data collection procedure along with a summary of data structure and analysis is provided. 
The proposed data-collection pipeline is entirely based on crowd-sourcing without the need of hiring professional annotators; secondly, a set of benchmark results of belief tracking, dialogue act and response generation is reported, which shows the usability of the data and sets a baseline for future studies." - }, - "nsmc": { - "pwc_id": "nsmc", - "dataset_name": "NSMC Dataset", - "dataset_abstract": "This is a movie review dataset in the Korean language. Reviews were scraped from Naver Movies.", - "paper_name": "", - "paper_abstract": "" - }, - "conllpp": { - "pwc_id": "conll", - "dataset_name": "CoNLL++ Dataset", - "dataset_abstract": "CoNLL++ is a corrected version of the CoNLL03 NER dataset where 5.38% of the test sentences have been fixed.", - "paper_name": "CrossWeigh: Training Named Entity Tagger from Imperfect Annotations", - "paper_abstract": "Everyone makes mistakes. So do human annotators when curating labels for named entity recognition (NER). Such label mistakes might hurt model training and interfere model comparison. In this study, we dive deep into one of the widely-adopted NER benchmark datasets, CoNLL03 NER. We are able to identify label mistakes in about 5.38% test sentences, which is a significant ratio considering that the state-of-the-art test F1 score is already around 93%. Therefore, we manually correct these label mistakes and form a cleaner test set. Our re-evaluation of popular models on this corrected test set leads to more accurate assessments, compared to those on the original test set. More importantly, we propose a simple yet effective framework, CrossWeigh, to handle label mistakes during NER model training. Specifically, it partitions the training data into several folds and train independent NER models to identify potential mistakes in each fold. Then it adjusts the weights of training data accordingly to train the final NER model. Extensive experiments demonstrate significant improvements of plugging various NER models into our proposed framework on three datasets. All implementations and corrected test set are available at our Github repo: https://github.com/ZihanWangKi/CrossWeigh." - }, - "wikisql": { - "pwc_id": "wikisql", - "dataset_name": "WikiSQL Dataset", - "dataset_abstract": "WikiSQL consists of a corpus of 87,726 hand-annotated SQL query and natural language question pairs. These SQL queries are further split into training (61,297 examples), development (9,145 examples) and test sets (17,284 examples). It can be used for natural language inference tasks related to relational databases.", - "paper_name": "Seq2SQL: Generating Structured Queries from Natural Language using Reinforcement Learning", - "paper_abstract": "A significant amount of the world's knowledge is stored in relational\ndatabases. However, the ability for users to retrieve facts from a database is\nlimited due to a lack of understanding of query languages such as SQL. We\npropose Seq2SQL, a deep neural network for translating natural language\nquestions to corresponding SQL queries. Our model leverages the structure of\nSQL queries to significantly reduce the output space of generated queries.\nMoreover, we use rewards from in-the-loop query execution over the database to\nlearn a policy to generate unordered parts of the query, which we show are less\nsuitable for optimization via cross entropy loss. In addition, we will publish\nWikiSQL, a dataset of 80654 hand-annotated examples of questions and SQL\nqueries distributed across 24241 tables from Wikipedia. 
This dataset is\nrequired to train our model and is an order of magnitude larger than comparable\ndatasets. By applying policy-based reinforcement learning with a query\nexecution environment to WikiSQL, our model Seq2SQL outperforms attentional\nsequence to sequence models, improving execution accuracy from 35.9% to 59.4%\nand logical form accuracy from 23.4% to 48.3%." - }, - "big_patent": { - "pwc_id": "bigpatent", - "dataset_name": "BigPatent Dataset", - "dataset_abstract": "Consists of 1.3 million records of U.S. patent documents along with human written abstractive summaries.", - "paper_name": "BIGPATENT: A Large-Scale Dataset for Abstractive and Coherent Summarization", - "paper_abstract": "Most existing text summarization datasets are compiled from the news domain, where summaries have a flattened discourse structure. In such datasets, summary-worthy content often appears in the beginning of input articles. Moreover, large segments from input articles are present verbatim in their respective summaries. These issues impede the learning and evaluation of systems that can understand an article's global content structure as well as produce abstractive summaries with high compression ratio. In this work, we present a novel dataset, BIGPATENT, consisting of 1.3 million records of U.S. patent documents along with human written abstractive summaries. Compared to existing summarization datasets, BIGPATENT has the following properties: i) summaries contain a richer discourse structure with more recurring entities, ii) salient content is evenly distributed in the input, and iii) lesser and shorter extractive fragments are present in the summaries. Finally, we train and evaluate baselines and popular learning models on BIGPATENT to shed light on new challenges and motivate future directions for summarization research." - }, - "md_gender_bias": { - "pwc_id": "md-gender", - "dataset_name": "MD Gender Dataset", - "dataset_abstract": "Provides eight automatically annotated large scale datasets with gender information.", - "paper_name": "Multi-Dimensional Gender Bias Classification", - "paper_abstract": "Machine learning models are trained to find patterns in data. NLP models can inadvertently learn socially undesirable patterns when training on gender biased text. In this work, we propose a general framework that decomposes gender bias in text along several pragmatic and semantic dimensions: bias from the gender of the person being spoken about, bias from the gender of the person being spoken to, and bias from the gender of the speaker. Using this fine-grained framework, we automatically annotate eight large scale datasets with gender information. In addition, we collect a novel, crowdsourced evaluation benchmark of utterance-level gender rewrites. Distinguishing between gender bias along multiple dimensions is important, as it enables us to train finer-grained gender bias classifiers. We show our classifiers prove valuable for a variety of important applications, such as controlling for gender bias in generative models, detecting gender bias in arbitrary text, and shed light on offensive language in terms of genderedness." 
- }, - "polyglot_ner": { - "pwc_id": "polyglot-ner", - "dataset_name": "Polyglot-NER Dataset", - "dataset_abstract": "Polyglot-NER builds massive multilingual annotators with minimal human expertise and intervention.", - "paper_name": "POLYGLOT-NER: Massive Multilingual Named Entity Recognition", - "paper_abstract": "The increasing diversity of languages used on the web introduces a new level\nof complexity to Information Retrieval (IR) systems. We can no longer assume\nthat textual content is written in one language or even the same language\nfamily. In this paper, we demonstrate how to build massive multilingual\nannotators with minimal human expertise and intervention. We describe a system\nthat builds Named Entity Recognition (NER) annotators for 40 major languages\nusing Wikipedia and Freebase. Our approach does not require NER human annotated\ndatasets or language specific resources like treebanks, parallel corpora, and\northographic rules. The novelty of approach lies therein - using only language\nagnostic techniques, while achieving competitive performance.\n Our method learns distributed word representations (word embeddings) which\nencode semantic and syntactic features of words in each language. Then, we\nautomatically generate datasets from Wikipedia link structure and Freebase\nattributes. Finally, we apply two preprocessing stages (oversampling and exact\nsurface form matching) which do not require any linguistic expertise.\n Our evaluation is two fold: First, we demonstrate the system performance on\nhuman annotated datasets. Second, for languages where no gold-standard\nbenchmarks are available, we propose a new method, distant evaluation, based on\nstatistical machine translation." - }, - "imppres": { - "pwc_id": "imppres", - "dataset_name": "IMPPRES Dataset", - "dataset_abstract": "An IMPlicature and PRESupposition diagnostic dataset (IMPPRES), consisting of >25k semiautomatically generated sentence pairs illustrating well-studied pragmatic inference types.", - "paper_name": "Are Natural Language Inference Models IMPPRESsive? Learning IMPlicature and PRESupposition", - "paper_abstract": "Natural language inference (NLI) is an increasingly important task for natural language understanding, which requires one to infer whether a sentence entails another. However, the ability of NLI models to make pragmatic inferences remains understudied. We create an IMPlicature and PRESupposition diagnostic dataset (IMPPRES), consisting of >25k semiautomatically generated sentence pairs illustrating well-studied pragmatic inference types. We use IMPPRES to evaluate whether BERT, InferSent, and BOW NLI models trained on MultiNLI (Williams et al., 2018) learn to make pragmatic inferences. Although MultiNLI appears to contain very few pairs illustrating these inference types, we find that BERT learns to draw pragmatic inferences. It reliably treats scalar implicatures triggered by \"some\" as entailments. For some presupposition triggers like \"only\", BERT reliably recognizes the presupposition as an entailment, even when the trigger is embedded under an entailment canceling operator like negation. BOW and InferSent show weaker evidence of pragmatic reasoning. We conclude that NLI training encourages models to learn some, but not all, pragmatic inferences." 
- }, - "indonlu": { - "pwc_id": "indonlu-benchmark", - "dataset_name": "IndoNLU Benchmark Dataset", - "dataset_abstract": "The IndoNLU benchmark is a collection of resources for training, evaluating, and analyzing natural language understanding systems for Bahasa Indonesia. It is a joint venture from many Indonesia NLP enthusiasts from different institutions such as Gojek, Institut Teknologi Bandung, HKUST, Universitas Multimedia Nusantara, Prosa.ai, and Universitas Indonesia.", - "paper_name": "IndoNLU: Benchmark and Resources for Evaluating Indonesian Natural Language Understanding", - "paper_abstract": "Although Indonesian is known to be the fourth most frequently used language over the internet, the research progress on this language in the natural language processing (NLP) is slow-moving due to a lack of available resources. In response, we introduce the first-ever vast resource for the training, evaluating, and benchmarking on Indonesian natural language understanding (IndoNLU) tasks. IndoNLU includes twelve tasks, ranging from single sentence classification to pair-sentences sequence labeling with different levels of complexity. The datasets for the tasks lie in different domains and styles to ensure task diversity. We also provide a set of Indonesian pre-trained models (IndoBERT) trained from a large and clean Indonesian dataset Indo4B collected from publicly available sources such as social media texts, blogs, news, and websites. We release baseline models for all twelve tasks, as well as the framework for benchmark evaluation, and thus it enables everyone to benchmark their system performances." - }, - "wmt18": { - "pwc_id": "wmt-2018", - "dataset_name": "WMT 2018 Dataset", - "dataset_abstract": "WMT 2018 is a collection of datasets used in shared tasks of the Third Conference on Machine Translation. The conference builds on a series of twelve previous annual workshops and conferences on Statistical Machine Translation.\n\nThe conference featured ten shared tasks:\n\n\na news translation task,\na biomedical translation task,\na multimodal machine translation task,\na metrics task,\na quality estimation task,\nan automatic post-editing task,\na parallel corpus filtering task.", - "paper_name": "Findings of the 2018 Conference on Machine Translation (WMT18)", - "paper_abstract": "This paper presents the results of the premier shared task organized alongside the Conference on Machine Translation (WMT) 2018. Participants were asked to build machine translation systems for any of 7 language pairs in both directions, to be evaluated on a test set of news stories. The main metric for this task is human judgment of translation quality. This year, we also opened up the task to additional test sets to probe specific aspects of translation." - }, - "wiki_lingua": { - "pwc_id": "wikilingua", - "dataset_name": "WikiLingua Dataset", - "dataset_abstract": "WikiLingua includes ~770k article and summary pairs in 18 languages from WikiHow. Gold-standard article-summary alignments across languages are extracted by aligning the images that are used to describe each how-to step in an article.", - "paper_name": "WikiLingua: A New Benchmark Dataset for Cross-Lingual Abstractive Summarization", - "paper_abstract": "We introduce WikiLingua, a large-scale, multilingual dataset for the evaluation of crosslingual abstractive summarization systems. 
We extract article and summary pairs in 18 languages from WikiHow, a high quality, collaborative resource of how-to guides on a diverse set of topics written by human authors. We create gold-standard article-summary alignments across languages by aligning the images that are used to describe each how-to step in an article. As a set of baselines for further studies, we evaluate the performance of existing cross-lingual abstractive summarization methods on our dataset. We further propose a method for direct crosslingual summarization (i.e., without requiring translation at inference time) by leveraging synthetic data and Neural Machine Translation as a pre-training step. Our method significantly outperforms the baseline approaches, while being more cost efficient during inference." - }, - "lince": { - "pwc_id": "lince", - "dataset_name": "LinCE Dataset", - "dataset_abstract": "A centralized benchmark for Linguistic Code-switching Evaluation (LinCE) that combines ten corpora covering four different code-switched language pairs (i.e., Spanish-English, Nepali-English, Hindi-English, and Modern Standard Arabic-Egyptian Arabic) and four tasks (i.e., language identification, named entity recognition, part-of-speech tagging, and sentiment analysis).", - "paper_name": "LinCE: A Centralized Benchmark for Linguistic Code-switching Evaluation", - "paper_abstract": "Recent trends in NLP research have raised an interest in linguistic code-switching (CS); modern approaches have been proposed to solve a wide range of NLP tasks on multiple language pairs. Unfortunately, these proposed methods are hardly generalizable to different code-switched languages. In addition, it is unclear whether a model architecture is applicable for a different task while still being compatible with the code-switching setting. This is mainly because of the lack of a centralized benchmark and the sparse corpora that researchers employ based on their specific needs and interests. To facilitate research in this direction, we propose a centralized benchmark for Linguistic Code-switching Evaluation (LinCE) that combines ten corpora covering four different code-switched language pairs (i.e., Spanish-English, Nepali-English, Hindi-English, and Modern Standard Arabic-Egyptian Arabic) and four tasks (i.e., language identification, named entity recognition, part-of-speech tagging, and sentiment analysis). As part of the benchmark centralization effort, we provide an online platform at ritual.uh.edu/lince, where researchers can submit their results while comparing with others in real-time. In addition, we provide the scores of different popular models, including LSTM, ELMo, and multilingual BERT so that the NLP community can compare against state-of-the-art systems. LinCE is a continuous effort, and we will expand it with more low-resource languages and tasks." - }, - "spider": { - "pwc_id": "spider-1", - "dataset_name": "SPIDER Dataset", - "dataset_abstract": "Spider is a large-scale complex and cross-domain semantic parsing and text-to-SQL dataset annotated by 11 Yale students. The goal of the Spider challenge is to develop natural language interfaces to cross-domain databases. It consists of 10,181 questions and 5,693 unique complex SQL queries on 200 databases with multiple tables covering 138 different domains. In Spider 1.0, different complex SQL queries and databases appear in train and test sets. 
To do well on it, systems must generalize well to not only new SQL queries but also new database schemas.", - "paper_name": "Spider: A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task", - "paper_abstract": "We present Spider, a large-scale, complex and cross-domain semantic parsing\nand text-to-SQL dataset annotated by 11 college students. It consists of 10,181\nquestions and 5,693 unique complex SQL queries on 200 databases with multiple\ntables, covering 138 different domains. We define a new complex and\ncross-domain semantic parsing and text-to-SQL task where different complex SQL\nqueries and databases appear in train and test sets. In this way, the task\nrequires the model to generalize well to both new SQL queries and new database\nschemas. Spider is distinct from most of the previous semantic parsing tasks\nbecause they all use a single database and the exact same programs in the train\nset and the test set. We experiment with various state-of-the-art models and\nthe best model achieves only 12.4% exact matching accuracy on a database split\nsetting. This shows that Spider presents a strong challenge for future\nresearch. Our dataset and task are publicly available at\nhttps://yale-lily.github.io/spider" - }, - "bookcorpusopen": { - "pwc_id": "bookcorpus", - "dataset_name": "BookCorpus Dataset", - "dataset_abstract": "BookCorpus is a large collection of free novel books written by unpublished authors, which contains 11,038 books (around 74M sentences and 1G words) of 16 different sub-genres (e.g., Romance, Historical, Adventure, etc.).", - "paper_name": "Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books", - "paper_abstract": "Books are a rich source of both fine-grained information, how a character, an\nobject or a scene looks like, as well as high-level semantics, what someone is\nthinking, feeling and how these states evolve through a story. This paper aims\nto align books to their movie releases in order to provide rich descriptive\nexplanations for visual content that go semantically far beyond the captions\navailable in current datasets. To align movies and books we exploit a neural\nsentence embedding that is trained in an unsupervised way from a large corpus\nof books, as well as a video-text neural embedding for computing similarities\nbetween movie clips and sentences in the book. We propose a context-aware CNN\nto combine information from multiple sources. We demonstrate good quantitative\nperformance for movie/book alignment and show several qualitative examples that\nshowcase the diversity of tasks our model can be used for." - }, - "alt": { - "pwc_id": "alt", - "dataset_name": "ALT Dataset", - "dataset_abstract": "The ALT project aims to advance the state-of-the-art Asian natural language processing (NLP) techniques through the open collaboration for developing and using ALT. It was first conducted by NICT and UCSY as described in Ye Kyaw Thu, Win Pa Pa, Masao Utiyama, Andrew Finch and Eiichiro Sumita (2016). Then, it was developed under ASEAN IVO as described in this Web page. The process of building ALT began with sampling about 20,000 sentences from English Wikinews, and then these sentences were translated into the other languages. 
ALT now has 13 languages: Bengali, English, Filipino, Hindi, Bahasa Indonesia, Japanese, Khmer, Lao, Malay, Myanmar (Burmese), Thai, Vietnamese, Chinese (Simplified Chinese).", - "paper_name": "", - "paper_abstract": "" - }, - "lener_br": { - "pwc_id": "lener-br", - "dataset_name": "LeNER-Br Dataset", - "dataset_abstract": "LeNER-Br is a dataset for named entity recognition (NER) in Brazilian Legal Text.", - "paper_name": "LeNER-Br: a Dataset for Named Entity Recognition in Brazilian Legal Text", - "paper_abstract": "Named entity recognition systems have the untapped potential to extract information from legal documents, which can improve\r\ninformation retrieval and decision-making processes. In this paper, a dataset for named entity recognition in Brazilian legal documents is presented. Unlike other Portuguese language datasets, this dataset is composed entirely of legal documents. In addition to tags for persons, locations, time entities and organizations, the dataset contains specific tags for law and legal cases entities. To establish a set of baseline results, we first performed experiments on another Portuguese dataset: Paramopama. This evaluation demonstrates that LSTM-CRF gives results that are significantly better than those previously reported. We then retrained LSTM-CRF on our dataset and obtained F1 scores of 97.04% and 88.82% for Legislation and Legal case entities, respectively.\r\nThese results show the viability of the proposed dataset for legal applications." - }, - "german_legal_entity_recognition": { - "pwc_id": "legal-documents-entity-recognition", - "dataset_name": "Legal Documents Entity Recognition Dataset", - "dataset_abstract": "Court decisions from 2017 and 2018 were selected for the dataset, published online by the Federal Ministry of Justice and Consumer Protection. The documents originate from seven federal courts: Federal Labour Court (BAG), Federal Fiscal Court (BFH), Federal Court of Justice (BGH), Federal Patent Court (BPatG), Federal Social Court (BSG), Federal Constitutional Court (BVerfG) and Federal Administrative Court (BVerwG).", - "paper_name": "", - "paper_abstract": "" - }, - "reclor": { - "pwc_id": "reclor", - "dataset_name": "ReClor Dataset", - "dataset_abstract": "Logical reasoning is an important ability to examine, analyze, and critically evaluate arguments as they occur in ordinary language, as defined by the Law School Admission Council. ReClor is a dataset extracted from logical reasoning questions of standardized graduate admission examinations.", - "paper_name": "ReClor: A Reading Comprehension Dataset Requiring Logical Reasoning", - "paper_abstract": "Recent powerful pre-trained language models have achieved remarkable performance on most of the popular datasets for reading comprehension. It is time to introduce more challenging datasets to push the development of this field towards more comprehensive reasoning of text. In this paper, we introduce a new Reading Comprehension dataset requiring logical reasoning (ReClor) extracted from standardized graduate admission examinations. As earlier studies suggest, human-annotated datasets usually contain biases, which are often exploited by models to achieve high accuracy without truly understanding the text. In order to comprehensively evaluate the logical reasoning ability of models on ReClor, we propose to identify biased data points and separate them into an EASY set, while the rest form a HARD set. 
Empirical results show that state-of-the-art models have an outstanding ability to capture biases contained in the dataset with high accuracy on EASY set. However, they struggle on HARD set with poor performance near that of random guess, indicating more research is needed to essentially enhance the logical reasoning ability of current models." - }, - "qasper": { - "pwc_id": "qasper", - "dataset_name": "QASPER Dataset", - "dataset_abstract": "QASPER is a dataset for question answering on scientific research papers. It consists of 5,049 questions over 1,585 Natural Language Processing papers. Each question is written by an NLP practitioner who read only the title and abstract of the corresponding paper, and the question seeks information present in the full text. The questions are then answered by a separate set of NLP practitioners who also provide supporting evidence to answers.", - "paper_name": "A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers", - "paper_abstract": "Readers of academic research papers often read with the goal of answering specific questions. Question Answering systems that can answer those questions can make consumption of the content much more efficient. However, building such tools requires data that reflect the difficulty of the task arising from complex reasoning about claims made in multiple parts of a paper. In contrast, existing information-seeking question answering datasets usually contain questions about generic factoid-type information. We therefore present QASPER, a dataset of 5,049 questions over 1,585 Natural Language Processing papers. Each question is written by an NLP practitioner who read only the title and abstract of the corresponding paper, and the question seeks information present in the full text. The questions are then answered by a separate set of NLP practitioners who also provide supporting evidence to answers. We find that existing models that do well on other QA tasks do not perform well on answering these questions, underperforming humans by at least 27 F1 points when answering them from entire papers, motivating further research in document-grounded, information-seeking QA, which our dataset is designed to facilitate." - }, - "svhn": { - "pwc_id": "svhn", - "dataset_name": "SVHN Dataset", - "dataset_abstract": "Street View House Numbers (SVHN) is a digit classification benchmark dataset that contains 600,000 32\u00d732 RGB images of printed digits (from 0 to 9) cropped from pictures of house number plates. The cropped images are centered in the digit of interest, but nearby digits and other distractors are kept in the image. SVHN has three sets: training, testing sets and an extra set with 530,000 images that are less difficult and can be used for helping with the training process.", - "paper_name": "", - "paper_abstract": "" - }, - "wiki_asp": { - "pwc_id": "wikiasp", - "dataset_name": "WikiAsp Dataset", - "dataset_abstract": "A large-scale dataset for multi-domain aspect-based summarization that attempts to spur research in the direction of open-domain aspect-based summarization.", - "paper_name": "WikiAsp: A Dataset for Multi-domain Aspect-based Summarization", - "paper_abstract": "Aspect-based summarization is the task of generating focused summaries based on specific points of interest. Such summaries aid efficient analysis of text, such as quickly understanding reviews or opinions from different angles. 
However, due to large differences in the type of aspects for different domains (e.g., sentiment, product features), the development of previous models has tended to be domain-specific. In this paper, we propose WikiAsp, a large-scale dataset for multi-domain aspect-based summarization that attempts to spur research in the direction of open-domain aspect-based summarization. Specifically, we build the dataset using Wikipedia articles from 20 different domains, using the section titles and boundaries of each article as a proxy for aspect annotation. We propose several straightforward baseline models for this task and conduct experiments on the dataset. Results highlight key challenges that existing summarization models face in this setting, such as proper pronoun handling of quoted sources and consistent explanation of time-sensitive events." - }, - "cfq": { - "pwc_id": "cfq", - "dataset_name": "CFQ Dataset", - "dataset_abstract": "A large and realistic natural language question answering dataset.", - "paper_name": "Measuring Compositional Generalization: A Comprehensive Method on Realistic Data", - "paper_abstract": "State-of-the-art machine learning methods exhibit limited compositional generalization. At the same time, there is a lack of realistic benchmarks that comprehensively measure this ability, which makes it challenging to find and evaluate improvements. We introduce a novel method to systematically construct such benchmarks by maximizing compound divergence while guaranteeing a small atom divergence between train and test sets, and we quantitatively compare this method to other approaches for creating compositional generalization benchmarks. We present a large and realistic natural language question answering dataset that is constructed according to this method, and we use it to analyze the compositional generalization ability of three machine learning architectures. We find that they fail to generalize compositionally and that there is a surprisingly strong negative correlation between compound divergence and accuracy. We also demonstrate how our method can be used to create new compositionality benchmarks on top of the existing SCAN dataset, which confirms these findings." - }, - "wmt15": { - "pwc_id": "wmt-2015", - "dataset_name": "WMT 2015 Dataset", - "dataset_abstract": "WMT 2015 is a collection of datasets used in shared tasks of the Tenth Workshop on Statistical Machine Translation. 
The workshop featured five tasks:\n\n\na news translation task,\na metrics task,\na tuning task,\na quality estimation task,\nan automatic post-editing task.", - "paper_name": "", - "paper_abstract": "" - }, - "conll2012_ontonotesv5": { - "pwc_id": "ontonotes-5-0", - "dataset_name": "OntoNotes 5.0 Dataset", - "dataset_abstract": "OntoNotes 5.0 is a large corpus comprising various genres of text (news, conversational telephone speech, weblogs, usenet newsgroups, broadcast, talk shows) in three languages (English, Chinese, and Arabic) with structural information (syntax and predicate argument structure) and shallow semantics (word sense linked to an ontology and coreference).\n\nOntoNotes Release 5.0 contains the content of earlier releases - and adds source data from and/or additional annotations for, newswire, broadcast news, broadcast conversation, telephone conversation and web data in English and Chinese and newswire data in Arabic.", - "paper_name": "", - "paper_abstract": "" - }, - "para_crawl": { - "pwc_id": "paracrawl", - "dataset_name": "ParaCrawl Dataset", - "dataset_abstract": "ParaCrawl v.7.1 is a parallel dataset with 41 language pairs primarily aligned with English (39 out of 41) and mined using the parallel-data-crawling tool Bitextor which includes downloading documents, preprocessing and normalization, aligning documents and segments, and filtering noisy data via Bicleaner. ParaCrawl focuses on European languages, but also includes 9 lower-resource, non-European language pairs in v7.1.", - "paper_name": "ParaCrawl: Web-Scale Acquisition of Parallel Corpora", - "paper_abstract": "We report on methods to create the largest publicly available parallel corpora by crawling the web, using open source software. We empirically compare alternative methods and publish benchmark data sets for sentence alignment and sentence pair filtering. We also describe the parallel corpora released and evaluate their quality and their usefulness to create machine translation systems." - }, - "web_nlg": { - "pwc_id": "webnlg", - "dataset_name": "WebNLG Dataset", - "dataset_abstract": "The WebNLG corpus comprises of sets of triplets describing facts (entities and relations between them) and the corresponding facts in form of natural language text. The corpus contains sets with up to 7 triplets each along with one or more reference texts for each set. The test set is split into two parts: seen, containing inputs created for entities and relations belonging to DBpedia categories that were seen in the training data, and unseen, containing inputs extracted for entities and relations belonging to 5 unseen categories.\n\nInitially, the dataset was used for the WebNLG natural language generation challenge which consists of mapping the sets of triplets to text, including referring expression generation, aggregation, lexicalization, surface realization, and sentence segmentation.\nThe corpus is also used for a reverse task of triplets extraction.\n\nVersioning history of the dataset can be found here.", - "paper_name": "Creating Training Corpora for NLG Micro-Planners", - "paper_abstract": "In this paper, we present a novel framework for semi-automatically creating linguistically challenging micro-planning data-to-text corpora from existing Knowledge Bases. Because our method pairs data of varying size and shape with texts ranging from simple clauses to short texts, a dataset created using this framework provides a challenging benchmark for microplanning. 
Another feature of this framework is that it can be applied to any large scale knowledge base and can therefore be used to train and learn KB verbalisers. We apply our framework to DBpedia data and compare the resulting dataset with Wen et al. (2016)'s. We show that while Wen et al.'s dataset is more than twice as large as ours, it is less diverse both in terms of input and in terms of text. We thus propose our corpus generation framework as a novel method for creating challenging data sets from which NLG models can be learned which are capable of handling the complex interactions occurring during micro-planning between lexicalisation, aggregation, surface realisation, referring expression generation and sentence segmentation. To encourage researchers to take up this challenge, we made available a dataset of 21,855 data/text pairs created using this framework in the context of the WebNLG shared task." - }, - "cifar100": { - "pwc_id": "cifar-100", - "dataset_name": "CIFAR-100 Dataset", - "dataset_abstract": "The CIFAR-100 dataset (Canadian Institute for Advanced Research, 100 classes) is a subset of the Tiny Images dataset and consists of 60000 32x32 color images. The 100 classes in the CIFAR-100 are grouped into 20 superclasses. There are 600 images per class. Each image comes with a \"fine\" label (the class to which it belongs) and a \"coarse\" label (the superclass to which it belongs). There are 500 training images and 100 testing images per class.\n\nThe criteria for deciding whether an image belongs to a class were as follows:\n\n\nThe class name should be high on the list of likely answers to the question \u201cWhat is in this picture?\u201d\nThe image should be photo-realistic. Labelers were instructed to reject line drawings.\nThe image should contain only one prominent instance of the object to which the class refers.\nThe object may be partially occluded or seen from an unusual viewpoint as long as its identity is still clear to the labeler.", - "paper_name": "", - "paper_abstract": "" - }, - "hatexplain": { - "pwc_id": "hatexplain", - "dataset_name": "HateXplain Dataset", - "dataset_abstract": "HateXplain covers multiple aspects of the issue. Each post in the dataset is annotated from three different perspectives: the basic, commonly used 3-class classification (i.e., hate, offensive or normal), the target community (i.e., the community that has been the victim of hate speech/offensive speech in the post), and the rationales, i.e., the portions of the post on which their labelling decision (as hate, offensive or normal) is based.", - "paper_name": "HateXplain: A Benchmark Dataset for Explainable Hate Speech Detection", - "paper_abstract": "Hate speech is a challenging issue plaguing the online social media. While better models for hate speech detection are continuously being developed, there is little research on the bias and interpretability aspects of hate speech. In this paper, we introduce HateXplain, the first benchmark hate speech dataset covering multiple aspects of the issue. Each post in our dataset is annotated from three different perspectives: the basic, commonly used 3-class classification (i.e., hate, offensive or normal), the target community (i.e., the community that has been the victim of hate speech/offensive speech in the post), and the rationales, i.e., the portions of the post on which their labelling decision (as hate, offensive or normal) is based. 
We utilize existing state-of-the-art models and observe that even models that perform very well in classification do not score high on explainability metrics like model plausibility and faithfulness. We also observe that models, which utilize the human rationales for training, perform better in reducing unintended bias towards target communities. We have made our code and dataset public at https://github.com/punyajoy/HateXplain" - }, - "biomrc": { - "pwc_id": "biomrc", - "dataset_name": "BIOMRC Dataset", - "dataset_abstract": "A large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the previous BIOREAD dataset of Pappas et al. (2018).", - "paper_name": "BIOMRC: A Dataset for Biomedical Machine Reading Comprehension", - "paper_abstract": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the previous BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the new dataset, and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating that the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is also higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new BERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or surpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different sizes, also releasing our code, and providing a leaderboard." - }, - "break_data": { - "pwc_id": "break", - "dataset_name": "BREAK Dataset", - "dataset_abstract": "Break is a question understanding dataset, aimed at training models to reason over complex questions. It features 83,978 natural language questions, annotated with a new meaning representation, Question Decomposition Meaning Representation (QDMR). Each example has the natural question along with its QDMR representation. Break contains human composed questions, sampled from 10 leading question-answering benchmarks over text, images and databases. This dataset was created by a team of NLP researchers at Tel Aviv University and Allen Institute for AI.", - "paper_name": "Break It Down: A Question Understanding Benchmark", - "paper_abstract": "Understanding natural language questions entails the ability to break down a question into the requisite steps for computing its answer. In this work, we introduce a Question Decomposition Meaning Representation (QDMR) for questions. QDMR constitutes the ordered list of steps, expressed through natural language, that are necessary for answering a question. We develop a crowdsourcing pipeline, showing that quality QDMRs can be annotated at scale, and release the Break dataset, containing over 83K pairs of questions and their QDMRs. We demonstrate the utility of QDMR by showing that (a) it can be used to improve open-domain question answering on the HotpotQA dataset, (b) it can be deterministically converted to a pseudo-SQL formal language, which can alleviate annotation in semantic parsing applications. Last, we use Break to train a sequence-to-sequence model with copying that parses questions into QDMR structures, and show that it substantially outperforms several natural baselines." 
- }, - "conll2002": { - "pwc_id": "conll-2002", - "dataset_name": "CoNLL 2002 Dataset", - "dataset_abstract": "The shared task of CoNLL-2002 concerns language-independent named entity recognition. The types of named entities include: persons, locations, organizations and names of miscellaneous entities that do not belong to the previous three groups. The participants of the shared task were offered training and test data for at least two languages. Information sources other than the training data might have been used in this shared task.", - "paper_name": "", - "paper_abstract": "" - }, - "multilingual_librispeech": { - "pwc_id": "librispeech-1", - "dataset_name": "LibriSpeech Dataset", - "dataset_abstract": "The LibriSpeech corpus is a collection of approximately 1,000 hours of audiobooks that are a part of the LibriVox project. Most of the audiobooks come from the Project Gutenberg. The training data is split into 3 partitions of 100hr, 360hr, and 500hr sets while the dev and test data are split into the \u2019clean\u2019 and \u2019other\u2019 categories, respectively, depending upon how well or challening Automatic Speech Recognition systems would perform against. Each of the dev and test sets is around 5hr in audio length. This corpus also provides the n-gram language models and the corresponding texts excerpted from the Project Gutenberg books, which contain 803M tokens and 977K unique words.", - "paper_name": "", - "paper_abstract": "" - }, - "daily_dialog": { - "pwc_id": "dailydialog", - "dataset_name": "DailyDialog Dataset", - "dataset_abstract": "DailyDialog is a high-quality multi-turn open-domain English dialog dataset. It contains 13,118 dialogues split into a training set with 11,118 dialogues and validation and test sets with 1000 dialogues each. On average there are around 8 speaker turns per dialogue with around 15 tokens per turn.", - "paper_name": "DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset", - "paper_abstract": "We develop a high-quality multi-turn dialog dataset, DailyDialog, which is\nintriguing in several aspects. The language is human-written and less noisy.\nThe dialogues in the dataset reflect our daily communication way and cover\nvarious topics about our daily life. We also manually label the developed\ndataset with communication intention and emotion information. Then, we evaluate\nexisting approaches on DailyDialog dataset and hope it benefit the research\nfield of dialog systems." - }, - "un_multi": { - "pwc_id": "multiun", - "dataset_name": "MultiUN Dataset", - "dataset_abstract": "The MultiUN parallel corpus is extracted from the United Nations Website , and then cleaned and converted to XML at Language Technology Lab in DFKI GmbH (LT-DFKI), Germany. The documents were published by UN from 2000 to 2009.", - "paper_name": "", - "paper_abstract": "" - }, - "para_pat": { - "pwc_id": "parapat", - "dataset_name": "ParaPat Dataset", - "dataset_abstract": "A parallel corpus from the open access Google Patents dataset in 74 language pairs, comprising more than 68 million sentences and 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm for the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.", - "paper_name": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts", - "paper_abstract": "The Google Patents is one of the main important sources of patents information. 
A striking characteristic is that many of its abstracts are presented in more than one language, thus making it a potential source of parallel corpora. This article presents the development of a parallel corpus from the open access Google Patents dataset in 74 language pairs, comprising more than 68 million sentences and 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm for the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned. We demonstrate the capabilities of our corpus by training Neural Machine Translation (NMT) models for the main 9 language pairs, with a total of 18 models. Our parallel corpus is freely available in TSV format and with a SQLite database, with complementary information regarding patent metadata." - }, - "tweet_qa": { - "pwc_id": "tweetqa", - "dataset_name": "TweetQA Dataset", - "dataset_abstract": "With social media becoming increasingly popular as a medium on which lots of news and real-time events are reported, developing automated question answering systems is critical to the effectiveness of many applications that rely on real-time knowledge. While previous question answering (QA) datasets have concentrated on formal text like news and Wikipedia, the first large-scale dataset for QA over social media data is presented. To make sure the tweets are meaningful and contain interesting information, tweets used by journalists to write news articles are gathered. Then human annotators are asked to write questions and answers upon these tweets. Unlike other QA datasets like SQuAD in which the answers are extractive, the answers are allowed to be abstractive. The task requires the model to read a short tweet and a question and output a text phrase (which does not need to be in the tweet) as the answer.", - "paper_name": "TWEETQA: A Social Media Focused Question Answering Dataset", - "paper_abstract": "With social media becoming increasingly popular on which lots of news and real-time events are reported, developing automated question answering systems is critical to the effectiveness of many applications that rely on real-time knowledge. While previous datasets have concentrated on question answering (QA) for formal text like news and Wikipedia, we present the first large-scale dataset for QA over social media data. To ensure that the tweets we collected are useful, we only gather tweets used by journalists to write news articles. We then ask human annotators to write questions and answers upon these tweets. Unlike other QA datasets like SQuAD in which the answers are extractive, we allow the answers to be abstractive. We show that two recently proposed neural models that perform well on formal texts are limited in their performance when applied to our dataset. In addition, even the fine-tuned BERT model is still lagging behind human performance with a large margin. Our results thus point to the need of improved QA systems targeting social media text." - }, - "ccaligned_multilingual": { - "pwc_id": "ccaligned", - "dataset_name": "CCAligned Dataset", - "dataset_abstract": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring that corresponding language codes appeared in the URLs of the web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. 
Recognizing that each English document was often aligned to multiple documents in different target language, it is possible to join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).", - "paper_name": "CCAligned: A Massive Collection of Cross-Lingual Web-Document Pairs", - "paper_abstract": "Cross-lingual document alignment aims to identify pairs of documents in two distinct languages that are of comparable content or translations of each other. In this paper, we exploit the signals embedded in URLs to label web documents at scale with an average precision of 94.5% across different language pairs. We mine sixty-eight snapshots of the Common Crawl corpus and identify web document pairs that are translations of each other. We release a new web dataset consisting of over 392 million URL pairs from Common Crawl covering documents in 8144 language pairs of which 137 pairs include English. In addition to curating this massive dataset, we introduce baseline methods that leverage cross-lingual representations to identify aligned documents based on their textual content. Finally, we demonstrate the value of this parallel documents dataset through a downstream task of mining parallel sentences and measuring the quality of machine translations from models trained on this mined data. Our objective in releasing this dataset is to foster new research in cross-lingual NLP across a variety of low, medium, and high-resource languages." - }, - "cmrc2018": { - "pwc_id": "cmrc-2018", - "dataset_name": "CMRC 2018 Dataset", - "dataset_abstract": "CMRC 2018 is a dataset for Chinese Machine Reading Comprehension. Specifically, it is a span-extraction reading comprehension dataset that is similar to SQuAD.", - "paper_name": "A Span-Extraction Dataset for Chinese Machine Reading Comprehension", - "paper_abstract": "Machine Reading Comprehension (MRC) has become enormously popular recently and has attracted a lot of attention. However, the existing reading comprehension datasets are mostly in English. In this paper, we introduce a Span-Extraction dataset for Chinese machine reading comprehension to add language diversities in this area. The dataset is composed by near 20,000 real questions annotated on Wikipedia paragraphs by human experts. We also annotated a challenge set which contains the questions that need comprehensive understanding and multi-sentence inference throughout the context. We present several baseline systems as well as anonymous submissions for demonstrating the difficulties in this dataset. With the release of the dataset, we hosted the Second Evaluation Workshop on Chinese Machine Reading Comprehension (CMRC 2018). We hope the release of the dataset could further accelerate the Chinese machine reading comprehension research. Resources are available: https://github.com/ymcui/cmrc2018" - }, - "schema_guided_dstc8": { - "pwc_id": "sgd", - "dataset_name": "SGD Dataset", - "dataset_abstract": "The Schema-Guided Dialogue (SGD) dataset consists of over 20k annotated multi-domain, task-oriented conversations between a human and a virtual assistant. These conversations involve interactions with services and APIs spanning 20 domains, ranging from banks and events to media, calendar, travel, and weather. For most of these domains, the dataset contains multiple different APIs, many of which have overlapping functionalities but different interfaces, which reflects common real-world scenarios. 
The wide range of available annotations can be used for intent prediction, slot filling, dialogue state tracking, policy imitation learning, language generation, and user simulation learning, among other tasks in large-scale virtual assistants. Besides these, the dataset has unseen domains and services in the evaluation set to quantify the performance in zero-shot or few-shot settings.", - "paper_name": "Towards Scalable Multi-domain Conversational Agents: The Schema-Guided Dialogue Dataset", - "paper_abstract": "Virtual assistants such as Google Assistant, Alexa and Siri provide a conversational interface to a large number of services and APIs spanning multiple domains. Such systems need to support an ever-increasing number of services with possibly overlapping functionality. Furthermore, some of these services have little to no training data available. Existing public datasets for task-oriented dialogue do not sufficiently capture these challenges since they cover few domains and assume a single static ontology per domain. In this work, we introduce the Schema-Guided Dialogue (SGD) dataset, containing over 16k multi-domain conversations spanning 16 domains. Our dataset exceeds the existing task-oriented dialogue corpora in scale, while also highlighting the challenges associated with building large-scale virtual assistants. It provides a challenging testbed for a number of tasks including language understanding, slot filling, dialogue state tracking and response generation. Along the same lines, we present a schema-guided paradigm for task-oriented dialogue, in which predictions are made over a dynamic set of intents and slots, provided as input, using their natural language descriptions. This allows a single dialogue system to easily support a large number of services and facilitates simple integration of new services without requiring additional training data. Building upon the proposed paradigm, we release a model for dialogue state tracking capable of zero-shot generalization to new APIs, while remaining competitive in the regular setting." - }, - "empathetic_dialogues": { - "pwc_id": "empatheticdialogues", - "dataset_name": "EmpatheticDialogues Dataset", - "dataset_abstract": "The EmpatheticDialogues dataset is a large-scale multi-turn empathetic dialogue dataset collected on Amazon Mechanical Turk, containing 24,850 one-to-one open-domain conversations. Each conversation was obtained by pairing two crowd-workers: a speaker and a listener. The speaker is asked to talk about their personal emotional feelings. The listener infers the underlying emotion through what the speaker says and responds empathetically. The dataset provides 32 evenly distributed emotion labels.", - "paper_name": "Towards Empathetic Open-domain Conversation Models: a New Benchmark and Dataset", - "paper_abstract": "One challenge for dialogue agents is recognizing feelings in the conversation partner and replying accordingly, a key communicative skill. While it is straightforward for humans to recognize and acknowledge others' feelings in a conversation, this is a significant challenge for AI systems due to the paucity of suitable publicly-available datasets for training and evaluation. This work proposes a new benchmark for empathetic dialogue generation and EmpatheticDialogues, a novel dataset of 25k conversations grounded in emotional situations. 
Our experiments indicate that dialogue models that use our dataset are perceived to be more empathetic by human evaluators, compared to models merely trained on large-scale Internet conversation data. We also present empirical comparisons of dialogue model adaptations for empathetic responding, leveraging existing models or datasets without requiring lengthy re-training of the full model." - }, - "kd_conv": { - "pwc_id": "kdconv", - "dataset_name": "KdConv Dataset", - "dataset_abstract": "KdConv is a Chinese multi-domain Knowledge-driven Conversation dataset, grounding the topics in multi-turn conversations to knowledge graphs. KdConv contains 4.5K conversations from three domains (film, music, and travel), and 86K utterances with an average turn number of 19.0. These conversations contain in-depth discussions on related topics and natural transition between multiple topics, while the corpus can also used for exploration of transfer learning and domain adaptation.", - "paper_name": "KdConv: A Chinese Multi-domain Dialogue Dataset Towards Multi-turn Knowledge-driven Conversation", - "paper_abstract": "The research of knowledge-driven conversational systems is largely limited due to the lack of dialog data which consist of multi-turn conversations on multiple topics and with knowledge annotations. In this paper, we propose a Chinese multi-domain knowledge-driven conversation dataset, KdConv, which grounds the topics in multi-turn conversations to knowledge graphs. Our corpus contains 4.5K conversations from three domains (film, music, and travel), and 86K utterances with an average turn number of 19.0. These conversations contain in-depth discussions on related topics and natural transition between multiple topics. To facilitate the following research on this corpus, we provide several benchmark models. Comparative results show that the models can be enhanced by introducing background knowledge, yet there is still a large space for leveraging knowledge to model multi-turn conversations for further research. Results also show that there are obvious performance differences between different domains, indicating that it is worth to further explore transfer learning and domain adaptation. The corpus and benchmark models are publicly available." - }, - "food101": { - "pwc_id": "food-101", - "dataset_name": "Food-101 Dataset", - "dataset_abstract": "The Food-101 dataset consists of 101 food categories with 750 training and 250 test images per category, making a total of 101k images. The labels for the test images have been manually cleaned, while the training set contains some noise.", - "paper_name": "", - "paper_abstract": "" - }, - "eurlex": { - "pwc_id": "eurlex57k", - "dataset_name": "EURLEX57K Dataset", - "dataset_abstract": "EURLEX57K is a new publicly available legal LMTC dataset, dubbed EURLEX57K, containing 57k English EU legislative documents from the EUR-LEX portal, tagged with \u223c4.3k labels (concepts) from the European Vocabulary (EUROVOC).", - "paper_name": "Large-Scale Multi-Label Text Classification on EU Legislation", - "paper_abstract": "We consider Large-Scale Multi-Label Text Classification (LMTC) in the legal domain. We release a new dataset of 57k legislative documents from EURLEX, annotated with ~4.3k EUROVOC labels, which is suitable for LMTC, few- and zero-shot learning. Experimenting with several neural classifiers, we show that BIGRUs with label-wise attention perform better than other current state of the art methods. 
Domain-specific WORD2VEC and context-sensitive ELMO embeddings further improve performance. We also find that considering only particular zones of the documents is sufficient. This allows us to bypass BERT's maximum text length limit and fine-tune BERT, obtaining the best results in all but zero-shot learning cases." - }, - "multi_re_qa": { - "pwc_id": "multireqa", - "dataset_name": "MultiReQA Dataset", - "dataset_abstract": "MultiReQA is a cross-domain evaluation for retrieval question answering models. Retrieval question answering (ReQA) is the task of retrieving a sentence-level answer to a question from an open corpus. MultiReQA is a new multi-domain ReQA evaluation suite composed of eight retrieval QA tasks drawn from publicly available QA datasets from the MRQA shared task.\nMultiReQA contains the sentence boundary annotation from eight publicly available QA datasets including SearchQA, TriviaQA, HotpotQA, NaturalQuestions, SQuAD, BioASQ, RelationExtraction, and TextbookQA. Five of these datasets, including SearchQA, TriviaQA, HotpotQA, NaturalQuestions, SQuAD, contain both training and test data, and three, including BioASQ, RelationExtraction, TextbookQA, contain only the test data.", - "paper_name": "MultiReQA: A Cross-Domain Evaluation for Retrieval Question Answering Models", - "paper_abstract": "Retrieval question answering (ReQA) is the task of retrieving a sentence-level answer to a question from an open corpus (Ahmad et al., 2019). This paper presents MultiReQA, a new multi-domain ReQA evaluation suite composed of eight retrieval QA tasks drawn from publicly available QA datasets. We provide the first systematic retrieval based evaluation over these datasets using two supervised neural models, based on fine-tuning BERT and USE-QA models respectively, as well as a surprisingly strong information retrieval baseline, BM25. Five of these tasks contain both training and test data, while three contain test data only. Performance on the five tasks with training data shows that while a general model covering all domains is achievable, the best performance is often obtained by training exclusively on in-domain data." - }, - "conceptual_captions": { - "pwc_id": "conceptual-captions", - "dataset_name": "Conceptual Captions Dataset", - "dataset_abstract": "Automatic image captioning is the task of producing a natural-language utterance (usually a sentence) that correctly reflects the visual content of an image. Up to this point, the resource most used for this task was the MS-COCO dataset, containing around 120,000 images and 5-way image-caption annotations (produced by paid annotators).\n\nGoogle's Conceptual Captions dataset has more than 3 million images, paired with natural-language captions. In contrast with the curated style of the MS-COCO images, Conceptual Captions images and their raw descriptions are harvested from the web, and therefore represent a wider variety of styles. The raw descriptions are harvested from the Alt-text HTML attribute associated with web images. 
The authors developed an automatic pipeline that extracts, filters, and transforms candidate image/caption pairs, with the goal of achieving a balance of cleanliness, informativeness, fluency, and learnability of the resulting captions.", - "paper_name": "Conceptual Captions: A Cleaned, Hypernymed, Image Alt-text Dataset For Automatic Image Captioning", - "paper_abstract": "We present a new dataset of image caption annotations, Conceptual Captions, which contains an order of magnitude more images than the MS-COCO dataset (Lin et al., 2014) and represents a wider variety of both images and image caption styles. We achieve this by extracting and filtering image caption annotations from billions of webpages. We also present quantitative evaluations of a number of image captioning models and show that a model architecture based on Inception-ResNetv2 (Szegedy et al., 2016) for image-feature extraction and Transformer (Vaswani et al., 2017) for sequence modeling achieves the best performance when trained on the Conceptual Captions dataset." - }, - "cuad": { - "pwc_id": "cuad", - "dataset_name": "CUAD Dataset", - "dataset_abstract": "Contract Understanding Atticus Dataset (CUAD) is a dataset for legal contract review. CUAD was created with dozens of legal experts from The Atticus Project\nand consists of over 13,000 annotations. The task is to highlight salient portions of a contract that are important for a human to review.", - "paper_name": "CUAD: An Expert-Annotated NLP Dataset for Legal Contract Review", - "paper_abstract": "Many specialized domains remain untouched by deep learning, as large labeled datasets require expensive expert annotators. We address this bottleneck within the legal domain by introducing the Contract Understanding Atticus Dataset (CUAD), a new dataset for legal contract review. CUAD was created with dozens of legal experts from The Atticus Project and consists of over 13,000 annotations. The task is to highlight salient portions of a contract that are important for a human to review. We find that Transformer models have nascent performance, but that this performance is strongly influenced by model design and training dataset size. Despite these promising results, there is still substantial room for improvement. As one of the only large, specialized NLP benchmarks annotated by experts, CUAD can serve as a challenging research benchmark for the broader NLP community." - }, - "ms_marco": { - "pwc_id": "ms-marco", - "dataset_name": "MS MARCO Dataset", - "dataset_abstract": "The MS MARCO (Microsoft MAchine Reading Comprehension) is a collection of datasets focused on deep learning in search.\nThe first dataset was a question answering dataset featuring 100,000 real Bing questions and a human generated answer. Over time the collection was extended with a 1,000,000 question dataset, a natural language generation dataset, a passage ranking dataset, keyphrase extraction dataset, crawling dataset, and a conversational search.", - "paper_name": "MS MARCO: A Human Generated MAchine Reading COmprehension Dataset", - "paper_abstract": "We introduce a large scale MAchine Reading COmprehension dataset, which we\nname MS MARCO. The dataset comprises of 1,010,916 anonymized\nquestions---sampled from Bing's search query logs---each with a human generated\nanswer and 182,669 completely human rewritten generated answers. 
In addition,\nthe dataset contains 8,841,823 passages---extracted from 3,563,535 web\ndocuments retrieved by Bing---that provide the information necessary for\ncurating the natural language answers. A question in the MS MARCO dataset may\nhave multiple answers or no answers at all. Using this dataset, we propose\nthree different tasks with varying levels of difficulty: (i) predict if a\nquestion is answerable given a set of context passages, and extract and\nsynthesize the answer as a human would (ii) generate a well-formed answer (if\npossible) based on the context passages that can be understood with the\nquestion and passage context, and finally (iii) rank a set of retrieved\npassages given a question. The size of the dataset and the fact that the\nquestions are derived from real user search queries distinguishes MS MARCO from\nother well-known publicly available datasets for machine reading comprehension\nand question-answering. We believe that the scale and the real-world nature of\nthis dataset makes it attractive for benchmarking machine reading comprehension\nand question-answering models." - }, - "natural_questions": { - "pwc_id": "natural-questions", - "dataset_name": "Natural Questions Dataset", - "dataset_abstract": "The Natural Questions corpus is a question answering dataset containing 307,373 training examples, 7,830 development examples, and 7,842 test examples. Each example is comprised of a google.com query and a corresponding Wikipedia page. Each Wikipedia page has a passage (or long answer) annotated on the page that answers the question and one or more short spans from the annotated passage containing the actual answer. The long and the short answer annotations can however be empty. If they are both empty, then there is no answer on the page at all. If the long answer annotation is non-empty, but the short answer annotation is empty, then the annotated passage answers the question but no explicit short answer could be found. Finally 1% of the documents have a passage annotated with a short answer that is \u201cyes\u201d or \u201cno\u201d, instead of a list of short spans.", - "paper_name": "Natural Questions: a Benchmark for Question Answering Research", - "paper_abstract": "We present the Natural Questions corpus, a question answering dataset. Questions consist of real anonymized, aggregated queries issued to the Google search engine. An annotator is presented with a question along with a Wikipedia page from the top 5 search results, and annotates a long answer (typically a paragraph) and a short answer (one or more entities) if present on the page, or marks null if no long/short answer is present. The public release consists of 307,373 training examples with single annotations, 7,830 examples with 5-way annotations for development data, and a further 7,842 examples 5-way annotated sequestered as test data. We present experiments validating quality of the data. We also describe analysis of 25-way annotations on 302 examples, giving insights into human variability on the annotation task. We introduce robust metrics for the purposes of evaluating question answering systems; demonstrate high human upper bounds on these metrics; and establish baseline results using competitive methods drawn from related literature." 
- }, - "reddit_tifu": { - "pwc_id": "reddit-tifu", - "dataset_name": "Reddit TIFU Dataset", - "dataset_abstract": "Reddit TIFU dataset is a newly collected Reddit dataset, where TIFU denotes the name of /r/tifu subbreddit.\nThere are 122,933 text-summary pairs in total.", - "paper_name": "Abstractive Summarization of Reddit Posts with Multi-level Memory Networks", - "paper_abstract": "We address the problem of abstractive summarization in two directions:\nproposing a novel dataset and a new model. First, we collect Reddit TIFU\ndataset, consisting of 120K posts from the online discussion forum Reddit. We\nuse such informal crowd-generated posts as text source, in contrast with\nexisting datasets that mostly use formal documents as source such as news\narticles. Thus, our dataset could less suffer from some biases that key\nsentences usually locate at the beginning of the text and favorable summary\ncandidates are already inside the text in similar forms. Second, we propose a\nnovel abstractive summarization model named multi-level memory networks (MMN),\nequipped with multi-level memory to store the information of text from\ndifferent levels of abstraction. With quantitative evaluation and user studies\nvia Amazon Mechanical Turk, we show the Reddit TIFU dataset is highly\nabstractive and the MMN outperforms the state-of-the-art summarization models." - }, - "un_pc": { - "pwc_id": "united-nations-parallel-corpus", - "dataset_name": "United Nations Parallel Corpus Dataset", - "dataset_abstract": "The first parallel corpus composed from United Nations documents published by the original data creator. The parallel corpus presented consists of manually translated UN documents from the last 25 years (1990 to 2014) for the six official UN languages, Arabic, Chinese, English, French, Russian, and Spanish.", - "paper_name": "The United Nations Parallel Corpus v1.0", - "paper_abstract": "This paper describes the creation process and statistics of the official United Nations Parallel Corpus, the first parallel corpus composed from United Nations documents published by the original data creator. The parallel corpus presented consists of manually translated UN documents from the last 25 years (1990 to 2014) for the six official UN languages, Arabic, Chinese, English, French, Russian, and Spanish. The corpus is freely available for download under a liberal license. Apart from the pairwise aligned documents, a fully aligned subcorpus for the six official UN languages is distributed. We provide baseline BLEU scores of our Moses-based SMT systems trained with the full data of language pairs involving English and for all possible translation directions of the six-way subcorpus." - }, - "allocine": { - "pwc_id": "allocine", - "dataset_name": "AlloCine Dataset", - "dataset_abstract": "A new dataset for sentiment analysis, scraped from Allocin\u00e9.fr user reviews. It contains 100k positive and 100k negative reviews divided into 3 balanced splits: train (160k reviews), val (20k) and test (20k).", - "paper_name": "", - "paper_abstract": "" - }, - "wiki_atomic_edits": { - "pwc_id": "wikiatomicedits", - "dataset_name": "WikiAtomicEdits Dataset", - "dataset_abstract": "WikiAtomicEdits is a corpus of 43 million atomic edits across 8 languages. 
These edits are mined from Wikipedia edit history and consist of instances in which a human editor has inserted a single contiguous phrase into, or deleted a single contiguous phrase from, an existing sentence.", - "paper_name": "WikiAtomicEdits: A Multilingual Corpus of Wikipedia Edits for Modeling Language and Discourse", - "paper_abstract": "We release a corpus of 43 million atomic edits across 8 languages. These\nedits are mined from Wikipedia edit history and consist of instances in which a\nhuman editor has inserted a single contiguous phrase into, or deleted a single\ncontiguous phrase from, an existing sentence. We use the collected data to show\nthat the language generated during editing differs from the language that we\nobserve in standard corpora, and that models trained on edits encode different\naspects of semantics and discourse than models trained on raw, unstructured\ntext. We release the full corpus as a resource to aid ongoing research in\nsemantics, discourse, and representation learning." - }, - "sentiment140": { - "pwc_id": "sentiment140", - "dataset_name": "Sentiment140 Dataset", - "dataset_abstract": "Sentiment140 is a dataset that allows you to discover the sentiment of a brand, product, or topic on Twitter.", - "paper_name": "", - "paper_abstract": "" - }, - "doqa": { - "pwc_id": "doqa", - "dataset_name": "DoQA Dataset", - "dataset_abstract": "A dataset with 2,437 dialogues and 10,917 QA pairs. The dialogues are collected from three Stack Exchange sites using the Wizard of Oz method with crowdsourcing.", - "paper_name": "DoQA -- Accessing Domain-Specific FAQs via Conversational QA", - "paper_abstract": "The goal of this work is to build conversational Question Answering (QA) interfaces for the large body of domain-specific information available in FAQ sites. We present DoQA, a dataset with 2,437 dialogues and 10,917 QA pairs. The dialogues are collected from three Stack Exchange sites using the Wizard of Oz method with crowdsourcing. Compared to previous work, DoQA comprises well-defined information needs, leading to more coherent and natural conversations with less factoid questions and is multi-domain. In addition, we introduce a more realistic information retrieval(IR) scenario where the system needs to find the answer in any of the FAQ documents. The results of an existing, strong, system show that, thanks to transfer learning from a Wikipedia QA dataset and fine tuning on a single FAQ domain, it is possible to build high quality conversational QA systems for FAQs without in-domain training data. The good results carry over into the more challenging IR scenario. In both cases, there is still ample room for improvement, as indicated by the higher human upperbound." - }, - "definite_pronoun_resolution": { - "pwc_id": "definite-pronoun-resolution-dataset", - "dataset_name": "Definite Pronoun Resolution Dataset Dataset", - "dataset_abstract": "Composes sentence pairs (i.e., twin sentences).", - "paper_name": "", - "paper_abstract": "" - }, - "search_qa": { - "pwc_id": "searchqa", - "dataset_name": "SearchQA Dataset", - "dataset_abstract": "SearchQA was built using an in-production, commercial search engine. 
It closely reflects the full pipeline of a (hypothetical) general question-answering system, which consists of information retrieval and answer synthesis.", - "paper_name": "SearchQA: A New Q&A Dataset Augmented with Context from a Search Engine", - "paper_abstract": "We publicly release a new large-scale dataset, called SearchQA, for machine\ncomprehension, or question-answering. Unlike recently released datasets, such\nas DeepMind CNN/DailyMail and SQuAD, the proposed SearchQA was constructed to\nreflect a full pipeline of general question-answering. That is, we start not\nfrom an existing article and generate a question-answer pair, but start from an\nexisting question-answer pair, crawled from J! Archive, and augment it with\ntext snippets retrieved by Google. Following this approach, we built SearchQA,\nwhich consists of more than 140k question-answer pairs with each pair having\n49.6 snippets on average. Each question-answer-context tuple of the SearchQA\ncomes with additional meta-data such as the snippet's URL, which we believe\nwill be valuable resources for future research. We conduct human evaluation as\nwell as test two baseline methods, one simple word selection and the other deep\nlearning based, on the SearchQA. We show that there is a meaningful gap between\nthe human and machine performances. This suggests that the proposed dataset\ncould well serve as a benchmark for question-answering." - }, - "reuters21578": { - "pwc_id": "reuters-21578", - "dataset_name": "Reuters-21578 Dataset", - "dataset_abstract": "The Reuters-21578 dataset is a collection of documents with news articles. The original corpus has 10,369 documents and a vocabulary of 29,930 words.", - "paper_name": "", - "paper_abstract": "" - }, - "assin": { - "pwc_id": "assin", - "dataset_name": "ASSIN Dataset", - "dataset_abstract": "ASSIN (Avalia\u00e7\u00e3o de Similaridade Sem\u00e2ntica e INfer\u00eancia textual) is a dataset with semantic similarity score and entailment annotations. It was used in a shared task in the PROPOR 2016 conference.\n\nThe full dataset has 10,000 sentence pairs, half of which in Brazilian Portuguese and half in European Portuguese. Either language variant has 2,500 pairs for training, 500 for validation and 2,000 for testing. This is different from the split used in the shared task, in which the training set had 3,000 pairs and there was no validation set. The shared task training set can be reconstructed by simply merging both sets.", - "paper_name": "", - "paper_abstract": "" - }, - "taskmaster2": { - "pwc_id": "taskmaster-2", - "dataset_name": "Taskmaster-2 Dataset", - "dataset_abstract": "The Taskmaster-2 dataset consists of 17,289 dialogs in seven domains: restaurants (3276), food ordering (1050), movies (3047), hotels (2355), flights (2481), music (1602), and sports (3478).", - "paper_name": "Taskmaster-1: Toward a Realistic and Diverse Dialog Dataset", - "paper_abstract": "A significant barrier to progress in data-driven approaches to building dialog systems is the lack of high quality, goal-oriented conversational data. To help satisfy this elementary requirement, we introduce the initial release of the Taskmaster-1 dataset which includes 13,215 task-based dialogs comprising six domains. Two procedures were used to create this collection, each with unique advantages. 
The first involves a two-person, spoken \"Wizard of Oz\" (WOz) approach in which trained agents and crowdsourced workers interact to complete the task while the second is \"self-dialog\" in which crowdsourced workers write the entire dialog themselves. We do not restrict the workers to detailed scripts or to a small knowledge base and hence we observe that our dataset contains more realistic and diverse conversations in comparison to existing datasets. We offer several baseline models including state of the art neural seq2seq architectures with benchmark performance as well as qualitative human evaluations. Dialogs are labeled with API calls and arguments, a simple and cost effective approach which avoids the requirement of complex annotation schema. The layer of abstraction between the dialog model and the service provider API allows for a given model to interact with multiple services that provide similar functionally. Finally, the dataset will evoke interest in written vs. spoken language, discourse patterns, error handling and other linguistic phenomena related to dialog system research, development and design." - }, - "open_subtitles": { - "pwc_id": "opensubtitles", - "dataset_name": "OpenSubtitles Dataset", - "dataset_abstract": "OpenSubtitles is collection of multilingual parallel corpora. The dataset is compiled from a large database of movie and TV subtitles and includes a total of 1689 bitexts spanning 2.6 billion sentences across 60 languages.", - "paper_name": "OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles", - "paper_abstract": "We present a new major release of the OpenSubtitles collection of parallel corpora. The release is compiled from a large database of movie and TV subtitles and includes a total of 1689 bitexts spanning 2.6 billion sentences across 60 languages. The release also incorporates a number of enhancements in the preprocessing and alignment of the subtitles, such as the automatic correction of OCR errors and the use of meta-data to estimate the quality of each subtitle and score subtitle pairs." - }, - "cc100": { - "pwc_id": "cc100", - "dataset_name": "CC100 Dataset", - "dataset_abstract": "This corpus comprises of monolingual data for 100+ languages and also includes data for romanized languages. This was constructed using the urls and paragraph indices provided by the CC-Net repository by processing January-December 2018 Commoncrawl snapshots. Each file comprises of documents separated by double-newlines and paragraphs within the same document separated by a newline. The data is generated using the open source CC-Net repository.", - "paper_name": "Unsupervised Cross-lingual Representation Learning at Scale", - "paper_abstract": "This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +14.6% average accuracy on XNLI, +13% average F1 score on MLQA, and +2.4% F1 score on NER. XLM-R performs particularly well on low-resource languages, improving 15.7% in XNLI accuracy for Swahili and 11.4% for Urdu over previous XLM models. 
We also present a detailed empirical analysis of the key factors that are required to achieve these gains, including the trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We will make our code, data and models publicly available." - }, - "euronews": { - "pwc_id": "europeana-newspapers", - "dataset_name": "Europeana Newspapers Dataset", - "dataset_abstract": "Europeana Newspapers consists of four datasets with 100 pages each for the languages Dutch, French, and German (including Austrian). Produced as part of the Europeana Newspapers project, it is expected to contribute to the further development and improvement of named entity recognition systems with a focus on historical content.", - "paper_name": "An Open Corpus for Named Entity Recognition in Historic Newspapers", - "paper_abstract": "The availability of openly available textual datasets ({``}corpora{''}) with highly accurate manual annotations ({``}gold standard{''}) of named entities (e.g. persons, locations, organizations, etc.) is crucial in the training and evaluation of named entity recognition systems. Currently there are only few such datasets available on the web, and even less for texts containing historical spelling variation. The production and subsequent release into the public domain of four such datasets with 100 pages each for the languages Dutch, French, German (including Austrian) as part of the Europeana Newspapers project is expected to contribute to the further development and improvement of named entity recognition systems with a focus on historical content. This paper describes how these datasets were produced, what challenges were encountered in their creation and informs about their final quality and availability." - }, - "fashion_mnist": { - "pwc_id": "fashion-mnist", - "dataset_name": "Fashion-MNIST Dataset", - "dataset_abstract": "Fashion-MNIST is a dataset comprising 28\u00d728 grayscale images of 70,000 fashion products from 10 categories, with 7,000 images per category. The training set has 60,000 images and the test set has 10,000 images. Fashion-MNIST shares the same image size, data format and the structure of training and testing splits with the original MNIST.", - "paper_name": "Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms", - "paper_abstract": "We present Fashion-MNIST, a new dataset comprising of 28x28 grayscale images\nof 70,000 fashion products from 10 categories, with 7,000 images per category.\nThe training set has 60,000 images and the test set has 10,000 images.\nFashion-MNIST is intended to serve as a direct drop-in replacement for the\noriginal MNIST dataset for benchmarking machine learning algorithms, as it\nshares the same image size, data format and the structure of training and\ntesting splits. 
The dataset is freely available at\nhttps://github.com/zalandoresearch/fashion-mnist" - }, - "generics_kb": { - "pwc_id": "genericskb", - "dataset_name": "GenericsKB Dataset", - "dataset_abstract": "The GenericsKB contains 3.4M+ generic sentences about the world, i.e., sentences expressing general truths such as \"Dogs bark,\" and \"Trees remove carbon dioxide from the atmosphere.\" Generics are potentially useful as a knowledge source for AI systems requiring general world knowledge. The GenericsKB is the first large-scale resource containing naturally occurring generic sentences (as opposed to extracted or crowdsourced triples), and is rich in high-quality, general, semantically complete statements. Generics were primarily extracted from three large text sources, namely the Waterloo Corpus, selected parts of Simple Wikipedia, and the ARC Corpus. A filtered, high-quality subset is also available in GenericsKB-Best, containing 1,020,868 sentences.", - "paper_name": "GenericsKB: A Knowledge Base of Generic Statements", - "paper_abstract": "We present a new resource for the NLP community, namely a large (3.5M+ sentence) knowledge base of *generic statements*, e.g., \"Trees remove carbon dioxide from the atmosphere\", collected from multiple corpora. This is the first large resource to contain *naturally occurring* generic sentences, as opposed to extracted or crowdsourced triples, and thus is rich in high-quality, general, semantically complete statements. All GenericsKB sentences are annotated with their topical term, surrounding context (sentences), and a (learned) confidence. We also release GenericsKB-Best (1M+ sentences), containing the best-quality generics in GenericsKB augmented with selected, synthesized generics from WordNet and ConceptNet. In tests on two existing datasets requiring multihop reasoning (OBQA and QASC), we find using GenericsKB can result in higher scores and better explanations than using a much larger corpus. This demonstrates that GenericsKB can be a useful resource for NLP applications, as well as providing data for linguistic studies of generics and their semantics. GenericsKB is available at https://allenai.org/data/genericskb." - }, - "bianet": { - "pwc_id": "bianet", - "dataset_name": "Bianet Dataset", - "dataset_abstract": "Bianet is a parallel news corpus in Turkish, Kurdish and English\nIt contains 3,214 Turkish articles with their sentence-aligned Kurdish or English translations from the Bianet online newspaper.", - "paper_name": "Bianet: A Parallel News Corpus in Turkish, Kurdish and English", - "paper_abstract": "We present a new open-source parallel corpus consisting of news articles\ncollected from the Bianet magazine, an online newspaper that publishes Turkish\nnews, often along with their translations in English and Kurdish. In this\npaper, we describe the collection process of the corpus and its statistical\nproperties. We validate the benefit of using the Bianet corpus by evaluating\nbilingual and multilingual neural machine translation models in English-Turkish\nand English-Kurdish directions." 
- }, - "squad_es": { - "pwc_id": "squad-es", - "dataset_name": "SQuAD-es Dataset", - "dataset_abstract": "Stanford Question Answering Dataset (SQuAD) into Spanish.", - "paper_name": "", - "paper_abstract": "" - }, - "newsqa": { - "pwc_id": "newsqa", - "dataset_name": "NewsQA Dataset", - "dataset_abstract": "The NewsQA dataset is a crowd-sourced machine reading comprehension dataset of 120,000 question-answer pairs.\n\n\nDocuments are CNN news articles.\nQuestions are written by human users in natural language.\nAnswers may be multiword passages of the source text.\nQuestions may be unanswerable.\nNewsQA is collected using a 3-stage, siloed process.\nQuestioners see only an article\u2019s headline and highlights.\nAnswerers see the question and the full article, then select an answer passage.\nValidators see the article, the question, and a set of answers that they rank.\nNewsQA is more natural and more challenging than previous datasets.", - "paper_name": "NewsQA: A Machine Comprehension Dataset", - "paper_abstract": "We present NewsQA, a challenging machine comprehension dataset of over\n100,000 human-generated question-answer pairs. Crowdworkers supply questions\nand answers based on a set of over 10,000 news articles from CNN, with answers\nconsisting of spans of text from the corresponding articles. We collect this\ndataset through a four-stage process designed to solicit exploratory questions\nthat require reasoning. A thorough analysis confirms that NewsQA demands\nabilities beyond simple word matching and recognizing textual entailment. We\nmeasure human performance on the dataset and compare it to several strong\nneural models. The performance gap between humans and machines (0.198 in F1)\nindicates that significant progress can be made on NewsQA through future\nresearch. The dataset is freely available at\nhttps://datasets.maluuba.com/NewsQA." - }, - "matinf": { - "pwc_id": "matinf", - "dataset_name": "MATINF Dataset", - "dataset_abstract": "Maternal and Infant (MATINF) Dataset is a large-scale dataset jointly labeled for classification, question answering and summarization in the domain of maternity and baby caring in Chinese. An entry in the dataset includes four fields: question (Q), description (D), class (C) and answer (A).\n\nNearly two million question-answer pairs are collected with fine-grained human-labeled classes from a large Chinese maternity and baby caring QA site. Authors conduct both automatic and manual data cleansing and remove: (1) classes with insufficient samples; (2) entries in which the length of the description filed is less than the length of the question field; (3) data with any field longer than 256 characters; (4) human-spotted ill-formed data. After the data cleansing, MATINF is constructed with the remaining 1.07 million entries", - "paper_name": "MATINF: A Jointly Labeled Large-Scale Dataset for Classification, Question Answering and Summarization", - "paper_abstract": "Recently, large-scale datasets have vastly facilitated the development in nearly all domains of Natural Language Processing. However, there is currently no cross-task dataset in NLP, which hinders the development of multi-task learning. We propose MATINF, the first jointly labeled large-scale dataset for classification, question answering and summarization. MATINF contains 1.07 million question-answer pairs with human-labeled categories and user-generated question descriptions. 
Based on such rich information, MATINF is applicable for three major NLP tasks, including classification, question answering, and summarization. We benchmark existing methods and a novel multi-task baseline over MATINF to inspire further research. Our comprehensive comparison and experiments over MATINF and other datasets demonstrate the merits held by MATINF." - }, - "sberquad": { - "pwc_id": "sberquad", - "dataset_name": "SberQuAD Dataset", - "dataset_abstract": "A large scale analogue of Stanford SQuAD in the Russian language - is a valuable resource that has not been properly presented to the scientific community. \n\nSee DeepPavlov link\n\nModel results\n| Model config | EM (dev) | F-1 (dev) |\n|------------------------------|-------------|-------------|\n|DeepPavlov RuBERT | 66.30+-0.24 | 84.60+-0.11 |\n| DeepPavlov multilingual BERT | 64.35+-0.39 | 83.39+-0.08 |\n| DeepPavlov R-Net | 60.62 | 80.04 |", - "paper_name": "SberQuAD -- Russian Reading Comprehension Dataset: Description and Analysis", - "paper_abstract": "SberQuAD -- a large scale analog of Stanford SQuAD in the Russian language - is a valuable resource that has not been properly presented to the scientific community. We fill this gap by providing a description, a thorough analysis, and baseline experimental results." - }, - "ecb": { - "pwc_id": "ecb", - "dataset_name": "ECB+ Dataset", - "dataset_abstract": "The ECB+ corpus is an extension to the EventCorefBank (ECB, Bejan and Harabagiu, 2010). A newly added corpus component consists of 502 documents that belong to the 43 topics of the ECB but that describe different seminal events than those already captured in the ECB. All corpus texts were found through Google Search and were annotated with mentions of events and their times, locations, human and non-human participants as well as with within- and cross-document event and entity coreference information. The 2012 version of annotation of the ECB corpus (Lee et al., 2012) was used as a starting point for re-annotation of the ECB according to the ECB+ annotation guideline.\n\nThe major differences with respect to the 2012 version of annotation of the ECB are:\n\n(a) five event components are annotated in text:\n\nactions (annotation tags starting with ACTION and NEG)\ntimes (annotation tags starting with TIME)\nlocations (annotation tags starting with LOC)\nhuman participants (annotation tags starting with HUMAN)\nnon-human participants (annotation tags starting with NON_HUMAN)\n\n(b) specific action classes and entity subtypes are distinguished for each of the five main event components resulting in a total tagset of 30 annotation tags based on ACE annotation guidelines (LDC 2008), TimeML (Pustejovsky et al., 2003 and Sauri et al., 2005)\n(c) intra- and cross-document coreference relations between mentions of the five event components were established:\n\nINTRA_DOC_COREF tag captures within document coreference chains that do not participate in cross-document relations; within document coreference was annotated by means of the CAT tool (Bartalesi et al., 2012)\nCROSS_DOC_COREF tag indicates cross-document coreference relations created in the CROMER tool (Girardi et al., 2014); all coreference branches refer by means of relation target IDs to the so called TAG_DESCRIPTORS, pointing to human friendly instance names (assigned by coders) and also to instance_id-s\n\n(d) events are annotated from an \u201cevent-centric\u201d perspective, i.e. 
annotation tags are assigned depending on the role a mention plays in an event (for more information see ECB+ references).", - "paper_name": "Using a sledgehammer to crack a nut? Lexical diversity and event coreference resolution", - "paper_abstract": "In this paper we examine the representativeness of the EventCorefBank (ECB, Bejan and Harabagiu, 2010) with regards to the language population of large-volume streams of news. The ECB corpus is one of the data sets used for evaluation of the task of event coreference resolution. Our analysis shows that the ECB in most cases covers one seminal event per domain, what considerably simplifies event and so language diversity that one comes across in the news. We augmented the corpus with a new corpus component, consisting of 502 texts, describing different instances of event types that were already captured by the 43 topics of the ECB, making it more representative of news articles on the web. The new {``}ECB+{''} corpus is available for further research." - }, - "um005": { - "pwc_id": "umc005-english-urdu", - "dataset_name": "UMC005 English-Urdu Dataset", - "dataset_abstract": "UMC005 English-Urdu is a parallel corpus of texts in English and Urdu language with sentence alignments. The corpus can be used for experiments with statistical machine translation.\n\nThe texts come from four different sources:\n\n\nQuran\nBible\nPenn Treebank (Wall Street Journal)\nEmille corpus", - "paper_name": "", - "paper_abstract": "" - }, - "compguesswhat": { - "pwc_id": "compguesswhat", - "dataset_name": "CompGuessWhat?! Dataset", - "dataset_abstract": "CompGuessWhat?! extends the original GuessWhat?! datasets with a rich semantic representations in the form of scene graphs associated with every image used as reference scene for the guessing games.", - "paper_name": "CompGuessWhat?!: A Multi-task Evaluation Framework for Grounded Language Learning", - "paper_abstract": "Approaches to Grounded Language Learning typically focus on a single task-based final performance measure that may not depend on desirable properties of the learned hidden representations, such as their ability to predict salient attributes or to generalise to unseen situations. To remedy this, we present GROLLA, an evaluation framework for Grounded Language Learning with Attributes with three sub-tasks: 1) Goal-oriented evaluation; 2) Object attribute prediction evaluation; and 3) Zero-shot evaluation. We also propose a new dataset CompGuessWhat?! as an instance of this framework for evaluating the quality of learned neural representations, in particular concerning attribute grounding. To this end, we extend the original GuessWhat?! dataset by including a semantic layer on top of the perceptual one. Specifically, we enrich the VisualGenome scene graphs associated with the GuessWhat?! images with abstract and situated attributes. By using diagnostic classifiers, we show that current models learn representations that are not expressive enough to encode object attributes (average F1 of 44.27). In addition, they do not learn strategies nor representations that are robust enough to perform well when novel scenes or objects are involved in gameplay (zero-shot best accuracy 50.06%)." - }, - "irc_disentangle": { - "pwc_id": "irc-disentanglement", - "dataset_name": "irc-disentanglement Dataset", - "dataset_abstract": "This is a dataset for disentangling conversations on IRC, which is the task of identifying separate conversations in a single stream of messages. 
It contains disentanglement information for 77,563 messages of IRC.", - "paper_name": "A Large-Scale Corpus for Conversation Disentanglement", - "paper_abstract": "Disentangling conversations mixed together in a single stream of messages is a difficult task, made harder by the lack of large manually annotated datasets. We created a new dataset of 77,563 messages manually annotated with reply-structure graphs that both disentangle conversations and define internal conversation structure. Our dataset is 16 times larger than all previously released datasets combined, the first to include adjudication of annotation disagreements, and the first to include context. We use our data to re-examine prior work, in particular, finding that 80% of conversations in a widely used dialogue corpus are either missing messages or contain extra messages. Our manually-annotated data presents an opportunity to develop robust data-driven methods for conversation disentanglement, which will help advance dialogue research." - }, - "assin2": { - "pwc_id": "assin2", - "dataset_name": "ASSIN2 Dataset", - "dataset_abstract": "ASSIN 2 is the second edition of the Semantic Similarity Assessment and Textual Inference shared task, held as a workshop in conjunction with STIL 2019.", - "paper_name": "", - "paper_abstract": "" - }, - "doc2dial": { - "pwc_id": "doc2dial", - "dataset_name": "doc2dial Dataset", - "dataset_abstract": "A new dataset of goal-oriented dialogues that are grounded in the associated documents.", - "paper_name": "doc2dial: A Goal-Oriented Document-Grounded Dialogue Dataset", - "paper_abstract": "We introduce doc2dial, a new dataset of goal-oriented dialogues that are grounded in the associated documents. Inspired by how the authors compose documents for guiding end users, we first construct dialogue flows based on the content elements that corresponds to higher-level relations across text sections as well as lower-level relations between discourse units within a section. Then we present these dialogue flows to crowd contributors to create conversational utterances. The dataset includes about 4800 annotated conversations with an average of 14 turns that are grounded in over 480 documents from four domains. Compared to the prior document-grounded dialogue datasets, this dataset covers a variety of dialogue scenes in information-seeking conversations. For evaluating the versatility of the dataset, we introduce multiple dialogue modeling tasks and present baseline approaches." - }, - "weibo_ner": { - "pwc_id": "weibo-ner", - "dataset_name": "Weibo NER Dataset", - "dataset_abstract": "The Weibo NER dataset is a Chinese Named Entity Recognition dataset drawn from the social media website Sina Weibo.", - "paper_name": "", - "paper_abstract": "" - }, - "arcd": { - "pwc_id": "arcd", - "dataset_name": "ARCD Dataset", - "dataset_abstract": "Composed of 1,395 questions posed by crowdworkers on Wikipedia articles, and a machine translation of the Stanford Question Answering Dataset (Arabic-SQuAD).", - "paper_name": "Neural Arabic Question Answering", - "paper_abstract": "This paper tackles the problem of open domain factual Arabic question answering (QA) using Wikipedia as our knowledge source. This constrains the answer of any question to be a span of text in Wikipedia. Open domain QA for Arabic entails three challenges: annotated QA datasets in Arabic, large scale efficient information retrieval and machine reading comprehension. 
To deal with the lack of Arabic QA datasets we present the Arabic Reading Comprehension Dataset (ARCD) composed of 1,395 questions posed by crowdworkers on Wikipedia articles, and a machine translation of the Stanford Question Answering Dataset (Arabic-SQuAD). Our system for open domain question answering in Arabic (SOQAL) is based on two components: (1) a document retriever using a hierarchical TF-IDF approach and (2) a neural reading comprehension model using the pre-trained bi-directional transformer BERT. Our experiments on ARCD indicate the effectiveness of our approach with our BERT-based reader achieving a 61.3 F1 score, and our open domain system SOQAL achieving a 27.6 F1 score." - }, - "qanta": { - "pwc_id": "quizbowl", - "dataset_name": "Quizbowl Dataset", - "dataset_abstract": "Consists of multiple sentences whose clues are arranged by difficulty (from obscure to obvious) and uniquely identify a well-known entity such as those found on Wikipedia.", - "paper_name": "Quizbowl: The Case for Incremental Question Answering", - "paper_abstract": "Scholastic trivia competitions test knowledge and intelligence through mastery of question answering. Modern question answering benchmarks are one variant of the Turing test. Specifically, answering a set of questions as well as a human is a minimum bar towards demonstrating human-like intelligence. This paper makes the case that the format of one competition -- where participants can answer in the middle of hearing a question (incremental) -- better differentiates the skill between (human or machine) players. Additionally, merging a sequential decision-making sub-task with question answering (QA) provides a good setting for research in model calibration and opponent modeling. Thus, embedded in this task are three machine learning challenges: (1) factoid QA over thousands of Wikipedia-like answers, (2) calibration of the QA model's confidence scores, and (3) sequential decision-making that incorporates knowledge of the QA model, its calibration, and what the opponent may do. We make two contributions: (1) collecting and curating a large factoid QA dataset and an accompanying gameplay dataset, and (2) developing a model that addresses these three machine learning challenges. In addition to offline evaluation, we pitted our model against some of the most accomplished trivia players in the world in a series of exhibition matches spanning several years. Throughout this paper, we show that collaborations with the vibrant trivia community have contributed to the quality of our dataset, spawned new research directions, and doubled as an exciting way to engage the public with research in machine learning and natural language processing." - }, - "web_of_science": { - "pwc_id": "web-of-science-dataset", - "dataset_name": "WOS Dataset", - "dataset_abstract": "Web of Science (WOS) is a document classification dataset that contains 46,985 documents with 134 categories which include 7 parents categories.", - "paper_name": "HDLTex: Hierarchical Deep Learning for Text Classification", - "paper_abstract": "The continually increasing number of documents produced each year\nnecessitates ever improving information processing methods for searching,\nretrieving, and organizing text. Central to these information processing\nmethods is document classification, which has become an important application\nfor supervised learning. Recently the performance of these traditional\nclassifiers has degraded as the number of documents has increased. 
This is\nbecause along with this growth in the number of documents has come an increase\nin the number of categories. This paper approaches this problem differently\nfrom current document classification methods that view the problem as\nmulti-class classification. Instead we perform hierarchical classification\nusing an approach we call Hierarchical Deep Learning for Text classification\n(HDLTex). HDLTex employs stacks of deep learning architectures to provide\nspecialized understanding at each level of the document hierarchy." - }, - "imagenet-1k": { - "pwc_id": "imagenet", - "dataset_name": "ImageNet Dataset", - "dataset_abstract": "The ImageNet dataset contains 14,197,122 annotated images according to the WordNet hierarchy. Since 2010 the dataset is used in the ImageNet Large Scale Visual Recognition Challenge (ILSVRC), a benchmark in image classification and object detection.\nThe publicly released dataset contains a set of manually annotated training images. A set of test images is also released, with the manual annotations withheld.\nILSVRC annotations fall into one of two categories: (1) image-level annotation of a binary label for the presence or absence of an object class in the image, e.g., \u201cthere are cars in this image\u201d but \u201cthere are no tigers,\u201d and (2) object-level annotation of a tight bounding box and class label around an object instance in the image, e.g., \u201cthere is a screwdriver centered at position (20,25) with width of 50 pixels and height of 30 pixels\u201d.\nThe ImageNet project does not own the copyright of the images, therefore only thumbnails and URLs of images are provided.\n\n\nTotal number of non-empty WordNet synsets: 21841\nTotal number of images: 14197122\nNumber of images with bounding box annotations: 1,034,908\nNumber of synsets with SIFT features: 1000\nNumber of images with SIFT features: 1.2 million", - "paper_name": "", - "paper_abstract": "" - }, - "lj_speech": { - "pwc_id": "ljspeech", - "dataset_name": "LJSpeech Dataset", - "dataset_abstract": "This is a public domain speech dataset consisting of 13,100 short audio clips of a single speaker reading passages from 7 non-fiction books. A transcription is provided for each clip. Clips vary in length from 1 to 10 seconds and have a total length of approximately 24 hours. The texts were published between 1884 and 1964, and are in the public domain. The audio was recorded in 2016-17 by the LibriVox project and is also in the public domain.", - "paper_name": "", - "paper_abstract": "" - }, - "stereoset": { - "pwc_id": "stereoset", - "dataset_name": "StereoSet Dataset", - "dataset_abstract": "A large-scale natural dataset in English to measure stereotypical biases in four domains: gender, profession, race, and religion.", - "paper_name": "StereoSet: Measuring stereotypical bias in pretrained language models", - "paper_abstract": "A stereotype is an over-generalized belief about a particular group of people, e.g., Asians are good at math or Asians are bad drivers. Such beliefs (biases) are known to hurt target groups. Since pretrained language models are trained on large real world data, they are known to capture stereotypical biases. In order to assess the adverse effects of these models, it is important to quantify the bias captured in them. Existing literature on quantifying bias evaluates pretrained language models on a small set of artificially constructed bias-assessing sentences. 
We present StereoSet, a large-scale natural dataset in English to measure stereotypical biases in four domains: gender, profession, race, and religion. We evaluate popular models like BERT, GPT-2, RoBERTa, and XLNet on our dataset and show that these models exhibit strong stereotypical biases. We also present a leaderboard with a hidden test set to track the bias of future language models at https://stereoset.mit.edu" - }, - "visual_genome": { - "pwc_id": "visual-genome", - "dataset_name": "Visual Genome Dataset", - "dataset_abstract": "Visual Genome contains Visual Question Answering data in a multi-choice setting. It consists of 101,174 images from MSCOCO with 1.7 million QA pairs, 17 questions per image on average. Compared to the Visual Question Answering dataset, Visual Genome represents a more balanced distribution over 6 question types: What, Where, When, Who, Why and How. The Visual Genome dataset also presents 108K images with densely annotated objects, attributes and relationships.", - "paper_name": "Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations", - "paper_abstract": "Despite progress in perceptual tasks such as image classification, computers\nstill perform poorly on cognitive tasks such as image description and question\nanswering. Cognition is core to tasks that involve not just recognizing, but\nreasoning about our visual world. However, models used to tackle the rich\ncontent in images for cognitive tasks are still being trained using the same\ndatasets designed for perceptual tasks. To achieve success at cognitive tasks,\nmodels need to understand the interactions and relationships between objects in\nan image. When asked \"What vehicle is the person riding?\", computers will need\nto identify the objects in an image as well as the relationships riding(man,\ncarriage) and pulling(horse, carriage) in order to answer correctly that \"the\nperson is riding a horse-drawn carriage\".\n In this paper, we present the Visual Genome dataset to enable the modeling of\nsuch relationships. We collect dense annotations of objects, attributes, and\nrelationships within each image to learn these models. Specifically, our\ndataset contains over 100K images where each image has an average of 21\nobjects, 18 attributes, and 18 pairwise relationships between objects. We\ncanonicalize the objects, attributes, relationships, and noun phrases in region\ndescriptions and questions answer pairs to WordNet synsets. Together, these\nannotations represent the densest and largest dataset of image descriptions,\nobjects, attributes, relationships, and question answers." - }, - "kinnews_kirnews": { - "pwc_id": "kinnews-and-kirnews", - "dataset_name": "KINNEWS and KIRNEWS Dataset", - "dataset_abstract": "Two news datasets (KINNEWS and KIRNEWS) for multi-class classification of news articles in Kinyarwanda and Kirundi, two low-resource African languages. The two languages are mutually intelligible.", - "paper_name": "KINNEWS and KIRNEWS: Benchmarking Cross-Lingual Text Classification for Kinyarwanda and Kirundi", - "paper_abstract": "Recent progress in text classification has been focused on high-resource languages such as English and Chinese. For low-resource languages, amongst them most African languages, the lack of well-annotated data and effective preprocessing, is hindering the progress and the transfer of successful methods. 
In this paper, we introduce two news datasets (KINNEWS and KIRNEWS) for multi-class classification of news articles in Kinyarwanda and Kirundi, two low-resource African languages. The two languages are mutually intelligible, but while Kinyarwanda has been studied in Natural Language Processing (NLP) to some extent, this work constitutes the first study on Kirundi. Along with the datasets, we provide statistics, guidelines for preprocessing, and monolingual and cross-lingual baseline models. Our experiments show that training embeddings on the relatively higher-resourced Kinyarwanda yields successful cross-lingual transfer to Kirundi. In addition, the design of the created datasets allows for a wider use in NLP beyond text classification in future studies, such as representation learning, cross-lingual learning with more distant languages, or as base for new annotations for tasks such as parsing, POS tagging, and NER. The datasets, stopwords, and pre-trained embeddings are publicly available at https://github.com/Andrews2017/KINNEWS-and-KIRNEWS-Corpus ." - }, - "per_sent": { - "pwc_id": "persent", - "dataset_name": "PerSenT Dataset", - "dataset_abstract": "PerSenT is a dataset of crowd-sourced annotations of the sentiment expressed by the authors towards the main entities in news articles. The dataset also includes paragraph-level sentiment annotations to provide more fine-grained supervision for the task.", - "paper_name": "Author's Sentiment Prediction", - "paper_abstract": "We introduce PerSenT, a dataset of crowd-sourced annotations of the sentiment expressed by the authors towards the main entities in news articles. The dataset also includes paragraph-level sentiment annotations to provide more fine-grained supervision for the task. Our benchmarks of multiple strong baselines show that this is a difficult classification task. The results also suggest that simply fine-tuning document-level representations from BERT isn't adequate for this task. Making paragraph-level decisions and aggregating them over the entire document is also ineffective. We present empirical and qualitative analyses that illustrate the specific challenges posed by this dataset. We release this dataset with 5.3k documents and 38k paragraphs covering 3.2k unique entities as a challenge in entity sentiment analysis." - }, - "pg19": { - "pwc_id": "pg-19", - "dataset_name": "PG-19 Dataset", - "dataset_abstract": "A new open-vocabulary language modelling benchmark derived from books.", - "paper_name": "Compressive Transformers for Long-Range Sequence Modelling", - "paper_abstract": "We present the Compressive Transformer, an attentive sequence model which compresses past memories for long-range sequence learning. We find the Compressive Transformer obtains state-of-the-art language modelling results in the WikiText-103 and Enwik8 benchmarks, achieving 17.1 ppl and 0.97 bpc respectively. We also find it can model high-frequency speech effectively and can be used as a memory mechanism for RL, demonstrated on an object matching task. To promote the domain of long-range sequence learning, we propose a new open-vocabulary language modelling benchmark derived from books, PG-19." - }, - "xed_en_fi": { - "pwc_id": "xed", - "dataset_name": "XED Dataset", - "dataset_abstract": "XED is a multilingual fine-grained emotion dataset. 
The dataset consists of human-annotated Finnish (25k) and English sentences (30k), as well as projected annotations for 30 additional languages, providing new resources for many low-resource languages.", - "paper_name": "XED: A Multilingual Dataset for Sentiment Analysis and Emotion Detection", - "paper_abstract": "We introduce XED, a multilingual fine-grained emotion dataset. The dataset consists of human-annotated Finnish (25k) and English sentences (30k), as well as projected annotations for 30 additional languages, providing new resources for many low-resource languages. We use Plutchik's core emotions to annotate the dataset with the addition of neutral to create a multilabel multiclass dataset. The dataset is carefully evaluated using language-specific BERT models and SVMs to show that XED performs on par with other similar datasets and is therefore a useful tool for sentiment analysis and emotion detection." - }, - "newsroom": { - "pwc_id": "newsroom", - "dataset_name": "NEWSROOM Dataset", - "dataset_abstract": "CORNELL NEWSROOM is a large dataset for training and evaluating summarization systems. It contains 1.3 million articles and summaries written by authors and editors in the newsrooms of 38 major publications. The summaries are obtained from search and social metadata between 1998 and 2017 and use a variety of summarization strategies combining extraction and abstraction.", - "paper_name": "Newsroom: A Dataset of 1.3 Million Summaries with Diverse Extractive Strategies", - "paper_abstract": "We present NEWSROOM, a summarization dataset of 1.3 million articles and summaries written by authors and editors in newsrooms of 38 major news publications. Extracted from search and social media metadata between 1998 and 2017, these high-quality summaries demonstrate high diversity of summarization styles. In particular, the summaries combine abstractive and extractive strategies, borrowing words and phrases from articles at varying rates. We analyze the extraction strategies used in NEWSROOM summaries against other datasets to quantify the diversity and difficulty of our new data, and train existing methods on the data to evaluate its utility and challenges." - }, - "woz_dialogue": { - "pwc_id": "wizard-of-oz", - "dataset_name": "Wizard-of-Oz Dataset", - "dataset_abstract": "The WoZ 2.0 dataset is a newer dialogue state tracking dataset whose evaluation is detached from the noisy output of speech recognition systems. Similar to DSTC2, it covers the restaurant search domain and has identical evaluation.\n\nDescription from NLP Progress", - "paper_name": "Neural Belief Tracker: Data-Driven Dialogue State Tracking", - "paper_abstract": "One of the core components of modern spoken dialogue systems is the belief\ntracker, which estimates the user's goal at every step of the dialogue.\nHowever, most current approaches have difficulty scaling to larger, more\ncomplex dialogue domains. This is due to their dependency on either: a) Spoken\nLanguage Understanding models that require large amounts of annotated training\ndata; or b) hand-crafted lexicons for capturing some of the linguistic\nvariation in users' language. We propose a novel Neural Belief Tracking (NBT)\nframework which overcomes these problems by building on recent advances in\nrepresentation learning. NBT models reason over pre-trained word vectors,\nlearning to compose them into distributed representations of user utterances\nand dialogue context. 
Our evaluation on two datasets shows that this approach\nsurpasses past limitations, matching the performance of state-of-the-art models\nwhich rely on hand-crafted semantic lexicons and outperforming them when such\nlexicons are not provided." - }, - "nli_tr": { - "pwc_id": "nli-tr", - "dataset_name": "NLI-TR Dataset", - "dataset_abstract": "Natural Language Inference in Turkish (NLI-TR) provides translations of two large English NLI datasets into Turkish and had a team of experts validate their translation quality and fidelity to the original labels.", - "paper_name": "Data and Representation for Turkish Natural Language Inference", - "paper_abstract": "Large annotated datasets in NLP are overwhelmingly in English. This is an obstacle to progress in other languages. Unfortunately, obtaining new annotated resources for each task in each language would be prohibitively expensive. At the same time, commercial machine translation systems are now robust. Can we leverage these systems to translate English-language datasets automatically? In this paper, we offer a positive response for natural language inference (NLI) in Turkish. We translated two large English NLI datasets into Turkish and had a team of experts validate their translation quality and fidelity to the original labels. Using these datasets, we address core issues of representation for Turkish NLI. We find that in-language embeddings are essential and that morphological parsing can be avoided where the training set is large. Finally, we show that models trained on our machine-translated datasets are successful on human-translated evaluation sets. We share all code, models, and data publicly." - }, - "few_rel": { - "pwc_id": "fewrel", - "dataset_name": "FewRel Dataset", - "dataset_abstract": "The FewRel (Few-Shot Relation Classification Dataset) contains 100 relations and 70,000 instances from Wikipedia. The dataset is divided into three subsets: training set (64 relations), validation set (16 relations) and test set (20 relations).", - "paper_name": "FewRel: A Large-Scale Supervised Few-Shot Relation Classification Dataset with State-of-the-Art Evaluation", - "paper_abstract": "We present a Few-Shot Relation Classification Dataset (FewRel), consisting of\n70, 000 sentences on 100 relations derived from Wikipedia and annotated by\ncrowdworkers. The relation of each sentence is first recognized by distant\nsupervision methods, and then filtered by crowdworkers. We adapt the most\nrecent state-of-the-art few-shot learning methods for relation classification\nand conduct a thorough evaluation of these methods. Empirical results show that\neven the most competitive few-shot learning models struggle on this task,\nespecially as compared with humans. We also show that a range of different\nreasoning skills are needed to solve our task. These results indicate that\nfew-shot relation classification remains an open problem and still requires\nfurther research. Our detailed analysis points multiple directions for future\nresearch. All details and resources about the dataset and baselines are\nreleased on http://zhuhao.me/fewrel." - }, - "multidoc2dial": { - "pwc_id": "multidoc2dial", - "dataset_name": "MultiDoc2Dial Dataset", - "dataset_abstract": "MultiDoc2Dial is a new task and dataset on modeling goal-oriented dialogues grounded in multiple documents. Most previous works treat document-grounded dialogue modeling as a machine reading comprehension task based on a single given document or passage. 
We aim to address more realistic scenarios where a goal-oriented information-seeking conversation involves multiple topics, and hence is grounded on different documents.", - "paper_name": "MultiDoc2Dial: Modeling Dialogues Grounded in Multiple Documents", - "paper_abstract": "We propose MultiDoc2Dial, a new task and dataset on modeling goal-oriented dialogues grounded in multiple documents. Most previous works treat document-grounded dialogue modeling as a machine reading comprehension task based on a single given document or passage. In this work, we aim to address more realistic scenarios where a goal-oriented information-seeking conversation involves multiple topics, and hence is grounded on different documents. To facilitate such a task, we introduce a new dataset that contains dialogues grounded in multiple documents from four different domains. We also explore modeling the dialogue-based and document-based context in the dataset. We present strong baseline approaches and various experimental results, aiming to support further research efforts on such a task." - }, - "kor_nli": { - "pwc_id": "kornli", - "dataset_name": "KorNLI Dataset", - "dataset_abstract": "KorNLI is a Korean Natural Language Inference (NLI) dataset. The dataset is constructed by automatically translating the training sets of the SNLI, XNLI and MNLI datasets. To ensure translation quality, two professional translators with at least seven years of experience who specialize in academic papers/books as well as business contracts post-edited a half of the dataset each and cross-checked each other\u2019s translation afterward.\nIt contains 942,854 training examples translated automatically and 7,500 evaluation (development and test) examples translated manually", - "paper_name": "KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding", - "paper_abstract": "Natural language inference (NLI) and semantic textual similarity (STS) are key tasks in natural language understanding (NLU). Although several benchmark datasets for those tasks have been released in English and a few other languages, there are no publicly available NLI or STS datasets in the Korean language. Motivated by this, we construct and release new datasets for Korean NLI and STS, dubbed KorNLI and KorSTS, respectively. Following previous approaches, we machine-translate existing English training sets and manually translate development and test sets into Korean. To accelerate research on Korean NLU, we also establish baselines on KorNLI and KorSTS. Our datasets are publicly available at https://github.com/kakaobrain/KorNLUDatasets." - }, - "conceptnet5": { - "pwc_id": "conceptnet", - "dataset_name": "ConceptNet Dataset", - "dataset_abstract": "ConceptNet is a knowledge graph that connects words and phrases of natural language with labeled edges. Its knowledge is collected from many sources that include expert-created resources, crowd-sourcing, and games with a purpose. It is designed to represent the general knowledge involved in understanding language, improving natural language applications by allowing the application to better understand the meanings behind the words people use.", - "paper_name": "ConceptNet 5.5: An Open Multilingual Graph of General Knowledge", - "paper_abstract": "Machine learning about language can be improved by supplying it with specific\nknowledge and sources of external information. 
We present here a new version of\nthe linked open data resource ConceptNet that is particularly well suited to be\nused with modern NLP techniques such as word embeddings.\n ConceptNet is a knowledge graph that connects words and phrases of natural\nlanguage with labeled edges. Its knowledge is collected from many sources that\ninclude expert-created resources, crowd-sourcing, and games with a purpose. It\nis designed to represent the general knowledge involved in understanding\nlanguage, improving natural language applications by allowing the application\nto better understand the meanings behind the words people use.\n When ConceptNet is combined with word embeddings acquired from distributional\nsemantics (such as word2vec), it provides applications with understanding that\nthey would not acquire from distributional semantics alone, nor from narrower\nresources such as WordNet or DBPedia. We demonstrate this with state-of-the-art\nresults on intrinsic evaluations of word relatedness that translate into\nimprovements on applications of word vectors, including solving SAT-style\nanalogies." - }, - "cs_restaurants": { - "pwc_id": "czech-restaurant-information", - "dataset_name": "Czech restaurant information Dataset", - "dataset_abstract": "Czech restaurant information is a dataset for NLG in task-oriented spoken dialogue systems with Czech as the target language. It originated as a translation of the English San Francisco Restaurants dataset by Wen et al. (2015).", - "paper_name": "Neural Generation for Czech: Data and Baselines", - "paper_abstract": "We present the first dataset targeted at end-to-end NLG in Czech in the restaurant domain, along with several strong baseline models using the sequence-to-sequence approach. While non-English NLG is under-explored in general, Czech, as a morphologically rich language, makes the task even harder: Since Czech requires inflecting named entities, delexicalization or copy mechanisms do not work out-of-the-box and lexicalizing the generated outputs is non-trivial. In our experiments, we present two different approaches to this this problem: (1) using a neural language model to select the correct inflected form while lexicalizing, (2) a two-step generation setup: our sequence-to-sequence model generates an interleaved sequence of lemmas and morphological tags, which are then inflected by a morphological generator." - }, - "atomic": { - "pwc_id": "atomic", - "dataset_name": "ATOMIC Dataset", - "dataset_abstract": "ATOMIC is an atlas of everyday commonsense reasoning, organized through 877k textual descriptions of inferential knowledge. Compared to existing resources that center around taxonomic knowledge, ATOMIC focuses on inferential knowledge organized as typed if-then relations with variables (e.g., \"if X pays Y a compliment, then Y will likely return the compliment\").", - "paper_name": "ATOMIC: An Atlas of Machine Commonsense for If-Then Reasoning", - "paper_abstract": "We present ATOMIC, an atlas of everyday commonsense reasoning, organized\nthrough 877k textual descriptions of inferential knowledge. Compared to\nexisting resources that center around taxonomic knowledge, ATOMIC focuses on\ninferential knowledge organized as typed if-then relations with variables\n(e.g., \"if X pays Y a compliment, then Y will likely return the compliment\").\nWe propose nine if-then relation types to distinguish causes vs. effects,\nagents vs. themes, voluntary vs. involuntary events, and actions vs. mental\nstates. 
By generatively training on the rich inferential knowledge described in\nATOMIC, we show that neural models can acquire simple commonsense capabilities\nand reason about previously unseen events. Experimental results demonstrate\nthat multitask models that incorporate the hierarchical structure of if-then\nrelation types lead to more accurate inference compared to models trained in\nisolation, as measured by both automatic and human evaluation." - }, - "aslg_pc12": { - "pwc_id": "aslg-pc12", - "dataset_name": "ASLG-PC12 Dataset", - "dataset_abstract": "An artificial corpus built using grammatical dependencies rules due to the lack of resources for Sign Language.", - "paper_name": "", - "paper_abstract": "" - }, - "id_nergrit_corpus": { - "pwc_id": "nergrit-corpus", - "dataset_name": "NERGRIT Corpus Dataset", - "dataset_abstract": "NERGRIT involves machine learning based NLP Tools and a corpus used for Indonesian Named Entity Recognition, Statement Extraction, and Sentiment Analysis.", - "paper_name": "", - "paper_abstract": "" - }, - "arabic_speech_corpus": { - "pwc_id": "arabic-speech-corpus", - "dataset_name": "Arabic Speech Corpus Dataset", - "dataset_abstract": "The Arabic Speech Corpus (1.5 GB) is a Modern Standard Arabic (MSA) speech corpus for speech synthesis. The corpus contains phonetic and orthographic transcriptions of more than 3.7 hours of MSA speech aligned with recorded speech on the phoneme level. The annotations include word stress marks on the individual phonemes The Speech corpus has been developed as part of PhD work carried out by Nawar Halabi at the University of Southampton. The corpus was recorded in south Levantine Arabic (Damascian accent) using a professional studio. Synthesized speech as an output using this corpus has produced a high quality, natural voice.", - "paper_name": "", - "paper_abstract": "" - }, - "ascent_kb": { - "pwc_id": "ascentkb", - "dataset_name": "Ascent KB Dataset", - "dataset_abstract": "This dataset contains 8.9M commonsense assertions extracted by the Ascent pipeline developed at the Max Planck Institute for Informatics. The focus of this dataset is on everyday concepts such as elephant, car, laptop, etc. The current version of Ascent KB (v1.0.0) is approximately 19 times larger than ConceptNet (note that, in this comparison, non-commonsense knowledge in ConceptNet such as lexical relations is excluded).", - "paper_name": "Advanced Semantics for Commonsense Knowledge Extraction", - "paper_abstract": "Commonsense knowledge (CSK) about concepts and their properties is useful for AI applications such as robust chatbots. Prior works like ConceptNet, TupleKB and others compiled large CSK collections, but are restricted in their expressiveness to subject-predicate-object (SPO) triples with simple concepts for S and monolithic strings for P and O. Also, these projects have either prioritized precision or recall, but hardly reconcile these complementary goals. This paper presents a methodology, called Ascent, to automatically build a large-scale knowledge base (KB) of CSK assertions, with advanced expressiveness and both better precision and recall than prior works. Ascent goes beyond triples by capturing composite concepts with subgroups and aspects, and by refining assertions with semantic facets. The latter are important to express temporal and spatial validity of assertions and further qualifiers. Ascent combines open information extraction with judicious cleaning using language models. 
Intrinsic evaluation shows the superior size and quality of the Ascent KB, and an extrinsic evaluation for QA-support tasks underlines the benefits of Ascent." - }, - "dane": { - "pwc_id": "dane", - "dataset_name": "DaNE Dataset", - "dataset_abstract": "Danish Dependency Treebank (DaNE) is a named entity annotation for the Danish Universal Dependencies treebank using the CoNLL-2003 annotation scheme.", - "paper_name": "DaNE: A Named Entity Resource for Danish", - "paper_abstract": "We present a named entity annotation for the Danish Universal Dependencies treebank using the CoNLL-2003 annotation scheme: DaNE. It is the largest publicly available, Danish named entity gold annotation. We evaluate the quality of our annotations intrinsically by double annotating the entire treebank and extrinsically by comparing our annotations to a recently released named entity annotation of the validation and test sections of the Danish Universal Dependencies treebank. We benchmark the new resource by training and evaluating competitive architectures for supervised named entity recognition (NER), including FLAIR, monolingual (Danish) BERT and multilingual BERT. We explore cross-lingual transfer in multilingual BERT from five related languages in zero-shot and direct transfer setups, and we show that even with our modestly-sized training set, we improve Danish NER over a recent cross-lingual approach, as well as over zero-shot transfer from five related languages. Using multilingual BERT, we achieve higher performance by fine-tuning on both DaNE and a larger Bokm\u00e5l (Norwegian) training set compared to only using DaNE. However, the highest performance is achieved by using a Danish BERT fine-tuned on DaNE. Our dataset enables improvements and applicability for Danish NER beyond cross-lingual methods. We employ a thorough error analysis of the predictions of the best models for seen and unseen entities, as well as their robustness on un-capitalized text. The annotated dataset and all the trained models are made publicly available." - }, - "bsd_ja_en": { - "pwc_id": "business-scene-dialogue", - "dataset_name": "Business Scene Dialogue Dataset", - "dataset_abstract": "The Japanese-English business conversation corpus, namely Business Scene Dialogue corpus, was constructed in 3 steps:\n\n\nselecting business scenes,\nwriting monolingual conversation scenarios according to the selected scenes, and\ntranslating the scenarios into the other language.\n\nHalf of the monolingual scenarios were written in Japanese and the other half were written in English. The whole construction process was supervised by a person who satisfies the following conditions to guarantee the conversations to be natural:\n\n\nhas the experience of being engaged in language learning programs, especially for business conversations\nis able to smoothly communicate with others in various business scenes both in Japanese and English\nhas the experience of being involved in business\n\nThe BSD corpus is split into balanced training, development and evaluation sets. The documents in these sets are balanced in terms of scenes and original languages. 
In this repository we publicly share the full development and evaluation sets and a part of the training data set.", - "paper_name": "Designing the Business Conversation Corpus", - "paper_abstract": "While the progress of machine translation of written text has come far in the past several years thanks to the increasing availability of parallel corpora and corpora-based training technologies, automatic translation of spoken text and dialogues remains challenging even for modern systems. In this paper, we aim to boost the machine translation quality of conversational texts by introducing a newly constructed Japanese-English business conversation parallel corpus. A detailed analysis of the corpus is provided along with challenging examples for automatic translation. We also experiment with adding the corpus in a machine translation training scenario and show how the resulting system benefits from its use." - }, - "ar_cov19": { - "pwc_id": "arcov-19", - "dataset_name": "ArCOV-19 Dataset", - "dataset_abstract": "ArCOV-19 is an Arabic COVID-19 Twitter dataset that covers the period from 27th of January till 30th of April 2020. ArCOV-19 is the first publicly-available Arabic Twitter dataset covering COVID-19 pandemic that includes over 1M tweets alongside the propagation networks of the most-popular subset of them (i.e., most-retweeted and -liked).", - "paper_name": "ArCOV-19: The First Arabic COVID-19 Twitter Dataset with Propagation Networks", - "paper_abstract": "In this paper, we present ArCOV-19, an Arabic COVID-19 Twitter dataset that spans one year, covering the period from 27th of January 2020 till 31st of January 2021. ArCOV-19 is the first publicly-available Arabic Twitter dataset covering COVID-19 pandemic that includes about 2.7M tweets alongside the propagation networks of the most-popular subset of them (i.e., most-retweeted and -liked). The propagation networks include both retweets and conversational threads (i.e., threads of replies). ArCOV-19 is designed to enable research under several domains including natural language processing, information retrieval, and social computing. Preliminary analysis shows that ArCOV-19 captures rising discussions associated with the first reported cases of the disease as they appeared in the Arab world. In addition to the source tweets and propagation networks, we also release the search queries and language-independent crawler used to collect the tweets to encourage the curation of similar datasets." - }, - "nell": { - "pwc_id": "nell", - "dataset_name": "NELL Dataset", - "dataset_abstract": "NELL is a dataset built from the Web via an intelligent agent called Never-Ending Language Learner. This agent attempts to learn over time to read the web. NELL has accumulated over 50 million candidate beliefs by reading the web, and it is considering these at different levels of confidence. NELL has high confidence in 2,810,379 of these beliefs.", - "paper_name": "", - "paper_abstract": "" - }, - "sbu_captions": { - "pwc_id": "sbu-captions-dataset", - "dataset_name": "SBU Captions Dataset Dataset", - "dataset_abstract": "A collection that allows researchers to approach the extremely challenging problem of description generation using relatively simple non-parametric methods and produces surprisingly effective results.", - "paper_name": "Im2Text: Describing Images Using 1 Million Captioned Photographs", - "paper_abstract": "We develop and demonstrate automatic image description methods using a large captioned photo collection. 
One contribution is our technique for the automatic collection of this new dataset -- performing a huge number of Flickr queries and then filtering the noisy results down to 1 million images with associated visually relevant captions. Such a collection allows us to approach the extremely challenging problem of description generation using relatively simple non-parametric methods and produces surprisingly effective results. We also develop methods incorporating many state of the art, but fairly noisy, estimates of image content to produce even more pleasing results. Finally we introduce a new objective performance measure for image captioning." - }, - "curiosity_dialogs": { - "pwc_id": "curiosity", - "dataset_name": "Curiosity Dataset", - "dataset_abstract": "The Curiosity dataset consists of 14K dialogs (with 181K utterances) with fine-grained knowledge groundings, dialog act annotations, and other auxiliary annotation. In this dataset users and virtual assistants converse about geographic topics like geopolitical entities and locations. This dataset is annotated with pre-existing user knowledge, message-level dialog acts, grounding to Wikipedia, and user reactions to messages.", - "paper_name": "Information Seeking in the Spirit of Learning: a Dataset for Conversational Curiosity", - "paper_abstract": "Open-ended human learning and information-seeking are increasingly mediated by digital assistants. However, such systems often ignore the user's pre-existing knowledge. Assuming a correlation between engagement and user responses such as \"liking\" messages or asking followup questions, we design a Wizard-of-Oz dialog task that tests the hypothesis that engagement increases when users are presented with facts related to what they know. Through crowd-sourcing of this experiment, we collect and release 14K dialogs (181K utterances) where users and assistants converse about geographic topics like geopolitical entities and locations. This dataset is annotated with pre-existing user knowledge, message-level dialog acts, grounding to Wikipedia, and user reactions to messages. Responses using a user's prior knowledge increase engagement. We incorporate this knowledge into a multi-task model that reproduces human assistant policies and improves over a BERT content model by 13 mean reciprocal rank points." - }, - "hope_edi": { - "pwc_id": "hopeedi", - "dataset_name": "HopeEDI Dataset", - "dataset_abstract": "Over the past few years, systems have been developed to control online content and eliminate abusive, offensive or hate speech content. However, people in power sometimes misuse this form of censorship to obstruct the democratic right of freedom of speech. Therefore, it is imperative that research should take a positive reinforcement approach towards online content that is encouraging, positive and supportive contents. Until now, most studies have focused on solving this problem of negativity in the English language, though the problem is much more than just harmful content. Furthermore, it is multilingual as well. Thus, we have constructed a Hope Speech dataset for Equality, Diversity and Inclusion (HopeEDI) containing user-generated comments from the social media platform YouTube with 28,451, 20,198 and 10,705 comments in English, Tamil and Malayalam, respectively, manually labelled as containing hope speech or not. To our knowledge, this is the first research of its kind to annotate hope speech for equality, diversity and inclusion in a multilingual setting. 
We determined that the inter-annotator agreement of our dataset using Krippendorff\u2019s alpha. Further, we created several baselines to benchmark the resulting dataset and the results have been expressed using precision, recall and F1-score. The dataset is publicly available for the research community. We hope that this resource will spur further research on encouraging inclusive and responsive speech that reinforces positiveness.", - "paper_name": "HopeEDI: A Multilingual Hope Speech Detection Dataset for Equality, Diversity, and Inclusion", - "paper_abstract": "Over the past few years, systems have been developed to control online content and eliminate abusive, offensive or hate speech content. However, people in power sometimes misuse this form of censorship to obstruct the democratic right of freedom of speech. Therefore, it is imperative that research should take a positive reinforcement approach towards online content that is encouraging, positive and supportive contents. Until now, most studies have focused on solving this problem of negativity in the English language, though the problem is much more than just harmful content. Furthermore, it is multilingual as well. Thus, we have constructed a Hope Speech dataset for Equality, Diversity and Inclusion (HopeEDI) containing user-generated comments from the social media platform YouTube with 28,451, 20,198 and 10,705 comments in English, Tamil and Malayalam, respectively, manually labelled as containing hope speech or not. To our knowledge, this is the first research of its kind to annotate hope speech for equality, diversity and inclusion in a multilingual setting. We determined that the inter-annotator agreement of our dataset using Krippendorff\u2019s alpha. Further, we created several baselines to benchmark the resulting dataset and the results have been expressed using precision, recall and F1-score. The dataset is publicly available for the research community. We hope that this resource will spur further research on encouraging inclusive and responsive speech that reinforces positiveness." - }, - "catalonia_independence": { - "pwc_id": "cic", - "dataset_name": "CIC Dataset", - "dataset_abstract": "The dataset is annotated with stance towards one topic, namely, the independence of Catalonia.", - "paper_name": "Multilingual Stance Detection: The Catalonia Independence Corpus", - "paper_abstract": "Stance detection aims to determine the attitude of a given text with respect to a specific topic or claim. While stance detection has been fairly well researched in the last years, most the work has been focused on English. This is mainly due to the relative lack of annotated data in other languages. The TW-10 Referendum Dataset released at IberEval 2018 is a previous effort to provide multilingual stance-annotated data in Catalan and Spanish. Unfortunately, the TW-10 Catalan subset is extremely imbalanced. This paper addresses these issues by presenting a new multilingual dataset for stance detection in Twitter for the Catalan and Spanish languages, with the aim of facilitating research on stance detection in multilingual and cross-lingual settings. The dataset is annotated with stance towards one topic, namely, the independence of Catalonia. We also provide a semi-automatic method to annotate the dataset based on a categorization of Twitter users. We experiment on the new corpus with a number of supervised approaches, including linear classifiers and deep learning methods. 
Comparison of our new corpus with the TW-10 dataset shows both the benefits and potential of a well balanced corpus for multilingual and cross-lingual research on stance detection. Finally, we establish new state-of-the-art results on the TW-10 dataset, both for Catalan and Spanish." - }, - "pec": { - "pwc_id": "pec", - "dataset_name": "PEC Dataset", - "dataset_abstract": "A novel large-scale multi-domain dataset for persona-based empathetic conversations.", - "paper_name": "Towards Persona-Based Empathetic Conversational Models", - "paper_abstract": "Empathetic conversational models have been shown to improve user satisfaction and task outcomes in numerous domains. In Psychology, persona has been shown to be highly correlated to personality, which in turn influences empathy. In addition, our empirical analysis also suggests that persona plays an important role in empathetic conversations. To this end, we propose a new task towards persona-based empathetic conversations and present the first empirical study on the impact of persona on empathetic responding. Specifically, we first present a novel large-scale multi-domain dataset for persona-based empathetic conversations. We then propose CoBERT, an efficient BERT-based response selection model that obtains the state-of-the-art performance on our dataset. Finally, we conduct extensive experiments to investigate the impact of persona on empathetic responding. Notably, our results show that persona improves empathetic responding more when CoBERT is trained on empathetic conversations than non-empathetic ones, establishing an empirical link between persona and empathy in human conversations." - }, - "allegro_reviews": { - "pwc_id": "allegro-reviews", - "dataset_name": "Allegro Reviews Dataset", - "dataset_abstract": "A comprehensive multi-task benchmark for the Polish language understanding, accompanied by an online leaderboard. It consists of a diverse set of tasks, adopted from existing datasets for named entity recognition, question-answering, textual entailment, and others.", - "paper_name": "KLEJ: Comprehensive Benchmark for Polish Language Understanding", - "paper_abstract": "In recent years, a series of Transformer-based models unlocked major improvements in general natural language understanding (NLU) tasks. Such a fast pace of research would not be possible without general NLU benchmarks, which allow for a fair comparison of the proposed methods. However, such benchmarks are available only for a handful of languages. To alleviate this issue, we introduce a comprehensive multi-task benchmark for the Polish language understanding, accompanied by an online leaderboard. It consists of a diverse set of tasks, adopted from existing datasets for named entity recognition, question-answering, textual entailment, and others. We also introduce a new sentiment analysis task for the e-commerce domain, named Allegro Reviews (AR). To ensure a common evaluation scheme and promote models that generalize to different NLU tasks, the benchmark includes datasets from varying domains and applications. Additionally, we release HerBERT, a Transformer-based model trained specifically for the Polish language, which has the best average performance and obtains the best results for three out of nine tasks. Finally, we provide an extensive evaluation, including several standard baselines and recently proposed, multilingual Transformer-based models." 
- }, - "proto_qa": { - "pwc_id": "protoqa", - "dataset_name": "ProtoQA Dataset", - "dataset_abstract": "ProtoQA is a question answering dataset for training and evaluating common sense reasoning capabilities of artificial intelligence systems in such prototypical situations. The training set is gathered from an existing set of questions played in a long-running international game show FAMILY- FEUD. The hidden evaluation set is created by gathering answers for each question from 100 crowd-workers.", - "paper_name": "ProtoQA: A Question Answering Dataset for Prototypical Common-Sense Reasoning", - "paper_abstract": "Given questions regarding some prototypical situation such as Name something that people usually do before they leave the house for work? a human can easily answer them via acquired experiences. There can be multiple right answers for such questions, with some more common for a situation than others. This paper introduces a new question answering dataset for training and evaluating common sense reasoning capabilities of artificial intelligence systems in such prototypical situations. The training set is gathered from an existing set of questions played in a long-running international game show FAMILY- FEUD. The hidden evaluation set is created by gathering answers for each question from 100 crowd-workers. We also propose a generative evaluation task where a model has to output a ranked list of answers, ideally covering all prototypical answers for a question. After presenting multiple competitive baseline models, we find that human performance still exceeds model scores on all evaluation metrics with a meaningful gap, supporting the challenging nature of the task." - }, - "fquad": { - "pwc_id": "fquad", - "dataset_name": "FQuAD Dataset", - "dataset_abstract": "A French Native Reading Comprehension dataset of questions and answers on a set of Wikipedia articles that consists of 25,000+ samples for the 1.0 version and 60,000+ samples for the 1.1 version.", - "paper_name": "FQuAD: French Question Answering Dataset", - "paper_abstract": "Recent advances in the field of language modeling have improved state-of-the-art results on many Natural Language Processing tasks. Among them, Reading Comprehension has made significant progress over the past few years. However, most results are reported in English since labeled resources available in other languages, such as French, remain scarce. In the present work, we introduce the French Question Answering Dataset (FQuAD). FQuAD is a French Native Reading Comprehension dataset of questions and answers on a set of Wikipedia articles that consists of 25,000+ samples for the 1.0 version and 60,000+ samples for the 1.1 version. We train a baseline model which achieves an F1 score of 92.2 and an exact match ratio of 82.1 on the test set. In order to track the progress of French Question Answering models we propose a leader-board and we have made the 1.0 version of our dataset freely available at https://illuin-tech.github.io/FQuAD-explorer/." - }, - "crawl_domain": { - "pwc_id": "common-crawl-domain-names", - "dataset_name": "Common Crawl Domain Names Dataset", - "dataset_abstract": "Corpus of domain names scraped from Common Crawl and manually annotated to add word boundaries (e.g. 
\"commoncrawl\" to \"common crawl\").", - "paper_name": "", - "paper_abstract": "" - }, - "flores": { - "pwc_id": "flores", - "dataset_name": "FLoRes Dataset", - "dataset_abstract": "FLoRes is a benchmark dataset for machine translation between English and four low resource languages, Nepali, Sinhala, Khmer and Pashto, based on sentences translated from Wikipedia.", - "paper_name": "The FLORES Evaluation Datasets for Low-Resource Machine Translation: Nepali--English and Sinhala--English", - "paper_abstract": "For machine translation, a vast majority of language pairs in the world are considered low-resource because they have little parallel data available. Besides the technical challenges of learning with limited supervision, it is difficult to evaluate methods trained on low-resource language pairs because of the lack of freely and publicly available benchmarks. In this work, we introduce the FLORES evaluation datasets for Nepali{--}English and Sinhala{--} English, based on sentences translated from Wikipedia. Compared to English, these are languages with very different morphology and syntax, for which little out-of-domain parallel data is available and for which relatively large amounts of monolingual data are freely available. We describe our process to collect and cross-check the quality of translations, and we report baseline performance using several learning settings: fully supervised, weakly supervised, semi-supervised, and fully unsupervised. Our experiments demonstrate that current state-of-the-art methods perform rather poorly on this benchmark, posing a challenge to the research community working on low-resource MT. Data and code to reproduce our experiments are available at https://github.com/facebookresearch/flores." - }, - "numeric_fused_head": { - "pwc_id": "numeric-fused-head", - "dataset_name": "Numeric Fused-Head Dataset", - "dataset_abstract": "The Numeric Fused-Head dataset consists of ~10K examples of crowd-sourced classified examples, labeled into 7 different categories, from two types. In the first type, Reference, the missing head is referenced explicitly somewhere else in the discourse, either in the same sentence or in surrounding sentences. In the second type, Implicit, the missing head does not appear in the text and needs to be inferred by the reader or hearer based on the context or world knowledge. This category was labeled into the 6 most common categories of the dataset. Models are evaluated based on accuracy.", - "paper_name": "Where's My Head? Definition, Dataset and Models for Numeric Fused-Heads Identification and Resolution", - "paper_abstract": "We provide the first computational treatment of fused-heads constructions (FH), focusing on the numeric fused-heads (NFH). FHs constructions are noun phrases (NPs) in which the head noun is missing and is said to be `fused' with its dependent modifier. This missing information is implicit and is important for sentence understanding. The missing references are easily filled in by humans but pose a challenge for computational models. We formulate the handling of FH as a two stages process: identification of the FH construction and resolution of the missing head. We explore the NFH phenomena in large corpora of English text and create (1) a dataset and a highly accurate method for NFH identification; (2) a 10k examples (1M tokens) crowd-sourced dataset of NFH resolution; and (3) a neural baseline for the NFH resolution task. 
We release our code and dataset, in hope to foster further research into this challenging problem." - }, - "omp": { - "pwc_id": "one-million-posts-corpus", - "dataset_name": "One Million Posts Corpus Dataset", - "dataset_abstract": "An annotated data set consisting of user comments posted to an Austrian newspaper website (in German language).\n\nDER STANDARD is an Austrian daily broadsheet newspaper. On the newspaper\u2019s website, there is a discussion section below each news article where readers engage in online discussions. The data set contains a selection of user posts from the 12 month time span from 2015-06-01 to 2016-05-31. There are 11,773 labeled and 1,000,000 unlabeled posts in the data set. The labeled posts were annotated by professional forum moderators employed by the newspaper.", - "paper_name": "", - "paper_abstract": "" - }, - "arsentd_lev": { - "pwc_id": "arsentd-lev", - "dataset_name": "ArSentD-LEV Dataset", - "dataset_abstract": "The Arabic Sentiment Twitter Dataset for the Levantine dialect (ArSenTD-LEV) is a dataset of 4,000 tweets with the following annotations: the overall sentiment of the tweet, the target to which the sentiment was expressed, how the sentiment was expressed, and the topic of the tweet.", - "paper_name": "ArSentD-LEV: A Multi-Topic Corpus for Target-based Sentiment Analysis in Arabic Levantine Tweets", - "paper_abstract": "Sentiment analysis is a highly subjective and challenging task. Its complexity further increases when applied to the Arabic language, mainly because of the large variety of dialects that are unstandardized and widely used in the Web, especially in social media. While many datasets have been released to train sentiment classifiers in Arabic, most of these datasets contain shallow annotation, only marking the sentiment of the text unit, as a word, a sentence or a document. In this paper, we present the Arabic Sentiment Twitter Dataset for the Levantine dialect (ArSenTD-LEV). Based on findings from analyzing tweets from the Levant region, we created a dataset of 4,000 tweets with the following annotations: the overall sentiment of the tweet, the target to which the sentiment was expressed, how the sentiment was expressed, and the topic of the tweet. Results confirm the importance of these annotations at improving the performance of a baseline sentiment classifier. They also confirm the gap of training in a certain domain, and testing in another domain." - }, - "crd3": { - "pwc_id": "crd3", - "dataset_name": "CRD3 Dataset", - "dataset_abstract": "The dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding abstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player collaboration and spoken interaction.", - "paper_name": "Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset", - "paper_abstract": "This paper describes the Critical Role Dungeons and Dragons Dataset (CRD3) and related analyses. Critical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game. The dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding abstractive summaries collected from the Fandom wiki. 
The dataset is linguistically unique in that the narratives are generated entirely through player collaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail, and semantic ties to the previous dialogues. In addition, we provide a data augmentation method that produces 34,243 summary-dialogue chunk pairs to support current neural ML approaches, and we provide an abstractive summarization benchmark and evaluation." - }, - "tilde_model": { - "pwc_id": "tilde-model-corpus", - "dataset_name": "Tilde MODEL Corpus Dataset", - "dataset_abstract": "Tilde MODEL Corpus is a multilingual corpus for European languages \u2013 particularly focused on the smaller languages. The collected resources have been cleaned, aligned, and formatted into a corpora standard TMX format useable for developing new Language technology products and services.\n\nIt contains over 10M segments of multilingual open data.\n\nThe data has been collected from sites allowing free use and reuse of its content, as well as from Public Sector web sites.", - "paper_name": "", - "paper_abstract": "" - }, - "pn_summary": { - "pwc_id": "pn-summary", - "dataset_name": "pn-summary Dataset", - "dataset_abstract": "Pn-summary is a dataset for Persian abstractive text summarization.", - "paper_name": "Leveraging ParsBERT and Pretrained mT5 for Persian Abstractive Text Summarization", - "paper_abstract": "Text summarization is one of the most critical Natural Language Processing (NLP) tasks. More and more researches are conducted in this field every day. Pre-trained transformer-based encoder-decoder models have begun to gain popularity for these tasks. This paper proposes two methods to address this task and introduces a novel dataset named pn-summary for Persian abstractive text summarization. The models employed in this paper are mT5 and an encoder-decoder version of the ParsBERT model (i.e., a monolingual BERT model for Persian). These models are fine-tuned on the pn-summary dataset. The current work is the first of its kind and, by achieving promising results, can serve as a baseline for any future work." - }, - "c3": { - "pwc_id": "c3", - "dataset_name": "C3 Dataset", - "dataset_abstract": "C3 is a free-form multiple-Choice Chinese machine reading Comprehension dataset.", - "paper_name": "Investigating Prior Knowledge for Challenging Chinese Machine Reading Comprehension", - "paper_abstract": "Machine reading comprehension tasks require a machine reader to answer questions relevant to the given document. In this paper, we present the first free-form multiple-Choice Chinese machine reading Comprehension dataset (C^3), containing 13,369 documents (dialogues or more formally written mixed-genre texts) and their associated 19,577 multiple-choice free-form questions collected from Chinese-as-a-second-language examinations. We present a comprehensive analysis of the prior knowledge (i.e., linguistic, domain-specific, and general world knowledge) needed for these real-world problems. We implement rule-based and popular neural methods and find that there is still a significant performance gap between the best performing model (68.5%) and human readers (96.0%), especially on problems that require prior knowledge. We further study the effects of distractor plausibility and data augmentation based on translated relevant datasets for English on model performance. 
We expect C^3 to present great challenges to existing systems as answering 86.8% of questions requires both knowledge within and beyond the accompanying document, and we hope that C^3 can serve as a platform to study how to leverage various kinds of prior knowledge to better understand a given written or orally oriented text. C^3 is available at https://dataset.org/c3/." - }, - "cdsc": { - "pwc_id": "polish-cdscorpus", - "dataset_name": "Polish CDSCorpus Dataset", - "dataset_abstract": "Consists of 10K sentence pairs which are human-annotated for semantic relatedness and entailment. The dataset may be used for the evaluation of compositional distributional semantics models of Polish.", - "paper_name": "Polish evaluation dataset for compositional distributional semantics models", - "paper_abstract": "The paper presents a procedure of building an evaluation dataset. for the validation of compositional distributional semantics models estimated for languages other than English. The procedure generally builds on steps designed to assemble the SICK corpus, which contains pairs of English sentences annotated for semantic relatedness and entailment, because we aim at building a comparable dataset. However, the implementation of particular building steps significantly differs from the original SICK design assumptions, which is caused by both lack of necessary extraneous resources for an investigated language and the need for language-specific transformation rules. The designed procedure is verified on Polish, a fusional language with a relatively free word order, and contributes to building a Polish evaluation dataset. The resource consists of 10K sentence pairs which are human-annotated for semantic relatedness and entailment. The dataset may be used for the evaluation of compositional distributional semantics models of Polish." - }, - "deal_or_no_dialog": { - "pwc_id": "negotiation-dialogues-dataset", - "dataset_name": "Negotiation Dialogues Dataset Dataset", - "dataset_abstract": "This dataset consists of 5808 dialogues, based on 2236 unique scenarios. Each dialogue is converted into two training examples in the dataset, showing the complete conversation from the perspective of each agent. The perspectives differ on their input goals, output choice, and in special tokens marking whether a statement was read or written.", - "paper_name": "Hierarchical Text Generation and Planning for Strategic Dialogue", - "paper_abstract": "End-to-end models for goal-orientated dialogue are challenging to train,\nbecause linguistic and strategic aspects are entangled in latent state vectors.\nWe introduce an approach to learning representations of messages in dialogues\nby maximizing the likelihood of subsequent sentences and actions, which\ndecouples the semantics of the dialogue utterance from its linguistic\nrealization. We then use these latent sentence representations for hierarchical\nlanguage generation, planning and reinforcement learning. Experiments show that\nour approach increases the end-task reward achieved by the model, improves the\neffectiveness of long-term planning using rollouts, and allows self-play\nreinforcement learning to improve decision making without diverging from human\nlanguage. Our hierarchical latent-variable model outperforms previous work both\nlinguistically and strategically." 
- }, - "conceptual_12m": { - "pwc_id": "cc12m", - "dataset_name": "CC12M Dataset", - "dataset_abstract": "Conceptual 12M (CC12M) is a dataset with 12 million image-text pairs specifically meant to be used for vision-and-language pre-training.", - "paper_name": "Conceptual 12M: Pushing Web-Scale Image-Text Pre-Training To Recognize Long-Tail Visual Concepts", - "paper_abstract": "The availability of large-scale image captioning and visual question answering datasets has contributed significantly to recent successes in vision-and-language pre-training. However, these datasets are often collected with overrestrictive requirements inherited from their original target tasks (e.g., image caption generation), which limit the resulting dataset scale and diversity. We take a step further in pushing the limits of vision-and-language pre-training data by relaxing the data collection pipeline used in Conceptual Captions 3M (CC3M) [Sharma et al. 2018] and introduce the Conceptual 12M (CC12M), a dataset with 12 million image-text pairs specifically meant to be used for vision-and-language pre-training. We perform an analysis of this dataset and benchmark its effectiveness against CC3M on multiple downstream tasks with an emphasis on long-tail visual recognition. Our results clearly illustrate the benefit of scaling up pre-training data for vision-and-language tasks, as indicated by the new state-of-the-art results on both the nocaps and Conceptual Captions benchmarks." - }, - "igbo_english_machine_translation": { - "pwc_id": "igbonlp-datasets", - "dataset_name": "IgboNLP Datasets Dataset", - "dataset_abstract": "IgboNLP is a standard machine translation benchmark dataset for Igbo. It consists of 10,000 English-Igbo human-level quality sentence pairs mostly from the news domain.", - "paper_name": "Igbo-English Machine Translation: An Evaluation Benchmark", - "paper_abstract": "Although researchers and practitioners are pushing the boundaries and enhancing the capacities of NLP tools and methods, works on African languages are lagging. A lot of focus on well resourced languages such as English, Japanese, German, French, Russian, Mandarin Chinese etc. Over 97% of the world's 7000 languages, including African languages, are low resourced for NLP i.e. they have little or no data, tools, and techniques for NLP research. For instance, only 5 out of 2965, 0.19% authors of full text papers in the ACL Anthology extracted from the 5 major conferences in 2018 ACL, NAACL, EMNLP, COLING and CoNLL, are affiliated to African institutions. In this work, we discuss our effort toward building a standard machine translation benchmark dataset for Igbo, one of the 3 major Nigerian languages. Igbo is spoken by more than 50 million people globally with over 50% of the speakers are in southeastern Nigeria. Igbo is low resourced although there have been some efforts toward developing IgboNLP such as part of speech tagging and diacritic restoration" - }, - "squad_it": { - "pwc_id": "squad-it", - "dataset_name": "SQuAD-it Dataset", - "dataset_abstract": "SQuAD-it is derived from the SQuAD dataset and it is obtained through semi-automatic translation of the SQuAD dataset into Italian. It represents a large-scale dataset for open question answering processes on factoid questions in Italian. 
The dataset contains more than 60,000 question/answer pairs derived from the original English dataset.", - "paper_name": "", - "paper_abstract": "" - }, - "chr_en": { - "pwc_id": "chren", - "dataset_name": "ChrEn Dataset", - "dataset_abstract": "Cherokee-English Parallel Dataset is a low-resource dataset of 14,151 pairs of sentences with around\n313K English tokens and 206K Cherokee tokens. The parallel corpus is accompanied by a monolingual Cherokee dataset of 5,120 sentences. Both datasets are mostly derived from Cherokee monolingual books.", - "paper_name": "ChrEn: Cherokee-English Machine Translation for Endangered Language Revitalization", - "paper_abstract": "Cherokee is a highly endangered Native American language spoken by the Cherokee people. The Cherokee culture is deeply embedded in its language. However, there are approximately only 2,000 fluent first language Cherokee speakers remaining in the world, and the number is declining every year. To help save this endangered language, we introduce ChrEn, a Cherokee-English parallel dataset, to facilitate machine translation research between Cherokee and English. Compared to some popular machine translation language pairs, ChrEn is extremely low-resource, only containing 14k sentence pairs in total. We split our parallel data in ways that facilitate both in-domain and out-of-domain evaluation. We also collect 5k Cherokee monolingual data to enable semi-supervised learning. Besides these datasets, we propose several Cherokee-English and English-Cherokee machine translation systems. We compare SMT (phrase-based) versus NMT (RNN-based and Transformer-based) systems; supervised versus semi-supervised (via language model, back-translation, and BERT/Multilingual-BERT) methods; as well as transfer learning versus multilingual joint training with 4 other languages. Our best results are 15.8/12.7 BLEU for in-domain and 6.5/5.0 BLEU for out-of-domain Chr-En/EnChr translations, respectively, and we hope that our dataset and systems will encourage future work by the community for Cherokee language revitalization. Our data, code, and demo will be publicly available at https://github.com/ZhangShiyue/ChrEn" - }, - "aquamuse": { - "pwc_id": "aquamuse", - "dataset_name": "aquamuse Dataset", - "dataset_abstract": "5,519 query-based summaries, each associated with an average of 6 input documents selected from an index of 355M documents from Common Crawl.", - "paper_name": "AQuaMuSe: Automatically Generating Datasets for Query-Based Multi-Document Summarization", - "paper_abstract": "Summarization is the task of compressing source document(s) into coherent and succinct passages. This is a valuable tool to present users with concise and accurate sketch of the top ranked documents related to their queries. Query-based multi-document summarization (qMDS) addresses this pervasive need, but the research is severely limited due to lack of training and evaluation datasets as existing single-document and multi-document summarization datasets are inadequate in form and scale. We propose a scalable approach called AQuaMuSe to automatically mine qMDS examples from question answering datasets and large document corpora. Our approach is unique in the sense that it can general a dual dataset -- for extractive and abstractive summaries both. We publicly release a specific instance of an AQuaMuSe dataset with 5,519 query-based summaries, each associated with an average of 6 input documents selected from an index of 355M documents from Common Crawl. 
Extensive evaluation of the dataset along with baseline summarization model experiments are provided." - }, - "conll2000": { - "pwc_id": "conll-2000-1", - "dataset_name": "CoNLL-2000 Dataset", - "dataset_abstract": "CoNLL-2000 is a dataset for dividing text into syntactically related non-overlapping groups of words, so-called text chunking.", - "paper_name": "", - "paper_abstract": "" - }, - "coached_conv_pref": { - "pwc_id": "coached-conversational-preference-elicitation", - "dataset_name": "Coached Conversational Preference Elicitation Dataset", - "dataset_abstract": "Coached Conversational Preference Elicitation is a dataset consisting of 502 English dialogs with 12,000 annotated utterances between a user and an assistant discussing movie preferences in natural language. It was collected using a Wizard-of-Oz methodology between two paid crowd-workers, where one worker plays the role of an 'assistant', while the other plays the role of a 'user'.", - "paper_name": "Coached Conversational Preference Elicitation: A Case Study in Understanding Movie Preferences", - "paper_abstract": "Conversational recommendation has recently attracted significant attention. As systems must understand users{'} preferences, training them has called for conversational corpora, typically derived from task-oriented conversations. We observe that such corpora often do not reflect how people naturally describe preferences. We present a new approach to obtaining user preferences in dialogue: Coached Conversational Preference Elicitation. It allows collection of natural yet structured conversational preferences. Studying the dialogues in one domain, we present a brief quantitative analysis of how people describe movie preferences at scale. Demonstrating the methodology, we release the CCPE-M dataset to the community with over 500 movie preference dialogues expressing over 10,000 preferences." - }, - "hate_offensive": { - "pwc_id": "hate-speech-and-offensive-language", - "dataset_name": "Hate Speech and Offensive Language Dataset", - "dataset_abstract": "HSOL is a dataset for hate speech detection. The authors begun with a hate speech lexicon containing words and\nphrases identified by internet users as hate speech, compiled by Hatebase.org. Using the Twitter API they searched\nfor tweets containing terms from the lexicon, resulting in a sample of tweets from 33,458 Twitter users. They extracted\nthe time-line for each user, resulting in a set of 85.4 million tweets. From this corpus they took a random sample of 25k tweets containing terms from the lexicon and had them manually coded by CrowdFlower (CF) workers. Workers were asked to label each tweet as one of three categories: hate speech, offensive but not hate speech, or neither offensive nor hate speech.", - "paper_name": "Automated Hate Speech Detection and the Problem of Offensive Language", - "paper_abstract": "A key challenge for automatic hate-speech detection on social media is the\nseparation of hate speech from other instances of offensive language. Lexical\ndetection methods tend to have low precision because they classify all messages\ncontaining particular terms as hate speech and previous work using supervised\nlearning has failed to distinguish between the two categories. We used a\ncrowd-sourced hate speech lexicon to collect tweets containing hate speech\nkeywords. We use crowd-sourcing to label a sample of these tweets into three\ncategories: those containing hate speech, only offensive language, and those\nwith neither. 
We train a multi-class classifier to distinguish between these\ndifferent categories. Close analysis of the predictions and the errors shows\nwhen we can reliably separate hate speech from other offensive language and\nwhen this differentiation is more difficult. We find that racist and homophobic\ntweets are more likely to be classified as hate speech but that sexist tweets\nare generally classified as offensive. Tweets without explicit hate keywords\nare also more difficult to classify." - }, - "counter": { - "pwc_id": "counter", - "dataset_name": "COUNTER Dataset", - "dataset_abstract": "The COUNTER (COrpus of Urdu News TExt Reuse) corpus contains 600 source-derived document pairs collected from the field of journalism. It can be used to evaluate mono-lingual text reuse detection systems in general and specifically for Urdu language.\n\nThe corpus has 600 source and 600 derived documents. It contains in total 275,387 words (tokens), 21,426 unique words and 10,841 sentences. It has been manually annotated at document level with three levels of reuse: wholly derived (135), partially derived (288) and non derived (177).", - "paper_name": "", - "paper_abstract": "" - }, - "pubmed": { - "pwc_id": "pubmed", - "dataset_name": "Pubmed Dataset", - "dataset_abstract": "The Pubmed dataset consists of 19717 scientific publications from PubMed database pertaining to diabetes classified into one of three classes. The citation network consists of 44338 links. Each publication in the dataset is described by a TF/IDF weighted word vector from a dictionary which consists of 500 unique words.", - "paper_name": "", - "paper_abstract": "" - }, - "cppe-5": { - "pwc_id": "cppe-5", - "dataset_name": "CPPE-5 Dataset", - "dataset_abstract": "CPPE - 5 (Medical Personal Protective Equipment) is a new challenging dataset with the goal to allow the study of subordinate categorization of medical personal protective equipments, which is not possible with other popular data sets that focus on broad level categories.\n\nSome features of this dataset are:\n\n\nhigh quality images and annotations (~4.6 bounding boxes per image)\nreal-life images unlike any current such dataset\nmajority of non-iconic images (allowing easy deployment to real-world environments)", - "paper_name": "CPPE-5: Medical Personal Protective Equipment Dataset", - "paper_abstract": "We present a new challenging dataset, CPPE - 5 (Medical Personal Protective Equipment), with the goal to allow the study of subordinate categorization of medical personal protective equipments, which is not possible with other popular data sets that focus on broad level categories (such as PASCAL VOC, ImageNet, Microsoft COCO, OpenImages, etc). To make it easy for models trained on this dataset to be used in practical scenarios in complex scenes, our dataset mainly contains images that show complex scenes with several objects in each scene in their natural context. The image collection for this dataset focusing on: obtaining as many non-iconic images as possible and making sure all the images are real-life images unlike other existing datasets in this area. Our dataset includes 5 object categories (coveralls, face shield, gloves, mask, and goggles) and each image is annotated with a set of bounding boxes and positive labels. We present a detailed analysis of the dataset in comparison to other popular broad category datasets as well as datasets focusing on personal protective equipments, we also find that at present there exist no such publicly available datasets. 
Finally we also analyze performance and compare model complexities on baseline and state-of-the-art models for bounding box results. Our code, data, and trained models are available at https://git.io/cppe5-dataset ." - }, - "ubuntu_dialogs_corpus": { - "pwc_id": "ubuntu-dialogue-corpus", - "dataset_name": "UDC Dataset", - "dataset_abstract": "Ubuntu Dialogue Corpus (UDC) is a dataset containing almost 1 million multi-turn dialogues, with a total of over 7 million utterances and 100 million words. This provides a unique resource for research into building dialogue managers based on neural language models that can make use of large amounts of unlabeled data. The dataset has both the multi-turn property of conversations in the Dialog State Tracking Challenge datasets, and the unstructured nature of interactions from microblog services such as Twitter.", - "paper_name": "The Ubuntu Dialogue Corpus: A Large Dataset for Research in Unstructured Multi-Turn Dialogue Systems", - "paper_abstract": "This paper introduces the Ubuntu Dialogue Corpus, a dataset containing almost\n1 million multi-turn dialogues, with a total of over 7 million utterances and\n100 million words. This provides a unique resource for research into building\ndialogue managers based on neural language models that can make use of large\namounts of unlabeled data. The dataset has both the multi-turn property of\nconversations in the Dialog State Tracking Challenge datasets, and the\nunstructured nature of interactions from microblog services such as Twitter. We\nalso describe two neural learning architectures suitable for analyzing this\ndataset, and provide benchmark performance on the task of selecting the best\nnext response." - }, - "blog_authorship_corpus": { - "pwc_id": "blog-authorship-corpus", - "dataset_name": "Blog Authorship Corpus Dataset", - "dataset_abstract": "The Blog Authorship Corpus consists of the collected posts of 19,320 bloggers gathered from blogger.com in August 2004. The corpus incorporates a total of 681,288 posts and over 140 million words - or approximately 35 posts and 7250 words per person.", - "paper_name": "", - "paper_abstract": "" - }, - "recipe_nlg": { - "pwc_id": "recipenlg", - "dataset_name": "RecipeNLG Dataset", - "dataset_abstract": "", - "paper_name": "RecipeNLG: A Cooking Recipes Dataset for Semi-Structured Text Generation", - "paper_abstract": "Semi-structured text generation is a non-trivial problem. Although last years have brought lots of improvements in natural language generation, thanks to the development of neural models trained on large scale datasets, these approaches still struggle with producing structured, context- and commonsense-aware texts. Moreover, it is not clear how to evaluate the quality of generated texts. To address these problems, we introduce RecipeNLG - a novel dataset of cooking recipes. We discuss the data collection process and the relation between the semi-structured texts and cooking recipes. We use the dataset to approach the problem of generating recipes. Finally, we make use of multiple metrics to evaluate the generated recipes." 
- }, - "hebrew_sentiment": { - "pwc_id": "modern-hebrew-sentiment-dataset", - "dataset_name": "Modern Hebrew Sentiment Dataset Dataset", - "dataset_abstract": "Modern Hebrew Sentiment Dataset is a sentiment analysis benchmark for Hebrew, based on 12K social media comments, and provide two instances of these data: in token-based and morpheme-based settings.", - "paper_name": "Representations and Architectures in Neural Sentiment Analysis for Morphologically Rich Languages: A Case Study from Modern Hebrew", - "paper_abstract": "This paper empirically studies the effects of representation choices on neural sentiment analysis for Modern Hebrew, a morphologically rich language (MRL) for which no sentiment analyzer currently exists. We study two dimensions of representational choices: (i) the granularity of the input signal (token-based vs. morpheme-based), and (ii) the level of encoding of vocabulary items (string-based vs. character-based). We hypothesise that for MRLs, languages where multiple meaning-bearing elements may be carried by a single space-delimited token, these choices will have measurable effects on task perfromance, and that these effects may vary for different architectural designs {---} fully-connected, convolutional or recurrent. Specifically, we hypothesize that morpheme-based representations will have advantages in terms of their generalization capacity and task accuracy, due to their better OOV coverage. To empirically study these effects, we develop a new sentiment analysis benchmark for Hebrew, based on 12K social media comments, and provide two instances of these data: in token-based and morpheme-based settings. Our experiments show that representation choices empirical effects vary with architecture type. While fully-connected and convolutional networks slightly prefer token-based settings, RNNs benefit from a morpheme-based representation, in accord with the hypothesis that explicit morphological information may help generalize. Our endeavour also delivers the first state-of-the-art broad-coverage sentiment analyzer for Hebrew, with over 89{\\%} accuracy, alongside an established benchmark to further study the effects of linguistic representation choices on neural networks{'} task performance." - }, - "dbrd": { - "pwc_id": "dbrd", - "dataset_name": "DBRD Dataset", - "dataset_abstract": "The DBRD (pronounced dee-bird) dataset contains over 110k book reviews along with associated binary sentiment polarity labels. It is greatly influenced by the Large Movie Review Dataset and intended as a benchmark for sentiment classification in Dutch.", - "paper_name": "", - "paper_abstract": "" - }, - "dart": { - "pwc_id": "dart", - "dataset_name": "DART Dataset", - "dataset_abstract": "DART is a large dataset for open-domain structured data record to text generation. DART consists of 82,191 examples across different domains with each input being a semantic RDF triple set derived from data records in tables and the tree ontology of the schema, annotated with sentence descriptions that cover all facts in the triple set.", - "paper_name": "DART: Open-Domain Structured Data Record to Text Generation", - "paper_abstract": "We present DART, an open domain structured DAta Record to Text generation dataset with over 82k instances (DARTs). Data-to-Text annotations can be a costly process, especially when dealing with tables which are the major source of structured data and contain nontrivial structures. 
To this end, we propose a procedure of extracting semantic triples from tables that encodes their structures by exploiting the semantic dependencies among table headers and the table title. Our dataset construction framework effectively merged heterogeneous sources from open domain semantic parsing and dialogue-act-based meaning representation tasks by utilizing techniques such as: tree ontology annotation, question-answer pair to declarative sentence conversion, and predicate unification, all with minimum post-editing. We present systematic evaluation on DART as well as new state-of-the-art results on WebNLG 2017 to show that DART (1) poses new challenges to existing data-to-text datasets and (2) facilitates out-of-domain generalization. Our data and code can be found at https://github.com/Yale-LILY/dart." - }, - "taskmaster1": { - "pwc_id": "taskmaster-1", - "dataset_name": "Taskmaster-1 Dataset", - "dataset_abstract": "Taskmaster-1 is a dialog dataset consisting of 13,215 task-based dialogs in English, including 5,507 spoken and 7,708 written dialogs created with two distinct procedures. Each conversation falls into one of six domains: ordering pizza, creating auto repair appointments, setting up ride service, ordering movie tickets, ordering coffee drinks and making restaurant reservations.", - "paper_name": "Taskmaster-1: Toward a Realistic and Diverse Dialog Dataset", - "paper_abstract": "A significant barrier to progress in data-driven approaches to building dialog systems is the lack of high quality, goal-oriented conversational data. To help satisfy this elementary requirement, we introduce the initial release of the Taskmaster-1 dataset which includes 13,215 task-based dialogs comprising six domains. Two procedures were used to create this collection, each with unique advantages. The first involves a two-person, spoken \"Wizard of Oz\" (WOz) approach in which trained agents and crowdsourced workers interact to complete the task while the second is \"self-dialog\" in which crowdsourced workers write the entire dialog themselves. We do not restrict the workers to detailed scripts or to a small knowledge base and hence we observe that our dataset contains more realistic and diverse conversations in comparison to existing datasets. We offer several baseline models including state of the art neural seq2seq architectures with benchmark performance as well as qualitative human evaluations. Dialogs are labeled with API calls and arguments, a simple and cost effective approach which avoids the requirement of complex annotation schema. The layer of abstraction between the dialog model and the service provider API allows for a given model to interact with multiple services that provide similar functionally. Finally, the dataset will evoke interest in written vs. spoken language, discourse patterns, error handling and other linguistic phenomena related to dialog system research, development and design." - }, - "multi_nli_mismatch": { - "pwc_id": "multinli", - "dataset_name": "MultiNLI Dataset", - "dataset_abstract": "The Multi-Genre Natural Language Inference (MultiNLI) dataset has 433K sentence pairs. Its size and mode of collection are modeled closely like SNLI. MultiNLI offers ten distinct genres (Face-to-face, Telephone, 9/11, Travel, Letters, Oxford University Press, Slate, Verbatim, Goverment and Fiction) of written and spoken English data. 
There are matched dev/test sets which are derived from the same sources as those in the training set, and mismatched sets which do not closely resemble any seen at training time.", - "paper_name": "A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference", - "paper_abstract": "This paper introduces the Multi-Genre Natural Language Inference (MultiNLI)\ncorpus, a dataset designed for use in the development and evaluation of machine\nlearning models for sentence understanding. In addition to being one of the\nlargest corpora available for the task of NLI, at 433k examples, this corpus\nimproves upon available resources in its coverage: it offers data from ten\ndistinct genres of written and spoken English--making it possible to evaluate\nsystems on nearly the full complexity of the language--and it offers an\nexplicit setting for the evaluation of cross-genre domain adaptation." - }, - "wiki_movies": { - "pwc_id": "wikimovies", - "dataset_name": "WikiMovies Dataset", - "dataset_abstract": "WikiMovies is a dataset for question answering for movies content. It contains ~100k questions in the movie domain, and was designed to be answerable by using either a perfect KB (based on OMDb),", - "paper_name": "Key-Value Memory Networks for Directly Reading Documents", - "paper_abstract": "Directly reading documents and being able to answer questions from them is an\nunsolved challenge. To avoid its inherent difficulty, question answering (QA)\nhas been directed towards using Knowledge Bases (KBs) instead, which has proven\neffective. Unfortunately KBs often suffer from being too restrictive, as the\nschema cannot support certain types of answers, and too sparse, e.g. Wikipedia\ncontains much more information than Freebase. In this work we introduce a new\nmethod, Key-Value Memory Networks, that makes reading documents more viable by\nutilizing different encodings in the addressing and output stages of the memory\nread operation. To compare using KBs, information extraction or Wikipedia\ndocuments directly in a single framework we construct an analysis tool,\nWikiMovies, a QA dataset that contains raw text alongside a preprocessed KB, in\nthe domain of movies. Our method reduces the gap between all three settings. It\nalso achieves state-of-the-art results on the existing WikiQA benchmark." - }, - "orange_sum": { - "pwc_id": "orangesum", - "dataset_name": "OrangeSum Dataset", - "dataset_abstract": "OrangeSum is a single-document extreme summarization dataset with two tasks: title and abstract. 
Ground truth summaries are 11.42 and 32.12 words in length on average for the title and abstract tasks respectively, while document sizes are 315 and 350 words.\n\nThe motivation for OrangeSum was to put together a French equivalent of the XSum dataset.\n\nUnlike the historical CNN, DailyMail, and NY Times datasets, OrangeSum requires the models to display a high degree of abstractivity to perform well.\nOrangeSum was created by scraping articles and their titles and abstracts from the Orange Actu website.\n\nScraped pages cover almost a decade from Feb 2011 to Sep 2020, and belong to five main categories: France, world, politics, automotive, and society.\nThe society category is itself divided into 8 subcategories: health, environment, people, culture, media, high-tech, unusual (\"insolite\" in French), and miscellaneous.\n\nThe dataset is publicly available at: https://github.com/Tixierae/OrangeSum.", - "paper_name": "BARThez: a Skilled Pretrained French Sequence-to-Sequence Model", - "paper_abstract": "Inductive transfer learning has taken the entire NLP field by storm, with models such as BERT and BART setting new state of the art on countless NLU tasks. However, most of the available models and research have been conducted for English. In this work, we introduce BARThez, the first large-scale pretrained seq2seq model for French. Being based on BART, BARThez is particularly well-suited for generative tasks. We evaluate BARThez on five discriminative tasks from the FLUE benchmark and two generative tasks from a novel summarization dataset, OrangeSum, that we created for this research. We show BARThez to be very competitive with state-of-the-art BERT-based French language models such as CamemBERT and FlauBERT. We also continue the pretraining of a multilingual BART on BARThez' corpus, and show our resulting model, mBARThez, to significantly boost BARThez' generative performance. Code, data and models are publicly available." - }, - "simple_questions_v2": { - "pwc_id": "simplequestions", - "dataset_name": "SimpleQuestions Dataset", - "dataset_abstract": "SimpleQuestions is a large-scale factoid question answering dataset. It consists of 108,442 natural language questions, each paired with a corresponding fact from the Freebase knowledge base. Each fact is a triple (subject, relation, object) and the answer to the question is always the object. The dataset is divided into training, validation, and test sets with 75,910, 10,845 and 21,687 questions respectively.", - "paper_name": "Large-scale Simple Question Answering with Memory Networks", - "paper_abstract": "Training large-scale question answering systems is complicated because\ntraining sources usually cover a small portion of the range of possible\nquestions. This paper studies the impact of multitask and transfer learning for\nsimple question answering; a setting for which the reasoning required to answer\nis quite easy, as long as one can retrieve the correct evidence given a\nquestion, which can be difficult in large-scale conditions. To this end, we\nintroduce a new dataset of 100k questions that we use in conjunction with\nexisting benchmarks. We conduct our study within the framework of Memory\nNetworks (Weston et al., 2015) because this perspective allows us to eventually\nscale up to more complex reasoning, and show that Memory Networks can be\nsuccessfully trained to achieve excellent performance."
- }, - "indonli": { - "pwc_id": "indonli", - "dataset_name": "IndoNLI Dataset", - "dataset_abstract": "IndoNLI is the first human-elicited NLI dataset for Indonesian consisting of nearly 18K sentence pairs annotated by crowd workers and experts.", - "paper_name": "IndoNLI: A Natural Language Inference Dataset for Indonesian", - "paper_abstract": "We present IndoNLI, the first human-elicited NLI dataset for Indonesian. We adapt the data collection protocol for MNLI and collect nearly 18K sentence pairs annotated by crowd workers and experts. The expert-annotated data is used exclusively as a test set. It is designed to provide a challenging test-bed for Indonesian NLI by explicitly incorporating various linguistic phenomena such as numerical reasoning, structural changes, idioms, or temporal and spatial reasoning. Experiment results show that XLM-R outperforms other pre-trained models in our data. The best performance on the expert-annotated data is still far below human performance (13.4% accuracy gap), suggesting that this test set is especially challenging. Furthermore, our analysis shows that our expert-annotated data is more diverse and contains fewer annotation artifacts than the crowd-annotated data. We hope this dataset can help accelerate progress in Indonesian NLP research." - }, - "multi_booked": { - "pwc_id": "multibooked", - "dataset_name": "MultiBooked Dataset", - "dataset_abstract": "MultiBooked is a dataset for supervised aspect-level sentiment analysis in Basque and Catalan, both of which are under-resourced languages.", - "paper_name": "MultiBooked: A Corpus of Basque and Catalan Hotel Reviews Annotated for Aspect-level Sentiment Classification", - "paper_abstract": "While sentiment analysis has become an established field in the NLP\ncommunity, research into languages other than English has been hindered by the\nlack of resources. Although much research in multi-lingual and cross-lingual\nsentiment analysis has focused on unsupervised or semi-supervised approaches,\nthese still require a large number of resources and do not reach the\nperformance of supervised approaches. With this in mind, we introduce two\ndatasets for supervised aspect-level sentiment analysis in Basque and Catalan,\nboth of which are under-resourced languages. We provide high-quality\nannotations and benchmarks with the hope that they will be useful to the\ngrowing community of researchers working on these languages." - }, - "dengue_filipino": { - "pwc_id": "dengue", - "dataset_name": "Dengue Dataset", - "dataset_abstract": "Benchmark dataset for low-resource multiclass classification, with 4,015 training, 500 testing, and 500 validation examples, each labeled as part of five classes. Each sample can be a part of multiple classes. Collected as tweets and originally used in Livelo & Cheng (2018).", - "paper_name": "", - "paper_abstract": "" - }, - "med_hop": { - "pwc_id": "medhop", - "dataset_name": "MedHop Dataset", - "dataset_abstract": "With the same format as WikiHop, the MedHop dataset is based on research paper abstracts from PubMed, and the queries are about interactions between pairs of drugs. The correct answer has to be inferred by combining information from a chain of reactions of drugs and proteins.", - "paper_name": "Constructing Datasets for Multi-hop Reading Comprehension Across Documents", - "paper_abstract": "Most Reading Comprehension methods limit themselves to queries which can be\nanswered using a single sentence, paragraph, or document. 
Enabling models to\ncombine disjoint pieces of textual evidence would extend the scope of machine\ncomprehension methods, but currently there exist no resources to train and test\nthis capability. We propose a novel task to encourage the development of models\nfor text understanding across multiple documents and to investigate the limits\nof existing methods. In our task, a model learns to seek and combine evidence -\neffectively performing multi-hop (alias multi-step) inference. We devise a\nmethodology to produce datasets for this task, given a collection of\nquery-answer pairs and thematically linked documents. Two datasets from\ndifferent domains are induced, and we identify potential pitfalls and devise\ncircumvention strategies. We evaluate two previously proposed competitive\nmodels and find that one can integrate information across documents. However,\nboth models struggle to select relevant information, as providing documents\nguaranteed to be relevant greatly improves their performance. While the models\noutperform several strong baselines, their best accuracy reaches 42.9% compared\nto human performance at 74.0% - leaving ample room for improvement." - }, - "eth_py150_open": { - "pwc_id": "eth-py150-open", - "dataset_name": "ETH Py150 Open Dataset", - "dataset_abstract": "A massive, deduplicated corpus of 7.4M Python files from GitHub.", - "paper_name": "Learning and Evaluating Contextual Embedding of Source Code", - "paper_abstract": "Recent research has achieved impressive results on understanding and improving source code by building up on machine-learning techniques developed for natural languages. A significant advancement in natural-language understanding has come with the development of pre-trained contextual embeddings, such as BERT, which can be fine-tuned for downstream tasks with less labeled data and training budget, while achieving better accuracies. However, there is no attempt yet to obtain a high-quality contextual embedding of source code, and to evaluate it on multiple program-understanding tasks simultaneously; that is the gap that this paper aims to mitigate. Specifically, first, we curate a massive, deduplicated corpus of 7.4M Python files from GitHub, which we use to pre-train CuBERT, an open-sourced code-understanding BERT model; and, second, we create an open-sourced benchmark that comprises five classification tasks and one program-repair task, akin to code-understanding tasks proposed in the literature before. We fine-tune CuBERT on our benchmark tasks, and compare the resulting models to different variants of Word2Vec token embeddings, BiLSTM and Transformer models, as well as published state-of-the-art models, showing that CuBERT outperforms them all, even with shorter training, and with fewer labeled examples. Future work on source-code embedding can benefit from reusing our benchmark, and from comparing against CuBERT models as a strong baseline." - }, - "peer_read": { - "pwc_id": "peerread", - "dataset_name": "PeerRead Dataset", - "dataset_abstract": "PeerRead is a dataset of scientific peer reviews available to help researchers study this important artifact.
The dataset consists of over 14K paper drafts and the corresponding accept/reject decisions in top-tier venues including ACL, NIPS and ICLR, as well as over 10K textual peer reviews written by experts for a subset of the papers.", - "paper_name": "A Dataset of Peer Reviews (PeerRead): Collection, Insights and NLP Applications", - "paper_abstract": "Peer reviewing is a central component in the scientific publishing process.\nWe present the first public dataset of scientific peer reviews available for\nresearch purposes (PeerRead v1) providing an opportunity to study this\nimportant artifact. The dataset consists of 14.7K paper drafts and the\ncorresponding accept/reject decisions in top-tier venues including ACL, NIPS\nand ICLR. The dataset also includes 10.7K textual peer reviews written by\nexperts for a subset of the papers. We describe the data collection process and\nreport interesting observed phenomena in the peer reviews. We also propose two\nnovel NLP tasks based on this dataset and provide simple baseline models. In\nthe first task, we show that simple models can predict whether a paper is\naccepted with up to 21% error reduction compared to the majority baseline. In\nthe second task, we predict the numerical scores of review aspects and show\nthat simple models can outperform the mean baseline for aspects with high\nvariance such as 'originality' and 'impact'." - }, - "scb_mt_enth_2020": { - "pwc_id": "scb-mt-en-th-2020", - "dataset_name": "scb-mt-en-th-2020 Dataset", - "dataset_abstract": "scb-mt-en-th-2020 is an English-Thai machine translation dataset with over 1 million segment pairs, curated from various sources, namely news, Wikipedia articles, SMS messages, task-based dialogs, web-crawled data and government documents.", - "paper_name": "scb-mt-en-th-2020: A Large English-Thai Parallel Corpus", - "paper_abstract": "The primary objective of our work is to build a large-scale English-Thai dataset for machine translation. We construct an English-Thai machine translation dataset with over 1 million segment pairs, curated from various sources, namely news, Wikipedia articles, SMS messages, task-based dialogs, web-crawled data and government documents. Methodology for gathering data, building parallel texts and removing noisy sentence pairs are presented in a reproducible manner. We train machine translation models based on this dataset. Our models' performance are comparable to that of Google Translation API (as of May 2020) for Thai-English and outperform Google when the Open Parallel Corpus (OPUS) is included in the training data for both Thai-English and English-Thai translation. The dataset, pre-trained models, and source code to reproduce our work are available for public use." - }, - "com_qa": { - "pwc_id": "comqa", - "dataset_name": "ComQA Dataset", - "dataset_abstract": "ComQA is a large dataset of real user questions that exhibit different challenging aspects such as compositionality, temporal reasoning, and comparisons. 
ComQA questions come from the WikiAnswers community QA platform, which typically contains questions that are not satisfactorily answerable by existing search engine technology.", - "paper_name": "ComQA: A Community-sourced Dataset for Complex Factoid Question Answering with Paraphrase Clusters", - "paper_abstract": "To bridge the gap between the capabilities of the state-of-the-art in factoid\nquestion answering (QA) and what users ask, we need large datasets of real user\nquestions that capture the various question phenomena users are interested in,\nand the diverse ways in which these questions are formulated. We introduce\nComQA, a large dataset of real user questions that exhibit different\nchallenging aspects such as compositionality, temporal reasoning, and\ncomparisons. ComQA questions come from the WikiAnswers community QA platform,\nwhich typically contains questions that are not satisfactorily answerable by\nexisting search engine technology. Through a large crowdsourcing effort, we\nclean the question dataset, group questions into paraphrase clusters, and\nannotate clusters with their answers. ComQA contains 11,214 questions grouped\ninto 4,834 paraphrase clusters. We detail the process of constructing ComQA,\nincluding the measures taken to ensure its high quality while making effective\nuse of crowdsourcing. We also present an extensive analysis of the dataset and\nthe results achieved by state-of-the-art systems on ComQA, demonstrating that\nour dataset can be a driver of future research on QA." - }, - "cail2018": { - "pwc_id": "chinese-ai-and-law-cail-2018", - "dataset_name": "Chinese AI and Law (CAIL) 2018 Dataset", - "dataset_abstract": "Large-scale Chinese legal dataset for judgment prediction. \\dataset contains more than 2.6 million criminal cases published by the Supreme People's Court of China, which are several times larger than other datasets in existing works on judgment prediction.", - "paper_name": "CAIL2018: A Large-Scale Legal Dataset for Judgment Prediction", - "paper_abstract": "In this paper, we introduce the \\textbf{C}hinese \\textbf{AI} and \\textbf{L}aw\nchallenge dataset (CAIL2018), the first large-scale Chinese legal dataset for\njudgment prediction. \\dataset contains more than $2.6$ million criminal cases\npublished by the Supreme People's Court of China, which are several times\nlarger than other datasets in existing works on judgment prediction. Moreover,\nthe annotations of judgment results are more detailed and rich. It consists of\napplicable law articles, charges, and prison terms, which are expected to be\ninferred according to the fact descriptions of cases. For comparison, we\nimplement several conventional text classification baselines for judgment\nprediction and experimental results show that it is still a challenge for\ncurrent models to predict the judgment results of legal cases, especially on\nprison terms. To help the researchers make improvements on legal judgment\nprediction, both \\dataset and baselines will be released after the CAIL\ncompetition\\footnote{http://cail.cipsc.org.cn/}." - }, - "inquisitive_qg": { - "pwc_id": "inquisitive", - "dataset_name": "INQUISITIVE Dataset", - "dataset_abstract": "A dataset of ~19K questions that are elicited while a person is reading through a document.", - "paper_name": "Inquisitive Question Generation for High Level Text Comprehension", - "paper_abstract": "Inquisitive probing questions come naturally to humans in a variety of settings, but is a challenging task for automatic systems. 
One natural type of question to ask tries to fill a gap in knowledge during text comprehension, like reading a news article: we might ask about background information, deeper reasons behind things occurring, or more. Despite recent progress with data-driven approaches, generating such questions is beyond the range of models trained on existing datasets. We introduce INQUISITIVE, a dataset of ~19K questions that are elicited while a person is reading through a document. Compared to existing datasets, INQUISITIVE questions target more towards high-level (semantic and discourse) comprehension of text. We show that readers engage in a series of pragmatic strategies to seek information. Finally, we evaluate question generation models based on GPT-2 and show that our model is able to generate reasonable questions although the task is challenging, and highlight the importance of context to generate INQUISITIVE questions." - }, - "tuple_ie": { - "pwc_id": "tupleinf-open-ie-dataset", - "dataset_name": "TupleInf Open IE Dataset Dataset", - "dataset_abstract": "The TupleInf Open IE dataset contains Open IE tuples extracted from 263K sentences that were used by the solver in \u201cAnswering Complex Questions Using Open Information Extraction\u201d (referred as Tuple KB, T). These sentences were collected from a large Web corpus using training questions from 4th and 8th grade as queries. This dataset contains 156K sentences collected for 4th grade questions and 107K sentences for 8th grade questions. Each sentence is followed by the Open IE v4 tuples using their simple format.", - "paper_name": "Answering Complex Questions Using Open Information Extraction", - "paper_abstract": "While there has been substantial progress in factoid question-answering (QA),\nanswering complex questions remains challenging, typically requiring both a\nlarge body of knowledge and inference techniques. Open Information Extraction\n(Open IE) provides a way to generate semi-structured knowledge for QA, but to\ndate such knowledge has only been used to answer simple questions with\nretrieval-based methods. We overcome this limitation by presenting a method for\nreasoning with Open IE knowledge, allowing more complex questions to be\nhandled. Using a recently proposed support graph optimization framework for QA,\nwe develop a new inference model for Open IE, in particular one that can work\neffectively with multiple short facts, noise, and the relational structure of\ntuples. Our model significantly outperforms a state-of-the-art structured\nsolver on complex questions of varying difficulty, while also removing the\nreliance on manually curated knowledge." - }, - "wi_locness": { - "pwc_id": "locness-corpus", - "dataset_name": "WI-LOCNESS Dataset", - "dataset_abstract": "WI-LOCNESS is part of the Building Educational Applications 2019 Shared Task for Grammatical Error Correction. It consists of two datasets:\n\n\nLOCNESS: is a corpus consisting of essays written by native English students. \nCambridge English Write & Improve (W&I): Write & Improve (Yannakoudakis et al., 2018) is an online web platform that assists non-native English students with their writing. Specifically, students from around the world submit letters, stories, articles and essays in response to various prompts, and the W&I system provides instant feedback. 
Since W&I went live in 2014, W&I annotators have manually annotated some of these submissions and assigned them a CEFR level.", - "paper_name": "The BEA-2019 Shared Task on Grammatical Error Correction", - "paper_abstract": "This paper reports on the BEA-2019 Shared Task on Grammatical Error Correction (GEC). As with the CoNLL-2014 shared task, participants are required to correct all types of errors in test data. One of the main contributions of the BEA-2019 shared task is the introduction of a new dataset, the Write{\\&}Improve+LOCNESS corpus, which represents a wider range of native and learner English levels and abilities. Another contribution is the introduction of tracks, which control the amount of annotated data available to participants. Systems are evaluated in terms of ERRANT F{\\_}0.5, which allows us to report a much wider range of performance statistics. The competition was hosted on Codalab and remains open for further submissions on the blind test set." - }, - "casino": { - "pwc_id": "casino", - "dataset_name": "CaSiNo Dataset", - "dataset_abstract": "CaSiNo is a dataset of 1030 negotiation dialogues in English. To create the dataset, two participants take the role of campsite neighbors and negotiate for Food, Water, and Firewood packages, based on their individual preferences and requirements. This design keeps the task tractable, while still facilitating linguistically rich and personal conversations.", - "paper_name": "CaSiNo: A Corpus of Campsite Negotiation Dialogues for Automatic Negotiation Systems", - "paper_abstract": "Automated systems that negotiate with humans have broad applications in pedagogy and conversational AI. To advance the development of practical negotiation systems, we present CaSiNo: a novel corpus of over a thousand negotiation dialogues in English. Participants take the role of campsite neighbors and negotiate for food, water, and firewood packages for their upcoming trip. Our design results in diverse and linguistically rich negotiations while maintaining a tractable, closed-domain environment. Inspired by the literature in human-human negotiations, we annotate persuasion strategies and perform correlation analysis to understand how the dialogue behaviors are associated with the negotiation performance. We further propose and evaluate a multi-task framework to recognize these strategies in a given utterance. We find that multi-task learning substantially improves the performance for all strategy labels, especially for the ones that are the most skewed. We release the dataset, annotations, and the code to propel future work in human-machine negotiations: https://github.com/kushalchawla/CaSiNo" - }, - "finer": { - "pwc_id": "finer", - "dataset_name": "Finer Dataset", - "dataset_abstract": "Finnish News Corpus for Named Entity Recognition (Finer) is a corpus that consists of 953 articles (193,742 word tokens) with six named entity classes (organization, location, person, product, event, and date). The articles are extracted from the archives of Digitoday, a Finnish online technology news source.", - "paper_name": "A Finnish News Corpus for Named Entity Recognition", - "paper_abstract": "We present a corpus of Finnish news articles with a manually prepared named entity annotation. The corpus consists of 953 articles (193,742 word tokens) with six named entity classes (organization, location, person, product, event, and date). The articles are extracted from the archives of Digitoday, a Finnish online technology news source.
The corpus is available for research purposes. We present baseline experiments on the corpus using a rule-based and two deep learning systems on two, in-domain and out-of-domain, test sets." - }, - "rvl_cdip": { - "pwc_id": "rvl-cdip", - "dataset_name": "RVL-CDIP Dataset", - "dataset_abstract": "The RVL-CDIP dataset consists of scanned document images belonging to 16 classes such as letter, form, email, resume, memo, etc. The dataset has 320,000 training, 40,000 validation and 40,000 test images. The images are characterized by low quality, noise, and low resolution, typically 100 dpi.", - "paper_name": "Evaluation of Deep Convolutional Nets for Document Image Classification and Retrieval", - "paper_abstract": "This paper presents a new state-of-the-art for document image classification\nand retrieval, using features learned by deep convolutional neural networks\n(CNNs). In object and scene analysis, deep neural nets are capable of learning\na hierarchical chain of abstraction from pixel inputs to concise and\ndescriptive representations. The current work explores this capacity in the\nrealm of document analysis, and confirms that this representation strategy is\nsuperior to a variety of popular hand-crafted alternatives. Experiments also\nshow that (i) features extracted from CNNs are robust to compression, (ii) CNNs\ntrained on non-document images transfer well to document analysis tasks, and\n(iii) enforcing region-specific feature-learning is unnecessary given\nsufficient training data. This work also makes available a new labelled subset\nof the IIT-CDIP collection, containing 400,000 document images across 16\ncategories, useful for training new CNNs for document analysis." - }, - "gap": { - "pwc_id": "gap", - "dataset_name": "GAP Dataset", - "dataset_abstract": "GAP is a graph processing benchmark suite with the goal of helping to standardize graph processing evaluations. Fewer differences between graph processing evaluations will make it easier to compare different research efforts and quantify improvements. The benchmark not only specifies graph kernels, input graphs, and evaluation methodologies, but it also provides optimized baseline implementations. These baseline implementations are representative of state-of-the-art performance, and thus new contributions should outperform them to demonstrate an improvement. The input graphs are sized appropriately for shared memory platforms, but any implementation on any platform that conforms to the benchmark's specifications could be compared. This benchmark suite can be used in a variety of settings. Graph framework developers can demonstrate the generality of their programming model by implementing all of the benchmark's kernels and delivering competitive performance on all of the benchmark's graphs. Algorithm designers can use the input graphs and the baseline implementations to demonstrate their contribution. Platform designers and performance analysts can use the suite as a workload representative of graph processing.", - "paper_name": "The GAP Benchmark Suite", - "paper_abstract": "We present a graph processing benchmark suite with the goal of helping to\nstandardize graph processing evaluations. Fewer differences between graph\nprocessing evaluations will make it easier to compare different research\nefforts and quantify improvements. The benchmark not only specifies graph\nkernels, input graphs, and evaluation methodologies, but it also provides\noptimized baseline implementations. 
These baseline implementations are\nrepresentative of state-of-the-art performance, and thus new contributions\nshould outperform them to demonstrate an improvement.\n The input graphs are sized appropriately for shared memory platforms, but any\nimplementation on any platform that conforms to the benchmark's specifications\ncould be compared. This benchmark suite can be used in a variety of settings.\nGraph framework developers can demonstrate the generality of their programming\nmodel by implementing all of the benchmark's kernels and delivering competitive\nperformance on all of the benchmark's graphs. Algorithm designers can use the\ninput graphs and the baseline implementations to demonstrate their\ncontribution. Platform designers and performance analysts can use the suite as\na workload representative of graph processing." - }, - "capes": { - "pwc_id": "capes", - "dataset_name": "capes Dataset", - "dataset_abstract": "Approximately 240,000 documents were collected and aligned using the Hunalign tool.", - "paper_name": "A Parallel Corpus of Theses and Dissertations Abstracts", - "paper_abstract": "In Brazil, the governmental body responsible for overseeing and coordinating post-graduate programs, CAPES, keeps records of all theses and dissertations presented in the country. Information regarding such documents can be accessed online in the Theses and Dissertations Catalog (TDC), which contains abstracts in Portuguese and English, and additional metadata. Thus, this database can be a potential source of parallel corpora for the Portuguese and English languages. In this article, we present the development of a parallel corpus from TDC, which is made available by CAPES under the open data initiative. Approximately 240,000 documents were collected and aligned using the Hunalign tool. We demonstrate the capability of our developed corpus by training Statistical Machine Translation (SMT) and Neural Machine Translation (NMT) models for both language directions, followed by a comparison with Google Translate (GT). Both translation models presented better BLEU scores than GT, with NMT system being the most accurate one. Sentence alignment was also manually evaluated, presenting an average of 82.30% correctly aligned sentences. Our parallel corpus is freely available in TMX format, with complementary information regarding document metadata" - }, - "bn_hate_speech": { - "pwc_id": "bengali-hate-speech", - "dataset_name": "Bengali Hate Speech Dataset", - "dataset_abstract": "Introduces three datasets of expressing hate, commonly used topics, and opinions for hate speech detection, document classification, and sentiment analysis, respectively.", - "paper_name": "Classification Benchmarks for Under-resourced Bengali Language based on Multichannel Convolutional-LSTM Network", - "paper_abstract": "Exponential growths of social media and micro-blogging sites not only provide platforms for empowering freedom of expressions and individual voices but also enables people to express anti-social behaviour like online harassment, cyberbullying, and hate speech. Numerous works have been proposed to utilize these data for social and anti-social behaviours analysis, document characterization, and sentiment analysis by predicting the contexts mostly for highly resourced languages such as English. However, there are languages that are under-resources, e.g., South Asian languages like Bengali, Tamil, Assamese, Telugu that lack of computational resources for the NLP tasks. 
In this paper, we provide several classification benchmarks for Bengali, an under-resourced language. We prepared three datasets of expressing hate, commonly used topics, and opinions for hate speech detection, document classification, and sentiment analysis, respectively. We built the largest Bengali word embedding models to date based on 250 million articles, which we call BengFastText. We perform three different experiments, covering document classification, sentiment analysis, and hate speech detection. We incorporate word embeddings into a Multichannel Convolutional-LSTM (MConv-LSTM) network for predicting different types of hate speech, document classification, and sentiment analysis. Experiments demonstrate that BengFastText can capture the semantics of words from respective contexts correctly. Evaluations against several baseline embedding models, e.g., Word2Vec and GloVe yield up to 92.30%, 82.25%, and 90.45% F1-scores in case of document classification, sentiment analysis, and hate speech detection, respectively during 5-fold cross-validation tests." - }, - "hkcancor": { - "pwc_id": "hong-kong-cantonese-corpus", - "dataset_name": "Hong Kong Cantonese corpus Dataset", - "dataset_abstract": "The Hong Kong Cantonese Corpus was collected from transcribed conversations that were recorded between March 1997 and August 1998. About 230,000 Chinese words were collected in the annotated corpus. It contains recordings of spontaneous speech (51 texts) and radio programmes (42 texts), which involve 2 to 4 speakers, with 1 text of monologue. The texts were word-segmented, annotated with part-of-speech tagging and Cantonese pronunciation using the romanisation scheme of Linguistic Society of Hong Kong (LSHK).", - "paper_name": "", - "paper_abstract": "" - }, - "offcombr": { - "pwc_id": "offcombr", - "dataset_name": "OffComBR Dataset", - "dataset_abstract": "Offensive comments obtained from a Brazilian website.", - "paper_name": "", - "paper_abstract": "" - }, - "dialog_re": { - "pwc_id": "dialogre", - "dataset_name": "DialogRE Dataset", - "dataset_abstract": "DialogRE is the first human-annotated dialogue-based relation extraction dataset, containing 1,788 dialogues originating from the complete transcripts of a famous American television situation comedy Friends. There are annotations for all occurrences of 36 possible relation types that exist between an argument pair in a dialogue. DialogRE is available in English and Chinese.", - "paper_name": "Dialogue-Based Relation Extraction", - "paper_abstract": "We present the first human-annotated dialogue-based relation extraction (RE) dataset DialogRE, aiming to support the prediction of relation(s) between two arguments that appear in a dialogue. We further offer DialogRE as a platform for studying cross-sentence RE as most facts span multiple sentences. We argue that speaker-related information plays a critical role in the proposed task, based on an analysis of similarities and differences between dialogue-based and traditional RE tasks. Considering the timeliness of communication in a dialogue, we design a new metric to evaluate the performance of RE methods in a conversational setting and investigate the performance of several representative RE methods on DialogRE. Experimental results demonstrate that a speaker-aware extension on the best-performing model leads to gains in both the standard and conversational evaluation settings. DialogRE is available at https://dataset.org/dialogre/."
- }, - "glucose": { - "pwc_id": "glucose", - "dataset_name": "GLUCOSE Dataset", - "dataset_abstract": "GLUCOSE is a large-scale dataset of implicit commonsense causal knowledge, encoded as causal mini-theories about the world, each grounded in a narrative context. To construct GLUCOSE, we drew on cognitive psychology to identify ten dimensions of causal explanation, focusing on events, states, motivations, and emotions. Each GLUCOSE entry includes a story-specific causal statement paired with an inference rule generalized from the statement.", - "paper_name": "GLUCOSE: GeneraLized and COntextualized Story Explanations", - "paper_abstract": "When humans read or listen, they make implicit commonsense inferences that frame their understanding of what happened and why. As a step toward AI systems that can build similar mental models, we introduce GLUCOSE, a large-scale dataset of implicit commonsense causal knowledge, encoded as causal mini-theories about the world, each grounded in a narrative context. To construct GLUCOSE, we drew on cognitive psychology to identify ten dimensions of causal explanation, focusing on events, states, motivations, and emotions. Each GLUCOSE entry includes a story-specific causal statement paired with an inference rule generalized from the statement. This paper details two concrete contributions. First, we present our platform for effectively crowdsourcing GLUCOSE data at scale, which uses semi-structured templates to elicit causal explanations. Using this platform, we collected a total of ~670K specific statements and general rules that capture implicit commonsense knowledge about everyday situations. Second, we show that existing knowledge resources and pretrained language models do not include or readily predict GLUCOSE's rich inferential content. However, when state-of-the-art neural models are trained on this knowledge, they can start to make commonsense inferences on unseen stories that match humans' mental models." - }, - "tweets_ar_en_parallel": { - "pwc_id": "bilingual-corpus-of-arabic-english-parallel", - "dataset_name": "Bilingual Corpus of Arabic-English Parallel Tweets Dataset", - "dataset_abstract": "A bilingual corpus of English-Arabic parallel tweets and a list of Twitter accounts who post English-Arabic tweets regularly.", - "paper_name": "Constructing a Bilingual Corpus of Parallel Tweets", - "paper_abstract": "In a bid to reach a larger and more diverse audience, Twitter users often post parallel tweets{---}tweets that contain the same content but are written in different languages. Parallel tweets can be an important resource for developing machine translation (MT) systems among other natural language processing (NLP) tasks. In this paper, we introduce a generic method for collecting parallel tweets. Using this method, we collect a bilingual corpus of English-Arabic parallel tweets and a list of Twitter accounts who post English-Arabictweets regularly. Since our method is generic, it can also be used for collecting parallel tweets that cover less-resourced languages such as Serbian and Urdu. Additionally, we annotate a subset of Twitter accounts with their countries of origin and topic of interest, which provides insights about the population who post parallel tweets. This latter information can also be useful for author profiling tasks." 
- }, - "lc_quad": { - "pwc_id": "lc-quad-2-0", - "dataset_name": "LC-QuAD 2.0 Dataset", - "dataset_abstract": "LC-QuAD 2.0 is a Large Question Answering dataset with 30,000 pairs of question and its corresponding SPARQL query. The target knowledge base is Wikidata and DBpedia, specifically the 2018 version.", - "paper_name": "", - "paper_abstract": "" - }, - "ronec": { - "pwc_id": "ronec", - "dataset_name": "RONEC Dataset", - "dataset_abstract": "Romanian Named Entity Corpus is a named entity corpus for the Romanian language. The corpus contains over 26000 entities in ~5000 annotated sentences, belonging to 16 distinct classes. The sentences have been extracted from a copy-right free newspaper, covering several styles. This corpus represents the first initiative in the Romanian language space specifically targeted for named entity recognition.", - "paper_name": "Introducing RONEC -- the Romanian Named Entity Corpus", - "paper_abstract": "We present RONEC - the Named Entity Corpus for the Romanian language. The corpus contains over 26000 entities in ~5000 annotated sentences, belonging to 16 distinct classes. The sentences have been extracted from a copy-right free newspaper, covering several styles. This corpus represents the first initiative in the Romanian language space specifically targeted for named entity recognition. It is available in BRAT and CoNLL-U Plus formats, and it is free to use and extend at github.com/dumitrescustefan/ronec ." - }, - "event2Mind": { - "pwc_id": "event2mind", - "dataset_name": "Event2Mind Dataset", - "dataset_abstract": "Event2Mind is a corpus of 25,000 event phrases covering a diverse range of everyday events and situations.", - "paper_name": "Event2Mind: Commonsense Inference on Events, Intents, and Reactions", - "paper_abstract": "We investigate a new commonsense inference task: given an event described in a short free-form text (\"X drinks coffee in the morning\"), a system reasons about the likely intents (\"X wants to stay awake\") and reactions (\"X feels alert\") of the event's participants. To support this study, we construct a new crowdsourced corpus of 25,000 event phrases covering a diverse range of everyday events and situations. We report baseline performance on this task, demonstrating that neural encoder-decoder models can successfully compose embedding representations of previously unseen events and reason about the likely intents and reactions of the event participants. In addition, we demonstrate how commonsense inference on people's intents and reactions can help unveil the implicit gender inequality prevalent in modern movie scripts." - }, - "kor_hate": { - "pwc_id": "korean-hatespeech-dataset", - "dataset_name": "Korean HateSpeech Dataset Dataset", - "dataset_abstract": "Presents 9.4K manually labeled entertainment news comments for identifying Korean toxic speech, collected from a widely used online news platform in Korea.", - "paper_name": "BEEP! Korean Corpus of Online News Comments for Toxic Speech Detection", - "paper_abstract": "Toxic comments in online platforms are an unavoidable social issue under the cloak of anonymity. Hate speech detection has been actively done for languages such as English, German, or Italian, where manually labeled corpus has been released. In this work, we first present 9.4K manually labeled entertainment news comments for identifying Korean toxic speech, collected from a widely used online news platform in Korea. 
The comments are annotated regarding social bias and hate speech since both aspects are correlated. The inter-annotator agreement Krippendorff's alpha score is 0.492 and 0.496, respectively. We provide benchmarks using CharCNN, BiLSTM, and BERT, where BERT achieves the highest score on all tasks. The models generally display better performance on bias identification, since the hate speech detection is a more subjective issue. Additionally, when BERT is trained with bias label for hate speech detection, the prediction score increases, implying that bias and hate are intertwined. We make our dataset publicly available and open competitions with the corpus and benchmarks." - }, - "eitb_parcc": { - "pwc_id": "eitb-parcc", - "dataset_name": "EiTB-ParCC Dataset", - "dataset_abstract": "A large comparable corpus for Basque-Spanish was prepared, on the basis of independently-produced news by the Basque public broadcaster EiTB.", - "paper_name": "Handle with Care: A Case Study in Comparable Corpora Exploitation for Neural Machine Translation", - "paper_abstract": "We present the results of a case study in the exploitation of comparable corpora for Neural Machine Translation. A large comparable corpus for Basque-Spanish was prepared, on the basis of independently-produced news by the Basque public broadcaster EiTB, and we discuss the impact of various techniques to exploit the original data in order to determine optimal variants of the corpus. In particular, we show that filtering in terms of alignment thresholds and length-difference outliers has a significant impact on translation quality. The impact of tags identifying comparable data in the training datasets is also evaluated, with results indicating that this technique might be useful to help the models discriminate noisy information, in the form of informational imbalance between aligned sentences. The final corpus was prepared according to the experimental results and is made available to the scientific community for research purposes." - }, - "brwac": { - "pwc_id": "brwac", - "dataset_name": "BRWAC Dataset", - "dataset_abstract": "Composed by 2.7 billion tokens, and has been annotated with tagging and parsing information.", - "paper_name": "", - "paper_abstract": "" - }, - "sede": { - "pwc_id": "sede", - "dataset_name": "SEDE Dataset", - "dataset_abstract": "SEDE is a dataset comprised of 12,023 complex and diverse SQL queries and their natural language titles and descriptions, written by real users of the Stack Exchange Data Explorer out of a natural interaction. These pairs contain a variety of real-world challenges which were rarely reflected so far in any other semantic parsing dataset. The goal of this dataset is to take a significant step towards evaluation of Text-to-SQL models in a real-world setting. Compared to other Text-to-SQL datasets, SEDE contains at least 10 times more SQL queries templates (queries after canonization and anonymization of values) than other datasets, and has the most diverse set of utterances and SQL queries (in terms of 3-grams) out of all single-domain datasets. 
SEDE introduces real-world challenges, such as under-specification, usage of parameters in queries, dates manipulation and more.", - "paper_name": "Text-to-SQL in the Wild: A Naturally-Occurring Dataset Based on Stack Exchange Data", - "paper_abstract": "Most available semantic parsing datasets, comprising of pairs of natural utterances and logical forms, were collected solely for the purpose of training and evaluation of natural language understanding systems. As a result, they do not contain any of the richness and variety of natural-occurring utterances, where humans ask about data they need or are curious about. In this work, we release SEDE, a dataset with 12,023 pairs of utterances and SQL queries collected from real usage on the Stack Exchange website. We show that these pairs contain a variety of real-world challenges which were rarely reflected so far in any other semantic parsing dataset, propose an evaluation metric based on comparison of partial query clauses that is more suitable for real-world queries, and conduct experiments with strong baselines, showing a large gap between the performance on SEDE compared to other common datasets." - }, - "totto": { - "pwc_id": "totto", - "dataset_name": "ToTTo Dataset", - "dataset_abstract": "ToTTo is an open-domain English table-to-text dataset with over 120,000 training examples that proposes a controlled generation task: given a Wikipedia table and a set of highlighted table cells, produce a one-sentence description.\n\nDuring the dataset creation process, tables from English Wikipedia are matched with (noisy) descriptions. Each table cell mentioned in the description is highlighted and the descriptions are iteratively cleaned and corrected to faithfully reflect the content of the highlighted cells.", - "paper_name": "ToTTo: A Controlled Table-To-Text Generation Dataset", - "paper_abstract": "We present ToTTo, an open-domain English table-to-text dataset with over 120,000 training examples that proposes a controlled generation task: given a Wikipedia table and a set of highlighted table cells, produce a one-sentence description. To obtain generated targets that are natural but also faithful to the source table, we introduce a dataset construction process where annotators directly revise existing candidate sentences from Wikipedia. We present systematic analyses of our dataset and annotation process as well as results achieved by several state-of-the-art baselines. While usually fluent, existing methods often hallucinate phrases that are not supported by the table, suggesting that this dataset can serve as a useful research benchmark for high-precision conditional text generation." - }, - "re_dial": { - "pwc_id": "redial", - "dataset_name": "ReDial Dataset", - "dataset_abstract": "ReDial (Recommendation Dialogues) is an annotated dataset of dialogues, where users recommend movies to each other. The dataset consists of over 10,000 conversations centered around the theme of providing movie recommendations.", - "paper_name": "Towards Deep Conversational Recommendations", - "paper_abstract": "There has been growing interest in using neural networks and deep learning\ntechniques to create dialogue systems. Conversational recommendation is an\ninteresting setting for the scientific exploration of dialogue with natural\nlanguage as the associated discourse involves goal-driven dialogue that often\ntransforms naturally into more free-form chat. This paper provides two\ncontributions. 
First, until now there has been no publicly available\nlarge-scale dataset consisting of real-world dialogues centered around\nrecommendations. To address this issue and to facilitate our exploration here,\nwe have collected ReDial, a dataset consisting of over 10,000 conversations\ncentered around the theme of providing movie recommendations. We make this data\navailable to the community for further research. Second, we use this dataset to\nexplore multiple facets of conversational recommendations. In particular we\nexplore new neural architectures, mechanisms, and methods suitable for\ncomposing conversational recommendation systems. Our dataset allows us to\nsystematically probe model sub-components addressing different parts of the\noverall problem domain ranging from: sentiment analysis and cold-start\nrecommendation generation to detailed aspects of how natural language is used\nin this setting in the real world. We combine such sub-components into a\nfull-blown dialogue system and examine its behavior." - }, - "narrativeqa_manual": { - "pwc_id": "narrativeqa", - "dataset_name": "NarrativeQA Dataset", - "dataset_abstract": "The NarrativeQA dataset includes a list of documents with Wikipedia summaries, links to full stories, and questions and answers.", - "paper_name": "The NarrativeQA Reading Comprehension Challenge", - "paper_abstract": "Reading comprehension (RC)---in contrast to information retrieval---requires\nintegrating information and reasoning about events, entities, and their\nrelations across a full document. Question answering is conventionally used to\nassess RC ability, in both artificial agents and children learning to read.\nHowever, existing RC datasets and tasks are dominated by questions that can be\nsolved by selecting answers using superficial information (e.g., local context\nsimilarity or global term frequency); they thus fail to test for the essential\nintegrative aspect of RC. To encourage progress on deeper comprehension of\nlanguage, we present a new dataset and set of tasks in which the reader must\nanswer questions about stories by reading entire books or movie scripts. These\ntasks are designed so that successfully answering their questions requires\nunderstanding the underlying narrative rather than relying on shallow pattern\nmatching or salience. We show that although humans solve the tasks easily,\nstandard RC models struggle on the tasks presented here. We provide an analysis\nof the dataset and the challenges it presents." - }, - "cornell_movie_dialog": { - "pwc_id": "cornell-movie-dialogs-corpus", - "dataset_name": "Cornell Movie-Dialogs Corpus Dataset", - "dataset_abstract": "This corpus contains a large metadata-rich collection of fictional conversations extracted from raw movie scripts:\n\n\n220,579 conversational exchanges between 10,292 pairs of movie characters\ninvolves 9,035 characters from 617 movies\nin total 304,713 utterances\nmovie metadata included:\ngenres\nrelease year\nIMDB rating\nnumber of IMDB votes\nIMDB rating\n\n\ncharacter metadata included:\ngender (for 3,774 characters)\nposition on movie credits (3,321 characters)", - "paper_name": "", - "paper_abstract": "" - }, - "linnaeus": { - "pwc_id": "linnaeus", - "dataset_name": "LINNAEUS Dataset", - "dataset_abstract": "LINNAEUS is a general-purpose dictionary matching software, capable of processing multiple types of document formats in the biomedical domain (MEDLINE, PMC, BMC, OTMI, text, etc.). 
It can produce multiple types of output (XML, HTML, tab-separated-value file, or save to a database). It also contains methods for acting as a server (including load balancing across several servers), allowing clients to request matching over a network. A package with files for recognizing and identifying species names is available for LINNAEUS, showing 94% recall and 97% precision compared to LINNAEUS-species-corpus.", - "paper_name": "", - "paper_abstract": "" - }, - "coarse_discourse": { - "pwc_id": "coarse-discourse", - "dataset_name": "Coarse Discourse Dataset", - "dataset_abstract": "A large corpus of discourse annotations and relations on ~10K forum threads.", - "paper_name": "", - "paper_abstract": "" - }, - "hind_encorp": { - "pwc_id": "hindencorp", - "dataset_name": "HindEnCorp Dataset", - "dataset_abstract": "A parallel corpus of Hindi and English, and HindMonoCorp, a monolingual corpus of Hindi in their release version 0.5. Both corpora were collected from web sources and preprocessed primarily for the training of statistical machine translation systems. HindEnCorp consists of 274k parallel sentences (3.9 million Hindi and 3.8 million English tokens). HindMonoCorp amounts to 787 million tokens in 44 million sentences.", - "paper_name": "HindEnCorp - Hindi-English and Hindi-only Corpus for Machine Translation", - "paper_abstract": "We present HindEnCorp, a parallel corpus of Hindi and English, and HindMonoCorp, a monolingual corpus of Hindi in their release version 0.5. Both corpora were collected from web sources and preprocessed primarily for the training of statistical machine translation systems. HindEnCorp consists of 274k parallel sentences (3.9 million Hindi and 3.8 million English tokens). HindMonoCorp amounts to 787 million tokens in 44 million sentences. Both the corpora are freely available for non-commercial research and their preliminary release has been used by numerous participants of the WMT 2014 shared translation task." - }, - "newsph_nli": { - "pwc_id": "newsph-nli", - "dataset_name": "NewsPH-NLI Dataset", - "dataset_abstract": "NewsPH-NLI is a sentence entailment benchmark dataset in the low-resource Filipino language.", - "paper_name": "Exploiting News Article Structure for Automatic Corpus Generation of Entailment Datasets", - "paper_abstract": "Transformers represent the state-of-the-art in Natural Language Processing (NLP) in recent years, proving effective even in tasks done in low-resource languages. While pretrained transformers for these languages can be made, it is challenging to measure their true performance and capacity due to the lack of hard benchmark datasets, as well as the difficulty and cost of producing them. In this paper, we present three contributions: First, we propose a methodology for automatically producing Natural Language Inference (NLI) benchmark datasets for low-resource languages using published news articles. Through this, we create and release NewsPH-NLI, the first sentence entailment benchmark dataset in the low-resource Filipino language. Second, we produce new pretrained transformers based on the ELECTRA technique to further alleviate the resource scarcity in Filipino, benchmarking them on our dataset against other commonly-used transfer learning techniques. Lastly, we perform analyses on transfer learning techniques to shed light on their true performance when operating in low-data domains through the use of degradation tests." 
- }, - "has_part": { - "pwc_id": "haspart-kb", - "dataset_name": "hasPart KB Dataset", - "dataset_abstract": "This dataset is a new knowledge-base (KB) of hasPart relationships, extracted from a large corpus of generic statements. Complementary to other resources available, it is the first which is all three of: accurate (90% precision), salient (covers relationships a person may mention), and has high coverage of common terms (approximated as within a 10 year old\u2019s vocabulary), as well as having several times more hasPart entries than in the popular ontologies ConceptNet and WordNet. In addition, it contains information about quantifiers, argument modifiers, and links the entities to appropriate concepts in Wikipedia and WordNet.", - "paper_name": "Do Dogs have Whiskers? A New Knowledge Base of hasPart Relations", - "paper_abstract": "We present a new knowledge-base of hasPart relationships, extracted from a large corpus of generic statements. Complementary to other resources available, it is the first which is all three of: accurate (90% precision), salient (covers relationships a person may mention), and has high coverage of common terms (approximated as within a 10 year old's vocabulary), as well as having several times more hasPart entries than in the popular ontologies ConceptNet and WordNet. In addition, it contains information about quantifiers, argument modifiers, and links the entities to appropriate concepts in Wikipedia and WordNet. The knowledge base is available at https://allenai.org/data/haspartkb" - }, - "vctk": { - "pwc_id": "vctk", - "dataset_name": "VCTK Dataset", - "dataset_abstract": "This CSTR VCTK Corpus includes speech data uttered by 110 English speakers with various accents. Each speaker reads out about 400 sentences, which were selected from a newspaper, the rainbow passage and an elicitation paragraph used for the speech accent archive. The newspaper texts were taken from Herald Glasgow, with permission from Herald & Times Group. Each speaker has a different set of the newspaper texts selected based a greedy algorithm that increases the contextual and phonetic coverage. The details of the text selection algorithms are described in the following paper: C. Veaux, J. Yamagishi and S. King, \"The voice bank corpus: Design, collection and data analysis of a large regional accent speech database,\" https://doi.org/10.1109/ICSDA.2013.6709856. The rainbow passage and elicitation paragraph are the same for all speakers. The rainbow passage can be found at International Dialects of English Archive: (http://web.ku.edu/~idea/readings/rainbow.htm). The elicitation paragraph is identical to the one used for the speech accent archive (http://accent.gmu.edu). The details of the the speech accent archive can be found at http://www.ualberta.ca/~aacl2009/PDFs/WeinbergerKunath2009AACL.pdf. All speech data was recorded using an identical recording setup: an omni-directional microphone (DPA 4035) and a small diaphragm condenser microphone with very wide bandwidth (Sennheiser MKH 800), 96kHz sampling frequency at 24 bits and in a hemi-anechoic chamber of the University of Edinburgh. (However, two speakers, p280 and p315 had technical issues of the audio recordings using MKH 800). 
All recordings were converted into 16 bits, were downsampled to 48 kHz, and were manually end-pointed.", - "paper_name": "", - "paper_abstract": "" - }, - "fake_news_filipino": { - "pwc_id": "fake-news-filipino-dataset", - "dataset_name": "Fake News Filipino Dataset Dataset", - "dataset_abstract": "Expertly-curated benchmark dataset for fake news detection in Filipino.", - "paper_name": "Localization of Fake News Detection via Multitask Transfer Learning", - "paper_abstract": "The use of the internet as a fast medium of spreading fake news reinforces the need for computational tools that combat it. Techniques that train fake news classifiers exist, but they all assume an abundance of resources including large labeled datasets and expert-curated corpora, which low-resource languages may not have. In this work, we make two main contributions: First, we alleviate resource scarcity by constructing the first expertly-curated benchmark dataset for fake news detection in Filipino, which we call \"Fake News Filipino.\" Second, we benchmark Transfer Learning (TL) techniques and show that they can be used to train robust fake news classifiers from little data, achieving 91% accuracy on our fake news dataset, reducing the error by 14% compared to established few-shot baselines. Furthermore, lifting ideas from multitask learning, we show that augmenting transformer-based transfer techniques with auxiliary language modeling losses improves their performance by adapting to writing style. Using this, we improve TL performance by 4-6%, achieving an accuracy of 96% on our best model. Lastly, we show that our method generalizes well to different types of news articles, including political news, entertainment news, and opinion articles." - }, - "xor_tydi_qa": { - "pwc_id": "xor-tydi-qa", - "dataset_name": "XOR-TYDI QA Dataset", - "dataset_abstract": "A large-scale dataset built on questions from TyDi QA lacking same-language answers.", - "paper_name": "XOR QA: Cross-lingual Open-Retrieval Question Answering", - "paper_abstract": "Multilingual question answering tasks typically assume answers exist in the same language as the question. Yet in practice, many languages face both information scarcity -- where languages have few reference articles -- and information asymmetry -- where questions reference concepts from other cultures. This work extends open-retrieval question answering to a cross-lingual setting enabling questions from one language to be answered via answer content from another language. We construct a large-scale dataset built on questions from TyDi QA lacking same-language answers. Our task formulation, called Cross-lingual Open Retrieval Question Answering (XOR QA), includes 40k information-seeking questions from across 7 diverse non-English languages. Based on this dataset, we introduce three new tasks that involve cross-lingual document retrieval using multi-lingual and English resources. We establish baselines with state-of-the-art machine translation systems and cross-lingual pretrained models. Experimental results suggest that XOR QA is a challenging task that will facilitate the development of novel techniques for multilingual question answering. Our data and code are available at https://nlp.cs.washington.edu/xorqa." 
- }, - "metrec": { - "pwc_id": "metrec", - "dataset_name": "AraMeter Dataset", - "dataset_abstract": "A dataset to identify the meters of Arabic poems.", - "paper_name": "", - "paper_abstract": "" - }, - "mutual_friends": { - "pwc_id": "mutualfriends", - "dataset_name": "MutualFriends Dataset", - "dataset_abstract": "In MutualFriends, two agents, A and B, each have a private knowledge base, which contains a list of friends with multiple attributes (e.g., name, school, major, etc.). The agents must chat with each other to find their unique mutual friend.", - "paper_name": "Learning Symmetric Collaborative Dialogue Agents with Dynamic Knowledge Graph Embeddings", - "paper_abstract": "We study a symmetric collaborative dialogue setting in which two agents, each\nwith private knowledge, must strategically communicate to achieve a common\ngoal. The open-ended dialogue state in this setting poses new challenges for\nexisting dialogue systems. We collected a dataset of 11K human-human dialogues,\nwhich exhibits interesting lexical, semantic, and strategic elements. To model\nboth structured knowledge and unstructured language, we propose a neural model\nwith dynamic knowledge graph embeddings that evolve as the dialogue progresses.\nAutomatic and human evaluations show that our model is both more effective at\nachieving the goal and more human-like than baseline neural and rule-based\nmodels." - }, - "told-br": { - "pwc_id": "told-br", - "dataset_name": "ToLD-Br Dataset", - "dataset_abstract": "The Toxic Language Detection for Brazilian Portuguese (ToLD-Br) is a dataset with tweets in Brazilian Portuguese annotated according to different toxic aspects.", - "paper_name": "Toxic Language Detection in Social Media for Brazilian Portuguese: New Dataset and Multilingual Analysis", - "paper_abstract": "Hate speech and toxic comments are a common concern of social media platform users. Although these comments are, fortunately, the minority in these platforms, they are still capable of causing harm. Therefore, identifying these comments is an important task for studying and preventing the proliferation of toxicity in social media. Previous work in automatically detecting toxic comments focus mainly in English, with very few work in languages like Brazilian Portuguese. In this paper, we propose a new large-scale dataset for Brazilian Portuguese with tweets annotated as either toxic or non-toxic or in different types of toxicity. We present our dataset collection and annotation process, where we aimed to select candidates covering multiple demographic groups. State-of-the-art BERT models were able to achieve 76% macro-F1 score using monolingual data in the binary case. We also show that large-scale monolingual data is still needed to create more accurate models, despite recent advances in multilingual approaches. An error analysis and experiments with multi-label classification show the difficulty of classifying certain types of toxic comments that appear less frequently in our data and highlights the need to develop models that are aware of different categories of toxicity." - }, - "hover": { - "pwc_id": "hover", - "dataset_name": "HoVer Dataset", - "dataset_abstract": "Is a dataset for many-hop evidence extraction and fact verification. It challenges models to extract facts from several Wikipedia articles that are relevant to a claim and classify whether the claim is Supported or Not-Supported by the facts. 
In HoVer, the claims require evidence to be extracted from as many as four English Wikipedia articles and embody reasoning graphs of diverse shapes.", - "paper_name": "HoVer: A Dataset for Many-Hop Fact Extraction And Claim Verification", - "paper_abstract": "We introduce HoVer (HOppy VERification), a dataset for many-hop evidence extraction and fact verification. It challenges models to extract facts from several Wikipedia articles that are relevant to a claim and classify whether the claim is Supported or Not-Supported by the facts. In HoVer, the claims require evidence to be extracted from as many as four English Wikipedia articles and embody reasoning graphs of diverse shapes. Moreover, most of the 3/4-hop claims are written in multiple sentences, which adds to the complexity of understanding long-range dependency relations such as coreference. We show that the performance of an existing state-of-the-art semantic-matching model degrades significantly on our dataset as the number of reasoning hops increases, hence demonstrating the necessity of many-hop reasoning to achieve strong results. We hope that the introduction of this challenging dataset and the accompanying evaluation task will encourage research in many-hop fact retrieval and information verification. We make the HoVer dataset publicly available at https://hover-nlp.github.io" - }, - "hybrid_qa": { - "pwc_id": "hybridqa", - "dataset_name": "HybridQA Dataset", - "dataset_abstract": "A new large-scale question-answering dataset that requires reasoning on heterogeneous information. Each question is aligned with a Wikipedia table and multiple free-form corpora linked with the entities in the table. The questions are designed to aggregate both tabular information and text information, i.e., lack of either form would render the question unanswerable.", - "paper_name": "HybridQA: A Dataset of Multi-Hop Question Answering over Tabular and Textual Data", - "paper_abstract": "Existing question answering datasets focus on dealing with homogeneous information, based either only on text or KB/Table information alone. However, as human knowledge is distributed over heterogeneous forms, using homogeneous information alone might lead to severe coverage problems. To fill in the gap, we present HybridQA https://github.com/wenhuchen/HybridQA, a new large-scale question-answering dataset that requires reasoning on heterogeneous information. Each question is aligned with a Wikipedia table and multiple free-form corpora linked with the entities in the table. The questions are designed to aggregate both tabular information and text information, i.e., lack of either form would render the question unanswerable. We test with three different models: 1) a table-only model. 2) text-only model. 3) a hybrid model that combines heterogeneous information to find the answer. The experimental results show that the EM scores obtained by two baselines are below 20\\%, while the hybrid model can achieve an EM over 40\\%. This gap suggests the necessity to aggregate heterogeneous information in HybridQA. However, the hybrid model's score is still far behind human performance. Hence, HybridQA can serve as a challenging benchmark to study question answering with heterogeneous information." - }, - "gooaq": { - "pwc_id": "gooaq", - "dataset_name": "GooAQ Dataset", - "dataset_abstract": "GooAQ is a large-scale dataset with a variety of answer types. This dataset contains over 5 million questions and 3 million answers collected from Google. 
GooAQ questions are collected semi-automatically from the Google search engine using its autocomplete feature. This results in naturalistic questions of practical interest that are nonetheless short and expressed using simple language. GooAQ answers are mined from Google's responses to the collected questions, specifically from the answer boxes in the search results. This yields a rich space of answer types, containing both textual answers (short and long) as well as more structured ones such as collections.", - "paper_name": "GooAQ: Open Question Answering with Diverse Answer Types", - "paper_abstract": "While day-to-day questions come with a variety of answer types, the current question-answering (QA) literature has failed to adequately address the answer diversity of questions. To this end, we present GooAQ, a large-scale dataset with a variety of answer types. This dataset contains over 5 million questions and 3 million answers collected from Google. GooAQ questions are collected semi-automatically from the Google search engine using its autocomplete feature. This results in naturalistic questions of practical interest that are nonetheless short and expressed using simple language. GooAQ answers are mined from Google's responses to our collected questions, specifically from the answer boxes in the search results. This yields a rich space of answer types, containing both textual answers (short and long) as well as more structured ones such as collections. We benchmarkT5 models on GooAQ and observe that: (a) in line with recent work, LM's strong performance on GooAQ's short-answer questions heavily benefit from annotated data; however, (b) their quality in generating coherent and accurate responses for questions requiring long responses (such as 'how' and 'why' questions) is less reliant on observing annotated data and mainly supported by their pre-training. We release GooAQ to facilitate further research on improving QA with diverse response types." - }, - "prachathai67k": { - "pwc_id": "prachathai-67k", - "dataset_name": "prachathai-67k Dataset", - "dataset_abstract": "The prachathai-67k dataset was scraped from the news site Prachathai excluding articles with less than 500 characters of body text (mostly images and cartoons). It contains 67,889 articles with 51,797 tags from August 24, 2004 to November 15, 2018.", - "paper_name": "", - "paper_abstract": "" - }, - "moroco": { - "pwc_id": "moroco", - "dataset_name": "MOROCO Dataset", - "dataset_abstract": "The MOldavian and ROmanian Dialectal COrpus (MOROCO) is a corpus that contains 33,564 samples of text (with over 10 million tokens) collected from the news domain. The samples belong to one of the following six topics: culture, finance, politics, science, sports and tech. The data set is divided into 21,719 samples for training, 5,921 samples for validation and another 5,924 samples for testing.", - "paper_name": "MOROCO: The Moldavian and Romanian Dialectal Corpus", - "paper_abstract": "In this work, we introduce the MOldavian and ROmanian Dialectal COrpus (MOROCO), which is freely available for download at https://github.com/butnaruandrei/MOROCO. The corpus contains 33564 samples of text (with over 10 million tokens) collected from the news domain. The samples belong to one of the following six topics: culture, finance, politics, science, sports and tech. The data set is divided into 21719 samples for training, 5921 samples for validation and another 5924 samples for testing. 
For each sample, we provide corresponding dialectal and category labels. This allows us to perform empirical studies on several classification tasks such as (i) binary discrimination of Moldavian versus Romanian text samples, (ii) intra-dialect multi-class categorization by topic and (iii) cross-dialect multi-class categorization by topic. We perform experiments using a shallow approach based on string kernels, as well as a novel deep approach based on character-level convolutional neural networks containing Squeeze-and-Excitation blocks. We also present and analyze the most discriminative features of our best performing model, before and after named entity removal." - }, - "x_stance": { - "pwc_id": "x-stance", - "dataset_name": "x-stance Dataset", - "dataset_abstract": "A large-scale stance detection dataset from comments written by candidates of elections in Switzerland. The dataset consists of German, French and Italian text, allowing for a cross-lingual evaluation of stance detection. It contains 67 000 comments on more than 150 political issues (targets).", - "paper_name": "X-Stance: A Multilingual Multi-Target Dataset for Stance Detection", - "paper_abstract": "We extract a large-scale stance detection dataset from comments written by candidates of elections in Switzerland. The dataset consists of German, French and Italian text, allowing for a cross-lingual evaluation of stance detection. It contains 67 000 comments on more than 150 political issues (targets). Unlike stance detection models that have specific target issues, we use the dataset to train a single model on all the issues. To make learning across targets possible, we prepend to each instance a natural question that represents the target (e.g. \"Do you support X?\"). Baseline results from multilingual BERT show that zero-shot cross-lingual and cross-target transfer of stance detection is moderately successful with this approach." - }, - "cawac": { - "pwc_id": "cawac", - "dataset_name": "caWaC Dataset", - "dataset_abstract": "The corpus represents the largest existing corpus of Catalan containing 687 million words, which is a significant increase given that until now the biggest corpus of Catalan, CuCWeb, counts 166 million words.", - "paper_name": "caWaC -- A web corpus of Catalan and its application to language modeling and machine translation", - "paper_abstract": "In this paper we present the construction process of a web corpus of Catalan built from the content of the .cat top-level domain. For collecting and processing data we use the Brno pipeline with the spiderling crawler and its accompanying tools. To the best of our knowledge the corpus represents the largest existing corpus of Catalan containing 687 million words, which is a significant increase given that until now the biggest corpus of Catalan, CuCWeb, counts 166 million words. We evaluate the resulting resource on the tasks of language modeling and statistical machine translation (SMT) by calculating LM perplexity and incorporating the LM in the SMT pipeline. We compare language models trained on different subsets of the resource with those trained on the Catalan Wikipedia and the target side of the parallel data used to train the SMT system." 
- }, - "newsph": { - "pwc_id": "newsph-nli", - "dataset_name": "NewsPH-NLI Dataset", - "dataset_abstract": "NewsPH-NLI is a sentence entailment benchmark dataset in the low-resource Filipino language.", - "paper_name": "Exploiting News Article Structure for Automatic Corpus Generation of Entailment Datasets", - "paper_abstract": "Transformers represent the state-of-the-art in Natural Language Processing (NLP) in recent years, proving effective even in tasks done in low-resource languages. While pretrained transformers for these languages can be made, it is challenging to measure their true performance and capacity due to the lack of hard benchmark datasets, as well as the difficulty and cost of producing them. In this paper, we present three contributions: First, we propose a methodology for automatically producing Natural Language Inference (NLI) benchmark datasets for low-resource languages using published news articles. Through this, we create and release NewsPH-NLI, the first sentence entailment benchmark dataset in the low-resource Filipino language. Second, we produce new pretrained transformers based on the ELECTRA technique to further alleviate the resource scarcity in Filipino, benchmarking them on our dataset against other commonly-used transfer learning techniques. Lastly, we perform analyses on transfer learning techniques to shed light on their true performance when operating in low-data domains through the use of degradation tests." - }, - "labr": { - "pwc_id": "labr", - "dataset_name": "LABR Dataset", - "dataset_abstract": "LABR is a large sentiment analysis dataset to-date for the Arabic language. It consists of over 63,000 book reviews, each rated on a scale of 1 to 5 stars.", - "paper_name": "", - "paper_abstract": "" - }, - "metooma": { - "pwc_id": "metooma", - "dataset_name": "#MeTooMA Dataset", - "dataset_abstract": "The dataset consists of tweets belonging to #MeToo movement on Twitter, labelled into different categories.", - "paper_name": "", - "paper_abstract": "" - }, - "hard": { - "pwc_id": "hard", - "dataset_name": "HARD Dataset", - "dataset_abstract": "The Hotel Arabic-Reviews Dataset (HARD) contains 93700 hotel reviews in Arabic language. The hotel reviews were collected from Booking.com website during June/July 2016. The reviews are expressed in Modern Standard Arabic as well as dialectal Arabic.", - "paper_name": "", - "paper_abstract": "" - }, - "liveqa": { - "pwc_id": "liveqa", - "dataset_name": "LiveQA Dataset", - "dataset_abstract": "A new question answering dataset constructed from play-by-play live broadcast. It contains 117k multiple-choice questions written by human commentators for over 1,670 NBA games, which are collected from the Chinese Hupu (https://nba.hupu.com/games) website.", - "paper_name": "LiveQA: A Question Answering Dataset over Sports Live", - "paper_abstract": "In this paper, we introduce LiveQA, a new question answering dataset constructed from play-by-play live broadcast. It contains 117k multiple-choice questions written by human commentators for over 1,670 NBA games, which are collected from the Chinese Hupu (https://nba.hupu.com/games) website. Derived from the characteristics of sports games, LiveQA can potentially test the reasoning ability across timeline-based live broadcasts, which is challenging compared to the existing datasets. In LiveQA, the questions require understanding the timeline, tracking events or doing mathematical computations. 
Our preliminary experiments show that the dataset introduces a challenging problem for question answering models, and a strong baseline model only achieves the accuracy of 53.1\\% and cannot beat the dominant option rule. We release the code and data of this paper for future research." - }, - "mkqa": { - "pwc_id": "mkqa", - "dataset_name": "MKQA Dataset", - "dataset_abstract": "Multilingual Knowledge Questions and Answers (MKQA) is an open-domain question answering evaluation set comprising 10k question-answer pairs aligned across 26 typologically diverse languages (260k question-answer pairs in total). The goal of this dataset is to provide a challenging benchmark for question answering quality across a wide set of languages. Answers are based on a language-independent data representation, making results comparable across languages and independent of language-specific passages. With 26 languages, this dataset supplies the widest range of languages to-date for evaluating question answering.", - "paper_name": "MKQA: A Linguistically Diverse Benchmark for Multilingual Open Domain Question Answering", - "paper_abstract": "Progress in cross-lingual modeling depends on challenging, realistic, and diverse evaluation sets. We introduce Multilingual Knowledge Questions and Answers (MKQA), an open-domain question answering evaluation set comprising 10k question-answer pairs aligned across 26 typologically diverse languages (260k question-answer pairs in total). Answers are based on a heavily curated, language-independent data representation, making results comparable across languages and independent of language-specific passages. With 26 languages, this dataset supplies the widest range of languages to-date for evaluating question answering. We benchmark a variety of state-of-the-art methods and baselines for generative and extractive question answering, trained on Natural Questions, in zero shot and translation settings. Results indicate this dataset is challenging even in English, but especially in low-resource languages" - }, - "onestop_qa": { - "pwc_id": "onestopqa", - "dataset_name": "OneStopQA Dataset", - "dataset_abstract": "OneStopQA provides an alternative test set for reading comprehension which alleviates these shortcomings and has a substantially higher human ceiling performance.", - "paper_name": "STARC: Structured Annotations for Reading Comprehension", - "paper_abstract": "We present STARC (Structured Annotations for Reading Comprehension), a new annotation framework for assessing reading comprehension with multiple choice questions. Our framework introduces a principled structure for the answer choices and ties them to textual span annotations. The framework is implemented in OneStopQA, a new high-quality dataset for evaluation and analysis of reading comprehension in English. We use this dataset to demonstrate that STARC can be leveraged for a key new application for the development of SAT-like reading comprehension materials: automatic annotation quality probing via span ablation experiments. We further show that it enables in-depth analyses and comparisons between machine and human reading comprehension behavior, including error distributions and guessing ability. Our experiments also reveal that the standard multiple choice dataset in NLP, RACE, is limited in its ability to measure reading comprehension. 47% of its questions can be guessed by machines without accessing the passage, and 18% are unanimously judged by humans as not having a unique correct answer. 
OneStopQA provides an alternative test set for reading comprehension which alleviates these shortcomings and has a substantially higher human ceiling performance." - }, - "opinosis": { - "pwc_id": "opinosis", - "dataset_name": "Opinosis Dataset", - "dataset_abstract": "This dataset contains sentences extracted from user reviews on a given topic. Example topics are \u201cperformance of Toyota Camry\u201d and \u201csound quality of ipod nano\u201d, etc. In total there are 51 such topics with each topic having approximately 100 sentences (on average). The reviews were obtained from various sources \u2013 Tripadvisor (hotels), Edmunds.com (cars) and Amazon.com (various electronics). This dataset was used for the following automatic text summarization project .", - "paper_name": "", - "paper_abstract": "" - }, - "wili_2018": { - "pwc_id": "wili-2018", - "dataset_name": "WiLI-2018 Dataset", - "dataset_abstract": "WiLI-2018 is a benchmark dataset for monolingual written natural language identification. WiLI-2018 is a publicly available, free of charge dataset of short text extracts from Wikipedia. It contains 1000 paragraphs of 235 languages, totaling in 23500 paragraphs. WiLI is a classification dataset: Given an unknown paragraph written in one dominant language, it has to be decided which language it is.", - "paper_name": "The WiLI benchmark dataset for written language identification", - "paper_abstract": "This paper describes the WiLI-2018 benchmark dataset for monolingual written\nnatural language identification. WiLI-2018 is a publicly available, free of\ncharge dataset of short text extracts from Wikipedia. It contains 1000\nparagraphs of 235 languages, totaling in 23500 paragraphs. WiLI is a\nclassification dataset: Given an unknown paragraph written in one dominant\nlanguage, it has to be decided which language it is." - }, - "wider_face": { - "pwc_id": "wider-face-1", - "dataset_name": "WIDER FACE Dataset", - "dataset_abstract": "The WIDER FACE dataset contains 32,203 images and labels 393,703 faces with a high degree of variability in scale, pose and occlusion. The database is split into training (40%), validation (10%) and testing (50%) set. Besides, the images are divided into three levels (Easy \u2286 Medium \u2286 Hard) according to the difficulties of the detection. The images and annotations of training and validation set are available online, while the annotations of testing set are not released and the results are sent to the database server for receiving the precision-recall curves.", - "paper_name": "WIDER FACE: A Face Detection Benchmark", - "paper_abstract": "Face detection is one of the most studied topics in the computer vision\ncommunity. Much of the progresses have been made by the availability of face\ndetection benchmark datasets. We show that there is a gap between current face\ndetection performance and the real world requirements. To facilitate future\nface detection research, we introduce the WIDER FACE dataset, which is 10 times\nlarger than existing datasets. The dataset contains rich annotations, including\nocclusions, poses, event categories, and face bounding boxes. Faces in the\nproposed dataset are extremely challenging due to large variations in scale,\npose and occlusion, as shown in Fig. 1. Furthermore, we show that WIDER FACE\ndataset is an effective training source for face detection. 
We benchmark\nseveral representative detection systems, providing an overview of\nstate-of-the-art performance and propose a solution to deal with large scale\nvariation. Finally, we discuss common failure cases that worth to be further\ninvestigated. Dataset can be downloaded at:\nmmlab.ie.cuhk.edu.hk/projects/WIDERFace" - }, - "wiki_qa_ar": { - "pwc_id": "wikiqaar", - "dataset_name": "WikiQAar Dataset", - "dataset_abstract": "A publicly available set of question and sentence pairs, collected and annotated for research on open-domain question answering.", - "paper_name": "", - "paper_abstract": "" - }, - "wikitext_tl39": { - "pwc_id": "wikitext-tl-39", - "dataset_name": "WikiText-TL-39 Dataset", - "dataset_abstract": "WikiText-TL-39 is a benchmark language modeling dataset in Filipino that has 39 million tokens in the training set.", - "paper_name": "Evaluating Language Model Finetuning Techniques for Low-resource Languages", - "paper_abstract": "Unlike mainstream languages (such as English and French), low-resource languages often suffer from a lack of expert-annotated corpora and benchmark resources that make it hard to apply state-of-the-art techniques directly. In this paper, we alleviate this scarcity problem for the low-resourced Filipino language in two ways. First, we introduce a new benchmark language modeling dataset in Filipino which we call WikiText-TL-39. Second, we show that language model finetuning techniques such as BERT and ULMFiT can be used to consistently train robust classifiers in low-resource settings, experiencing at most a 0.0782 increase in validation error when the number of training examples is decreased from 10K to 1K while finetuning using a privately-held sentiment dataset." - }, - "lm1b": { - "pwc_id": "billion-word-benchmark", - "dataset_name": "Billion Word Benchmark Dataset", - "dataset_abstract": "The One Billion Word dataset is a dataset for language modeling. The training/held-out data was produced from the WMT 2011 News Crawl data using a combination of Bash shell and Perl scripts.", - "paper_name": "One Billion Word Benchmark for Measuring Progress in Statistical Language Modeling", - "paper_abstract": "We propose a new benchmark corpus to be used for measuring progress in\nstatistical language modeling. With almost one billion words of training data,\nwe hope this benchmark will be useful to quickly evaluate novel language\nmodeling techniques, and to compare their contribution when combined with other\nadvanced techniques. We show performance of several well-known types of\nlanguage models, with the best results achieved with a recurrent neural network\nbased language model. The baseline unpruned Kneser-Ney 5-gram model achieves\nperplexity 67.6; a combination of techniques leads to 35% reduction in\nperplexity, or 10% reduction in cross-entropy (bits), over that baseline.\n The benchmark is available as a code.google.com project; besides the scripts\nneeded to rebuild the training/held-out data, it also makes available\nlog-probability values for each word in each of ten held-out data sets, for\neach of the baseline n-gram models." - }, - "s2orc": { - "pwc_id": "s2orc", - "dataset_name": "S2ORC Dataset", - "dataset_abstract": "A large corpus of 81.1M English-language academic papers spanning many academic disciplines. Rich metadata, paper abstracts, resolved bibliographic references, as well as structured full text for 8.1M open access papers. 
Full text annotated with automatically-detected inline mentions of citations, figures, and tables, each linked to their corresponding paper objects. Aggregated papers from hundreds of academic publishers and digital archives into a unified source, and create the largest publicly-available collection of machine-readable academic text to date.", - "paper_name": "S2ORC: The Semantic Scholar Open Research Corpus", - "paper_abstract": "We introduce S2ORC, a large corpus of 81.1M English-language academic papers spanning many academic disciplines. The corpus consists of rich metadata, paper abstracts, resolved bibliographic references, as well as structured full text for 8.1M open access papers. Full text is annotated with automatically-detected inline mentions of citations, figures, and tables, each linked to their corresponding paper objects. In S2ORC, we aggregate papers from hundreds of academic publishers and digital archives into a unified source, and create the largest publicly-available collection of machine-readable academic text to date. We hope this resource will facilitate research and development of tools and tasks for text mining over academic text." - }, - "spanish_billion_words": { - "pwc_id": "sbwce", - "dataset_name": "SBWCE Dataset", - "dataset_abstract": "This resource consists of an unannotated corpus of the Spanish language of nearly 1.5 billion words, compiled from different corpora and resources from the web; and a set of word vectors (or embeddings), created from this corpus using the word2vec algorithm, provided by the gensim package. These embeddings were evaluated by translating to Spanish word2vec\u2019s word relation test set.", - "paper_name": "", - "paper_abstract": "" - }, - "time_dial": { - "pwc_id": "timedial", - "dataset_name": "TimeDial Dataset", - "dataset_abstract": "TimeDial presents a crowdsourced English challenge set, for temporal commonsense reasoning, formulated as a multiple choice cloze task with around 1.5k carefully curated dialogs. The dataset is derived from the DailyDialog, which is a multi-turn dialog corpus.\n\nTimeDial dataset consists of 1,104 dialog instances with 2 correct and 2 incorrect options with the following statistics:", - "paper_name": "TIMEDIAL: Temporal Commonsense Reasoning in Dialog", - "paper_abstract": "Everyday conversations require understanding everyday events, which in turn, requires understanding temporal commonsense concepts interwoven with those events. Despite recent progress with massive pre-trained language models (LMs) such as T5 and GPT-3, their capability of temporal reasoning in dialogs remains largely under-explored. In this paper, we present the first study to investigate pre-trained LMs for their temporal reasoning capabilities in dialogs by introducing a new task and a crowd-sourced English challenge set, TIMEDIAL. We formulate TIME-DIAL as a multiple-choice cloze task with over 1.1K carefully curated dialogs. Empirical results demonstrate that even the best performing models struggle on this task compared to humans, with 23 absolute points of gap in accuracy. Furthermore, our analysis reveals that the models fail to reason about dialog context correctly; instead, they rely on shallow cues based on existing temporal patterns in context, motivating future research for modeling temporal concepts in text and robust contextual reasoning about them. The dataset is publicly available at: https://github.com/google-research-datasets/timedial." 
- }, - "urdu_sentiment_corpus": { - "pwc_id": "urdu-sentiment-corpus", - "dataset_name": "Urdu Sentiment Corpus Dataset", - "dataset_abstract": "Consists of Urdu tweets for the sentiment analysis and polarity detection. The dataset is consisting of tweets, such that it casts a political shadow and presents a competitive environment between two separate political parties versus the government of Pakistan. Overall, the dataset is comprising over 17, 185 tokens with 52% records as positive, and 48% records as negative.", - "paper_name": "", - "paper_abstract": "" - }, - "sharc": { - "pwc_id": "sharc", - "dataset_name": "ShARC Dataset", - "dataset_abstract": "ShARC is a Conversational Question Answering dataset focussing on question answering from texts containing rules.", - "paper_name": "Interpretation of Natural Language Rules in Conversational Machine Reading", - "paper_abstract": "Most work in machine reading focuses on question answering problems where the\nanswer is directly expressed in the text to read. However, many real-world\nquestion answering problems require the reading of text not because it contains\nthe literal answer, but because it contains a recipe to derive an answer\ntogether with the reader's background knowledge. One example is the task of\ninterpreting regulations to answer \"Can I...?\" or \"Do I have to...?\" questions\nsuch as \"I am working in Canada. Do I have to carry on paying UK National\nInsurance?\" after reading a UK government website about this topic. This task\nrequires both the interpretation of rules and the application of background\nknowledge. It is further complicated due to the fact that, in practice, most\nquestions are underspecified, and a human assistant will regularly have to ask\nclarification questions such as \"How long have you been working abroad?\" when\nthe answer cannot be directly derived from the question and text. In this\npaper, we formalise this task and develop a crowd-sourcing strategy to collect\n32k task instances based on real-world rules and crowd-generated questions and\nscenarios. We analyse the challenges of this task and assess its difficulty by\nevaluating the performance of rule-based and machine-learning baselines. We\nobserve promising results when no background knowledge is necessary, and\nsubstantial room for improvement whenever background knowledge is needed." - }, - "GEM/xlsum": { - "pwc_id": "xl-sum", - "dataset_name": "XL-Sum Dataset", - "dataset_abstract": "XL-Sum is a comprehensive and diverse dataset for abstractive summarization comprising 1 million professionally annotated article-summary pairs from BBC, extracted using a set of carefully designed heuristics. The dataset covers 44 languages ranging from low to high-resource, for many of which no public dataset is currently available. XL-Sum is highly abstractive, concise, and of high quality, as indicated by human and intrinsic evaluation.", - "paper_name": "XL-Sum: Large-Scale Multilingual Abstractive Summarization for 44 Languages", - "paper_abstract": "Contemporary works on abstractive text summarization have focused primarily on high-resource languages like English, mostly due to the limited availability of datasets for low/mid-resource ones. In this work, we present XL-Sum, a comprehensive and diverse dataset comprising 1 million professionally annotated article-summary pairs from BBC, extracted using a set of carefully designed heuristics. 
The dataset covers 44 languages ranging from low to high-resource, for many of which no public dataset is currently available. XL-Sum is highly abstractive, concise, and of high quality, as indicated by human and intrinsic evaluation. We fine-tune mT5, a state-of-the-art pretrained multilingual model, with XL-Sum and experiment on multilingual and low-resource summarization tasks. XL-Sum induces competitive results compared to the ones obtained using similar monolingual datasets: we show higher than 11 ROUGE-2 scores on 10 languages we benchmark on, with some of them exceeding 15, as obtained by multilingual training. Additionally, training on low-resource languages individually also provides competitive performance. To the best of our knowledge, XL-Sum is the largest abstractive summarization dataset in terms of the number of samples collected from a single source and the number of languages covered. We are releasing our dataset and models to encourage future research on multilingual abstractive summarization. The resources can be found at \\url{https://github.com/csebuetnlp/xl-sum}." - }, - "refresd": { - "pwc_id": "refresd", - "dataset_name": "REFreSD Dataset", - "dataset_abstract": "Consists of English-French sentence-pairs annotated with semantic divergence classes and token-level rationales.", - "paper_name": "Detecting Fine-Grained Cross-Lingual Semantic Divergences without Supervision by Learning to Rank", - "paper_abstract": "Detecting fine-grained differences in content conveyed in different languages matters for cross-lingual NLP and multilingual corpora analysis, but it is a challenging machine learning problem since annotation is expensive and hard to scale. This work improves the prediction and annotation of fine-grained semantic divergences. We introduce a training strategy for multilingual BERT models by learning to rank synthetic divergent examples of varying granularity. We evaluate our models on the Rationalized English-French Semantic Divergences, a new dataset released with this work, consisting of English-French sentence-pairs annotated with semantic divergence classes and token-level rationales. Learning to rank helps detect fine-grained sentence-level divergences more accurately than a strong sentence-level similarity model, while token-level predictions have the potential of further distinguishing between coarse and fine-grained divergences." - }, - "roman_urdu": { - "pwc_id": "roman-urdu-data-set", - "dataset_name": "Roman Urdu Data Set Dataset", - "dataset_abstract": "Tagged for Sentiment (Positive, Negative, Neutral).", - "paper_name": "", - "paper_abstract": "" - }, - "tsac": { - "pwc_id": "tsac", - "dataset_name": "TSAC Dataset", - "dataset_abstract": "Tunisian Sentiment Analysis Corpus (TSAC) is a Tunisian Dialect corpus of 17.000 comments from Facebook.", - "paper_name": "Sentiment Analysis of Tunisian Dialects: Linguistic Ressources and Experiments", - "paper_abstract": "Dialectal Arabic (DA) is significantly different from the Arabic language taught in schools and used in written communication and formal speech (broadcast news, religion, politics, etc.). There are many existing researches in the field of Arabic language Sentiment Analysis (SA); however, they are generally restricted to Modern Standard Arabic (MSA) or some dialects of economic or political interest. In this paper we are interested in the SA of the Tunisian Dialect. We utilize Machine Learning techniques to determine the polarity of comments written in Tunisian Dialect. 
First, we evaluate the SA systems performances with models trained using freely available MSA and Multi-dialectal data sets. We then collect and annotate a Tunisian Dialect corpus of 17.000 comments from Facebook. This corpus allows us a significant accuracy improvement compared to the best model trained on other Arabic dialects or MSA data. We believe that this first freely available corpus will be valuable to researchers working in the field of Tunisian Sentiment Analysis and similar areas." - }, - "pass": { - "pwc_id": "pass", - "dataset_name": "PASS Dataset", - "dataset_abstract": "PASS is a large-scale image dataset, containing 1.4 million images, that does not include any humans and which can be used for high-quality pretraining while significantly reducing privacy concerns.", - "paper_name": "PASS: An ImageNet replacement for self-supervised pretraining without humans", - "paper_abstract": "Computer vision has long relied on ImageNet and other large datasets of images sampled from the Internet for pretraining models. However, these datasets have ethical and technical shortcomings, such as containing personal information taken without consent, unclear license usage, biases, and, in some cases, even problematic image content. On the other hand, state-of-the-art pretraining is nowadays obtained with unsupervised methods, meaning that labelled datasets such as ImageNet may not be necessary, or perhaps not even optimal, for model pretraining. We thus propose an unlabelled dataset PASS: Pictures without humAns for Self-Supervision. PASS only contains images with CC-BY license and complete attribution metadata, addressing the copyright issue. Most importantly, it contains no images of people at all, and also avoids other types of images that are problematic for data protection or ethics. We show that PASS can be used for pretraining with methods such as MoCo-v2, SwAV and DINO. In the transfer learning setting, it yields similar downstream performances to ImageNet pretraining even on tasks that involve humans, such as human pose estimation. PASS does not make existing datasets obsolete, as for instance it is insufficient for benchmarking. However, it shows that model pretraining is often possible while using safer data, and it also provides the basis for a more robust evaluation of pretraining methods." - }, - "tunizi": { - "pwc_id": "tunizi", - "dataset_name": "TUNIZI Dataset", - "dataset_abstract": "A sentiment analysis Tunisian Arabizi Dataset, collected from social networks, preprocessed for analytical studies and annotated manually by Tunisian native speakers.", - "paper_name": "TUNIZI: a Tunisian Arabizi sentiment analysis Dataset", - "paper_abstract": "On social media, Arabic people tend to express themselves in their own local dialects. More particularly, Tunisians use the informal way called \"Tunisian Arabizi\". Analytical studies seek to explore and recognize online opinions aiming to exploit them for planning and prediction purposes such as measuring the customer satisfaction and establishing sales and marketing strategies. However, analytical studies based on Deep Learning are data hungry. On the other hand, African languages and dialects are considered low resource languages. For instance, to the best of our knowledge, no annotated Tunisian Arabizi dataset exists. In this paper, we introduce TUNIZI a sentiment analysis Tunisian Arabizi Dataset, collected from social networks, preprocessed for analytical studies and annotated manually by Tunisian native speakers." 
- }, - "norec": { - "pwc_id": "norec", - "dataset_name": "NoReC Dataset", - "dataset_abstract": "The Norwegian Review Corpus (NoReC) was created for the purpose of training and evaluating models for document-level sentiment analysis. More than 43,000 full-text reviews have been collected from major Norwegian news sources and cover a range of different domains, including literature, movies, video games, restaurants, music and theater, in addition to product reviews across a range of categories. Each review is labeled with a manually assigned score of 1\u20136, as provided by the rating of the original author.", - "paper_name": "NoReC: The Norwegian Review Corpus", - "paper_abstract": "This paper presents the Norwegian Review Corpus (NoReC), created for training\nand evaluating models for document-level sentiment analysis. The full-text\nreviews have been collected from major Norwegian news sources and cover a range\nof different domains, including literature, movies, video games, restaurants,\nmusic and theater, in addition to product reviews across a range of categories.\nEach review is labeled with a manually assigned score of 1-6, as provided by\nthe rating of the original author. This first release of the corpus comprises\nmore than 35,000 reviews. It is distributed using the CoNLL-U format,\npre-processed using UDPipe, along with a rich set of metadata. The work\nreported in this paper forms part of the SANT initiative (Sentiment Analysis\nfor Norwegian Text), a project seeking to provide resources and tools for\nsentiment analysis and opinion mining for Norwegian. As resources for sentiment\nanalysis have so far been unavailable for Norwegian, NoReC represents a highly\nvaluable and sought-after addition to Norwegian language technology." - }, - "medmcqa": { - "pwc_id": "medmcqa", - "dataset_name": "MedMCQA Dataset", - "dataset_abstract": "MedMCQA is a large-scale, Multiple-Choice Question Answering (MCQA) dataset designed to address real-world medical entrance exam questions.\n\nMedMCQA has more than 194k high-quality AIIMS & NEET PG entrance exam MCQs covering 2.4k healthcare topics and 21 medical subjects are collected with an average token length of 12.77 and high topical diversity.", - "paper_name": "MedMCQA : A Large-scale Multi-Subject Multi-Choice Dataset for Medical domain Question Answering", - "paper_abstract": "This paper introduces MedMCQA, a new large-scale, Multiple-Choice Question Answering (MCQA) dataset designed to address real-world medical entrance exam questions. More than 194k high-quality AIIMS \\& NEET PG entrance exam MCQs covering 2.4k healthcare topics and 21 medical subjects are collected with an average token length of 12.77 and high topical diversity. Each sample contains a question, correct answer(s), and other options which requires a deeper language understanding as it tests the 10+ reasoning abilities of a model across a wide range of medical subjects \\& topics. A detailed explanation of the solution, along with the above information, is provided in this study." - }, - "metashift": { - "pwc_id": "metashift", - "dataset_name": "MetaShift Dataset", - "dataset_abstract": "MetaShift is a collection of 12,868 sets of natural images across 410 classes. 
It can be used to benchmark and evaluate how robust machine learning models are to data shifts.", - "paper_name": "MetaShift: A Dataset of Datasets for Evaluating Contextual Distribution Shifts and Training Conflicts", - "paper_abstract": "Understanding the performance of machine learning models across diverse data distributions is critically important for reliable applications. Motivated by this, there is a growing focus on curating benchmark datasets that capture distribution shifts. While valuable, the existing benchmarks are limited in that many of them only contain a small number of shifts and they lack systematic annotation about what is different across different shifts. We present MetaShift--a collection of 12,868 sets of natural images across 410 classes--to address this challenge. We leverage the natural heterogeneity of Visual Genome and its annotations to construct MetaShift. The key construction idea is to cluster images using its metadata, which provides context for each image (e.g. \"cats with cars\" or \"cats in bathroom\") that represent distinct data distributions. MetaShift has two important benefits: first, it contains orders of magnitude more natural data shifts than previously available. Second, it provides explicit explanations of what is unique about each of its data sets and a distance score that measures the amount of distribution shift between any two of its data sets. We demonstrate the utility of MetaShift in benchmarking several recent proposals for training models to be robust to data shifts. We find that the simple empirical risk minimization performs the best when shifts are moderate and no method had a systematic advantage for large shifts. We also show how MetaShift can help to visualize conflicts between data subsets during model training." - }, - "csebuetnlp/xlsum": { - "pwc_id": "xl-sum", - "dataset_name": "XL-Sum Dataset", - "dataset_abstract": "XL-Sum is a comprehensive and diverse dataset for abstractive summarization comprising 1 million professionally annotated article-summary pairs from BBC, extracted using a set of carefully designed heuristics. The dataset covers 44 languages ranging from low to high-resource, for many of which no public dataset is currently available. XL-Sum is highly abstractive, concise, and of high quality, as indicated by human and intrinsic evaluation.", - "paper_name": "XL-Sum: Large-Scale Multilingual Abstractive Summarization for 44 Languages", - "paper_abstract": "Contemporary works on abstractive text summarization have focused primarily on high-resource languages like English, mostly due to the limited availability of datasets for low/mid-resource ones. In this work, we present XL-Sum, a comprehensive and diverse dataset comprising 1 million professionally annotated article-summary pairs from BBC, extracted using a set of carefully designed heuristics. The dataset covers 44 languages ranging from low to high-resource, for many of which no public dataset is currently available. XL-Sum is highly abstractive, concise, and of high quality, as indicated by human and intrinsic evaluation. We fine-tune mT5, a state-of-the-art pretrained multilingual model, with XL-Sum and experiment on multilingual and low-resource summarization tasks. XL-Sum induces competitive results compared to the ones obtained using similar monolingual datasets: we show higher than 11 ROUGE-2 scores on 10 languages we benchmark on, with some of them exceeding 15, as obtained by multilingual training. 
Additionally, training on low-resource languages individually also provides competitive performance. To the best of our knowledge, XL-Sum is the largest abstractive summarization dataset in terms of the number of samples collected from a single source and the number of languages covered. We are releasing our dataset and models to encourage future research on multilingual abstractive summarization. The resources can be found at \\url{https://github.com/csebuetnlp/xl-sum}." - }, - "BeIR/beir": { - "pwc_id": "beir", - "dataset_name": "BEIR Dataset", - "dataset_abstract": "BEIR (Benchmarking IR) is an heterogeneous benchmark containing different information retrieval (IR) tasks. Through BEIR, it is possible to systematically study the zero-shot generalization capabilities of multiple neural retrieval approaches.\n\nThe benchmark contains a total of 9 information retrieval tasks (Fact Checking, Citation Prediction, Duplicate Question Retrieval, Argument Retrieval, News Retrieval, Question Answering, Tweet Retrieval, Biomedical IR, Entity Retrieval) from 17 different datasets:\n\n\nMS MARCO\nTREC-COVID\nNFCorpus\nBioASQ\nNatural Questions\nHotpotQA\nFiQA-2018\nSignal-1M\nTREC-News\nArguAna\nTouche 2020\nCQADupStack\nQuora Question Pairs\nDBPedia\nSciDocs\nFEVER\nClimate-FEVER\nSciFact", - "paper_name": "BEIR: A Heterogenous Benchmark for Zero-shot Evaluation of Information Retrieval Models", - "paper_abstract": "Existing neural information retrieval (IR) models have often been studied in homogeneous and narrow settings, which has considerably limited insights into their out-of-distribution (OOD) generalization capabilities. To address this, and to facilitate researchers to broadly evaluate the effectiveness of their models, we introduce Benchmarking-IR (BEIR), a robust and heterogeneous evaluation benchmark for information retrieval. We leverage a careful selection of 18 publicly available datasets from diverse text retrieval tasks and domains and evaluate 10 state-of-the-art retrieval systems including lexical, sparse, dense, late-interaction and re-ranking architectures on the BEIR benchmark. Our results show BM25 is a robust baseline and re-ranking and late-interaction-based models on average achieve the best zero-shot performances, however, at high computational costs. In contrast, dense and sparse-retrieval models are computationally more efficient but often underperform other approaches, highlighting the considerable room for improvement in their generalization capabilities. We hope this framework allows us to better evaluate and understand existing retrieval systems, and contributes to accelerating progress towards better robust and generalizable systems in the future. BEIR is publicly available at https://github.com/UKPLab/beir." - }, - "bertin-project/mc4-sampling": { - "pwc_id": "mc4", - "dataset_name": "mC4 Dataset", - "dataset_abstract": "mC4 is a multilingual variant of the C4 dataset called mC4. mC4 comprises natural text in 101 languages drawn from the public Common Crawl web scrape.", - "paper_name": "mT5: A massively multilingual pre-trained text-to-text transformer", - "paper_abstract": "The recent \"Text-to-Text Transfer Transformer\" (T5) leveraged a unified text-to-text format and scale to attain state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. 
We detail the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual benchmarks. We also describe a simple technique to prevent \"accidental translation\" in the zero-shot setting, where a generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model checkpoints used in this work are publicly available." - }, - "BeIR/beir-corpus": { - "pwc_id": "beir", - "dataset_name": "BEIR Dataset", - "dataset_abstract": "BEIR (Benchmarking IR) is an heterogeneous benchmark containing different information retrieval (IR) tasks. Through BEIR, it is possible to systematically study the zero-shot generalization capabilities of multiple neural retrieval approaches.\n\nThe benchmark contains a total of 9 information retrieval tasks (Fact Checking, Citation Prediction, Duplicate Question Retrieval, Argument Retrieval, News Retrieval, Question Answering, Tweet Retrieval, Biomedical IR, Entity Retrieval) from 17 different datasets:\n\n\nMS MARCO\nTREC-COVID\nNFCorpus\nBioASQ\nNatural Questions\nHotpotQA\nFiQA-2018\nSignal-1M\nTREC-News\nArguAna\nTouche 2020\nCQADupStack\nQuora Question Pairs\nDBPedia\nSciDocs\nFEVER\nClimate-FEVER\nSciFact", - "paper_name": "BEIR: A Heterogenous Benchmark for Zero-shot Evaluation of Information Retrieval Models", - "paper_abstract": "Existing neural information retrieval (IR) models have often been studied in homogeneous and narrow settings, which has considerably limited insights into their out-of-distribution (OOD) generalization capabilities. To address this, and to facilitate researchers to broadly evaluate the effectiveness of their models, we introduce Benchmarking-IR (BEIR), a robust and heterogeneous evaluation benchmark for information retrieval. We leverage a careful selection of 18 publicly available datasets from diverse text retrieval tasks and domains and evaluate 10 state-of-the-art retrieval systems including lexical, sparse, dense, late-interaction and re-ranking architectures on the BEIR benchmark. Our results show BM25 is a robust baseline and re-ranking and late-interaction-based models on average achieve the best zero-shot performances, however, at high computational costs. In contrast, dense and sparse-retrieval models are computationally more efficient but often underperform other approaches, highlighting the considerable room for improvement in their generalization capabilities. We hope this framework allows us to better evaluate and understand existing retrieval systems, and contributes to accelerating progress towards better robust and generalizable systems in the future. BEIR is publicly available at https://github.com/UKPLab/beir." - }, - "GEM/OrangeSum": { - "pwc_id": "orangesum", - "dataset_name": "OrangeSum Dataset", - "dataset_abstract": "OrangeSum is a single-document extreme summarization dataset with two tasks: title and abstract. 
Ground truth summaries are 11.42 and 32.12 words in length on average for the title and abstract tasks, respectively, while document sizes are 315 and 350 words.\n\nThe motivation for OrangeSum was to put together a French equivalent of the XSum dataset.\n\nUnlike the historical CNN, DailyMail, and NY Times datasets, OrangeSum requires the models to display a high degree of abstractivity to perform well.\nOrangeSum was created by scraping articles and their titles and abstracts from the Orange Actu website.\n\nScraped pages cover almost a decade from Feb 2011 to Sep 2020, and belong to five main categories: France, world, politics, automotive, and society.\nThe society category is itself divided into 8 subcategories: health, environment, people, culture, media, high-tech, unusual (\"insolite\" in French), and miscellaneous.\n\nThe dataset is publicly available at: https://github.com/Tixierae/OrangeSum.", - "paper_name": "BARThez: a Skilled Pretrained French Sequence-to-Sequence Model", - "paper_abstract": "Inductive transfer learning has taken the entire NLP field by storm, with models such as BERT and BART setting new state of the art on countless NLU tasks. However, most of the available models and research have been conducted for English. In this work, we introduce BARThez, the first large-scale pretrained seq2seq model for French. Being based on BART, BARThez is particularly well-suited for generative tasks. We evaluate BARThez on five discriminative tasks from the FLUE benchmark and two generative tasks from a novel summarization dataset, OrangeSum, that we created for this research. We show BARThez to be very competitive with state-of-the-art BERT-based French language models such as CamemBERT and FlauBERT. We also continue the pretraining of a multilingual BART on BARThez' corpus, and show our resulting model, mBARThez, to significantly boost BARThez' generative performance. Code, data and models are publicly available." - }, - "IlyaGusev/gazeta": { - "pwc_id": "gazeta", - "dataset_name": "Gazeta Dataset", - "dataset_abstract": "Gazeta is a dataset for automatic summarization of Russian news. The dataset consists of 63,435 text-summary pairs. To form training, validation, and test datasets, these pairs were sorted by time and the first 52,400 pairs are used as the training dataset, the following 5,265 pairs as the validation dataset, and the remaining 5,770 pairs as the test dataset.", - "paper_name": "Dataset for Automatic Summarization of Russian News", - "paper_abstract": "Automatic text summarization has been studied in a variety of domains and languages. However, this does not hold for the Russian language. To overcome this issue, we present Gazeta, the first dataset for summarization of Russian news. We describe the properties of this dataset and benchmark several extractive and abstractive models. We demonstrate that the dataset is a valid task for methods of text summarization for Russian. Additionally, we prove the pretrained mBART model to be useful for Russian text summarization." - }, - "facebook/multilingual_librispeech": { - "pwc_id": "multilingual-librispeech", - "dataset_name": "Multilingual LibriSpeech Dataset", - "dataset_abstract": "Multilingual LibriSpeech is a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages - English, German, Dutch, Spanish, French, Italian, Portuguese, Polish.
It includes about 44.5K hours of English and a total of about 6K hours for other languages.", - "paper_name": "MLS: A Large-Scale Multilingual Dataset for Speech Research", - "paper_abstract": "This paper introduces Multilingual LibriSpeech (MLS) dataset, a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages, including about 44.5K hours of English and a total of about 6K hours for other languages. Additionally, we provide Language Models (LM) and baseline Automatic Speech Recognition (ASR) models and for all the languages in our dataset. We believe such a large transcribed dataset will open new avenues in ASR and Text-To-Speech (TTS) research. The dataset will be made freely available for anyone at http://www.openslr.org." - }, - "GEM/indonlg": { - "pwc_id": "indonlg", - "dataset_name": "IndoNLG Dataset", - "dataset_abstract": "IndoNLG is a benchmark to measure natural language generation (NLG) progress in three low-resource\u2014yet widely spoken\u2014languages of Indonesia: Indonesian, Javanese, and Sundanese. Altogether, these languages are spoken by more than 100 million native speakers, and hence constitute an important use case of NLG systems today. Concretely, IndoNLG covers six tasks: summarization, question answering, chit-chat, and three different pairs of machine translation (MT) tasks.", - "paper_name": "IndoNLG: Benchmark and Resources for Evaluating Indonesian Natural Language Generation", - "paper_abstract": "Natural language generation (NLG) benchmarks provide an important avenue to measure progress and develop better NLG systems. Unfortunately, the lack of publicly available NLG benchmarks for low-resource languages poses a challenging barrier for building NLG systems that work well for languages with limited amounts of data. Here we introduce IndoNLG, the first benchmark to measure natural language generation (NLG) progress in three low-resource -- yet widely spoken -- languages of Indonesia: Indonesian, Javanese, and Sundanese. Altogether, these languages are spoken by more than 100 million native speakers, and hence constitute an important use case of NLG systems today. Concretely, IndoNLG covers six tasks: summarization, question answering, chit-chat, and three different pairs of machine translation (MT) tasks. We collate a clean pretraining corpus of Indonesian, Sundanese, and Javanese datasets, Indo4B-Plus, which is used to pretrain our models: IndoBART and IndoGPT. We show that IndoBART and IndoGPT achieve competitive performance on all tasks -- despite using only one-fifth the parameters of a larger multilingual model, mBART-LARGE (Liu et al., 2020). This finding emphasizes the importance of pretraining on closely related, local languages to achieve more efficient learning and faster inference for very low-resource languages like Javanese and Sundanese." - }, - "DDSC/dkhate": { - "pwc_id": "dkhate", - "dataset_name": "DKhate Dataset", - "dataset_abstract": "A corpus of Offensive Language and Hate Speech Detection for Danish\n\nThis DKhate dataset contains 3600 comments from the web annotated for offensive language, following the Zampieri et al. / OLID scheme.\n\nSubmissions and benchmarks for the OffensEval 2020 Danish track are also included.", - "paper_name": "Offensive Language and Hate Speech Detection for Danish", - "paper_abstract": "The presence of offensive language on social media platforms and the implications this poses is becoming a major concern in modern society. 
Given the enormous amount of content created every day, automatic methods are required to detect and deal with this type of content. Until now, most of the research has focused on solving the problem for the English language, while the problem is multilingual. We construct a Danish dataset containing user-generated comments from \\textit{Reddit} and \\textit{Facebook}. It contains user generated comments from various social media platforms, and to our knowledge, it is the first of its kind. Our dataset is annotated to capture various types and target of offensive language. We develop four automatic classification systems, each designed to work for both the English and the Danish language. In the detection of offensive language in English, the best performing system achieves a macro averaged F1-score of $0.74$, and the best performing system for Danish achieves a macro averaged F1-score of $0.70$. In the detection of whether or not an offensive post is targeted, the best performing system for English achieves a macro averaged F1-score of $0.62$, while the best performing system for Danish achieves a macro averaged F1-score of $0.73$. Finally, in the detection of the target type in a targeted offensive post, the best performing system for English achieves a macro averaged F1-score of $0.56$, and the best performing system for Danish achieves a macro averaged F1-score of $0.63$. Our work for both the English and the Danish language captures the type and targets of offensive language, and present automatic methods for detecting different kinds of offensive language such as hate speech and cyberbullying." - }, - "GEM/squad_v2": { - "pwc_id": "squad", - "dataset_name": "SQuAD Dataset", - "dataset_abstract": "The Stanford Question Answering Dataset (SQuAD) is a collection of question-answer pairs derived from Wikipedia articles. In SQuAD, the correct answers of questions can be any sequence of tokens in the given text. Because the questions and answers are produced by humans through crowdsourcing, it is more diverse than some other question-answering datasets. SQuAD 1.1 contains 107,785 question-answer pairs on 536 articles. SQuAD2.0 (open-domain SQuAD, SQuAD-Open), the latest version, combines the 100,000 questions in SQuAD1.1 with over 50,000 un-answerable questions written adversarially by crowdworkers in forms that are similar to the answerable ones.", - "paper_name": "SQuAD: 100,000+ Questions for Machine Comprehension of Text", - "paper_abstract": "We present the Stanford Question Answering Dataset (SQuAD), a new reading\ncomprehension dataset consisting of 100,000+ questions posed by crowdworkers on\na set of Wikipedia articles, where the answer to each question is a segment of\ntext from the corresponding reading passage. We analyze the dataset to\nunderstand the types of reasoning required to answer the questions, leaning\nheavily on dependency and constituency trees. We build a strong logistic\nregression model, which achieves an F1 score of 51.0%, a significant\nimprovement over a simple baseline (20%). 
However, human performance (86.8%) is\nmuch higher, indicating that the dataset presents a good challenge problem for\nfuture research.\n The dataset is freely available at https://stanford-qa.com" - }, - "dennlinger/klexikon": { - "pwc_id": "klexikon", - "dataset_name": "Klexikon Dataset", - "dataset_abstract": "The dataset introduces document alignments between German Wikipedia and the children's lexicon Klexikon.\nThe source texts in Wikipedia are both written in a more complex language than Klexikon, and also significantly longer, which makes this a suitable application for both summarization and simplification.\nIn fact, previous research has so far only focused on either of the two, but not comprehensively been studied as a joint task.", - "paper_name": "Klexikon: A German Dataset for Joint Summarization and Simplification", - "paper_abstract": "Traditionally, Text Simplification is treated as a monolingual translation task where sentences between source texts and their simplified counterparts are aligned for training. However, especially for longer input documents, summarizing the text (or dropping less relevant content altogether) plays an important role in the simplification process, which is currently not reflected in existing datasets. Simultaneously, resources for non-English languages are scarce in general and prohibitive for training new solutions. To tackle this problem, we pose core requirements for a system that can jointly summarize and simplify long source documents. We further describe the creation of a new dataset for joint Text Simplification and Summarization based on German Wikipedia and the German children's lexicon \"Klexikon\", consisting of almost 2900 documents. We release a document-aligned version that particularly highlights the summarization aspect, and provide statistical evidence that this resource is well suited to simplification as well. Code and data are available on Github: https://github.com/dennlinger/klexikon" - }, - "gsarti/flores_101": { - "pwc_id": "flores", - "dataset_name": "FLoRes Dataset", - "dataset_abstract": "FLoRes is a benchmark dataset for machine translation between English and four low resource languages, Nepali, Sinhala, Khmer and Pashto, based on sentences translated from Wikipedia.", - "paper_name": "The FLORES Evaluation Datasets for Low-Resource Machine Translation: Nepali--English and Sinhala--English", - "paper_abstract": "For machine translation, a vast majority of language pairs in the world are considered low-resource because they have little parallel data available. Besides the technical challenges of learning with limited supervision, it is difficult to evaluate methods trained on low-resource language pairs because of the lack of freely and publicly available benchmarks. In this work, we introduce the FLORES evaluation datasets for Nepali{--}English and Sinhala{--} English, based on sentences translated from Wikipedia. Compared to English, these are languages with very different morphology and syntax, for which little out-of-domain parallel data is available and for which relatively large amounts of monolingual data are freely available. We describe our process to collect and cross-check the quality of translations, and we report baseline performance using several learning settings: fully supervised, weakly supervised, semi-supervised, and fully unsupervised. 
Our experiments demonstrate that current state-of-the-art methods perform rather poorly on this benchmark, posing a challenge to the research community working on low-resource MT. Data and code to reproduce our experiments are available at https://github.com/facebookresearch/flores." - }, - "SoLID/shellcode_i_a32": { - "pwc_id": "shellcode-ia32", - "dataset_name": "Shellcode_IA32 Dataset", - "dataset_abstract": "Shellcode_IA32 is a dataset containing 20 years of shellcodes from a variety of sources; it is the largest collection of shellcodes in assembly available to date.\n\nThis dataset consists of 3,200 examples of instructions in assembly language for IA-32 (the 32-bit version of the x86 Intel Architecture) from publicly available security exploits. We collected assembly programs used to generate shellcode from exploit-db and from shell-storm. We enriched the dataset by adding examples of assembly programs for the IA-32 architecture from popular tutorials and books. This allowed us to understand how different authors and assembly experts comment and, thus, how to deal with the ambiguity of natural language in this specific context. Our dataset consists of 10% of instructions collected from books and guidelines, and the rest from real shellcodes.\n\nOur focus is on Linux, the most common OS for security-critical network services. Accordingly, we added assembly instructions written with Netwide Assembler (NASM) for Linux.\n\nEach line of the Shellcode_IA32 dataset represents a snippet-intent pair. The snippet is a line or a combination of multiple lines of assembly code, built by following the NASM syntax. The intent is a comment in the English language.\n\nFurther statistics on the dataset and a set of preliminary experiments performed with a neural machine translation (NMT) model are described in the following paper: Shellcode_IA32: A Dataset for Automatic Shellcode Generation.", - "paper_name": "Shellcode_IA32: A Dataset for Automatic Shellcode Generation", - "paper_abstract": "We take the first step to address the task of automatically generating shellcodes, i.e., small pieces of code used as a payload in the exploitation of a software vulnerability, starting from natural language comments. We assemble and release a novel dataset (Shellcode_IA32), consisting of challenging but common assembly instructions with their natural language descriptions. We experiment with standard methods in neural machine translation (NMT) to establish baseline performance levels on this task." - }, - "antoiloui/bsard": { - "pwc_id": "bsard", - "dataset_name": "BSARD Dataset", - "dataset_abstract": "The Belgian Statutory Article Retrieval Dataset (BSARD) is a French native corpus for studying statutory article retrieval. BSARD consists of more than 22,600 statutory articles from Belgian law and about 1,100 legal questions posed by Belgian citizens and labeled by experienced jurists with relevant articles from the corpus.", - "paper_name": "A Statutory Article Retrieval Dataset in French", - "paper_abstract": "Statutory article retrieval is the task of automatically retrieving law articles relevant to a legal question. While recent advances in natural language processing have sparked considerable interest in many legal tasks, statutory article retrieval remains primarily untouched due to the scarcity of large-scale and high-quality annotated datasets.
To address this bottleneck, we introduce the Belgian Statutory Article Retrieval Dataset (BSARD), which consists of 1,100+ French native legal questions labeled by experienced jurists with relevant articles from a corpus of 22,600+ Belgian law articles. Using BSARD, we benchmark several state-of-the-art retrieval approaches, including lexical and dense architectures, both in zero-shot and supervised setups. We find that fine-tuned dense retrieval models significantly outperform other systems. Our best performing baseline achieves 74.8% R@100, which is promising for the feasibility of the task and indicates there is still room for improvement. By the specificity of the domain and addressed task, BSARD presents a unique challenge problem for future research on legal information retrieval. Our dataset and source code are publicly available." - }, - "corypaik/prost": { - "pwc_id": "prost", - "dataset_name": "PROST Dataset", - "dataset_abstract": "The PROST (Physical Reasoning about Objects Through Space and Time) dataset contains 18,736 multiple-choice questions made from 14 manually curated templates, covering 10 physical reasoning concepts. All questions are designed to probe both causal and masked language models in a zero-shot setting.", - "paper_name": "PROST: Physical Reasoning of Objects through Space and Time", - "paper_abstract": "We present a new probing dataset named PROST: Physical Reasoning about Objects Through Space and Time. This dataset contains 18,736 multiple-choice questions made from 14 manually curated templates, covering 10 physical reasoning concepts. All questions are designed to probe both causal and masked language models in a zero-shot setting. We conduct an extensive analysis which demonstrates that state-of-the-art pretrained models are inadequate at physical reasoning: they are influenced by the order in which answer options are presented to them, they struggle when the superlative in a question is inverted (e.g., most <-> least), and increasing the amount of pretraining data and parameters only yields minimal improvements. These results provide support for the hypothesis that current pretrained models' ability to reason about physical interactions is inherently limited by a lack of real world experience. By highlighting these limitations, we hope to motivate the development of models with a human-like understanding of the physical world." - }, - "dfki-nlp/few-nerd": { - "pwc_id": "few-nerd", - "dataset_name": "Few-NERD Dataset", - "dataset_abstract": "Few-NERD is a large-scale, fine-grained manually annotated named entity recognition dataset, which contains 8 coarse-grained types, 66 fine-grained types, 188,200 sentences, 491,711 entities, and 4,601,223 tokens. Three benchmark tasks are built, one is supervised (Few-NERD (SUP)) and the other two are few-shot (Few-NERD (INTRA) and Few-NERD (INTER)).", - "paper_name": "Few-NERD: A Few-Shot Named Entity Recognition Dataset", - "paper_abstract": "Recently, considerable literature has grown up around the theme of few-shot named entity recognition (NER), but little published benchmark data specifically focused on the practical and challenging task. Current approaches collect existing supervised NER datasets and re-organize them to the few-shot setting for empirical study. These strategies conventionally aim to recognize coarse-grained entity types with few examples, while in practice, most unseen entity types are fine-grained. 
In this paper, we present Few-NERD, a large-scale human-annotated few-shot NER dataset with a hierarchy of 8 coarse-grained and 66 fine-grained entity types. Few-NERD consists of 188,238 sentences from Wikipedia, 4,601,160 words are included and each is annotated as context or a part of a two-level entity type. To the best of our knowledge, this is the first few-shot NER dataset and the largest human-crafted NER dataset. We construct benchmark tasks with different emphases to comprehensively assess the generalization capability of models. Extensive empirical results and analysis show that Few-NERD is challenging and the problem requires further research. We make Few-NERD public at https://ningding97.github.io/fewnerd/." - }, - "gsarti/clean_mc4_it": { - "pwc_id": "mc4", - "dataset_name": "mC4 Dataset", - "dataset_abstract": "mC4 is a multilingual variant of the C4 dataset called mC4. mC4 comprises natural text in 101 languages drawn from the public Common Crawl web scrape.", - "paper_name": "mT5: A massively multilingual pre-trained text-to-text transformer", - "paper_abstract": "The recent \"Text-to-Text Transfer Transformer\" (T5) leveraged a unified text-to-text format and scale to attain state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual benchmarks. We also describe a simple technique to prevent \"accidental translation\" in the zero-shot setting, where a generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model checkpoints used in this work are publicly available." - }, - "lara-martin/Scifi_TV_Shows": { - "pwc_id": "scifi-tv-plots", - "dataset_name": "Scifi TV Shows Dataset", - "dataset_abstract": "A collection of long-running (80+ episodes) science fiction TV show synopses, scraped from Fandom.com wikis. Collected Nov 2017. Each episode is considered a \"story\".\n\nContains plot summaries from :\n\n\nBabylon 5 (https://babylon5.fandom.com/wiki/Main_Page) - 84 stories\nDoctor Who (https://tardis.fandom.com/wiki/Doctor_Who_Wiki) - 311 stories\nDoctor Who spin-offs - 95 stories\nFarscape (https://farscape.fandom.com/wiki/Farscape_Encyclopedia_Project:Main_Page) - 90 stories\nFringe (https://fringe.fandom.com/wiki/FringeWiki) - 87 stories\nFuturama (https://futurama.fandom.com/wiki/Futurama_Wiki) - 87 stories\nStargate (https://stargate.fandom.com/wiki/Stargate_Wiki) - 351 stories\nStar Trek (https://memory-alpha.fandom.com/wiki/Star_Trek) - 701 stories\nStar Wars books (https://starwars.fandom.com/wiki/Main_Page) - 205 stories\nStar Wars Rebels - 65 stories\nX-Files (https://x-files.fandom.com/wiki/Main_Page) - 200 stories\n\nTotal: 2276 stories\n\nDataset is \"eventified\" and generalized (see LJ Martin, P Ammanabrolu, X Wang, W Hancock, S Singh, B Harrison, and MO Riedl. Event Representations for Automated Story Generation with Deep Neural Nets, Thirty-Second AAAI Conference on Artificial Intelligence (AAAI), 2018. for details on these processes.) 
and split into train-test-validation sets for converting events into full sentences.", - "paper_name": "Story Realization: Expanding Plot Events into Sentences", - "paper_abstract": "Neural network based approaches to automated story plot generation attempt to learn how to generate novel plots from a corpus of natural language plot summaries. Prior work has shown that a semantic abstraction of sentences called events improves neural plot generation and and allows one to decompose the problem into: (1) the generation of a sequence of events (event-to-event) and (2) the transformation of these events into natural language sentences (event-to-sentence). However, typical neural language generation approaches to event-to-sentence can ignore the event details and produce grammatically-correct but semantically-unrelated sentences. We present an ensemble-based model that generates natural language guided by events.We provide results---including a human subjects study---for a full end-to-end automated story generation system showing that our method generates more coherent and plausible stories than baseline approaches." - }, - "allenai/scico": { - "pwc_id": "scico", - "dataset_name": "SciCo Dataset", - "dataset_abstract": "SciCo is an expert-annotated dataset for hierarchical CDCR (cross-document coreference resolution) for concepts in scientific papers, with the goal of jointly inferring coreference clusters and hierarchy between them.", - "paper_name": "SciCo: Hierarchical Cross-Document Coreference for Scientific Concepts", - "paper_abstract": "Determining coreference of concept mentions across multiple documents is a fundamental task in natural language understanding. Previous work on cross-document coreference resolution (CDCR) typically considers mentions of events in the news, which seldom involve abstract technical concepts that are prevalent in science and technology. These complex concepts take diverse or ambiguous forms and have many hierarchical levels of granularity (e.g., tasks and subtasks), posing challenges for CDCR. We present a new task of Hierarchical CDCR (H-CDCR) with the goal of jointly inferring coreference clusters and hierarchy between them. We create SciCo, an expert-annotated dataset for H-CDCR in scientific papers, 3X larger than the prominent ECB+ resource. We study strong baseline models that we customize for H-CDCR, and highlight challenges for future work." - }, - "corypaik/coda": { - "pwc_id": "coda", - "dataset_name": "CoDa Dataset", - "dataset_abstract": "The Color Dataset (CoDa) is a probing dataset to evaluate the representation of visual properties in language models. CoDa consists of color distributions for 521 common objects, which are split into 3 groups: Single, Multi, and Any. \n\nThe default configuration of CoDa uses 10 CLIP-style templates (e.g. \"A photo of a [object]\"), and 10 cloze-style templates (e.g. \"Everyone knows most [object] are [color].\"", - "paper_name": "The World of an Octopus: How Reporting Bias Influences a Language Model's Perception of Color", - "paper_abstract": "Recent work has raised concerns about the inherent limitations of text-only pretraining. In this paper, we first demonstrate that reporting bias, the tendency of people to not state the obvious, is one of the causes of this limitation, and then investigate to what extent multimodal training can mitigate this issue. 
To accomplish this, we 1) generate the Color Dataset (CoDa), a dataset of human-perceived color distributions for 521 common objects; 2) use CoDa to analyze and compare the color distribution found in text, the distribution captured by language models, and a human's perception of color; and 3) investigate the performance differences between text-only and multimodal models on CoDa. Our results show that the distribution of colors that a language model recovers correlates more strongly with the inaccurate distribution found in text than with the ground-truth, supporting the claim that reporting bias negatively impacts and inherently limits text-only training. We then demonstrate that multimodal models can leverage their visual training to mitigate these effects, providing a promising avenue for future research." - }, - "dfki-nlp/mobie": { - "pwc_id": "mobie", - "dataset_name": "MobIE Dataset", - "dataset_abstract": "MobIE is a German-language dataset which is human-annotated with 20 coarse- and fine-grained entity types and entity linking information for geographically linkable entities. The dataset consists of 3,232 social media texts and traffic reports with 91K tokens, and contains 20.5K annotated entities, 13.1K of which are linked to a knowledge base. A subset of the dataset is human-annotated with seven mobility-related, n-ary relation types, while the remaining documents are annotated using a weakly-supervised labeling approach implemented with the Snorkel framework.\n\nThe dataset can be used for NER (Named entity recognition), EL (entity linking) and RE (relation extraction), and thus can be used for joint and multi-task learning of these fundamental information extraction tasks.", - "paper_name": "MobIE: A German Dataset for Named Entity Recognition, Entity Linking and Relation Extraction in the Mobility Domain", - "paper_abstract": "We present MobIE, a German-language dataset, which is human-annotated with 20 coarse- and fine-grained entity types and entity linking information for geographically linkable entities. The dataset consists of 3,232 social media texts and traffic reports with 91K tokens, and contains 20.5K annotated entities, 13.1K of which are linked to a knowledge base. A subset of the dataset is human-annotated with seven mobility-related, n-ary relation types, while the remaining documents are annotated using a weakly-supervised labeling approach implemented with the Snorkel framework. To the best of our knowledge, this is the first German-language dataset that combines annotations for NER, EL and RE, and thus can be used for joint and multi-task learning of these fundamental information extraction tasks. We make MobIE public at https://github.com/dfki-nlp/mobie." - }, - "debatelab/aaac": { - "pwc_id": "aaac", - "dataset_name": "AAAC Dataset", - "dataset_abstract": "DeepA2 is a modular framework for deep argument analysis. DeepA2 datasets contain comprehensive logical reconstructions of informally presented arguments in short argumentative texts. This item references two two synthetic DeepA2 datasets for artificial argument analysis: AAAC01 and AAAC02.", - "paper_name": "DeepA2: A Modular Framework for Deep Argument Analysis with Pretrained Neural Text2Text Language Models", - "paper_abstract": "In this paper, we present and implement a multi-dimensional, modular framework for performing deep argument analysis (DeepA2) using current pre-trained language models (PTLMs). ArgumentAnalyst -- a T5 model (Raffel et al. 
2020) set up and trained within DeepA2 -- reconstructs argumentative texts, which advance an informal argumentation, as valid arguments: It inserts, e.g., missing premises and conclusions, formalizes inferences, and coherently links the logical reconstruction to the source text. We create a synthetic corpus for deep argument analysis, and evaluate ArgumentAnalyst on this new dataset as well as on existing data, specifically EntailmentBank (Dalvi et al. 2021). Our empirical findings vindicate the overall framework and highlight the advantages of a modular design, in particular its ability to emulate established heuristics (such as hermeneutic cycles), to explore the model's uncertainty, to cope with the plurality of correct solutions (underdetermination), and to exploit higher-order evidence." - }, - "lewtun/autoevaluate__conll2003": { - "pwc_id": "conll-2003", - "dataset_name": "CoNLL-2003 Dataset", - "dataset_abstract": "CoNLL-2003 is a named entity recognition dataset released as a part of CoNLL-2003 shared task: language-independent named entity recognition.\nThe data consists of eight files covering two languages: English and German.\nFor each of the languages there is a training file, a development file, a test file and a large file with unannotated data.\n\nThe English data was taken from the Reuters Corpus. This corpus consists of Reuters news stories between August 1996 and August 1997.\nFor the training and development set, ten days worth of data were taken from the files representing the end of August 1996.\nFor the test set, the texts were from December 1996. The preprocessed raw data covers the month of September 1996.\n\nThe text for the German data was taken from the ECI Multilingual Text Corpus. This corpus consists of texts in many languages. The portion of data that\nwas used for this task, was extracted from the German newspaper Frankfurter Rundshau. All three of the training, development and test sets were taken\nfrom articles written in one week at the end of August 1992.\nThe raw data were taken from the months of September to December 1992.\n\n| English data | Articles | Sentences | Tokens | LOC | MISC | ORG | PER |\n|-------------------|----------|-----------|---------|------|------|------|------|\n| Training set | 946 | 14,987 | 203,621 | 7140 | 3438 | 6321 | 6600 |\n| Development set | 216 | 3,466 | 51,362 | 1837 | 922 | 1341 | 1842 |\n| Test set | 231 | 3,684 | 46,435 | 1668 | 702 | 1661 | 1617 |\n\nNumber of articles, sentences, tokens and entities (locations, miscellaneous, organizations, and persons) in English data files.\n\n| German data | Articles | Sentences | Tokens | LOC | MISC | ORG | PER |\n|-------------------|----------|-----------|---------|------|------|------|------|\n| Training set | 553 | 12,705 | 206,931 | 4363 | 2288 | 2427 | 2773 |\n| Development set | 201 | 3,068 | 51,444 | 1181 | 1010 | 1241 | 1401 |\n| Test set | 155 | 3,160 | 51,943 | 1035 | 670 | 773 | 1195 |\n\nNumber of articles, sentences, tokens and entities (locations, miscellaneous, organizations, and persons) in German data files.", - "paper_name": "Introduction to the CoNLL-2003 Shared Task: Language-Independent Named Entity Recognition", - "paper_abstract": "We describe the CoNLL-2003 shared task: language-independent named entity recognition. We give background information on the data sets (English and German) and the evaluation method, present a general overview of the systems that have taken part in the task and discuss their performance." 
- }, - "cfilt/HiNER-collapsed": { - "pwc_id": "hiner-collapsed-1", - "dataset_name": "HiNER-collapsed Dataset", - "dataset_abstract": "This dataset releases a significantly sized standard-abiding Hindi NER dataset containing 109,146 sentences and 2,220,856 tokens, annotated with 3 collapsed tags (PER, LOC, ORG).", - "paper_name": "HiNER: A Large Hindi Named Entity Recognition Dataset", - "paper_abstract": "Named Entity Recognition (NER) is a foundational NLP task that aims to provide class labels like Person, Location, Organisation, Time, and Number to words in free text. Named Entities can also be multi-word expressions where the additional I-O-B annotation information helps label them during the NER annotation process. While English and European languages have considerable annotated data for the NER task, Indian languages lack on that front -- both in terms of quantity and following annotation standards. This paper releases a significantly sized standard-abiding Hindi NER dataset containing 109,146 sentences and 2,220,856 tokens, annotated with 11 tags. We discuss the dataset statistics in all their essential detail and provide an in-depth analysis of the NER tag-set used with our data. The statistics of tag-set in our dataset show a healthy per-tag distribution, especially for prominent classes like Person, Location and Organisation. Since the proof of resource-effectiveness is in building models with the resource and testing the model on benchmark data and against the leader-board entries in shared tasks, we do the same with the aforesaid data. We use different language models to perform the sequence labelling task for NER and show the efficacy of our data by performing a comparative evaluation with models trained on another dataset available for the Hindi NER task. Our dataset helps achieve a weighted F1 score of 88.78 with all the tags and 92.22 when we collapse the tag-set, as discussed in the paper. To the best of our knowledge, no available dataset meets the standards of volume (amount) and variability (diversity), as far as Hindi NER is concerned. We fill this gap through this work, which we hope will significantly help NLP for Hindi. We release this dataset with our code and models at https://github.com/cfiltnlp/HiNER" - }, - "mozilla-foundation/common_voice_8_0": { - "pwc_id": "common-voice", - "dataset_name": "Common Voice Dataset", - "dataset_abstract": "Common Voice is an audio dataset that consists of a unique MP3 and corresponding text file. There are 9,283 recorded hours in the dataset. The dataset also includes demographic metadata like age, sex, and accent. The dataset consists of 7,335 validated hours in 60 languages.", - "paper_name": "Common Voice: A Massively-Multilingual Speech Corpus", - "paper_abstract": "The Common Voice corpus is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Common Voice is designed for Automatic Speech Recognition purposes but can be useful in other domains (e.g. language identification). To achieve scale and sustainability, the Common Voice project employs crowdsourcing for both data collection and data validation. The most recent release includes 29 languages, and as of November 2019 there are a total of 38 languages collecting data. Over 50,000 individuals have participated so far, resulting in 2,500 hours of collected audio. 
To our knowledge this is the largest audio corpus in the public domain for speech recognition, both in terms of number of hours and number of languages. As an example use case for Common Voice, we present speech recognition experiments using Mozilla's DeepSpeech Speech-to-Text toolkit. By applying transfer learning from a source English model, we find an average Character Error Rate improvement of 5.99 +/- 5.48 for twelve target languages (German, French, Italian, Turkish, Catalan, Slovenian, Welsh, Irish, Breton, Tatar, Chuvash, and Kabyle). For most of these languages, these are the first ever published results on end-to-end Automatic Speech Recognition." - }, - "mozilla-foundation/common_voice_7_0": { - "pwc_id": "common-voice", - "dataset_name": "Common Voice Dataset", - "dataset_abstract": "Common Voice is an audio dataset that consists of a unique MP3 and corresponding text file. There are 9,283 recorded hours in the dataset. The dataset also includes demographic metadata like age, sex, and accent. The dataset consists of 7,335 validated hours in 60 languages.", - "paper_name": "Common Voice: A Massively-Multilingual Speech Corpus", - "paper_abstract": "The Common Voice corpus is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Common Voice is designed for Automatic Speech Recognition purposes but can be useful in other domains (e.g. language identification). To achieve scale and sustainability, the Common Voice project employs crowdsourcing for both data collection and data validation. The most recent release includes 29 languages, and as of November 2019 there are a total of 38 languages collecting data. Over 50,000 individuals have participated so far, resulting in 2,500 hours of collected audio. To our knowledge this is the largest audio corpus in the public domain for speech recognition, both in terms of number of hours and number of languages. As an example use case for Common Voice, we present speech recognition experiments using Mozilla's DeepSpeech Speech-to-Text toolkit. By applying transfer learning from a source English model, we find an average Character Error Rate improvement of 5.99 +/- 5.48 for twelve target languages (German, French, Italian, Turkish, Catalan, Slovenian, Welsh, Irish, Breton, Tatar, Chuvash, and Kabyle). For most of these languages, these are the first ever published results on end-to-end Automatic Speech Recognition." - }, - "shibing624/nli_zh": { - "pwc_id": "snli", - "dataset_name": "SNLI Dataset", - "dataset_abstract": "The SNLI dataset (Stanford Natural Language Inference) consists of 570k sentence-pairs manually labeled as entailment, contradiction, and neutral. Premises are image captions from Flickr30k, while hypotheses were generated by crowd-sourced annotators who were shown a premise and asked to generate entailing, contradicting, and neutral sentences. Annotators were instructed to judge the relation between sentences given that they describe the same event. 
Each pair is labeled as \u201centailment\u201d, \u201cneutral\u201d, \u201ccontradiction\u201d or \u201c-\u201d, where \u201c-\u201d indicates that an agreement could not be reached.", - "paper_name": "A large annotated corpus for learning natural language inference", - "paper_abstract": "Understanding entailment and contradiction is fundamental to understanding\nnatural language, and inference about entailment and contradiction is a\nvaluable testing ground for the development of semantic representations.\nHowever, machine learning research in this area has been dramatically limited\nby the lack of large-scale resources. To address this, we introduce the\nStanford Natural Language Inference corpus, a new, freely available collection\nof labeled sentence pairs, written by humans doing a novel grounded task based\non image captioning. At 570K pairs, it is two orders of magnitude larger than\nall other resources of its type. This increase in scale allows lexicalized\nclassifiers to outperform some sophisticated existing entailment models, and it\nallows a neural network-based model to perform competitively on natural\nlanguage inference benchmarks for the first time." - }, - "copenlu/fever_gold_evidence": { - "pwc_id": "fever", - "dataset_name": "FEVER Dataset", - "dataset_abstract": "FEVER is a publicly available dataset for fact extraction and verification against textual sources.\n\nIt consists of 185,445 claims manually verified against the introductory sections of Wikipedia pages and classified as SUPPORTED, REFUTED or NOTENOUGHINFO. For the first two classes, systems and annotators need to also return the combination of sentences forming the necessary evidence supporting or refuting the claim.\n\nThe claims were generated by human annotators extracting claims from Wikipedia and mutating them in a variety of ways, some of which were meaning-altering. The verification of each claim was conducted in a separate annotation process by annotators who were aware of the page but not the sentence from which original claim was\nextracted and thus in 31.75% of the claims more than one sentence was considered appropriate evidence. Claims require composition of evidence from multiple sentences in 16.82% of cases. Furthermore, in 12.15% of the claims, this evidence was taken from multiple pages.", - "paper_name": "FEVER: a large-scale dataset for Fact Extraction and VERification", - "paper_abstract": "In this paper we introduce a new publicly available dataset for verification\nagainst textual sources, FEVER: Fact Extraction and VERification. It consists\nof 185,445 claims generated by altering sentences extracted from Wikipedia and\nsubsequently verified without knowledge of the sentence they were derived from.\nThe claims are classified as Supported, Refuted or NotEnoughInfo by annotators\nachieving 0.6841 in Fleiss $\\kappa$. For the first two classes, the annotators\nalso recorded the sentence(s) forming the necessary evidence for their\njudgment. To characterize the challenge of the dataset presented, we develop a\npipeline approach and compare it to suitably designed oracles. The best\naccuracy we achieve on labeling a claim accompanied by the correct evidence is\n31.87%, while if we ignore the evidence we achieve 50.91%. Thus we believe that\nFEVER is a challenging testbed that will help stimulate progress on claim\nverification against textual sources." 
- }, - "merty/nateraw-food101-copy": { - "pwc_id": "food-101", - "dataset_name": "Food-101 Dataset", - "dataset_abstract": "The Food-101 dataset consists of 101 food categories with 750 training and 250 test images per category, making a total of 101k images. The labels for the test images have been manually cleaned, while the training set contains some noise.", - "paper_name": "", - "paper_abstract": "" - }, - "mozilla-foundation/common_voice_2_0": { - "pwc_id": "common-voice", - "dataset_name": "Common Voice Dataset", - "dataset_abstract": "Common Voice is an audio dataset that consists of a unique MP3 and corresponding text file. There are 9,283 recorded hours in the dataset. The dataset also includes demographic metadata like age, sex, and accent. The dataset consists of 7,335 validated hours in 60 languages.", - "paper_name": "Common Voice: A Massively-Multilingual Speech Corpus", - "paper_abstract": "The Common Voice corpus is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Common Voice is designed for Automatic Speech Recognition purposes but can be useful in other domains (e.g. language identification). To achieve scale and sustainability, the Common Voice project employs crowdsourcing for both data collection and data validation. The most recent release includes 29 languages, and as of November 2019 there are a total of 38 languages collecting data. Over 50,000 individuals have participated so far, resulting in 2,500 hours of collected audio. To our knowledge this is the largest audio corpus in the public domain for speech recognition, both in terms of number of hours and number of languages. As an example use case for Common Voice, we present speech recognition experiments using Mozilla's DeepSpeech Speech-to-Text toolkit. By applying transfer learning from a source English model, we find an average Character Error Rate improvement of 5.99 +/- 5.48 for twelve target languages (German, French, Italian, Turkish, Catalan, Slovenian, Welsh, Irish, Breton, Tatar, Chuvash, and Kabyle). For most of these languages, these are the first ever published results on end-to-end Automatic Speech Recognition." - }, - "mozilla-foundation/common_voice_4_0": { - "pwc_id": "common-voice", - "dataset_name": "Common Voice Dataset", - "dataset_abstract": "Common Voice is an audio dataset that consists of a unique MP3 and corresponding text file. There are 9,283 recorded hours in the dataset. The dataset also includes demographic metadata like age, sex, and accent. The dataset consists of 7,335 validated hours in 60 languages.", - "paper_name": "Common Voice: A Massively-Multilingual Speech Corpus", - "paper_abstract": "The Common Voice corpus is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Common Voice is designed for Automatic Speech Recognition purposes but can be useful in other domains (e.g. language identification). To achieve scale and sustainability, the Common Voice project employs crowdsourcing for both data collection and data validation. The most recent release includes 29 languages, and as of November 2019 there are a total of 38 languages collecting data. Over 50,000 individuals have participated so far, resulting in 2,500 hours of collected audio. To our knowledge this is the largest audio corpus in the public domain for speech recognition, both in terms of number of hours and number of languages. 
As an example use case for Common Voice, we present speech recognition experiments using Mozilla's DeepSpeech Speech-to-Text toolkit. By applying transfer learning from a source English model, we find an average Character Error Rate improvement of 5.99 +/- 5.48 for twelve target languages (German, French, Italian, Turkish, Catalan, Slovenian, Welsh, Irish, Breton, Tatar, Chuvash, and Kabyle). For most of these languages, these are the first ever published results on end-to-end Automatic Speech Recognition." - }, - "mozilla-foundation/common_voice_5_0": { - "pwc_id": "common-voice", - "dataset_name": "Common Voice Dataset", - "dataset_abstract": "Common Voice is an audio dataset that consists of a unique MP3 and corresponding text file. There are 9,283 recorded hours in the dataset. The dataset also includes demographic metadata like age, sex, and accent. The dataset consists of 7,335 validated hours in 60 languages.", - "paper_name": "Common Voice: A Massively-Multilingual Speech Corpus", - "paper_abstract": "The Common Voice corpus is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Common Voice is designed for Automatic Speech Recognition purposes but can be useful in other domains (e.g. language identification). To achieve scale and sustainability, the Common Voice project employs crowdsourcing for both data collection and data validation. The most recent release includes 29 languages, and as of November 2019 there are a total of 38 languages collecting data. Over 50,000 individuals have participated so far, resulting in 2,500 hours of collected audio. To our knowledge this is the largest audio corpus in the public domain for speech recognition, both in terms of number of hours and number of languages. As an example use case for Common Voice, we present speech recognition experiments using Mozilla's DeepSpeech Speech-to-Text toolkit. By applying transfer learning from a source English model, we find an average Character Error Rate improvement of 5.99 +/- 5.48 for twelve target languages (German, French, Italian, Turkish, Catalan, Slovenian, Welsh, Irish, Breton, Tatar, Chuvash, and Kabyle). For most of these languages, these are the first ever published results on end-to-end Automatic Speech Recognition." - }, - "mozilla-foundation/common_voice_5_1": { - "pwc_id": "common-voice", - "dataset_name": "Common Voice Dataset", - "dataset_abstract": "Common Voice is an audio dataset that consists of a unique MP3 and corresponding text file. There are 9,283 recorded hours in the dataset. The dataset also includes demographic metadata like age, sex, and accent. The dataset consists of 7,335 validated hours in 60 languages.", - "paper_name": "Common Voice: A Massively-Multilingual Speech Corpus", - "paper_abstract": "The Common Voice corpus is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Common Voice is designed for Automatic Speech Recognition purposes but can be useful in other domains (e.g. language identification). To achieve scale and sustainability, the Common Voice project employs crowdsourcing for both data collection and data validation. The most recent release includes 29 languages, and as of November 2019 there are a total of 38 languages collecting data. Over 50,000 individuals have participated so far, resulting in 2,500 hours of collected audio. 
To our knowledge this is the largest audio corpus in the public domain for speech recognition, both in terms of number of hours and number of languages. As an example use case for Common Voice, we present speech recognition experiments using Mozilla's DeepSpeech Speech-to-Text toolkit. By applying transfer learning from a source English model, we find an average Character Error Rate improvement of 5.99 +/- 5.48 for twelve target languages (German, French, Italian, Turkish, Catalan, Slovenian, Welsh, Irish, Breton, Tatar, Chuvash, and Kabyle). For most of these languages, these are the first ever published results on end-to-end Automatic Speech Recognition." - }, - "mozilla-foundation/common_voice_6_1": { - "pwc_id": "common-voice", - "dataset_name": "Common Voice Dataset", - "dataset_abstract": "Common Voice is an audio dataset that consists of a unique MP3 and corresponding text file. There are 9,283 recorded hours in the dataset. The dataset also includes demographic metadata like age, sex, and accent. The dataset consists of 7,335 validated hours in 60 languages.", - "paper_name": "Common Voice: A Massively-Multilingual Speech Corpus", - "paper_abstract": "The Common Voice corpus is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Common Voice is designed for Automatic Speech Recognition purposes but can be useful in other domains (e.g. language identification). To achieve scale and sustainability, the Common Voice project employs crowdsourcing for both data collection and data validation. The most recent release includes 29 languages, and as of November 2019 there are a total of 38 languages collecting data. Over 50,000 individuals have participated so far, resulting in 2,500 hours of collected audio. To our knowledge this is the largest audio corpus in the public domain for speech recognition, both in terms of number of hours and number of languages. As an example use case for Common Voice, we present speech recognition experiments using Mozilla's DeepSpeech Speech-to-Text toolkit. By applying transfer learning from a source English model, we find an average Character Error Rate improvement of 5.99 +/- 5.48 for twelve target languages (German, French, Italian, Turkish, Catalan, Slovenian, Welsh, Irish, Breton, Tatar, Chuvash, and Kabyle). For most of these languages, these are the first ever published results on end-to-end Automatic Speech Recognition." - }, - "nateraw/food101": { - "pwc_id": "food-101", - "dataset_name": "Food-101 Dataset", - "dataset_abstract": "The Food-101 dataset consists of 101 food categories with 750 training and 250 test images per category, making a total of 101k images. The labels for the test images have been manually cleaned, while the training set contains some noise.", - "paper_name": "", - "paper_abstract": "" - }, - "nielsr/FUNSD_layoutlmv2": { - "pwc_id": "funsd", - "dataset_name": "FUNSD Dataset", - "dataset_abstract": "Form Understanding in Noisy Scanned Documents (FUNSD) comprises 199 real, fully annotated, scanned forms. The documents are noisy and vary widely in appearance, making form understanding (FoUn) a challenging task. 
The proposed dataset can be used for various tasks, including text detection, optical character recognition, spatial layout analysis, and entity labeling/linking.", - "paper_name": "FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents", - "paper_abstract": "We present a new dataset for form understanding in noisy scanned documents (FUNSD) that aims at extracting and structuring the textual content of forms. The dataset comprises 199 real, fully annotated, scanned forms. The documents are noisy and vary widely in appearance, making form understanding (FoUn) a challenging task. The proposed dataset can be used for various tasks, including text detection, optical character recognition, spatial layout analysis, and entity labeling/linking. To the best of our knowledge, this is the first publicly available dataset with comprehensive annotations to address FoUn task. We also present a set of baselines and introduce metrics to evaluate performance on the FUNSD dataset, which can be downloaded at https://guillaumejaume.github.io/FUNSD/." - }, - "oscar-corpus/OSCAR-2109": { - "pwc_id": "oscar", - "dataset_name": "OSCAR Dataset", - "dataset_abstract": "OSCAR or Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture. The dataset used for training multilingual models such as BART incorporates 138 GB of text.", - "paper_name": "A Monolingual Approach to Contextualized Word Embeddings for Mid-Resource Languages", - "paper_abstract": "We use the multilingual OSCAR corpus, extracted from Common Crawl via language classification, filtering and cleaning, to train monolingual contextualized word embeddings (ELMo) for five mid-resource languages. We then compare the performance of OSCAR-based and Wikipedia-based ELMo embeddings for these languages on the part-of-speech tagging and parsing tasks. We show that, despite the noise in the Common-Crawl-based OSCAR data, embeddings trained on OSCAR perform much better than monolingual embeddings trained on Wikipedia. They actually equal or improve the current state of the art in tagging and parsing for all five languages. In particular, they also improve over multilingual Wikipedia-based contextual embeddings (multilingual BERT), which almost always constitutes the previous state of the art, thereby showing that the benefit of a larger, more diverse corpus surpasses the cross-lingual benefit of multilingual embedding architectures." - }, - "qwant/squad_fr": { - "pwc_id": "squad", - "dataset_name": "SQuAD Dataset", - "dataset_abstract": "The Stanford Question Answering Dataset (SQuAD) is a collection of question-answer pairs derived from Wikipedia articles. In SQuAD, the correct answers of questions can be any sequence of tokens in the given text. Because the questions and answers are produced by humans through crowdsourcing, it is more diverse than some other question-answering datasets. SQuAD 1.1 contains 107,785 question-answer pairs on 536 articles. 
SQuAD2.0 (open-domain SQuAD, SQuAD-Open), the latest version, combines the 100,000 questions in SQuAD1.1 with over 50,000 un-answerable questions written adversarially by crowdworkers in forms that are similar to the answerable ones.", - "paper_name": "SQuAD: 100,000+ Questions for Machine Comprehension of Text", - "paper_abstract": "We present the Stanford Question Answering Dataset (SQuAD), a new reading\ncomprehension dataset consisting of 100,000+ questions posed by crowdworkers on\na set of Wikipedia articles, where the answer to each question is a segment of\ntext from the corresponding reading passage. We analyze the dataset to\nunderstand the types of reasoning required to answer the questions, leaning\nheavily on dependency and constituency trees. We build a strong logistic\nregression model, which achieves an F1 score of 51.0%, a significant\nimprovement over a simple baseline (20%). However, human performance (86.8%) is\nmuch higher, indicating that the dataset presents a good challenge problem for\nfuture research.\n The dataset is freely available at https://stanford-qa.com" - }, - "sagnikrayc/mctest": { - "pwc_id": "mctest", - "dataset_name": "MCTest Dataset", - "dataset_abstract": "MCTest is a freely available set of stories and associated questions intended for research on the machine comprehension of text. \n\nMCTest requires machines to answer multiple-choice reading comprehension questions about fictional stories, directly tackling the high-level goal of open-domain machine comprehension.", - "paper_name": "", - "paper_abstract": "" - }, - "sagnikrayc/quasar": { - "pwc_id": "quasar-1", - "dataset_name": "QUASAR Dataset", - "dataset_abstract": "The Question Answering by Search And Reading (QUASAR) is a large-scale dataset consisting of QUASAR-S and QUASAR-T. Each of these datasets is built to focus on evaluating systems devised to understand a natural language query, a large corpus of texts and to extract an answer to the question from the corpus. Specifically, QUASAR-S comprises 37,012 fill-in-the-gaps questions that are collected from the popular website Stack Overflow using entity tags. The QUASAR-T dataset contains 43,012 open-domain questions collected from various internet sources. The candidate documents for each question in this dataset are retrieved from an Apache Lucene based search engine built on top of the ClueWeb09 dataset.", - "paper_name": "Quasar: Datasets for Question Answering by Search and Reading", - "paper_abstract": "We present two new large-scale datasets aimed at evaluating systems designed\nto comprehend a natural language query and extract its answer from a large\ncorpus of text. The Quasar-S dataset consists of 37000 cloze-style\n(fill-in-the-gap) queries constructed from definitions of software entity tags\non the popular website Stack Overflow. The posts and comments on the website\nserve as the background corpus for answering the cloze questions. The Quasar-T\ndataset consists of 43000 open-domain trivia questions and their answers\nobtained from various internet sources. ClueWeb09 serves as the background\ncorpus for extracting these answers. We pose these datasets as a challenge for\ntwo related subtasks of factoid Question Answering: (1) searching for relevant\npieces of text that include the correct answer to a query, and (2) reading the\nretrieved text to answer the query. 
We also describe a retrieval system for\nextracting relevant sentences and documents from the corpus given a query, and\ninclude these in the release for researchers wishing to only focus on (2). We\nevaluate several baselines on both datasets, ranging from simple heuristics to\npowerful neural models, and show that these lag behind human performance by\n16.4% and 32.1% for Quasar-S and -T respectively. The datasets are available at\nhttps://github.com/bdhingra/quasar ." - }, - "toloka/CrowdSpeech": { - "pwc_id": "crowdspeech", - "dataset_name": "CrowdSpeech Dataset", - "dataset_abstract": "CrowdSpeech is a publicly available large-scale dataset of crowdsourced audio transcriptions. It contains annotations for more than 20 hours of English speech from more than 1,000 crowd workers.", - "paper_name": "CrowdSpeech and VoxDIY: Benchmark Datasets for Crowdsourced Audio Transcription", - "paper_abstract": "Domain-specific data is the crux of the successful transfer of machine learning systems from benchmarks to real life. In simple problems such as image classification, crowdsourcing has become one of the standard tools for cheap and time-efficient data collection: thanks in large part to advances in research on aggregation methods. However, the applicability of crowdsourcing to more complex tasks (e.g., speech recognition) remains limited due to the lack of principled aggregation methods for these modalities. The main obstacle towards designing aggregation methods for more advanced applications is the absence of training data, and in this work, we focus on bridging this gap in speech recognition. For this, we collect and release CrowdSpeech -- the first publicly available large-scale dataset of crowdsourced audio transcriptions. Evaluation of existing and novel aggregation methods on our data shows room for improvement, suggesting that our work may entail the design of better algorithms. At a higher level, we also contribute to the more general challenge of developing the methodology for reliable data collection via crowdsourcing. In that, we design a principled pipeline for constructing datasets of crowdsourced audio transcriptions in any novel domain. We show its applicability on an under-resourced language by constructing VoxDIY -- a counterpart of CrowdSpeech for the Russian language. We also release the code that allows a full replication of our data collection pipeline and share various insights on best practices of data collection via crowdsourcing." - }, - "yhavinga/mc4_nl_cleaned": { - "pwc_id": "mc4", - "dataset_name": "mC4 Dataset", - "dataset_abstract": "mC4 is a multilingual variant of the C4 dataset called mC4. mC4 comprises natural text in 101 languages drawn from the public Common Crawl web scrape.", - "paper_name": "mT5: A massively multilingual pre-trained text-to-text transformer", - "paper_abstract": "The recent \"Text-to-Text Transfer Transformer\" (T5) leveraged a unified text-to-text format and scale to attain state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual benchmarks. We also describe a simple technique to prevent \"accidental translation\" in the zero-shot setting, where a generative model chooses to (partially) translate its prediction into the wrong language. 
All of the code and model checkpoints used in this work are publicly available." - }, - "google/xtreme_s": { - "pwc_id": "librispeech-1", - "dataset_name": "LibriSpeech Dataset", - "dataset_abstract": "The LibriSpeech corpus is a collection of approximately 1,000 hours of audiobooks that are a part of the LibriVox project. Most of the audiobooks come from the Project Gutenberg. The training data is split into 3 partitions of 100hr, 360hr, and 500hr sets while the dev and test data are split into the \u2019clean\u2019 and \u2019other\u2019 categories, respectively, depending upon how well or how poorly Automatic Speech Recognition systems would perform against them. Each of the dev and test sets is around 5hr in audio length. This corpus also provides the n-gram language models and the corresponding texts excerpted from the Project Gutenberg books, which contain 803M tokens and 977K unique words.", - "paper_name": "", - "paper_abstract": "" - }, - "drAbreu/bc4chemd_ner": { - "pwc_id": "bc4chemd", - "dataset_name": "BC4CHEMD Dataset", - "dataset_abstract": "Introduced by Krallinger et al. in The CHEMDNER corpus of chemicals and drugs and its annotation principles\n\nBC4CHEMD is a collection of 10,000 PubMed abstracts that contain a total of 84,355 chemical entity mentions labeled manually by expert chemistry literature curators.", - "paper_name": "", - "paper_abstract": "" - }, - "malteos/test2": { - "pwc_id": "cnn-daily-mail-1", - "dataset_name": "CNN/Daily Mail Dataset", - "dataset_abstract": "CNN/Daily Mail is a dataset for text summarization. Human generated abstractive summary bullets were generated from news stories in CNN and Daily Mail websites as questions (with one of the entities hidden), and stories as the corresponding passages from which the system is expected to answer the fill-in the-blank question. The authors released the scripts that crawl, extract and generate pairs of passages and questions from these websites.\n\nIn all, the corpus has 286,817 training pairs, 13,368 validation pairs and 11,487 test pairs, as defined by their scripts. The source documents in the training set have 766 words spanning 29.74 sentences on an average while the summaries consist of 53 words and 3.72 sentences.", - "paper_name": "Abstractive Text Summarization Using Sequence-to-Sequence RNNs and Beyond", - "paper_abstract": "In this work, we model abstractive text summarization using Attentional\nEncoder-Decoder Recurrent Neural Networks, and show that they achieve\nstate-of-the-art performance on two different corpora. We propose several novel\nmodels that address critical problems in summarization that are not adequately\nmodeled by the basic architecture, such as modeling key-words, capturing the\nhierarchy of sentence-to-word structure, and emitting words that are rare or\nunseen at training time. Our work shows that many of our proposed models\ncontribute to further improvement in performance. We also propose a new dataset\nconsisting of multi-sentence summaries, and establish performance benchmarks\nfor further research." - }, - "Sampson2022/demo": { - "pwc_id": "squad", - "dataset_name": "SQuAD Dataset", - "dataset_abstract": "The Stanford Question Answering Dataset (SQuAD) is a collection of question-answer pairs derived from Wikipedia articles. In SQuAD, the correct answers of questions can be any sequence of tokens in the given text. Because the questions and answers are produced by humans through crowdsourcing, it is more diverse than some other question-answering datasets.
SQuAD 1.1 contains 107,785 question-answer pairs on 536 articles. SQuAD2.0 (open-domain SQuAD, SQuAD-Open), the latest version, combines the 100,000 questions in SQuAD1.1 with over 50,000 un-answerable questions written adversarially by crowdworkers in forms that are similar to the answerable ones.", - "paper_name": "SQuAD: 100,000+ Questions for Machine Comprehension of Text", - "paper_abstract": "We present the Stanford Question Answering Dataset (SQuAD), a new reading\ncomprehension dataset consisting of 100,000+ questions posed by crowdworkers on\na set of Wikipedia articles, where the answer to each question is a segment of\ntext from the corresponding reading passage. We analyze the dataset to\nunderstand the types of reasoning required to answer the questions, leaning\nheavily on dependency and constituency trees. We build a strong logistic\nregression model, which achieves an F1 score of 51.0%, a significant\nimprovement over a simple baseline (20%). However, human performance (86.8%) is\nmuch higher, indicating that the dataset presents a good challenge problem for\nfuture research.\n The dataset is freely available at https://stanford-qa.com" - }, - "jason9693/APEACH": { - "pwc_id": "apeach", - "dataset_name": "Korean Hate Speech Evaluation Datasets Dataset", - "dataset_abstract": "APEACH is the first crowd-generated Korean evaluation dataset for hate speech detection. Sentences of the dataset are created by anonymous participants using an online crowdsourcing platform DeepNatural AI.", - "paper_name": "APEACH: Attacking Pejorative Expressions with Analysis on Crowd-Generated Hate Speech Evaluation Datasets", - "paper_abstract": "Detecting toxic or pejorative expressions in online communities has become one of the main concerns for preventing the users' mental harm. This led to the development of large-scale hate speech detection datasets of various domains, which are mainly built upon web-crawled texts with labels by crowd workers. However, for languages other than English, researchers might have to rely on only a small-sized corpus due to the lack of data-driven research of hate speech detection. This sometimes misleads the evaluation of prevalently used pretrained language models (PLMs) such as BERT, given that PLMs often share the domain of pretraining corpus with the evaluation set, resulting in over-representation of the detection performance. Also, the scope of pejorative expressions might be restricted if the dataset is built on a single domain text. To alleviate the above problems in Korean hate speech detection, we propose APEACH,a method that allows the collection of hate speech generated by unspecified users. By controlling the crowd-generation of hate speech and adding only a minimum post-labeling, we create a corpus that enables the generalizable and fair evaluation of hate speech detection regarding text domain and topic. We Compare our outcome with prior work on an annotation-based toxic news comment dataset using publicly available PLMs. We check that our dataset is less sensitive to the lexical overlap between the evaluation set and pretraining corpus of PLMs, showing that it helps mitigate the unexpected under/over-representation of model performance. We distribute our dataset publicly online to further facilitate the general-domain hate speech detection in Korean." 
- }, - "julien-c/reactiongif": { - "pwc_id": "reactiongif", - "dataset_name": "ReactionGIF Dataset", - "dataset_abstract": "ReactionGIF is an affective dataset of 30K tweets which can be used for tasks like induced sentiment prediction and multilabel classification of induced emotions.", - "paper_name": "Happy Dance, Slow Clap: Using Reaction GIFs to Predict Induced Affect on Twitter", - "paper_abstract": "Datasets with induced emotion labels are scarce but of utmost importance for many NLP tasks. We present a new, automated method for collecting texts along with their induced reaction labels. The method exploits the online use of reaction GIFs, which capture complex affective states. We show how to augment the data with induced emotion and induced sentiment labels. We use our method to create and publish ReactionGIF, a first-of-its-kind affective dataset of 30K tweets. We provide baselines for three new tasks, including induced sentiment prediction and multilabel classification of induced emotions. Our method and dataset open new research opportunities in emotion detection and affective computing." - }, - "mozilla-foundation/common_voice_1_0": { - "pwc_id": "common-voice", - "dataset_name": "Common Voice Dataset", - "dataset_abstract": "Common Voice is an audio dataset that consists of a unique MP3 and corresponding text file. There are 9,283 recorded hours in the dataset. The dataset also includes demographic metadata like age, sex, and accent. The dataset consists of 7,335 validated hours in 60 languages.", - "paper_name": "Common Voice: A Massively-Multilingual Speech Corpus", - "paper_abstract": "The Common Voice corpus is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Common Voice is designed for Automatic Speech Recognition purposes but can be useful in other domains (e.g. language identification). To achieve scale and sustainability, the Common Voice project employs crowdsourcing for both data collection and data validation. The most recent release includes 29 languages, and as of November 2019 there are a total of 38 languages collecting data. Over 50,000 individuals have participated so far, resulting in 2,500 hours of collected audio. To our knowledge this is the largest audio corpus in the public domain for speech recognition, both in terms of number of hours and number of languages. As an example use case for Common Voice, we present speech recognition experiments using Mozilla's DeepSpeech Speech-to-Text toolkit. By applying transfer learning from a source English model, we find an average Character Error Rate improvement of 5.99 +/- 5.48 for twelve target languages (German, French, Italian, Turkish, Catalan, Slovenian, Welsh, Irish, Breton, Tatar, Chuvash, and Kabyle). For most of these languages, these are the first ever published results on end-to-end Automatic Speech Recognition." - }, - "mozilla-foundation/common_voice_3_0": { - "pwc_id": "common-voice", - "dataset_name": "Common Voice Dataset", - "dataset_abstract": "Common Voice is an audio dataset that consists of a unique MP3 and corresponding text file. There are 9,283 recorded hours in the dataset. The dataset also includes demographic metadata like age, sex, and accent. 
The dataset consists of 7,335 validated hours in 60 languages.", - "paper_name": "Common Voice: A Massively-Multilingual Speech Corpus", - "paper_abstract": "The Common Voice corpus is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Common Voice is designed for Automatic Speech Recognition purposes but can be useful in other domains (e.g. language identification). To achieve scale and sustainability, the Common Voice project employs crowdsourcing for both data collection and data validation. The most recent release includes 29 languages, and as of November 2019 there are a total of 38 languages collecting data. Over 50,000 individuals have participated so far, resulting in 2,500 hours of collected audio. To our knowledge this is the largest audio corpus in the public domain for speech recognition, both in terms of number of hours and number of languages. As an example use case for Common Voice, we present speech recognition experiments using Mozilla's DeepSpeech Speech-to-Text toolkit. By applying transfer learning from a source English model, we find an average Character Error Rate improvement of 5.99 +/- 5.48 for twelve target languages (German, French, Italian, Turkish, Catalan, Slovenian, Welsh, Irish, Breton, Tatar, Chuvash, and Kabyle). For most of these languages, these are the first ever published results on end-to-end Automatic Speech Recognition." - }, - "mozilla-foundation/common_voice_6_0": { - "pwc_id": "common-voice", - "dataset_name": "Common Voice Dataset", - "dataset_abstract": "Common Voice is an audio dataset that consists of a unique MP3 and corresponding text file. There are 9,283 recorded hours in the dataset. The dataset also includes demographic metadata like age, sex, and accent. The dataset consists of 7,335 validated hours in 60 languages.", - "paper_name": "Common Voice: A Massively-Multilingual Speech Corpus", - "paper_abstract": "The Common Voice corpus is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Common Voice is designed for Automatic Speech Recognition purposes but can be useful in other domains (e.g. language identification). To achieve scale and sustainability, the Common Voice project employs crowdsourcing for both data collection and data validation. The most recent release includes 29 languages, and as of November 2019 there are a total of 38 languages collecting data. Over 50,000 individuals have participated so far, resulting in 2,500 hours of collected audio. To our knowledge this is the largest audio corpus in the public domain for speech recognition, both in terms of number of hours and number of languages. As an example use case for Common Voice, we present speech recognition experiments using Mozilla's DeepSpeech Speech-to-Text toolkit. By applying transfer learning from a source English model, we find an average Character Error Rate improvement of 5.99 +/- 5.48 for twelve target languages (German, French, Italian, Turkish, Catalan, Slovenian, Welsh, Irish, Breton, Tatar, Chuvash, and Kabyle). For most of these languages, these are the first ever published results on end-to-end Automatic Speech Recognition." - }, - "nateraw/food101_old": { - "pwc_id": "food-101", - "dataset_name": "Food-101 Dataset", - "dataset_abstract": "The Food-101 dataset consists of 101 food categories with 750 training and 250 test images per category, making a total of 101k images. 
The labels for the test images have been manually cleaned, while the training set contains some noise.", - "paper_name": "", - "paper_abstract": "" - }, - "nateraw/sync_food101": { - "pwc_id": "food-101", - "dataset_name": "Food-101 Dataset", - "dataset_abstract": "The Food-101 dataset consists of 101 food categories with 750 training and 250 test images per category, making a total of 101k images. The labels for the test images have been manually cleaned, while the training set contains some noise.", - "paper_name": "", - "paper_abstract": "" - }, - "ncats/EpiSet4BinaryClassification": { - "pwc_id": "glue", - "dataset_name": "GLUE Dataset", - "dataset_abstract": "General Language Understanding Evaluation (GLUE) benchmark is a collection of nine natural language understanding tasks, including single-sentence tasks CoLA and SST-2, similarity and paraphrasing tasks MRPC, STS-B and QQP, and natural language inference tasks MNLI, QNLI, RTE and WNLI.", - "paper_name": "GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding", - "paper_abstract": "For natural language understanding (NLU) technology to be maximally useful,\nboth practically and as a scientific object of study, it must be general: it\nmust be able to process language in a way that is not exclusively tailored to\nany one specific task or dataset. In pursuit of this objective, we introduce\nthe General Language Understanding Evaluation benchmark (GLUE), a tool for\nevaluating and analyzing the performance of models across a diverse range of\nexisting NLU tasks. GLUE is model-agnostic, but it incentivizes sharing\nknowledge across tasks because certain tasks have very limited training data.\nWe further provide a hand-crafted diagnostic test suite that enables detailed\nlinguistic analysis of NLU models. We evaluate baselines based on current\nmethods for multi-task and transfer learning and find that they do not\nimmediately give substantial improvements over the aggregate performance of\ntraining a separate model per task, indicating room for improvement in\ndeveloping general and robust NLU systems." - }, - "pierreguillou/lener_br_finetuning_language_model": { - "pwc_id": "lener-br", - "dataset_name": "LeNER-Br Dataset", - "dataset_abstract": "LeNER-Br is a dataset for named entity recognition (NER) in Brazilian Legal Text.", - "paper_name": "LeNER-Br: a Dataset for Named Entity Recognition in Brazilian Legal Text", - "paper_abstract": "Named entity recognition systems have the untapped potential to extract information from legal documents, which can improve\r\ninformation retrieval and decision-making processes. In this paper, a dataset for named entity recognition in Brazilian legal documents is presented. Unlike other Portuguese language datasets, this dataset is composed entirely of legal documents. In addition to tags for persons, locations, time entities and organizations, the dataset contains specific tags for law and legal cases entities. To establish a set of baseline results, we first performed experiments on another Portuguese dataset: Paramopama. This evaluation demonstrate that LSTM-CRF gives results that are significantly better than those previously reported. We then retrained LSTM-CRF, on our dataset and obtained F 1 scores of 97.04% and 88.82% for Legislation and Legal case entities, respectively.\r\nThese results show the viability of the proposed dataset for legal applications." 
- }, - "pietrolesci/ag_news": { - "pwc_id": "ag-news", - "dataset_name": "AG News Dataset", - "dataset_abstract": "AG News (AG\u2019s News Corpus) is a subdataset of AG's corpus of news articles constructed by assembling titles and description fields of articles from the 4 largest classes (\u201cWorld\u201d, \u201cSports\u201d, \u201cBusiness\u201d, \u201cSci/Tech\u201d) of AG\u2019s Corpus. The AG News contains 30,000 training and 1,900 test samples per class.", - "paper_name": "Character-level Convolutional Networks for Text Classification", - "paper_abstract": "This article offers an empirical exploration on the use of character-level\nconvolutional networks (ConvNets) for text classification. We constructed\nseveral large-scale datasets to show that character-level convolutional\nnetworks could achieve state-of-the-art or competitive results. Comparisons are\noffered against traditional models such as bag of words, n-grams and their\nTFIDF variants, and deep learning models such as word-based ConvNets and\nrecurrent neural networks." - }, - "oscar-corpus/OSCAR-2201": { - "pwc_id": "oscar", - "dataset_name": "OSCAR Dataset", - "dataset_abstract": "OSCAR or Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture. The dataset used for training multilingual models such as BART incorporates 138 GB of text.", - "paper_name": "A Monolingual Approach to Contextualized Word Embeddings for Mid-Resource Languages", - "paper_abstract": "We use the multilingual OSCAR corpus, extracted from Common Crawl via language classification, filtering and cleaning, to train monolingual contextualized word embeddings (ELMo) for five mid-resource languages. We then compare the performance of OSCAR-based and Wikipedia-based ELMo embeddings for these languages on the part-of-speech tagging and parsing tasks. We show that, despite the noise in the Common-Crawl-based OSCAR data, embeddings trained on OSCAR perform much better than monolingual embeddings trained on Wikipedia. They actually equal or improve the current state of the art in tagging and parsing for all five languages. In particular, they also improve over multilingual Wikipedia-based contextual embeddings (multilingual BERT), which almost always constitutes the previous state of the art, thereby showing that the benefit of a larger, more diverse corpus surpasses the cross-lingual benefit of multilingual embedding architectures." - }, - "nthngdy/oscar-small": { - "pwc_id": "oscar", - "dataset_name": "OSCAR Dataset", - "dataset_abstract": "OSCAR or Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture. The dataset used for training multilingual models such as BART incorporates 138 GB of text.", - "paper_name": "A Monolingual Approach to Contextualized Word Embeddings for Mid-Resource Languages", - "paper_abstract": "We use the multilingual OSCAR corpus, extracted from Common Crawl via language classification, filtering and cleaning, to train monolingual contextualized word embeddings (ELMo) for five mid-resource languages. We then compare the performance of OSCAR-based and Wikipedia-based ELMo embeddings for these languages on the part-of-speech tagging and parsing tasks. 
We show that, despite the noise in the Common-Crawl-based OSCAR data, embeddings trained on OSCAR perform much better than monolingual embeddings trained on Wikipedia. They actually equal or improve the current state of the art in tagging and parsing for all five languages. In particular, they also improve over multilingual Wikipedia-based contextual embeddings (multilingual BERT), which almost always constitutes the previous state of the art, thereby showing that the benefit of a larger, more diverse corpus surpasses the cross-lingual benefit of multilingual embedding architectures." - }, - "yhavinga/ccmatrix": { - "pwc_id": "ccmatrix", - "dataset_name": "CCMatrix Dataset", - "dataset_abstract": "CCMatrix uses ten snapshots of a curated common crawl corpus (Wenzek et al., 2019) totalling 32.7 billion unique sentences.", - "paper_name": "CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB", - "paper_abstract": "We show that margin-based bitext mining in a multilingual sentence space can be applied to monolingual corpora of billions of sentences. We are using ten snapshots of a curated common crawl corpus (Wenzek et al., 2019) totalling 32.7 billion unique sentences. Using one unified approach for 38 languages, we were able to mine 4.5 billion parallel sentences, out of which 661 million are aligned with English. 20 language pairs have more than 30 million parallel sentences, 112 more than 10 million, and most more than one million, including direct alignments between many European or Asian languages. To evaluate the quality of the mined bitexts, we train NMT systems for most of the language pairs and evaluate them on TED, WMT and WAT test sets. Using our mined bitexts only and no human translated parallel data, we achieve a new state-of-the-art for a single system on the WMT'19 test set for translation between English and German, Russian and Chinese, as well as German/French. In particular, our English/German system outperforms the best single one by close to 4 BLEU points and is almost on par with the best WMT'19 evaluation system, which uses system combination and back-translation. We also achieve excellent results for distant language pairs like Russian/Japanese, outperforming the best submission at the 2019 workshop on Asian Translation (WAT)." - }, - "lewtun/autoevaluate__emotion": { - "pwc_id": "emotion", - "dataset_name": "CARER Dataset", - "dataset_abstract": "CARER is an emotion dataset collected through noisy labels, annotated via distant supervision as in (Go et al., 2009). \n\nThe subset of data provided here corresponds to the six emotions variant described in the paper. The six emotions are anger, fear, joy, love, sadness, and surprise.", - "paper_name": "CARER: Contextualized Affect Representations for Emotion Recognition", - "paper_abstract": "Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks.
Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks." - }, - "mwong/fever-evidence-related": { - "pwc_id": "fever", - "dataset_name": "FEVER Dataset", - "dataset_abstract": "FEVER is a publicly available dataset for fact extraction and verification against textual sources.\n\nIt consists of 185,445 claims manually verified against the introductory sections of Wikipedia pages and classified as SUPPORTED, REFUTED or NOTENOUGHINFO. For the first two classes, systems and annotators need to also return the combination of sentences forming the necessary evidence supporting or refuting the claim.\n\nThe claims were generated by human annotators extracting claims from Wikipedia and mutating them in a variety of ways, some of which were meaning-altering. The verification of each claim was conducted in a separate annotation process by annotators who were aware of the page but not the sentence from which original claim was\nextracted and thus in 31.75% of the claims more than one sentence was considered appropriate evidence. Claims require composition of evidence from multiple sentences in 16.82% of cases. Furthermore, in 12.15% of the claims, this evidence was taken from multiple pages.", - "paper_name": "FEVER: a large-scale dataset for Fact Extraction and VERification", - "paper_abstract": "In this paper we introduce a new publicly available dataset for verification\nagainst textual sources, FEVER: Fact Extraction and VERification. It consists\nof 185,445 claims generated by altering sentences extracted from Wikipedia and\nsubsequently verified without knowledge of the sentence they were derived from.\nThe claims are classified as Supported, Refuted or NotEnoughInfo by annotators\nachieving 0.6841 in Fleiss $\\kappa$. For the first two classes, the annotators\nalso recorded the sentence(s) forming the necessary evidence for their\njudgment. To characterize the challenge of the dataset presented, we develop a\npipeline approach and compare it to suitably designed oracles. The best\naccuracy we achieve on labeling a claim accompanied by the correct evidence is\n31.87%, while if we ignore the evidence we achieve 50.91%. Thus we believe that\nFEVER is a challenging testbed that will help stimulate progress on claim\nverification against textual sources." - }, - "mwong/climate-evidence-related": { - "pwc_id": "climate-fever", - "dataset_name": "CLIMATE-FEVER Dataset", - "dataset_abstract": "A new publicly available dataset for verification of climate change-related claims.", - "paper_name": "CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims", - "paper_abstract": "We introduce CLIMATE-FEVER, a new publicly available dataset for verification of climate change-related claims. By providing a dataset for the research community, we aim to facilitate and encourage work on improving algorithms for retrieving evidential support for climate-specific claims, addressing the underlying language understanding challenges, and ultimately help alleviate the impact of misinformation on climate change. We adapt the methodology of FEVER [1], the largest dataset of artificially designed claims, to real-life claims collected from the Internet. While during this process, we could rely on the expertise of renowned climate scientists, it turned out to be no easy task. 
We discuss the surprising, subtle complexity of modeling real-world climate-related claims within the \\textsc{fever} framework, which we believe provides a valuable challenge for general natural language understanding. We hope that our work will mark the beginning of a new exciting long-term joint effort by the climate science and AI community." - }, - "mwong/fever-claim-related": { - "pwc_id": "fever", - "dataset_name": "FEVER Dataset", - "dataset_abstract": "FEVER is a publicly available dataset for fact extraction and verification against textual sources.\n\nIt consists of 185,445 claims manually verified against the introductory sections of Wikipedia pages and classified as SUPPORTED, REFUTED or NOTENOUGHINFO. For the first two classes, systems and annotators need to also return the combination of sentences forming the necessary evidence supporting or refuting the claim.\n\nThe claims were generated by human annotators extracting claims from Wikipedia and mutating them in a variety of ways, some of which were meaning-altering. The verification of each claim was conducted in a separate annotation process by annotators who were aware of the page but not the sentence from which original claim was\nextracted and thus in 31.75% of the claims more than one sentence was considered appropriate evidence. Claims require composition of evidence from multiple sentences in 16.82% of cases. Furthermore, in 12.15% of the claims, this evidence was taken from multiple pages.", - "paper_name": "FEVER: a large-scale dataset for Fact Extraction and VERification", - "paper_abstract": "In this paper we introduce a new publicly available dataset for verification\nagainst textual sources, FEVER: Fact Extraction and VERification. It consists\nof 185,445 claims generated by altering sentences extracted from Wikipedia and\nsubsequently verified without knowledge of the sentence they were derived from.\nThe claims are classified as Supported, Refuted or NotEnoughInfo by annotators\nachieving 0.6841 in Fleiss $\\kappa$. For the first two classes, the annotators\nalso recorded the sentence(s) forming the necessary evidence for their\njudgment. To characterize the challenge of the dataset presented, we develop a\npipeline approach and compare it to suitably designed oracles. The best\naccuracy we achieve on labeling a claim accompanied by the correct evidence is\n31.87%, while if we ignore the evidence we achieve 50.91%. Thus we believe that\nFEVER is a challenging testbed that will help stimulate progress on claim\nverification against textual sources." - }, - "mwong/climate-claim-related": { - "pwc_id": "climate-fever", - "dataset_name": "CLIMATE-FEVER Dataset", - "dataset_abstract": "A new publicly available dataset for verification of climate change-related claims.", - "paper_name": "CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims", - "paper_abstract": "We introduce CLIMATE-FEVER, a new publicly available dataset for verification of climate change-related claims. By providing a dataset for the research community, we aim to facilitate and encourage work on improving algorithms for retrieving evidential support for climate-specific claims, addressing the underlying language understanding challenges, and ultimately help alleviate the impact of misinformation on climate change. We adapt the methodology of FEVER [1], the largest dataset of artificially designed claims, to real-life claims collected from the Internet. 
While during this process, we could rely on the expertise of renowned climate scientists, it turned out to be no easy task. We discuss the surprising, subtle complexity of modeling real-world climate-related claims within the \\textsc{fever} framework, which we believe provides a valuable challenge for general natural language understanding. We hope that our work will mark the beginning of a new exciting long-term joint effort by the climate science and AI community." - }, - "Peihao/test-dateset": { - "pwc_id": "c4", - "dataset_name": "C4 Dataset", - "dataset_abstract": "C4 is a colossal, cleaned version of Common Crawl's web crawl corpus. It was based on Common Crawl dataset: https://commoncrawl.org. It was used to train the T5 text-to-text Transformer models.\n\nThe dataset can be downloaded in a pre-processed form from allennlp.", - "paper_name": "Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer", - "paper_abstract": "Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts all text-based language problems into a text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled data sets, transfer approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration with scale and our new ``Colossal Clean Crawled Corpus'', we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. To facilitate future work on transfer learning for NLP, we release our data set, pre-trained models, and code." - }, - "surrey-nlp/PLOD-filtered": { - "pwc_id": "plod-filtered", - "dataset_name": "PLOD-filtered Dataset", - "dataset_abstract": "PLOD: An Abbreviation Detection Dataset\n\nThis is the PLOD (filtered) Dataset published at LREC 2022. The dataset can help build sequence labelling models for the task of Abbreviation Detection.", - "paper_name": "PLOD: An Abbreviation Detection Dataset for Scientific Documents", - "paper_abstract": "The detection and extraction of abbreviations from unstructured texts can help to improve the performance of Natural Language Processing tasks, such as machine translation and information retrieval. However, in terms of publicly available datasets, there is not enough data for training deep-neural-networks-based models to the point of generalising well over data. This paper presents PLOD, a large-scale dataset for abbreviation detection and extraction that contains 160k+ segments automatically annotated with abbreviations and their long forms. We performed manual validation over a set of instances and a complete automatic validation for this dataset. We then used it to generate several baseline models for detecting abbreviations and long forms. The best models achieved an F1-score of 0.92 for abbreviations and 0.89 for detecting their corresponding long forms. 
We release this dataset along with our code and all the models publicly in https://github.com/surrey-nlp/PLOD-AbbreviationDetection" - }, - "surrey-nlp/PLOD-unfiltered": { - "pwc_id": "plod-an-abbreviation-detection-dataset-for", - "dataset_name": "PLOD-unfiltered Dataset", - "dataset_abstract": "PLOD: An Abbreviation Detection Dataset\n\nThis is the PLOD (unfiltered) Dataset published at LREC 2022. The dataset can help build sequence labelling models for the task of Abbreviation Detection.", - "paper_name": "PLOD: An Abbreviation Detection Dataset for Scientific Documents", - "paper_abstract": "The detection and extraction of abbreviations from unstructured texts can help to improve the performance of Natural Language Processing tasks, such as machine translation and information retrieval. However, in terms of publicly available datasets, there is not enough data for training deep-neural-networks-based models to the point of generalising well over data. This paper presents PLOD, a large-scale dataset for abbreviation detection and extraction that contains 160k+ segments automatically annotated with abbreviations and their long forms. We performed manual validation over a set of instances and a complete automatic validation for this dataset. We then used it to generate several baseline models for detecting abbreviations and long forms. The best models achieved an F1-score of 0.92 for abbreviations and 0.89 for detecting their corresponding long forms. We release this dataset along with our code and all the models publicly in https://github.com/surrey-nlp/PLOD-AbbreviationDetection" - }, - "Lexi/spanextract": { - "pwc_id": "squad", - "dataset_name": "SQuAD Dataset", - "dataset_abstract": "The Stanford Question Answering Dataset (SQuAD) is a collection of question-answer pairs derived from Wikipedia articles. In SQuAD, the correct answers of questions can be any sequence of tokens in the given text. Because the questions and answers are produced by humans through crowdsourcing, it is more diverse than some other question-answering datasets. SQuAD 1.1 contains 107,785 question-answer pairs on 536 articles. SQuAD2.0 (open-domain SQuAD, SQuAD-Open), the latest version, combines the 100,000 questions in SQuAD1.1 with over 50,000 un-answerable questions written adversarially by crowdworkers in forms that are similar to the answerable ones.", - "paper_name": "SQuAD: 100,000+ Questions for Machine Comprehension of Text", - "paper_abstract": "We present the Stanford Question Answering Dataset (SQuAD), a new reading\ncomprehension dataset consisting of 100,000+ questions posed by crowdworkers on\na set of Wikipedia articles, where the answer to each question is a segment of\ntext from the corresponding reading passage. We analyze the dataset to\nunderstand the types of reasoning required to answer the questions, leaning\nheavily on dependency and constituency trees. We build a strong logistic\nregression model, which achieves an F1 score of 51.0%, a significant\nimprovement over a simple baseline (20%). However, human performance (86.8%) is\nmuch higher, indicating that the dataset presents a good challenge problem for\nfuture research.\n The dataset is freely available at https://stanford-qa.com" - }, - "aps/imagenet2012": { - "pwc_id": "imagenet", - "dataset_name": "ImageNet Dataset", - "dataset_abstract": "The ImageNet dataset contains 14,197,122 annotated images according to the WordNet hierarchy. 
Since 2010 the dataset is used in the ImageNet Large Scale Visual Recognition Challenge (ILSVRC), a benchmark in image classification and object detection.\nThe publicly released dataset contains a set of manually annotated training images. A set of test images is also released, with the manual annotations withheld.\nILSVRC annotations fall into one of two categories: (1) image-level annotation of a binary label for the presence or absence of an object class in the image, e.g., \u201cthere are cars in this image\u201d but \u201cthere are no tigers,\u201d and (2) object-level annotation of a tight bounding box and class label around an object instance in the image, e.g., \u201cthere is a screwdriver centered at position (20,25) with width of 50 pixels and height of 30 pixels\u201d.\nThe ImageNet project does not own the copyright of the images, therefore only thumbnails and URLs of images are provided.\n\n\nTotal number of non-empty WordNet synsets: 21841\nTotal number of images: 14197122\nNumber of images with bounding box annotations: 1,034,908\nNumber of synsets with SIFT features: 1000\nNumber of images with SIFT features: 1.2 million", - "paper_name": "", - "paper_abstract": "" - }, - "patrickvonplaten/librispeech_asr_self_contained": { - "pwc_id": "librispeech-1", - "dataset_name": "LibriSpeech Dataset", - "dataset_abstract": "The LibriSpeech corpus is a collection of approximately 1,000 hours of audiobooks that are a part of the LibriVox project. Most of the audiobooks come from the Project Gutenberg. The training data is split into 3 partitions of 100hr, 360hr, and 500hr sets while the dev and test data are split into the \u2019clean\u2019 and \u2019other\u2019 categories, respectively, depending upon how well or how poorly Automatic Speech Recognition systems would perform against them. Each of the dev and test sets is around 5hr in audio length. This corpus also provides the n-gram language models and the corresponding texts excerpted from the Project Gutenberg books, which contain 803M tokens and 977K unique words.", - "paper_name": "", - "paper_abstract": "" - }, - "lewtun/autoevaluate__imdb": { - "pwc_id": "imdb-movie-reviews", - "dataset_name": "IMDb Movie Reviews Dataset", - "dataset_abstract": "The IMDb Movie Reviews dataset is a binary sentiment analysis dataset consisting of 50,000 reviews from the Internet Movie Database (IMDb) labeled as positive or negative. The dataset contains an even number of positive and negative reviews. Only highly polarizing reviews are considered. A negative review has a score \u2264 4 out of 10, and a positive review has a score \u2265 7 out of 10. No more than 30 reviews are included per movie. The dataset contains additional unlabeled data.", - "paper_name": "", - "paper_abstract": "" - }, - "lewtun/autoevaluate__squad": { - "pwc_id": "squad", - "dataset_name": "SQuAD Dataset", - "dataset_abstract": "The Stanford Question Answering Dataset (SQuAD) is a collection of question-answer pairs derived from Wikipedia articles. In SQuAD, the correct answers of questions can be any sequence of tokens in the given text. Because the questions and answers are produced by humans through crowdsourcing, it is more diverse than some other question-answering datasets. SQuAD 1.1 contains 107,785 question-answer pairs on 536 articles.
SQuAD2.0 (open-domain SQuAD, SQuAD-Open), the latest version, combines the 100,000 questions in SQuAD1.1 with over 50,000 un-answerable questions written adversarially by crowdworkers in forms that are similar to the answerable ones.", - "paper_name": "SQuAD: 100,000+ Questions for Machine Comprehension of Text", - "paper_abstract": "We present the Stanford Question Answering Dataset (SQuAD), a new reading\ncomprehension dataset consisting of 100,000+ questions posed by crowdworkers on\na set of Wikipedia articles, where the answer to each question is a segment of\ntext from the corresponding reading passage. We analyze the dataset to\nunderstand the types of reasoning required to answer the questions, leaning\nheavily on dependency and constituency trees. We build a strong logistic\nregression model, which achieves an F1 score of 51.0%, a significant\nimprovement over a simple baseline (20%). However, human performance (86.8%) is\nmuch higher, indicating that the dataset presents a good challenge problem for\nfuture research.\n The dataset is freely available at https://stanford-qa.com" - }, - "lewtun/autoevaluate__xsum": { - "pwc_id": "xsum", - "dataset_name": "XSum Dataset", - "dataset_abstract": "The Extreme Summarization (XSum) dataset is a dataset for evaluation of abstractive single-document summarization systems. The goal is to create a short, one-sentence news summary answering the question \u201cWhat is the article about?\u201d. The dataset consists of 226,711 news articles accompanied with a one-sentence summary. The articles are collected from BBC articles (2010 to 2017) and cover a wide variety of domains (e.g., News, Politics, Sports, Weather, Business, Technology, Science, Health, Family, Education, Entertainment and Arts). The official random split contains 204,045 (90%), 11,332 (5%) and 11,334 (5%) documents in training, validation and test sets, respectively.", - "paper_name": "Don't Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization", - "paper_abstract": "We introduce extreme summarization, a new single-document summarization task\nwhich does not favor extractive strategies and calls for an abstractive\nmodeling approach. The idea is to create a short, one-sentence news summary\nanswering the question \"What is the article about?\". We collect a real-world,\nlarge-scale dataset for this task by harvesting online articles from the\nBritish Broadcasting Corporation (BBC). We propose a novel abstractive model\nwhich is conditioned on the article's topics and based entirely on\nconvolutional neural networks. We demonstrate experimentally that this\narchitecture captures long-range dependencies in a document and recognizes\npertinent content, outperforming an oracle extractive system and\nstate-of-the-art abstractive approaches when evaluated automatically and by\nhumans." - }, - "lewtun/autoevaluate__ncbi_disease": { - "pwc_id": "ncbi-disease-1", - "dataset_name": "NCBI Disease Dataset", - "dataset_abstract": "The NCBI Disease corpus consists of 793 PubMed abstracts, which are separated into training (593), development (100) and test (100) subsets.
The NCBI Disease corpus is annotated with disease mentions, using concept identifiers from either MeSH or OMIM.", - "paper_name": "", - "paper_abstract": "" - }, - "cfilt/HiNER-original": { - "pwc_id": "hiner-original-1", - "dataset_name": "HiNER-original Dataset", - "dataset_abstract": "This dataset releases a significantly sized standard-abiding Hindi NER dataset containing 109,146 sentences and 2,220,856 tokens, annotated with 11 tags.", - "paper_name": "HiNER: A Large Hindi Named Entity Recognition Dataset", - "paper_abstract": "Named Entity Recognition (NER) is a foundational NLP task that aims to provide class labels like Person, Location, Organisation, Time, and Number to words in free text. Named Entities can also be multi-word expressions where the additional I-O-B annotation information helps label them during the NER annotation process. While English and European languages have considerable annotated data for the NER task, Indian languages lack on that front -- both in terms of quantity and following annotation standards. This paper releases a significantly sized standard-abiding Hindi NER dataset containing 109,146 sentences and 2,220,856 tokens, annotated with 11 tags. We discuss the dataset statistics in all their essential detail and provide an in-depth analysis of the NER tag-set used with our data. The statistics of tag-set in our dataset show a healthy per-tag distribution, especially for prominent classes like Person, Location and Organisation. Since the proof of resource-effectiveness is in building models with the resource and testing the model on benchmark data and against the leader-board entries in shared tasks, we do the same with the aforesaid data. We use different language models to perform the sequence labelling task for NER and show the efficacy of our data by performing a comparative evaluation with models trained on another dataset available for the Hindi NER task. Our dataset helps achieve a weighted F1 score of 88.78 with all the tags and 92.22 when we collapse the tag-set, as discussed in the paper. To the best of our knowledge, no available dataset meets the standards of volume (amount) and variability (diversity), as far as Hindi NER is concerned. We fill this gap through this work, which we hope will significantly help NLP for Hindi. We release this dataset with our code and models at https://github.com/cfiltnlp/HiNER" - }, - "janck/bigscience-lama": { - "pwc_id": "lama", - "dataset_name": "LAMA Dataset", - "dataset_abstract": "LAnguage Model Analysis (LAMA) consists of a set of knowledge sources, each comprised of a set of facts. LAMA is a probe for analyzing the factual and commonsense knowledge contained in pretrained language models.", - "paper_name": "Language Models as Knowledge Bases?", - "paper_abstract": "Recent progress in pretraining language models on large textual corpora led to a surge of improvements for downstream NLP tasks. Whilst learning linguistic knowledge, these models may also be storing relational knowledge present in the training data, and may be able to answer queries structured as \"fill-in-the-blank\" cloze statements. Language models have many advantages over structured knowledge bases: they require no schema engineering, allow practitioners to query about an open class of relations, are easy to extend to more data, and require no human supervision to train. We present an in-depth analysis of the relational knowledge already present (without fine-tuning) in a wide range of state-of-the-art pretrained language models. 
We find that (i) without fine-tuning, BERT contains relational knowledge competitive with traditional NLP methods that have some access to oracle knowledge, (ii) BERT also does remarkably well on open-domain question answering against a supervised baseline, and (iii) certain types of factual knowledge are learned much more readily than others by standard language model pretraining approaches. The surprisingly strong ability of these models to recall factual knowledge without any fine-tuning demonstrates their potential as unsupervised open-domain QA systems. The code to reproduce our analysis is available at https://github.com/facebookresearch/LAMA." - }, - "AmazonScience/massive": { - "pwc_id": "massive", - "dataset_name": "MASSIVE Dataset", - "dataset_abstract": "MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations for the Natural Language Understanding tasks of intent prediction and slot annotation. Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.", - "paper_name": "MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages", - "paper_abstract": "We present the MASSIVE dataset--Multilingual Amazon Slu resource package (SLURP) for Slot-filling, Intent classification, and Virtual assistant Evaluation. MASSIVE contains 1M realistic, parallel, labeled virtual assistant utterances spanning 51 languages, 18 domains, 60 intents, and 55 slots. MASSIVE was created by tasking professional translators to localize the English-only SLURP dataset into 50 typologically diverse languages from 29 genera. We also present modeling results on XLM-R and mT5, including exact match accuracy, intent classification accuracy, and slot-filling F1 score. We have released our dataset, modeling code, and models publicly." - }, - "strombergnlp/danfever": { - "pwc_id": "danfever", - "dataset_name": "DanFEVER Dataset", - "dataset_abstract": "We present a dataset, DANFEVER, intended for claim verification in Danish. The dataset builds upon the task framing of the FEVER fact extraction and verification challenge. DANFEVER can be used for creating models for detecting mis- & disinformation in Danish as well as for verification in multilingual settings.", - "paper_name": "DanFEVER: claim verification dataset for Danish", - "paper_abstract": "We present a dataset, DanFEVER, intended for multilingual misinformation research. The dataset is in Danish and has the same format as the well-known English FEVER dataset. It can be used for testing methods in multilingual settings, as well as for creating models in production for the Danish language." - }, - "strombergnlp/broad_twitter_corpus": { - "pwc_id": "broad-twitter-corpus", - "dataset_name": "Broad Twitter Corpus Dataset", - "dataset_abstract": "This paper introduces the Broad Twitter Corpus (BTC), which is not only significantly bigger, but sampled across different regions, temporal periods, and types of Twitter users. The gold-standard named entity annotations are made by a combination of NLP experts and crowd workers, which enables us to harness crowd recall while maintaining high quality. We also measure the entity drift observed in our dataset (i.e. 
how entity representation varies over time), and compare to newswire.", - "paper_name": "Broad Twitter Corpus: A Diverse Named Entity Recognition Resource", - "paper_abstract": "One of the main obstacles, hampering method development and comparative evaluation of named entity recognition in social media, is the lack of a sizeable, diverse, high quality annotated corpus, analogous to the CoNLL{'}2003 news dataset. For instance, the biggest Ritter tweet corpus is only 45,000 tokens {--} a mere 15{\\%} the size of CoNLL{'}2003. Another major shortcoming is the lack of temporal, geographic, and author diversity. This paper introduces the Broad Twitter Corpus (BTC), which is not only significantly bigger, but sampled across different regions, temporal periods, and types of Twitter users. The gold-standard named entity annotations are made by a combination of NLP experts and crowd workers, which enables us to harness crowd recall while maintaining high quality. We also measure the entity drift observed in our dataset (i.e. how entity representation varies over time), and compare to newswire. The corpus is released openly, including source text and intermediate annotations." - }, - "strombergnlp/ipm_nel": { - "pwc_id": "ipm-nel", - "dataset_name": "IPM NEL Dataset", - "dataset_abstract": "This data is for the task of named entity recognition and linking/disambiguation over tweets. It comprises\nthe addition of an entity URI layer on top of an NER-annotated tweet dataset. The task is to detect entities\nand then provide a correct link to them in DBpedia, thus disambiguating otherwise ambiguous entity surface\nforms; for example, this means linking \"Paris\" to the correct instance of a city named that (e.g. Paris, \nFrance vs. Paris, Texas).\n\nThe data concentrates on ten types of named entities: company, facility, geographic location, movie, musical\nartist, person, product, sports team, TV show, and other.\n\nThe file is tab separated, in CoNLL format, with line breaks between tweets.\nData preserves the tokenisation used in the Ritter datasets.\nPoS labels are not present for all tweets, but where they could be found in the Ritter\ndata, they're given. In cases where a URI could not be agreed, or was not present in\nDBpedia, there is a NIL. See the paper for a full description of the methodology.", - "paper_name": "Analysis of Named Entity Recognition and Linking for Tweets", - "paper_abstract": "Applying natural language processing for mining and intelligent information\naccess to tweets (a form of microblog) is a challenging, emerging research\narea. Unlike carefully authored news text and other longer content, tweets pose\na number of new challenges, due to their short, noisy, context-dependent, and\ndynamic nature. Information extraction from tweets is typically performed in a\npipeline, comprising consecutive stages of language identification,\ntokenisation, part-of-speech tagging, named entity recognition and entity\ndisambiguation (e.g. with respect to DBpedia). In this work, we describe a new\nTwitter entity disambiguation dataset, and conduct an empirical analysis of\nnamed entity recognition and disambiguation, investigating how robust a number\nof state-of-the-art systems are on such noisy texts, what the main sources of\nerror are, and which problems should be further investigated to improve the\nstate of the art." 
- }, - "strombergnlp/shaj": { - "pwc_id": "shaj", - "dataset_name": "SHAJ Dataset", - "dataset_abstract": "This is an abusive/offensive language detection dataset for Albanian. The data is formatted following the OffensEval convention. Data is from Instagram and YouTube comments.", - "paper_name": "Detecting Abusive Albanian", - "paper_abstract": "The ever growing usage of social media in the recent years has had a direct impact on the increased presence of hate speech and offensive speech in online platforms. Research on effective detection of such content has mainly focused on English and a few other widespread languages, while the leftover majority fail to have the same work put into them and thus cannot benefit from the steady advancements made in the field. In this paper we present \\textsc{Shaj}, an annotated Albanian dataset for hate speech and offensive speech that has been constructed from user-generated content on various social media platforms. Its annotation follows the hierarchical schema introduced in OffensEval. The dataset is tested using three different classification models, the best of which achieves an F1 score of 0.77 for the identification of offensive language, 0.64 F1 score for the automatic categorization of offensive types and lastly, 0.52 F1 score for the offensive language target identification." - }, - "strombergnlp/polstance": { - "pwc_id": "polstance", - "dataset_name": "polstance Dataset", - "dataset_abstract": "Political stance in Danish. Examples represent statements by politicians and are annotated for, against, or neutral to a given topic/article.", - "paper_name": "Political Stance in Danish", - "paper_abstract": "The task of stance detection consists of classifying the opinion within a text towards some target. This paper seeks to generate a dataset of quotes from Danish politicians, label this dataset to allow the task of stance detection to be performed, and present annotation guidelines to allow further expansion of the generated dataset. Furthermore, three models based on an LSTM architecture are designed, implemented and optimized to perform the task of stance detection for the generated dataset. Experiments are performed using conditionality and bi-directionality for these models, and using either singular word embeddings or averaged word embeddings for an entire quote, to determine the optimal model design. The simplest model design, applying neither conditionality or bi-directionality, and averaged word embeddings across quotes, yields the strongest results. Furthermore, it was found that inclusion of the quotes politician, and the party affiliation of the quoted politician, greatly improved performance of the strongest model." - }, - "strombergnlp/twitter_pos_vcb": { - "pwc_id": "twitter-pos-vcb", - "dataset_name": "Twitter PoS VCB Dataset", - "dataset_abstract": "The data is about 1.5 million English tweets annotated for part-of-speech using Ritter's extension of the PTB tagset. The tweets are from 2012 and 2013, tokenized using the GATE tokenizer and tagged jointly using the CMU ARK tagger and Ritter's T-POS tagger. Only when both these taggers' outputs are completely compatible over a whole tweet, is that tweet added to the dataset.", - "paper_name": "", - "paper_abstract": "" - }, - "strombergnlp/zulu_stance": { - "pwc_id": "zulu-stance", - "dataset_name": "zulu-stance Dataset", - "dataset_abstract": "This is a stance detection dataset in the Zulu language. 
The data is translated to Zulu by Zulu native speakers, from English source texts.\n\nOur paper aims at utilizing this progress made for English to transfers that knowledge into other languages, which is a non-trivial task due to the domain gap between English and the target languages. We propose a black-box non-intrusive method that utilizes techniques from Domain Adaptation to reduce the domain gap, without requiring any human expertise in the target language, by leveraging low-quality data in both a supervised and unsupervised manner. This allows us to rapidly achieve similar results for stance detection for the Zulu language, the target language in this work, as are found for English. A natively-translated dataset is used for evaluation of domain transfer.", - "paper_name": "Bridging the Domain Gap for Stance Detection for the Zulu language", - "paper_abstract": "Misinformation has become a major concern in recent last years given its spread across our information sources. In the past years, many NLP tasks have been introduced in this area, with some systems reaching good results on English language datasets. Existing AI based approaches for fighting misinformation in literature suggest automatic stance detection as an integral first step to success. Our paper aims at utilizing this progress made for English to transfers that knowledge into other languages, which is a non-trivial task due to the domain gap between English and the target languages. We propose a black-box non-intrusive method that utilizes techniques from Domain Adaptation to reduce the domain gap, without requiring any human expertise in the target language, by leveraging low-quality data in both a supervised and unsupervised manner. This allows us to rapidly achieve similar results for stance detection for the Zulu language, the target language in this work, as are found for English. We also provide a stance detection dataset in the Zulu language. Our experimental results show that by leveraging English datasets and machine translation we can increase performances on both English data along with other languages." - }, - "mozilla-foundation/common_voice_9_0": { - "pwc_id": "common-voice", - "dataset_name": "Common Voice Dataset", - "dataset_abstract": "Common Voice is an audio dataset that consists of a unique MP3 and corresponding text file. There are 9,283 recorded hours in the dataset. The dataset also includes demographic metadata like age, sex, and accent. The dataset consists of 7,335 validated hours in 60 languages.", - "paper_name": "Common Voice: A Massively-Multilingual Speech Corpus", - "paper_abstract": "The Common Voice corpus is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Common Voice is designed for Automatic Speech Recognition purposes but can be useful in other domains (e.g. language identification). To achieve scale and sustainability, the Common Voice project employs crowdsourcing for both data collection and data validation. The most recent release includes 29 languages, and as of November 2019 there are a total of 38 languages collecting data. Over 50,000 individuals have participated so far, resulting in 2,500 hours of collected audio. To our knowledge this is the largest audio corpus in the public domain for speech recognition, both in terms of number of hours and number of languages. As an example use case for Common Voice, we present speech recognition experiments using Mozilla's DeepSpeech Speech-to-Text toolkit. 
By applying transfer learning from a source English model, we find an average Character Error Rate improvement of 5.99 +/- 5.48 for twelve target languages (German, French, Italian, Turkish, Catalan, Slovenian, Welsh, Irish, Breton, Tatar, Chuvash, and Kabyle). For most of these languages, these are the first ever published results on end-to-end Automatic Speech Recognition." - }, - "google/wit": { - "pwc_id": "wit", - "dataset_name": "WIT Dataset", - "dataset_abstract": "Wikipedia-based Image Text (WIT) Dataset is a large multimodal multilingual dataset. WIT is composed of a curated set of 37.6 million entity rich image-text examples with 11.5 million unique images across 108 Wikipedia languages. Its size enables WIT to be used as a pretraining dataset for multimodal machine learning models.\n\nKey Advantages\n\nA few unique advantages of WIT:\n\n\nThe largest multimodal dataset (time of this writing) by the number of image-text examples.\nA massively multilingual (first of its kind) with coverage for over 100+ languages.\nA collection of diverse set of concepts and real world entities.\nBrings forth challenging real-world test sets.", - "paper_name": "WIT: Wikipedia-based Image Text Dataset for Multimodal Multilingual Machine Learning", - "paper_abstract": "The milestone improvements brought about by deep representation learning and pre-training techniques have led to large performance gains across downstream NLP, IR and Vision tasks. Multimodal modeling techniques aim to leverage large high-quality visio-linguistic datasets for learning complementary information (across image and text modalities). In this paper, we introduce the Wikipedia-based Image Text (WIT) Dataset (https://github.com/google-research-datasets/wit) to better facilitate multimodal, multilingual learning. WIT is composed of a curated set of 37.6 million entity rich image-text examples with 11.5 million unique images across 108 Wikipedia languages. Its size enables WIT to be used as a pretraining dataset for multimodal models, as we show when applied to downstream tasks such as image-text retrieval. WIT has four main and unique advantages. First, WIT is the largest multimodal dataset by the number of image-text examples by 3x (at the time of writing). Second, WIT is massively multilingual (first of its kind) with coverage over 100+ languages (each of which has at least 12K examples) and provides cross-lingual texts for many images. Third, WIT represents a more diverse set of concepts and real world entities relative to what previous datasets cover. Lastly, WIT provides a very challenging real-world test set, as we empirically illustrate using an image-text retrieval task as an example." - }, - "shanya/crd3": { - "pwc_id": "crd3", - "dataset_name": "CRD3 Dataset", - "dataset_abstract": "The dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding abstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player collaboration and spoken interaction.", - "paper_name": "Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset", - "paper_abstract": "This paper describes the Critical Role Dungeons and Dragons Dataset (CRD3) and related analyses. Critical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game. 
The dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding abstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player collaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail, and semantic ties to the previous dialogues. In addition, we provide a data augmentation method that produces 34,243 summary-dialogue chunk pairs to support current neural ML approaches, and we provide an abstractive summarization benchmark and evaluation." - }, - "wikimedia/wit_base": { - "pwc_id": "wit", - "dataset_name": "WIT Dataset", - "dataset_abstract": "Wikipedia-based Image Text (WIT) Dataset is a large multimodal multilingual dataset. WIT is composed of a curated set of 37.6 million entity rich image-text examples with 11.5 million unique images across 108 Wikipedia languages. Its size enables WIT to be used as a pretraining dataset for multimodal machine learning models.\n\nKey Advantages\n\nA few unique advantages of WIT:\n\n\nThe largest multimodal dataset (time of this writing) by the number of image-text examples.\nA massively multilingual (first of its kind) with coverage for over 100+ languages.\nA collection of diverse set of concepts and real world entities.\nBrings forth challenging real-world test sets.", - "paper_name": "WIT: Wikipedia-based Image Text Dataset for Multimodal Multilingual Machine Learning", - "paper_abstract": "The milestone improvements brought about by deep representation learning and pre-training techniques have led to large performance gains across downstream NLP, IR and Vision tasks. Multimodal modeling techniques aim to leverage large high-quality visio-linguistic datasets for learning complementary information (across image and text modalities). In this paper, we introduce the Wikipedia-based Image Text (WIT) Dataset (https://github.com/google-research-datasets/wit) to better facilitate multimodal, multilingual learning. WIT is composed of a curated set of 37.6 million entity rich image-text examples with 11.5 million unique images across 108 Wikipedia languages. Its size enables WIT to be used as a pretraining dataset for multimodal models, as we show when applied to downstream tasks such as image-text retrieval. WIT has four main and unique advantages. First, WIT is the largest multimodal dataset by the number of image-text examples by 3x (at the time of writing). Second, WIT is massively multilingual (first of its kind) with coverage over 100+ languages (each of which has at least 12K examples) and provides cross-lingual texts for many images. Third, WIT represents a more diverse set of concepts and real world entities relative to what previous datasets cover. Lastly, WIT provides a very challenging real-world test set, as we empirically illustrate using an image-text retrieval task as an example." - }, - "orieg/elsevier-oa-cc-by": { - "pwc_id": "elsevier-oa-cc-by", - "dataset_name": "Elsevier OA CC-BY Dataset", - "dataset_abstract": "An open corpus of Scientific Research papers which has a representative sample from across scientific disciplines. 
This corpus not only includes the full text of the article, but also the metadata of the documents, along with the bibliographic information for each reference.", - "paper_name": "Elsevier OA CC-By Corpus", - "paper_abstract": "We introduce the Elsevier OA CC-BY corpus. This is the first open corpus of Scientific Research papers which has a representative sample from across scientific disciplines. This corpus not only includes the full text of the article, but also the metadata of the documents, along with the bibliographic information for each reference." - }, - "JoesSattle/common_voice_specific_version": { - "pwc_id": "common-voice", - "dataset_name": "Common Voice Dataset", - "dataset_abstract": "Common Voice is an audio dataset that consists of a unique MP3 and corresponding text file. There are 9,283 recorded hours in the dataset. The dataset also includes demographic metadata like age, sex, and accent. The dataset consists of 7,335 validated hours in 60 languages.", - "paper_name": "Common Voice: A Massively-Multilingual Speech Corpus", - "paper_abstract": "The Common Voice corpus is a massively-multilingual collection of transcribed speech intended for speech technology research and development. Common Voice is designed for Automatic Speech Recognition purposes but can be useful in other domains (e.g. language identification). To achieve scale and sustainability, the Common Voice project employs crowdsourcing for both data collection and data validation. The most recent release includes 29 languages, and as of November 2019 there are a total of 38 languages collecting data. Over 50,000 individuals have participated so far, resulting in 2,500 hours of collected audio. To our knowledge this is the largest audio corpus in the public domain for speech recognition, both in terms of number of hours and number of languages. As an example use case for Common Voice, we present speech recognition experiments using Mozilla's DeepSpeech Speech-to-Text toolkit. By applying transfer learning from a source English model, we find an average Character Error Rate improvement of 5.99 +/- 5.48 for twelve target languages (German, French, Italian, Turkish, Catalan, Slovenian, Welsh, Irish, Breton, Tatar, Chuvash, and Kabyle). For most of these languages, these are the first ever published results on end-to-end Automatic Speech Recognition." - }, - "filwsyl/video_tags": { - "pwc_id": "mnist", - "dataset_name": "MNIST Dataset", - "dataset_abstract": "The MNIST database (Modified National Institute of Standards and Technology database) is a large collection of handwritten digits. It has a training set of 60,000 examples, and a test set of 10,000 examples. It is a subset of a larger NIST Special Database 3 (digits written by employees of the United States Census Bureau) and Special Database 1 (digits written by high school students) which contain monochrome images of handwritten digits. The digits have been size-normalized and centered in a fixed-size image. The original black and white (bilevel) images from NIST were size normalized to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting images contain grey levels as a result of the anti-aliasing technique used by the normalization algorithm. 
the images were centered in a 28x28 image by computing the center of mass of the pixels, and translating the image so as to position this point at the center of the 28x28 field.", - "paper_name": "", - "paper_abstract": "" - }, - "strombergnlp/twitter_pos": { - "pwc_id": "ritter-pos", - "dataset_name": "Ritter PoS Dataset", - "dataset_abstract": "PTB-tagged English Tweets", - "paper_name": "Named Entity Recognition in Tweets: An Experimental Study", - "paper_abstract": "People tweet more than 100 Million times\r\ndaily, yielding a noisy, informal, but sometimes informative corpus of 140-character\r\nmessages that mirrors the zeitgeist in an unprecedented manner. The performance of\r\nstandard NLP tools is severely degraded on\r\ntweets. This paper addresses this issue by\r\nre-building the NLP pipeline beginning with\r\npart-of-speech tagging, through chunking, to\r\nnamed-entity recognition. Our novel T-NER\r\nsystem doubles F1 score compared with the\r\nStanford NER system. T-NER leverages the\r\nredundancy inherent in tweets to achieve this\r\nperformance, using LabeledLDA to exploit\r\nFreebase dictionaries as a source of distant\r\nsupervision. LabeledLDA outperforms cotraining, increasing F1 by 25% over ten common entity types.\r\nOur NLP tools are available at: http://\r\ngithub.com/aritter/twitter_nlp" - }, - "strombergnlp/rustance": { - "pwc_id": "rustance", - "dataset_name": "RuStance Dataset", - "dataset_abstract": "Includes Russian tweets and news comments from multiple sources, covering multiple stories, as well as text classification approaches to stance detection as benchmarks over this data in this language.", - "paper_name": "Stance Prediction for Russian: Data and Analysis", - "paper_abstract": "Stance detection is a critical component of rumour and fake news\nidentification. It involves the extraction of the stance a particular author\ntakes related to a given claim, both expressed in text. This paper investigates\nstance classification for Russian. It introduces a new dataset, RuStance, of\nRussian tweets and news comments from multiple sources, covering multiple\nstories, as well as text classification approaches to stance detection as\nbenchmarks over this data in this language. As well as presenting this\nopenly-available dataset, the first of its kind for Russian, the paper presents\na baseline for stance prediction in the language." - }, - "rpeeters/wdc-computers": { - "pwc_id": "wdc-products", - "dataset_name": "WDC Products Dataset", - "dataset_abstract": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\") for four product categories, computers, cameras, watches and shoes. \n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test sets. For each product category, we provide training sets in four different sizes (2.000-70.000 pairs). Furthermore there are sets of ids for each training set for a possible validation split (stratified random draw) available. The test set for each product category consists of 1.100 product pairs. 
The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision. \n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.", - "paper_name": "", - "paper_abstract": "" - }, - "rpeeters/wdc-watches": { - "pwc_id": "wdc-products", - "dataset_name": "WDC Products Dataset", - "dataset_abstract": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\") for four product categories, computers, cameras, watches and shoes. \n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test sets. For each product category, we provide training sets in four different sizes (2.000-70.000 pairs). Furthermore there are sets of ids for each training set for a possible validation split (stratified random draw) available. The test set for each product category consists of 1.100 product pairs. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision. \n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.", - "paper_name": "", - "paper_abstract": "" - }, - "MilaNLProc/honest": { - "pwc_id": "honest-en", - "dataset_name": "HONEST Dataset", - "dataset_abstract": "The HONEST dataset is a template-based corpus for testing the hurtfulness of sentence completions in language models (e.g., BERT) in six different languages (English, Italian, French, Portuguese, Romanian, and Spanish). HONEST is composed of 420 instances for each language, which are generated from 28 identity terms (14 male and 14 female) and 15 templates. It uses a set of identifier terms in singular and plural (i.e., woman, women, girl, boys) and a series of predicates (i.e., \u201cworks as [MASK]\u201d, \u201cis known for [MASK]\u201d). The objective is to use language models to fill the sentence, then the hurtfulness of the completion is evaluated.", - "paper_name": "HONEST: Measuring Hurtful Sentence Completion in Language Models", - "paper_abstract": "Language models have revolutionized the field of NLP. However, language models capture and proliferate hurtful stereotypes, especially in text generation. Our results show that 4.3{\\%} of the time, language models complete a sentence with a hurtful word. These cases are not random, but follow language and gender-specific patterns. We propose a score to measure hurtful sentence completions in language models (HONEST). It uses a systematic template- and lexicon-based bias evaluation methodology for six languages. Our findings suggest that these models replicate and amplify deep-seated societal stereotypes about gender roles. Sentence completions refer to sexual promiscuity when the target is female in 9{\\%} of the time, and in 4{\\%} to homosexuality when the target is male. 
The results raise questions about the use of these models in production settings." - }, - "strombergnlp/nordic_langid": { - "pwc_id": "nordic-langid", - "dataset_name": "nordic_langid Dataset", - "dataset_abstract": "Automatic language identification is a challenging problem. Discriminating between closely related languages is especially difficult. This paper presents a machine learning approach for automatic language identification for the Nordic languages, which often suffer miscategorisation by existing state-of-the-art tools. Concretely we will focus on discrimination between six Nordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm\u00e5l), Faroese and Icelandic.", - "paper_name": "Discriminating Between Similar Nordic Languages", - "paper_abstract": "Automatic language identification is a challenging problem. Discriminating between closely related languages is especially difficult. This paper presents a machine learning approach for automatic language identification for the Nordic languages, which often suffer miscategorisation by existing state-of-the-art tools. Concretely we will focus on discrimination between six Nordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm{\\aa}l), Faroese and Icelandic." - }, - "aps/charades": { - "pwc_id": "charades", - "dataset_name": "Charades Dataset", - "dataset_abstract": "The Charades dataset is composed of 9,848 videos of daily indoors activities with an average length of 30 seconds, involving interactions with 46 objects classes in 15 types of indoor scenes and containing a vocabulary of 30 verbs leading to 157 action classes. Each video in this dataset is annotated by multiple free-text descriptions, action labels, action intervals and classes of interacting objects. 267 different users were presented with a sentence, which includes objects and actions from a fixed vocabulary, and they recorded a video acting out the sentence. In total, the dataset contains 66,500 temporal annotations for 157 action classes, 41,104 labels for 46 object classes, and 27,847 textual descriptions of the videos. In the standard split there are7,986 training video and 1,863 validation video.", - "paper_name": "Hollywood in Homes: Crowdsourcing Data Collection for Activity Understanding", - "paper_abstract": "Computer vision has a great potential to help our daily lives by searching\nfor lost keys, watering flowers or reminding us to take a pill. To succeed with\nsuch tasks, computer vision methods need to be trained from real and diverse\nexamples of our daily dynamic scenes. While most of such scenes are not\nparticularly exciting, they typically do not appear on YouTube, in movies or TV\nbroadcasts. So how do we collect sufficiently many diverse but boring samples\nrepresenting our lives? We propose a novel Hollywood in Homes approach to\ncollect such data. Instead of shooting videos in the lab, we ensure diversity\nby distributing and crowdsourcing the whole process of video creation from\nscript writing to video recording and annotation. Following this procedure we\ncollect a new dataset, Charades, with hundreds of people recording videos in\ntheir own homes, acting out casual everyday activities. The dataset is composed\nof 9,848 annotated videos with an average length of 30 seconds, showing\nactivities of 267 people from three continents. Each video is annotated by\nmultiple free-text descriptions, action labels, action intervals and classes of\ninteracted objects. 
In total, Charades provides 27,847 video descriptions,\n66,500 temporally localized intervals for 157 action classes and 41,104 labels\nfor 46 object classes. Using this rich data, we evaluate and provide baseline\nresults for several tasks including action recognition and automatic\ndescription generation. We believe that the realism, diversity, and casual\nnature of this dataset will present unique challenges and new opportunities for\ncomputer vision community." - }, - "strombergnlp/bornholmsk_parallel": { - "pwc_id": "bornholmsk-parallel", - "dataset_name": "bornholmsk_parallel Dataset", - "dataset_abstract": "This dataset is parallel text for Bornholmsk and Danish.", - "paper_name": "Bornholmsk Natural Language Processing: Resources and Tools", - "paper_abstract": "This paper introduces language processing resources and tools for Bornholmsk, a language spoken on the island of Bornholm, with roots in Danish and closely related to Scanian. This presents an overview of the language and available data, and the first NLP models for this living, minority Nordic language. Sammenfattnijng p\u00e5 borrijnholmst: D\u00e6jnna artikkelijn introduserer naturspr\u00e5gsresurser \u00e5 varktoi for borrijnholmst, ed spr\u00e5g a d\u00e6r snakkes p\u00e5 \u00f6n Borrijnholm me r\u00f8dder i danst \u00e5 i n\u00e6r familia me sk\u00e5nst. Artikkelijn gjer ed \u00e2uersyn \u00e2uer spr\u00e5ged \u00e5 di datan som fijnnes, \u00e5 di fosste NLP mod\u00e6llarna for d\u00e6tta l\u00e6wenes nordiska minnret\u00e2lsspr\u00e5ged." - }, - "strombergnlp/bajer_danish_misogyny": { - "pwc_id": "bajer-danish-misogyny", - "dataset_name": "bajer_danish_misogyny Dataset", - "dataset_abstract": "This is a high-quality dataset of annotated posts sampled from social media posts and annotated for misogyny. Danish language.", - "paper_name": "Annotating Online Misogyny", - "paper_abstract": "Online misogyny, a category of online abusive language, has serious and harmful social consequences. Automatic detection of misogynistic language online, while imperative, poses complicated challenges to both data gathering, data annotation, and bias mitigation, as this type of data is linguistically complex and diverse. This paper makes three contributions in this area: Firstly, we describe the detailed design of our iterative annotation process and codebook. Secondly, we present a comprehensive taxonomy of labels for annotating misogyny in natural written language, and finally, we introduce a high-quality dataset of annotated posts sampled from social media posts." - } -} \ No newline at end of file