import argparse
import os
import string
from collections import Counter

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download the punkt tokenizer models and stop word list the first time this runs
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_document(doc):
    """
    Lowercases a single document, removes punctuation and stop words, and stems the remaining tokens.
    """
    # Lowercase
    doc = doc.lower()
    # Remove punctuation
    doc = doc.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = word_tokenize(doc)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Stem each remaining token
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmed_tokens

def find_dividing_words(documents):
    """
    Identifies candidate words that might split the set of documents into two groups.
    """
    all_words = []
    per_doc_word_counts = []
    for doc in documents:
        preprocessed_doc = preprocess_document(doc)
        all_words.extend(preprocessed_doc)
        per_doc_word_counts.append(Counter(preprocessed_doc))

    # Overall word frequency across all documents
    overall_word_counts = Counter(all_words)

    # Keep words whose document frequency falls in a middle band (30-70% of the documents):
    # a word that appears in roughly half the documents could divide them into two groups
    num_docs = len(documents)
    candidate_words = []
    for word in overall_word_counts:
        doc_frequency = sum(1 for doc_count in per_doc_word_counts if doc_count[word] > 0)
        if 0.3 * num_docs <= doc_frequency <= 0.7 * num_docs:
            candidate_words.append(word)

    return candidate_words

def make_contents(doc):
    """
    Returns the text content of a document row as a single string,
    handling the different field names used across datasets.
    """
    if "title" in doc and "contents" in doc:
        return doc["title"] + " " + doc["contents"]
    if "headline" in doc and "text" in doc:
        return doc["headline"] + " " + doc["text"]
    if "title" in doc and "text" in doc:
        return doc["title"] + " " + doc["text"]
    if "contents" in doc:
        return doc["contents"]
    if "text" in doc:
        return doc["text"]
    # Fall back to an empty string so downstream preprocessing never receives None
    return ""

def main(args):
    # Read the `.relevant_only` qrels and docs files from the `data/<args.dataset>` directory
    base_dir = os.path.join("data", args.dataset)
    qrels = pd.read_csv(os.path.join(base_dir, "qrels.relevant_only.trec"), sep="\t", header=None, names=["qid", "docid", "rel"])
    docs = pd.read_json(os.path.join(base_dir, "docs.relevant_only.jsonl"), lines=True)

    for qid in qrels.groupby("qid").groups.keys():
        # Get the IDs of the documents judged relevant for the current query
        relevant_docids = qrels[qrels["qid"] == qid]["docid"].tolist()
        # Get the text for those relevant documents
        relevant_docs_text = docs[docs["doc_id"].isin(relevant_docids)].apply(make_contents, axis=1).tolist()
        splitting_words = find_dividing_words(relevant_docs_text)
        # Drop into the debugger to inspect `splitting_words` for this query interactively
        breakpoint()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Find words that might split the set of documents into two groups.')
    parser.add_argument('dataset', type=str, help='The dataset to use (e.g. "robust04")')
    args = parser.parse_args()
    main(args)
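
# Example invocation, assuming this script is saved as find_dividing_words.py (hypothetical name)
# and that data/robust04/qrels.relevant_only.trec and data/robust04/docs.relevant_only.jsonl exist:
#
#   python find_dividing_words.py robust04
#
# The script then stops at the breakpoint() for each query so the candidate splitting words
# can be inspected interactively.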