import argparse
import os
import string
from collections import Counter

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download the punkt tokenizer models and stop word list the first time this runs
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_document(doc):
    """
    Lowercases a single document, removes punctuation and stop words, and stems the remaining tokens.
    """
    # Lowercase
    doc = doc.lower()
    # Remove punctuation
    doc = doc.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = word_tokenize(doc)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Stem each remaining token
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmed_tokens

def find_dividing_words(documents):
    """
    Identifies candidate words that might split the set of documents into two groups.
    """
    all_words = []
    per_doc_word_counts = []
    for doc in documents:
        preprocessed_doc = preprocess_document(doc)
        all_words.extend(preprocessed_doc)
        per_doc_word_counts.append(Counter(preprocessed_doc))

    # Overall word frequency across all documents
    overall_word_counts = Counter(all_words)

    # Keep words whose document frequency falls in a middle band (30-70% of the documents):
    # a word that appears in roughly half the documents could divide them into two groups
    num_docs = len(documents)
    candidate_words = []
    for word in overall_word_counts:
        doc_frequency = sum(1 for doc_count in per_doc_word_counts if doc_count[word] > 0)
        if 0.3 * num_docs <= doc_frequency <= 0.7 * num_docs:
            candidate_words.append(word)

    return candidate_words

def make_contents(doc):
    """
    Returns the text content of a document row as a single string,
    handling the different field names used across datasets.
    """
    if "title" in doc and "contents" in doc:
        return doc["title"] + " " + doc["contents"]
    if "headline" in doc and "text" in doc:
        return doc["headline"] + " " + doc["text"]
    if "title" in doc and "text" in doc:
        return doc["title"] + " " + doc["text"]
    if "contents" in doc:
        return doc["contents"]
    if "text" in doc:
        return doc["text"]
    # Fall back to an empty string so downstream preprocessing never receives None
    return ""

def main(args):
    # Read the `.relevant_only` qrels and docs files from the `data/<args.dataset>` directory
    base_dir = os.path.join("data", args.dataset)
    qrels = pd.read_csv(os.path.join(base_dir, "qrels.relevant_only.trec"), sep="\t", header=None, names=["qid", "docid", "rel"])
    docs = pd.read_json(os.path.join(base_dir, "docs.relevant_only.jsonl"), lines=True)

    for qid in qrels.groupby("qid").groups.keys():
        # Get the IDs of the documents judged relevant for the current query
        relevant_docids = qrels[qrels["qid"] == qid]["docid"].tolist()
        # Get the text for those relevant documents
        relevant_docs_text = docs[docs["doc_id"].isin(relevant_docids)].apply(make_contents, axis=1).tolist()
        splitting_words = find_dividing_words(relevant_docs_text)
        # Drop into the debugger to inspect `splitting_words` for this query interactively
        breakpoint()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Find words that might split the set of documents into two groups.')
    parser.add_argument('dataset', type=str, help='The dataset to use (e.g. "robust04")')
    args = parser.parse_args()
    main(args)
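
# Example invocation, assuming this script is saved as find_dividing_words.py (hypothetical name)
# and that data/robust04/qrels.relevant_only.trec and data/robust04/docs.relevant_only.jsonl exist:
#
#   python find_dividing_words.py robust04
#
# The script then stops at the breakpoint() for each query so the candidate splitting words
# can be inspected interactively.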