annotate-relevance

Sleeping

App Files Community

orionweller committed on Feb 23

Commit

8bfed60

•

1 Parent(s): 3c28932

move to avoid import

Browse files

Files changed (2) hide show

app.py +67 -1
find_splitting_words.py +0 -103

app.py CHANGED Viewed

@@ -9,8 +9,74 @@ import re
 import tqdm
 import plotly.express as px
 from dataset_loading import load_local_qrels, load_local_corpus, load_local_queries
-from find_splitting_words import find_dividing_words
 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

 import tqdm
 import plotly.express as px
+import pandas as pd
+from nltk.corpus import stopwords
+from nltk.stem import PorterStemmer
+from nltk.tokenize import word_tokenize
+from collections import Counter
+import string
+import os
+import streamlit as st
+# Ensure you've downloaded the set of stop words the first time you run this
+import nltk
+# only download if they don't exist
+if not os.path.exists(os.path.join(nltk.data.find('corpora'), 'stopwords')):
+    nltk.download('punkt')
+    nltk.download('stopwords')
 from dataset_loading import load_local_qrels, load_local_corpus, load_local_queries
+def preprocess_document(doc):
+    """
+    Tokenizes, removes punctuation, stopwords, and stems words in a single document.
+    """
+    # Lowercase
+    doc = doc.lower()
+    # Remove punctuation
+    doc = doc.translate(str.maketrans('', '', string.punctuation))
+    # Tokenize
+    tokens = word_tokenize(doc)
+    # Remove stop words
+    stop_words = set(stopwords.words('english'))
+    filtered_tokens = [word for word in tokens if word not in stop_words]
+    # Stemming
+    stemmer = PorterStemmer()
+    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
+    return stemmed_tokens
+@st.cache_data
+def find_dividing_words(documents):
+    """
+    Identifies candidate words that might split the set of documents into two groups.
+    """
+    all_words = []
+    per_doc_word_counts = []
+    i = 0
+    for doc in documents:
+        print(i)
+        preprocessed_doc = preprocess_document(doc)
+        all_words.extend(preprocessed_doc)
+        per_doc_word_counts.append(Counter(preprocessed_doc))
+        i += 1
+    # Overall word frequency
+    overall_word_counts = Counter(all_words)
+    # Find words that appear in roughly half the documents
+    num_docs = len(documents)
+    candidate_words = []
+    for word, count in overall_word_counts.items():
+        doc_frequency = sum(1 for doc_count in per_doc_word_counts if doc_count[word] > 0)
+        if 0.35 * num_docs <= doc_frequency <= 0.75 * num_docs:
+            candidate_words.append(word)
+    print("Done with dividing words")
+    return candidate_words
 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

find_splitting_words.py DELETED Viewed

@@ -1,103 +0,0 @@
-import argparse
-import pandas as pd
-from nltk.corpus import stopwords
-from nltk.stem import PorterStemmer
-from nltk.tokenize import word_tokenize
-from collections import Counter
-import string
-import os
-import streamlit as st
-# Ensure you've downloaded the set of stop words the first time you run this
-import nltk
-# only download if they don't exist
-if not os.path.exists(os.path.join(nltk.data.find('corpora'), 'stopwords')):
-    nltk.download('punkt')
-    nltk.download('stopwords')
-def preprocess_document(doc):
-    """
-    Tokenizes, removes punctuation, stopwords, and stems words in a single document.
-    """
-    # Lowercase
-    doc = doc.lower()
-    # Remove punctuation
-    doc = doc.translate(str.maketrans('', '', string.punctuation))
-    # Tokenize
-    tokens = word_tokenize(doc)
-    # Remove stop words
-    stop_words = set(stopwords.words('english'))
-    filtered_tokens = [word for word in tokens if word not in stop_words]
-    # Stemming
-    stemmer = PorterStemmer()
-    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
-    return stemmed_tokens
-@st.cache_data
-def find_dividing_words(documents):
-    """
-    Identifies candidate words that might split the set of documents into two groups.
-    """
-    all_words = []
-    per_doc_word_counts = []
-    i = 0
-    for doc in documents:
-        print(i)
-        preprocessed_doc = preprocess_document(doc)
-        all_words.extend(preprocessed_doc)
-        per_doc_word_counts.append(Counter(preprocessed_doc))
-        i += 1
-    # Overall word frequency
-    overall_word_counts = Counter(all_words)
-    # Find words that appear in roughly half the documents
-    num_docs = len(documents)
-    candidate_words = []
-    for word, count in overall_word_counts.items():
-        doc_frequency = sum(1 for doc_count in per_doc_word_counts if doc_count[word] > 0)
-        if 0.35 * num_docs <= doc_frequency <= 0.75 * num_docs:
-            candidate_words.append(word)
-    print("Done with dividing words")
-    return candidate_words
-def make_contents(doc):
-    """
-    Returns the contents of a document as a single string.
-    """
-    if "title" in doc and "contents" in doc:
-        return doc["title"] + " " + doc["contents"]
-    if "headline" in doc and "text" in doc:
-        return doc["headline"] + " " + doc["text"]
-    if "title" in doc and "text" in doc:
-        return doc["title"] + " " + doc["text"]
-    if "contents" in doc:
-        return doc["contents"]
-    if "text" in doc:
-        return doc["text"]
-def main(args):
-    # read in the qrels and docs file from the `args.dataset` directory for the `.relevant_only` files
-    base_dir = os.path.join("data", args.dataset)
-    qrels = pd.read_csv(os.path.join(base_dir, "qrels.relevant_only.trec"), sep="\t", header=None, names=["qid", "docid", "rel"])
-    docs = pd.read_json(os.path.join(base_dir, "docs.relevant_only.jsonl"), lines=True)
-    for qid in qrels.groupby("qid").groups.keys():
-        # get the relevant documents for the current query
-        relevant_docids = qrels[qrels["qid"] == qid]["docid"].tolist()
-        # get the text for the relevant documents
-        relevant_docs_text = docs[docs["doc_id"].isin(relevant_docids)].apply(lambda x: make_contents(x), axis=1).tolist()
-        splitting_words = find_dividing_words(relevant_docs_text)
-        breakpoint()
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Find words that might split the set of documents into two groups.')
-    parser.add_argument('dataset', type=str, help='The dataset to use (e.g. "robust04")')
-    args = parser.parse_args()
-    main(args)