Spaces:

Tiggy115
/

NLP4Web_HW1

Sleeping

App Files Files Community

Kurt commited on Nov 8, 2024

Commit

8dc749e

1 Parent(s): 530e7d9

toll2

Browse files

Files changed (15) hide show

app.py +12 -12
nlp4web_codebase/README.md +2 -0
nlp4web_codebase/nlp4web_codebase/__init__.py +0 -0
nlp4web_codebase/nlp4web_codebase/lr/__init__.py +0 -0
nlp4web_codebase/nlp4web_codebase/lr/analysis.py +160 -0
nlp4web_codebase/nlp4web_codebase/lr/data_loaders/__init__.py +35 -0
nlp4web_codebase/nlp4web_codebase/lr/data_loaders/dm.py +22 -0
nlp4web_codebase/nlp4web_codebase/lr/data_loaders/sciq.py +86 -0
nlp4web_codebase/nlp4web_codebase/lr/models/__init__.py +21 -0
nlp4web_codebase/requirements.txt +1 -0
nlp4web_codebase/setup.py +37 -0
sample_date/README.md +19 -0
sample_date/anscombe.json +49 -0
sample_date/california_housing_test.csv +0 -0
sample_date/california_housing_train.csv +0 -0

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 from typing import TypedDict
 from dataclasses import dataclass
@@ -11,6 +12,16 @@ import re
 import nltk
 nltk.download("stopwords", quiet=True)
 from nltk.corpus import stopwords as nltk_stopwords
 LANGUAGE = "english"
 word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
@@ -133,19 +144,12 @@ def run_counting(
         doc_texts=doc_texts,
     )
-from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
 sciq = load_sciq()
 counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
 """### BM25 Index"""
-from __future__ import annotations
-from dataclasses import asdict, dataclass
-import math
-import os
-from typing import Iterable, List, Optional, Type
-import tqdm
-from nlp4web_codebase.ir.data_loaders.dm import Document
 @dataclass
@@ -244,10 +248,6 @@ bm25_index.save("output/bm25_index")
 """### BM25 Retriever"""
-from nlp4web_codebase.ir.models import BaseRetriever
-from typing import Type
-from abc import abstractmethod
 class BaseInvertedIndexRetriever(BaseRetriever):

+from __future__ import annotations
 import gradio as gr
 from typing import TypedDict
 from dataclasses import dataclass
 import nltk
 nltk.download("stopwords", quiet=True)
 from nltk.corpus import stopwords as nltk_stopwords
+from dataclasses import asdict, dataclass
+import math
+import os
+from typing import Iterable, List, Optional, Type
+import tqdm
+from nlp4web_codebase.ir.data_loaders.dm import Document
+from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
+from nlp4web_codebase.ir.models import BaseRetriever
+from typing import Type
+from abc import abstractmethod
 LANGUAGE = "english"
 word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
         doc_texts=doc_texts,
     )
 sciq = load_sciq()
 counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
 """### BM25 Index"""
 @dataclass
 """### BM25 Retriever"""
 class BaseInvertedIndexRetriever(BaseRetriever):

nlp4web_codebase/README.md ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # nlp4web
2	+ Codebase of teaching materials for NLP4Web.

nlp4web_codebase/nlp4web_codebase/__init__.py ADDED Viewed

File without changes

nlp4web_codebase/nlp4web_codebase/lr/__init__.py ADDED Viewed

File without changes

nlp4web_codebase/nlp4web_codebase/lr/analysis.py ADDED Viewed

	@@ -0,0 +1,160 @@

+import os
+from typing import Dict, List, Optional, Protocol
+import pandas as pd
+import tqdm
+import ujson
+from nlp4web_codebase.ir.data_loaders import IRDataset
def round_dict(obj: Dict[str, float], ndigits: int = 4) -> Dict[str, float]:
    """Return a new dict with every value of ``obj`` rounded to ``ndigits`` decimals.

    Keys and their insertion order are preserved; ``obj`` itself is not modified.
    """
    rounded: Dict[str, float] = {}
    for key, value in obj.items():
        rounded[key] = round(value, ndigits=ndigits)
    return rounded
def sort_dict(obj: Dict[str, float], reverse: bool = True) -> Dict[str, float]:
    """Return a new dict whose items are ordered by value.

    Args:
        obj: Mapping to order; it is left untouched.
        reverse: When true (the default) the largest value comes first.

    Returns:
        A dict with the same items as ``obj``; items with equal values keep
        their original relative order.
    """
    pairs = list(obj.items())
    pairs.sort(key=lambda item: item[1], reverse=reverse)
    return dict(pairs)
def save_ranking_results(
    output_dir: str,
    query_ids: List[str],
    rankings: List[Dict[str, float]],
    query_performances_lists: List[Dict[str, float]],
    cid2tweights_lists: Optional[List[Dict[str, Dict[str, float]]]] = None,
):
    """Write one JSON line per query to ``<output_dir>/ranking_results.jsonl``.

    Args:
        output_dir: Directory to write into; created if it does not exist.
        query_ids: Query identifiers, parallel to the lists below.
        rankings: Per-query mapping that is stored under ``"ranking"``.
        query_performances_lists: Per-query mapping stored under
            ``"query_performances"``.
        cid2tweights_lists: Optional per-query mapping from collection id to a
            float mapping, stored under ``"cid2tweights"``. When omitted an
            empty dict is stored for every query.

    All float mappings are passed through ``round_dict`` before being saved.
    The three required lists are walked with ``zip``, so the shortest one
    determines how many rows are written.
    """
    os.makedirs(output_dir, exist_ok=True)

    records = []
    triples = zip(query_ids, rankings, query_performances_lists)
    for position, (qid, scores, performances) in enumerate(triples):
        record = {
            "query_id": qid,
            "ranking": round_dict(scores),
            "query_performances": round_dict(performances),
            "cid2tweights": {},
        }
        if cid2tweights_lists is not None:
            rounded_tweights = {}
            for cid, tweights in cid2tweights_lists[position].items():
                rounded_tweights[cid] = round_dict(tweights)
            record["cid2tweights"] = rounded_tweights
        records.append(record)

    frame = pd.DataFrame(records)
    frame.to_json(
        os.path.join(output_dir, "ranking_results.jsonl"),
        orient="records",
        lines=True,
    )
class TermWeightingFunction(Protocol):
    """Structural type of the callables accepted by ``compare`` as term weighting functions.

    Any callable taking ``query`` and ``cid`` keyword arguments and returning a
    ``Dict[str, float]`` satisfies it. ``compare`` sums the returned values to
    obtain a document's score; the keys are presumably the matched terms.
    """

    def __call__(self, query: str, cid: str) -> Dict[str, float]:
        """Return the float mapping for ``query`` against the document ``cid``."""
def compare(
    dataset: IRDataset,
    results_path1: str,
    results_path2: str,
    output_dir: str,
    main_metric: str = "recip_rank",
    system1: Optional[str] = None,
    system2: Optional[str] = None,
    term_weighting_fn1: Optional[TermWeightingFunction] = None,
    term_weighting_fn2: Optional[TermWeightingFunction] = None,
) -> None:
    """Compare two ranking-result files query by query and save the result as TSV.

    Both inputs are JSON-lines files with ``query_id``, ``ranking`` and
    ``query_performances`` columns. They are outer-joined on ``query_id`` and
    one output row is produced per query, holding the query text, both
    rankings, the texts of all ranked documents, both performance dicts, the
    qrels and the difference ``system 1 - system 2`` on ``main_metric``.

    When BOTH term weighting functions are given, each document appearing in
    either ranking is re-scored for both systems as the sum of its term
    weights; these re-computed rankings replace the stored ones in the output
    and the per-document term weights are added as two extra columns.

    Args:
        dataset: Supplies the queries, the corpus and the qrels of all splits.
        results_path1: JSON-lines results of system 1.
        results_path2: JSON-lines results of system 2.
        output_dir: Directory for the output file; created if missing.
        main_metric: Key into ``query_performances`` used for the difference
            column and for sorting (largest difference first).
        system1: Name of system 1, used only in the output file name.
        system2: Name of system 2, used only in the output file name.
        term_weighting_fn1: Optional term weighting function of system 1.
        term_weighting_fn2: Optional term weighting function of system 2.

    Raises:
        AssertionError: If the two result files differ in their number of rows.
    """
    os.makedirs(output_dir, exist_ok=True)
    results1 = pd.read_json(results_path1, orient="records", lines=True)
    results2 = pd.read_json(results_path2, orient="records", lines=True)
    assert len(results1) == len(results2)

    # Lookup tables: qrels merged over every split, queries and documents by id.
    qrels_by_qid = {}
    for split in dataset.split2qrels:
        qrels_by_qid.update(dataset.get_qrels_dict(split))
    queries_by_id = {}
    for query in dataset.queries:
        queries_by_id[query.query_id] = query
    docs_by_cid = {}
    for doc in dataset.corpus:
        docs_by_cid[doc.collection_id] = doc

    diff_col = f"{main_metric}:qp1-qp2"
    # After the merge, "_x" columns belong to system 1 and "_y" columns to system 2.
    joined = pd.merge(results1, results2, on="query_id", how="outer")
    explain_terms = term_weighting_fn1 is not None and term_weighting_fn2 is not None

    table_rows = []
    progress = tqdm.tqdm(joined.iterrows(), desc="Comparing", total=len(joined))
    for _, example in progress:
        # Texts of every document ranked by either system (system 1 first).
        doc_texts = {}
        for ranking_col in ("ranking_x", "ranking_y"):
            for cid in dict(example[ranking_col]):
                doc_texts[cid] = docs_by_cid[cid].text

        qid = example["query_id"]
        metric_diff = (
            example["query_performances_x"][main_metric]
            - example["query_performances_y"][main_metric]
        )
        table_row = {
            "query_id": qid,
            "query": queries_by_id[qid].text,
            diff_col: metric_diff,
            "ranking1": ujson.dumps(example["ranking_x"], indent=4),
            "ranking2": ujson.dumps(example["ranking_y"], indent=4),
            "docs": ujson.dumps(doc_texts, indent=4),
            "query_performances1": ujson.dumps(
                example["query_performances_x"], indent=4
            ),
            "query_performances2": ujson.dumps(
                example["query_performances_y"], indent=4
            ),
            "qrels": ujson.dumps(qrels_by_qid[qid], indent=4),
        }

        if explain_terms:
            query_text = queries_by_id[qid].text
            weights1, weights2 = {}, {}
            scores1, scores2 = {}, {}
            # Re-score the union of both candidate lists with both systems.
            for cid in set(example["ranking_x"]) | set(example["ranking_y"]):
                weights1[cid] = term_weighting_fn1(query=query_text, cid=cid)
                weights2[cid] = term_weighting_fn2(query=query_text, cid=cid)
                scores1[cid] = sum(weights1[cid].values())
                scores2[cid] = sum(weights2[cid].values())
            scores1 = sort_dict(scores1)
            scores2 = sort_dict(scores2)
            table_row["ranking1"] = ujson.dumps(scores1, indent=4)
            table_row["ranking2"] = ujson.dumps(scores2, indent=4)
            # List the term weights in the same order as the re-computed rankings.
            table_row["cid2tweights1"] = ujson.dumps(
                {cid: weights1[cid] for cid in scores1}, indent=4
            )
            table_row["cid2tweights2"] = ujson.dumps(
                {cid: weights2[cid] for cid in scores2}, indent=4
            )

        table_rows.append(table_row)

    table = pd.DataFrame(table_rows).sort_values(by=diff_col, ascending=False)
    table.to_csv(
        os.path.join(output_dir, f"compare-{system1}_vs_{system2}.tsv"),
        sep="\t",
        index=False,
    )
+# if __name__ == "__main__":
+#     # python -m lecture2.bm25.analysis
+#     from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
+#     from lecture2.bm25.bm25_retriever import BM25Retriever
+#     from lecture2.bm25.tfidf_retriever import TFIDFRetriever
+#     import numpy as np
+#     sciq = load_sciq()
+#     system1 = "bm25"
+#     system2 = "tfidf"
+#     results_path1 = f"output/sciq-{system1}/results/ranking_results.jsonl"
+#     results_path2 = f"output/sciq-{system2}/results/ranking_results.jsonl"
+#     index_dir1 = f"output/sciq-{system1}"
+#     index_dir2 = f"output/sciq-{system2}"
+#     compare(
+#         dataset=sciq,
+#         results_path1=results_path1,
+#         results_path2=results_path2,
+#         output_dir=f"output/sciq-{system1}_vs_{system2}",
+#         system1=system1,
+#         system2=system2,
+#         term_weighting_fn1=BM25Retriever(index_dir1).get_term_weights,
+#         term_weighting_fn2=TFIDFRetriever(index_dir2).get_term_weights,
+#     )
+#     # bias on #shared_terms of TFIDF:
+#     df1 = pd.read_json(results_path1, orient="records", lines=True)
+#     df2 = pd.read_json(results_path2, orient="records", lines=True)
+#     merged = pd.merge(df1, df2, on="query_id", how="outer")
+#     nterms1 = []
+#     nterms2 = []
+#     for _, row in merged.iterrows():
+#         nterms1.append(len(list(dict(row["cid2tweights_x"]).values())[0]))
+#         nterms2.append(len(list(dict(row["cid2tweights_y"]).values())[0]))
+#     percentiles = (5, 25, 50, 75, 95)
+#     print(system1, np.percentile(nterms1, percentiles), np.mean(nterms1).round(2))
+#     print(system2, np.percentile(nterms2, percentiles), np.mean(nterms2).round(2))
+#     # bm25 [ 3.  4.  5.  7. 11.] 5.64
+#     # tfidf [1. 2. 3. 5. 9.] 3.58

nlp4web_codebase/nlp4web_codebase/lr/data_loaders/__init__.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from dataclasses import dataclass
+from enum import Enum
+from typing import Dict, List
+from nlp4web_codebase.ir.data_loaders.dm import Document, Query, QRel
class Split(str, Enum):
    """Dataset partitions; each member is also a ``str`` equal to its own name."""

    train = "train"
    dev = "dev"
    test = "test"
@dataclass
class IRDataset:
    """A retrieval dataset: a corpus, its queries and per-split relevance judgements.

    Attributes:
        corpus: All documents.
        queries: All queries, regardless of split.
        split2qrels: Relevance judgements grouped by split.
    """

    corpus: List[Document]
    queries: List[Query]
    split2qrels: Dict[Split, List[QRel]]

    def get_stats(self) -> Dict[str, int]:
        """Return the corpus size, the query count and the qrel count of each split.

        Keys have the form ``"|corpus|"``, ``"|queries|"`` and
        ``"|qrels-<split>|"`` (e.g. ``"|qrels-train|"``).
        """
        stats = {"|corpus|": len(self.corpus), "|queries|": len(self.queries)}
        for split, qrels in self.split2qrels.items():
            # Format the member's value explicitly: from Python 3.11 on, an
            # f-string renders a ``str``-mixin Enum member as "Split.train"
            # instead of "train", which would change the key names.
            split_name = split.value if isinstance(split, Enum) else split
            stats[f"|qrels-{split_name}|"] = len(qrels)
        return stats

    def get_qrels_dict(self, split: Split) -> Dict[str, Dict[str, int]]:
        """Return the qrels of ``split`` as ``{query_id: {collection_id: relevance}}``.

        Raises:
            KeyError: If ``split`` is not present in ``split2qrels``.
        """
        qrels_dict: Dict[str, Dict[str, int]] = {}
        for qrel in self.split2qrels[split]:
            qrels_dict.setdefault(qrel.query_id, {})[qrel.collection_id] = (
                qrel.relevance
            )
        return qrels_dict

    def get_split_queries(self, split: Split) -> List[Query]:
        """Return the queries that have at least one qrel in ``split``.

        The order of ``self.queries`` is preserved.

        Raises:
            KeyError: If ``split`` is not present in ``split2qrels``.
        """
        qids = {qrel.query_id for qrel in self.split2qrels[split]}
        return [query for query in self.queries if query.query_id in qids]

nlp4web_codebase/nlp4web_codebase/lr/data_loaders/dm.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from dataclasses import dataclass
+from typing import Optional
@dataclass
class Document:
    """A single corpus entry.

    Attributes:
        collection_id: Identifier of the document.
        text: Content of the document.
    """

    collection_id: str
    text: str
@dataclass
class Query:
    """A single query.

    Attributes:
        query_id: Identifier of the query.
        text: Content of the query.
    """

    query_id: str
    text: str
@dataclass
class QRel:
    """A relevance judgement linking one query to one document.

    Attributes:
        query_id: Identifier of the judged query.
        collection_id: Identifier of the judged document.
        relevance: Integer relevance label.
        answer: Optional answer string attached to the judgement; defaults to
            ``None``.
    """

    query_id: str
    collection_id: str
    relevance: int
    answer: Optional[str] = None

nlp4web_codebase/nlp4web_codebase/lr/data_loaders/sciq.py ADDED Viewed

	@@ -0,0 +1,86 @@

+from typing import Dict, List
+from nlp4web_codebase.ir.data_loaders import IRDataset, Split
+from nlp4web_codebase.ir.data_loaders.dm import Document, Query, QRel
+from datasets import load_dataset
+import joblib
@(joblib.Memory(".cache").cache)
def load_sciq(verbose: bool = False) -> IRDataset:
    """Load the ``allenai/sciq`` dataset and convert it into an ``IRDataset``.

    Every kept row yields one document (its ``support`` text), one query (its
    ``question``) and one qrel with relevance 1 linking the two; all three
    share the id ``"<split>-<row index>"`` (e.g. ``"train-0"``). Rows are
    dropped when the support is blank, when the support was already seen, or
    when the question was already seen.

    The result is cached on disk under ``.cache`` by joblib.

    Args:
        verbose: Print the raw number of rows of each split while loading.

    Returns:
        The assembled dataset with train/dev/test qrels.
    """
    train = load_dataset("allenai/sciq", split="train")
    validation = load_dataset("allenai/sciq", split="validation")
    test = load_dataset("allenai/sciq", split="test")
    data = {Split.train: train, Split.dev: validation, Split.test: test}

    # Each duplicated record is the same to each other:
    # NOTE(review): `+` on DataFrames adds element-wise on aligned indices; it
    # does not stack the rows (that would be `pd.concat`), and the asserts
    # below require the values of a group to be all distinct rather than all
    # identical. The check is kept unchanged because a corrected version may
    # fail on the real data -- confirm the intent before changing it.
    df = train.to_pandas() + validation.to_pandas() + test.to_pandas()
    for _, group in df.groupby("question"):
        assert len(set(group["support"].tolist())) == len(group)
        assert len(set(group["correct_answer"].tolist())) == len(group)

    # Build:
    corpus: List[Document] = []
    queries: List[Query] = []
    split2qrels: Dict[Split, List[QRel]] = {}
    question2id: Dict[str, str] = {}
    support2id: Dict[str, str] = {}
    for split, rows in data.items():
        # Use the plain value: from Python 3.11 on, an f-string renders a
        # ``str``-mixin Enum member as "Split.train" instead of "train", which
        # would silently change every generated id.
        split_name = split.value
        if verbose:
            print(f"|raw_{split_name}|", len(rows))
        split2qrels[split] = []
        for i, row in enumerate(rows):
            example_id = f"{split_name}-{i}"
            support: str = row["support"]
            if not support.strip():
                continue
            question = row["question"]
            if support in support2id:
                continue
            # The support is registered even if the row is dropped just below
            # because of a duplicated question.
            support2id[support] = example_id
            if question in question2id:
                continue
            question2id[question] = example_id

            corpus.append(Document(collection_id=example_id, text=support))
            queries.append(Query(query_id=example_id, text=question))
            split2qrels[split].append(
                QRel(
                    query_id=example_id,
                    collection_id=example_id,
                    relevance=1,
                    answer=row["correct_answer"],
                )
            )

    # Assembly and return:
    return IRDataset(corpus=corpus, queries=queries, split2qrels=split2qrels)
if __name__ == "__main__":
    # Manual smoke test: load the dataset, report the time taken and its stats.
    # Run with: python -m nlp4web_codebase.ir.data_loaders.sciq
    import time
    import ujson

    started_at = time.time()
    sciq = load_sciq(verbose=True)
    elapsed = time.time() - started_at
    print(f"Loading costs: {elapsed}s")
    stats = sciq.get_stats()
    print(ujson.dumps(stats, indent=4))
    # Output recorded from one run:
    # ________________________________________________________________________________
    # [Memory] Calling __main__--home-kwang-research-nlp4web-ir-exercise-nlp4web-nlp4web-ir-data_loaders-sciq.load_sciq...
    # load_sciq(verbose=True)
    # |raw_train| 11679
    # |raw_dev| 1000
    # |raw_test| 1000
    # ________________________________________________________load_sciq - 7.3s, 0.1min
    # Loading costs: 7.260092735290527s
    # {
    #     "|corpus|": 12160,
    #     "|queries|": 12160,
    #     "|qrels-train|": 10409,
    #     "|qrels-dev|": 875,
    #     "|qrels-test|": 876
    # }

nlp4web_codebase/nlp4web_codebase/lr/models/__init__.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from abc import ABC, abstractmethod
+from typing import Any, Dict, Type
class BaseRetriever(ABC):
    """Abstract interface of a retriever.

    Subclasses must provide ``index_class``, ``score`` and ``retrieve``;
    ``get_term_weights`` is optional and raises ``NotImplementedError`` unless
    overridden.
    """

    @property
    @abstractmethod
    def index_class(self) -> Type[Any]:
        """Return a class; presumably the index type this retriever works with."""

    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
        """Return a float mapping for ``query`` against the document ``cid``.

        Raises:
            NotImplementedError: Always, unless a subclass overrides it.
        """
        raise NotImplementedError

    @abstractmethod
    def score(self, query: str, cid: str) -> float:
        """Return the score of the document ``cid`` for ``query``."""

    @abstractmethod
    def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
        """Return a float mapping for ``query``; ``topk`` presumably caps its size."""

nlp4web_codebase/requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ .

nlp4web_codebase/setup.py ADDED Viewed

	@@ -0,0 +1,37 @@

"""Packaging script of the ``nlp4web-codebase`` distribution."""
from setuptools import setup, find_packages

# The README doubles as the long description of the package.
with open("README.md", "r", encoding="utf-8") as fh:
    readme = fh.read()

setup(
    name="nlp4web-codebase",
    version="0.0.0",
    author="Kexin Wang",
    author_email="kexin.wang.2049@gmail.com",
    description="Codebase of teaching materials for NLP4Web.",
    long_description=readme,
    long_description_content_type="text/markdown",
    # Fixed: the scheme was duplicated ("https://https://...").
    url="https://github.com/kwang2049/nlp4web-codebase",
    project_urls={
        "Bug Tracker": "https://github.com/kwang2049/nlp4web-codebase/issues",
    },
    # NOTE(review): the modules import from ``nlp4web_codebase.ir`` while this
    # commit adds them under ``nlp4web_codebase/lr/`` -- confirm the directory
    # name, otherwise the discovered package will not match the imports.
    packages=find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.10",
    install_requires=[
        "nltk==3.8.1",
        "numpy==1.26.4",
        "scipy==1.13.1",
        "pandas==2.2.2",
        "tqdm==4.66.5",
        "ujson==5.10.0",
        "joblib==1.4.2",
        "datasets==3.0.1",
        "pytrec_eval==0.5",
    ],
)

sample_date/README.md ADDED Viewed

	@@ -0,0 +1,19 @@

+This directory includes a few sample datasets to get you started.
+*   `california_housing_*.csv` is California housing data from the 1990 US
+    Census; more information is available at:
+    https://docs.google.com/document/d/e/2PACX-1vRhYtsvc5eOR2FWNCwaBiKL6suIOrxJig8LcSBbmCbyYsayia_DvPOOBlXZ4CAlQ5nlDD8kTaIDRwrN/pub
+*   `mnist_*.csv` is a small sample of the
+    [MNIST database](https://en.wikipedia.org/wiki/MNIST_database), which is
+    described at: http://yann.lecun.com/exdb/mnist/
+*   `anscombe.json` contains a copy of
+    [Anscombe's quartet](https://en.wikipedia.org/wiki/Anscombe%27s_quartet); it
+    was originally described in
+    Anscombe, F. J. (1973). 'Graphs in Statistical Analysis'. American
+    Statistician. 27 (1): 17-21. JSTOR 2682899.
+    and our copy was prepared by the
+    [vega_datasets library](https://github.com/altair-viz/vega_datasets/blob/4f67bdaad10f45e3549984e17e1b3088c731503d/vega_datasets/_data/anscombe.json).

sample_date/anscombe.json ADDED Viewed

	@@ -0,0 +1,49 @@

+[
+  {"Series":"I", "X":10.0, "Y":8.04},
+  {"Series":"I", "X":8.0, "Y":6.95},
+  {"Series":"I", "X":13.0, "Y":7.58},
+  {"Series":"I", "X":9.0, "Y":8.81},
+  {"Series":"I", "X":11.0, "Y":8.33},
+  {"Series":"I", "X":14.0, "Y":9.96},
+  {"Series":"I", "X":6.0, "Y":7.24},
+  {"Series":"I", "X":4.0, "Y":4.26},
+  {"Series":"I", "X":12.0, "Y":10.84},
+  {"Series":"I", "X":7.0, "Y":4.81},
+  {"Series":"I", "X":5.0, "Y":5.68},
+  {"Series":"II", "X":10.0, "Y":9.14},
+  {"Series":"II", "X":8.0, "Y":8.14},
+  {"Series":"II", "X":13.0, "Y":8.74},
+  {"Series":"II", "X":9.0, "Y":8.77},
+  {"Series":"II", "X":11.0, "Y":9.26},
+  {"Series":"II", "X":14.0, "Y":8.10},
+  {"Series":"II", "X":6.0, "Y":6.13},
+  {"Series":"II", "X":4.0, "Y":3.10},
+  {"Series":"II", "X":12.0, "Y":9.13},
+  {"Series":"II", "X":7.0, "Y":7.26},
+  {"Series":"II", "X":5.0, "Y":4.74},
+  {"Series":"III", "X":10.0, "Y":7.46},
+  {"Series":"III", "X":8.0, "Y":6.77},
+  {"Series":"III", "X":13.0, "Y":12.74},
+  {"Series":"III", "X":9.0, "Y":7.11},
+  {"Series":"III", "X":11.0, "Y":7.81},
+  {"Series":"III", "X":14.0, "Y":8.84},
+  {"Series":"III", "X":6.0, "Y":6.08},
+  {"Series":"III", "X":4.0, "Y":5.39},
+  {"Series":"III", "X":12.0, "Y":8.15},
+  {"Series":"III", "X":7.0, "Y":6.42},
+  {"Series":"III", "X":5.0, "Y":5.73},
+  {"Series":"IV", "X":8.0, "Y":6.58},
+  {"Series":"IV", "X":8.0, "Y":5.76},
+  {"Series":"IV", "X":8.0, "Y":7.71},
+  {"Series":"IV", "X":8.0, "Y":8.84},
+  {"Series":"IV", "X":8.0, "Y":8.47},
+  {"Series":"IV", "X":8.0, "Y":7.04},
+  {"Series":"IV", "X":8.0, "Y":5.25},
+  {"Series":"IV", "X":19.0, "Y":12.50},
+  {"Series":"IV", "X":8.0, "Y":5.56},
+  {"Series":"IV", "X":8.0, "Y":7.91},
+  {"Series":"IV", "X":8.0, "Y":6.89}
+]

sample_date/california_housing_test.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

sample_date/california_housing_train.csv ADDED Viewed

The diff for this file is too large to render. See raw diff