miki5799 committed
Commit 5270fb9
1 Parent(s): 54d1112

Extract Jupyter notebook and nlp4web-codebase contents to HF Space repo
app.py CHANGED
@@ -1,3 +1,321 @@
+ from __future__ import annotations  # fixed: moved up from mid-file; Python requires this before any other statement
+ from dataclasses import dataclass
+ import pickle
+ import os
+ from typing import Iterable, Callable, List, Dict, Optional, Type, TypeVar
+ from nlp4web_codebase.ir.data_loaders.dm import Document
+ from collections import Counter
+ import tqdm
+ import re
+ import nltk
+ nltk.download("stopwords", quiet=True)
+ from nltk.corpus import stopwords as nltk_stopwords
+
+ LANGUAGE = "english"
+ word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
+ stopwords = set(nltk_stopwords.words(LANGUAGE))
+
+
+ def word_splitting(text: str) -> List[str]:
+     return word_splitter(text.lower())
+
+
+ def lemmatization(words: List[str]) -> List[str]:
+     return words  # We ignore lemmatization here for simplicity
+
+
+ def simple_tokenize(text: str) -> List[str]:
+     words = word_splitting(text)
+     tokenized = list(filter(lambda w: w not in stopwords, words))
+     tokenized = lemmatization(tokenized)
+     return tokenized
+
+
+ T = TypeVar("T", bound="InvertedIndex")
+
+
+ @dataclass
+ class PostingList:
+     term: str  # The term
+     docid_postings: List[int]  # docid_postings[i] is the docid (int) of the i-th associated posting
+     tweight_postings: List[float]  # tweight_postings[i] is the term weight (float) of the i-th associated posting
+
+
+ @dataclass
+ class InvertedIndex:
+     posting_lists: List[PostingList]  # tid -> posting_list (fixed comment: indexed by term id, not docid)
+     vocab: Dict[str, int]  # term -> tid
+     cid2docid: Dict[str, int]  # collection_id -> docid
+     collection_ids: List[str]  # docid -> collection_id
+     doc_texts: Optional[List[str]] = None  # docid -> document text
+
+     def save(self, output_dir: str) -> None:
+         os.makedirs(output_dir, exist_ok=True)
+         with open(os.path.join(output_dir, "index.pkl"), "wb") as f:
+             pickle.dump(self, f)
+
+     @classmethod
+     def from_saved(cls: Type[T], saved_dir: str) -> T:
+         index = cls(
+             posting_lists=[], vocab={}, cid2docid={}, collection_ids=[], doc_texts=None
+         )
+         with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
+             index = pickle.load(f)
+         return index
+
+
+ # The output of the counting function:
+ @dataclass
+ class Counting:
+     posting_lists: List[PostingList]
+     vocab: Dict[str, int]
+     cid2docid: Dict[str, int]
+     collection_ids: List[str]
+     dfs: List[int]  # tid -> df
+     dls: List[int]  # docid -> doc length
+     avgdl: float
+     nterms: int
+     doc_texts: Optional[List[str]] = None
+
+
+ def run_counting(
+     documents: Iterable[Document],
+     tokenize_fn: Callable[[str], List[str]] = simple_tokenize,
+     store_raw: bool = True,  # store the document text in doc_texts
+     ndocs: Optional[int] = None,
+     show_progress_bar: bool = True,
+ ) -> Counting:
+     """Counting TFs, DFs, doc_lengths, etc."""
+     posting_lists: List[PostingList] = []
+     vocab: Dict[str, int] = {}
+     cid2docid: Dict[str, int] = {}
+     collection_ids: List[str] = []
+     dfs: List[int] = []  # tid -> df
+     dls: List[int] = []  # docid -> doc length
+     nterms: int = 0
+     doc_texts: Optional[List[str]] = []
+     for doc in tqdm.tqdm(
+         documents,
+         desc="Counting",
+         total=ndocs,
+         disable=not show_progress_bar,
+     ):
+         if doc.collection_id in cid2docid:
+             continue
+         collection_ids.append(doc.collection_id)
+         docid = cid2docid.setdefault(doc.collection_id, len(cid2docid))
+         toks = tokenize_fn(doc.text)
+         tok2tf = Counter(toks)
+         dls.append(sum(tok2tf.values()))
+         for tok, tf in tok2tf.items():
+             nterms += tf
+             tid = vocab.get(tok, None)
+             if tid is None:
+                 posting_lists.append(
+                     PostingList(term=tok, docid_postings=[], tweight_postings=[])
+                 )
+                 tid = vocab.setdefault(tok, len(vocab))
+             posting_lists[tid].docid_postings.append(docid)
+             posting_lists[tid].tweight_postings.append(tf)
+             if tid < len(dfs):
+                 dfs[tid] += 1
+             else:
+                 dfs.append(1)  # fixed: the first document containing a term counts toward its df (was 0)
+         if store_raw:
+             doc_texts.append(doc.text)
+         else:
+             doc_texts = None
+     return Counting(
+         posting_lists=posting_lists,
+         vocab=vocab,
+         cid2docid=cid2docid,
+         collection_ids=collection_ids,
+         dfs=dfs,
+         dls=dls,
+         avgdl=sum(dls) / len(dls),
+         nterms=nterms,
+         doc_texts=doc_texts,
+     )
+
+
+ from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
+ sciq = load_sciq()
+ counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
+
+ # Imports from the second notebook cell (the `from __future__` line was moved to the top of the file):
+ from dataclasses import asdict, dataclass
+ import math
+ import os
+ from typing import Iterable, List, Optional, Type
+ import tqdm
+ from nlp4web_codebase.ir.data_loaders.dm import Document
+
+
+ @dataclass
+ class BM25Index(InvertedIndex):
+
+     @staticmethod
+     def tokenize(text: str) -> List[str]:
+         return simple_tokenize(text)
+
+     @staticmethod
+     def cache_term_weights(
+         posting_lists: List[PostingList],
+         total_docs: int,
+         avgdl: float,
+         dfs: List[int],
+         dls: List[int],
+         k1: float,
+         b: float,
+     ) -> None:
+         """Compute BM25 term weights and cache them in the posting lists."""
+         N = total_docs
+         for tid, posting_list in enumerate(
+             tqdm.tqdm(posting_lists, desc="Regularizing TFs")
+         ):
+             idf = BM25Index.calc_idf(df=dfs[tid], N=N)
+             for i in range(len(posting_list.docid_postings)):
+                 docid = posting_list.docid_postings[i]
+                 tf = posting_list.tweight_postings[i]
+                 dl = dls[docid]
+                 regularized_tf = BM25Index.calc_regularized_tf(
+                     tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b
+                 )
+                 posting_list.tweight_postings[i] = regularized_tf * idf
+
+     @staticmethod
+     def calc_regularized_tf(
+         tf: int, dl: float, avgdl: float, k1: float, b: float
+     ) -> float:
+         return tf / (tf + k1 * (1 - b + b * dl / avgdl))
+
+     @staticmethod
+     def calc_idf(df: int, N: int):
+         return math.log(1 + (N - df + 0.5) / (df + 0.5))
+
+     @classmethod
+     def build_from_documents(
+         cls: Type[BM25Index],
+         documents: Iterable[Document],
+         store_raw: bool = True,
+         output_dir: Optional[str] = None,
+         ndocs: Optional[int] = None,
+         show_progress_bar: bool = True,
+         k1: float = 0.9,
+         b: float = 0.4,
+     ) -> BM25Index:
+         # Counting TFs, DFs, doc_lengths, etc.:
+         counting = run_counting(
+             documents=documents,
+             tokenize_fn=BM25Index.tokenize,
+             store_raw=store_raw,
+             ndocs=ndocs,
+             show_progress_bar=show_progress_bar,
+         )
+
+         # Compute term weights and cache them:
+         posting_lists = counting.posting_lists
+         total_docs = len(counting.cid2docid)
+         BM25Index.cache_term_weights(
+             posting_lists=posting_lists,
+             total_docs=total_docs,
+             avgdl=counting.avgdl,
+             dfs=counting.dfs,
+             dls=counting.dls,
+             k1=k1,
+             b=b,
+         )
+
+         # Assemble (saving is left to the caller):
+         index = BM25Index(
+             posting_lists=posting_lists,
+             vocab=counting.vocab,
+             cid2docid=counting.cid2docid,
+             collection_ids=counting.collection_ids,
+             doc_texts=counting.doc_texts,
+         )
+         return index
+
+
+ bm25_index = BM25Index.build_from_documents(
+     documents=iter(sciq.corpus),
+     ndocs=12160,
+     show_progress_bar=True,
+ )
+ bm25_index.save("output/bm25_index")
+
+ from nlp4web_codebase.ir.models import BaseRetriever
+ from typing import Type
+ from abc import abstractmethod
+
+
+ class BaseInvertedIndexRetriever(BaseRetriever):
+
+     @property
+     @abstractmethod
+     def index_class(self) -> Type[InvertedIndex]:
+         pass
+
+     def __init__(self, index_dir: str) -> None:
+         self.index = self.index_class.from_saved(index_dir)
+
+     def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
+         toks = self.index.tokenize(query)
+         target_docid = self.index.cid2docid[cid]
+         term_weights = {}
+         for tok in toks:
+             if tok not in self.index.vocab:
+                 continue
+             tid = self.index.vocab[tok]
+             posting_list = self.index.posting_lists[tid]
+             for docid, tweight in zip(
+                 posting_list.docid_postings, posting_list.tweight_postings
+             ):
+                 if docid == target_docid:
+                     term_weights[tok] = tweight
+                     break
+         return term_weights
+
+     def score(self, query: str, cid: str) -> float:
+         return sum(self.get_term_weights(query=query, cid=cid).values())
+
+     def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
+         toks = self.index.tokenize(query)
+         docid2score: Dict[int, float] = {}
+         for tok in toks:
+             if tok not in self.index.vocab:
+                 continue
+             tid = self.index.vocab[tok]
+             posting_list = self.index.posting_lists[tid]
+             for docid, tweight in zip(
+                 posting_list.docid_postings, posting_list.tweight_postings
+             ):
+                 docid2score.setdefault(docid, 0)
+                 docid2score[docid] += tweight
+         docid2score = dict(
+             sorted(docid2score.items(), key=lambda pair: pair[1], reverse=True)[:topk]
+         )
+         return {
+             self.index.collection_ids[docid]: score
+             for docid, score in docid2score.items()
+         }
+
+
+ class BM25Retriever(BaseInvertedIndexRetriever):
+
+     @property
+     def index_class(self) -> Type[BM25Index]:
+         return BM25Index
+
+
+ bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
+ bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?")
+
+ import numpy as np  # fixed: numpy is used below but was never imported
+
+ # Tuning curves: X holds the candidate parameter values, Y the corresponding evaluation scores.
+ plots_b = {'X': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 'Y': [0.694980045351474, 0.8126195011337869, 0.821528798185941, 0.8218562358276644, 0.8222244897959182, 0.8195024943310657, 0.8182163265306123, 0.8174734693877551, 0.8139020408163266, 0.8116893424036281, 0.8083002267573697]}  # TODO: Replace
+ plots_k1 = {'X': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 'Y': [0.7345419501133786, 0.7668607709750567, 0.779508843537415, 0.7900947845804988, 0.8015931972789115, 0.8103560090702948, 0.812374149659864, 0.8156743764172336, 0.8194036281179138, 0.8222244897959182, 0.8221800453514739]}
+
+ best_b = plots_b["X"][np.argmax(plots_b["Y"])]
+ best_k1 = plots_k1["X"][np.argmax(plots_k1["Y"])]
+ bm25_index = BM25Index.build_from_documents(
+     documents=iter(sciq.corpus),
+     ndocs=12160,
+     show_progress_bar=True,
+     k1=best_k1,
+     b=best_b,
+ )
+
  import gradio as gr
  from typing import TypedDict
  import pandas as pd
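The committed hunk ends with the original Gradio imports, before any UI is defined. As orientation only, here is a minimal sketch (not part of this commit) of how the retriever above could be exposed through a Gradio interface; the search helper and the interface layout are assumptions:

import gradio as gr
import pandas as pd

def search(query: str) -> pd.DataFrame:
    # Hypothetical wiring: rank the SciQ corpus with the BM25 retriever built above.
    ranking = bm25_retriever.retrieve(query)
    return pd.DataFrame([{"collection_id": cid, "score": score} for cid, score in ranking.items()])

demo = gr.Interface(fn=search, inputs=gr.Textbox(label="Query"), outputs=gr.Dataframe(label="Top-10 results"))
demo.launch()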
nlp4web_codebase/__init__.py ADDED
File without changes
nlp4web_codebase/ir/__init__.py ADDED
File without changes
nlp4web_codebase/ir/analysis.py ADDED
@@ -0,0 +1,160 @@
+ import os
+ from typing import Dict, List, Optional, Protocol
+ import pandas as pd
+ import tqdm
+ import ujson
+ from nlp4web_codebase.ir.data_loaders import IRDataset
+
+
+ def round_dict(obj: Dict[str, float], ndigits: int = 4) -> Dict[str, float]:
+     return {k: round(v, ndigits=ndigits) for k, v in obj.items()}
+
+
+ def sort_dict(obj: Dict[str, float], reverse: bool = True) -> Dict[str, float]:
+     return dict(sorted(obj.items(), key=lambda pair: pair[1], reverse=reverse))
+
+
+ def save_ranking_results(
+     output_dir: str,
+     query_ids: List[str],
+     rankings: List[Dict[str, float]],
+     query_performances_lists: List[Dict[str, float]],
+     cid2tweights_lists: Optional[List[Dict[str, Dict[str, float]]]] = None,
+ ):
+     os.makedirs(output_dir, exist_ok=True)
+     output_path = os.path.join(output_dir, "ranking_results.jsonl")
+     rows = []
+     for i, (query_id, ranking, query_performances) in enumerate(
+         zip(query_ids, rankings, query_performances_lists)
+     ):
+         row = {
+             "query_id": query_id,
+             "ranking": round_dict(ranking),
+             "query_performances": round_dict(query_performances),
+             "cid2tweights": {},
+         }
+         if cid2tweights_lists is not None:
+             row["cid2tweights"] = {
+                 cid: round_dict(tws) for cid, tws in cid2tweights_lists[i].items()
+             }
+         rows.append(row)
+     pd.DataFrame(rows).to_json(
+         output_path,
+         orient="records",
+         lines=True,
+     )
+
+
+ class TermWeightingFunction(Protocol):
+     def __call__(self, query: str, cid: str) -> Dict[str, float]: ...
+
+
+ def compare(
+     dataset: IRDataset,
+     results_path1: str,
+     results_path2: str,
+     output_dir: str,
+     main_metric: str = "recip_rank",
+     system1: Optional[str] = None,
+     system2: Optional[str] = None,
+     term_weighting_fn1: Optional[TermWeightingFunction] = None,
+     term_weighting_fn2: Optional[TermWeightingFunction] = None,
+ ) -> None:
+     os.makedirs(output_dir, exist_ok=True)
+     df1 = pd.read_json(results_path1, orient="records", lines=True)
+     df2 = pd.read_json(results_path2, orient="records", lines=True)
+     assert len(df1) == len(df2)
+     all_qrels = {}
+     for split in dataset.split2qrels:
+         all_qrels.update(dataset.get_qrels_dict(split))
+     qid2query = {query.query_id: query for query in dataset.queries}
+     cid2doc = {doc.collection_id: doc for doc in dataset.corpus}
+     diff_col = f"{main_metric}:qp1-qp2"
+     merged = pd.merge(df1, df2, on="query_id", how="outer")
+     rows = []
+     for _, example in tqdm.tqdm(merged.iterrows(), desc="Comparing", total=len(merged)):
+         docs = {cid: cid2doc[cid].text for cid in dict(example["ranking_x"])}
+         docs.update({cid: cid2doc[cid].text for cid in dict(example["ranking_y"])})
+         query_id = example["query_id"]
+         row = {
+             "query_id": query_id,
+             "query": qid2query[query_id].text,
+             diff_col: example["query_performances_x"][main_metric]
+             - example["query_performances_y"][main_metric],
+             "ranking1": ujson.dumps(example["ranking_x"], indent=4),
+             "ranking2": ujson.dumps(example["ranking_y"], indent=4),
+             "docs": ujson.dumps(docs, indent=4),
+             "query_performances1": ujson.dumps(
+                 example["query_performances_x"], indent=4
+             ),
+             "query_performances2": ujson.dumps(
+                 example["query_performances_y"], indent=4
+             ),
+             "qrels": ujson.dumps(all_qrels[query_id], indent=4),
+         }
+         if term_weighting_fn1 is not None and term_weighting_fn2 is not None:
+             all_cids = set(example["ranking_x"]) | set(example["ranking_y"])
+             cid2tweights1 = {}
+             cid2tweights2 = {}
+             ranking1 = {}
+             ranking2 = {}
+             for cid in all_cids:
+                 tweights1 = term_weighting_fn1(query=qid2query[query_id].text, cid=cid)
+                 tweights2 = term_weighting_fn2(query=qid2query[query_id].text, cid=cid)
+                 ranking1[cid] = sum(tweights1.values())
+                 ranking2[cid] = sum(tweights2.values())
+                 cid2tweights1[cid] = tweights1
+                 cid2tweights2[cid] = tweights2
+             ranking1 = sort_dict(ranking1)
+             ranking2 = sort_dict(ranking2)
+             row["ranking1"] = ujson.dumps(ranking1, indent=4)
+             row["ranking2"] = ujson.dumps(ranking2, indent=4)
+             cid2tweights1 = {cid: cid2tweights1[cid] for cid in ranking1}
+             cid2tweights2 = {cid: cid2tweights2[cid] for cid in ranking2}
+             row["cid2tweights1"] = ujson.dumps(cid2tweights1, indent=4)
+             row["cid2tweights2"] = ujson.dumps(cid2tweights2, indent=4)
+         rows.append(row)
+     table = pd.DataFrame(rows).sort_values(by=diff_col, ascending=False)
+     output_path = os.path.join(output_dir, f"compare-{system1}_vs_{system2}.tsv")
+     table.to_csv(output_path, sep="\t", index=False)
+
+
+ # if __name__ == "__main__":
+ #     # python -m lecture2.bm25.analysis
+ #     from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
+ #     from lecture2.bm25.bm25_retriever import BM25Retriever
+ #     from lecture2.bm25.tfidf_retriever import TFIDFRetriever
+ #     import numpy as np
+
+ #     sciq = load_sciq()
+ #     system1 = "bm25"
+ #     system2 = "tfidf"
+ #     results_path1 = f"output/sciq-{system1}/results/ranking_results.jsonl"
+ #     results_path2 = f"output/sciq-{system2}/results/ranking_results.jsonl"
+ #     index_dir1 = f"output/sciq-{system1}"
+ #     index_dir2 = f"output/sciq-{system2}"
+ #     compare(
+ #         dataset=sciq,
+ #         results_path1=results_path1,
+ #         results_path2=results_path2,
+ #         output_dir=f"output/sciq-{system1}_vs_{system2}",
+ #         system1=system1,
+ #         system2=system2,
+ #         term_weighting_fn1=BM25Retriever(index_dir1).get_term_weights,
+ #         term_weighting_fn2=TFIDFRetriever(index_dir2).get_term_weights,
+ #     )
+
+ #     # bias on #shared_terms of TFIDF:
+ #     df1 = pd.read_json(results_path1, orient="records", lines=True)
+ #     df2 = pd.read_json(results_path2, orient="records", lines=True)
+ #     merged = pd.merge(df1, df2, on="query_id", how="outer")
+ #     nterms1 = []
+ #     nterms2 = []
+ #     for _, row in merged.iterrows():
+ #         nterms1.append(len(list(dict(row["cid2tweights_x"]).values())[0]))
+ #         nterms2.append(len(list(dict(row["cid2tweights_y"]).values())[0]))
+ #     percentiles = (5, 25, 50, 75, 95)
+ #     print(system1, np.percentile(nterms1, percentiles), np.mean(nterms1).round(2))
+ #     print(system2, np.percentile(nterms2, percentiles), np.mean(nterms2).round(2))
+ #     # bm25 [ 3. 4. 5. 7. 11.] 5.64
+ #     # tfidf [1. 2. 3. 5. 9.] 3.58
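For reference, a hypothetical toy call (not in this commit) illustrating the shapes save_ranking_results expects; all ids, scores, and metric values below are made up:

save_ranking_results(
    output_dir="output/demo",
    query_ids=["q1"],
    rankings=[{"d1": 2.31, "d2": 1.07}],             # per query: cid -> score
    query_performances_lists=[{"recip_rank": 1.0}],  # per query: metric -> value
    cid2tweights_lists=[{"d1": {"immune": 1.2}}],    # optional, per query: cid -> term -> weight
)
# Writes output/demo/ranking_results.jsonl with one JSON row per query.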
nlp4web_codebase/ir/data_loaders/__init__.py ADDED
@@ -0,0 +1,35 @@
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Dict, List
+ from nlp4web_codebase.ir.data_loaders.dm import Document, Query, QRel
+
+
+ class Split(str, Enum):
+     train = "train"
+     dev = "dev"
+     test = "test"
+
+
+ @dataclass
+ class IRDataset:
+     corpus: List[Document]
+     queries: List[Query]
+     split2qrels: Dict[Split, List[QRel]]
+
+     def get_stats(self) -> Dict[str, int]:
+         stats = {"|corpus|": len(self.corpus), "|queries|": len(self.queries)}
+         for split, qrels in self.split2qrels.items():
+             stats[f"|qrels-{split}|"] = len(qrels)
+         return stats
+
+     def get_qrels_dict(self, split: Split) -> Dict[str, Dict[str, int]]:
+         qrels_dict = {}
+         for qrel in self.split2qrels[split]:
+             qrels_dict.setdefault(qrel.query_id, {})
+             qrels_dict[qrel.query_id][qrel.collection_id] = qrel.relevance
+         return qrels_dict
+
+     def get_split_queries(self, split: Split) -> List[Query]:
+         qrels = self.split2qrels[split]
+         qids = {qrel.query_id for qrel in qrels}
+         return list(filter(lambda query: query.query_id in qids, self.queries))
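A small usage sketch (not part of this commit), assuming the SciQ loader defined in sciq.py below; the stats printed are the ones recorded in that file:

from nlp4web_codebase.ir.data_loaders import Split
from nlp4web_codebase.ir.data_loaders.sciq import load_sciq

dataset = load_sciq()
print(dataset.get_stats())                         # e.g. {"|corpus|": 12160, "|queries|": 12160, ...}
dev_qrels = dataset.get_qrels_dict(Split.dev)      # query_id -> {collection_id: relevance}
dev_queries = dataset.get_split_queries(Split.dev)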
nlp4web_codebase/ir/data_loaders/dm.py ADDED
@@ -0,0 +1,22 @@
+ from dataclasses import dataclass
+ from typing import Optional
+
+
+ @dataclass
+ class Document:
+     collection_id: str
+     text: str
+
+
+ @dataclass
+ class Query:
+     query_id: str
+     text: str
+
+
+ @dataclass
+ class QRel:
+     query_id: str
+     collection_id: str
+     relevance: int
+     answer: Optional[str] = None
nlp4web_codebase/ir/data_loaders/sciq.py ADDED
@@ -0,0 +1,86 @@
+ from typing import Dict, List
+ from nlp4web_codebase.ir.data_loaders import IRDataset, Split
+ from nlp4web_codebase.ir.data_loaders.dm import Document, Query, QRel
+ from datasets import load_dataset
+ import joblib
+ import pandas as pd  # fixed: needed for pd.concat below
+
+
+ @(joblib.Memory(".cache").cache)
+ def load_sciq(verbose: bool = False) -> IRDataset:
+     train = load_dataset("allenai/sciq", split="train")
+     validation = load_dataset("allenai/sciq", split="validation")
+     test = load_dataset("allenai/sciq", split="test")
+     data = {Split.train: train, Split.dev: validation, Split.test: test}
+
+     # Each duplicated record is the same as the others:
+     # (fixed: `+` adds DataFrames element-wise; concatenation is what is meant here,
+     # and identical duplicates imply exactly one distinct value per group)
+     df = pd.concat([train.to_pandas(), validation.to_pandas(), test.to_pandas()])
+     for question, group in df.groupby("question"):
+         assert len(set(group["support"].tolist())) == 1
+         assert len(set(group["correct_answer"].tolist())) == 1
+
+     # Build:
+     corpus = []
+     queries = []
+     split2qrels: Dict[Split, List[QRel]] = {}
+     question2id = {}
+     support2id = {}
+     for split, rows in data.items():
+         if verbose:
+             print(f"|raw_{split}|", len(rows))
+         split2qrels[split] = []
+         for i, row in enumerate(rows):
+             example_id = f"{split}-{i}"
+             support: str = row["support"]
+             if len(support.strip()) == 0:
+                 continue
+             question = row["question"]
+             if support in support2id:
+                 continue
+             else:
+                 support2id[support] = example_id
+             if question in question2id:
+                 continue
+             else:
+                 question2id[question] = example_id
+             doc = {"collection_id": example_id, "text": support}
+             query = {"query_id": example_id, "text": row["question"]}
+             qrel = {
+                 "query_id": example_id,
+                 "collection_id": example_id,
+                 "relevance": 1,
+                 "answer": row["correct_answer"],
+             }
+             corpus.append(Document(**doc))
+             queries.append(Query(**query))
+             split2qrels[split].append(QRel(**qrel))
+
+     # Assemble and return:
+     return IRDataset(corpus=corpus, queries=queries, split2qrels=split2qrels)
+
+
+ if __name__ == "__main__":
+     # python -m nlp4web_codebase.ir.data_loaders.sciq
+     import ujson
+     import time
+
+     start = time.time()
+     dataset = load_sciq(verbose=True)
+     print(f"Loading costs: {time.time() - start}s")
+     print(ujson.dumps(dataset.get_stats(), indent=4))
+     # ________________________________________________________________________________
+     # [Memory] Calling __main__--home-kwang-research-nlp4web-ir-exercise-nlp4web-nlp4web-ir-data_loaders-sciq.load_sciq...
+     # load_sciq(verbose=True)
+     # |raw_train| 11679
+     # |raw_dev| 1000
+     # |raw_test| 1000
+     # ________________________________________________________load_sciq - 7.3s, 0.1min
+     # Loading costs: 7.260092735290527s
+     # {
+     #     "|corpus|": 12160,
+     #     "|queries|": 12160,
+     #     "|qrels-train|": 10409,
+     #     "|qrels-dev|": 875,
+     #     "|qrels-test|": 876
+     # }
nlp4web_codebase/ir/models/__init__.py ADDED
@@ -0,0 +1,21 @@
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, Type
+
+
+ class BaseRetriever(ABC):
+
+     @property
+     @abstractmethod
+     def index_class(self) -> Type[Any]:
+         pass
+
+     def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
+         raise NotImplementedError
+
+     @abstractmethod
+     def score(self, query: str, cid: str) -> float:
+         pass
+
+     @abstractmethod
+     def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
+         pass
requirements.txt ADDED
@@ -0,0 +1,389 @@
+ absl-py==1.4.0
+ alabaster==0.7.13
+ anyio==4.4.0
+ appnope==0.1.4
+ argcomplete==3.2.3
+ argon2-cffi==23.1.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.3.0
+ asttokens==2.4.1
+ async-lru==2.0.4
+ attrs==23.1.0
+ Babel==2.12.1
+ beautifulsoup4==4.12.3
+ black==23.1.0
+ blacken-docs==1.13.0
+ bleach==6.1.0
+ cachetools==5.3.1
+ certifi==2023.5.7
+ cffi==1.15.1
+ cfgv==3.3.1
+ charset-normalizer==3.2.0
+ click==8.1.5
+ cloudpickle==2.2.1
+ coloredlogs==15.0.1
+ comm==0.2.2
+ contourpy==1.1.0
+ coverage==7.2.7
+ cryptography==41.0.2
+ cycler==0.11.0
+ dataclasses==0.6
+ DateTime==5.5
+ debugpy==1.8.5
+ decorator==5.1.1
+ defusedxml==0.7.1
+ dill==0.3.8
+ distlib==0.3.7
+ docutils==0.18.1
+ eradicate==2.3.0
+ et-xmlfile==1.1.0
+ # Editable install with no version control (eta-utility==2.2.2b2.dev78+g7a5fece)
+ -e /Users/mikaelhailu/Documents/Office/ETA-Fabrik/eta-utility
+ exceptiongroup==1.1.2
+ executing==2.0.1
+ fastjsonschema==2.20.0
+ filelock==3.12.2
+ flake8==5.0.4
+ flake8-builtins==2.1.0
+ flake8-comprehensions==3.10.1
+ flake8-eradicate==1.4.0
+ flake8-mutable==1.2.0
+ flake8-plugin-utils==1.3.3
+ flake8-print==5.0.0
+ flake8-pytest-style==1.7.2
+ flake8-requirements==1.7.7
+ flake8-rst-docstrings==0.3.0
+ flatbuffers==23.5.26
+ FMPy==0.3.15
+ fonttools==4.41.0
+ fqdn==1.5.1
+ google-auth==2.22.0
+ google-auth-oauthlib==1.0.0
+ grpcio==1.56.0
+ gym @ git+https://github.com/rlberry-py/gym_fix_021@fd62b4bc15dfd5d8a9be42da54b234c5c47fc98b
+ h11==0.14.0
+ httpcore==1.0.5
+ httpx==0.27.0
+ humanfriendly==10.0
+ icalendar==6.0.1
+ identify==2.5.24
+ idna==3.4
+ imagesize==1.4.1
+ importlib-metadata==4.13.0
+ iniconfig==2.0.0
+ ipykernel==6.29.5
+ ipython==8.24.0
+ ipywidgets==8.1.3
+ isoduration==20.11.0
+ isort==5.12.0
+ jedi==0.19.1
+ Jinja2==3.1.2
+ joblib==1.4.2
+ json5==0.9.25
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2023.12.1
+ jupyter==1.0.0
+ jupyter-console==6.6.3
+ jupyter-events==0.10.0
+ jupyter-lsp==2.2.5
+ jupyter_client==8.6.2
+ jupyter_core==5.7.2
+ jupyter_server==2.14.2
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.2.4
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.27.3
+ jupyterlab_widgets==3.0.11
+ keyboard==0.13.5
+ kiwisolver==1.4.4
+ lark==1.1.6
+ lxml==4.9.3
+ Markdown==3.4.3
+ MarkupSafe==2.1.5
+ matplotlib==3.7.2
+ matplotlib-inline==0.1.7
+ mccabe==0.7.0
+ mistune==3.0.2
+ MouseInfo==0.1.3
+ mpmath==1.3.0
+ msgpack==1.0.5
+ mushroom-rl==1.10.1
+ mypy==1.0.0
+ mypy-extensions==1.0.0
+ nbclient==0.10.0
+ nbconvert==7.16.4
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx==3.1
+ nodeenv==1.8.0
+ notebook==7.2.1
+ notebook_shim==0.2.4
+ numpy==1.25.2
+ oauthlib==3.2.2
+ onnxruntime==1.15.1
+ opcua==0.98.13
+ opencv-python==4.10.0.84
+ openpyxl==3.1.2
+ overrides==7.7.0
+ packaging==23.1
+ pandas==2.0.3
+ pandocfilters==1.5.1
+ parso==0.8.4
+ pathspec==0.11.1
+ pbr==6.0.0
+ pep8-naming==0.13.3
+ pexpect==4.9.0
+ Pillow==10.0.0
+ pipx==1.4.3
+ platformdirs==3.9.1
+ pluggy==1.2.0
+ ply==3.11
+ pre-commit==3.3.3
+ prometheus_client==0.20.0
+ prompt-toolkit==3.0.43
+ protobuf==4.23.4
+ psutil==6.0.0
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pyasn1==0.5.0
+ pyasn1-modules==0.3.0
+ pycodestyle==2.9.1
+ pycparser==2.21
+ pyflakes==2.5.0
+ pygame==2.5.0
+ PyGetWindow==0.0.9
+ pyglet==2.0.8
+ Pygments==2.15.1
+ pyModbusTCP==0.2.0
+ PyMsgBox==1.0.9
+ pyobjc==9.2
+ pyobjc-core==9.2
+ pyobjc-framework-Accessibility==9.2
+ pyobjc-framework-Accounts==9.2
+ pyobjc-framework-AddressBook==9.2
+ pyobjc-framework-AdServices==9.2
+ pyobjc-framework-AdSupport==9.2
+ pyobjc-framework-AppleScriptKit==9.2
+ pyobjc-framework-AppleScriptObjC==9.2
+ pyobjc-framework-ApplicationServices==9.2
+ pyobjc-framework-AppTrackingTransparency==9.2
+ pyobjc-framework-AudioVideoBridging==9.2
+ pyobjc-framework-AuthenticationServices==9.2
+ pyobjc-framework-AutomaticAssessmentConfiguration==9.2
+ pyobjc-framework-Automator==9.2
+ pyobjc-framework-AVFoundation==9.2
+ pyobjc-framework-AVKit==9.2
+ pyobjc-framework-AVRouting==9.2
+ pyobjc-framework-BackgroundAssets==9.2
+ pyobjc-framework-BusinessChat==9.2
+ pyobjc-framework-CalendarStore==9.2
+ pyobjc-framework-CallKit==9.2
+ pyobjc-framework-CFNetwork==9.2
+ pyobjc-framework-ClassKit==9.2
+ pyobjc-framework-CloudKit==9.2
+ pyobjc-framework-Cocoa==9.2
+ pyobjc-framework-Collaboration==9.2
+ pyobjc-framework-ColorSync==9.2
+ pyobjc-framework-Contacts==9.2
+ pyobjc-framework-ContactsUI==9.2
+ pyobjc-framework-CoreAudio==9.2
+ pyobjc-framework-CoreAudioKit==9.2
+ pyobjc-framework-CoreBluetooth==9.2
+ pyobjc-framework-CoreData==9.2
+ pyobjc-framework-CoreHaptics==9.2
+ pyobjc-framework-CoreLocation==9.2
+ pyobjc-framework-CoreMedia==9.2
+ pyobjc-framework-CoreMediaIO==9.2
+ pyobjc-framework-CoreMIDI==9.2
+ pyobjc-framework-CoreML==9.2
+ pyobjc-framework-CoreMotion==9.2
+ pyobjc-framework-CoreServices==9.2
+ pyobjc-framework-CoreSpotlight==9.2
+ pyobjc-framework-CoreText==9.2
+ pyobjc-framework-CoreWLAN==9.2
+ pyobjc-framework-CryptoTokenKit==9.2
+ pyobjc-framework-DataDetection==9.2
+ pyobjc-framework-DeviceCheck==9.2
+ pyobjc-framework-DictionaryServices==9.2
+ pyobjc-framework-DiscRecording==9.2
+ pyobjc-framework-DiscRecordingUI==9.2
+ pyobjc-framework-DiskArbitration==9.2
+ pyobjc-framework-DVDPlayback==9.2
+ pyobjc-framework-EventKit==9.2
+ pyobjc-framework-ExceptionHandling==9.2
+ pyobjc-framework-ExecutionPolicy==9.2
+ pyobjc-framework-ExtensionKit==9.2
+ pyobjc-framework-ExternalAccessory==9.2
+ pyobjc-framework-FileProvider==9.2
+ pyobjc-framework-FileProviderUI==9.2
+ pyobjc-framework-FinderSync==9.2
+ pyobjc-framework-FSEvents==9.2
+ pyobjc-framework-GameCenter==9.2
+ pyobjc-framework-GameController==9.2
+ pyobjc-framework-GameKit==9.2
+ pyobjc-framework-GameplayKit==9.2
+ pyobjc-framework-HealthKit==9.2
+ pyobjc-framework-ImageCaptureCore==9.2
+ pyobjc-framework-IMServicePlugIn==9.2
+ pyobjc-framework-InputMethodKit==9.2
+ pyobjc-framework-InstallerPlugins==9.2
+ pyobjc-framework-InstantMessage==9.2
+ pyobjc-framework-Intents==9.2
+ pyobjc-framework-IntentsUI==9.2
+ pyobjc-framework-IOBluetooth==9.2
+ pyobjc-framework-IOBluetoothUI==9.2
+ pyobjc-framework-IOSurface==9.2
+ pyobjc-framework-iTunesLibrary==9.2
+ pyobjc-framework-KernelManagement==9.2
+ pyobjc-framework-LatentSemanticMapping==9.2
+ pyobjc-framework-LaunchServices==9.2
+ pyobjc-framework-libdispatch==9.2
+ pyobjc-framework-libxpc==9.2
+ pyobjc-framework-LinkPresentation==9.2
+ pyobjc-framework-LocalAuthentication==9.2
+ pyobjc-framework-LocalAuthenticationEmbeddedUI==9.2
+ pyobjc-framework-MailKit==9.2
+ pyobjc-framework-MapKit==9.2
+ pyobjc-framework-MediaAccessibility==9.2
+ pyobjc-framework-MediaLibrary==9.2
+ pyobjc-framework-MediaPlayer==9.2
+ pyobjc-framework-MediaToolbox==9.2
+ pyobjc-framework-Metal==9.2
+ pyobjc-framework-MetalFX==9.2
+ pyobjc-framework-MetalKit==9.2
+ pyobjc-framework-MetalPerformanceShaders==9.2
+ pyobjc-framework-MetalPerformanceShadersGraph==9.2
+ pyobjc-framework-MetricKit==9.2
+ pyobjc-framework-MLCompute==9.2
+ pyobjc-framework-ModelIO==9.2
+ pyobjc-framework-MultipeerConnectivity==9.2
+ pyobjc-framework-NaturalLanguage==9.2
+ pyobjc-framework-NetFS==9.2
+ pyobjc-framework-Network==9.2
+ pyobjc-framework-NetworkExtension==9.2
+ pyobjc-framework-NotificationCenter==9.2
+ pyobjc-framework-OpenDirectory==9.2
+ pyobjc-framework-OSAKit==9.2
+ pyobjc-framework-OSLog==9.2
+ pyobjc-framework-PassKit==9.2
+ pyobjc-framework-PencilKit==9.2
+ pyobjc-framework-PHASE==9.2
+ pyobjc-framework-Photos==9.2
+ pyobjc-framework-PhotosUI==9.2
+ pyobjc-framework-PreferencePanes==9.2
+ pyobjc-framework-PushKit==9.2
+ pyobjc-framework-Quartz==9.2
+ pyobjc-framework-QuickLookThumbnailing==9.2
+ pyobjc-framework-ReplayKit==9.2
+ pyobjc-framework-SafariServices==9.2
+ pyobjc-framework-SafetyKit==9.2
+ pyobjc-framework-SceneKit==9.2
+ pyobjc-framework-ScreenCaptureKit==9.2
+ pyobjc-framework-ScreenSaver==9.2
+ pyobjc-framework-ScreenTime==9.2
+ pyobjc-framework-ScriptingBridge==9.2
+ pyobjc-framework-SearchKit==9.2
+ pyobjc-framework-Security==9.2
+ pyobjc-framework-SecurityFoundation==9.2
+ pyobjc-framework-SecurityInterface==9.2
+ pyobjc-framework-ServiceManagement==9.2
+ pyobjc-framework-SharedWithYou==9.2
+ pyobjc-framework-SharedWithYouCore==9.2
+ pyobjc-framework-ShazamKit==9.2
+ pyobjc-framework-Social==9.2
+ pyobjc-framework-SoundAnalysis==9.2
+ pyobjc-framework-Speech==9.2
+ pyobjc-framework-SpriteKit==9.2
+ pyobjc-framework-StoreKit==9.2
+ pyobjc-framework-SyncServices==9.2
+ pyobjc-framework-SystemConfiguration==9.2
+ pyobjc-framework-SystemExtensions==9.2
+ pyobjc-framework-ThreadNetwork==9.2
+ pyobjc-framework-UniformTypeIdentifiers==9.2
+ pyobjc-framework-UserNotifications==9.2
+ pyobjc-framework-UserNotificationsUI==9.2
+ pyobjc-framework-VideoSubscriberAccount==9.2
+ pyobjc-framework-VideoToolbox==9.2
+ pyobjc-framework-Virtualization==9.2
+ pyobjc-framework-Vision==9.2
+ pyobjc-framework-WebKit==9.2
+ Pyomo==6.6.1
+ pyparsing==3.0.9
+ pyproject-flake8==5.0.4
+ PyRect==0.2.0
+ PyScreeze==0.1.30
+ pytest==7.4.0
+ pytest-cov==4.1.0
+ python-dateutil==2.9.0.post0
+ python-json-logger==2.0.7
+ pytweening==1.2.0
+ pytz==2023.3
+ pyupgrade==3.3.1
+ PyYAML==6.0.2
+ pyzmq==26.1.0
+ qiskit==1.1.1
+ qiskit-aer==0.14.2
+ qtconsole==5.5.2
+ QtPy==2.4.1
+ referencing==0.35.1
+ requests==2.31.0
+ requests-oauthlib==1.3.1
+ restructuredtext-lint==1.4.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rpds-py==0.20.0
+ rsa==4.9
+ rubicon-objc==0.4.9
+ rustworkx==0.15.1
+ scikit-learn==1.5.1
+ scipy==1.14.0
+ Send2Trash==1.8.3
+ six==1.16.0
+ sniffio==1.3.1
+ snowballstemmer==2.2.0
+ soupsieve==2.5
+ Sphinx==6.2.1
+ sphinx-rtd-theme==1.2.2
+ sphinxcontrib-applehelp==1.0.4
+ sphinxcontrib-devhelp==1.0.2
+ sphinxcontrib-htmlhelp==2.0.1
+ sphinxcontrib-jquery==4.1
+ sphinxcontrib-jsmath==1.0.1
+ sphinxcontrib-qthelp==1.0.3
+ sphinxcontrib-serializinghtml==1.1.5
+ stable-baselines3==1.8.0
+ stack-data==0.6.3
+ stevedore==5.2.0
+ symengine==0.11.0
+ sympy==1.12
+ TatSu==5.8.3
+ tensorboard==2.13.0
+ tensorboard-data-server==0.7.1
+ terminado==0.18.1
+ threadpoolctl==3.5.0
+ tinycss2==1.3.0
+ tokenize-rt==5.1.0
+ tomli==2.0.1
+ torch==2.0.1
+ tornado==6.4.1
+ tqdm==4.66.5
+ traitlets==5.14.3
+ types-python-dateutil==2.8.19.13
+ types-requests==2.31.0.1
+ types-urllib3==1.26.25.13
+ typing_extensions==4.11.0
+ tzdata==2024.2
+ uri-template==1.3.0
+ urllib3==1.26.16
+ userpath==1.9.2
+ virtualenv==20.24.0
+ wcwidth==0.2.13
+ webcolors==24.6.0
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ Werkzeug==2.3.6
+ widgetsnbextension==4.0.11
+ xlrd==2.0.1
+ zipp==3.16.2
+ zope.interface==7.1.0