Spaces:

BanUrsus
/

BM25-search-engine-index-on-SciQ

Running

App Files Files Community

BM25-search-engine-index-on-SciQ / app.py

BanUrsus

Update app.py

0cf3895 verified 18 days ago

raw

history blame contribute delete

11.7 kB

	from __future__ import annotations

	from nlp4web_codebase.ir.data_loaders.dm import Document
	from nlp4web_codebase.ir.models import BaseRetriever

	from scipy.sparse._csc import csc_matrix
	import numpy as np
	import gradio as gr

	from typing import TypedDict, Type
	from dataclasses import dataclass

	from dataclasses import dataclass
	import pickle
	import os
	from typing import Iterable, Callable, List, Dict, Optional, Type, TypeVar
	from abc import abstractmethod

	from collections import Counter
	import tqdm
	import re
	import nltk
	nltk.download("stopwords", quiet=True)
	from nltk.corpus import stopwords as nltk_stopwords

	LANGUAGE = "english"
	word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
	stopwords = set(nltk_stopwords.words(LANGUAGE))


	def word_splitting(text: str) -> List[str]:
	return word_splitter(text.lower())

	def lemmatization(words: List[str]) -> List[str]:
	return words # We ignore lemmatization here for simplicity

	def simple_tokenize(text: str) -> List[str]:
	words = word_splitting(text)
	tokenized = list(filter(lambda w: w not in stopwords, words))
	tokenized = lemmatization(tokenized)
	return tokenized

	T = TypeVar("T", bound="InvertedIndex")

	@dataclass
	class PostingList:
	term: str # The term
	docid_postings: List[int] # docid_postings[i] means the docid (int) of the i-th associated posting
	tweight_postings: List[float] # tweight_postings[i] means the term weight (float) of the i-th associated posting


	@dataclass
	class InvertedIndex:
	posting_lists: List[PostingList] # docid -> posting_list
	vocab: Dict[str, int]
	cid2docid: Dict[str, int] # collection_id -> docid
	collection_ids: List[str] # docid -> collection_id
	doc_texts: Optional[List[str]] = None # docid -> document text

	def save(self, output_dir: str) -> None:
	os.makedirs(output_dir, exist_ok=True)
	with open(os.path.join(output_dir, "index.pkl"), "wb") as f:
	pickle.dump(self, f)

	@classmethod
	def from_saved(cls: Type[T], saved_dir: str) -> T:
	index = cls(
	posting_lists=[], vocab={}, cid2docid={}, collection_ids=[], doc_texts=None
	)
	with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
	index = pickle.load(f)
	return index


	# The output of the counting function:
	@dataclass
	class Counting:
	posting_lists: List[PostingList]
	vocab: Dict[str, int]
	cid2docid: Dict[str, int]
	collection_ids: List[str]
	dfs: List[int] # tid -> df
	dls: List[int] # docid -> doc length
	avgdl: float
	nterms: int
	doc_texts: Optional[List[str]] = None

	def run_counting(
	documents: Iterable[Document],
	tokenize_fn: Callable[[str], List[str]] = simple_tokenize,
	store_raw: bool = True, # store the document text in doc_texts
	ndocs: Optional[int] = None,
	show_progress_bar: bool = True,
	) -> Counting:
	"""Counting TFs, DFs, doc_lengths, etc."""
	posting_lists: List[PostingList] = []
	vocab: Dict[str, int] = {}
	cid2docid: Dict[str, int] = {}
	collection_ids: List[str] = []
	dfs: List[int] = [] # tid -> df
	dls: List[int] = [] # docid -> doc length
	nterms: int = 0
	doc_texts: Optional[List[str]] = []
	for doc in tqdm.tqdm(
	documents,
	desc="Counting",
	total=ndocs,
	disable=not show_progress_bar,
	):
	if doc.collection_id in cid2docid:
	continue
	collection_ids.append(doc.collection_id)
	docid = cid2docid.setdefault(doc.collection_id, len(cid2docid))
	toks = tokenize_fn(doc.text)
	tok2tf = Counter(toks)
	dls.append(sum(tok2tf.values()))
	for tok, tf in tok2tf.items():
	nterms += tf
	tid = vocab.get(tok, None)
	if tid is None:
	posting_lists.append(
	PostingList(term=tok, docid_postings=[], tweight_postings=[])
	)
	tid = vocab.setdefault(tok, len(vocab))
	posting_lists[tid].docid_postings.append(docid)
	posting_lists[tid].tweight_postings.append(tf)
	if tid < len(dfs):
	dfs[tid] += 1
	else:
	dfs.append(0)
	if store_raw:
	doc_texts.append(doc.text)
	else:
	doc_texts = None
	return Counting(
	posting_lists=posting_lists,
	vocab=vocab,
	cid2docid=cid2docid,
	collection_ids=collection_ids,
	dfs=dfs,
	dls=dls,
	avgdl=sum(dls) / len(dls),
	nterms=nterms,
	doc_texts=doc_texts,
	)

	from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
	sciq = load_sciq()
	counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))

	# CSC
	@dataclass
	class CSCInvertedIndex:
	posting_lists_matrix: csc_matrix # docid -> posting_list
	vocab: Dict[str, int]
	cid2docid: Dict[str, int] # collection_id -> docid
	collection_ids: List[str] # docid -> collection_id
	doc_texts: Optional[List[str]] = None # docid -> document text

	def save(self, output_dir: str) -> None:
	os.makedirs(output_dir, exist_ok=True)
	with open(os.path.join(output_dir, "index.pkl"), "wb") as f:
	pickle.dump(self, f)

	@classmethod
	def from_saved(cls: Type[T], saved_dir: str) -> T:
	index = cls(
	posting_lists_matrix=None, vocab={}, cid2docid={}, collection_ids=[], doc_texts=None
	)
	with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
	index = pickle.load(f)
	return index

	@dataclass
	class CSCBM25Index(CSCInvertedIndex):

	@staticmethod
	def tokenize(text: str) -> List[str]:
	return simple_tokenize(text)

	@staticmethod
	def cache_term_weights(
	posting_lists: List[PostingList],
	total_docs: int,
	avgdl: float,
	dfs: List[int],
	dls: List[int],
	k1: float,
	b: float,
	) -> csc_matrix:
	"""Compute term weights and caching"""

	data = []
	indices = []
	N = total_docs
	indptr = [0]

	for tid, posting_list in enumerate(posting_lists):
	idf = CSCBM25Index.calc_idf(df=dfs[tid], N=N)

	# Iterate through docid_postings and tweight_postings simultaneously
	for docid, tf in zip(posting_list.docid_postings, posting_list.tweight_postings):
	dl = dls[docid]
	regularized_tf = CSCBM25Index.calc_regularized_tf(
	tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b
	)

	bm25_weight = regularized_tf * idf
	if bm25_weight != 0:
	# Update the tweight_postings directly
	posting_list.tweight_postings[posting_list.docid_postings.index(docid)] = bm25_weight
	data.append(bm25_weight)
	indices.append(docid)


	indptr.append(len(data))

	shape = (N, len(posting_lists))

	data = np.array(data, np.float32)
	indices = np.array(indices)
	indptr = np.array(indptr)

	output_matrix = csc_matrix((data, indices, indptr), shape=shape)

	return output_matrix

	@staticmethod
	def calc_regularized_tf(
	tf: int, dl: float, avgdl: float, k1: float, b: float
	) -> float:
	return tf / (tf + k1 * (1 - b + b * dl / avgdl))

	@staticmethod
	def calc_idf(df: int, N: int):
	return math.log(1 + (N - df + 0.5) / (df + 0.5))

	@classmethod
	def build_from_documents(
	cls: Type[CSCBM25Index],
	documents: Iterable[Document],
	store_raw: bool = True,
	output_dir: Optional[str] = None,
	ndocs: Optional[int] = None,
	show_progress_bar: bool = True,
	k1: float = 0.9,
	b: float = 0.4,
	) -> CSCBM25Index:
	# Counting TFs, DFs, doc_lengths, etc.:
	counting = run_counting(
	documents=documents,
	tokenize_fn=CSCBM25Index.tokenize,
	store_raw=store_raw,
	ndocs=ndocs,
	show_progress_bar=show_progress_bar,
	)

	# Compute term weights and caching:
	posting_lists = counting.posting_lists
	total_docs = len(counting.cid2docid)
	posting_lists_matrix = CSCBM25Index.cache_term_weights(
	posting_lists=posting_lists,
	total_docs=total_docs,
	avgdl=counting.avgdl,
	dfs=counting.dfs,
	dls=counting.dls,
	k1=k1,
	b=b,
	)

	# Assembly and save:
	index = CSCBM25Index(
	posting_lists_matrix=posting_lists_matrix,
	vocab=counting.vocab,
	cid2docid=counting.cid2docid,
	collection_ids=counting.collection_ids,
	doc_texts=counting.doc_texts,
	)
	return index

	class BaseCSCInvertedIndexRetriever(BaseRetriever):

	@property
	@abstractmethod
	def index_class(self) -> Type[CSCInvertedIndex]:
	pass

	def __init__(self, index_dir: str) -> None:
	self.index = self.index_class.from_saved(index_dir)

	def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
	toks = self.index.tokenize(query)
	target_docid = self.index.cid2docid[cid]
	term_weights: Dict[str, float] = {}
	for tok in toks:
	if tok not in self.index.vocab:
	continue
	tid = self.index.vocab[tok]
	posting_list = self.index.posting_lists_matrix.getcol(tid)
	for docid, tweight in zip(
	posting_list.indices, posting_list.data
	):
	if docid == target_docid:
	term_weights[tok] = tweight
	break

	return term_weights

	def score(self, query: str, cid: str) -> float:
	return sum(self.get_term_weights(query=query, cid=cid).values())

	def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
	ranking:Dict[str, float] = {}

	toks = self.index.tokenize(query)
	docid2score: Dict(int, float) = {}
	for tok in toks:
	if tok not in self.index.vocab:
	continue
	tid = self.index.vocab[tok]
	posting_list = self.index.posting_lists_matrix.getcol(tid)
	for docid, tweight in zip(
	posting_list.indices, posting_list.data
	):
	docid2score[docid] = docid2score.get(docid, 0) + tweight

	ranking = {self.index.collection_ids[docid]: score for docid, score in docid2score.items()}
	ranking = sorted(ranking.items(), key=lambda x: x[1], reverse=True)[:topk]
	return ranking


	class CSCBM25Retriever(BaseCSCInvertedIndexRetriever):

	@property
	def index_class(self) -> Type[CSCBM25Index]:
	return CSCBM25Index

	class Hit(TypedDict):
	cid: str
	score: float
	text: str

	demo: Optional[gr.Interface] = None # Assign your gradio demo to this variable
	return_type = List[Hit]

	def search(query: str) -> List[Hit]:

	csc_bm25_retriever = CSCBM25Retriever(index_dir="output/csc_bm25_index")

	top_docs = csc_bm25_retriever.retrieve(query)

	dict_docs = {doc.collection_id: doc.text for doc in sciq.corpus}

	hits = [Hit(cid=cid, score=score, text=dict_docs[cid]) for cid, score in top_docs]

	return hits

	demo = gr.Interface(
	fn=search,
	inputs="textbox",
	outputs="textbox",
	live=True,
	title="BM25 search engine index on SciQ",
	description="Enter your query to see the search results.",
	)

	demo.launch()