NetsPresso_QA / run_indexing_mrtydi_sp_ds.sh
geonmin-kim's picture
Upload folder using huggingface_hub
d6585f5
#!/bin/bash
# Builds retrieval indexes for the mrtydi-korean corpus.
# All paths and tuning knobs used by the indexing steps are defined here.

# Path settings.
CORPUS_DIR=/root/Corpus/mrtydi-korean/collection
CORPUS_PATH="${CORPUS_DIR}/docs.jsonl"
# Relative path: the indexes are written below the current working directory.
INDEX_DIR=indexes/mrtydi-korean

# Create the output directory up front and stop right away if that fails,
# rather than discovering an unwritable location after a long encoding run.
mkdir -p -- "$INDEX_DIR" || exit 1

# Sparse indexing settings. Within this file they are referenced only by the
# commented-out sparse indexing commands (`lang` is not referenced at all).
lang=korean
abbr=ko
NUM_THREADS=16
# Sparse indexing, Anserini version -- disabled because the `target`
# directory (expected to hold appassembler/bin/IndexCollection) was not found.
# The command is kept here for reference:
#
#   echo "sparse (anserini version)"
#   target/appassembler/bin/IndexCollection \
#     -collection MrTyDiCollection \
#     -input $CORPUS_DIR \
#     -index $INDEX_DIR/sparse_anserini \
#     -generator DefaultLuceneDocumentGenerator \
#     -threads $NUM_THREADS -storePositions -storeDocvectors -storeRaw -language $abbr

# Sparse indexing, Pyserini version -- also skipped. Only the notice below is
# executed; the indexing command itself stays commented out.
printf '%s\n' "sparse (pyserini version) ========================> SKIP ====================> "
#   python -m pyserini.index.lucene \
#     --collection JsonCollection \
#     --input $CORPUS_DIR \
#     --index $INDEX_DIR/sparse_pyserini \
#     --generator DefaultLuceneDocumentGenerator \
#     --language $abbr \
#     --threads $NUM_THREADS \
#     --storePositions --storeDocvectors --storeRaw
# Dense indexing: encode the corpus with pyserini.encode, once per
# (max length, batch size) configuration. Each configuration writes to its own
# directory, $INDEX_DIR/dense_maxlen<MAXLEN>.
echo "dense"
export CUDA_VISIBLE_DEVICES=1
ENCODER=castorini/mdpr-passage-nq

#######################################
# Runs one pyserini.encode pass over the corpus.
# Globals:   CORPUS_PATH (read), INDEX_DIR (read), ENCODER (read)
# Arguments: $1 - maximum sequence length passed as --max-length
#            $2 - batch size passed as --batch
# Returns:   exit status of the python process
#######################################
encode_dense() {
  local max_len=$1
  local batch_size=$2

  # "\n\n" reaches python as the literal characters backslash-n backslash-n
  # (double quotes do not turn \n into a newline in the shell).
  # NOTE(review): pyserini is presumed to unescape this itself -- confirm.
  python -m pyserini.encode \
    input --corpus "$CORPUS_PATH" \
          --fields title text \
          --delimiter "\n\n" \
    output --embeddings "${INDEX_DIR}/dense_maxlen${max_len}" \
           --to-faiss \
    encoder --encoder "$ENCODER" \
            --fields title text \
            --max-length "$max_len" \
            --batch "$batch_size" \
            --fp16
}

# Both configurations are always attempted; the status of a failed run is
# remembered so that it is not masked by a later successful one.
dense_status=0

BATCH_SIZE=8
MAXLEN=512
encode_dense "$MAXLEN" "$BATCH_SIZE" || dense_status=$?

BATCH_SIZE=32
MAXLEN=256 # default
encode_dense "$MAXLEN" "$BATCH_SIZE" || dense_status=$?

# Surface a failure of either run to the caller.
if (( dense_status != 0 )); then
  echo "dense indexing failed (exit status ${dense_status})" >&2
  exit "$dense_status"
fi