Spaces:
Runtime error
Runtime error
File size: 2,291 Bytes
d6585f5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
#!/bin/bash
# Index the Mr. TyDi Korean collection with Pyserini.
# This section defines the input/output locations and the settings shared by
# the indexing steps that follow.

# path setting
CORPUS_DIR=/root/Corpus/mrtydi-korean/collection
CORPUS_PATH="${CORPUS_DIR}/docs.jsonl"
INDEX_DIR=indexes/mrtydi-korean
# Stop right away if the output directory cannot be created: the encoding
# jobs below are long-running and would have nowhere to write their results.
if ! mkdir -p -- "${INDEX_DIR}"; then
  echo "error: cannot create index directory: ${INDEX_DIR}" >&2
  exit 1
fi
# sparse indexing
# NOTE: abbr and NUM_THREADS are only referenced by the sparse-indexing
# commands, which are currently commented out; lang is not referenced by any
# command visible in this script.
lang=korean
abbr=ko
NUM_THREADS=16
# Sparse index, Anserini variant -- disabled.
# Reason recorded by the author: the `target` directory (which the command
# below expects to contain appassembler/bin/IndexCollection) was not found.
#echo "sparse (anserini version)"
#target/appassembler/bin/IndexCollection \
#-collection MrTyDiCollection \
#-input $CORPUS_DIR \
#-index $INDEX_DIR/sparse_anserini \
#-generator DefaultLuceneDocumentGenerator \
#-threads $NUM_THREADS -storePositions -storeDocvectors -storeRaw -language $abbr

# Sparse index, Pyserini variant -- skipped; only the notice below is printed.
printf '%s\n' 'sparse (pyserini version) ========================> SKIP ====================> '
# To re-enable, uncomment the following command:
#python -m pyserini.index.lucene \
#--collection JsonCollection \
#--input $CORPUS_DIR \
#--index $INDEX_DIR/sparse_pyserini \
#--generator DefaultLuceneDocumentGenerator \
#--language $abbr \
#--threads $NUM_THREADS \
#--storePositions --storeDocvectors --storeRaw
# dense indexing
echo "dense"
# Expose only CUDA device 1 to the encoder processes started below.
export CUDA_VISIBLE_DEVICES=1
ENCODER=castorini/mdpr-passage-nq

#######################################
# Encode the corpus with pyserini.encode and store the embeddings using the
# --to-faiss output option.
# Globals:   CORPUS_PATH (read) - JSONL corpus file to encode
#            INDEX_DIR   (read) - parent directory for the embeddings
#            ENCODER     (read) - encoder model name
# Arguments: $1 - value for --max-length; also used as the suffix of the
#                 output directory name (dense_maxlen<$1>)
#            $2 - value for --batch
# Returns:   exit status of the python process
#######################################
encode_dense() {
  local max_len=$1
  local batch_size=$2
  # "\n\n" is passed through as the literal characters backslash-n-backslash-n,
  # exactly as the original invocation did.
  python -m pyserini.encode \
    input --corpus "${CORPUS_PATH}" \
    --fields title text \
    --delimiter "\n\n" \
    output --embeddings "${INDEX_DIR}/dense_maxlen${max_len}" \
    --to-faiss \
    encoder --encoder "${ENCODER}" \
    --fields title text \
    --max-length "${max_len}" \
    --batch "${batch_size}" \
    --fp16
}

# Run 1: max length 512 with a batch size of 8.
BATCH_SIZE=8
MAXLEN=512
encode_dense "${MAXLEN}" "${BATCH_SIZE}"

# Run 2: max length 256 (marked "default" by the original author) with a
# batch size of 32.
BATCH_SIZE=32
MAXLEN=256 # default
encode_dense "${MAXLEN}" "${BATCH_SIZE}"