#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import shutil
import unittest
from typing import List

from pyserini.index.lucene import LuceneIndexer, IndexReader, JacksonObjectMapper
from pyserini.search.lucene import JLuceneSearcherResult, LuceneSearcher


class TestSearch(unittest.TestCase):
    def setUp(self):
        self.docs = []
        self.tmp_dir = 'temp_dir'

        # The current working directory depends on whether we're running inside an IDE or from the command line.
        curdir = os.getcwd()
        if curdir.endswith('tests'):
            self.test_file = '../tests/resources/simple_cacm_corpus.json'
        else:
            self.test_file = 'tests/resources/simple_cacm_corpus.json'
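
    # Index the sample CACM corpus one raw JSON document at a time, then check
    # the document count and a known query against expected results.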
    def test_indexer(self):
        indexer = LuceneIndexer(self.tmp_dir)
        with open(self.test_file) as f:
            for doc in f:
                indexer.add_doc_raw(doc)
        indexer.close()

        searcher = LuceneSearcher(self.tmp_dir)
        self.assertEqual(3, searcher.num_docs)

        hits = searcher.search('semantic networks')
        self.assertTrue(isinstance(hits, List))
        self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
        self.assertEqual(1, len(hits))
        self.assertEqual('CACM-2274', hits[0].docid)
        self.assertAlmostEqual(1.53650, hits[0].score, places=5)
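
    # Index the same corpus as a single batch, repeated under different ways of
    # initializing the indexer (default, explicit thread counts, raw args).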
    def test_indexer_batch1(self):
        batch = []
        with open(self.test_file) as f:
            for doc in f:
                batch.append(doc)

        # Test different ways to initialize the indexer: default settings.
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_batch_raw(batch)
        indexer.close()

        searcher = LuceneSearcher(self.tmp_dir)
        self.assertEqual(3, searcher.num_docs)

        hits = searcher.search('semantic networks')
        self.assertTrue(isinstance(hits, List))
        self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
        self.assertEqual(1, len(hits))
        self.assertEqual('CACM-2274', hits[0].docid)
        self.assertAlmostEqual(1.53650, hits[0].score, places=5)

        # Test different ways to initialize the indexer: two indexing threads.
        indexer = LuceneIndexer(self.tmp_dir, threads=2)
        indexer.add_batch_raw(batch)
        indexer.close()

        searcher = LuceneSearcher(self.tmp_dir)
        self.assertEqual(3, searcher.num_docs)

        hits = searcher.search('semantic networks')
        self.assertTrue(isinstance(hits, List))
        self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
        self.assertEqual(1, len(hits))
        self.assertEqual('CACM-2274', hits[0].docid)
        self.assertAlmostEqual(1.53650, hits[0].score, places=5)

        # Test different ways to initialize the indexer: four indexing threads.
        indexer = LuceneIndexer(self.tmp_dir, threads=4)
        indexer.add_batch_raw(batch)
        indexer.close()

        searcher = LuceneSearcher(self.tmp_dir)
        self.assertEqual(3, searcher.num_docs)

        hits = searcher.search('semantic networks')
        self.assertTrue(isinstance(hits, List))
        self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
        self.assertEqual(1, len(hits))
        self.assertEqual('CACM-2274', hits[0].docid)
        self.assertAlmostEqual(1.53650, hits[0].score, places=5)

        # Test different ways to initialize the indexer: raw command-line args.
        indexer = LuceneIndexer(args=['-index', self.tmp_dir, '-threads', '4'])
        indexer.add_batch_raw(batch)
        indexer.close()

        searcher = LuceneSearcher(self.tmp_dir)
        self.assertEqual(3, searcher.num_docs)

        hits = searcher.search('semantic networks')
        self.assertTrue(isinstance(hits, List))
        self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
        self.assertEqual(1, len(hits))
        self.assertEqual('CACM-2274', hits[0].docid)
        self.assertAlmostEqual(1.53650, hits[0].score, places=5)
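
    # Initialize the indexer from raw command-line args; the different expected
    # score presumably reflects that -pretokenized changes how contents are analyzed.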
    def test_indexer_with_args(self):
        indexer = LuceneIndexer(args=['-index', self.tmp_dir, '-pretokenized'])
        with open(self.test_file) as f:
            for doc in f:
                indexer.add_doc_raw(doc)
        indexer.close()

        searcher = LuceneSearcher(self.tmp_dir)
        self.assertEqual(3, searcher.num_docs)

        hits = searcher.search('semantic networks')
        self.assertTrue(isinstance(hits, List))
        self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
        self.assertEqual(1, len(hits))
        self.assertEqual('CACM-2274', hits[0].docid)
        self.assertAlmostEqual(0.62610, hits[0].score, places=5)
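
    # Build an index, then reopen it with append=True and add another document;
    # both documents should be present afterwards.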
    def test_indexer_append1(self):
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_doc_raw('{"id": "0", "contents": "Document 0"}')
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(1, stats['documents'])
        self.assertIsNotNone(reader.doc('0'))

        indexer = LuceneIndexer(self.tmp_dir, append=True)
        indexer.add_doc_raw('{"id": "1", "contents": "Document 1"}')
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNotNone(reader.doc('0'))
        self.assertIsNotNone(reader.doc('1'))
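
    # Edge cases for append semantics: appending to an index that doesn't exist,
    # overwriting when append is not set, then appending on top of the result.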
    def test_indexer_append2(self):
        # Make sure it's okay to append to an index that doesn't exist yet.
        indexer = LuceneIndexer(self.tmp_dir, append=True)
        indexer.add_doc_raw('{"id": "0", "contents": "Document 0"}')
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(1, stats['documents'])
        self.assertIsNotNone(reader.doc('0'))

        # Without append=True, the existing index is overwritten.
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_doc_raw('{"id": "1", "contents": "Document 1"}')
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(1, stats['documents'])
        self.assertIsNone(reader.doc('0'))
        self.assertIsNotNone(reader.doc('1'))

        # Now we're appending: the old and new documents should both be present.
        indexer = LuceneIndexer(self.tmp_dir, append=True)
        indexer.add_doc_raw('{"id": "x", "contents": "Document x"}')
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNone(reader.doc('0'))
        self.assertIsNotNone(reader.doc('1'))
        self.assertIsNotNone(reader.doc('x'))
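
    # The remaining tests exercise each supported input type, singly and in
    # batches: raw JSON strings, Python dicts, and Jackson JSON objects.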
    def test_indexer_type_raw(self):
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_doc_raw('{"id": "doc0", "contents": "document 0 contents"}')
        indexer.add_doc_raw('{"id": "doc1", "contents": "document 1 contents"}')
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNotNone(reader.doc('doc0'))
        self.assertIsNotNone(reader.doc('doc1'))
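
    # Raw JSON strings, added as a single batch.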
    def test_indexer_type_raw_batch(self):
        batch = ['{"id": "doc0", "contents": "document 0 contents"}',
                 '{"id": "doc1", "contents": "document 1 contents"}']
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_batch_raw(batch)
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNotNone(reader.doc('doc0'))
        self.assertIsNotNone(reader.doc('doc1'))
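
    # Python dicts, added one at a time.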
    def test_indexer_type_dict(self):
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_doc_dict({'id': 'doc0', 'contents': 'document 0 contents'})
        indexer.add_doc_dict({'id': 'doc1', 'contents': 'document 1 contents'})
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNotNone(reader.doc('doc0'))
        self.assertIsNotNone(reader.doc('doc1'))
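
    # Python dicts, added as a single batch.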
    def test_indexer_type_dict_batch(self):
        batch = [{'id': 'doc0', 'contents': 'document 0 contents'},
                 {'id': 'doc1', 'contents': 'document 1 contents'}]
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_batch_dict(batch)
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNotNone(reader.doc('doc0'))
        self.assertIsNotNone(reader.doc('doc1'))
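
    # Jackson ObjectNode instances, added one at a time.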
    def test_indexer_type_json(self):
        mapper = JacksonObjectMapper()
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_doc_json(mapper.createObjectNode().put('id', 'doc0').put('contents', 'document 0 contents'))
        indexer.add_doc_json(mapper.createObjectNode().put('id', 'doc1').put('contents', 'document 1 contents'))
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNotNone(reader.doc('doc0'))
        self.assertIsNotNone(reader.doc('doc1'))
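
    # Jackson ObjectNode instances, added as a single batch.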
    def test_indexer_type_json_batch(self):
        mapper = JacksonObjectMapper()
        batch = [mapper.createObjectNode().put('id', 'doc0').put('contents', 'document 0 contents'),
                 mapper.createObjectNode().put('id', 'doc1').put('contents', 'document 1 contents')]
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_batch_json(batch)
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNotNone(reader.doc('doc0'))
        self.assertIsNotNone(reader.doc('doc1'))
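
    # Clean up the temporary index directory after each test.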
    def tearDown(self):
        shutil.rmtree(self.tmp_dir)
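

# Standard unittest entry point, so the test suite can also be run directly as a script.
if __name__ == '__main__':
    unittest.main()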