Spaces:

sivan22
/

Ituria

Running

App Files Files Community

Ituria / tantivy_search_agent.py

sivan22

Upload tantivy_search_agent.py

f6c0c6c verified 10 days ago

raw

history blame contribute delete

6.9 kB

	from typing import List, Dict, Any, Optional
	from tantivy import Index
	import logging
	import os
	import re


	class TantivySearchAgent:
	    """Search helper around a Tantivy index.

	    The agent opens an index from disk, supplies the prompt text that tells an
	    LLM how to write Tantivy queries, runs those queries, and returns each hit
	    as a plain dictionary together with short context snippets ("highlights").
	    """

	    # Matches Tantivy query syntax that must not be treated as a search word:
	    # punctuation operators, the boolean/range keywords, and the +/- prefixes.
	    # Plain `|` alternation is required here; an escaped `\|` would match a
	    # literal pipe character and the expression would never strip anything.
	    _QUERY_SYNTAX_RE = re.compile(
	        r'[:"()\[\]{}^~*\\]|\b(?:AND|OR|NOT|TO|IN)\b|[-+]'
	    )
	    # Number of characters of context kept on each side of a highlighted match.
	    _CONTEXT_CHARS = 50
	    # Length of the leading excerpt used when no term matches the text.
	    _FALLBACK_CHARS = 100

	    def __init__(self, index_path: str):
	        """Open the Tantivy index stored at ``index_path``.

	        Args:
	            index_path: Location of the index, passed unchanged to ``Index.open``.

	        Raises:
	            Exception: Whatever ``Index.open`` raises; the error is logged and
	                re-raised unchanged.
	        """
	        self.index_path = index_path
	        self.logger = logging.getLogger(__name__)
	        try:
	            self.index = Index.open(index_path)
	            self.logger.info(f"Successfully opened Tantivy index at {index_path}")
	        except Exception as e:
	            self.logger.error(f"Failed to open Tantivy index: {e}")
	            raise

	    def get_query_instructions(self) -> str:
	        """Return the prompt text that teaches an LLM the Tantivy query syntax.

	        The text is returned verbatim (including its leading newline); callers
	        are expected to embed it in a larger prompt.
	        """
	        return """
	Instructions for generating a query:

	1. Boolean Operators:

	- AND: term1 AND term2 (both required)
	- OR: term1 OR term2 (either term)
	- Multiple words default to OR operation (cloud network = cloud OR network)
	- AND takes precedence over OR
	- Example: Shabath AND (walk OR go)

	2. Field-specific Terms:
	- Field-specific terms: field:term
	- Example: text:אדם AND reference:בראשית
	- available fields: text, reference, topics
	- text contains the text of the document
	- reference contains the citation of the document, e.g. בראשית, פרק א
	- topics contains the topics of the document. available topics includes: תנך, הלכה, מדרש, etc.

	3. Required/Excluded Terms:
	- Required (+): +term (must contain)
	- Excluded (-): -term (must not contain)
	- Example: +security cloud -deprecated
	- Equivalent to: security AND cloud AND NOT deprecated

	4. Phrase Search:
	- Use quotes: "exact phrase"
	- Both single/double quotes work
	- Escape quotes with \\"
	- Slop operator: "term1 term2"~N
	- Example: "cloud security"~2
	- the above will find "cloud framework and security "
	- Prefix matching: "start of phrase"*

	5. Wildcards:
	- ? for single character
	- * for any number of characters
	- Example: sec?rity cloud*

	6. Special Features:
	- All docs: *
	- Boost terms: term^2.0 (positive numbers only)
	- Example: security^2.0 cloud
	- the above will boost security by 2.0

	Query Examples:
	1. Basic: +שבת +חולה +אסור
	2. Field-specific: text:סיני AND topics:תנך
	3. Phrase with slop: "security framework"~2
	4. Complex: +reference:בראשית +text:"הבל"^2.0 +(דמי OR דמים) -הבלים
	6. Mixed: ((text:"רבנו משה"^2.0 OR reference:"משנה תורה") AND topics:הלכה) AND text:"תורה המלך"~3 AND NOT topics:מדרש

	Tips:
	- Group complex expressions with parentheses
	- Use quotes for exact phrases
	- Add + for required terms, - for excluded terms
	- Boost important terms with ^N
	- use field-specific terms for better results.
	"""

	    @classmethod
	    def _extract_highlight_terms(cls, query: str) -> List[str]:
	        """Return the words of ``query`` with Tantivy query syntax removed."""
	        stripped = cls._QUERY_SYNTAX_RE.sub(' ', query)
	        # Single-character leftovers (e.g. the "2" of a "~2" slop) are noise.
	        return [term for term in stripped.split() if len(term) > 1]

	    @classmethod
	    def _build_highlights(cls, text: str, pattern: Optional[re.Pattern]) -> List[str]:
	        """Return a context snippet for every match of ``pattern`` in ``text``.

	        Falls back to a single leading excerpt of ``text`` when ``pattern`` is
	        ``None`` or matches nothing.
	        """
	        snippets = []
	        if pattern is not None:
	            for match in pattern.finditer(text):
	                start = max(0, match.start() - cls._CONTEXT_CHARS)
	                end = min(len(text), match.end() + cls._CONTEXT_CHARS)
	                snippet = text[start:end]
	                # Ellipses mark the sides on which the text was cut.
	                if start > 0:
	                    snippet = f"...{snippet}"
	                if end < len(text):
	                    snippet = f"{snippet}..."
	                snippets.append(snippet)
	        if snippets:
	            return snippets
	        if len(text) > cls._FALLBACK_CHARS:
	            return [text[:cls._FALLBACK_CHARS] + "..."]
	        return [text]

	    def search(self, query: str, num_results: int = 10) -> List[Dict[str, Any]]:
	        """Run ``query`` against the index and return the hits as dictionaries.

	        Args:
	            query: Query string in Tantivy query syntax; parsed leniently.
	            num_results: Maximum number of hits requested from the searcher.

	        Returns:
	            One dictionary per hit with the keys ``score``, ``title``,
	            ``reference``, ``topics``, ``file_path``, ``line_number``,
	            ``is_pdf``, ``text`` and ``highlights``. An empty list is returned
	            when parsing, searching or result processing fails; the failure is
	            logged instead of raised.
	        """
	        try:
	            searcher = self.index.searcher()

	            try:
	                # The original code uses element 0 of the lenient parser's
	                # return value as the query object; that is kept as is.
	                parsed = self.index.parse_query_lenient(query)
	                search_results = searcher.search(parsed[0], num_results).hits
	            except Exception as query_error:
	                self.logger.error(f"Lenient query parsing failed: {query_error}")
	                # Without hits there is nothing to process; returning here
	                # avoids touching the unbound `search_results` below.
	                return []

	            # The highlight pattern depends only on the query, so build it once
	            # instead of once per hit.
	            terms = self._extract_highlight_terms(query)
	            pattern = None
	            if terms:
	                pattern = re.compile(
	                    '|'.join(re.escape(term) for term in terms),
	                    re.IGNORECASE,
	                )

	            results = []
	            for score, doc_address in search_results:
	                doc = searcher.doc(doc_address)
	                # A document without a "text" value must not abort the whole
	                # search, so a missing value is treated as empty text.
	                text = doc.get_first("text") or ""
	                results.append({
	                    "score": float(score),
	                    "title": doc.get_first("title") or os.path.basename(doc.get_first("filePath") or ""),
	                    "reference": doc.get_first("reference"),
	                    "topics": doc.get_first("topics"),
	                    "file_path": doc.get_first("filePath"),
	                    "line_number": doc.get_first("segment"),
	                    "is_pdf": doc.get_first("isPdf"),
	                    "text": text,
	                    "highlights": self._build_highlights(text, pattern),
	                })

	            self.logger.info(f"Found {len(results)} results for query: {query}")
	            return results

	        except Exception as e:
	            self.logger.error(f"Error during search: {str(e)}")
	            return []

	    def validate_index(self) -> bool:
	        """Check that the index can be searched.

	        Creates a searcher and runs the match-all query ``*`` for one result.

	        Returns:
	            ``True`` if that succeeds, ``False`` (after logging the error) if
	            any step raises.
	        """
	        try:
	            searcher = self.index.searcher()
	            match_all = self.index.parse_query("*")
	            searcher.search(match_all, 1)
	            return True
	        except Exception as e:
	            self.logger.error(f"Index validation failed: {e}")
	            return False