|
from typing import List, Dict, Any, Optional
|
|
from tantivy import Index
|
|
import logging
|
|
import os
|
|
import re
|
|
|
|
|
|
class TantivySearchAgent:
    """Thin search helper around an on-disk Tantivy index.

    The agent opens the index once, exposes query-syntax instructions meant
    to be handed to an LLM, and runs lenient full-text searches that return
    plain dictionaries.  From each hit it reads the document fields ``text``,
    ``title``, ``filePath``, ``reference``, ``topics``, ``segment`` and
    ``isPdf``.
    """

    # Characters of context kept on each side of a highlighted match.
    _HIGHLIGHT_CONTEXT = 50
    # Length of the plain preview used when nothing in the text matches.
    _PREVIEW_LENGTH = 100

    # Everything in a query string that is syntax rather than a search term.
    _QUERY_SYNTAX_RE = re.compile(
        r'\b\w+:(?=\S)'                # field prefix such as text: / reference:
        r'|[\^~]\d+(?:\.\d+)?'         # boost (^2.0) and slop (~2) modifiers
        r'|[:"()\[\]{}^~*\\]'          # remaining query punctuation
        r'|\b(?:AND|OR|NOT|TO|IN)\b'   # boolean / range keywords
        r'|[-+]'                       # required / excluded markers
    )

    def __init__(self, index_path: str):
        """Open the Tantivy index stored at *index_path*.

        Args:
            index_path: Directory of an existing Tantivy index.

        Raises:
            Exception: Whatever ``Index.open`` raises; the error is logged
                and re-raised unchanged.
        """
        self.index_path = index_path
        self.logger = logging.getLogger(__name__)
        try:
            self.index = Index.open(index_path)
        except Exception as e:
            self.logger.error("Failed to open Tantivy index: %s", e)
            raise
        self.logger.info("Successfully opened Tantivy index at %s", index_path)

    def get_query_instructions(self) -> str:
        """Return instructions for the LLM on how to construct Tantivy queries."""
        return """
Instructions for generating a query:

1. Boolean Operators:

- AND: term1 AND term2 (both required)
- OR: term1 OR term2 (either term)
- Multiple words default to OR operation (cloud network = cloud OR network)
- AND takes precedence over OR
- Example: Shabath AND (walk OR go)

2. Field-specific Terms:
- Field-specific terms: field:term
- Example: text:אדם AND reference:בראשית
- available fields: text, reference, topics
- text contains the text of the document
- reference contains the citation of the document, e.g. בראשית, פרק א
- topics contains the topics of the document. available topics includes: תנך, הלכה, מדרש, etc.

3. Required/Excluded Terms:
- Required (+): +term (must contain)
- Excluded (-): -term (must not contain)
- Example: +security cloud -deprecated
- Equivalent to: security AND cloud AND NOT deprecated

4. Phrase Search:
- Use quotes: "exact phrase"
- Both single/double quotes work
- Escape quotes with \\"
- Slop operator: "term1 term2"~N
- Example: "cloud security"~2
- the above will find "cloud framework and security "
- Prefix matching: "start of phrase"*

5. Wildcards:
- ? for single character
- * for any number of characters
- Example: sec?rity cloud*

6. Special Features:
- All docs: *
- Boost terms: term^2.0 (positive numbers only)
- Example: security^2.0 cloud
- the above will boost security by 2.0

Query Examples:
1. Basic: +שבת +חולה +אסור
2. Field-specific: text:סיני AND topics:תנך
3. Phrase with slop: "security framework"~2
4. Complex: +reference:בראשית +text:"הבל"^2.0 +(דמי OR דמים) -הבלים
5. Mixed: ((text:"רבנו משה"^2.0 OR reference:"משנה תורה") AND topics:הלכה) AND text:"תורה המלך"~3 AND NOT topics:מדרש

Tips:
- Group complex expressions with parentheses
- Use quotes for exact phrases
- Add + for required terms, - for excluded terms
- Boost important terms with ^N
- use field-specific terms for better results.
"""

    def search(self, query: str, num_results: int = 10) -> List[Dict[str, Any]]:
        """Search the index with *query* written in Tantivy's query syntax.

        The query is parsed leniently.  Any failure is logged and reported
        to the caller as an empty result list; this method does not raise
        for ordinary errors.

        Args:
            query: Query string in Tantivy syntax.
            num_results: Maximum number of hits to return; values below 1
                yield an empty list.

        Returns:
            One dictionary per hit with the keys ``score``, ``title``,
            ``reference``, ``topics``, ``file_path``, ``line_number``,
            ``is_pdf``, ``text`` and ``highlights``.
        """
        try:
            # A non-positive limit cannot return anything; do not hand it to
            # the searcher at all.
            if num_results <= 0:
                return []

            searcher = self.index.searcher()

            try:
                # parse_query_lenient returns a sequence whose first item is
                # the parsed query; the second, when present, lists problems.
                parsed = self.index.parse_query_lenient(query)
                parse_errors = parsed[1] if len(parsed) > 1 else None
                if parse_errors:
                    self.logger.debug(
                        "Query %r parsed leniently with errors: %s",
                        query, parse_errors,
                    )
                hits = searcher.search(parsed[0], num_results).hits
            except Exception as query_error:
                self.logger.error("Lenient query parsing failed: %s", query_error)
                # Nothing to iterate over, so stop here with the real error
                # already logged.
                return []

            # The highlight pattern depends only on the query, so build it
            # once instead of once per hit.
            highlight_pattern = self._compile_highlight_pattern(query)
            results = [
                self._build_result(searcher.doc(doc_address), score, highlight_pattern)
                for score, doc_address in hits
            ]

            self.logger.info("Found %s results for query: %s", len(results), query)
            return results

        except Exception as e:
            self.logger.error("Error during search: %s", e)
            return []

    def validate_index(self) -> bool:
        """Check that the index can be searched.

        Runs the match-all query ``*`` with a limit of one hit.

        Returns:
            True if the probe search completes, False (after logging the
            error) if any step raises.
        """
        try:
            searcher = self.index.searcher()
            probe_query = self.index.parse_query("*")
            searcher.search(probe_query, 1)
            return True
        except Exception as e:
            self.logger.error("Index validation failed: %s", e)
            return False

    @classmethod
    def _extract_highlight_terms(cls, query: str) -> List[str]:
        """Return the plain search terms of *query*, with query syntax removed.

        Field prefixes, boost/slop modifiers, operators and punctuation are
        dropped; single-character leftovers are ignored.
        """
        stripped = cls._QUERY_SYNTAX_RE.sub(' ', query)
        return [term for term in stripped.split() if len(term) > 1]

    @classmethod
    def _compile_highlight_pattern(cls, query: str) -> Optional[re.Pattern]:
        """Compile a case-insensitive pattern matching any term of *query*.

        Returns None when the query contains no usable term.
        """
        terms = cls._extract_highlight_terms(query)
        if not terms:
            return None
        return re.compile('|'.join(re.escape(term) for term in terms), re.IGNORECASE)

    @classmethod
    def _build_highlights(cls, text: str, pattern: Optional[re.Pattern]) -> List[str]:
        """Return a context snippet for every match of *pattern* in *text*.

        Snippets are marked with ``...`` on each side that was cut.  When
        there is no pattern or no match, the result is a single preview of
        the start of the text.
        """
        highlights = []
        if pattern is not None:
            for match in pattern.finditer(text):
                start = max(0, match.start() - cls._HIGHLIGHT_CONTEXT)
                end = min(len(text), match.end() + cls._HIGHLIGHT_CONTEXT)
                snippet = text[start:end]
                if start > 0:
                    snippet = f"...{snippet}"
                if end < len(text):
                    snippet = f"{snippet}..."
                highlights.append(snippet)

        if not highlights:
            if len(text) > cls._PREVIEW_LENGTH:
                highlights = [text[:cls._PREVIEW_LENGTH] + "..."]
            else:
                highlights = [text]
        return highlights

    def _build_result(self, doc: Any, score: Any, highlight_pattern: Optional[re.Pattern]) -> Dict[str, Any]:
        """Convert one hit document and its score into a result dictionary."""
        # A document without a text field must not abort the whole search.
        text = doc.get_first("text") or ""
        file_path = doc.get_first("filePath")
        return {
            "score": float(score),
            # Fall back to the file name when the document has no title.
            "title": doc.get_first("title") or os.path.basename(file_path or ""),
            "reference": doc.get_first("reference"),
            "topics": doc.get_first("topics"),
            "file_path": file_path,
            "line_number": doc.get_first("segment"),
            "is_pdf": doc.get_first("isPdf"),
            "text": text,
            "highlights": self._build_highlights(text, highlight_pattern),
        }
|
|
|