sivan22 committed on
Commit
a6e858c
verified
1 Parent(s): a318ec5

Upload tantivy_search_agent.py

Browse files
Files changed (1) hide show
  1. tantivy_search_agent.py +165 -0
tantivy_search_agent.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Any, Optional
2
+ from tantivy import Index
3
+ import logging
4
+ import os
5
+ import re
6
+
7
+
8
class TantivySearchAgent:
    """Search helper around a Tantivy index.

    Provides the query-syntax instructions handed to an LLM, a ``search``
    method that returns plain dictionaries with highlighted snippets, and a
    lightweight ``validate_index`` health check.
    """

    # Number of characters of context kept on each side of a highlighted match.
    _CONTEXT_CHARS = 50
    # Length of the fallback snippet used when no query term occurs in the text.
    _FALLBACK_SNIPPET_CHARS = 100

    def __init__(self, index_path: str):
        """Open the Tantivy index stored at ``index_path``.

        Any exception raised while opening the index is logged and re-raised.
        """
        self.index_path = index_path
        self.logger = logging.getLogger(__name__)
        try:
            self.index = Index.open(index_path)
            self.logger.info("Successfully opened Tantivy index at %s", index_path)
        except Exception as e:
            self.logger.error("Failed to open Tantivy index: %s", e)
            raise

    def get_query_instructions(self) -> str:
        """Return instructions for the LLM on how to construct Tantivy queries."""
        return """
        Instructions for generating a query:

        1. Boolean Operators:

        - AND: term1 AND term2 (both required)
        - OR: term1 OR term2 (either term)
        - Multiple words default to OR operation (cloud network = cloud OR network)
        - AND takes precedence over OR
        - Example: Shabath AND (walk OR go)

        2. Field-specific Terms:
        - Field-specific terms: field:term
        - Example: text:אדם AND reference:בראשית
        - available fields: text, reference, topics
        - text contains the text of the document
        - reference contains the citation of the document, e.g. בראשית, פרק א
        - topics contains the topics of the document. available topics includes: תנך, הלכה, מדרש, etc.

        3. Required/Excluded Terms:
        - Required (+): +term (must contain)
        - Excluded (-): -term (must not contain)
        - Example: +security cloud -deprecated
        - Meaning: security is required, cloud is optional, deprecated is excluded

        4. Phrase Search:
        - Use quotes: "exact phrase"
        - Both single/double quotes work
        - Escape quotes with \\"
        - Slop operator: "term1 term2"~N
        - Example: "cloud security"~2
        - the above will find "cloud framework and security "
        - Prefix matching: "start of phrase"*

        5. Wildcards:
        - ? for single character
        - * for any number of characters
        - Example: sec?rity cloud*

        6. Special Features:
        - All docs: *
        - Boost terms: term^2.0 (positive numbers only)
        - Example: security^2.0 cloud
        - the above will boost security by 2.0

        Query Examples:
        1. Basic: +שבת +חולה +אסור
        2. Field-specific: text:סיני AND topics:תנך
        3. Phrase with slop: "security framework"~2
        4. Complex: +reference:בראשית +text:"הבל"^2.0 +(דמי OR דמים) -הבלים
        5. Mixed: ((text:"רבנו משה"^2.0 OR reference:"משנה תורה") AND topics:הלכה) AND text:"תורה המלך"~3 AND NOT topics:מדרש

        Tips:
        - Group complex expressions with parentheses
        - Use quotes for exact phrases
        - Add + for required terms, - for excluded terms
        - Boost important terms with ^N
        - use field-specific terms for better results.
        """

    @staticmethod
    def _extract_highlight_terms(query: str) -> List[str]:
        """Return the plain search words of ``query`` with query syntax removed."""
        # Drop "field:" prefixes first so field names are not highlighted.
        cleaned = re.sub(r'\b[A-Za-z_]\w*:(?=\S)', ' ', query)
        # Drop boost (^2.0) and slop (~3) suffixes so their numbers are not terms.
        cleaned = re.sub(r'[\^~]\d+(?:\.\d+)?', ' ', cleaned)
        # Drop the remaining operators and punctuation of the query language.
        cleaned = re.sub(
            r'[:"()\[\]{}^~*\\]|\b(AND|OR|NOT|TO|IN)\b|[-+]',
            ' ',
            cleaned
        )
        # Single-character leftovers are too noisy to highlight; dict.fromkeys
        # removes duplicates while keeping first-seen order.
        return list(dict.fromkeys(term for term in cleaned.split() if len(term) > 1))

    @classmethod
    def _build_highlights(cls, text: str, pattern: Optional["re.Pattern[str]"]) -> List[str]:
        """Return context snippets around each ``pattern`` match in ``text``.

        Falls back to the leading part of ``text`` when ``pattern`` is None or
        nothing matches.
        """
        snippets = []
        if pattern is not None:
            for match in pattern.finditer(text):
                start = max(0, match.start() - cls._CONTEXT_CHARS)
                end = min(len(text), match.end() + cls._CONTEXT_CHARS)
                snippet = text[start:end]
                # Ellipses mark that the snippet was cut out of a longer text.
                if start > 0:
                    snippet = f"...{snippet}"
                if end < len(text):
                    snippet = f"{snippet}..."
                snippets.append(snippet)
        if snippets:
            return snippets
        limit = cls._FALLBACK_SNIPPET_CHARS
        return [text[:limit] + "..." if len(text) > limit else text]

    def search(self, query: str, num_results: int = 10) -> List[Dict[str, Any]]:
        """Search the index using Tantivy's query syntax.

        Args:
            query: Query string in Tantivy query syntax.
            num_results: Maximum number of hits to return.

        Returns:
            One dictionary per hit with the keys ``score``, ``title``,
            ``reference``, ``topics``, ``file_path``, ``line_number``,
            ``is_pdf``, ``text`` and ``highlights``. An empty list is returned
            when parsing or searching fails; the error is logged, not raised.
        """
        try:
            searcher = self.index.searcher()

            try:
                # Lenient parsing returns the parsed query first; the query is
                # usable even when parts of the input could not be parsed.
                parse_result = self.index.parse_query_lenient(query)
                hits = searcher.search(parse_result[0], num_results).hits
            except Exception as query_error:
                self.logger.error("Lenient query parsing failed: %s", query_error)
                # Without hits there is nothing to process; stop here instead
                # of falling through to an undefined result set.
                return []

            # The highlight pattern depends only on the query, so build it once.
            terms = self._extract_highlight_terms(query)
            pattern = None
            if terms:
                pattern = re.compile(
                    '|'.join(re.escape(term) for term in terms),
                    re.IGNORECASE
                )

            results = []
            for score, doc_address in hits:
                doc = searcher.doc(doc_address)
                # A document without a text field must not abort the search.
                text = doc.get_first("text") or ""
                file_path = doc.get_first("filePath")

                results.append({
                    "score": float(score),
                    "title": doc.get_first("title") or os.path.basename(file_path or ""),
                    "reference": doc.get_first("reference"),
                    "topics": doc.get_first("topics"),
                    "file_path": file_path,
                    "line_number": doc.get_first("segment"),
                    "is_pdf": doc.get_first("isPdf"),
                    "text": text,
                    "highlights": self._build_highlights(text, pattern)
                })

            self.logger.info("Found %s results for query: %s", len(results), query)
            return results

        except Exception as e:
            self.logger.error("Error during search: %s", e)
            return []

    def validate_index(self) -> bool:
        """Return True when a match-all query can be run against the index.

        Any exception is logged and reported as False.
        """
        try:
            searcher = self.index.searcher()
            match_all = self.index.parse_query("*")
            searcher.search(match_all, 1)
            return True
        except Exception as e:
            self.logger.error("Index validation failed: %s", e)
            return False