sivan22 committed on
Commit
68b3376
verified
1 Parent(s): 6cd9eb9

Delete tantivy_search_agent.py

Browse files
Files changed (1) hide show
  1. tantivy_search_agent.py +0 -165
tantivy_search_agent.py DELETED
@@ -1,165 +0,0 @@
1
- from typing import List, Dict, Any, Optional
2
- from tantivy import Index
3
- import logging
4
- import os
5
- import re
6
-
7
-
8
class TantivySearchAgent:
    """Search helper built on top of an on-disk Tantivy index.

    The agent opens the index once, exposes a cheat sheet of the query syntax
    (intended to be handed to an LLM that writes the queries), runs searches
    and returns plain dictionaries with highlighted snippets, and offers a
    simple health check.
    """

    # Query-syntax punctuation and operator keywords that are blanked out of a
    # query before the remaining words are used as highlight terms.
    _QUERY_SYNTAX_RE = re.compile(r'[:"()[\]{}^~*\\]|\b(AND|OR|NOT|TO|IN)\b|[-+]')
    # Characters of context kept on each side of a highlighted match.
    _CONTEXT_CHARS = 50
    # Length of the leading excerpt used when nothing in the text matches.
    _FALLBACK_CHARS = 100

    def __init__(self, index_path: str):
        """Open the Tantivy index stored at ``index_path``.

        Args:
            index_path: Location passed to ``Index.open``.

        Raises:
            Exception: Whatever ``Index.open`` raised; it is logged and then
                re-raised unchanged.
        """
        self.index_path = index_path
        self.logger = logging.getLogger(__name__)
        try:
            self.index = Index.open(index_path)
            self.logger.info(f"Successfully opened Tantivy index at {index_path}")
        except Exception as e:
            self.logger.error(f"Failed to open Tantivy index: {e}")
            raise

    def get_query_instructions(self) -> str:
        """Return instructions for the LLM on how to construct Tantivy queries.

        Returns:
            A multi-line description of the query syntax with examples.
        """
        return """
Instructions for generating a query:

1. Boolean Operators:

- AND: term1 AND term2 (both required)
- OR: term1 OR term2 (either term)
- Multiple words default to OR operation (cloud network = cloud OR network)
- AND takes precedence over OR
- Example: Shabath AND (walk OR go)

2. Field-specific Terms:
- Field-specific terms: field:term
- Example: text:אדם AND reference:בראשית
- available fields: text, reference, topics
- text contains the text of the document
- reference contains the citation of the document, e.g. בראשית, פרק א
- topics contains the topics of the document. available topics includes: תנך, הלכה, מדרש, etc.

3. Required/Excluded Terms:
- Required (+): +term (must contain)
- Excluded (-): -term (must not contain)
- Example: +security cloud -deprecated
- Equivalent to: security AND cloud AND NOT deprecated

4. Phrase Search:
- Use quotes: "exact phrase"
- Both single/double quotes work
- Escape quotes with \\"
- Slop operator: "term1 term2"~N
- Example: "cloud security"~2
- the above will find "cloud framework and security "
- Prefix matching: "start of phrase"*

5. Wildcards:
- ? for single character
- * for any number of characters
- Example: sec?rity cloud*

6. Special Features:
- All docs: *
- Boost terms: term^2.0 (positive numbers only)
- Example: security^2.0 cloud
- the above will boost security by 2.0

Query Examples:
1. Basic: +שבת +חולה +אסור
2. Field-specific: text:סיני AND topics:תנך
3. Phrase with slop: "security framework"~2
4. Complex: +reference:בראשית +text:"הבל"^2.0 +(דמי OR דמים) -הבלים
5. Mixed: ((text:"רבנו משה"^2.0 OR reference:"משנה תורה") AND topics:הלכה) AND text:"תורה המלך"~3 AND NOT topics:מדרש

Tips:
- Group complex expressions with parentheses
- Use quotes for exact phrases
- Add + for required terms, - for excluded terms
- Boost important terms with ^N
- use field-specific terms for better results.
"""

    @classmethod
    def _highlight_pattern(cls, query: str) -> Optional["re.Pattern"]:
        """Build the case-insensitive regex used to locate query words in a text.

        Query syntax is replaced by spaces and words of a single character are
        dropped. Returns ``None`` when no usable word remains.
        """
        words = [
            word
            for word in cls._QUERY_SYNTAX_RE.sub(' ', query).split()
            if len(word) > 1
        ]
        if not words:
            return None
        # Escape every word so that it is matched literally.
        return re.compile('|'.join(re.escape(word) for word in words), re.IGNORECASE)

    @classmethod
    def _snippets(cls, pattern: Optional["re.Pattern"], text: str) -> List[str]:
        """Return one context snippet per match, or a leading excerpt if none."""
        snippets = []
        if pattern is not None:
            for match in pattern.finditer(text):
                start = max(0, match.start() - cls._CONTEXT_CHARS)
                end = min(len(text), match.end() + cls._CONTEXT_CHARS)
                snippet = text[start:end]
                # Ellipses mark the sides on which the text was cut.
                if start > 0:
                    snippet = f"...{snippet}"
                if end < len(text):
                    snippet = f"{snippet}..."
                snippets.append(snippet)
        if snippets:
            return snippets
        limit = cls._FALLBACK_CHARS
        return [text[:limit] + "..." if len(text) > limit else text]

    def search(self, query: str, num_results: int = 10) -> List[Dict[str, Any]]:
        """Search the index using Tantivy's query syntax.

        Args:
            query: Query string in Tantivy syntax; it is parsed leniently.
            num_results: Maximum number of hits requested from the searcher.

        Returns:
            One dictionary per hit with the keys ``score``, ``title``,
            ``reference``, ``topics``, ``file_path``, ``line_number``,
            ``is_pdf``, ``text`` and ``highlights``. A document without a
            ``text`` field is reported with an empty text. An empty list is
            returned when the query cannot be parsed or executed, or when any
            other error occurs; the error is logged, never raised.
        """
        try:
            searcher = self.index.searcher()

            try:
                # parse_query_lenient returns a tuple whose first item is the query.
                parsed = self.index.parse_query_lenient(query)
                hits = searcher.search(parsed[0], num_results).hits
            except Exception as query_error:
                self.logger.error(f"Lenient query parsing failed: {query_error}")
                # Without hits there is nothing to process; stop here instead
                # of falling through to an undefined result set.
                return []

            # The highlight pattern depends only on the query, so build it once.
            pattern = self._highlight_pattern(query)

            results = []
            for score, doc_address in hits:
                doc = searcher.doc(doc_address)
                # A missing text field must not abort the whole search.
                text = doc.get_first("text") or ""
                file_path = doc.get_first("filePath")
                results.append({
                    "score": float(score),
                    "title": doc.get_first("title") or os.path.basename(file_path or ""),
                    "reference": doc.get_first("reference"),
                    "topics": doc.get_first("topics"),
                    "file_path": file_path,
                    "line_number": doc.get_first("segment"),
                    "is_pdf": doc.get_first("isPdf"),
                    "text": text,
                    "highlights": self._snippets(pattern, text),
                })

            self.logger.info(f"Found {len(results)} results for query: {query}")
            return results

        except Exception as e:
            self.logger.error(f"Error during search: {str(e)}")
            return []

    def validate_index(self) -> bool:
        """Check that the index can be searched.

        Returns:
            ``True`` when a match-all query for a single document runs without
            raising, ``False`` otherwise (the failure is logged).
        """
        try:
            searcher = self.index.searcher()
            match_all = self.index.parse_query("*")
            searcher.search(match_all, 1)
            return True
        except Exception as e:
            self.logger.error(f"Index validation failed: {e}")
            return False