File size: 4,910 Bytes
a2e592b
 
 
 
 
849b076
 
 
a2e592b
 
 
e9ce916
a2e592b
9681aac
a2e592b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150c4a4
849b076
 
 
150c4a4
849b076
 
 
 
 
3ad5350
a2e592b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
from langchain_core.tools import tool
from sefaria import get_text as sefaria_get_text, get_commentaries as sefaria_get_commentaries
from tantivy_search import TantivySearch
from typing import Optional
from pydantic import BaseModel, Field
import os 
import gdown 
import zipfile

from app import INDEX_PATH

                    
# Argument schema for the `read_text` tool.
# NOTE: documented with comments rather than a class docstring so the
# generated pydantic schema stays exactly as it was.
class ReadTextArgs(BaseModel):
    # Citation of the passage to fetch; the description text is part of the
    # schema and is kept verbatim.
    reference: str = Field(
        description="The reference to retrieve the text for. examples: 讘专讗砖讬转 讗 驻专拽 讗; 砖讜诇讞谉 注专讜讱 讞讜砖谉 诪砖驻讟 住讬诪谉 砖诪讟 住注讬祝 讗",
    )

# Argument schema for the `search` tool.
# The `query` description doubles as the query-syntax guide handed to whoever
# (or whatever model) fills in the arguments, so every example in it should be
# a query that actually parses.
# NOTE: documented with comments rather than a class docstring so the
# generated pydantic schema description is not affected.
class SearchArgs(BaseModel):
    # Query string in the search engine's query language. The "Mixed" example
    # previously had an unmatched closing parenthesis and was numbered 6
    # after 4; both are corrected below.
    query: str = Field(description="""the query for the search.
    Instructions for generating a query:

    1. Boolean Operators:

    - AND: term1 AND term2 (both required)
    - OR: term1 OR term2 (either term)
    - Multiple words default to OR operation (cloud network = cloud OR network)
    - AND takes precedence over OR
    - Example: Shabath AND (walk OR go)

    2. Field-specific Terms:
    - Field-specific terms: field:term
    - Example: text:讗讚诐 AND reference:讘专讗砖讬转
    - available fields: text, reference, topics
    - text contains the text of the document
    - reference contains the citation of the document, e.g. 讘专讗砖讬转, 驻专拽 讗
    - topics contains the topics of the document. available topics includes: 转谞讱, 讛诇讻讛, 诪讚专砖, etc.

    3. Required/Excluded Terms:
    - Required (+): +term (must contain)
    - Excluded (-): -term (must not contain)
    - Example: +security cloud -deprecated
    - Equivalent to: security AND cloud AND NOT deprecated

    4. Phrase Search:
    - Use quotes: "exact phrase"
    - Both single/double quotes work
    - Escape quotes with \\"
    - Slop operator: "term1 term2"~N 
    - Example: "cloud security"~2 
    - the above will find "cloud framework and security "
    - Prefix matching: "start of phrase"*

    5. Wildcards:
    - ? for single character
    - * for any number of characters
    - Example: sec?rity cloud*

    6. Special Features:
    - All docs: * 
    - Boost terms: term^2.0 (positive numbers only)
    - Example: security^2.0 cloud
    - the above will boost security by 2.0
    
    Query Examples:
    1. Basic: +砖讘转 +讞讜诇讛 +讗住讜专
    2. Field-specific: text:住讬谞讬 AND topics:转谞讱
    3. Phrase with slop: "security framework"~2
    4. Complex: +reference:讘专讗砖讬转 +text:"讛讘诇"^2.0 +(讚诪讬 OR 讚诪讬诐) -讛讘诇讬诐
    5. Mixed: ((text:"专讘谞讜 诪砖讛"^2.0 OR reference:"诪砖谞讛 转讜专讛") AND topics:讛诇讻讛) AND text:"转讜专讛 讛诪诇讱"~3 AND NOT topics:诪讚专砖

    Tips:
    - Group complex expressions with parentheses
    - Use quotes for exact phrases
    - Add + for required terms, - for excluded terms
    - Boost important terms with ^N
    - use field-specific terms for better results. 
    - the corpus to search in is an ancient Hebrew corpus: Tora and Talmud. so Try to use ancient Hebrew terms and or Talmudic expressions and prevent modern words that are not common in talmudic texts
    """)
    # Upper bound on the number of hits returned; defaults to 10.
    num_results: int = Field(description="the maximum number of results to return. Default: 10", default=10)



# --- Index bootstrap (runs at import time) ---------------------------------
# Location of the search index on disk, as configured by the application.
index_path = INDEX_PATH
# Google Drive file id of the zipped index; can be overridden through the
# GDRIVE_INDEX_ID environment variable.
gdrive_index_id = os.getenv("GDRIVE_INDEX_ID", "1lpbBCPimwcNfC0VZOlQueA4SHNGIp5_t")

# Fetch and unpack the index only when it is not already present locally.
if not os.path.exists(index_path):
    zip_path = "index.zip"
    url = f"https://drive.google.com/uc?id={gdrive_index_id}"
    try:
        gdown.download(url, zip_path, quiet=False)
        # NOTE(review): the archive is unpacked into the current working
        # directory, so it is presumably expected to contain `index_path`
        # relative to it — confirm against the published archive layout.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(".")
    except Exception as e:
        # Chain the cause so the original traceback stays attached.
        raise Exception(f"failed to download index: {e}") from e
    finally:
        # Remove the archive on both the success and the failure path so a
        # broken or partial download is not left behind for the next start.
        if os.path.exists(zip_path):
            os.remove(zip_path)

# Open the index and run its validation; any failure aborts the import with
# a descriptive error.
try:
    tantivy = TantivySearch(index_path)
    tantivy.validate_index()
except Exception as e:
    raise Exception(f"failed to create index: {e}") from e
        
            
    
@tool(args_schema=SearchArgs)
def search(query: str, num_results: int = 10):
    """Searches the index for the given query."""
    # Docstring kept verbatim: the `tool` decorator may expose it as the
    # tool's description — confirm before rewording.
    #
    # Project each hit down to its text and reference, substituting 'N/A'
    # for whichever of the two a hit does not carry.
    return [
        {
            'text': hit.get('text', 'N/A'),
            'reference': hit.get('reference', 'N/A'),
        }
        for hit in tantivy.search(query, num_results)
    ]


@tool(args_schema=ReadTextArgs)
def read_text(reference: str) -> dict:
    """Retrieves the text for a given reference.  
    """
    # Docstring kept verbatim: the `tool` decorator may expose it as the
    # tool's description — confirm before rewording.
    #
    # The return annotation used to say `str`, but a dict has always been
    # returned; the annotation now matches the actual value.
    text = sefaria_get_text(reference)
    # Whatever the lookup returns is stringified, and the requested
    # reference is echoed back alongside it.
    return {
        'text': str(text),
        'reference': reference,
    }

@tool
def get_commentaries(reference: str, num_results: int = 10) -> dict:
    """Retrieves references to all available commentaries on the given verse."""
    # Docstring kept verbatim: the `tool` decorator may expose it as the
    # tool's description — confirm before rewording.
    #
    # The return annotation used to say `str`, but a dict has always been
    # returned; the annotation now matches the actual value.
    #
    # NOTE(review): `num_results` is accepted but not applied — every
    # commentary returned by the lookup is included. Decide whether the
    # parameter should cap the list or be retired; left unchanged here so
    # existing callers keep receiving the full list.
    commentaries = sefaria_get_commentaries(reference)
    if isinstance(commentaries, list):
        # A list of entries is flattened to one entry per line.
        joined = '\n'.join(commentaries)
    else:
        # Any other return value is passed through as its string form.
        joined = str(commentaries)
    return {
        'text': joined,
        'reference': f"Commentaries on {reference}",
    }