Spaces:
Running
Running
File size: 4,910 Bytes
a2e592b 849b076 a2e592b e9ce916 a2e592b 9681aac a2e592b 150c4a4 849b076 150c4a4 849b076 3ad5350 a2e592b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
from langchain_core.tools import tool
from sefaria import get_text as sefaria_get_text, get_commentaries as sefaria_get_commentaries
from tantivy_search import TantivySearch
from typing import Optional
from pydantic import BaseModel, Field
import os
import gdown
import zipfile
from app import INDEX_PATH
class ReadTextArgs(BaseModel):
    # Argument schema for the `read_text` tool.
    # `reference` is a citation string; the examples in the description are
    # written in Hebrew (previously they were mis-encoded: UTF-8 Hebrew bytes
    # rendered as CJK characters, which made the examples unreadable).
    reference: str = Field(description="The reference to retrieve the text for. examples: בראשית א פרק א; שולחן ערוך חושן משפט סימן שמט סעיף א")
class SearchArgs(BaseModel):
    # Argument schema for the `search` tool.
    # The `query` description doubles as a query-syntax guide for whoever
    # builds the query, so every example in it should be a well-formed query:
    #   * Hebrew examples are stored as real Hebrew text (they were previously
    #     mis-encoded: UTF-8 Hebrew bytes rendered as CJK characters).
    #   * The "Mixed" example now has balanced parentheses and the example
    #     numbering is contiguous.
    query: str = Field(description="""the query for the search.
Instructions for generating a query:
1. Boolean Operators:
- AND: term1 AND term2 (both required)
- OR: term1 OR term2 (either term)
- Multiple words default to OR operation (cloud network = cloud OR network)
- AND takes precedence over OR
- Example: Shabath AND (walk OR go)
2. Field-specific Terms:
- Field-specific terms: field:term
- Example: text:אדם AND reference:בראשית
- available fields: text, reference, topics
- text contains the text of the document
- reference contains the citation of the document, e.g. בראשית, פרק א
- topics contains the topics of the document. available topics includes: תנך, הלכה, מדרש, etc.
3. Required/Excluded Terms:
- Required (+): +term (must contain)
- Excluded (-): -term (must not contain)
- Example: +security cloud -deprecated
- Equivalent to: security AND cloud AND NOT deprecated
4. Phrase Search:
- Use quotes: "exact phrase"
- Both single/double quotes work
- Escape quotes with \\"
- Slop operator: "term1 term2"~N
- Example: "cloud security"~2
- the above will find "cloud framework and security "
- Prefix matching: "start of phrase"*
5. Wildcards:
- ? for single character
- * for any number of characters
- Example: sec?rity cloud*
6. Special Features:
- All docs: *
- Boost terms: term^2.0 (positive numbers only)
- Example: security^2.0 cloud
- the above will boost security by 2.0
Query Examples:
1. Basic: +שבת +חולה +אסור
2. Field-specific: text:סיני AND topics:תנך
3. Phrase with slop: "security framework"~2
4. Complex: +reference:בראשית +text:"הבל"^2.0 +(דמי OR דמים) -הבלים
5. Mixed: ((text:"רבנו משה"^2.0 OR reference:"משנה תורה") AND topics:הלכה) AND text:"תורה המלך"~3 AND NOT topics:מדרש
Tips:
- Group complex expressions with parentheses
- Use quotes for exact phrases
- Add + for required terms, - for excluded terms
- Boost important terms with ^N
- use field-specific terms for better results.
- the corpus to search in is an ancient Hebrew corpus: Tora and Talmud. so Try to use ancient Hebrew terms and or Talmudic expressions and prevent modern words that are not common in talmudic texts
""")
    # Upper bound on the number of hits returned; defaults to 10.
    num_results: int = Field(description="the maximum number of results to return. Default: 10", default=10)
# Where the search index lives on disk, and the Google Drive file id of the
# zipped index that is fetched when that path does not exist yet. The id can be
# overridden through the GDRIVE_INDEX_ID environment variable.
index_path = INDEX_PATH
gdrive_index_id = os.getenv("GDRIVE_INDEX_ID", "1lpbBCPimwcNfC0VZOlQueA4SHNGIp5_t")

if not os.path.exists(index_path):
    zip_path = "index.zip"
    try:
        try:
            url = f"https://drive.google.com/uc?id={gdrive_index_id}"
            gdown.download(url, zip_path, quiet=False)
            # The archive is unpacked into the current working directory.
            # NOTE(review): presumably it contains the directory named by
            # INDEX_PATH - confirm against the published archive.
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(".")
        finally:
            # Always drop the archive, including when the download or the
            # extraction failed, so a broken zip is not left behind on disk.
            if os.path.exists(zip_path):
                os.remove(zip_path)
    except Exception as e:
        # Same exception type and message as before; `from e` keeps the
        # original failure as the explicit cause in the traceback.
        raise Exception(f"failed to download index: {e}") from e

# Open the index once at import time; the tool functions in this module share
# this single `tantivy` instance.
try:
    tantivy = TantivySearch(index_path)
    tantivy.validate_index()
except Exception as e:
    raise Exception(f"failed to create index: {e}") from e
@tool(args_schema=SearchArgs)
def search(query: str, num_results: int = 10):
    """Searches the index for the given query."""
    # NOTE(review): the docstring above is presumably what the `tool` decorator
    # exposes as the tool description, so it is kept verbatim.
    #
    # Each hit is reduced to the two fields handed back to the caller; a field
    # missing from a hit is reported as 'N/A'.
    hits = tantivy.search(query, num_results)
    return [
        {
            'text': hit.get('text', 'N/A'),
            'reference': hit.get('reference', 'N/A'),
        }
        for hit in hits
    ]
@tool(args_schema=ReadTextArgs)
def read_text(reference: str) -> dict:
    """Retrieves the text for a given reference.
    """
    # NOTE(review): the docstring above is presumably what the `tool` decorator
    # exposes as the tool description, so it is kept verbatim.
    #
    # The return annotation used to say `str`, but the function has always
    # returned a dict with two keys:
    #   'text'      - str() of whatever sefaria_get_text returned
    #   'reference' - the reference exactly as it was passed in
    passage = sefaria_get_text(reference)
    return {
        'text': str(passage),
        'reference': reference,
    }
@tool
def get_commentaries(reference: str, num_results: int = 10) -> dict:
    """Retrieves references to all available commentaries on the given verse."""
    # NOTE(review): the docstring above is presumably what the `tool` decorator
    # exposes as the tool description, so it is kept verbatim.
    #
    # The return annotation used to say `str`, but the function has always
    # returned a dict with two keys:
    #   'text'      - the commentaries joined one per line when the lookup
    #                 returns a list, otherwise str() of the returned value
    #   'reference' - a human-readable label built from the input reference
    #
    # NOTE(review): `num_results` is accepted but not applied - every
    # commentary returned by the lookup is included. It is kept in the
    # signature so existing callers keep working; decide whether it should
    # truncate the list.
    commentaries = sefaria_get_commentaries(reference)
    if isinstance(commentaries, list):
        joined = '\n'.join(commentaries)
    else:
        joined = str(commentaries)
    return {
        'text': joined,
        'reference': f"Commentaries on {reference}",
    }
|