from langchain_core.tools import tool from sefaria import get_text as sefaria_get_text, get_commentaries as sefaria_get_commentaries from tantivy_search import TantivySearch from typing import Optional from pydantic import BaseModel, Field import os import gdown import zipfile from app import INDEX_PATH class ReadTextArgs(BaseModel): reference: str = Field(description="The reference to retrieve the text for. examples: בראשית א פרק א; שולחן ערוך חושן משפט סימן שמט סעיף א") class SearchArgs(BaseModel): query: str = Field(description="""the query for the search. Instructions for generating a query: 1. Boolean Operators: - AND: term1 AND term2 (both required) - OR: term1 OR term2 (either term) - Multiple words default to OR operation (cloud network = cloud OR network) - AND takes precedence over OR - Example: Shabath AND (walk OR go) 2. Field-specific Terms: - Field-specific terms: field:term - Example: text:אדם AND reference:בראשית - available fields: text, reference, topics - text contains the text of the document - reference contains the citation of the document, e.g. בראשית, פרק א - topics contains the topics of the document. available topics includes: תנך, הלכה, מדרש, etc. 3. Required/Excluded Terms: - Required (+): +term (must contain) - Excluded (-): -term (must not contain) - Example: +security cloud -deprecated - Equivalent to: security AND cloud AND NOT deprecated 4. Phrase Search: - Use quotes: "exact phrase" - Both single/double quotes work - Escape quotes with \\" - Slop operator: "term1 term2"~N - Example: "cloud security"~2 - the above will find "cloud framework and security " - Prefix matching: "start of phrase"* 5. Wildcards: - ? for single character - * for any number of characters - Example: sec?rity cloud* 6. Special Features: - All docs: * - Boost terms: term^2.0 (positive numbers only) - Example: security^2.0 cloud - the above will boost security by 2.0 Query Examples: 1. Basic: +שבת +חולה +אסור 2. Field-specific: text:סיני AND topics:תנך 3. Phrase with slop: "security framework"~2 4. Complex: +reference:בראשית +text:"הבל"^2.0 +(דמי OR דמים) -הבלים 6. Mixed: (text:"רבנו משה"^2.0 OR reference:"משנה תורה") AND topics:הלכה) AND text:"תורה המלך"~3 AND NOT topics:מדרש Tips: - Group complex expressions with parentheses - Use quotes for exact phrases - Add + for required terms, - for excluded terms - Boost important terms with ^N - use field-specific terms for better results. - the corpus to search in is an ancient Hebrew corpus: Tora and Talmud. so Try to use ancient Hebrew terms and or Talmudic expressions and prevent modern words that are not common in talmudic texts """) num_results: int = Field(description="the maximum number of results to return. Default: 10", default=10) index_path = INDEX_PATH gdrive_index_id = os.getenv("GDRIVE_INDEX_ID", "1lpbBCPimwcNfC0VZOlQueA4SHNGIp5_t") if not os.path.exists(index_path): try: zip_path = "index.zip" url = f"https://drive.google.com/uc?id={gdrive_index_id}" gdown.download(url, zip_path, quiet=False) with zipfile.ZipFile(zip_path, 'r') as zip_ref: zip_ref.extractall(".") os.remove(zip_path) except Exception as e: raise Exception(f"failed to download index: {e}") try: tantivy = TantivySearch(index_path) tantivy.validate_index() except Exception as e: raise Exception(f"failed to create index: {e}") @tool(args_schema=SearchArgs) def search( query: str, num_results: int = 10): """Searches the index for the given query.""" results = tantivy.search(query, num_results) formatted_results = [] for result in results: formatted_results.append({ 'text': result.get('text', 'N/A'), 'reference': result.get('reference', 'N/A') }) return formatted_results @tool(args_schema=ReadTextArgs) def read_text(reference: str )->str: """Retrieves the text for a given reference. """ text = sefaria_get_text(reference) return { 'text': str(text), 'reference': reference } @tool def get_commentaries(reference: str, num_results: int = 10)->str: """Retrieves references to all available commentaries on the given verse.""" commentaries = sefaria_get_commentaries(reference) return { 'text': '\n'.join(commentaries) if isinstance(commentaries, list) else str(commentaries), 'reference': f"Commentaries on {reference}" }