import os
import sys
import logging

import torch
import requests
from flask import Flask, request, jsonify
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),          # Log to console
        logging.FileHandler('/tmp/search_app.log')  # Log to file
    ]
)
logger = logging.getLogger(__name__)

# Set cache directory explicitly
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
os.makedirs('/tmp/huggingface_cache', exist_ok=True)

logger.info("🚀 Initializing Educational Search Reranker Application")
logger.info(f"Cache directory: {os.environ['TRANSFORMERS_CACHE']}")

app = Flask(__name__)

# Define the SearXNG instance URL
SEARXNG_INSTANCE_URL = "https://oscarwang2-searxng.hf.space/search"


# Load the educational content classifier with explicit cache directory
def load_model_with_retry(max_retries=3):
    logger.info("Attempting to load educational content classifier...")

    for attempt in range(max_retries):
        try:
            logger.info(f"Loading attempt {attempt + 1}...")

            # Log system info
            logger.info(f"Python Version: {sys.version}")
            logger.info(f"Torch Version: {torch.__version__}")

            logger.info("Loading tokenizer...")
            tokenizer = AutoTokenizer.from_pretrained(
                "HuggingFaceTB/fineweb-edu-classifier",
                cache_dir='/tmp/huggingface_cache'
            )

            logger.info("Loading classification model...")
            model = AutoModelForSequenceClassification.from_pretrained(
                "HuggingFaceTB/fineweb-edu-classifier",
                cache_dir='/tmp/huggingface_cache'
            )

            logger.info("✅ Model and tokenizer loaded successfully!")
            return tokenizer, model

        except Exception as e:
            logger.error(f"Model loading attempt {attempt + 1} failed: {e}")
            logger.error(f"Detailed error: {sys.exc_info()}")

            if attempt == max_retries - 1:
                logger.critical("❌ Failed to load model after all attempts!")
                raise


# Load models at startup
try:
    tokenizer, model = load_model_with_retry()
except Exception as startup_error:
    logger.critical(f"Startup failed: {startup_error}")
    tokenizer, model = None, None


def classify_educational_quality(text):
    """
    Classify the educational quality of a given text snippet.
    """
    if tokenizer is None or model is None:
        logger.warning("Model not initialized. Returning default score.")
        return 0

    try:
        logger.info(f"Classifying text (first 50 chars): {text[:50]}...")

        # Prepare input for the model
        inputs = tokenizer(text, return_tensors="pt", padding="longest", truncation=True)

        # Get model outputs
        with torch.no_grad():
            outputs = model(**inputs)

        # Extract the logits and convert to a score
        logits = outputs.logits.squeeze(-1).float().detach().numpy()
        score = logits.item()

        logger.info(f"Educational quality score: {score}")
        return score

    except Exception as e:
        logger.error(f"Classification error: {e}")
        return 0  # Default score if classification fails


@app.route('/search', methods=['GET'])
def search():
    # Get the search term from query parameters
    search_term = request.args.get('q', '')
    logger.info(f"🔍 Received search query: {search_term}")

    if not search_term:
        logger.warning("No search term provided")
        return jsonify({'error': 'No search term provided'}), 400

    # Define the query parameters for the SearXNG API
    params = {
        'q': search_term,
        'format': 'json',
        'categories': 'general'
    }

    try:
        logger.info("Sending request to SearXNG search API...")
        # Make the request to the SearXNG API
        response = requests.get(SEARXNG_INSTANCE_URL, params=params)

        # Check the response status code
        if response.status_code == 200:
            logger.info("Received successful response from SearXNG")
            data = response.json()

            # Retrieve the first 30 results
            results = data.get('results', [])[:30]
            logger.info(f"Total results found: {len(results)}")

            # Classify and score educational quality for each result
            scored_snippets = []
            for idx, result in enumerate(results, 1):
                snippet = {
                    'title': result.get('title', 'No title'),
                    'snippet': result.get('content', 'No snippet available'),
                    'url': result.get('url', 'No URL')
                }

                # Combine title and snippet for classification
                full_text = f"{snippet['title']} {snippet['snippet']}"

                # Classify educational quality
                edu_score = classify_educational_quality(full_text)
                snippet['educational_score'] = edu_score
                scored_snippets.append(snippet)

                logger.info(f"Result {idx}: URL={snippet['url']}, Score={edu_score}")

            # Sort results by educational score in descending order
            sorted_snippets = sorted(scored_snippets, key=lambda x: x['educational_score'], reverse=True)
            logger.info("🏆 Results sorted by educational quality")

            return jsonify(sorted_snippets)
        else:
            logger.error(f"SearXNG API error: {response.status_code}")
            return jsonify({'error': f'SearXNG API error: {response.status_code}'}), response.status_code

    except Exception as e:
        logger.error(f"Search processing error: {e}")
        return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    logger.info("🌐 Starting Flask application...")
    # Run the Flask app on port 7860
    app.run(host='0.0.0.0', port=7860, debug=True)
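
# Example client usage (a minimal sketch, not part of the app itself). Assuming the
# server is running locally on port 7860, a client could query the /search endpoint
# and print results ordered by educational score; the query string here is arbitrary:
#
#   import requests
#
#   resp = requests.get("http://localhost:7860/search", params={"q": "photosynthesis"})
#   resp.raise_for_status()
#   for item in resp.json():
#       print(f"{item['educational_score']:.2f}  {item['url']}")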