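"""
Educational Search Reranker: a small Flask service that forwards a query to a
SearXNG instance and reranks the returned results by educational quality using
the HuggingFaceTB/fineweb-edu-classifier model.
"""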
import os
import sys
import logging
import torch
from flask import Flask, request, jsonify
import requests
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),  # Log to console
        logging.FileHandler('/tmp/search_app.log')  # Log to file
    ]
)
logger = logging.getLogger(__name__)
# Set cache directory explicitly
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
os.makedirs('/tmp/huggingface_cache', exist_ok=True)
logger.info("π Initializing Educational Search Reranker Application")
logger.info(f"Cache directory: {os.environ['TRANSFORMERS_CACHE']}")
app = Flask(__name__)
# Define the SearXNG instance URL
SEARXNG_INSTANCE_URL = "https://oscarwang2-searxng.hf.space/search"
# Load the educational content classifier with explicit cache directory
def load_model_with_retry(max_retries=3):
    logger.info("Attempting to load educational content classifier...")
    for attempt in range(max_retries):
        try:
            logger.info(f"Loading attempt {attempt + 1}...")
            # Log system info
            logger.info(f"Python Version: {sys.version}")
            logger.info(f"Torch Version: {torch.__version__}")
            logger.info("Loading tokenizer...")
            tokenizer = AutoTokenizer.from_pretrained(
                "HuggingFaceTB/fineweb-edu-classifier",
                cache_dir='/tmp/huggingface_cache'
            )
            logger.info("Loading classification model...")
            model = AutoModelForSequenceClassification.from_pretrained(
                "HuggingFaceTB/fineweb-edu-classifier",
                cache_dir='/tmp/huggingface_cache'
            )
            logger.info("Model and tokenizer loaded successfully!")
            return tokenizer, model
        except Exception as e:
            logger.error(f"Model loading attempt {attempt + 1} failed: {e}")
            logger.error(f"Detailed error: {sys.exc_info()}")
            if attempt == max_retries - 1:
                logger.critical("Failed to load model after all attempts!")
                raise
# Load models at startup
try:
    tokenizer, model = load_model_with_retry()
except Exception as startup_error:
    logger.critical(f"Startup failed: {startup_error}")
    tokenizer, model = None, None
def classify_educational_quality(text):
    """
    Classify the educational quality of a given text snippet.
    """
    if tokenizer is None or model is None:
        logger.warning("Model not initialized. Returning default score.")
        return 0
    try:
        logger.info(f"Classifying text (first 50 chars): {text[:50]}...")
        # Prepare input for the model
        inputs = tokenizer(text, return_tensors="pt", padding="longest", truncation=True)
        # Get model outputs
        with torch.no_grad():
            outputs = model(**inputs)
        # Extract the logits and convert to a score
        logits = outputs.logits.squeeze(-1).float().detach().numpy()
        score = logits.item()
        logger.info(f"Educational quality score: {score}")
        return score
    except Exception as e:
        logger.error(f"Classification error: {e}")
        return 0  # Default score if classification fails
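# Note on the classifier above: per its model card, fineweb-edu-classifier is a
# regression model whose scores fall on a roughly 0-5 scale, with higher values
# indicating more educational text.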
@app.route('/search', methods=['GET'])
def search():
    # Get the search term from query parameters
    search_term = request.args.get('q', '')
    logger.info(f"Received search query: {search_term}")
    if not search_term:
        logger.warning("No search term provided")
        return jsonify({'error': 'No search term provided'}), 400
    # Define the query parameters for the SearXNG API
    params = {
        'q': search_term,
        'format': 'json',
        'categories': 'general'
    }
    try:
        logger.info("Sending request to SearXNG search API...")
        # Make the request to the SearXNG API
        response = requests.get(SEARXNG_INSTANCE_URL, params=params)
        # Check the response status code
        if response.status_code == 200:
            logger.info("Received successful response from SearXNG")
            data = response.json()
            # Retrieve the first 30 results
            results = data.get('results', [])[:30]
            logger.info(f"Total results found: {len(results)}")
            # Classify and score educational quality for each result
            scored_snippets = []
            for idx, result in enumerate(results, 1):
                snippet = {
                    'title': result.get('title', 'No title'),
                    'snippet': result.get('content', 'No snippet available'),
                    'url': result.get('url', 'No URL')
                }
                # Combine title and snippet for classification
                full_text = f"{snippet['title']} {snippet['snippet']}"
                # Classify educational quality
                edu_score = classify_educational_quality(full_text)
                snippet['educational_score'] = edu_score
                scored_snippets.append(snippet)
                logger.info(f"Result {idx}: URL={snippet['url']}, Score={edu_score}")
            # Sort results by educational score in descending order
            sorted_snippets = sorted(scored_snippets, key=lambda x: x['educational_score'], reverse=True)
            logger.info("Results sorted by educational quality")
            return jsonify(sorted_snippets)
        else:
            logger.error(f"SearXNG API error: {response.status_code}")
            return jsonify({'error': f'SearXNG API error: {response.status_code}'}), response.status_code
    except Exception as e:
        logger.error(f"Search processing error: {e}")
        return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
logger.info("π Starting Flask application...")
# Run the Flask app on port 7860
app.run(host='0.0.0.0', port=7860, debug=True) |
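# Example usage (illustrative; assumes the app is running locally on port 7860):
#   curl "http://localhost:7860/search?q=photosynthesis"
# The endpoint responds with a JSON list of {title, snippet, url, educational_score}
# objects sorted by educational_score in descending order.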