import os
import sys
import logging
import torch
from flask import Flask, request, jsonify
import requests
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),          # Log to console
        logging.FileHandler('/tmp/search_app.log')  # Log to file
    ]
)
logger = logging.getLogger(__name__)
# Set cache directory explicitly
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
os.makedirs('/tmp/huggingface_cache', exist_ok=True)

logger.info("Initializing Educational Search Reranker Application")
logger.info(f"Cache directory: {os.environ['TRANSFORMERS_CACHE']}")
app = Flask(__name__)

# Define the SearXNG instance URL
SEARXNG_INSTANCE_URL = "https://oscarwang2-searxng.hf.space/search"
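
# Note (assumption about the instance's configuration): the SearXNG instance above must
# have JSON output enabled for the 'format': 'json' request parameter used below to be
# honored; on a self-hosted instance this is typically done via the search formats list
# in settings.yml.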
# Load the educational content classifier with explicit cache directory
def load_model_with_retry(max_retries=3):
    logger.info("Attempting to load educational content classifier...")
    for attempt in range(max_retries):
        try:
            logger.info(f"Loading attempt {attempt + 1}...")

            # Log system info
            logger.info(f"Python Version: {sys.version}")
            logger.info(f"Torch Version: {torch.__version__}")

            logger.info("Loading tokenizer...")
            tokenizer = AutoTokenizer.from_pretrained(
                "HuggingFaceTB/fineweb-edu-classifier",
                cache_dir='/tmp/huggingface_cache'
            )

            logger.info("Loading classification model...")
            model = AutoModelForSequenceClassification.from_pretrained(
                "HuggingFaceTB/fineweb-edu-classifier",
                cache_dir='/tmp/huggingface_cache'
            )

            logger.info("Model and tokenizer loaded successfully!")
            return tokenizer, model
        except Exception as e:
            logger.error(f"Model loading attempt {attempt + 1} failed: {e}")
            logger.error(f"Detailed error: {sys.exc_info()}")
            if attempt == max_retries - 1:
                logger.critical("Failed to load model after all attempts!")
                raise
# Load models at startup
try:
    tokenizer, model = load_model_with_retry()
except Exception as startup_error:
    logger.critical(f"Startup failed: {startup_error}")
    tokenizer, model = None, None
def classify_educational_quality(text):
    """
    Classify the educational quality of a given text snippet.
    """
    if tokenizer is None or model is None:
        logger.warning("Model not initialized. Returning default score.")
        return 0
    try:
        logger.info(f"Classifying text (first 50 chars): {text[:50]}...")

        # Prepare input for the model
        inputs = tokenizer(text, return_tensors="pt", padding="longest", truncation=True)

        # Get model outputs
        with torch.no_grad():
            outputs = model(**inputs)

        # Extract the logits and convert to a score
        logits = outputs.logits.squeeze(-1).float().detach().numpy()
        score = logits.item()
        logger.info(f"Educational quality score: {score}")
        return score
    except Exception as e:
        logger.error(f"Classification error: {e}")
        return 0  # Default score if classification fails
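
# Note on the score: HuggingFaceTB/fineweb-edu-classifier uses a regression-style head,
# and its raw logit is usually read as an educational-quality score on roughly a 0-5
# scale (higher = more educational). This app only uses the value for relative ranking,
# so no clamping or rescaling is applied here.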
@app.route('/search', methods=['GET'])  # Search endpoint (route path '/search' assumed)
def search():
    # Get the search term from query parameters
    search_term = request.args.get('q', '')
    logger.info(f"Received search query: {search_term}")

    if not search_term:
        logger.warning("No search term provided")
        return jsonify({'error': 'No search term provided'}), 400

    # Define the query parameters for the SearXNG API
    params = {
        'q': search_term,
        'format': 'json',
        'categories': 'general'
    }

    try:
        logger.info("Sending request to SearXNG search API...")
        # Make the request to the SearXNG API
        response = requests.get(SEARXNG_INSTANCE_URL, params=params)

        # Check the response status code
        if response.status_code == 200:
            logger.info("Received successful response from SearXNG")
            data = response.json()

            # Retrieve the first 30 results
            results = data.get('results', [])[:30]
            logger.info(f"Total results found: {len(results)}")

            # Classify and score educational quality for each result
            scored_snippets = []
            for idx, result in enumerate(results, 1):
                snippet = {
                    'title': result.get('title', 'No title'),
                    'snippet': result.get('content', 'No snippet available'),
                    'url': result.get('url', 'No URL')
                }

                # Combine title and snippet for classification
                full_text = f"{snippet['title']} {snippet['snippet']}"

                # Classify educational quality
                edu_score = classify_educational_quality(full_text)
                snippet['educational_score'] = edu_score
                scored_snippets.append(snippet)

                logger.info(f"Result {idx}: URL={snippet['url']}, Score={edu_score}")

            # Sort results by educational score in descending order
            sorted_snippets = sorted(scored_snippets, key=lambda x: x['educational_score'], reverse=True)
            logger.info("Results sorted by educational quality")

            return jsonify(sorted_snippets)
        else:
            logger.error(f"SearXNG API error: {response.status_code}")
            return jsonify({'error': f'SearXNG API error: {response.status_code}'}), response.status_code
    except Exception as e:
        logger.error(f"Search processing error: {e}")
        return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
    logger.info("Starting Flask application...")
    # Run the Flask app on port 7860
    app.run(host='0.0.0.0', port=7860, debug=True)
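
# Example usage (sketch, assuming the '/search' route added above): with the server
# running on port 7860, a reranked result list can be requested with, e.g.:
#   curl "http://localhost:7860/search?q=photosynthesis"
# The response is a JSON array of {title, snippet, url, educational_score} objects,
# sorted by descending educational_score.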