# full-search-api / app.py
# Author: oscarwang2
# Commit: "Update app.py" (d512680, verified)
import os
import sys
import logging
import torch
from flask import Flask, request, jsonify
import requests
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Logging setup: every record is sent both to stdout and to a log file in /tmp.
_console_handler = logging.StreamHandler(sys.stdout)
_file_handler = logging.FileHandler('/tmp/search_app.log')
logging.basicConfig(
    handlers=[_console_handler, _file_handler],
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)
# Module-level logger used by the rest of this file.
logger = logging.getLogger(__name__)
# Record the model cache location in the environment and make sure the
# directory exists before anything tries to write to it.
# NOTE(review): this assignment runs after `transformers` has been imported,
# so it may not influence the library's defaults; the loader passes
# cache_dir explicitly, which does not depend on it.
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
os.makedirs(os.environ['TRANSFORMERS_CACHE'], exist_ok=True)
logger.info("πŸš€ Initializing Educational Search Reranker Application")
logger.info(f"Cache directory: {os.environ['TRANSFORMERS_CACHE']}")

# Flask application object that the route handlers attach to.
app = Flask(__name__)

# Search endpoint of the SearXNG instance queried for raw results.
SEARXNG_INSTANCE_URL = "https://oscarwang2-searxng.hf.space/search"
# Load the educational content classifier with explicit cache directory
def load_model_with_retry(
    max_retries=3,
    *,
    model_name="HuggingFaceTB/fineweb-edu-classifier",
    cache_dir='/tmp/huggingface_cache',
):
    """Load the educational-content classifier, retrying on failure.

    Args:
        max_retries: Maximum number of load attempts; must be at least 1.
        model_name: Model identifier passed to ``from_pretrained`` for both
            the tokenizer and the classification model.
        cache_dir: Directory passed to ``from_pretrained`` as ``cache_dir``.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        Exception: Whatever the final failed attempt raised, re-raised
            unchanged.
    """
    # Validate up front: with zero attempts the loop below would never run
    # and the function would silently return None, which breaks the tuple
    # unpacking done by callers.
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    logger.info("Attempting to load educational content classifier...")
    # Environment details do not change between attempts, so log them once.
    logger.info(f"Python Version: {sys.version}")
    logger.info(f"Torch Version: {torch.__version__}")

    for attempt in range(1, max_retries + 1):
        try:
            logger.info(f"Loading attempt {attempt}...")
            logger.info("Loading tokenizer...")
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                cache_dir=cache_dir,
            )
            logger.info("Loading classification model...")
            model = AutoModelForSequenceClassification.from_pretrained(
                model_name,
                cache_dir=cache_dir,
            )
            logger.info("βœ… Model and tokenizer loaded successfully!")
            return tokenizer, model
        except Exception as e:
            # logger.exception records the full traceback along with the message.
            logger.exception(f"Model loading attempt {attempt} failed: {e}")
            if attempt == max_retries:
                logger.critical("❌ Failed to load model after all attempts!")
                raise
# Load the classifier once at import time. Both handles start out as None and
# are only replaced when loading succeeds, so a failed load is logged and the
# application keeps running without a model.
tokenizer, model = None, None
try:
    tokenizer, model = load_model_with_retry()
except Exception as startup_error:
    logger.critical(f"Startup failed: {startup_error}")
def classify_educational_quality(text):
    """
    Score how educational a text snippet is using the loaded classifier.

    Returns the model's output logit as a scalar, or 0 when the model is not
    available or any error occurs during classification.
    """
    model_ready = tokenizer is not None and model is not None
    if not model_ready:
        logger.warning("Model not initialized. Returning default score.")
        return 0

    try:
        logger.info(f"Classifying text (first 50 chars): {text[:50]}...")

        # Tokenize the snippet into PyTorch tensors.
        encoded = tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            truncation=True,
        )

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            prediction = model(**encoded)

        # Drop the trailing dimension, then pull the remaining value out as a scalar.
        raw_logits = prediction.logits.squeeze(-1)
        score = raw_logits.float().detach().numpy().item()
        logger.info(f"Educational quality score: {score}")
        return score
    except Exception as e:
        logger.error(f"Classification error: {e}")
        # Fall back to the default score when classification fails.
        return 0
# Upper bound, in seconds, on how long a single SearXNG request may take.
# Without a timeout `requests` waits indefinitely and a stalled upstream
# would tie up the worker handling the request.
SEARXNG_TIMEOUT_SECONDS = 30


def _score_results(results):
    """Turn raw SearXNG result dicts into response entries with an educational score."""
    scored_snippets = []
    for idx, result in enumerate(results, 1):
        snippet = {
            'title': result.get('title', 'No title'),
            'snippet': result.get('content', 'No snippet available'),
            'url': result.get('url', 'No URL'),
        }
        # Title and snippet are classified together as one piece of text.
        full_text = f"{snippet['title']} {snippet['snippet']}"
        edu_score = classify_educational_quality(full_text)
        snippet['educational_score'] = edu_score
        scored_snippets.append(snippet)
        logger.info(f"Result {idx}: URL={snippet['url']}, Score={edu_score}")
    return scored_snippets


@app.route('/search', methods=['GET'])
def search():
    """Handle GET /search: query SearXNG and rerank results by educational score.

    Reads the search term from the ``q`` query parameter, fetches results
    from the SearXNG instance, scores the first 30 with the classifier and
    returns them as a JSON list sorted by ``educational_score``, highest
    first.

    Error responses are JSON objects with an ``error`` key:
        400 when ``q`` is missing or empty,
        the upstream status code when SearXNG does not answer with 200,
        504 when the SearXNG request times out,
        500 for any other failure.
    """
    # Get the search term from query parameters
    search_term = request.args.get('q', '')
    logger.info(f"πŸ” Received search query: {search_term}")
    if not search_term:
        logger.warning("No search term provided")
        return jsonify({'error': 'No search term provided'}), 400

    # Query parameters for the SearXNG API
    params = {
        'q': search_term,
        'format': 'json',
        'categories': 'general',
    }

    try:
        logger.info("Sending request to SearXNG search API...")
        response = requests.get(
            SEARXNG_INSTANCE_URL,
            params=params,
            timeout=SEARXNG_TIMEOUT_SECONDS,
        )

        # Anything other than 200 is passed back to the client as an error.
        if response.status_code != 200:
            logger.error(f"SearXNG API error: {response.status_code}")
            return jsonify({'error': f'SearXNG API error: {response.status_code}'}), response.status_code

        logger.info("Received successful response from SearXNG")
        data = response.json()

        # Only the first 30 results are scored.
        results = data.get('results', [])[:30]
        logger.info(f"Total results found: {len(results)}")

        # Highest educational score first.
        sorted_snippets = sorted(
            _score_results(results),
            key=lambda x: x['educational_score'],
            reverse=True,
        )
        logger.info("πŸ† Results sorted by educational quality")
        return jsonify(sorted_snippets)
    except requests.exceptions.Timeout:
        logger.error(f"SearXNG request timed out after {SEARXNG_TIMEOUT_SECONDS} seconds")
        return jsonify({'error': 'SearXNG request timed out'}), 504
    except Exception as e:
        # logger.exception keeps the traceback in the log for diagnosis.
        logger.exception(f"Search processing error: {e}")
        return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
    logger.info("🌐 Starting Flask application...")
    # Debug mode turns on the Werkzeug interactive debugger, which lets
    # anyone who can reach the server execute arbitrary Python. Because the
    # server binds to all interfaces, debug is opt-in: set FLASK_DEBUG to
    # 1/true/yes/on to enable it for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    # Run the Flask app on port 7860
    app.run(host='0.0.0.0', port=7860, debug=debug_enabled)