import os
import sys
import logging
import torch
from flask import Flask, request, jsonify
import requests
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),          # Log to console
        logging.FileHandler('/tmp/search_app.log')  # Log to file
    ]
)
logger = logging.getLogger(__name__)
# Set cache directory explicitly
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
os.makedirs('/tmp/huggingface_cache', exist_ok=True)

logger.info("Initializing Educational Search Reranker Application")
logger.info(f"Cache directory: {os.environ['TRANSFORMERS_CACHE']}")
app = Flask(__name__)

# Define the SearXNG instance URL
SEARXNG_INSTANCE_URL = "https://oscarwang2-searxng.hf.space/search"
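
# Note (assumption about the instance's configuration): the SearXNG instance above must
# have JSON output enabled for the 'format': 'json' request parameter used below to be
# honored; on a self-hosted instance this is typically done via the search formats list
# in settings.yml.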
# Load the educational content classifier with explicit cache directory
def load_model_with_retry(max_retries=3):
    logger.info("Attempting to load educational content classifier...")
    for attempt in range(max_retries):
        try:
            logger.info(f"Loading attempt {attempt + 1}...")

            # Log system info
            logger.info(f"Python Version: {sys.version}")
            logger.info(f"Torch Version: {torch.__version__}")

            logger.info("Loading tokenizer...")
            tokenizer = AutoTokenizer.from_pretrained(
                "HuggingFaceTB/fineweb-edu-classifier",
                cache_dir='/tmp/huggingface_cache'
            )

            logger.info("Loading classification model...")
            model = AutoModelForSequenceClassification.from_pretrained(
                "HuggingFaceTB/fineweb-edu-classifier",
                cache_dir='/tmp/huggingface_cache'
            )

            logger.info("Model and tokenizer loaded successfully!")
            return tokenizer, model
        except Exception as e:
            logger.error(f"Model loading attempt {attempt + 1} failed: {e}")
            logger.error(f"Detailed error: {sys.exc_info()}")
            if attempt == max_retries - 1:
                logger.critical("Failed to load model after all attempts!")
                raise
# Load models at startup
try:
    tokenizer, model = load_model_with_retry()
except Exception as startup_error:
    logger.critical(f"Startup failed: {startup_error}")
    tokenizer, model = None, None
def classify_educational_quality(text):
    """
    Classify the educational quality of a given text snippet.
    """
    if tokenizer is None or model is None:
        logger.warning("Model not initialized. Returning default score.")
        return 0
    try:
        logger.info(f"Classifying text (first 50 chars): {text[:50]}...")

        # Prepare input for the model
        inputs = tokenizer(text, return_tensors="pt", padding="longest", truncation=True)

        # Get model outputs
        with torch.no_grad():
            outputs = model(**inputs)

        # Extract the logits and convert to a score
        logits = outputs.logits.squeeze(-1).float().detach().numpy()
        score = logits.item()
        logger.info(f"Educational quality score: {score}")
        return score
    except Exception as e:
        logger.error(f"Classification error: {e}")
        return 0  # Default score if classification fails
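
# Note on the score: HuggingFaceTB/fineweb-edu-classifier uses a regression-style head,
# and its raw logit is usually read as an educational-quality score on roughly a 0-5
# scale (higher = more educational). This app only uses the value for relative ranking,
# so no clamping or rescaling is applied here.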
@app.route('/search', methods=['GET'])  # Search endpoint (route path '/search' assumed)
def search():
    # Get the search term from query parameters
    search_term = request.args.get('q', '')
    logger.info(f"Received search query: {search_term}")

    if not search_term:
        logger.warning("No search term provided")
        return jsonify({'error': 'No search term provided'}), 400

    # Define the query parameters for the SearXNG API
    params = {
        'q': search_term,
        'format': 'json',
        'categories': 'general'
    }

    try:
        logger.info("Sending request to SearXNG search API...")
        # Make the request to the SearXNG API
        response = requests.get(SEARXNG_INSTANCE_URL, params=params)

        # Check the response status code
        if response.status_code == 200:
            logger.info("Received successful response from SearXNG")
            data = response.json()

            # Retrieve the first 30 results
            results = data.get('results', [])[:30]
            logger.info(f"Total results found: {len(results)}")

            # Classify and score educational quality for each result
            scored_snippets = []
            for idx, result in enumerate(results, 1):
                snippet = {
                    'title': result.get('title', 'No title'),
                    'snippet': result.get('content', 'No snippet available'),
                    'url': result.get('url', 'No URL')
                }

                # Combine title and snippet for classification
                full_text = f"{snippet['title']} {snippet['snippet']}"

                # Classify educational quality
                edu_score = classify_educational_quality(full_text)
                snippet['educational_score'] = edu_score
                scored_snippets.append(snippet)

                logger.info(f"Result {idx}: URL={snippet['url']}, Score={edu_score}")

            # Sort results by educational score in descending order
            sorted_snippets = sorted(scored_snippets, key=lambda x: x['educational_score'], reverse=True)
            logger.info("Results sorted by educational quality")

            return jsonify(sorted_snippets)
        else:
            logger.error(f"SearXNG API error: {response.status_code}")
            return jsonify({'error': f'SearXNG API error: {response.status_code}'}), response.status_code
    except Exception as e:
        logger.error(f"Search processing error: {e}")
        return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
    logger.info("Starting Flask application...")
    # Run the Flask app on port 7860
    app.run(host='0.0.0.0', port=7860, debug=True)
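
# Example usage (sketch, assuming the '/search' route added above): with the server
# running on port 7860, a reranked result list can be requested with, e.g.:
#   curl "http://localhost:7860/search?q=photosynthesis"
# The response is a JSON array of {title, snippet, url, educational_score} objects,
# sorted by descending educational_score.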