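"""
Educational Search Reranker: a small Flask service that forwards a query to a
SearXNG instance and reranks the returned results by educational quality using
the HuggingFaceTB/fineweb-edu-classifier model.
"""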
import os
import sys
import logging
import torch
from flask import Flask, request, jsonify
import requests
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),  # Log to console
        logging.FileHandler('/tmp/search_app.log')  # Log to file
    ]
)
logger = logging.getLogger(__name__)
# Set cache directory explicitly
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
os.makedirs('/tmp/huggingface_cache', exist_ok=True)
logger.info("π Initializing Educational Search Reranker Application")
logger.info(f"Cache directory: {os.environ['TRANSFORMERS_CACHE']}")
app = Flask(__name__)
# Define the SearXNG instance URL
SEARXNG_INSTANCE_URL = "https://oscarwang2-searxng.hf.space/search"
# Load the educational content classifier with explicit cache directory
def load_model_with_retry(max_retries=3):
    logger.info("Attempting to load educational content classifier...")
    for attempt in range(max_retries):
        try:
            logger.info(f"Loading attempt {attempt + 1}...")
            # Log system info
            logger.info(f"Python Version: {sys.version}")
            logger.info(f"Torch Version: {torch.__version__}")
            logger.info("Loading tokenizer...")
            tokenizer = AutoTokenizer.from_pretrained(
                "HuggingFaceTB/fineweb-edu-classifier",
                cache_dir='/tmp/huggingface_cache'
            )
            logger.info("Loading classification model...")
            model = AutoModelForSequenceClassification.from_pretrained(
                "HuggingFaceTB/fineweb-edu-classifier",
                cache_dir='/tmp/huggingface_cache'
            )
            logger.info("Model and tokenizer loaded successfully!")
            return tokenizer, model
        except Exception as e:
            logger.error(f"Model loading attempt {attempt + 1} failed: {e}")
            logger.error(f"Detailed error: {sys.exc_info()}")
            if attempt == max_retries - 1:
                logger.critical("Failed to load model after all attempts!")
                raise
# Load models at startup
try:
    tokenizer, model = load_model_with_retry()
except Exception as startup_error:
    logger.critical(f"Startup failed: {startup_error}")
    tokenizer, model = None, None
def classify_educational_quality(text):
    """
    Classify the educational quality of a given text snippet.
    """
    if tokenizer is None or model is None:
        logger.warning("Model not initialized. Returning default score.")
        return 0
    try:
        logger.info(f"Classifying text (first 50 chars): {text[:50]}...")
        # Prepare input for the model
        inputs = tokenizer(text, return_tensors="pt", padding="longest", truncation=True)
        # Get model outputs
        with torch.no_grad():
            outputs = model(**inputs)
        # Extract the logits and convert to a score
        logits = outputs.logits.squeeze(-1).float().detach().numpy()
        score = logits.item()
        logger.info(f"Educational quality score: {score}")
        return score
    except Exception as e:
        logger.error(f"Classification error: {e}")
        return 0  # Default score if classification fails
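# Note on the classifier above: per its model card, fineweb-edu-classifier is a
# regression model whose scores fall on a roughly 0-5 scale, with higher values
# indicating more educational text.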
@app.route('/search', methods=['GET'])
def search():
    # Get the search term from query parameters
    search_term = request.args.get('q', '')
    logger.info(f"Received search query: {search_term}")
    if not search_term:
        logger.warning("No search term provided")
        return jsonify({'error': 'No search term provided'}), 400
    # Define the query parameters for the SearXNG API
    params = {
        'q': search_term,
        'format': 'json',
        'categories': 'general'
    }
    try:
        logger.info("Sending request to SearXNG search API...")
        # Make the request to the SearXNG API
        response = requests.get(SEARXNG_INSTANCE_URL, params=params)
        # Check the response status code
        if response.status_code == 200:
            logger.info("Received successful response from SearXNG")
            data = response.json()
            # Retrieve the first 30 results
            results = data.get('results', [])[:30]
            logger.info(f"Total results found: {len(results)}")
            # Classify and score educational quality for each result
            scored_snippets = []
            for idx, result in enumerate(results, 1):
                snippet = {
                    'title': result.get('title', 'No title'),
                    'snippet': result.get('content', 'No snippet available'),
                    'url': result.get('url', 'No URL')
                }
                # Combine title and snippet for classification
                full_text = f"{snippet['title']} {snippet['snippet']}"
                # Classify educational quality
                edu_score = classify_educational_quality(full_text)
                snippet['educational_score'] = edu_score
                scored_snippets.append(snippet)
                logger.info(f"Result {idx}: URL={snippet['url']}, Score={edu_score}")
            # Sort results by educational score in descending order
            sorted_snippets = sorted(scored_snippets, key=lambda x: x['educational_score'], reverse=True)
            logger.info("Results sorted by educational quality")
            return jsonify(sorted_snippets)
        else:
            logger.error(f"SearXNG API error: {response.status_code}")
            return jsonify({'error': f'SearXNG API error: {response.status_code}'}), response.status_code
    except Exception as e:
        logger.error(f"Search processing error: {e}")
        return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
logger.info("π Starting Flask application...")
# Run the Flask app on port 7860
app.run(host='0.0.0.0', port=7860, debug=True) |
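# Example usage (illustrative; assumes the app is running locally on port 7860):
#   curl "http://localhost:7860/search?q=photosynthesis"
# The endpoint responds with a JSON list of {title, snippet, url, educational_score}
# objects sorted by educational_score in descending order.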