import os
import sys
import logging
import torch
from flask import Flask, request, jsonify
import requests
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),  # Log to console
        logging.FileHandler('/tmp/search_app.log')  # Log to file
    ]
)
logger = logging.getLogger(__name__)

# Set cache directory explicitly
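# (Note: newer transformers releases prefer HF_HOME; TRANSFORMERS_CACHE is
# deprecated but still honored, so this explicit setting keeps working.)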
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
os.makedirs('/tmp/huggingface_cache', exist_ok=True)

logger.info("πŸš€ Initializing Educational Search Reranker Application")
logger.info(f"Cache directory: {os.environ['TRANSFORMERS_CACHE']}")

app = Flask(__name__)

# Define the SearXNG instance URL
SEARXNG_INSTANCE_URL = "https://oscarwang2-searxng.hf.space/search"
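# Assumption: the SearXNG instance above has its JSON output format enabled;
# otherwise requests sent with 'format': 'json' are typically rejected.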

# Load the educational content classifier with explicit cache directory
def load_model_with_retry(max_retries=3):
    logger.info("Attempting to load educational content classifier...")
    for attempt in range(max_retries):
        try:
            logger.info(f"Loading attempt {attempt + 1}...")
            
            # Log system info
            logger.info(f"Python Version: {sys.version}")
            logger.info(f"Torch Version: {torch.__version__}")
            
            logger.info("Loading tokenizer...")
            tokenizer = AutoTokenizer.from_pretrained(
                "HuggingFaceTB/fineweb-edu-classifier", 
                cache_dir='/tmp/huggingface_cache'
            )
            
            logger.info("Loading classification model...")
            model = AutoModelForSequenceClassification.from_pretrained(
                "HuggingFaceTB/fineweb-edu-classifier", 
                cache_dir='/tmp/huggingface_cache'
            )
            
            logger.info("βœ… Model and tokenizer loaded successfully!")
            return tokenizer, model
        
        except Exception as e:
            logger.error(f"Model loading attempt {attempt + 1} failed: {e}")
            logger.error(f"Detailed error: {sys.exc_info()}")
            
            if attempt == max_retries - 1:
                logger.critical("❌ Failed to load model after all attempts!")
                raise

# Load models at startup
try:
    tokenizer, model = load_model_with_retry()
except Exception as startup_error:
    logger.critical(f"Startup failed: {startup_error}")
    tokenizer, model = None, None

def classify_educational_quality(text):
    """
    Classify the educational quality of a given text snippet
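
    The fineweb-edu-classifier uses a single-output regression head, so the
    returned value is one logit, roughly on a 0-5 scale (higher = more
    educational).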
    """
    if tokenizer is None or model is None:
        logger.warning("Model not initialized. Returning default score.")
        return 0
    
    try:
        logger.info(f"Classifying text (first 50 chars): {text[:50]}...")
        
        # Prepare input for the model
        inputs = tokenizer(text, return_tensors="pt", padding="longest", truncation=True)
        
        # Get model outputs
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Extract the logits and convert to a score
        logits = outputs.logits.squeeze(-1).float().detach().numpy()
        score = logits.item()
        
        logger.info(f"Educational quality score: {score}")
        return score
    
    except Exception as e:
        logger.error(f"Classification error: {e}")
        return 0  # Default score if classification fails

@app.route('/search', methods=['GET'])
def search():
    # Get the search term from query parameters
    search_term = request.args.get('q', '')
    
    logger.info(f"πŸ” Received search query: {search_term}")
    
    if not search_term:
        logger.warning("No search term provided")
        return jsonify({'error': 'No search term provided'}), 400
    
    # Define the query parameters for the SearXNG API
    params = {
        'q': search_term,
        'format': 'json',
        'categories': 'general'
    }
    
    try:
        logger.info("Sending request to SearXNG search API...")
        # Make the request to the SearXNG API
        response = requests.get(SEARXNG_INSTANCE_URL, params=params, timeout=30)  # avoid hanging indefinitely if SearXNG is slow
        
        # Check the response status code
        if response.status_code == 200:
            logger.info("Received successful response from SearXNG")
            data = response.json()
            # Retrieve the first 30 results
            results = data.get('results', [])[:30]
            
            logger.info(f"Total results found: {len(results)}")
            
            # Classify and score educational quality for each result
            scored_snippets = []
            for idx, result in enumerate(results, 1):
                snippet = {
                    'title': result.get('title', 'No title'),
                    'snippet': result.get('content', 'No snippet available'),
                    'url': result.get('url', 'No URL')
                }
                
                # Combine title and snippet for classification
                full_text = f"{snippet['title']} {snippet['snippet']}"
                
                # Classify educational quality
                edu_score = classify_educational_quality(full_text)
                
                snippet['educational_score'] = edu_score
                scored_snippets.append(snippet)
                
                logger.info(f"Result {idx}: URL={snippet['url']}, Score={edu_score}")
            
            # Sort results by educational score in descending order
            sorted_snippets = sorted(scored_snippets, key=lambda x: x['educational_score'], reverse=True)
            
            logger.info("πŸ† Results sorted by educational quality")
            return jsonify(sorted_snippets)
        
        else:
            logger.error(f"SearXNG API error: {response.status_code}")
            return jsonify({'error': f'SearXNG API error: {response.status_code}'}), response.status_code
    
    except Exception as e:
        logger.error(f"Search processing error: {e}")
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    logger.info("🌐 Starting Flask application...")
    # Run the Flask app on port 7860
    app.run(host='0.0.0.0', port=7860, debug=True)
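
# Example (assumed) usage once the app is running locally on port 7860:
#   curl "http://localhost:7860/search?q=photosynthesis"
# The response is a JSON list of results, each carrying 'title', 'snippet',
# 'url', and 'educational_score', sorted by score in descending order.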