oscarwang2
committed on
Update app.py
app.py CHANGED
@@ -1,13 +1,29 @@
 import os
+import sys
+import logging
 import torch
 from flask import Flask, request, jsonify
 import requests
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s: %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout),  # Log to console
+        logging.FileHandler('/tmp/search_app.log')  # Log to file
+    ]
+)
+logger = logging.getLogger(__name__)
+
 # Set cache directory explicitly
 os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
 os.makedirs('/tmp/huggingface_cache', exist_ok=True)
 
+logger.info("Initializing Educational Search Reranker Application")
+logger.info(f"Cache directory: {os.environ['TRANSFORMERS_CACHE']}")
+
 app = Flask(__name__)
 
 # Define the SearXNG instance URL
@@ -15,36 +31,56 @@ SEARXNG_INSTANCE_URL = "https://oscarwang2-searxng.hf.space/search"
 
 # Load the educational content classifier with explicit cache directory
 def load_model_with_retry(max_retries=3):
+    logger.info("Attempting to load educational content classifier...")
     for attempt in range(max_retries):
         try:
+            logger.info(f"Loading attempt {attempt + 1}...")
+
+            # Log system info
+            logger.info(f"Python Version: {sys.version}")
+            logger.info(f"Torch Version: {torch.__version__}")
+
+            logger.info("Loading tokenizer...")
             tokenizer = AutoTokenizer.from_pretrained(
                 "HuggingFaceTB/fineweb-edu-classifier",
                 cache_dir='/tmp/huggingface_cache'
            )
+
+            logger.info("Loading classification model...")
             model = AutoModelForSequenceClassification.from_pretrained(
                 "HuggingFaceTB/fineweb-edu-classifier",
                 cache_dir='/tmp/huggingface_cache'
            )
+
+            logger.info("Model and tokenizer loaded successfully!")
             return tokenizer, model
+
         except Exception as e:
-
+            logger.error(f"Model loading attempt {attempt + 1} failed: {e}")
+            logger.error(f"Detailed error: {sys.exc_info()}")
+
             if attempt == max_retries - 1:
+                logger.critical("Failed to load model after all attempts!")
                 raise
 
 # Load models at startup
-tokenizer, model = load_model_with_retry()
+try:
+    tokenizer, model = load_model_with_retry()
+except Exception as startup_error:
+    logger.critical(f"Startup failed: {startup_error}")
+    tokenizer, model = None, None
 
 def classify_educational_quality(text):
     """
     Classify the educational quality of a given text snippet
-
-    Args:
-        text (str): Text snippet to classify
-
-    Returns:
-        float: Educational quality score
     """
+    if tokenizer is None or model is None:
+        logger.warning("Model not initialized. Returning default score.")
+        return 0
+
     try:
+        logger.info(f"Classifying text (first 50 chars): {text[:50]}...")
+
         # Prepare input for the model
         inputs = tokenizer(text, return_tensors="pt", padding="longest", truncation=True)
 
@@ -56,9 +92,11 @@ def classify_educational_quality(text):
         logits = outputs.logits.squeeze(-1).float().detach().numpy()
         score = logits.item()
 
+        logger.info(f"Educational quality score: {score}")
         return score
+
     except Exception as e:
-
+        logger.error(f"Classification error: {e}")
         return 0  # Default score if classification fails
 
 @app.route('/search', methods=['GET'])
@@ -66,7 +104,10 @@ def search():
     # Get the search term from query parameters
     search_term = request.args.get('q', '')
 
+    logger.info(f"Received search query: {search_term}")
+
     if not search_term:
+        logger.warning("No search term provided")
         return jsonify({'error': 'No search term provided'}), 400
 
     # Define the query parameters for the SearXNG API
@@ -77,18 +118,22 @@ def search():
     }
 
     try:
+        logger.info("Sending request to SearXNG search API...")
         # Make the request to the SearXNG API
         response = requests.get(SEARXNG_INSTANCE_URL, params=params)
 
         # Check the response status code
         if response.status_code == 200:
+            logger.info("Received successful response from SearXNG")
            data = response.json()
             # Retrieve the first 30 results
             results = data.get('results', [])[:30]
 
+            logger.info(f"Total results found: {len(results)}")
+
             # Classify and score educational quality for each result
             scored_snippets = []
-            for result in results:
+            for idx, result in enumerate(results, 1):
                 snippet = {
                     'title': result.get('title', 'No title'),
                     'snippet': result.get('content', 'No snippet available'),
@@ -103,17 +148,24 @@ def search():
 
                 snippet['educational_score'] = edu_score
                 scored_snippets.append(snippet)
+
+                logger.info(f"Result {idx}: URL={snippet['url']}, Score={edu_score}")
 
             # Sort results by educational score in descending order
             sorted_snippets = sorted(scored_snippets, key=lambda x: x['educational_score'], reverse=True)
 
+            logger.info("Results sorted by educational quality")
             return jsonify(sorted_snippets)
+
         else:
+            logger.error(f"SearXNG API error: {response.status_code}")
             return jsonify({'error': f'SearXNG API error: {response.status_code}'}), response.status_code
 
     except Exception as e:
+        logger.error(f"Search processing error: {e}")
         return jsonify({'error': str(e)}), 500
 
 if __name__ == '__main__':
+    logger.info("Starting Flask application...")
     # Run the Flask app on port 7860
     app.run(host='0.0.0.0', port=7860, debug=True)
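Once the Space is up, the reranked endpoint can be exercised with a short client call. This is a minimal sketch, not part of the commit: it assumes the app is reachable at localhost:7860 (matching the app.run call above) and uses the q parameter and the title/educational_score fields defined in the search handler; the example query string is arbitrary.

import requests

# Hit the /search route; adjust the base URL if the Space is accessed remotely.
resp = requests.get("http://localhost:7860/search", params={"q": "photosynthesis"})
resp.raise_for_status()

# Results arrive already sorted by the classifier's educational_score (descending).
for item in resp.json()[:5]:
    print(f"{item['educational_score']:.2f}  {item['title']}")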