oscarwang2 committed on
Commit
d512680
·
verified ·
1 Parent(s): 3176bee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -10
app.py CHANGED
@@ -1,13 +1,29 @@
1
  import os
 
 
2
  import torch
3
  from flask import Flask, request, jsonify
4
  import requests
5
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
6
 
 
 
 
 
 
 
 
 
 
 
 
7
  # Set cache directory explicitly
8
  os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
9
  os.makedirs('/tmp/huggingface_cache', exist_ok=True)
10
 
 
 
 
11
  app = Flask(__name__)
12
 
13
  # Define the SearXNG instance URL
@@ -15,36 +31,56 @@ SEARXNG_INSTANCE_URL = "https://oscarwang2-searxng.hf.space/search"
15
 
16
  # Load the educational content classifier with explicit cache directory
17
  def load_model_with_retry(max_retries=3):
 
18
  for attempt in range(max_retries):
19
  try:
 
 
 
 
 
 
 
20
  tokenizer = AutoTokenizer.from_pretrained(
21
  "HuggingFaceTB/fineweb-edu-classifier",
22
  cache_dir='/tmp/huggingface_cache'
23
  )
 
 
24
  model = AutoModelForSequenceClassification.from_pretrained(
25
  "HuggingFaceTB/fineweb-edu-classifier",
26
  cache_dir='/tmp/huggingface_cache'
27
  )
 
 
28
  return tokenizer, model
 
29
  except Exception as e:
30
- print(f"Model loading attempt {attempt + 1} failed: {e}")
 
 
31
  if attempt == max_retries - 1:
 
32
  raise
33
 
34
  # Load models at startup
35
- tokenizer, model = load_model_with_retry()
 
 
 
 
36
 
37
  def classify_educational_quality(text):
38
  """
39
  Classify the educational quality of a given text snippet
40
-
41
- Args:
42
- text (str): Text snippet to classify
43
-
44
- Returns:
45
- float: Educational quality score
46
  """
 
 
 
 
47
  try:
 
 
48
  # Prepare input for the model
49
  inputs = tokenizer(text, return_tensors="pt", padding="longest", truncation=True)
50
 
@@ -56,9 +92,11 @@ def classify_educational_quality(text):
56
  logits = outputs.logits.squeeze(-1).float().detach().numpy()
57
  score = logits.item()
58
 
 
59
  return score
 
60
  except Exception as e:
61
- print(f"Error in classification: {e}")
62
  return 0 # Default score if classification fails
63
 
64
  @app.route('/search', methods=['GET'])
@@ -66,7 +104,10 @@ def search():
66
  # Get the search term from query parameters
67
  search_term = request.args.get('q', '')
68
 
 
 
69
  if not search_term:
 
70
  return jsonify({'error': 'No search term provided'}), 400
71
 
72
  # Define the query parameters for the SearXNG API
@@ -77,18 +118,22 @@ def search():
77
  }
78
 
79
  try:
 
80
  # Make the request to the SearXNG API
81
  response = requests.get(SEARXNG_INSTANCE_URL, params=params)
82
 
83
  # Check the response status code
84
  if response.status_code == 200:
 
85
  data = response.json()
86
  # Retrieve the first 30 results
87
  results = data.get('results', [])[:30]
88
 
 
 
89
  # Classify and score educational quality for each result
90
  scored_snippets = []
91
- for result in results:
92
  snippet = {
93
  'title': result.get('title', 'No title'),
94
  'snippet': result.get('content', 'No snippet available'),
@@ -103,17 +148,24 @@ def search():
103
 
104
  snippet['educational_score'] = edu_score
105
  scored_snippets.append(snippet)
 
 
106
 
107
  # Sort results by educational score in descending order
108
  sorted_snippets = sorted(scored_snippets, key=lambda x: x['educational_score'], reverse=True)
109
 
 
110
  return jsonify(sorted_snippets)
 
111
  else:
 
112
  return jsonify({'error': f'SearXNG API error: {response.status_code}'}), response.status_code
113
 
114
  except Exception as e:
 
115
  return jsonify({'error': str(e)}), 500
116
 
117
  if __name__ == '__main__':
 
118
  # Run the Flask app on port 7860
119
  app.run(host='0.0.0.0', port=7860, debug=True)
 
1
  import os
2
+ import sys
3
+ import logging
4
  import torch
5
  from flask import Flask, request, jsonify
6
  import requests
7
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
8
 
9
# Logging setup: INFO and above, sent both to stdout and to a log file.
# The handlers are built first so the basicConfig call stays short.
_console_handler = logging.StreamHandler(sys.stdout)  # Log to console
_file_handler = logging.FileHandler('/tmp/search_app.log')  # Log to file

logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
    handlers=[_console_handler, _file_handler],
)

# Module-level logger used by the rest of the application.
logger = logging.getLogger(__name__)
19
+
20
  # Set cache directory explicitly
21
  os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
22
  os.makedirs('/tmp/huggingface_cache', exist_ok=True)
23
 
24
+ logger.info("πŸš€ Initializing Educational Search Reranker Application")
25
+ logger.info(f"Cache directory: {os.environ['TRANSFORMERS_CACHE']}")
26
+
27
  app = Flask(__name__)
28
 
29
  # Define the SearXNG instance URL
 
31
 
32
  # Load the educational content classifier with explicit cache directory
33
  def load_model_with_retry(max_retries=3):
34
+ logger.info("Attempting to load educational content classifier...")
35
  for attempt in range(max_retries):
36
  try:
37
+ logger.info(f"Loading attempt {attempt + 1}...")
38
+
39
+ # Log system info
40
+ logger.info(f"Python Version: {sys.version}")
41
+ logger.info(f"Torch Version: {torch.__version__}")
42
+
43
+ logger.info("Loading tokenizer...")
44
  tokenizer = AutoTokenizer.from_pretrained(
45
  "HuggingFaceTB/fineweb-edu-classifier",
46
  cache_dir='/tmp/huggingface_cache'
47
  )
48
+
49
+ logger.info("Loading classification model...")
50
  model = AutoModelForSequenceClassification.from_pretrained(
51
  "HuggingFaceTB/fineweb-edu-classifier",
52
  cache_dir='/tmp/huggingface_cache'
53
  )
54
+
55
+ logger.info("βœ… Model and tokenizer loaded successfully!")
56
  return tokenizer, model
57
+
58
  except Exception as e:
59
+ logger.error(f"Model loading attempt {attempt + 1} failed: {e}")
60
+ logger.error(f"Detailed error: {sys.exc_info()}")
61
+
62
  if attempt == max_retries - 1:
63
+ logger.critical("❌ Failed to load model after all attempts!")
64
  raise
65
 
66
  # Load models at startup
67
# Start from "no model" so the names always exist; a failed load leaves them
# as None and the application still starts.
tokenizer = model = None
try:
    tokenizer, model = load_model_with_retry()
except Exception as startup_error:
    logger.critical(f"Startup failed: {startup_error}")
72
 
73
  def classify_educational_quality(text):
74
  """
75
  Classify the educational quality of a given text snippet
 
 
 
 
 
 
76
  """
77
+ if tokenizer is None or model is None:
78
+ logger.warning("Model not initialized. Returning default score.")
79
+ return 0
80
+
81
  try:
82
+ logger.info(f"Classifying text (first 50 chars): {text[:50]}...")
83
+
84
  # Prepare input for the model
85
  inputs = tokenizer(text, return_tensors="pt", padding="longest", truncation=True)
86
 
 
92
  logits = outputs.logits.squeeze(-1).float().detach().numpy()
93
  score = logits.item()
94
 
95
+ logger.info(f"Educational quality score: {score}")
96
  return score
97
+
98
  except Exception as e:
99
+ logger.error(f"Classification error: {e}")
100
  return 0 # Default score if classification fails
101
 
102
  @app.route('/search', methods=['GET'])
 
104
  # Get the search term from query parameters
105
  search_term = request.args.get('q', '')
106
 
107
+ logger.info(f"πŸ” Received search query: {search_term}")
108
+
109
  if not search_term:
110
+ logger.warning("No search term provided")
111
  return jsonify({'error': 'No search term provided'}), 400
112
 
113
  # Define the query parameters for the SearXNG API
 
118
  }
119
 
120
  try:
121
+ logger.info("Sending request to SearXNG search API...")
122
  # Make the request to the SearXNG API
123
  response = requests.get(SEARXNG_INSTANCE_URL, params=params)
124
 
125
  # Check the response status code
126
  if response.status_code == 200:
127
+ logger.info("Received successful response from SearXNG")
128
  data = response.json()
129
  # Retrieve the first 30 results
130
  results = data.get('results', [])[:30]
131
 
132
+ logger.info(f"Total results found: {len(results)}")
133
+
134
  # Classify and score educational quality for each result
135
  scored_snippets = []
136
+ for idx, result in enumerate(results, 1):
137
  snippet = {
138
  'title': result.get('title', 'No title'),
139
  'snippet': result.get('content', 'No snippet available'),
 
148
 
149
  snippet['educational_score'] = edu_score
150
  scored_snippets.append(snippet)
151
+
152
+ logger.info(f"Result {idx}: URL={snippet['url']}, Score={edu_score}")
153
 
154
  # Sort results by educational score in descending order
155
  sorted_snippets = sorted(scored_snippets, key=lambda x: x['educational_score'], reverse=True)
156
 
157
+ logger.info("πŸ† Results sorted by educational quality")
158
  return jsonify(sorted_snippets)
159
+
160
  else:
161
+ logger.error(f"SearXNG API error: {response.status_code}")
162
  return jsonify({'error': f'SearXNG API error: {response.status_code}'}), response.status_code
163
 
164
  except Exception as e:
165
+ logger.error(f"Search processing error: {e}")
166
  return jsonify({'error': str(e)}), 500
167
 
168
  if __name__ == '__main__':
169
+ logger.info("🌐 Starting Flask application...")
170
  # Run the Flask app on port 7860
171
  app.run(host='0.0.0.0', port=7860, debug=True)