oscarwang2
committed on
Update app.py
app.py CHANGED
@@ -1,13 +1,29 @@
 import os
+import sys
+import logging
 import torch
 from flask import Flask, request, jsonify
 import requests
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s: %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout),  # Log to console
+        logging.FileHandler('/tmp/search_app.log')  # Log to file
+    ]
+)
+logger = logging.getLogger(__name__)
+
 # Set cache directory explicitly
 os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
 os.makedirs('/tmp/huggingface_cache', exist_ok=True)
 
+logger.info("Initializing Educational Search Reranker Application")
+logger.info(f"Cache directory: {os.environ['TRANSFORMERS_CACHE']}")
+
 app = Flask(__name__)
 
 # Define the SearXNG instance URL
@@ -15,36 +31,56 @@ SEARXNG_INSTANCE_URL = "https://oscarwang2-searxng.hf.space/search"
 
 # Load the educational content classifier with explicit cache directory
 def load_model_with_retry(max_retries=3):
+    logger.info("Attempting to load educational content classifier...")
     for attempt in range(max_retries):
         try:
+            logger.info(f"Loading attempt {attempt + 1}...")
+
+            # Log system info
+            logger.info(f"Python Version: {sys.version}")
+            logger.info(f"Torch Version: {torch.__version__}")
+
+            logger.info("Loading tokenizer...")
             tokenizer = AutoTokenizer.from_pretrained(
                 "HuggingFaceTB/fineweb-edu-classifier",
                 cache_dir='/tmp/huggingface_cache'
            )
+
+            logger.info("Loading classification model...")
             model = AutoModelForSequenceClassification.from_pretrained(
                 "HuggingFaceTB/fineweb-edu-classifier",
                 cache_dir='/tmp/huggingface_cache'
            )
+
+            logger.info("Model and tokenizer loaded successfully!")
             return tokenizer, model
+
         except Exception as e:
-
+            logger.error(f"Model loading attempt {attempt + 1} failed: {e}")
+            logger.error(f"Detailed error: {sys.exc_info()}")
+
             if attempt == max_retries - 1:
+                logger.critical("Failed to load model after all attempts!")
                 raise
 
 # Load models at startup
-tokenizer, model = load_model_with_retry()
+try:
+    tokenizer, model = load_model_with_retry()
+except Exception as startup_error:
+    logger.critical(f"Startup failed: {startup_error}")
+    tokenizer, model = None, None
 
 def classify_educational_quality(text):
     """
     Classify the educational quality of a given text snippet
-
-    Args:
-        text (str): Text snippet to classify
-
-    Returns:
-        float: Educational quality score
     """
+    if tokenizer is None or model is None:
+        logger.warning("Model not initialized. Returning default score.")
+        return 0
+
     try:
+        logger.info(f"Classifying text (first 50 chars): {text[:50]}...")
+
         # Prepare input for the model
         inputs = tokenizer(text, return_tensors="pt", padding="longest", truncation=True)
 
@@ -56,9 +92,11 @@ def classify_educational_quality(text):
         logits = outputs.logits.squeeze(-1).float().detach().numpy()
         score = logits.item()
 
+        logger.info(f"Educational quality score: {score}")
         return score
+
     except Exception as e:
-
+        logger.error(f"Classification error: {e}")
         return 0  # Default score if classification fails
 
 @app.route('/search', methods=['GET'])
@@ -66,7 +104,10 @@ def search():
     # Get the search term from query parameters
     search_term = request.args.get('q', '')
 
+    logger.info(f"Received search query: {search_term}")
+
     if not search_term:
+        logger.warning("No search term provided")
         return jsonify({'error': 'No search term provided'}), 400
 
     # Define the query parameters for the SearXNG API
@@ -77,18 +118,22 @@ def search():
     }
 
     try:
+        logger.info("Sending request to SearXNG search API...")
         # Make the request to the SearXNG API
         response = requests.get(SEARXNG_INSTANCE_URL, params=params)
 
         # Check the response status code
         if response.status_code == 200:
+            logger.info("Received successful response from SearXNG")
            data = response.json()
             # Retrieve the first 30 results
             results = data.get('results', [])[:30]
 
+            logger.info(f"Total results found: {len(results)}")
+
             # Classify and score educational quality for each result
             scored_snippets = []
-            for result in results:
+            for idx, result in enumerate(results, 1):
                 snippet = {
                     'title': result.get('title', 'No title'),
                     'snippet': result.get('content', 'No snippet available'),
@@ -103,17 +148,24 @@ def search():
 
                 snippet['educational_score'] = edu_score
                 scored_snippets.append(snippet)
+
+                logger.info(f"Result {idx}: URL={snippet['url']}, Score={edu_score}")
 
             # Sort results by educational score in descending order
             sorted_snippets = sorted(scored_snippets, key=lambda x: x['educational_score'], reverse=True)
 
+            logger.info("Results sorted by educational quality")
             return jsonify(sorted_snippets)
+
         else:
+            logger.error(f"SearXNG API error: {response.status_code}")
             return jsonify({'error': f'SearXNG API error: {response.status_code}'}), response.status_code
 
     except Exception as e:
+        logger.error(f"Search processing error: {e}")
         return jsonify({'error': str(e)}), 500
 
 if __name__ == '__main__':
+    logger.info("Starting Flask application...")
     # Run the Flask app on port 7860
     app.run(host='0.0.0.0', port=7860, debug=True)
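Once the Space is up, the reranked endpoint can be exercised with a short client call. This is a minimal sketch, not part of the commit: it assumes the app is reachable at localhost:7860 (matching the app.run call above) and uses the q parameter and the title/educational_score fields defined in the search handler; the example query string is arbitrary.

import requests

# Hit the /search route; adjust the base URL if the Space is accessed remotely.
resp = requests.get("http://localhost:7860/search", params={"q": "photosynthesis"})
resp.raise_for_status()

# Results arrive already sorted by the classifier's educational_score (descending).
for item in resp.json()[:5]:
    print(f"{item['educational_score']:.2f}  {item['title']}")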