# from flask import Flask, render_template, request
# from weather import get_current_weather
# from waitress import serve
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# app = Flask(__name__)

# @app.route('/')
# @app.route('/index')
# def index():
#     return render_template('index.html')

# @app.route('/test')
# def test():
#     tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
#     # Load model
#     model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
#     return "Hello World!..."

# @app.route('/weather')
# def get_weather():
#     city = request.args.get('city', '')
#     print("working...")
#     # Check for empty strings or strings with only spaces
#     if not city.strip():
#         # You could render "City Not Found" instead, like we do below
#         city = "Kansas City"
#     weather_data = get_current_weather(city)
#     # City is not found by the API
#     if weather_data['cod'] != 200:
#         return render_template('city-not-found.html')
#     return render_template(
#         "weather.html",
#         title=weather_data["name"],
#         status=weather_data["weather"][0]["description"].capitalize(),
#         temp=f"{weather_data['main']['temp']:.1f}",
#         feels_like=f"{weather_data['main']['feels_like']:.1f}"
#     )

# if __name__ == "__main__":
#     serve(app, host="0.0.0.0", port=8000)

# ---------------------------------------------------------------------------------

# from flask import Flask, render_template, request, jsonify
# from waitress import serve
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# import time

# app = Flask(__name__)

# @app.route('/')
# @app.route('/index')
# def index():
#     return render_template('index.html')

# @app.route('/test', methods=['POST'])
# def test():
#     # Extract text from the request body
#     content = request.json.get('content', '')
#     if not content:
#         return jsonify({"error": "No content provided"}), 400
#     start_time = time.time()
#     # Directory where the model was saved
#     model_save_directory = "./my_project_folder/pegasus_model"
#     # Load the model and tokenizer from the directory
#     model = PegasusForConditionalGeneration.from_pretrained(model_save_directory)
#     tokenizer = PegasusTokenizer.from_pretrained(model_save_directory)
#     # Create tokens - a numeric representation of the text
#     tokens = tokenizer(content, truncation=True, padding="longest", return_tensors="pt")
#     # Summarize
#     summary = model.generate(**tokens, min_length=60, max_length=100)
#     # Decode the summary
#     summarized_text = tokenizer.decode(summary[0], skip_special_tokens=True)
#     end_time = time.time()
#     execution_time = end_time - start_time
#     # Return the summarized text and execution time
#     return jsonify({
#         "summarized_text": summarized_text,
#         "execution_time": f"{execution_time} seconds"
#     })

# if __name__ == "__main__":
#     serve(app, host="0.0.0.0", port=8000)

# ======================================================================================

from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
from waitress import serve
from pymongo import MongoClient
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
import time
from datetime import datetime, timedelta
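# Editorial note (a sketch, not part of the original app): every route below
# calls from_pretrained() inside the request handler, so each request pays the
# full model-load cost. Assuming the host has enough memory for both
# checkpoints, the models could instead be loaded once at import time, e.g.:
#
#     PEGASUS_TOKENIZER = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
#     PEGASUS_MODEL = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
#     BART_TOKENIZER = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
#     BART_MODEL = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
#
# and the handlers would reference these globals instead of reloading.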
app = Flask(__name__)
CORS(app)

# Use your MongoDB Atlas connection string
mongo_conn_str = 'mongodb+srv://final_year_project:Ngd2jIj9PpvQfb5i@cluster0.3mhko.mongodb.net/news_scraping_site?retryWrites=true&w=majority&appName=Cluster0'
client = MongoClient(mongo_conn_str)

# Adjust these to match your specific database and collection names
db = client['news_scraping_site']
summaries_collection = db.articles
scraped_collection = db.scrapedarticles


@app.route('/')
def hello():
    return {"hello": "it's working..."}


@app.route('/index')
def index():
    return render_template('index.html')


@app.route('/test', methods=['POST'])
def test():
    content = request.json.get('content', '')
    if not content:
        return jsonify({"error": "No content provided"}), 400

    start_time = time.time()

    # model_save_directory = "./my_project_folder/pegasus_model"
    model_save_directory = "google/pegasus-xsum"
    model = PegasusForConditionalGeneration.from_pretrained(model_save_directory)
    tokenizer = PegasusTokenizer.from_pretrained(model_save_directory)

    tokens = tokenizer(content, truncation=True, padding="longest", return_tensors="pt")
    summary = model.generate(**tokens, min_length=60, max_length=100)
    summarized_text = tokenizer.decode(summary[0], skip_special_tokens=True)

    # Save the summary to MongoDB Atlas
    summary_document = {
        "original_text": content,
        "summarized_text": summarized_text,
        "timestamp": time.time()
    }
    result = summaries_collection.insert_one(summary_document)

    end_time = time.time()
    execution_time = end_time - start_time
    return jsonify({
        "summarized_text": summarized_text,
        "execution_time": f"{execution_time} seconds",
        # MongoDB ObjectId of the inserted document
        "mongodb_object_id": str(result.inserted_id)
    })


@app.route('/bart', methods=['POST'])
def bart():
    print("bart route called")
    # Get the content from the request
    content = request.json.get('content', '')
    print(content)
    # Check if content is provided
    if not content:
        return jsonify({"error": "No content provided"}), 400

    start_time = time.time()

    # Path to your BART model; adjust as necessary
    model_save_directory = "facebook/bart-large-cnn"
    # Load the tokenizer and model
    tokenizer = BartTokenizer.from_pretrained(model_save_directory)
    model = BartForConditionalGeneration.from_pretrained(model_save_directory)

    # Tokenize without truncation, then split the ids into model-sized chunks
    inputs_no_trunc = tokenizer(content, max_length=None, return_tensors='pt', truncation=False)
    chunk_start = 0
    chunk_end = tokenizer.model_max_length  # 1024 for BART
    inputs_batch_lst = []
    # Strict < avoids appending an empty chunk when the length is an exact multiple
    while chunk_start < len(inputs_no_trunc['input_ids'][0]):
        inputs_batch = inputs_no_trunc['input_ids'][0][chunk_start:chunk_end]
        inputs_batch = torch.unsqueeze(inputs_batch, 0)
        inputs_batch_lst.append(inputs_batch)
        chunk_start += tokenizer.model_max_length
        chunk_end += tokenizer.model_max_length

    # Generate a summary for each chunk of tokens
    summary_ids_lst = [model.generate(inputs, num_beams=4, max_length=100, early_stopping=True)
                       for inputs in inputs_batch_lst]

    # Decode and join the per-chunk summaries
    summary_batch_lst = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
                         for summary_ids in summary_ids_lst for g in summary_ids]
    summary_all = '\n'.join(summary_batch_lst)

    # Calculate the execution time
    execution_time = time.time() - start_time

    summary_document = {
        "original_text": content,
        "summarized_text": summary_all,
        "timestamp": time.time()
    }
    result = summaries_collection.insert_one(summary_document)

    # Return the summarized text and execution time
    return jsonify({
        "summarized_text": summary_all,
        "execution_time": f"{execution_time} seconds",
        "mongodb_article_id": f"{result.inserted_id}"
    })
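# A minimal sketch of the chunking logic used in /bart above, factored into a
# reusable helper. This is an editorial illustration, not called by the routes;
# the name `chunk_input_ids` is hypothetical.
def chunk_input_ids(input_ids, chunk_size):
    """Split a 1-D tensor of token ids into batch-of-1 chunks of at most chunk_size tokens."""
    return [input_ids[i:i + chunk_size].unsqueeze(0)
            for i in range(0, len(input_ids), chunk_size)]

# Example, equivalent to the while-loop in /bart:
#     inputs_batch_lst = chunk_input_ids(inputs_no_trunc['input_ids'][0],
#                                        tokenizer.model_max_length)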
@app.route('/one', methods=['POST'])
def one():
    print("one route called")
    # Get the batch size from the request
    limit = request.json.get('limit', 5)

    # Calculate the time threshold (1 hour ago); used by the commented-out filter below
    time_threshold = datetime.now() - timedelta(hours=1)

    # Query for articles that have not been summarized yet
    articles = scraped_collection.find({
        "summarized": "false"
        # "fetched_time": {"$gte": time_threshold}
    }).limit(limit)
    # print(len(articles))
    articles_list = list(articles)
    print(articles_list)

    # Path to your BART model
    model_save_directory = "facebook/bart-large-cnn"
    # Load the tokenizer and model
    tokenizer = BartTokenizer.from_pretrained(model_save_directory)
    model = BartForConditionalGeneration.from_pretrained(model_save_directory)

    # Iterate over the materialized list; list(articles) above already
    # exhausted the cursor, so looping over `articles` would do nothing
    for article in articles_list:
        content = article['content']
        start_time = time.time()

        # Summarize the content (truncated to BART's 1024-token window)
        inputs = tokenizer(content, return_tensors='pt', max_length=1024, truncation=True)
        summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=100, early_stopping=True)
        summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        execution_time = time.time() - start_time

        summary_document = {
            # "original_text": content,
            "summary": summary_text,
            "summarized": "true"
            # "timestamp": time.time()
        }
        result = summaries_collection.insert_one(summary_document)

        # Mark the scraped article as summarized
        result_scraped = scraped_collection.update_one(
            {"_id": article['_id']},
            {"$set": {"summarized": "true"}}
        )
        print(f"Summarized and updated article ID {article['_id']}, Execution time: {execution_time} seconds")

    return jsonify({"message": "Summarization completed for requested articles"})


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)

# if __name__ == "__main__":
#     # serve(app, host="0.0.0.0", port=9000)
#     app.run(host="0.0.0.0", port=9000, debug=True)
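# Example requests (a sketch, assuming the server is running locally on port 7860):
#
#     curl -X POST http://localhost:7860/test \
#          -H "Content-Type: application/json" \
#          -d '{"content": "Long article text to summarize..."}'
#
#     curl -X POST http://localhost:7860/one \
#          -H "Content-Type: application/json" \
#          -d '{"limit": 3}'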