summarizer-api / app.py
Randima-Silva's picture
Update app.py
33500d9 verified
# from flask import Flask, render_template, request
# from weather import get_current_weather
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# app = Flask(__name__)
# @app.route('/')
# @app.route('/index')
# def index():
# return render_template('index.html')
# @app.route('/test')
# def test():
# tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
# # Load model
# model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
# return "Hello World!..."
# @app.route('/weather')
# def get_weather():
# city = request.args.get('city')
# print("working...")
# # Check for empty strings or string with only spaces
# if not bool(city.strip()):
# # You could render "City Not Found" instead like we do below
# city = "Kansas City"
# weather_data = get_current_weather(city)
# # City is not found by API
# if not weather_data['cod'] == 200:
# return render_template('city-not-found.html')
# return render_template(
# "weather.html",
# title=weather_data["name"],
# status=weather_data["weather"][0]["description"].capitalize(),
# temp=f"{weather_data['main']['temp']:.1f}",
# feels_like=f"{weather_data['main']['feels_like']:.1f}"
# )
# if __name__ == "__main__":
# serve(app, host="0.0.0.0", port=8000)
# ---------------------------------------------------------------------------------
# from flask import Flask, render_template, request, jsonify
# from waitress import serve
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# import time
# app = Flask(__name__)
# # Assuming the rest of your Flask app code remains unchanged
# @app.route('/')
# @app.route('/index')
# def index():
# return render_template('index.html')
# @app.route('/test', methods=['POST'])
# def test():
# # Extract text from the request body
# content = request.json.get('content', '')
# if not content:
# return jsonify({"error": "No content provided"}), 400
# start_time = time.time()
# # Specify the directory where you have saved the model
# model_save_directory = "./my_project_folder/pegasus_model"
# # Load the model and tokenizer from the directory
# model = PegasusForConditionalGeneration.from_pretrained(model_save_directory)
# tokenizer = PegasusTokenizer.from_pretrained(model_save_directory)
# # Create tokens - number representation of our text
# tokens = tokenizer(content, truncation=True, padding="longest", return_tensors="pt")
# # Summarize
# summary = model.generate(**tokens, min_length=60, max_length=100)
# # Decode summary
# summarized_text = tokenizer.decode(summary[0], skip_special_tokens=True)
# end_time = time.time()
# execution_time = end_time - start_time
# # Return the summarized text and execution time
# return jsonify({
# "summarized_text": summarized_text,
# "execution_time": f"{execution_time} seconds"
# })
# # Assuming you have the `if __name__ == "__main__"` block to run the app
# if __name__ == "__main__":
# serve(app, host="0.0.0.0", port=8000)
# ======================================================================================
# from flask import Flask, request, jsonify
# from waitress import serve
import functools
import os
import time
from datetime import datetime, timedelta

import torch
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
from pymongo import MongoClient
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from waitress import serve
# Flask application with CORS enabled for every route.
app = Flask(__name__)
CORS(app)

# SECURITY: a live MongoDB Atlas credential is committed in this file.
# Supply the connection string through the MONGO_CONN_STR environment variable
# instead; the literal below is kept only as a backward-compatible fallback
# and should be rotated and then removed from source control.
mongo_conn_str = os.environ.get(
    "MONGO_CONN_STR",
    'mongodb+srv://final_year_project:Ngd2jIj9PpvQfb5i@cluster0.3mhko.mongodb.net/news_scraping_site?retryWrites=true&w=majority&appName=Cluster0',
)
client = MongoClient(mongo_conn_str)

# Adjust these to match your specific database and collection names.
db = client['news_scraping_site']
summaries_collection = db.articles          # generated summaries are inserted here
scraped_collection = db.scrapedarticles     # scraped articles are read/flagged here
@app.route('/')
def hello():
    """Liveness endpoint: answer with a small fixed JSON-serialisable mapping."""
    greeting = {"hello": "its fucking working..."}
    return greeting
@app.route('/index')
def index():
    """Render and return the ``index.html`` template."""
    template_name = 'index.html'
    return render_template(template_name)
@functools.lru_cache(maxsize=1)
def _load_pegasus_xsum():
    """Load the Pegasus XSum tokenizer and model once and reuse them.

    Loading the weights is by far the slowest step of a request, so the pair
    is cached after the first call instead of being rebuilt per request.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # model_name = "./my_project_folder/pegasus_model"
    model_name = "google/pegasus-xsum"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name)
    return tokenizer, model


@app.route('/test', methods=['POST'])
def test():
    """Summarize the posted text with Pegasus and store the result in MongoDB.

    Expects a JSON object body with a ``content`` string.

    Returns:
        On success, a JSON response with ``summarized_text``,
        ``execution_time`` and ``mongodb_object_id``.
        A 400 JSON error when the body is missing, is not a JSON object, or
        ``content`` is absent, not a string, or blank.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON
    # body, so every malformed request ends up in the same 400 branch below.
    payload = request.get_json(silent=True)
    content = payload.get('content', '') if isinstance(payload, dict) else ''
    if not isinstance(content, str) or not content.strip():
        return jsonify({"error": "No content provided"}), 400

    start_time = time.time()

    tokenizer, model = _load_pegasus_xsum()
    tokens = tokenizer(content, truncation=True, padding="longest", return_tensors="pt")
    summary = model.generate(**tokens, min_length=60, max_length=100)
    summarized_text = tokenizer.decode(summary[0], skip_special_tokens=True)

    # Save the summary to MongoDB Atlas
    summary_document = {
        "original_text": content,
        "summarized_text": summarized_text,
        "timestamp": time.time(),
    }
    result = summaries_collection.insert_one(summary_document)

    execution_time = time.time() - start_time
    return jsonify({
        "summarized_text": summarized_text,
        "execution_time": f"{execution_time} seconds",
        # Return the MongoDB Object ID of the inserted document
        "mongodb_object_id": str(result.inserted_id),
    })
@functools.lru_cache(maxsize=1)
def _load_bart_cnn():
    """Load the BART CNN tokenizer and model once and reuse them.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    model_name = "facebook/bart-large-cnn"
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)
    return tokenizer, model


def _split_token_ids(token_ids, chunk_size):
    """Split a 1-D tensor of token ids into consecutive batches.

    Each returned tensor has a leading batch dimension of 1 and holds at most
    ``chunk_size`` ids. Stepping with ``range`` never produces an empty
    trailing chunk, even when the length is an exact multiple of
    ``chunk_size``.
    """
    return [
        torch.unsqueeze(token_ids[start:start + chunk_size], 0)
        for start in range(0, len(token_ids), chunk_size)
    ]


@app.route('/bart', methods=['POST'])
def bart():
    """Summarize the posted text with BART, chunk by chunk, and store it.

    The content is tokenized without truncation, split into chunks of the
    tokenizer's ``model_max_length``, each chunk is summarized separately and
    the partial summaries are joined with newlines.

    Expects a JSON object body with a ``content`` string.

    Returns:
        On success, a JSON response with ``summarized_text``,
        ``execution_time`` and ``mongodb_article_id``.
        A 400 JSON error when the body is missing, is not a JSON object, or
        ``content`` is absent, not a string, or blank.
    """
    print("bart route called")

    # silent=True yields None instead of raising on a missing/invalid JSON
    # body, so every malformed request ends up in the same 400 branch below.
    payload = request.get_json(silent=True)
    content = payload.get('content', '') if isinstance(payload, dict) else ''
    print(content)
    if not isinstance(content, str) or not content.strip():
        return jsonify({"error": "No content provided"}), 400

    start_time = time.time()

    tokenizer, model = _load_bart_cnn()

    # Tokenize the full text, then cut it into model-sized chunks.
    inputs_no_trunc = tokenizer(content, max_length=None, return_tensors='pt', truncation=False)
    inputs_batch_lst = _split_token_ids(
        inputs_no_trunc['input_ids'][0],
        tokenizer.model_max_length,
    )

    # Generate a summary for each batch of tokens
    summary_ids_lst = [
        model.generate(inputs, num_beams=4, max_length=100, early_stopping=True)
        for inputs in inputs_batch_lst
    ]

    # Combine the batched summaries
    summary_batch_lst = [
        tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        for summary_ids in summary_ids_lst
        for g in summary_ids
    ]
    summary_all = '\n'.join(summary_batch_lst)

    # Execution time covers model lookup and generation, not the DB insert.
    execution_time = time.time() - start_time

    summary_document = {
        "original_text": content,
        "summarized_text": summary_all,
        "timestamp": time.time(),
    }
    result = summaries_collection.insert_one(summary_document)

    # Return the summarized text and execution time
    return jsonify({
        "summarized_text": summary_all,
        "execution_time": f"{execution_time} seconds",
        "mongodb_article_id": f"{result.inserted_id}",
    })
@app.route('/one', methods=['POST'])
def one():
    """Summarize not-yet-summarized scraped articles with BART.

    Reads an optional integer ``limit`` (default 5) from the JSON body, fetches
    that many documents whose ``summarized`` field is the string ``"false"``,
    stores a summary document for each and flags the source article as
    summarized.

    Returns:
        A JSON confirmation message, or a 400 JSON error when ``limit`` is not
        an integer.
    """
    print("one route called")

    # silent=True yields None instead of raising on a missing/invalid JSON
    # body; in that case the default limit is used.
    payload = request.get_json(silent=True)
    limit = payload.get('limit', 5) if isinstance(payload, dict) else 5
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(limit, bool) or not isinstance(limit, int):
        return jsonify({"error": "limit must be an integer"}), 400

    # Query for articles. (A "fetched within the last hour" filter on
    # fetched_time was sketched here previously but is not applied.)
    cursor = scraped_collection.find({"summarized": "false"}).limit(limit)

    # Materialize once: a cursor is exhausted after a single pass, so the
    # processing loop below must walk this list, not the cursor again.
    articles_list = list(cursor)
    print(articles_list)

    done_message = {"message": "Summarization completed for requested articles"}
    if not articles_list:
        # Nothing to do: skip the expensive model load entirely.
        return jsonify(done_message)

    # Load the tokenizer and model once for the whole batch.
    model_save_directory = "facebook/bart-large-cnn"
    tokenizer = BartTokenizer.from_pretrained(model_save_directory)
    model = BartForConditionalGeneration.from_pretrained(model_save_directory)

    for article in articles_list:
        content = article['content']
        start_time = time.time()

        # Summarize the content (input truncated to 1024 tokens).
        inputs = tokenizer(content, return_tensors='pt', max_length=1024, truncation=True)
        summary_ids = model.generate(
            inputs['input_ids'], num_beams=4, max_length=100, early_stopping=True
        )
        summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        execution_time = time.time() - start_time

        summaries_collection.insert_one({
            "summary": summary_text,
            "summarized": "true",
        })

        # Flag the scraped article so it is not picked up again.
        scraped_collection.update_one(
            {"_id": article['_id']},
            {"$set": {"summarized": "true"}},
        )
        print(f"Summarized and updated article ID {article['_id']}, Execution time: {execution_time} seconds")

    return jsonify(done_message)
if __name__ == "__main__":
    # Script entry point: start Flask's built-in server on all interfaces.
    listen_host, listen_port = "0.0.0.0", 7860
    app.run(host=listen_host, port=listen_port)
# if __name__ == "__main__":
# # serve(app, host="0.0.0.0", port=9000)
# app.run(host="0.0.0.0", port=9000, debug=True)