import os
from flask import Flask, request, render_template, jsonify
import re
import nltk
import torch
from pathlib import Path
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Ensure NLTK uses the correct data path (only if NLTK_DATA is set,
# so we never append None to the search path)
if os.getenv('NLTK_DATA'):
    nltk.data.path.append(os.getenv('NLTK_DATA'))

# Define the device if using GPU
device = "cuda" if torch.cuda.is_available() else "cpu"

app = Flask(__name__)

# Ensure the Transformers cache directory is set correctly (only if provided,
# since assigning None to os.environ raises a TypeError)
if os.getenv('TRANSFORMERS_CACHE'):
    os.environ['TRANSFORMERS_CACHE'] = os.getenv('TRANSFORMERS_CACHE')

tokenizer = AutoTokenizer.from_pretrained(Path("summary/tokenizer"))
model_name = "summary/pegasus-samsum-model"
def remove_spaces_before_punctuation(text):
    # Drop whitespace that appears before punctuation and strip stray square brackets
    pattern = re.compile(r'(\s+)([.,;!?])')
    result = pattern.sub(r'\2', text)
    result = re.sub(r'\[|\]', '', result)
    return result
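# Illustrative behavior (a sketch, not part of the original app): the helper above
# collapses whitespace before punctuation, e.g.
#   remove_spaces_before_punctuation("Hello , world !")  ->  "Hello, world!"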
def replace_pronouns(text):
    # Replace "they" with "He/She" and switch common plural verb forms to singular
    text = re.sub(r'\bthey\b', 'He/She', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(are|have|were)\b', lambda x: {'are': 'is', 'have': 'has', 'were': 'was'}[x.group()], text)
    return text
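# Illustrative behavior (a sketch, not part of the original app): the substitutions above
# rewrite "they" and adjust the matching verbs everywhere in the string, e.g.
#   replace_pronouns("They are running late")  ->  "He/She is running late"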
def clean_and_lemmatize(text):
    # Remove digits, symbols, punctuation marks, and newline characters
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s,-]', '', text.replace('\n', ''))
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()
    # Lemmatize each token and join back into a sentence
    lemmatized_text = ' '.join([lemmatizer.lemmatize(token) for token in tokens])
    return lemmatized_text
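# Illustrative behavior (a sketch, not part of the original app): digits and most
# punctuation are dropped, the text is lower-cased, and each token is lemmatized with
# WordNet's default (noun) part of speech, e.g.
#   clean_and_lemmatize("The 2 cats are running!")  ->  "the cat are running"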
@app.route('/summarize', methods=['POST'])
def summarize():
    # Get the input text from the request
    input_text = request.form['input_text']
    # Tokenize the original input to measure its length
    tokens_org_text = tokenizer.tokenize(input_text)
    sequence_length_org_text = len(tokens_org_text)
    input_text = clean_and_lemmatize(input_text)
    tokens = tokenizer.tokenize(input_text)
    sequence_length = len(tokens)
    if sequence_length >= 1024:
        return jsonify({'error': 'Input text exceeds maximum token length of 1023.'}), 400
    # Load the fine-tuned model and move it to the selected device
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}
    pipe = pipeline("summarization", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)
    text = pipe(input_text, **gen_kwargs)[0]["summary_text"]
    output_text = replace_pronouns(remove_spaces_before_punctuation(text))
    # Clear the GPU cache
    if device == "cuda":
        torch.cuda.empty_cache()
    # Return the summary
    return jsonify({'summary': output_text})
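# Example request (illustrative sketch: the endpoint and form field match the route
# above; the host and port assume the default app.run() settings at the bottom of this file):
#   curl -X POST -F "input_text=Alice: Are we meeting today? Bob: Yes, at 3 pm." \
#        http://localhost:7860/summarize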
@app.route('/')
def index():
    return render_template('index.html')

if __name__ == '__main__':
    app.run(host='0.0.0.0', debug=True, port=7860)