Bikas0 committed
Commit 4a35f71
1 Parent(s): 2ab3fc4

update app file with cuda

Files changed (1)
  1. app.py +92 -2
app.py CHANGED
@@ -1,3 +1,88 @@
+# import os
+# from flask import Flask, request, render_template, jsonify
+# import re
+# import nltk
+# import torch
+# from pathlib import Path
+# from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
+# from nltk.tokenize import word_tokenize
+# from nltk.stem import WordNetLemmatizer
+
+# # Ensure NLTK uses the correct data path
+# nltk.data.path.append(os.getenv('NLTK_DATA'))
+
+# app = Flask(__name__)
+
+# # Ensure the Transformers cache directory is set correctly
+# os.environ['TRANSFORMERS_CACHE'] = os.getenv('TRANSFORMERS_CACHE')
+
+# tokenizer = AutoTokenizer.from_pretrained(Path("summary/tokenizer"))
+# model_name = "summary/pegasus-samsum-model"
+
+# def remove_spaces_before_punctuation(text):
+#     pattern = re.compile(r'(\s+)([.,;!?])')
+#     result = pattern.sub(r'\2', text)
+#     result = re.sub(r'\[|\]', '', result)
+#     return result
+
+# def replace_pronouns(text):
+#     # Replace "they" with "he" or "she" based on context
+#     text = re.sub(r'\bthey\b', 'He/She', text, flags=re.IGNORECASE)
+#     text = re.sub(r'\b(are|have|were)\b', lambda x: {'are': 'is', 'have': 'has', 'were': 'was'}[x.group()], text)
+#     return text
+
+# def clean_and_lemmatize(text):
+#     # Remove digits, symbols, punctuation marks, and newline characters
+#     text = re.sub(r'\d+', '', text)
+#     text = re.sub(r'[^\w\s,-]', '', text.replace('\n', ''))
+#     # Tokenize the text
+#     tokens = word_tokenize(text.lower())
+#     # Initialize lemmatizer
+#     lemmatizer = WordNetLemmatizer()
+#     # Lemmatize each token and join back into a sentence
+#     lemmatized_text = ' '.join([lemmatizer.lemmatize(token) for token in tokens])
+#     return lemmatized_text
+
+# @app.route('/summarize', methods=['POST'])
+# def summarize():
+#     # Get the input text from the request
+#     input_text = request.form['input_text']
+
+#     # Tokenize the input text
+#     tokens_org_text = tokenizer.tokenize(input_text)
+#     sequence_length_org_text = len(tokens_org_text)
+
+#     input_text = clean_and_lemmatize(input_text)
+#     tokens = tokenizer.tokenize(input_text)
+#     sequence_length = len(tokens)
+
+#     if sequence_length >= 1024:
+#         return jsonify({'error': 'Input text exceeds maximum token length of 1023.'})
+
+#     # Initialize model variable
+#     model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
+
+#     gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}
+#     pipe = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)
+
+#     text = pipe(input_text, **gen_kwargs)[0]["summary_text"]
+#     output_text = replace_pronouns(remove_spaces_before_punctuation(text))
+
+#     # Clear the GPU cache
+#     torch.cuda.empty_cache()
+
+#     # Return the summary
+#     return jsonify({'summary': output_text})
+
+# @app.route('/')
+# def index():
+#     return render_template('index.html')
+
+# if __name__ == '__main__':
+#     app.run(host='0.0.0.0', debug=True, port=7860)
+
+
+
 import os
 from flask import Flask, request, render_template, jsonify
 import re
@@ -11,6 +96,9 @@ from nltk.stem import WordNetLemmatizer
 # Ensure NLTK uses the correct data path
 nltk.data.path.append(os.getenv('NLTK_DATA'))

+# Define the device if using GPU
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
 app = Flask(__name__)

 # Ensure the Transformers cache directory is set correctly
@@ -63,13 +151,14 @@ def summarize():
     model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

     gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}
-    pipe = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)
+    pipe = pipeline("summarization", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)

     text = pipe(input_text, **gen_kwargs)[0]["summary_text"]
     output_text = replace_pronouns(remove_spaces_before_punctuation(text))

     # Clear the GPU cache
-    torch.cuda.empty_cache()
+    if device == "cuda":
+        torch.cuda.empty_cache()

     # Return the summary
     return jsonify({'summary': output_text})
@@ -80,3 +169,4 @@ def index():

 if __name__ == '__main__':
     app.run(host='0.0.0.0', debug=True, port=7860)
+
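For reference, the core of this change is the device-selection pattern below, distilled out of the Flask app. This is a minimal illustrative sketch, not part of the commit: it assumes torch and transformers are installed and reuses the tokenizer and model paths hard-coded in app.py. Transformers pipelines accept an integer device index (0 = first GPU, -1 = CPU), which is why the string device is converted at the pipeline call.

import torch
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# Pick CUDA when available, exactly as the commit does.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("summary/tokenizer")  # path from app.py
model = AutoModelForSeq2SeqLM.from_pretrained("summary/pegasus-samsum-model").to(device)

# Pipelines take an int device index: 0 = first GPU, -1 = CPU.
pipe = pipeline("summarization", model=model, tokenizer=tokenizer,
                device=0 if device == "cuda" else -1)

gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}
summary = pipe("Your dialogue text here ...", **gen_kwargs)[0]["summary_text"]

# Guard the cache clear so the same code also runs on CPU-only machines.
if device == "cuda":
    torch.cuda.empty_cache()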
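To smoke-test the updated route once the server is up (app.run binds 0.0.0.0:7860 above), something like the following should work. The requests dependency and the sample text are illustrative assumptions, not part of this commit:

import requests

# POST form data, matching request.form['input_text'] in app.py.
resp = requests.post(
    "http://localhost:7860/summarize",
    data={"input_text": "Hannah: Hey, do you have Betty's number? Amanda: Lemme check ..."},
    timeout=120,
)
print(resp.json())  # expected: {'summary': '...'} or {'error': '...'}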