Spaces:

DexterSptizu
/

gensim-keyword-extraction

Sleeping

App Files Files Community

gensim-keyword-extraction / app.py

DexterSptizu

Update app.py

f31ddfc verified 24 days ago

raw

history blame

4.8 kB

	import gradio as gr
	from gensim.models import TfidfModel
	from gensim.corpora import Dictionary
	from gensim.utils import simple_preprocess
	from gensim.parsing.preprocessing import remove_stopwords
	import numpy as np
	import warnings
	warnings.filterwarnings('ignore')

	# Sample input texts offered in the "Load Example Text" dropdown.
	# Keys are the names shown in the dropdown; values are the raw text that
	# load_example() returns for the selected name.
	EXAMPLES = {
	"Scientific Abstract": """
	Compatibility of systems of linear constraints over the set of natural numbers.
	Criteria of compatibility of a system of linear Diophantine equations, strict inequations,
	and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions
	and algorithms of construction of minimal generating sets of solutions for all types of systems are given.
	""",
	"News Article": """
	Machine learning is revolutionizing the way we interact with technology.
	Artificial intelligence systems are becoming more sophisticated, enabling automated decision making
	and pattern recognition at unprecedented scales. Deep learning algorithms continue to improve,
	making breakthroughs in natural language processing and computer vision.
	""",
	"Technical Documentation": """
	The user interface provides intuitive navigation through contextual menus and adaptive layouts.
	System responses are optimized for performance while maintaining high reliability standards.
	Database connections are pooled to minimize resource overhead and maximize throughput.
	"""
	}

	def preprocess_text(text):
	    """Return ``text`` as a space-separated string of cleaned, lower-case tokens.

	    Stopwords are removed, then the remainder is tokenized with
	    ``simple_preprocess`` (accents stripped via ``deacc=True``).

	    Args:
	        text: Raw input text. Empty or ``None`` input yields ``""``.

	    Returns:
	        The surviving tokens joined by single spaces.
	    """
	    if not text:
	        return ""

	    # remove_stopwords() compares words case-sensitively against a
	    # lower-case stopword list, so lower-case first; otherwise capitalised
	    # stopwords such as "The" would slip through and reappear in the output
	    # once simple_preprocess() lower-cases them.
	    without_stopwords = remove_stopwords(text.lower())

	    # Tokenize and strip accents.
	    tokens = simple_preprocess(without_stopwords, deacc=True)
	    return ' '.join(tokens)

	# Keyword extraction
	def extract_keywords(text, num_keywords=10, scores=True, min_length=1):
	    """Rank the words of ``text`` and return the best ones as a bulleted list.

	    The text is lower-cased, stripped of stopwords and tokenized, then
	    weighted with gensim's ``TfidfModel``. Because the model only ever sees
	    this one document, a smoothed IDF is used: the default IDF would be
	    ``log2(1/1) == 0`` for every term and gensim discards zero weights,
	    which would leave nothing to rank. With a single document the smoothed
	    IDF is the same for every term, so the ranking reduces to L2-normalised
	    term frequency.

	    Args:
	        text: Raw input text.
	        num_keywords: Maximum number of keywords to return.
	        scores: When true, append the numeric score to each line.
	        min_length: Minimum number of whitespace-separated words a keyword
	            must contain. NOTE(review): tokens produced here are single
	            words, so any value above 1 filters out everything - confirm
	            whether multi-word keywords were intended.

	    Returns:
	        One "• keyword" line per result joined by newlines, or
	        ``"No keywords found."`` when nothing qualifies.
	    """
	    # Guard against an empty textbox (or None) before touching gensim.
	    if not text or not text.strip():
	        return "No keywords found."

	    # Lower-case before stopword removal: the stopword match is
	    # case-sensitive.
	    processed_text = remove_stopwords(text.lower())
	    tokens = simple_preprocess(processed_text, deacc=True)
	    if not tokens:
	        return "No keywords found."

	    # Build the one-document dictionary and bag-of-words vector.
	    dictionary = Dictionary([tokens])
	    bow = dictionary.doc2bow(tokens)

	    def smoothed_idf(docfreq, totaldocs):
	        # Add-one smoothing keeps the IDF strictly positive even when a
	        # term occurs in every document (always the case here).
	        return float(np.log((1.0 + totaldocs) / (1.0 + docfreq)) + 1.0)

	    tfidf = TfidfModel([bow], wglobal=smoothed_idf)
	    weighted_terms = tfidf[bow]

	    # Highest score first; sorted() is stable, so ties keep dictionary order.
	    ranked = sorted(weighted_terms, key=lambda pair: pair[1], reverse=True)

	    # Collect the top keywords that pass the length filter.
	    results = []
	    for word_id, score in ranked:
	        word = dictionary[word_id]
	        if len(word.split()) < min_length:
	            continue
	        if scores:
	            results.append(f"• {word:<30} (score: {score:.4f})")
	        else:
	            results.append(f"• {word}")
	        if len(results) >= num_keywords:
	            break

	    return "\n".join(results) if results else "No keywords found."

	# NOTE: a duplicate `extract_btn.click(...)` call was removed from this spot.
	# It referenced `extract_btn`, `input_text`, `num_keywords`, `show_scores`,
	# `min_length` and `output_text`, none of which exist until the `gr.Blocks`
	# context further down creates the widgets. The button is wired to
	# `extract_keywords` inside that context instead.

	def load_example(example_name):
	    """Look up the sample text registered under ``example_name``.

	    Returns the matching entry of ``EXAMPLES``, or an empty string when the
	    name is not a known example.
	    """
	    if example_name in EXAMPLES:
	        return EXAMPLES[example_name]
	    return ""

	# Build the Gradio interface: input text and example picker on the left,
	# extraction options on the right, then the action button and the result box.
	# NOTE(review): the original indentation was lost; the nesting below (button
	# and output placed under the row, not inside a column) is a reconstruction.
	with gr.Blocks(title="Gensim Keyword Extraction") as demo:
	    gr.Markdown("# 📑 Gensim Keyword Extraction")
	    gr.Markdown("Extract keywords using Gensim's text processing capabilities")

	    with gr.Row():
	        with gr.Column(scale=2):
	            input_text = gr.Textbox(
	                label="Input Text",
	                placeholder="Enter your text here...",
	                lines=8
	            )
	            example_dropdown = gr.Dropdown(
	                choices=list(EXAMPLES.keys()),
	                label="Load Example Text"
	            )

	        with gr.Column(scale=1):
	            # This slider feeds extract_keywords' `num_keywords` argument,
	            # which is an absolute count and not a percentage, so it is
	            # named and labelled accordingly.
	            num_keywords = gr.Slider(
	                minimum=1,
	                maximum=100,
	                value=20,
	                step=1,
	                label="Number of Keywords"
	            )

	            # NOTE(review): extract_keywords ranks single-word tokens, so
	            # values above 1 filter out every keyword - confirm intent.
	            min_length = gr.Slider(
	                minimum=1,
	                maximum=5,
	                value=1,
	                step=1,
	                label="Minimum Words per Keyword"
	            )

	            show_scores = gr.Checkbox(
	                label="Show Relevance Scores",
	                value=True
	            )

	    extract_btn = gr.Button(
	        "Extract Keywords",
	        variant="primary"
	    )

	    output_text = gr.Textbox(
	        label="Extracted Keywords",
	        lines=10,
	        interactive=False
	    )

	    # Set up event handlers.
	    # Selecting an example copies its text into the input box.
	    example_dropdown.change(
	        load_example,
	        inputs=[example_dropdown],
	        outputs=[input_text]
	    )

	    # Inputs are passed positionally, in the order of extract_keywords'
	    # parameters: (text, num_keywords, scores, min_length).
	    extract_btn.click(
	        extract_keywords,
	        inputs=[
	            input_text,
	            num_keywords,
	            show_scores,
	            min_length
	        ],
	        outputs=[output_text]
	    )

	demo.launch()