Spaces:

DexterSptizu
/

gensim-keyword-extraction

Sleeping

File size: 2,873 Bytes

5afcfd6
1ec2d49
 
5afcfd6
 
 
 
 
 
 
 
255333a
5afcfd6
 
 
 
255333a
5afcfd6
 
 
255333a
5afcfd6
 
 
f31ddfc
1ec2d49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f31ddfc
1ec2d49
f31ddfc
1ec2d49
f31ddfc
 
 
 
 
 
 
1ec2d49
 
5afcfd6
 
 
 
 
255333a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5afcfd6
0c62899

import math

import gradio as gr
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess

# Sample input texts, keyed by a human-readable label.
# The Gradio interface below uses them as clickable examples, and
# load_example() looks them up by label. The triple-quoted values keep their
# leading/trailing newlines and indentation exactly as written.
EXAMPLES = {
    "Scientific Abstract": """
    Compatibility of systems of linear constraints over the set of natural numbers. 
    Criteria of compatibility of a system of linear Diophantine equations, strict inequations, 
    and nonstrict inequations are considered.
    """,
    "News Article": """
    Machine learning is revolutionizing the way we interact with technology. 
    Artificial intelligence systems are becoming more sophisticated, enabling automated decision making 
    and pattern recognition at unprecedented scales.
    """,
    "Technical Documentation": """
    The user interface provides intuitive navigation through contextual menus and adaptive layouts. 
    System responses are optimized for performance while maintaining high reliability standards.
    """
}

def extract_keywords(text, num_keywords=10, scores=True, min_length=1):
    """Extract the highest-weighted keywords from *text* as a bulleted string.

    The text is lower-cased, stripped of stop words, tokenized with gensim's
    ``simple_preprocess`` and weighted with a TF-IDF model built from that
    single document.

    Args:
        text: Raw input text. Empty or whitespace-only text yields the
            "No keywords found." message.
        num_keywords: Maximum number of keywords to return.
        scores: When true, each line also shows the keyword's weight.
        min_length: Minimum number of whitespace-separated words a keyword
            must contain. NOTE(review): tokens appear to be single words, so
            any value above 1 likely filters out every keyword -- confirm
            whether multi-word phrases were intended.

    Returns:
        One keyword per line, each prefixed with a bullet, ordered by
        descending weight; or "No keywords found." when nothing qualifies.
    """
    if not text or not text.strip():
        return "No keywords found."

    # Preprocess text
    processed_text = remove_stopwords(text.lower())
    tokens = simple_preprocess(processed_text, deacc=True)
    if not tokens:
        return "No keywords found."

    # Create dictionary and corpus (a single document)
    dictionary = Dictionary([tokens])
    corpus = [dictionary.doc2bow(tokens)]

    # Create TF-IDF model.
    # With one document the default IDF is log2(1/1) == 0 for every term, and
    # gensim drops zero-weight terms, so the result would always be empty.
    # Adding 1 to the IDF keeps every weight positive; for a single document
    # the ranking then reduces to normalized term frequency.
    tfidf = TfidfModel(
        corpus,
        wglobal=lambda docfreq, totaldocs: 1.0 + math.log2(totaldocs / docfreq),
    )
    weighted_terms = tfidf[corpus[0]]

    # Sort by scores, highest first
    sorted_keywords = sorted(weighted_terms, key=lambda pair: pair[1], reverse=True)

    # Get top keywords and filter by length
    results = []
    for word_id, score in sorted_keywords:
        word = dictionary[word_id]
        if len(word.split()) >= min_length:
            if scores:
                results.append(f"• {word:<30} (score: {score:.4f})")
            else:
                results.append(f"• {word}")
        if len(results) >= num_keywords:
            break

    return "\n".join(results) if results else "No keywords found."

def load_example(example_name):
    """Return the sample text stored under *example_name*, or "" if unknown."""
    if example_name in EXAMPLES:
        return EXAMPLES[example_name]
    return ""

# Build the Gradio UI: four input widgets map positionally onto the
# parameters of extract_keywords (text, num_keywords, scores, min_length).
_input_components = [
    gr.Textbox(lines=8, label="Input Text", placeholder="Enter your text here..."),
    gr.Slider(minimum=1, maximum=20, value=10, step=1, label="Number of Keywords"),
    gr.Checkbox(label="Show Scores", value=True),
    gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Minimum Words per Keyword"),
]
_output_component = gr.Textbox(label="Extracted Keywords", lines=10)

# Each example row supplies one value per input widget, in the same order.
_example_rows = [
    [EXAMPLES["Scientific Abstract"], 10, True, 1],
    [EXAMPLES["News Article"], 5, True, 1],
    [EXAMPLES["Technical Documentation"], 8, False, 1],
]

demo = gr.Interface(
    fn=extract_keywords,
    inputs=_input_components,
    outputs=_output_component,
    title="📑 Keyword Extraction",
    description="Extract keywords using TF-IDF scoring",
    examples=_example_rows,
)

# NOTE(review): share=True asks Gradio for a public share link; confirm this
# is wanted in the environment where the app is hosted.
demo.launch(share=True)