File size: 2,873 Bytes
5afcfd6
1ec2d49
 
5afcfd6
 
 
 
 
 
 
 
255333a
5afcfd6
 
 
 
255333a
5afcfd6
 
 
255333a
5afcfd6
 
 
f31ddfc
1ec2d49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f31ddfc
1ec2d49
f31ddfc
1ec2d49
f31ddfc
 
 
 
 
 
 
1ec2d49
 
5afcfd6
 
 
 
 
255333a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5afcfd6
0c62899
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import math

import gradio as gr
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess

# Example texts offered in the UI, keyed by a human-readable label.
# Each value is a single clean paragraph: adjacent string literals are joined
# by the parser, so no source indentation or stray newlines leak into the text
# that gets loaded into the input box.
EXAMPLES = {
    "Scientific Abstract": (
        "Compatibility of systems of linear constraints over the set of natural numbers. "
        "Criteria of compatibility of a system of linear Diophantine equations, "
        "strict inequations, "
        "and nonstrict inequations are considered."
    ),
    "News Article": (
        "Machine learning is revolutionizing the way we interact with technology. "
        "Artificial intelligence systems are becoming more sophisticated, "
        "enabling automated decision making "
        "and pattern recognition at unprecedented scales."
    ),
    "Technical Documentation": (
        "The user interface provides intuitive navigation through contextual menus "
        "and adaptive layouts. "
        "System responses are optimized for performance while maintaining "
        "high reliability standards."
    ),
}

def _smoothed_idf(doc_freq, total_docs):
    """Return a smoothed, strictly positive inverse document frequency.

    gensim's default global weight is ``log2(total_docs / doc_freq)``, which is
    exactly 0 when a term occurs in every document. Adding one to both counts
    and one to the logarithm keeps the weight positive in that case.
    """
    return math.log((1.0 + total_docs) / (1.0 + doc_freq)) + 1.0


def extract_keywords(text, num_keywords=10, scores=True, min_length=1):
    """Rank the words of *text* by TF-IDF weight and format the top ones.

    The text is lower-cased, stripped of stopwords and tokenized with gensim,
    then scored as a one-document corpus.

    Args:
        text: Input text. ``None``, empty or whitespace-only text yields the
            "No keywords found." message.
        num_keywords: Maximum number of keywords to return. Values below 1
            yield no keywords.
        scores: When true, append each keyword's weight to its line.
        min_length: Minimum number of whitespace-separated words a keyword
            must have. NOTE: tokens produced here are single words, so any
            value above 1 filters out every candidate.

    Returns:
        One bulleted keyword per line, or "No keywords found." when nothing
        qualifies.
    """
    if not text or not text.strip():
        return "No keywords found."

    # Preprocess text
    processed_text = remove_stopwords(text.lower())
    tokens = simple_preprocess(processed_text, deacc=True)
    if not tokens:
        # Nothing but stopwords / too-short tokens: avoid building an empty model.
        return "No keywords found."

    # Create dictionary and a one-document corpus
    dictionary = Dictionary([tokens])
    bow = dictionary.doc2bow(tokens)

    # With a single document the default IDF is log2(1/1) == 0 for every term,
    # and TfidfModel drops zero-weight terms, so the result would always be
    # empty. The smoothed IDF keeps the weights positive, which makes the
    # score the L2-normalized term frequency.
    tfidf = TfidfModel([bow], wglobal=_smoothed_idf)
    weighted = tfidf[bow]

    # Highest score first; ties are broken alphabetically for a stable order.
    ranked = sorted(weighted, key=lambda item: (-item[1], dictionary[item[0]]))

    # UI sliders may deliver floats; normalize once.
    limit = int(num_keywords)
    required_words = int(min_length)

    results = []
    for word_id, score in ranked:
        # Check the limit before appending so a limit of 0 returns nothing.
        if len(results) >= limit:
            break
        word = dictionary[word_id]
        if len(word.split()) < required_words:
            continue
        if scores:
            results.append(f"\u2022 {word:<30} (score: {score:.4f})")
        else:
            results.append(f"\u2022 {word}")

    return "\n".join(results) if results else "No keywords found."

def load_example(example_name):
    """Return the example text stored under *example_name*.

    Unknown names produce an empty string rather than an error.
    """
    try:
        return EXAMPLES[example_name]
    except KeyError:
        return ""

# Create Gradio interface: one text box plus three controls, wired in the same
# order as the parameters of extract_keywords (text, num_keywords, scores,
# min_length).
demo = gr.Interface(
    fn=extract_keywords,
    inputs=[
        gr.Textbox(lines=8, label="Input Text", placeholder="Enter your text here..."),
        gr.Slider(minimum=1, maximum=20, value=10, step=1, label="Number of Keywords"),
        gr.Checkbox(label="Show Scores", value=True),
        # NOTE(review): extract_keywords tokenizes into single words, so values
        # above 1 here currently filter out every keyword.
        gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Minimum Words per Keyword"),
    ],
    outputs=gr.Textbox(label="Extracted Keywords", lines=10),
    # U+1F4D1 (bookmark tabs emoji). Written as an escape because the previous
    # literal had been corrupted into mojibake by a bad re-encoding.
    title="\U0001F4D1 Keyword Extraction",
    description="Extract keywords using TF-IDF scoring",
    # Each row supplies one value per input component, in order.
    examples=[
        [EXAMPLES["Scientific Abstract"], 10, True, 1],
        [EXAMPLES["News Article"], 5, True, 1],
        [EXAMPLES["Technical Documentation"], 8, False, 1],
    ],
)

# Only start the server when run as a script, so importing this module (for
# tests or reuse of extract_keywords) has no side effects.
if __name__ == "__main__":
    demo.launch(share=True)