File size: 4,101 Bytes
39fc926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e9c9e8
39fc926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
768cb65
 
39fc926
9e9c9e8
 
 
39fc926
370b54b
39fc926
370b54b
 
 
 
 
 
39fc926
 
 
 
 
462b854
39fc926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e9c9e8
39fc926
9e9c9e8
 
370b54b
 
 
 
9e9c9e8
370b54b
9e9c9e8
 
39fc926
 
8d585eb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import re
from functools import lru_cache

import gradio as gr
import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup as bs
from newspaper import Article
from newspaper import Config
from transformers import (
    BartForConditionalGeneration,
    PreTrainedTokenizerFast,
    pipeline,
)

#  Load Model and Tokenize
@lru_cache(maxsize=1)
def _load_summarizer():
    """Load the ainize/kobart-news tokenizer and model exactly once.

    The original body re-downloaded/reloaded both the tokenizer and the full
    BART model on every call, and get_summary is called once per article in a
    loop — caching the pair makes all calls after the first cheap.
    """
    tokenizer = PreTrainedTokenizerFast.from_pretrained("ainize/kobart-news")
    model = BartForConditionalGeneration.from_pretrained("ainize/kobart-news")
    return tokenizer, model


def get_summary(input_text):
    """Return a Korean abstractive summary of `input_text`.

    Uses beam search (4 beams) on the ainize/kobart-news model, producing
    between 56 and 142 tokens.

    Parameters
    ----------
    input_text : str
        Source text to summarize (callers pass at most ~1000 chars).

    Returns
    -------
    str
        The decoded summary, special tokens stripped.
    """
    tokenizer, summary_model = _load_summarizer()
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    summary_text_ids = summary_model.generate(
        input_ids=input_ids,
        bos_token_id=summary_model.config.bos_token_id,
        eos_token_id=summary_model.config.eos_token_id,
        length_penalty=2.0,  # >1.0 nudges beam search toward longer outputs
        max_length=142,
        min_length=56,
        num_beams=4,
    )
    return tokenizer.decode(summary_text_ids[0], skip_special_tokens=True)



# HTTP settings shared by the scraping code: spoof a desktop Firefox UA and
# cap article downloads at 10 seconds.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
config = Config()  # newspaper3k request configuration
config.browser_user_agent = USER_AGENT
config.request_timeout = 10  # seconds

class news_collector:
    """Scrapes recent Daum economy breaking news and builds summary examples.

    Each collected example is a ``[summary, url]`` pair, accumulated in
    ``self.examples_text`` for use as ``gr.Examples`` rows.
    """

    def __init__(self):
        # [summary_text, article_url] pairs collected so far.
        self.examples_text = []

    def get_new_parser(self, url):
        """Download and parse one Korean article with newspaper3k.

        Fix: pass the module-level ``config`` (UA + timeout) to ``Article`` —
        the original constructed it at module scope but never used it.
        """
        article = Article(url, config=config, language='ko')
        article.download()
        article.parse()
        return article

    def get_news_links(self, page=''):
        """Return the https article links on the Daum economy breaking-news page.

        NOTE(review): ``page`` is accepted but currently unused — confirm
        whether pagination was ever intended.
        """
        url = "https://news.daum.net/breakingnews/economic"
        # Send the same UA/timeout the newspaper config uses; the original
        # request had neither, so a stalled server could hang the app.
        response = requests.get(
            url,
            headers={'User-Agent': USER_AGENT},
            timeout=config.request_timeout,
        )
        soup = bs(response.text, 'html.parser')
        anchors = soup.select("a.link_txt")
        links = [anchor.attrs['href'] for anchor in anchors]
        # Keep absolute https links only (drops relative/section links).
        return [link for link in links if link.startswith('https')]

    def update_news_examples(self):
        """Fetch, summarize, and store every current article; return the examples.

        Fix: the original returned ``None``, yet the module-level caller
        assigns this method's return value — returning the accumulated list
        keeps that caller working.
        """
        for news_url in self.get_news_links():
            article = self.get_new_parser(news_url)
            # Summarize only the first 1000 chars to bound model input size.
            self.examples_text.append([get_summary(article.text[:1000]), news_url])
        return self.examples_text



# Rows for gr.Examples below: each entry is [summary_text, article_url].
news_data = []

def update_news_data():
    """Rebuild the global ``news_data`` list from freshly scraped articles.

    Fix: the original assigned ``news_collector().update_news_examples()``
    directly to ``news_data``, but that method returns ``None`` — so the UI's
    Examples were built from ``None``. Read the accumulated list off the
    collector instance instead.
    """
    global news_data
    collector = news_collector()
    collector.update_news_examples()
    news_data = collector.examples_text
    print("๋‰ด์Šค ๋ฐ์ดํ„ฐ ์—…๋ฐ์ดํŠธ ์™„๋ฃŒ")

# Populate examples once at startup so the demo has data immediately.
update_news_data()

# App title (currently not passed to gr.Blocks — shown in the intro tab text).
title = "๊ท ํ˜•์žกํžŒ ๋‰ด์Šค ์ฝ๊ธฐ (Balanced News Reading)"



# Two-tab Gradio UI: an intro tab (markdown) and a demo tab that loads a
# Korean finance-news sentiment classifier and offers scraped examples.
with gr.Blocks(theme='pseudolab/huggingface-korea-theme') as demo:
    # news = news_collector()
    # news.update_news_examples()

    with gr.Tab("์†Œ๊ฐœ"):
        gr.Markdown(
        """
        # ๊ท ํ˜•์žกํžŒ ๋‰ด์Šค ์ฝ๊ธฐ (Balanced News Reading)

        ๊ธ์ •์ ์ธ ๊ธฐ์‚ฌ์™€ ๋ถ€์ •์ ์ธ ๊ธฐ์‚ฌ์ธ์ง€ ํ™•์ธํ•˜์—ฌ ๋‰ด์Šค๋ฅผ ์ฝ์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ์ตœ๊ทผ ๊ฒฝ์ œ๋‰ด์Šค๊ธฐ์‚ฌ๋ฅผ ๊ฐ€์ ธ์™€ Example์—์„œ ๋ฐ”๋กœ ํ™•์ธํ•  ์ˆ˜ ์žˆ๋„๋ก ๊ตฌ์„ฑํ–ˆ์Šต๋‹ˆ๋‹ค.

        ## 1. ์‚ฌ์šฉ๋ฐฉ๋ฒ•
        Daum๋‰ด์Šค์˜ ๊ฒฝ์ œ ๊ธฐ์‚ฌ๋ฅผ ๊ฐ€์ ธ์™€ ๋‚ด์šฉ์„ ์š”์•ฝํ•˜๊ณ  `Example`์— ๊ฐ€์ ธ์˜ต๋‹ˆ๋‹ค. ๊ฐ์ • ๋ถ„์„์„ ํ•˜๊ณ  ์‹ถ์€ ๊ธฐ์‚ฌ๋ฅผ `Examples`์—์„œ ์„ ํƒํ•ด์„œ `Submit`์„ ๋ˆ„๋ฅด๋ฉด `Classification`์—
        ํ•ด๋‹น ๊ธฐ์‚ฌ์˜ ๊ฐ์ • ํ‰๊ฐ€ ๊ฒฐ๊ณผ๊ฐ€ ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค. ๊ฐ์ •ํ‰๊ฐ€๋Š” ๊ฐ ์ƒํƒœ์˜ ํ™•๋ฅ  ์ •๋ณด์™€ ํ•จ๊ป˜ `neutral`, `positive`, `negative` 3๊ฐ€์ง€๋กœ ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค.

        ## 2. ๊ตฌ์กฐ ์„ค๋ช…
        ๋‰ด์Šค๊ธฐ์‚ฌ๋ฅผ ํฌ๋กค๋ง ๋ฐ ์š”์•ฝ ๋ชจ๋ธ์„ ์ด์šฉํ•œ ๊ธฐ์‚ฌ ์š”์•ฝ >> ๊ธฐ์‚ฌ ์š”์•ฝ์ •๋ณด Example์— ์ถ”๊ฐ€ >> ํ•œ๊ตญ์–ด fine-tunningํ•œ ๊ฐ์ •ํ‰๊ฐ€ ๋ชจ๋ธ์„ ์ด์šฉํ•ด ์ž…๋ ฅ๋œ ๊ธฐ์‚ฌ์— ๋Œ€ํ•œ ๊ฐ์ • ํ‰๊ฐ€ ์ง„ํ–‰
        """)

    with gr.Tab("๋ฐ๋ชจ"):
        # Free-text input fed to the hosted sentiment classifier below.
        Link_TXT = gr.Textbox(label="๋‰ด์Šค ๋‚ด์šฉ", placeholder = "๋‰ด์Šค ๊ธฐ์‚ฌ ๋‚ด์šฉ์„ ์ž…๋ ฅํ•˜์„ธ์š”.")
        # Mounts the remote HF model as the classification backend, wired to
        # Link_TXT as its input component.
        gr.load("models/gabrielyang/finance_news_classifier-KR_v7",
                inputs = Link_TXT)
        # Displays the source URL of the selected example (not classified).
        Link_URL = gr.Textbox(label="๋‰ด์Šค URL")
        
        # NOTE(review): this click handler re-scrapes into the module-level
        # news_data list, but gr.Examples below was already constructed from
        # the startup snapshot — clicking will not refresh the visible
        # examples. Confirm whether a returned/updated component was intended.
        update_button = gr.Button(value="๋‰ด์Šค ๋ฐ์ดํ„ฐ ์—…๋ฐ์ดํŠธ")
        update_button.click(fn=update_news_data, inputs=None, outputs=None)
        
        # Each news_data row fills [Link_TXT, Link_URL] when clicked.
        gr.Examples(
            news_data,
            [Link_TXT, Link_URL],
        )

if __name__ == "__main__":
    demo.launch()