# app.py
"""Streamlit page that estimates how similar two articles are.

The score blends three signals -- jieba word overlap, sentence-sequence
matching and TF-IDF cosine similarity -- into a 0-100 percentage, then maps
that percentage onto a short verdict and an emoji.
"""
import difflib
import time

import jieba
import numpy as np
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Relative weight of each signal in the blended score (word, sentence, TF-IDF).
WORD_WEIGHT = 0.4
SENTENCE_WEIGHT = 0.3
TFIDF_WEIGHT = 0.3

# (inclusive upper bound, verdict text, emoji), checked in ascending order.
_SCORE_BANDS = (
    (30, "兩篇文章沒有關係", "😌"),
    (60, "兩篇文章似乎有那麼一點關係", "🤔"),
    (80, "兩篇文章很類似", "😮"),
)
# Used when the score is above every bound in _SCORE_BANDS.
_TOP_BAND = ("兩篇文章有抄襲犯罪的味道", "😱")


def _tokenize(text):
    """Segment *text* with jieba, dropping whitespace-only tokens."""
    return [token for token in jieba.cut(text) if token.strip()]


def _split_sentences(text):
    """Split *text* on the full-width period, dropping blank fragments."""
    return [part.strip() for part in text.split("。") if part.strip()]


def _describe_score(score):
    """Return the ``(verdict, emoji)`` pair for a 0-100 similarity score."""
    for upper_bound, verdict, emoji in _SCORE_BANDS:
        if score <= upper_bound:
            return verdict, emoji
    return _TOP_BAND


def calculate_similarity(text1, text2):
    """Compute a blended similarity score for two texts.

    Args:
        text1: First article.
        text2: Second article.

    Returns:
        ``(score, verdict)`` where ``score`` is a percentage rounded to two
        decimals and ``verdict`` is the matching description, or
        ``(None, None)`` when either text is empty or whitespace-only.
    """
    if not text1.strip() or not text2.strip():
        return None, None

    # 1. Word overlap: Jaccard index over the jieba token sets. Both texts
    #    contain non-whitespace characters here, so the union is never empty.
    word_set1 = set(_tokenize(text1))
    word_set2 = set(_tokenize(text2))
    word_similarity = len(word_set1 & word_set2) / len(word_set1 | word_set2)

    # 2. Sentence similarity. Blank fragments (e.g. the one after a trailing
    #    "。") are dropped so they cannot count as a shared sentence, and
    #    SequenceMatcher's "two empty sequences are identical" rule is avoided.
    sentences1 = _split_sentences(text1)
    sentences2 = _split_sentences(text2)
    if sentences1 and sentences2:
        matcher = difflib.SequenceMatcher(None, sentences1, sentences2)
        sentence_similarity = matcher.ratio()
    else:
        sentence_similarity = 0.0

    # 3. TF-IDF cosine similarity. The default token pattern treats an
    #    unspaced Chinese run as a single token, so jieba does the splitting;
    #    token_pattern=None marks the pattern as intentionally unused.
    vectorizer = TfidfVectorizer(tokenizer=_tokenize, token_pattern=None)
    try:
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # Raised by scikit-learn when no usable vocabulary can be built.
        cosine_sim = 0.0
    else:
        cosine_sim = float(
            cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        )

    total_similarity = (
        word_similarity * WORD_WEIGHT
        + sentence_similarity * SENTENCE_WEIGHT
        + cosine_sim * TFIDF_WEIGHT
    ) * 100
    similarity_score = round(total_similarity, 2)

    verdict, _ = _describe_score(similarity_score)
    return similarity_score, verdict


# Page metadata; this must be the first Streamlit command of the script.
st.set_page_config(
    page_title="哞哞文章相似度檢測",
    page_icon="🐮",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# Custom CSS hook.
# NOTE(review): the style payload is blank in this copy of the file.
st.markdown(""" """, unsafe_allow_html=True)

# Page title.
st.markdown("""

🐮 哞哞文章相似度檢測

""", unsafe_allow_html=True)

# Two side-by-side input areas. The widget labels are hidden because the
# markdown headings above them already name each box; a non-empty label
# avoids Streamlit's empty-label accessibility warning.
col1, col2 = st.columns(2)

with col1:
    st.markdown("### 📝 文章1")
    text1 = st.text_area(
        "文章1",
        height=300,
        placeholder="請在這裡輸入第一篇文章...",
        key="text1",
        label_visibility="collapsed",
    )

with col2:
    st.markdown("### 📝 文章2")
    text2 = st.text_area(
        "文章2",
        height=300,
        placeholder="請在這裡輸入第二篇文章...",
        key="text2",
        label_visibility="collapsed",
    )

# Centre the action button in the middle of three equal columns.
col_btn1, col_btn2, col_btn3 = st.columns([1, 1, 1])
with col_btn2:
    start_btn = st.button("🚀 開始計算相似度", type="primary", use_container_width=True)

# Whitespace-only input is treated the same as no input at all.
if start_btn and text1.strip() and text2.strip():
    with st.spinner('🔍 分析中，請稍等...'):
        # Cosmetic progress animation (about one second in total).
        progress_text = "計算中..."
        my_bar = st.progress(0, text=progress_text)
        for percent_complete in range(100):
            time.sleep(0.01)
            my_bar.progress(percent_complete + 1, text=progress_text)

        similarity_score, result = calculate_similarity(text1, text2)

    # Always remove the progress bar, even if no score could be produced.
    my_bar.empty()

    if similarity_score is not None:
        st.markdown("---")
        st.markdown("""

✨ 分析結果

""", unsafe_allow_html=True)

        result_text = f"""

相似度：{similarity_score}%

分析結果：{result}

"""
        st.markdown(result_text, unsafe_allow_html=True)

        # Emoji that matches the verdict band.
        _, emoji = _describe_score(similarity_score)
        st.markdown(f"\n\n{emoji}\n\n", unsafe_allow_html=True)
else:
    st.info('👆 請在上方輸入兩篇要比較的文章，然後點擊"開始計算相似度"按鈕')

# Lawsuit information text.
lawsuit_info = """

馬的~這個小王八蛋抄襲我的文章! 一鍵網路告發直通車

·智慧財產局網站：https://www.tipo.gov.tw/

(備註)♡此次提告訴訟預估花費的公帑估算♡ 1. 人員成本（每月） • 法官薪資：約新台幣 80,000-100,000元 • 書記官薪資：約新台幣 45,000-60,000元 • 其他行政人員：約新台幣 40,000元 2. 訴訟時程與開庭資訊 • 智慧財產法院民事第一審平均結案日數：約240天 • 平均開庭次數：約3-4次 • 每次開庭約需2小時 3. 成本估算（以一個案件計）直接人事成本： 1. 法官成本： • 每月薪資÷20工作天÷8小時 × 2小時 × 4次開庭 • = (90,000÷160) × 2 × 4 = 約4,500元 2. 書記官成本： • 每月薪資÷20工作天÷8小時 × 2小時 × 4次開庭 • = (50,000÷160) × 2 × 4 = 約2,500元 3. 行政人員處理成本： • 約5,000元（含文書處理、歸檔等）間接成本： 1. 場地使用成本（含水電）：約3,000元 2. 文書處理成本：約2,000元 3. 行政作業成本：約3,000元總計：一個著作權侵權民事訴訟案件，預估會花費納稅人約新台幣20,000元。其他考量： 1. 若案件進入第二審，成本將增加約50-80% 2. 如果需要專業鑑定，費用會更高 3. 若有調解程序，可能會縮短訴訟時間，降低成本建議：你想靠這小賺一點錢的話，不如去新北市樹林區上午的菜市場給無照駕駛的老阿公或老阿罵撞還會理賠更多，有機會中大獎如民國104彭姓少年下樓買珍珠奶茶被阿罵無照駕駛機車撞住院住了半年案，可以獲得的理賠金更多。

"""

# NOTE(review): the original indentation is lost, so it is unclear whether
# this text belonged inside the `else` branch above; it is rendered
# unconditionally here -- confirm against the intended layout.
st.markdown(lawsuit_info, unsafe_allow_html=True)

# Footer: legend for the score bands.
st.markdown("---")
st.markdown("""

💡 判定標準：
0-30%：文章沒有關係 | 31-60%：稍有關係 | 61-80%：很類似 | 81-100%：疑似抄襲

""", unsafe_allow_html=True)