Upload datasets.ipynb
datasets.ipynb CHANGED (+582 -94)
@@ -9,18 +9,9 @@
 9 |   },
10 |   {
11 |   "cell_type": "code",
12 | - "execution_count":
13 |   "metadata": {},
14 | - "outputs": [
15 | -  {
16 | -   "name": "stderr",
17 | -   "output_type": "stream",
18 | -   "text": [
19 | -    "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
20 | -    " from .autonotebook import tqdm as notebook_tqdm\n"
21 | -   ]
22 | -  }
23 | - ],
24 |   "source": [
25 |   "from datasets import load_dataset\n",
26 |   "import pandas as pd \n",
@@ -39,7 +30,7 @@
39 |   },
40 |   {
41 |   "cell_type": "code",
42 | - "execution_count":
43 |   "metadata": {},
44 |   "outputs": [],
45 |   "source": [
@@ -50,7 +41,7 @@
50 |   },
51 |   {
52 |   "cell_type": "code",
53 | - "execution_count":
54 |   "metadata": {},
55 |   "outputs": [],
56 |   "source": [
@@ -60,7 +51,7 @@
60 |   },
61 |   {
62 |   "cell_type": "code",
63 | - "execution_count":
64 |   "metadata": {},
65 |   "outputs": [],
66 |   "source": [
@@ -71,7 +62,7 @@
71 |   },
72 |   {
73 |   "cell_type": "code",
74 | - "execution_count":
75 |   "metadata": {},
76 |   "outputs": [],
77 |   "source": [
@@ -101,7 +92,7 @@
101 |   },
102 |   {
103 |   "cell_type": "code",
104 | - "execution_count":
105 |   "metadata": {},
106 |   "outputs": [
107 |   {
@@ -151,7 +142,7 @@
151 |   },
152 |   {
153 |   "cell_type": "code",
154 | - "execution_count":
155 |   "metadata": {},
156 |   "outputs": [
157 |   {
@@ -206,12 +197,96 @@
206 |   "    return train_collection,test_collection\n",
207 |   "\n",
208 |   "# Train ve test datasetlerini MongoDB'ye yüklemek için fonksiyonu çağır\n",
209 | - "train_file_path = 'C:\\\\gitProjects\\\\
210 | - "test_file_path = 'C:\\\\gitProjects\\\\
211 |   "\n",
212 |   "train_collection, test_collection = dataset_read(train_file_path, test_file_path)"
213 |   ]
214 |   },
215 |   {
216 |   "cell_type": "markdown",
217 |   "metadata": {},
@@ -225,80 +300,466 @@
225 |   "metadata": {},
226 |   "outputs": [],
227 |   "source": [
228 |   "from sklearn.feature_extraction.text import TfidfVectorizer\n",
229 | - "from
230 |   "\n",
231 | - "
232 | - "
233 |   "\n",
234 | - "#text dosyasını koleksiyon üzerinden çekme \n",
235 | - "# Database sınıfı: Veritabanı bağlantıları ve verileri çekme işlevleri\n",
236 | - "# Database sınıfı: Veritabanı bağlantıları ve verileri çekme işlevleri\n",
237 |   "class Database:\n",
238 |   "    @staticmethod\n",
239 |   "    def get_mongodb():\n",
240 | - "
241 | - "        return 'mongodb://localhost:27017/', 'EgitimDatabase', 'train'\n",
242 |   "\n",
243 |   "    @staticmethod\n",
244 | - "    def
245 |   "        mongo_url, db_name, collection_name = Database.get_mongodb()\n",
246 |   "        client = MongoClient(mongo_url)\n",
247 |   "        db = client[db_name]\n",
248 |   "        collection = db[collection_name]\n",
249 | - "
250 | - "
251 | - "
252 | - "
253 | - "        title_count = len(title_from_db)\n",
254 | - "        return title_from_db, title_count\n",
255 |   "    \n",
256 |   "    @staticmethod\n",
257 | - "    def
258 | - "
259 | - "
260 | - "
261 | - "
262 | - "
263 | - "
264 | - "
265 | - "
266 | - "
267 |   "\n",
268 |   "\n",
269 | - "#
270 | - "
271 | - "
272 |   "\n",
273 | - "#
274 | - "
275 | - "
276 | - "
277 | - "
278 | - "
279 |   ]
280 |   },
281 |   {
282 | - "cell_type": "
283 |   "metadata": {},
284 |   "source": [
285 | - "
286 |   ]
287 |   },
288 |   {
289 |   "cell_type": "code",
290 | - "execution_count":
291 |   "metadata": {},
292 |   "outputs": [
293 |   {
294 | - "
295 | -
296 | -
297 | -
298 | -
299 | -
300 | - "
301 | -
302 |   }
303 |   ],
304 |   "source": [
@@ -315,6 +776,8 @@
315 |   "    def get_mongodb():\n",
316 |   "        return 'mongodb://localhost:27017/', 'EgitimDatabase', 'train'\n",
317 |   "\n",
318 |   "    @staticmethod\n",
319 |   "    def get_input_documents(limit=3):\n",
320 |   "        mongo_url, db_name, collection_name = Database.get_mongodb()\n",
@@ -322,11 +785,11 @@
322 |   "        db = client[db_name]\n",
323 |   "        collection = db[collection_name]\n",
324 |   "        cursor = collection.find().limit(limit)\n",
325 | - "
326 | - "        document_count = len(
327 |   "    \n",
328 |   "        # Dökümanları isimlendir\n",
329 | - "        named_documents = {f'döküman {i+1}': doc for i, doc in enumerate(
330 |   "    \n",
331 |   "        return named_documents, document_count\n",
332 |   "\n",
@@ -353,7 +816,7 @@
353 |   "        return Database.get_input_documents(limit)\n",
354 |   "\n",
355 |   "# Kullanım örneği\n",
356 | - "named_documents, document_count = Tf.get_input_documents(limit=
357 |   "\n",
358 |   "#tf-ıdf ile döküman içerisinden kelime seçme \n",
359 |   "\n",
@@ -387,23 +850,30 @@
387 |   "    for word, score in sorted_words[:3]:\n",
388 |   "        print(\"\\tWord: {}, TF-IDF: {}\".format(word, round(score, 5)))\n",
389 |   "\n",
390 |   "turkish_stop_words = [\n",
391 |   "    'ah', 'ama', 'an', 'ancak', 'araba', 'aralar', 'aslında', \n",
392 | - "
393 | - "    '
394 | - "    '
395 | - "    '
396 | - "    '
397 |   "]\n",
398 |   "\n",
399 | - "#
400 | - "def calculate_tfidf(
401 |   "    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=10000)\n",
402 | - "    tfidf_matrix = vectorizer.fit_transform(
403 |   "    feature_names = vectorizer.get_feature_names_out()\n",
404 |   "    return tfidf_matrix, feature_names\n",
405 |   "\n",
406 | - "
407 |   "#kelimelerin ortalama skorlarını hesaplama \n",
408 |   "def identify_low_tfidf_words(tfidf_matrix, feature_names, threshold=0.001):\n",
409 |   "    # TF-IDF skorlarını toplayarak her kelimenin ortalama skorunu hesaplayın\n",
@@ -411,48 +881,54 @@
411 |   "    low_tfidf_words = [feature_names[i] for i, score in enumerate(avg_scores) if score < threshold]\n",
412 |   "    return low_tfidf_words\n",
413 |   "\n",
414 | - "#kelimelerin güncellenmesi \n",
415 |   "def update_stop_words(existing_stop_words, low_tfidf_words):\n",
416 |   "    updated_stop_words = set(existing_stop_words) | set(low_tfidf_words)\n",
417 |   "    return list(updated_stop_words)\n",
418 |   "\n",
419 |   "\n",
420 | - "
421 |   "    stop_words = set(initial_stop_words)\n",
422 |   "    for _ in range(iterations):\n",
423 | - "        tfidf_matrix, feature_names = calculate_tfidf(
424 |   "        low_tfidf_words = identify_low_tfidf_words(tfidf_matrix, feature_names)\n",
425 |   "        stop_words = update_stop_words(stop_words, low_tfidf_words)\n",
426 |   "    return list(stop_words)\n",
427 | - "
428 |   "\n",
429 |   "\n",
430 |   "def main ():\n",
431 |   "\n",
432 |   "#anlam ilişkisini de kontrol edecek bir yapı oluşpturulacak title ile benzerlik kontrol ederek yüksek benzerlik içeren kelimler sıralnacak .\n",
433 |   "\n",
434 |   "# Dökümanları liste olarak al\n",
435 |   "    documents_list = [doc.get('text', '') if isinstance(doc, dict) else doc for doc in list(named_documents.values())]\n",
436 |   "\n",
437 |   "    #tf-ıdf hesaplama\n",
438 | - "    tfidf_matrix, feature_names=calculate_tfidf(documents_list,
439 |   "\n",
440 | - "
441 | - "    named_documents, document_count = Database.get_input_documents(limit=3)\n",
442 |   "\n",
443 | - "
444 | - "
445 |   "\n",
446 | - "
447 | - "    final_stop_words = iterative_update(documents_list, initial_stop_words)\n",
448 |   "\n",
449 | - "
450 |   "\n",
451 |   "\n",
452 |   "# Sonuçları yazdır\n",
453 | - "
454 | - "
455 | - "
456 |   "\n",
457 |   "    print(\"\\nDökümanlar Listesi:\")\n",
458 |   "    print(documents_list)\n",
|
@@ -534,9 +1010,21 @@
534 |   },
535 |   {
536 |   "cell_type": "code",
537 | - "execution_count":
538 |   "metadata": {},
539 | - "outputs": [
540 |   "source": [
541 |   "\n",
542 |   "#---------------------------------------------------------------------------------------------------------------------------------\n",
  9 |   },
 10 |   {
 11 |   "cell_type": "code",
 12 | + "execution_count": 6,
 13 |   "metadata": {},
 14 | + "outputs": [],
 15 |   "source": [
 16 |   "from datasets import load_dataset\n",
 17 |   "import pandas as pd \n",

 30 |   },
 31 |   {
 32 |   "cell_type": "code",
 33 | + "execution_count": 8,
 34 |   "metadata": {},
 35 |   "outputs": [],
 36 |   "source": [

 41 |   },
 42 |   {
 43 |   "cell_type": "code",
 44 | + "execution_count": 9,
 45 |   "metadata": {},
 46 |   "outputs": [],
 47 |   "source": [

 51 |   },
 52 |   {
 53 |   "cell_type": "code",
 54 | + "execution_count": 10,
 55 |   "metadata": {},
 56 |   "outputs": [],
 57 |   "source": [

 62 |   },
 63 |   {
 64 |   "cell_type": "code",
 65 | + "execution_count": 11,
 66 |   "metadata": {},
 67 |   "outputs": [],
 68 |   "source": [

 92 |   },
 93 |   {
 94 |   "cell_type": "code",
 95 | + "execution_count": 12,
 96 |   "metadata": {},
 97 |   "outputs": [
 98 |   {

142 |   },
143 |   {
144 |   "cell_type": "code",
145 | + "execution_count": 13,
146 |   "metadata": {},
147 |   "outputs": [
148 |   {

197 |   "    return train_collection,test_collection\n",
198 |   "\n",
199 |   "# Train ve test datasetlerini MongoDB'ye yüklemek için fonksiyonu çağır\n",
200 | + "train_file_path = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\train_Egitim\\\\merged_train.parquet'\n",
201 | + "test_file_path = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\test_Egitim\\\\merged_test.parquet'\n",
202 |   "\n",
203 |   "train_collection, test_collection = dataset_read(train_file_path, test_file_path)"
204 |   ]
205 |   },
206 | + {
207 | + "cell_type": "code",
208 | + "execution_count": null,
209 | + "metadata": {},
210 | + "outputs": [],
211 | + "source": [
212 | + "import pandas as pd\n",
213 | + "from pymongo import MongoClient,errors\n",
214 | + "from sklearn.feature_extraction.text import TfidfVectorizer\n",
215 | + "from sentence_transformers import SentenceTransformer\n",
216 | + "\n",
217 | + "# MongoDB bağlantı ve koleksiyon seçimi için fonksiyon\n",
218 | + "def get_mongodb(database_name='EgitimDatabase', train_collection_name='train', test_collection_name='test', host='localhost', port=27017):\n",
219 | + "    client = MongoClient(f'mongodb://{host}:{port}/')\n",
220 | + "    db = client[database_name]\n",
221 | + "    train_collection = db[train_collection_name]\n",
222 | + "    test_collection = db[test_collection_name]\n",
223 | + "    return train_collection, test_collection\n",
224 | + "\n",
225 | + "# Dataset'i MongoDB'ye yükleme fonksiyonu\n",
226 | + "def dataset_read(train_file_path, test_file_path):\n",
227 | + "    try:\n",
228 | + "        # MongoDB koleksiyonlarını al\n",
229 | + "        train_collection, test_collection = get_mongodb()\n",
230 | + "\n",
231 | + "        # Eğer koleksiyonlar zaten doluysa, veri yüklemesi yapma\n",
232 | + "        if train_collection.estimated_document_count() > 0 or test_collection.estimated_document_count() > 0:\n",
233 | + "            print(\"Veriler zaten yüklendi, işlem yapılmadı.\")\n",
234 | + "            return train_collection, test_collection\n",
235 | + "\n",
236 | + "        # Datasetleri oku\n",
237 | + "        data_train = pd.read_parquet(train_file_path, columns=['id', 'url', 'title', 'text'])\n",
238 | + "        data_test = pd.read_parquet(test_file_path, columns=['id', 'url', 'title', 'text'])\n",
239 | + "\n",
240 | + "        # Verileri MongoDB'ye yükle\n",
241 | + "        train_collection.insert_many(data_train.to_dict(\"records\"))\n",
242 | + "        test_collection.insert_many(data_test.to_dict(\"records\"))\n",
243 | + "\n",
244 | + "        print(f\"Veriler başarıyla {train_collection.name} koleksiyonuna yüklendi.\")\n",
245 | + "        print(f\"Veriler başarıyla {test_collection.name} koleksiyonuna yüklendi.\")\n",
246 | + "    \n",
247 | + "    except errors.PyMongoError as e:\n",
248 | + "        print(f\"Veri yükleme sırasında hata oluştu: {e}\")\n",
249 | + "\n",
250 | + "    return train_collection, test_collection\n",
251 | + "\n",
252 | + "\n",
253 | + "\n",
254 | + "# Database sınıfı: Veritabanı bağlantıları ve verileri çekme işlevleri\n",
255 | + "class Database:\n",
256 | + "    @staticmethod\n",
257 | + "    def get_mongodb():\n",
258 | + "        return get_mongodb()\n",
259 | + "\n",
260 | + "    @staticmethod\n",
261 | + "    def get_titles_and_texts():\n",
262 | + "        # MongoDB bağlantısı ve koleksiyonları al\n",
263 | + "        train_collection, _ = Database.get_mongodb()\n",
264 | + "\n",
265 | + "        # Sorgu: Hem \"title\" hem de \"text\" alanı mevcut olan belgeler\n",
266 | + "        query = {\"title\": {\"$exists\": True}, \"text\": {\"$exists\": True}}\n",
267 | + "\n",
268 | + "        # Belirtilen alanları seçiyoruz: \"title\", \"text\"\n",
269 | + "        cursor = train_collection.find(query, {\"title\": 1, \"text\": 1, \"_id\": 0})\n",
270 | + "\n",
271 | + "        # Başlık ve metinleri doğru bir şekilde birleştiriyoruz\n",
272 | + "        documents = [{\"title\": doc['title'], \"text\": doc['text']} for doc in cursor]\n",
273 | + "        document_count = len(documents)\n",
274 | + "        return documents, document_count\n",
275 | + "\n",
276 | + "# Train ve test datasetlerini MongoDB'ye yüklemek için fonksiyonu çağır\n",
277 | + "train_file_path = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\train_Egitim\\\\merged_train.parquet'\n",
278 | + "test_file_path = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\test_Egitim\\\\merged_test.parquet'\n",
279 | + "\n",
280 | + "train_collection, test_collection = dataset_read(train_file_path, test_file_path)\n",
281 | + "\n",
282 | + "# Veritabanından başlıklar ve metinler alınır\n",
283 | + "documents, document_count = Database.get_titles_and_texts()\n",
284 | + "\n",
285 | + "# Sonuçların belirlenmesi\n",
286 | + "print(f\"Başlık ve metin çiftleri: {documents}\")\n",
287 | + "print(f\"Toplam çift sayısı: {document_count}\")\n"
288 | + ]
289 | + },
|
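A quick sanity check of the load performed by the new cell above, written as a minimal sketch (not part of this commit), assuming the default localhost MongoDB and the 'EgitimDatabase' / 'train' / 'test' names used in get_mongodb():

from pymongo import MongoClient

# Hypothetical verification: count what dataset_read() inserted.
client = MongoClient('mongodb://localhost:27017/')
db = client['EgitimDatabase']
print('train documents:', db['train'].count_documents({}))
print('test documents:', db['test'].count_documents({}))

Because dataset_read() skips insertion when estimated_document_count() is already positive, these counts should stay stable across repeated runs of the cell.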
290 |   {
291 |   "cell_type": "markdown",
292 |   "metadata": {},

300 |   "metadata": {},
301 |   "outputs": [],
302 |   "source": [
303 | + "\"\"\"@staticmethod\n",
304 | + "    def get_input_titles():\n",
305 | + "        collection = Database.get_mongodb(collection_name='train')\n",
306 | + "        query = {\"title\": {\"$exists\": True}}\n",
307 | + "        cursor = collection.find(query, {\"title\": 1, \"_id\": 0})\n",
308 | + "        title_from_db = [doc['title'] for doc in cursor]\n",
309 | + "\n",
310 | + "        return title_from_db\"\"\"\n",
311 | + "\n",
312 | + "\"\"\"@staticmethod\n",
313 | + "    def get_input_texts():\n",
314 | + "        collection = Database.get_mongodb(collection_name='train')\n",
315 | + "        query = {\"texts\": {\"$exists\": True}}\n",
316 | + "        cursor = collection.find(query, {\"texts\": 1, \"_id\": 0})\n",
317 | + "        texts_from_db = [doc['texts'] for doc in cursor]\n",
318 | + "        return texts_from_db\"\"\"\n",
319 | + "    \n",
320 | + "    #bin tane veri çekerek csv dosyası olarak kaydetme \n",
321 | + "    \n",
322 | + "    \n",
323 | + "\"\"\"@staticmethod\n",
324 | + "    def get_titles_and_texts(batch_size=1000):\n",
325 | + "\n",
326 | + "    \n",
327 | + "        titles = Database.get_input_titles(batch_size=batch_size)\n",
328 | + "        texts = Database.get_input_texts(batch_size=batch_size )\n",
329 | + "    \n",
330 | + "\n",
331 | + "\n",
332 | + "    def test_queries():\n",
333 | + "\n",
334 | + "        collection = Database.get_mongodb(collection_name='train')\n",
335 | + "        # Başlık sorgusu\n",
336 | + "        titles_cursor = collection.find({\"title\": {\"$exists\": True}}, {\"title\": 1, \"_id\": 0})\n",
337 | + "        titles = [doc['title'] for doc in titles_cursor]\n",
338 | + "    \n",
339 | + "\n",
340 | + "        # Metin sorgusu\n",
341 | + "        texts_cursor = collection.find({\"text\": {\"$exists\": True}}, {\"text\": 1, \"_id\": 0})\n",
342 | + "        texts = [doc['text'] for doc in texts_cursor]\n",
343 | + "    \n",
344 | + "        # Başlık ve metinlerin eşleşmesini sağlamak için zip kullanarak birleştiriyoruz\n",
345 | + "        documents = [{\"title\": title, \"text\": text} for title, text in zip(titles, texts)]\n",
346 | + "        document_count = len(documents)\n",
347 | + "        return documents, document_count\n",
348 | + "\n",
349 | + "Database.test_queries()\n",
350 | + "\n",
351 | + "# Veritabanından başlıklar ve metinler alınır\n",
352 | + "documents, document_count = Database.get_titles_and_texts(batch_size=1000)\n",
353 | + "\n",
354 | + "# Sonuçların belirlenmesi\n",
355 | + "print(f\"Başlık ve metin çiftleri: {documents}\")\n",
356 | + "print(f\"Toplam çift sayısı: {document_count}\")\"\"\""
357 | + ]
358 | + },
359 | + {
360 | + "cell_type": "markdown",
361 | + "metadata": {},
362 | + "source": [
363 | + "Output'u vereceğimiz title ve textin kodu"
364 | + ]
365 | + },
366 | + {
367 | + "cell_type": "code",
368 | + "execution_count": 8,
369 | + "metadata": {},
370 | + "outputs": [
371 | +  {
372 | +   "name": "stdout",
373 | +   "output_type": "stream",
374 | +   "text": [
375 | +    "0    **Pşıqo Ahecaqo** Pşıqo Ahecaqo (), Çerkes siy...\n",
376 | +    "1    **Craterolophinae** Craterolophinae, Depastrid...\n",
377 | +    "2    **Notocrabro** Notocrabro Crabronina oymağına ...\n",
378 | +    "3    **Ibrahim Sissoko** İbrahim Sissoko (d. 30 Kas...\n",
379 | +    "4    **Salah Cedid** Salah Cedid (1926-1993) (Arapç...\n",
380 | +    "Name: combined, dtype: object\n",
381 | +    "Veriler combined_output.csv dosyasına başarıyla kaydedildi.\n"
382 | +   ]
383 | +  }
384 | + ],
385 | + "source": [
386 | + "from pymongo import MongoClient\n",
387 | + "import pandas as pd\n",
388 | + "from tqdm.auto import tqdm, trange\n",
389 | + "\n",
390 | + "# Database bağlantıları ve verileri çekme işlevleri\n",
391 | + "class Database:\n",
392 | + "    @staticmethod\n",
393 | + "    def get_mongodb(database_name='EgitimDatabase', train_collection_name='train', test_collection_name='test', host='localhost', port=27017):\n",
394 | + "        client = MongoClient(f'mongodb://{host}:{port}/')\n",
395 | + "        db = client[database_name]\n",
396 | + "        train_collection = db[train_collection_name]\n",
397 | + "        test_collection = db[test_collection_name]\n",
398 | + "        return train_collection, test_collection\n",
399 | + "\n",
400 | + "    def export_to_csv(batch_size=1000, output_file='combined_output.csv'):\n",
401 | + "        train_collection, _ = Database.get_mongodb()\n",
402 | + "        cursor = train_collection.find({}, {\"title\": 1, \"text\": 1, \"_id\": 0})\n",
403 | + "        cursor = cursor.batch_size(batch_size)  # Fix: Call batch_size on the cursor object\n",
404 | + "\n",
405 | + "        # Verileri DataFrame'e dönüştürme\n",
406 | + "        df= pd.DataFrame(list(cursor))\n",
407 | + "        \n",
408 | + "        # title ve text sütunlarını birleştirme\n",
409 | + "        df['combined'] = df.apply(lambda row: f'**{row[\"title\"]}** {row[\"text\"]}', axis=1)\n",
410 | + "        \n",
411 | + "        #title,text and combined sütunlarını ayrı ayrı tutma\n",
412 | + "        #df2['title_only'] = df2['title']\n",
413 | + "        #df2['text_only'] = df2['text']\n",
414 | + "        #df['combined']= output_file\n",
415 | + "\n",
416 | + "        # Sonuçları kontrol etme\n",
417 | + "        combined_text= df['combined'] \n",
418 | + "        # Print the combined column directly\n",
419 | + "        \n",
420 | + "        print(combined_text.head())\n",
421 | + "\n",
422 | + "        # Birleşmiş verileri CSV'ye kaydetme\n",
423 | + "        \n",
424 | + "        df.to_csv(output_file, index=False)\n",
425 | + "        \n",
426 | + "        print(f\"Veriler combined_output.csv dosyasına başarıyla kaydedildi.\")\n",
427 | + "        \n",
428 | + "\n",
429 | + "# CSV dosyasını okuma ve birleştirme işlemi\n",
430 | + "Database.export_to_csv()"
431 | + ]
432 | + },
|
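To confirm what export_to_csv() wrote, a minimal read-back sketch (not in the commit; it assumes the default combined_output.csv in the working directory) could be:

import pandas as pd

# Hypothetical check of the file produced by Database.export_to_csv() above.
df = pd.read_csv('combined_output.csv')
print(df.columns.tolist())    # expect at least 'title', 'text', 'combined'
print(df['combined'].head())  # rows formatted as '**<title>** <text>'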
433 | + {
434 | + "cell_type": "markdown",
435 | + "metadata": {},
436 | + "source": [
437 | + "TF-IDF HESAPLAMA"
438 | + ]
439 | + },
440 | + {
441 | + "cell_type": "code",
442 | + "execution_count": 20,
443 | + "metadata": {},
444 | + "outputs": [
445 | +  {
446 | +   "name": "stderr",
447 | +   "output_type": "stream",
448 | +   "text": [
449 | +    "[nltk_data] Downloading package wordnet to\n",
450 | +    "[nltk_data]     C:\\Users\\info\\AppData\\Roaming\\nltk_data...\n",
451 | +    "[nltk_data]   Package wordnet is already up-to-date!\n",
452 | +    "[nltk_data] Downloading package omw-1.4 to\n",
453 | +    "[nltk_data]     C:\\Users\\info\\AppData\\Roaming\\nltk_data...\n",
454 | +    "[nltk_data]   Package omw-1.4 is already up-to-date!\n",
455 | +    "[nltk_data] Downloading package stopwords to\n",
456 | +    "[nltk_data]     C:\\Users\\info\\AppData\\Roaming\\nltk_data...\n",
457 | +    "[nltk_data]   Package stopwords is already up-to-date!\n"
458 | +   ]
459 | +  },
460 | +  {
461 | +   "ename": "ValueError",
462 | +   "evalue": "empty vocabulary; perhaps the documents only contain stop words",
463 | +   "output_type": "error",
464 | +   "traceback": [
465 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
466 |
+
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
467 |
+
"Cell \u001b[1;32mIn[20], line 100\u001b[0m\n\u001b[0;32m 97\u001b[0m documents, document_count \u001b[38;5;241m=\u001b[39m Database\u001b[38;5;241m.\u001b[39mget_input_documents()\n\u001b[0;32m 99\u001b[0m \u001b[38;5;66;03m# Calculate TF-IDF and get feature names\u001b[39;00m\n\u001b[1;32m--> 100\u001b[0m tfidf_matrix, feature_names \u001b[38;5;241m=\u001b[39m \u001b[43mDatabase\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate_tfidf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mturkish_stop_words\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 102\u001b[0m \u001b[38;5;66;03m# Extract keywords\u001b[39;00m\n\u001b[0;32m 103\u001b[0m keywords \u001b[38;5;241m=\u001b[39m Database\u001b[38;5;241m.\u001b[39mextract_keywords(tfidf_matrix, feature_names, stop_words\u001b[38;5;241m=\u001b[39mturkish_stop_words)\n",
|
468 |
+
"Cell \u001b[1;32mIn[20], line 43\u001b[0m, in \u001b[0;36mDatabase.calculate_tfidf\u001b[1;34m(documents, stop_words)\u001b[0m\n\u001b[0;32m 40\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[0;32m 41\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcalculate_tfidf\u001b[39m(documents, stop_words):\n\u001b[0;32m 42\u001b[0m vectorizer \u001b[38;5;241m=\u001b[39m TfidfVectorizer(stop_words\u001b[38;5;241m=\u001b[39mstop_words, max_features\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10000\u001b[39m,min_df\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m)\n\u001b[1;32m---> 43\u001b[0m tfidf_matrix \u001b[38;5;241m=\u001b[39m \u001b[43mvectorizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 44\u001b[0m feature_names \u001b[38;5;241m=\u001b[39m vectorizer\u001b[38;5;241m.\u001b[39mget_feature_names_out()\n\u001b[0;32m 45\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tfidf_matrix, feature_names\n",
|
469 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:2091\u001b[0m, in \u001b[0;36mTfidfVectorizer.fit_transform\u001b[1;34m(self, raw_documents, y)\u001b[0m\n\u001b[0;32m 2084\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_params()\n\u001b[0;32m 2085\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf \u001b[38;5;241m=\u001b[39m TfidfTransformer(\n\u001b[0;32m 2086\u001b[0m norm\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnorm,\n\u001b[0;32m 2087\u001b[0m use_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muse_idf,\n\u001b[0;32m 2088\u001b[0m smooth_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msmooth_idf,\n\u001b[0;32m 2089\u001b[0m sublinear_tf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msublinear_tf,\n\u001b[0;32m 2090\u001b[0m )\n\u001b[1;32m-> 2091\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mraw_documents\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2092\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf\u001b[38;5;241m.\u001b[39mfit(X)\n\u001b[0;32m 2093\u001b[0m \u001b[38;5;66;03m# X is already a transformed view of raw_documents so\u001b[39;00m\n\u001b[0;32m 2094\u001b[0m \u001b[38;5;66;03m# we set copy to False\u001b[39;00m\n",
|
470 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fit_method(estimator, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
|
471 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:1372\u001b[0m, in \u001b[0;36mCountVectorizer.fit_transform\u001b[1;34m(self, raw_documents, y)\u001b[0m\n\u001b[0;32m 1364\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 1365\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUpper case characters found in\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1366\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m vocabulary while \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlowercase\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1367\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m is True. These entries will not\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1368\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m be matched with any documents\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1369\u001b[0m )\n\u001b[0;32m 1370\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m-> 1372\u001b[0m vocabulary, X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_count_vocab\u001b[49m\u001b[43m(\u001b[49m\u001b[43mraw_documents\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfixed_vocabulary_\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1374\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbinary:\n\u001b[0;32m 1375\u001b[0m X\u001b[38;5;241m.\u001b[39mdata\u001b[38;5;241m.\u001b[39mfill(\u001b[38;5;241m1\u001b[39m)\n",
|
472 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:1278\u001b[0m, in \u001b[0;36mCountVectorizer._count_vocab\u001b[1;34m(self, raw_documents, fixed_vocab)\u001b[0m\n\u001b[0;32m 1276\u001b[0m vocabulary \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(vocabulary)\n\u001b[0;32m 1277\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m vocabulary:\n\u001b[1;32m-> 1278\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 1279\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mempty vocabulary; perhaps the documents only contain stop words\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1280\u001b[0m )\n\u001b[0;32m 1282\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m indptr[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m>\u001b[39m np\u001b[38;5;241m.\u001b[39miinfo(np\u001b[38;5;241m.\u001b[39mint32)\u001b[38;5;241m.\u001b[39mmax: \u001b[38;5;66;03m# = 2**31 - 1\u001b[39;00m\n\u001b[0;32m 1283\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _IS_32BIT:\n",
|
473 |
+
"\u001b[1;31mValueError\u001b[0m: empty vocabulary; perhaps the documents only contain stop words"
|
474 | +   ]
475 | +  }
476 | + ],
477 | + "source": [
478 | + "#---------------------------güncel en yeni \n",
479 | + "from pymongo import MongoClient\n",
480 |   "from sklearn.feature_extraction.text import TfidfVectorizer\n",
481 | + "from textblob import TextBlob as tb\n",
482 | + "import numpy as np\n",
483 | + "import math\n",
484 | + "from tqdm.auto import tqdm, trange\n",
485 | + "import tensorflow as tf\n",
486 | + "import nltk\n",
487 | + "from nltk.stem import WordNetLemmatizer\n",
488 | + "from nltk.corpus import stopwords\n",
489 |   "\n",
490 | + "turkish_stop_words = stopwords.words('turkish')\n",
491 | + "\n",
492 | + "nltk.download('wordnet')\n",
493 | + "nltk.download('omw-1.4')\n",
494 | + "nltk.download('stopwords')\n",
495 | + "\n",
496 | + "\n",
497 | + "import matplotlib.pyplot as plt \n",
498 |   "\n",
499 |   "class Database:\n",
500 |   "    @staticmethod\n",
501 |   "    def get_mongodb():\n",
502 | + "        return 'mongodb://localhost:27017/', 'combined', 'combined_output'\n",
503 |   "\n",
504 | + "    # Get input documents from MongoDB\n",
505 |   "    @staticmethod\n",
506 | + "    def get_input_documents(limit=1000):\n",
507 |   "        mongo_url, db_name, collection_name = Database.get_mongodb()\n",
508 |   "        client = MongoClient(mongo_url)\n",
509 |   "        db = client[db_name]\n",
510 |   "        collection = db[collection_name]\n",
511 | + "        cursor = collection.find().limit(limit)\n",
512 | + "        combined_text = [doc['text'] for doc in cursor]\n",
513 | + "        document_count = len(combined_text)\n",
514 | + "        return combined_text, document_count\n",
515 |   "    \n",
516 | + "    # Calculate TF-IDF and get feature names\n",
517 |   "    @staticmethod\n",
518 | + "    def calculate_tfidf(documents, stop_words):\n",
519 | + "        vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=10000,min_df=2)\n",
520 | + "        tfidf_matrix = vectorizer.fit_transform(documents)\n",
521 | + "        feature_names = vectorizer.get_feature_names_out()\n",
522 | + "        return tfidf_matrix, feature_names\n",
523 | + "\n",
524 | + "    # Extract keywords using TF-IDF\n",
525 | + "    def extract_keywords(tfidf_matrix, feature_names, top_n=10, stop_words=[]):\n",
526 | + "        keywords = {}\n",
527 | + "        for doc_idx, row in enumerate(tfidf_matrix):\n",
528 | + "            filtered_feature_names = [name for name in feature_names if name.lower() not in stop_words]\n",
529 | + "            scores = np.asarray(row.T.todense()).flatten()\n",
530 | + "            sorted_indices = np.argsort(scores)[::-1]\n",
531 | + "            top_features = sorted_indices[:top_n]\n",
532 | + "            doc_keywords = [(filtered_feature_names[idx], scores[idx]) for idx in top_features]\n",
533 | + "            keywords[f'document_{doc_idx+1}'] = doc_keywords\n",
534 | + "        return keywords\n",
535 | + "    \n",
536 | + "    #zip keywords and combined text \n",
537 | + "    \n",
538 | + "    # Identify low TF-IDF words\n",
539 | + "    @staticmethod\n",
540 | + "    def identify_low_tfidf_words(tfidf_matrix, feature_names, threshold=0.001):\n",
541 | + "        avg_scores = np.mean(tfidf_matrix, axis=0).A1\n",
542 | + "        low_tfidf_words = [feature_names[i] for i, score in enumerate(avg_scores) if score < threshold]\n",
543 | + "        return low_tfidf_words\n",
544 | + "    \n",
545 | + "    # Update stop words with low TF-IDF words\n",
546 | + "    @staticmethod\n",
547 | + "    def update_stop_words(existing_stop_words, low_tfidf_words):\n",
548 | + "        updated_stop_words = set(existing_stop_words) | set(low_tfidf_words)\n",
549 | + "        return list(updated_stop_words)\n",
550 |   "\n",
551 |   "\n",
552 | + "#tf-ıdf ile döküman içerisinden kelime seçme \n",
553 | + "#Term Frequency (TF): Bir kelimenin belli bir dökümanda tekrar etme değeri\n",
554 | + "#Inverse Document Frequency (IDF):bir kelimenin tüm dökümanlar arasındaki yaygınlığı Nadir bulunan kelimeler, daha yüksek IDF değerine sahip olur.\n",
555 | + "#tf-ıdf skoru ise bu ikisinin çarpımıdır.\n",
556 |   "\n",
557 | + "    #buraya eşik değer belirlenmeli\n",
558 | + "\n",
559 | + "\n",
560 | + "turkish_stop_words = [\n",
561 | + "    'ah', 'ama', 'an', 'ancak', 'araba', 'aralar', 'aslında', \n",
562 | + "    'b', 'başlayan', 'bağlı', 'bazı', 'belirli', 'ben', 'bence', \n",
563 | + "    'birkaç', 'birlikte', 'bunu', 'burada', 'biten', 'biz', \n",
564 | + "    'bu', 'buna', 'çünkü', 'da', 'de', 'demek', 'den', 'derken', \n",
565 | + "    'değil', 'daha', 'dolayı', 'edilir', 'eğer', 'en', 'fakat', \n",
566 | + "    'genellikle', 'gibi', 'hem', 'her', 'herhangi', 'hiç', 'ise', \n",
567 | + "    'işte', 'itibaren', 'iyi', 'kadar', 'karşı', 'ki', 'kime', \n",
568 | + "    'kısaca', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'niye', 'o', \n",
569 | + "    'olasılıkla', 'olabilir', 'oluşur', 'önce', 'şu', 'sadece', \n",
570 | + "    'se', 'şey', 'şimdi', 'tabi', 'tüm', 've', 'ya', 'ya da', \n",
571 | + "    'yanı', 'yani', 'yılında', 'yetenekli', 'yine'\n",
572 | + "]\n",
573 | + "# Get input documents\n",
574 | + "documents, document_count = Database.get_input_documents()\n",
575 | + "\n",
576 | + "# Calculate TF-IDF and get feature names\n",
577 | + "tfidf_matrix, feature_names = Database.calculate_tfidf(documents, turkish_stop_words)\n",
578 | + "\n",
579 | + "# Extract keywords\n",
580 | + "keywords = Database.extract_keywords(tfidf_matrix, feature_names, stop_words=turkish_stop_words)\n",
581 | + "print(keywords)\n",
582 | + "\n",
583 | + "# Identify low TF-IDF words\n",
584 | + "low_tfidf_words = Database.identify_low_tfidf_words(tfidf_matrix, feature_names)\n",
585 | + "print(low_tfidf_words)\n",
586 | + "\n",
587 | + "# Update stop words\n",
588 | + "updated_stop_words = Database.update_stop_words(turkish_stop_words, low_tfidf_words)\n",
589 | + "print(updated_stop_words) "
590 | + ]
591 |   },
|
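The ValueError recorded in the cell above ("empty vocabulary; perhaps the documents only contain stop words") is what TfidfVectorizer raises when, after removing the stop words and applying the min_df=2 cutoff, no term is left in the vocabulary. A minimal sketch with toy documents (illustrative only, not the notebook's data) showing how the error arises and how relaxing min_df avoids it:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["ve bu bir deneme metni", "ve bu başka bir deneme"]
stop = ['ve', 'bu', 'bir']

# min_df=2 keeps only terms present in at least 2 documents; combined with the stop
# list, short inputs can end up with nothing and reproduce the error above.
try:
    TfidfVectorizer(stop_words=stop, min_df=2).fit_transform(["ve bu", "ve bu"])
except ValueError as e:
    print(e)  # empty vocabulary; perhaps the documents only contain stop words

# With a lower min_df (or longer documents) the vocabulary survives.
X = TfidfVectorizer(stop_words=stop, min_df=1).fit_transform(docs)
print(X.shape)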
592 |
{
|
593 |
+
"cell_type": "code",
|
594 |
+
"execution_count": 15,
|
595 |
"metadata": {},
|
596 |
+
"outputs": [
|
597 |
+
{
|
598 |
+
"ename": "TypeError",
|
599 |
+
"evalue": "unhashable type: 'set'",
|
600 |
+
"output_type": "error",
|
601 |
+
"traceback": [
|
602 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
603 |
+
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
|
604 |
+
"Cell \u001b[1;32mIn[15], line 162\u001b[0m\n\u001b[0;32m 159\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tfidf_matrix, feature_names,keywords\n\u001b[0;32m 161\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m--> 162\u001b[0m tfidf_matrix, feature_names,keywords\u001b[38;5;241m=\u001b[39m \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 164\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAnahtar Kelimler:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 165\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m doc, words \u001b[38;5;129;01min\u001b[39;00m keywords\u001b[38;5;241m.\u001b[39mitems():\n",
|
605 |
+
"Cell \u001b[1;32mIn[15], line 148\u001b[0m, in \u001b[0;36mmain\u001b[1;34m()\u001b[0m\n\u001b[0;32m 146\u001b[0m initial_stop_words \u001b[38;5;241m=\u001b[39m turkish_stop_words\n\u001b[0;32m 147\u001b[0m \u001b[38;5;66;03m# Stop-words listesini iteratif olarak güncelleyin\u001b[39;00m\n\u001b[1;32m--> 148\u001b[0m final_stop_words \u001b[38;5;241m=\u001b[39m \u001b[43miterative_update\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments_list\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minitial_stop_words\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 149\u001b[0m \u001b[38;5;66;03m#tf-ıdf hesaplama\u001b[39;00m\n\u001b[0;32m 150\u001b[0m tfidf_matrix, feature_names\u001b[38;5;241m=\u001b[39mcalculate_tfidf(documents_list,final_stop_words)\n",
|
606 |
+
"Cell \u001b[1;32mIn[15], line 127\u001b[0m, in \u001b[0;36miterative_update\u001b[1;34m(documents, initial_stop_words, iterations)\u001b[0m\n\u001b[0;32m 126\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21miterative_update\u001b[39m(documents, initial_stop_words, iterations\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5\u001b[39m):\n\u001b[1;32m--> 127\u001b[0m stop_words \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mset\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43minitial_stop_words\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 128\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(iterations):\n\u001b[0;32m 129\u001b[0m tfidf_matrix, feature_names \u001b[38;5;241m=\u001b[39m calculate_tfidf(documents, stop_words)\n",
|
607 |
+
"\u001b[1;31mTypeError\u001b[0m: unhashable type: 'set'"
|
608 |
+
]
|
609 |
+
}
|
610 |
+
],
|
611 |
"source": [
|
612 |
+
"\n",
|
613 |
+
"\n",
|
614 |
+
"\"\"\"class Tf:\n",
|
615 |
+
" @staticmethod\n",
|
616 |
+
" def tf(word, blob):\n",
|
617 |
+
" return blob.words.count(word) / len(blob.words)\n",
|
618 |
+
"\n",
|
619 |
+
" @staticmethod\n",
|
620 |
+
" def n_containing(word, bloblist):\n",
|
621 |
+
" return sum(1 for blob in bloblist if word in blob.words)\n",
|
622 |
+
"\n",
|
623 |
+
" @staticmethod\n",
|
624 |
+
" def idf(word, bloblist):\n",
|
625 |
+
" return math.log(len(bloblist) / (1 + Tf.n_containing(word, bloblist)))\n",
|
626 |
+
"\n",
|
627 |
+
" @staticmethod\n",
|
628 |
+
" def tfidf(word, blob, bloblist):\n",
|
629 |
+
" return Tf.tf(word, blob) * Tf.idf(word, bloblist)\n",
|
630 |
+
"\n",
|
631 |
+
" @staticmethod\n",
|
632 |
+
" def get_input_documents(limit=1000):\n",
|
633 |
+
" return Database.get_input_documents(limit)\"\"\"\n",
|
634 |
+
"\n",
|
635 |
+
"\n",
|
636 |
+
"\n",
|
637 |
+
"\n",
|
638 |
+
"\n",
|
639 |
+
" \"\"\"\n",
|
640 |
+
" Her döküman için anahtar kelimeleri seç.\n",
|
641 |
+
" :param tfidf_matrix: TF-IDF matris\n",
|
642 |
+
" :param feature_names: TF-IDF özellik isimleri\n",
|
643 |
+
" :param top_n: Her döküman için seçilecek anahtar kelime sayısı\n",
|
644 |
+
" :return: Anahtar kelimeler ve skorlari\n",
|
645 |
+
" \"\"\"\n",
|
646 |
+
" \n",
|
647 |
+
"\n",
|
648 |
+
"#--------------------------------------------------------------- burada aldığımız dökümanları listeliyoruz\n",
|
649 |
+
"# Dokümanları işleyerek TF-IDF hesaplama\n",
|
650 |
+
"#bloblist dökümanların bir listesi\n",
|
651 |
+
"\"\"\"bloblist = []\n",
|
652 |
+
"for i, blob in enumerate(bloblist):\n",
|
653 |
+
" print(\"Top words in document {}\".format(i + 1))\n",
|
654 |
+
" scores = {word: Tf.tfidf(word, blob, bloblist) for word in blob.words} #dökümanların içerisinde bulunan kelimeleri alır.\n",
|
655 |
+
" sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)\n",
|
656 |
+
" for word, score in sorted_words[:3]:\n",
|
657 |
+
" print(\"\\tWord: {}, TF-IDF: {}\".format(word, round(score, 5)))\"\"\"\n",
|
658 |
+
"\n",
|
659 |
+
"\n",
|
660 |
+
"# Dökümanları isimlendir\n",
|
661 |
+
"#named_documents = {f'döküman {i+1}': doc for i, doc in enumerate(combined_text)}\n",
|
662 |
+
"\n",
|
663 |
+
"#features olarak top_keywordsleri belirleyerek metnin bu kelimelerin etrafında olması sağlanmalı \n",
|
664 |
+
"def calculate_tfidf(documents, stop_words):\n",
|
665 |
+
" vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=10000)\n",
|
666 |
+
" tfidf_matrix = vectorizer.fit_transform(documents)\n",
|
667 |
+
" feature_names = vectorizer.get_feature_names_out()\n",
|
668 |
+
" return tfidf_matrix, feature_names\n",
|
669 |
+
"\n",
|
670 |
+
"#---------------------------------------------------------------------------------\n",
|
671 |
+
"#kelimelerin ortalama skorlarını hesaplama \n",
|
672 |
+
"def identify_low_tfidf_words(tfidf_matrix, feature_names, threshold=0.001):\n",
|
673 |
+
" # TF-IDF skorlarını toplayarak her kelimenin ortalama skorunu hesaplayın\n",
|
674 |
+
" avg_scores = np.mean(tfidf_matrix, axis=0).A1\n",
|
675 |
+
" low_tfidf_words = [feature_names[i] for i, score in enumerate(avg_scores) if score < threshold]\n",
|
676 |
+
" return low_tfidf_words\n",
|
677 |
+
"\n",
|
678 |
+
"#kelimelerin yeni geliştirilen eşik değere göre güncellenmesi \n",
|
679 |
+
"def update_stop_words(existing_stop_words, low_tfidf_words):\n",
|
680 |
+
" updated_stop_words = set(existing_stop_words) | set(low_tfidf_words)\n",
|
681 |
+
" return list(updated_stop_words)\n",
|
682 |
+
"\n",
|
683 |
+
"\n",
|
684 |
+
"#bu kısım detaylandırılmalı \n",
|
685 |
+
"def iterative_update(documents, initial_stop_words, iterations=5):\n",
|
686 |
+
" stop_words = set(initial_stop_words)\n",
|
687 |
+
" for _ in range(iterations):\n",
|
688 |
+
" tfidf_matrix, feature_names = calculate_tfidf(documents, stop_words)\n",
|
689 |
+
" low_tfidf_words = identify_low_tfidf_words(tfidf_matrix, feature_names)\n",
|
690 |
+
" stop_words = update_stop_words(stop_words, low_tfidf_words)\n",
|
691 |
+
" return list(stop_words)\n",
|
692 |
+
"\n",
|
693 |
+
"\n",
|
694 |
+
"\n",
|
695 |
+
"def main ():\n",
|
696 |
+
"\n",
|
697 |
+
" \n",
|
698 |
+
"#anlam ilişkisini de kontrol edecek bir yapı oluşpturulacak title ile benzerlik kontrol ederek yüksek benzerlik içeren kelimler sıralnacak .\n",
|
699 |
+
"\n",
|
700 |
+
"# Dökümanları liste olarak al\n",
|
701 |
+
" named_documents, _ = Tf.get_input_documents(limit=1000)\n",
|
702 |
+
" documents_list = [doc.get('text', '') if isinstance(doc, dict) else doc for doc in list(named_documents.values())]\n",
|
703 |
+
"\n",
|
704 |
+
" #başlangıç stop değerleriyle yeni olanları arasında değişim yapma \n",
|
705 |
+
" initial_stop_words = turkish_stop_words\n",
|
706 |
+
" # Stop-words listesini iteratif olarak güncelleyin\n",
|
707 |
+
" final_stop_words = iterative_update(documents_list, initial_stop_words)\n",
|
708 |
+
" #tf-ıdf hesaplama\n",
|
709 |
+
" tfidf_matrix, feature_names=calculate_tfidf(documents_list,final_stop_words)\n",
|
710 |
+
" keywords=extract_keywords(tfidf_matrix,feature_names,top_n=10)\n",
|
711 |
+
"\n",
|
712 |
+
" \n",
|
713 |
+
"\n",
|
714 |
+
" print(\"Güncellenmiş Stop-Words Listesi:\", final_stop_words)\n",
|
715 |
+
" print(\"TF-IDF Matrix Shape:\", tfidf_matrix.shape)\n",
|
716 |
+
" print(\"Feature Names Sample:\", feature_names[:10]) # İlk 10 feature adını gösterir\n",
|
717 |
+
"\n",
|
718 |
+
" return tfidf_matrix, feature_names,keywords\n",
|
719 |
+
"\n",
|
720 |
+
"if __name__==\"__main__\":\n",
|
721 |
+
" tfidf_matrix, feature_names,keywords= main()\n",
|
722 |
+
"\n",
|
723 |
+
" print(\"Anahtar Kelimler:\")\n",
|
724 |
+
" for doc, words in keywords.items():\n",
|
725 |
+
" print(f\"{doc}: {words}\")\n",
|
726 |
+
" \n",
|
727 |
+
"\n",
|
728 |
+
"#---------------------------------------------------------\n",
|
729 |
+
" \"\"\"blobs = [tb(doc) for doc in documents_list] # veya 'title' kullanarak başlıkları işleyebilirsiniz\n",
|
730 |
+
" all_words = set(word for blob in blobs for word in blob.words)\n",
|
731 |
+
"\n",
|
732 |
+
" tfidf_scores = {}\n",
|
733 |
+
" for word in all_words:\n",
|
734 |
+
" tfidf_scores[word] = [Tf.tfidf(word, blob, blobs) for blob in blobs]\n",
|
735 |
+
"\n",
|
736 |
+
" print(\"TF-IDF Skorları:\")\n",
|
737 |
+
" for word, scores in tfidf_scores.items():\n",
|
738 |
+
" print(f\"Kelime: {word}, Skorlar: {scores}\")\"\"\"\n"
|
739 |
]
|
740 |
},
|
741 |
{
|
742 |
"cell_type": "code",
|
743 |
+
"execution_count": 2,
|
744 |
"metadata": {},
|
745 |
"outputs": [
|
746 |
{
|
747 |
+
"ename": "InvalidParameterError",
|
748 |
+
"evalue": "The 'stop_words' parameter of TfidfVectorizer must be a str among {'english'}, an instance of 'list' or None. Got {'o', 'den', 'an', 'şey', 'burada', 've', 'ah', 'ise', 'hiç', 'yine', 'biz', 'bu', 'da', 'genellikle', 'yılında', 'belirli', 'se', 'ne', 'kadar', 'neden', 'hem', 'aralar', 'yani', 'daha', 'araba', 'derken', 'dolayı', 'kısaca', 'karşı', 'niye', 'ki', 'bunu', 'buna', 'de', 'herhangi', 'önce', 'tabi', 'kime', 'biten', 'ben', 'ya', 'ya da', 'çünkü', 'mu', 'b', 'demek', 'fakat', 'şimdi', 'birlikte', 'her', 'bağlı', 'nasıl', 'şu', 'sadece', 'tüm', 'aslında', 'edilir', 'ama', 'bence', 'en', 'işte', 'gibi', 'ancak', 'birkaç', 'itibaren', 'mü', 'olabilir', 'bazı', 'oluşur', 'başlayan', 'yanı', 'olasılıkla', 'iyi', 'değil', 'eğer', 'yetenekli'} instead.",
|
749 |
+
"output_type": "error",
|
750 |
+
"traceback": [
|
751 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
752 |
+
"\u001b[1;31mInvalidParameterError\u001b[0m Traceback (most recent call last)",
|
753 |
+
"Cell \u001b[1;32mIn[2], line 155\u001b[0m\n\u001b[0;32m 152\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tfidf_matrix, feature_names,documents_list \n\u001b[0;32m 154\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m--> 155\u001b[0m tfidf_matrix, feature_names,documents_list\u001b[38;5;241m=\u001b[39m \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 158\u001b[0m \u001b[38;5;66;03m# Sonuçları yazdır\u001b[39;00m\n\u001b[0;32m 159\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mİsimlendirilmiş Dökümanlar:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
754 |
+
"Cell \u001b[1;32mIn[2], line 142\u001b[0m, in \u001b[0;36mmain\u001b[1;34m()\u001b[0m\n\u001b[0;32m 140\u001b[0m initial_stop_words \u001b[38;5;241m=\u001b[39m turkish_stop_words\n\u001b[0;32m 141\u001b[0m \u001b[38;5;66;03m# Stop-words listesini iteratif olarak güncelleyin\u001b[39;00m\n\u001b[1;32m--> 142\u001b[0m final_stop_words \u001b[38;5;241m=\u001b[39m \u001b[43miterative_update\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments_list\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minitial_stop_words\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 143\u001b[0m \u001b[38;5;66;03m#tf-ıdf hesaplama\u001b[39;00m\n\u001b[0;32m 144\u001b[0m tfidf_matrix, feature_names\u001b[38;5;241m=\u001b[39mcalculate_tfidf(documents_list,final_stop_words)\n",
|
755 |
+
"Cell \u001b[1;32mIn[2], line 124\u001b[0m, in \u001b[0;36miterative_update\u001b[1;34m(documents, initial_stop_words, iterations)\u001b[0m\n\u001b[0;32m 122\u001b[0m stop_words \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m(initial_stop_words)\n\u001b[0;32m 123\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(iterations):\n\u001b[1;32m--> 124\u001b[0m tfidf_matrix, feature_names \u001b[38;5;241m=\u001b[39m \u001b[43mcalculate_tfidf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop_words\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 125\u001b[0m low_tfidf_words \u001b[38;5;241m=\u001b[39m identify_low_tfidf_words(tfidf_matrix, feature_names)\n\u001b[0;32m 126\u001b[0m stop_words \u001b[38;5;241m=\u001b[39m update_stop_words(stop_words, low_tfidf_words)\n",
|
756 |
+
"Cell \u001b[1;32mIn[2], line 103\u001b[0m, in \u001b[0;36mcalculate_tfidf\u001b[1;34m(documents, stop_words)\u001b[0m\n\u001b[0;32m 101\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcalculate_tfidf\u001b[39m(documents, stop_words):\n\u001b[0;32m 102\u001b[0m vectorizer \u001b[38;5;241m=\u001b[39m TfidfVectorizer(stop_words\u001b[38;5;241m=\u001b[39mstop_words, max_features\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10000\u001b[39m)\n\u001b[1;32m--> 103\u001b[0m tfidf_matrix \u001b[38;5;241m=\u001b[39m \u001b[43mvectorizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 104\u001b[0m feature_names \u001b[38;5;241m=\u001b[39m vectorizer\u001b[38;5;241m.\u001b[39mget_feature_names_out()\n\u001b[0;32m 105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tfidf_matrix, feature_names\n",
|
757 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:2091\u001b[0m, in \u001b[0;36mTfidfVectorizer.fit_transform\u001b[1;34m(self, raw_documents, y)\u001b[0m\n\u001b[0;32m 2084\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_params()\n\u001b[0;32m 2085\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf \u001b[38;5;241m=\u001b[39m TfidfTransformer(\n\u001b[0;32m 2086\u001b[0m norm\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnorm,\n\u001b[0;32m 2087\u001b[0m use_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muse_idf,\n\u001b[0;32m 2088\u001b[0m smooth_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msmooth_idf,\n\u001b[0;32m 2089\u001b[0m sublinear_tf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msublinear_tf,\n\u001b[0;32m 2090\u001b[0m )\n\u001b[1;32m-> 2091\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mraw_documents\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2092\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf\u001b[38;5;241m.\u001b[39mfit(X)\n\u001b[0;32m 2093\u001b[0m \u001b[38;5;66;03m# X is already a transformed view of raw_documents so\u001b[39;00m\n\u001b[0;32m 2094\u001b[0m \u001b[38;5;66;03m# we set copy to False\u001b[39;00m\n",
|
758 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\base.py:1466\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1461\u001b[0m partial_fit_and_fitted \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 1462\u001b[0m fit_method\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpartial_fit\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m _is_fitted(estimator)\n\u001b[0;32m 1463\u001b[0m )\n\u001b[0;32m 1465\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m global_skip_validation \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m partial_fit_and_fitted:\n\u001b[1;32m-> 1466\u001b[0m \u001b[43mestimator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_params\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[0;32m 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fit_method(estimator, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
|
759 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\base.py:666\u001b[0m, in \u001b[0;36mBaseEstimator._validate_params\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 658\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_validate_params\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 659\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Validate types and values of constructor parameters\u001b[39;00m\n\u001b[0;32m 660\u001b[0m \n\u001b[0;32m 661\u001b[0m \u001b[38;5;124;03m The expected type and values must be defined in the `_parameter_constraints`\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 664\u001b[0m \u001b[38;5;124;03m accepted constraints.\u001b[39;00m\n\u001b[0;32m 665\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 666\u001b[0m \u001b[43mvalidate_parameter_constraints\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 667\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_parameter_constraints\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 668\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_params\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdeep\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 669\u001b[0m \u001b[43m \u001b[49m\u001b[43mcaller_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__class__\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__name__\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 670\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
760 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\utils\\_param_validation.py:95\u001b[0m, in \u001b[0;36mvalidate_parameter_constraints\u001b[1;34m(parameter_constraints, params, caller_name)\u001b[0m\n\u001b[0;32m 89\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 90\u001b[0m constraints_str \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 91\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin([\u001b[38;5;28mstr\u001b[39m(c)\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mfor\u001b[39;00m\u001b[38;5;250m \u001b[39mc\u001b[38;5;250m \u001b[39m\u001b[38;5;129;01min\u001b[39;00m\u001b[38;5;250m \u001b[39mconstraints[:\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m or\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 92\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstraints[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 93\u001b[0m )\n\u001b[1;32m---> 95\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidParameterError(\n\u001b[0;32m 96\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mparam_name\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;124m parameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcaller_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 97\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstraints_str\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. Got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mparam_val\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;124m instead.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 98\u001b[0m )\n",
761 | +
"\u001b[1;31mInvalidParameterError\u001b[0m: The 'stop_words' parameter of TfidfVectorizer must be a str among {'english'}, an instance of 'list' or None. Got {'o', 'den', 'an', 'şey', 'burada', 've', 'ah', 'ise', 'hiç', 'yine', 'biz', 'bu', 'da', 'genellikle', 'yılında', 'belirli', 'se', 'ne', 'kadar', 'neden', 'hem', 'aralar', 'yani', 'daha', 'araba', 'derken', 'dolayı', 'kısaca', 'karşı', 'niye', 'ki', 'bunu', 'buna', 'de', 'herhangi', 'önce', 'tabi', 'kime', 'biten', 'ben', 'ya', 'ya da', 'çünkü', 'mu', 'b', 'demek', 'fakat', 'şimdi', 'birlikte', 'her', 'bağlı', 'nasıl', 'şu', 'sadece', 'tüm', 'aslında', 'edilir', 'ama', 'bence', 'en', 'işte', 'gibi', 'ancak', 'birkaç', 'itibaren', 'mü', 'olabilir', 'bazı', 'oluşur', 'başlayan', 'yanı', 'olasılıkla', 'iyi', 'değil', 'eğer', 'yetenekli'} instead."
762 | + ]
763 | }
764 | ],
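The InvalidParameterError captured in this output comes from handing TfidfVectorizer a Python set: recent scikit-learn releases only accept 'english', a list, or None for stop_words. A minimal sketch of the fix, not part of the original notebook (the abbreviated stop-word set below is illustrative only):

    from sklearn.feature_extraction.text import TfidfVectorizer

    turkish_stop_words = {'ve', 'bu', 'da', 'de'}  # abbreviated example set
    # Convert the set to a list before passing it to the vectorizer
    vectorizer = TfidfVectorizer(stop_words=list(turkish_stop_words), max_features=10000)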
765 | "source": [
776 | " def get_mongodb():\n",
777 | " return 'mongodb://localhost:27017/', 'EgitimDatabase', 'train'\n",
778 | "\n",
779 | + "#--------------------------------------------------------------------------\n",
780 | + "# combined_text should be added\n",
781 | " @staticmethod\n",
782 | " def get_input_documents(limit=3):\n",
783 | " mongo_url, db_name, collection_name = Database.get_mongodb()\n",
785 | " db = client[db_name]\n",
786 | " collection = db[collection_name]\n",
787 | " cursor = collection.find().limit(limit)\n",
788 | + " combined_text = [doc for doc in cursor]\n",
789 | + " document_count = len(combined_text)\n",
790 | " \n",
791 | " # Name the documents\n",
792 | + " named_documents = {f'döküman {i+1}': doc for i, doc in enumerate(combined_text)}\n",
793 | " \n",
794 | " return named_documents, document_count\n",
795 | "\n",
816 | " return Database.get_input_documents(limit)\n",
817 | "\n",
818 | "# Usage example\n",
819 | + "named_documents, document_count = Tf.get_input_documents(limit=1000)\n",
820 | "\n",
821 | "# selecting words from the documents with TF-IDF\n",
822 | "\n",
850 | " for word, score in sorted_words[:3]:\n",
851 | " print(\"\\tWord: {}, TF-IDF: {}\".format(word, round(score, 5)))\n",
852 | "\n",
853 | + "\n",
854 | + "# a threshold value should be determined here\n",
855 | "turkish_stop_words = [\n",
856 | " 'ah', 'ama', 'an', 'ancak', 'araba', 'aralar', 'aslında', \n",
857 | + " 'b', 'başlayan', 'bağlı', 'bazı', 'belirli', 'ben', 'bence', \n",
858 | + " 'birkaç', 'birlikte', 'bunu', 'burada', 'biten', 'biz', \n",
859 | + " 'bu', 'buna', 'çünkü', 'da', 'de', 'demek', 'den', 'derken', \n",
860 | + " 'değil', 'daha', 'dolayı', 'edilir', 'eğer', 'en', 'fakat', \n",
861 | + " 'genellikle', 'gibi', 'hem', 'her', 'herhangi', 'hiç', 'ise', \n",
862 | + " 'işte', 'itibaren', 'iyi', 'kadar', 'karşı', 'ki', 'kime', \n",
863 | + " 'kısaca', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'niye', 'o', \n",
864 | + " 'olasılıkla', 'olabilir', 'oluşur', 'önce', 'şu', 'sadece', \n",
865 | + " 'se', 'şey', 'şimdi', 'tabi', 'tüm', 've', 'ya', 'ya da', \n",
866 | + " 'yanı', 'yani', 'yılında', 'yetenekli', 'yine'\n",
867 | "]\n",
868 | "\n",
869 | + "# by selecting the top keywords as features, the text should be centred on these words\n",
870 | + "def calculate_tfidf(combined_text, stop_words):\n",
871 | " vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=10000)\n",
872 | + " tfidf_matrix = vectorizer.fit_transform(combined_text)\n",
873 | " feature_names = vectorizer.get_feature_names_out()\n",
874 | " return tfidf_matrix, feature_names\n",
875 | "\n",
876 | + "#---------------------------------------------------------------------------------\n",
877 | "# compute the average score of each word\n",
878 | "def identify_low_tfidf_words(tfidf_matrix, feature_names, threshold=0.001):\n",
879 | " # compute each word's average score by summing its TF-IDF scores\n",
881 | " low_tfidf_words = [feature_names[i] for i, score in enumerate(avg_scores) if score < threshold]\n",
882 | " return low_tfidf_words\n",
883 | "\n",
884 | + "# update the stop words according to the newly derived threshold\n",
885 | "def update_stop_words(existing_stop_words, low_tfidf_words):\n",
886 | " updated_stop_words = set(existing_stop_words) | set(low_tfidf_words)\n",
887 | " return list(updated_stop_words)\n",
888 | "\n",
889 | "\n",
890 | + "# this part should be elaborated\n",
891 | + "def iterative_update(combined_text, initial_stop_words, iterations=5):\n",
892 | " stop_words = set(initial_stop_words)\n",
893 | " for _ in range(iterations):\n",
894 | + " tfidf_matrix, feature_names = calculate_tfidf(combined_text, stop_words)\n",
895 | " low_tfidf_words = identify_low_tfidf_words(tfidf_matrix, feature_names)\n",
896 | " stop_words = update_stop_words(stop_words, low_tfidf_words)\n",
897 | " return list(stop_words)\n",
898 | + "\n",
899 | "\n",
900 | "\n",
901 | "def main ():\n",
902 | "\n",
903 | + " \n",
904 | "# a structure that also checks semantic relations will be built; words with high similarity to the title will be ranked by comparing them against the title.\n",
905 | "\n",
906 | "# Get the documents as a list\n",
907 | " documents_list = [doc.get('text', '') if isinstance(doc, dict) else doc for doc in list(named_documents.values())]\n",
908 | "\n",
909 | + " # switch between the initial stop words and the newly derived ones\n",
910 | + " initial_stop_words = turkish_stop_words\n",
911 | + " # iteratively update the stop-words list\n",
912 | + " final_stop_words = iterative_update(documents_list, initial_stop_words)\n",
913 | " # TF-IDF computation\n",
914 | + " tfidf_matrix, feature_names=calculate_tfidf(documents_list,final_stop_words)\n",
915 | "\n",
916 | + " \n",
917 | "\n",
918 | + " print(\"Güncellenmiş Stop-Words Listesi:\", final_stop_words)\n",
919 | + " print(\"TF-IDF Matrix Shape:\", tfidf_matrix.shape)\n",
920 | + " print(\"Feature Names Sample:\", feature_names[:10]) # shows the first 10 feature names\n",
921 | "\n",
922 | + " return tfidf_matrix, feature_names,documents_list \n",
923 | "\n",
924 | + "if __name__==\"__main__\":\n",
925 | + " tfidf_matrix, feature_names,documents_list= main()\n",
926 | "\n",
927 | "\n",
928 | "# Print the results\n",
929 | + "print(\"İsimlendirilmiş Dökümanlar:\")\n",
930 | + "for name, doc in named_documents.items():\n",
931 | + " print(f\"{name}: {doc}\")\n",
932 | "\n",
933 | " print(\"\\nDökümanlar Listesi:\")\n",
934 | " print(documents_list)\n",
1010 | },
1011 | {
1012 | "cell_type": "code",
1013 | + "execution_count": 1,
1014 | "metadata": {},
1015 | + "outputs": [
1016 | + {
1017 | + "ename": "NameError",
1018 | + "evalue": "name 'TfidfVectorizer' is not defined",
1019 | + "output_type": "error",
1020 | + "traceback": [
1021 | +
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
1022 | +
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
1023 | +
"Cell \u001b[1;32mIn[1], line 41\u001b[0m\n\u001b[0;32m 31\u001b[0m turkish_stop_words \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m([\n\u001b[0;32m 32\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124ma\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabide\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabi\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabla\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mad\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124madım\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mah\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mama\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124man\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mancak\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maraba\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maralar\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maslında\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[0;32m 33\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maşşağı\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maz\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbazı\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbelirli\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mben\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbence\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbunu\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mburada\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbiz\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbu\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuna\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mçünkü\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mönce\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mşu\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124msadece\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124msana\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mse\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mşey\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mşimdi\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtabi\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtüm\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mve\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mya\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mya da\u001b[39m\u001b[38;5;124m'\u001b[39m, 
\u001b[38;5;124m'\u001b[39m\u001b[38;5;124myani\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124myine\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m 38\u001b[0m ])\n\u001b[0;32m 40\u001b[0m \u001b[38;5;66;03m# TF-IDF hesaplayıcı oluşturun ve Türkçe durak kelimelerini dahil edin\u001b[39;00m\n\u001b[1;32m---> 41\u001b[0m vectorizer \u001b[38;5;241m=\u001b[39m \u001b[43mTfidfVectorizer\u001b[49m(stop_words\u001b[38;5;241m=\u001b[39mturkish_stop_words)\n\u001b[0;32m 44\u001b[0m \u001b[38;5;124;03m\"\"\"IDF, derlemedeki belge sayısının,\u001b[39;00m\n\u001b[0;32m 45\u001b[0m \u001b[38;5;124;03mincelenen anahtar kelimeyi içeren topluluktaki belge sayısına \u001b[39;00m\n\u001b[0;32m 46\u001b[0m \u001b[38;5;124;03mbölünmesiyle elde edilen algoritmadır. \u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 49\u001b[0m \u001b[38;5;124;03mkülliyat yani incelenen tüm belgelerin adedi 10 ise ve test edilen anahtar kelime,\u001b[39;00m\n\u001b[0;32m 50\u001b[0m \u001b[38;5;124;03mkülliyattaki üç belgede görünüyorsa, bu durumda IDF değeri 0.52’dir (log (10/3)).\"\"\"\u001b[39;00m\n\u001b[0;32m 51\u001b[0m \u001b[38;5;66;03m#TF-IDF puanı; Naive Bayes ve Destek Vektör Makineleri gibi algoritmalara aktarılabilir. Böylece kelime sayısı gibi daha temel yöntemlerin sonuçları büyük ölçüde iyileştirilebilir.\u001b[39;00m\n\u001b[0;32m 52\u001b[0m \u001b[38;5;66;03m#IDF = log ( Dokuman Sayısı / Terimin Geçtiği Dokuman Sayısı )\u001b[39;00m\n\u001b[0;32m 53\u001b[0m \u001b[38;5;66;03m#dokuman sayısılarını almakla başlayacağız.\u001b[39;00m\n\u001b[0;32m 54\u001b[0m \u001b[38;5;66;03m# : titlelerın sayısı / terimler ise \u001b[39;00m\n",
1024 | +
"\u001b[1;31mNameError\u001b[0m: name 'TfidfVectorizer' is not defined"
1025 | + ]
1026 | + }
1027 | + ],
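This NameError simply means the cell uses TfidfVectorizer without importing it in that session, and the traceback also shows the stop words being passed as a set again. A two-line fix for the top of that cell (a hedged sketch, not part of the diff):

    from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer = TfidfVectorizer(stop_words=list(turkish_stop_words))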
1028 | "source": [
1029 | "\n",
1030 |
"#---------------------------------------------------------------------------------------------------------------------------------\n",