Spaces:
Runtime error
Runtime error
first create
Browse files- LICENSE +21 -0
- README.md +36 -13
- data_creation/bible_data.csv +0 -0
- data_creation/bible_talmud_data.csv +0 -0
- data_creation/collect_talmud_data.py +69 -0
- data_creation/craet_model_bible_talmud.py +49 -0
- data_creation/creat_csv.py +168 -0
- data_creation/creat_model_bible.py +70 -0
- data_creation/divided_sentences.txt +107 -0
- data_creation/graf_model.py +62 -0
- data_creation/processed_text.csv +71 -0
- data_creation/text_identification_model.pkl +3 -0
- data_creation/text_identification_vectorizer.pkl +3 -0
- data_creation/try_model.py +49 -0
- is_this_bible_model.pkl +3 -0
- is_this_bible_vectorizer.pkl +3 -0
- templates/index.html +93 -0
- try_model.py +74 -0
- try_model_webui.py +35 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 NHLOCAL
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,13 +1,36 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# is this a bible?
|
2 |
+
An AI model that detects whether a given verse is from the Bible or not.
|
3 |
+
|
4 |
+
The model achieves a very high recognition rate for Hebrew text.
|
5 |
+
The complete dataset on which the model was trained is stored in the `bible_data.csv` file.
|
6 |
+
|
7 |
+
You can easily try the model by downloading the release file from here: https://github.com/NHLOCAL/is-this-bible/releases/download/v1.0/is-this-bible.zip
|
8 |
+
|
9 |
+
**To run the model, install the following libraries using pip**:
|
10 |
+
|
11 |
+
`nltk`, `joblib`.
|
12 |
+
|
13 |
+
-----
|
14 |
+
|
15 |
+
**ืืืืื:**
|
16 |
+
|
17 |
+
ืงืื ืฉืืืื:
|
18 |
+
```shell
|
19 |
+
try_model.py "ืืืืืืื ื ืืชื ืืืขืืืช ืืขืจืืืช ืงืื ืคืชืื"
|
20 |
+
```
|
21 |
+
ืคืื:
|
22 |
+
|
23 |
+
|
24 |
+
```shell
|
25 |
+
Text: ืืืืืืื ื ืืชื ืืืขืืืช ืืขืจืืืช ืงืื ืคืชืื | Prediction: Other | Confidence Score: 0.0340
|
26 |
+
```
|
27 |
+
ืงืื ืืืืื:
|
28 |
+
|
29 |
+
```shell
|
30 |
+
try_model.py "ืขื ืื ืกืขืจื ืื ื ืืื ืื ื ืื ืื ืืจืืืฅ ืืคืื ืืื ืื"
|
31 |
+
```
|
32 |
+
ืคืื:
|
33 |
+
|
34 |
+
```shell
|
35 |
+
Text: ืขื ืื ืกืขืจื ืื ื ืืื ืื ื ืื ืื ืืจืืืฅ ืืคืื ืืื ืื ืืืกืืชืื ืืกืคืืจืื | Prediction: Bible | Confidence Score: 1.0000
|
36 |
+
```
|
data_creation/bible_data.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data_creation/bible_talmud_data.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data_creation/collect_talmud_data.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
import re
|
3 |
+
|
4 |
+
def divide_into_sentences(text, words_per_sentence=12):
    """Split *text* into fixed-size chunks of whitespace-separated words.

    Header markers are stripped first via ``remove_headers``. The remaining
    text is split on whitespace and regrouped into "sentences" of
    ``words_per_sentence`` words each (the last chunk may be shorter).
    No punctuation-based sentence detection is performed.

    Args:
        text: The raw passage.
        words_per_sentence: Number of words per chunk; defaults to 12.

    Returns:
        A list of strings, each holding the words of one chunk joined by
        single spaces. Empty input yields an empty list.

    Raises:
        ValueError: If ``words_per_sentence`` is smaller than 1.
    """
    if words_per_sentence < 1:
        raise ValueError("words_per_sentence must be at least 1")

    words = remove_headers(text).split()
    # str.split() never yields empty tokens, so every chunk is non-empty and
    # already free of leading/trailing whitespace; no extra filtering needed.
    return [
        ' '.join(words[start:start + words_per_sentence])
        for start in range(0, len(words), words_per_sentence)
    ]
|
15 |
+
|
16 |
+
def remove_headers(text):
    """Delete every header span matched by the two marker patterns below.

    Each pattern lazily matches from an opening marker, through any characters
    on the same line, up to a closing marker; every match is replaced with the
    empty string. The patterns are applied in order, the second one operating
    on the output of the first.

    NOTE(review): the marker literals appear mis-encoded in this copy of the
    file; presumably they are Hebrew page-header words. Verify against the
    original UTF-8 source.
    """
    header_patterns = (
        r"(ืืฃ )(.*?)( ืืืจื )",
        r"(ืืฃ )(.*?)( ืืฉื ื )",
    )
    for header_pattern in header_patterns:
        text = re.sub(header_pattern, "", text)
    return text
|
22 |
+
|
23 |
+
|
24 |
+
def write_sentences_to_file(sentences, output_filename, label=2):
    """Append each sentence to *output_filename* as a ``sentence,label`` line.

    The file is opened in append mode with UTF-8 encoding, so repeated calls
    accumulate records. Every record is terminated with a newline so that a
    later append starts on its own line instead of being fused onto the last
    record of the previous call.

    Args:
        sentences: Iterable of sentence strings.
        output_filename: Path of the file to append to.
        label: Value written after the comma on every line; defaults to 2.
    """
    labelled_lines = [f"{sentence},{label}\n" for sentence in sentences]
    with open(output_filename, 'a', encoding='utf-8') as f:
        f.writelines(labelled_lines)
|
28 |
+
|
29 |
+
# Example passage
|
30 |
+
example_passage = """
|
31 |
+
ืืฃ ืื,ื ืืืจื ืืข"ืค ืฉืื ืืคืฉืจ ืืืื ืื ืื ืืจ' ืืืืจ ืืข"ืค ืฉืื ืืคืฉืจ ืืจืื ื ืืืชื ื ืื ืืขืืืื ืจ"ื ืืืืจ ืื ืืืืฆืช ืืื ืืชืืืืช ืืื"ื ืื ืืืืฆืช ืื ืืชืืืืช ืืื ื ืืืขื ื ืืฉืื ืืื ืืคืฉืจ ืืื ืื ืื ืชื ื ืืข"ืค ืฉืื ืืคืฉืจ ืืื ืืืื ื ืจืื ื ืฉืื ืชืืชืื ืืชื ืืจืืฉื ืืืืขืื ืขืืืื ืืชื ืืจืืฉื ืืจืื ืืืืจ ืืืขืืื ืืืืืฉ ืืืืขืืื ืืจืื ื ืืืขืืืืื ืืื ืืืืฉื ืืืืขืืื ืืื ื ืืืื ืืกืชืื ืืื ืืืื ืืืืงื ืืื ืืฉืืื ืืืืจ ืืืื ืืื ืจืื ื ืืจ"ื ืืขืืืื ืงืืื ืงื"ื ืืื ืืคืฉืจ ืืืื ืืชื ืืื ืชืจ ืืื ืื ืชืจ ืืฉืืื ืืจ"ื ืืืื ื ืืืชืื (ืืืืงืื ืื) ืฉืืื ื ืืื ื ืืฉืขืจื ืฆืื ืืื ืืจืื ื ืืืคืื ืืืขื ืืื ื"ืง ืืืื ืฉืฉืืื ื ืืื ื ืืืืืข ืฉืฉืขืจื ืฆืื ืืฉืืื ืืจ"ื ืืืื ื ืืืชืื (ืืืืงืื ืื) ืืขืฉืืช ืืืฆืจืื ืืืื ืืืขื ืฉืื ื ืขืืจืื ืืื ืืจืื ื ืืืคืื ืืืขื ืืื ื"ืง ืืืื ืฉืืื ืืืื ืืืืืข ืฉืืื ื ืขืืจืื ืืืืืขืืช ืืืื ืืื ืฉืื ืืืื ืืืื ืืชืื ืื"ืง ืืงื"ื ืืืฉืจืื
|
32 |
+
|
33 |
+
ืืฃ ืื,ื ืืืจื ืืืืจืคื ืืืื ืื ืืืจืช ืื ืืืฉืชืื ืืืื ื ืื ืื ืืืจืช ืื ืืืืื ืขืืื ืืืื ืืชืืชืื ืกืืืื ื ืื ืื ืืืจ ืจื ืืืืื ืืืจ ืจื ืืื ืชื ื ืืื ืจ' ืืฉืืขืื ืืืจ ืงืจื (ืืืืืจ ื) ืืืฉ ืื ืืฉื ืื ืืขืฉื ืืื ืืืืืช ืืืื ืืฉืื ืืืชืื ืืฉื ืืืืฉ ืืื ืขืื ืฉืื ืฉืืชืืจื ืื ืืืฉ ืืกืืื ืืื ืืฃ ืืฉื ืืกืืื ืืื ืืืืื ืื ืืื ืื ืืื ืืืืฉ ืื ืืืฉ ืชืืชืื ืืื ืขืืืื ืืฃ ืืฉื ืชืืชืื ืืื ืขืืืื ืชื ืื ื ืื ืืื ื"ืจ ืืืืขืืจ ืืจ' ืฆืืืง ืื ืืื ืืคืจืฉืื ืืืื ื ืืืืจื ืืืื ืฉืื ืชืืชืื ืฉืื ืืื ืืฉืืืืื ืขื ืขืืืื ืชื ืื ืจืฉื"ื ืืืืจ ืื ืืช ืืจืืื ืชืืชืื ืืืืจ ืืื ืืคื ื ืฉืจืืืืืช ืืืจืืฆืืืช ืื ืืช ืืคืจืื ืขืืืื ืืืืจ ืืื ืืคื ื ืฉืืืื ืืช ืืจืืื ืจ"ืฉ ืื ืืืขืืจ ืืืืจ ืื ืืช ืขืฉืืจืื ืฆื ืืืื ืืืืจ ืืื ืฉื ืืฉืืฃ ืืืคืงืจืืกืืชื ืื ืืช ืขื ืืื ืฆื ืฉืืื ืืืืจ ืืื ืืคื ื ืฉืฉืืืืืช ืืื ืืื ืขืืืื ืืืืืขืืช ืืืื ืืคื ื ืฉื ืืฉืืื ืืืืื ืขื ืืกืกืืื ืช"ืจ ืฆื ืฉืืื ืงืืื ืืฆื ืืืื ืจืื ืื ืื ื ืื ืืื ืจ' ืืืืฉืข ืืืืจ ืืขืืื ืื ืงืื ืฆื ืฉืืื ืืฆื ืืืื ืืืฅ ืืืืช ืฉืืืชื ืืฉืืื ืชื ืฉืงืื ืฆื ืฉืืื ืืฆื ืืืื ืืืืจ ืืืืชื ื ืช"ืจ ืื ืื ืืืงืืช ื ืืืงืืช ืขื ืคื ื ืฉืื ืืื ืืื ืจืื ืืืืขืืจ ืืืกืจ ืืืฉืชื ืืจืื ืืฉืืขืื ืืืกืจ ืืืื ืจืื ืืืืื ืืืืจ ืืคื ื ืืคืจืง ืืืืืจ ืืคืจืง ื ืฉืื ืืืืงืืช ืืืชื ืชืื ืืคืจืง ืืื ื ืฉืื ืืืืงืืช ืืืชื ืฉืืื ืืฉืืืื ืกืคืงืืช ืขื ืคื ื ืฉืื ืจ"ืฉ ืืืืจ ืืฃ ืชืื ืืคืจืง ื ืฉืื ืืืืงืืช ืืืชื ืื ืืื ืช ืืฉื ืืืืืืจ ืืื ืื ืืืงื ืืืฆื ืืืืื ืืื ืฉืื ืชืืื ืงืื ื ืืื ืฉืื ืชืืืืฅ ืืื ืืื ื ืืื ืช ืืืืจ ืงืื ื ืืื ืฉืชืืื ืืืืืื ืืื ืฉืชืืืืฅ ืืืจ ืืจ ืจืื ืืืืื ืืืืจ ืืคื ื ืืคืจืง ืืืืืจ ืืคืจืง ื ืฉืื ืืืืงืืช ืืืชื ืืฉืืื ืืคื ื ืืคืจืง ืืขื ืืืืงื ืืื ืืฉืชืืื ืืืืจ ืืคืจืง ืฉืืื ื ืื ืื ืืื ืืืืจ ืืคืจืง ืืื ืื ืืืืงื ืืืืืจ ืจืื ืงืื ื ืฉืืืืขื ืืืื ืฉื ืืชืื ืืื ื ืฆืจืืื ืืืืงื ืืืงื ืืืืื ืกืืื ืื ืื ืืืจ ืจืื ืืืงื ืืืืืื ืืื ืืืืืฆื ืืขืื ืืืืงื ืชืื ืืคืจืง ืืื ื ืฉืื ืืืืงืืช ืืืชื ืงืกืืจ ืชืื ืืคืจืง ืืืืืจ ื๏ฟฝ๏ฟฝืจืง <ืืื> ืืืืืจ ืืคืจืง ืืืืื ืืืงื ืืจืื ืกืืืื ื ืื ืฉืื ืืืืงื ืชืื ืืคืจืง ืืืืื ืืืงื ืืจืื ืื ืกืืืื 
ื ืื ืฉืื ืืื ืืืงื ื ืฉืื ืจ"ืฉ ืืืืจ ืืฃ ืชืื ืืคืจืง ื ืฉืื ืืืืงืืช ืืืชื ืงืกืืจ ืชืื ืืคืจืง ืืืคื ื ืืคืจืง ืืืขืื ืืืืงื ืืื ืืฉืชืืื ืืืืจ ืืคืจืง ืฉืืื ื ืื ืื ืื ืืื ืช ืืฉื ืืืืืืจ ืืื ืื ืืืงื ืืื ืืื ืงืชื ื ืื ืืืืขืืช ืืืื ืจืื ืืืืื ืืืชืื ืืคืจืง
|
34 |
+
|
35 |
+
ืืฃ ืื,ื ืืืจื ืืืืืขืืช ืืืื ืจืื ืฉืืขืื ืืืืืจ ืืคืจืง ืืืืช ืืื ืืืงื ืืจืื: ืืคื ื ืฉืืืจื ืืคืฉืจ ืื': ืื ืชื ืืื ืื ืื ืชื ื ืืื ืจืืฉื ืืื ืชืืื ืืฉืื ืืงื ืืขื ืืืกืชืื ืืจืื ื ืคืฉืืื ืืืื ืืจืืื ืืืื ืืจืืื ืืื ืืชืืื ืืกืชืืจื ืืขืื ืืจ"ื ืืงื ืืกืืืข ืืื ืงืจืื ืงื"ื ืืืืืขืืช ืืืื ืืฉืื ืืงื ืืขื ืืืชื ื ืืืืฆื ืื:
|
36 |
+
|
37 |
+
ืืฃ ืื,ื ืืฉื ื ืืืืฆื ืื ืื ืืื ืืจืก ืฉืืื ืืื ืืก ืืืฆืื ืืืฉ ืฉืืืฆืื ืืืื ื ืืื ืืก ืื ืืืจ ืฉืืฉ ืื ืฆืคืืจื ืืฉ ืื ืขืฆื ืืืฉ ืฉืืฉ ืื ืขืฆื ืืืื ืื ืฆืคืืจื ืื ืืืืื ืืืจืก ืืืื ืืื ืืช ืืืฉ ืฉืืืื ืืื ืืช ืืืื ื ืืืื ืืืจืก:
|
38 |
+
|
39 |
+
ืืฃ ืื,ื ืืืจื ืืื ืืก ืคืกืื ืืื ืืืืช ืืคืกืื ืืฉืื ืืกืืจื ืืืฆืื ืืฉืจ ืืื ืืืืช ืืคืกืื ืืฉืื ืืกืืจื ืืืจ ืจื ืืกื ืฉืื ืื ืืื ืืจืก ืฉืืขืืจื ืืืื ืก ืืฉืงื ืืื ืืืจื ืืืฆืื ืืฉืงื ืืื ืืขื ืื ืืกืืจื ืืืื ืืื ืืขืื ืืืจ ืืจ ืืืืจื ืืจืื ืืจื ื ืืื ืืคื ืฉืืื ืืืืจืื ืืื ืืกืืจื ืืืกืืจื ืชื ื ืจืื ื ืืืฆื ืืืืงืื ืืื ืืจืก ืืืืข ืื ื ืืงื ืืืื ืก ืืฉืงื ืื ืืื ืืืื ืขืจืืื ืืืื ืืื ืื ืืชื ืงืืจื ืืชืืื ืื ืื ืกื ืืืืืข ืฉืืื ืก ืืฉืงื ืืื ืืื ืืืืืข ืฉืืืฆืื ืืฉืงื
|
40 |
+
|
41 |
+
ืืฃ ืื,ื ืืืจื ืจืื ืืืืื ืืืืจ ืืืคืฃ ืืื ื ืงืืจื ืืชืืื ืืืฆืืฃ ืขืืื ืืื ืืื ืืื ืก ืืืืืข ืฉืืื ืก ืืฉืงื ืืื ืืื ืืืืืข ืฉืืืฆืื ืืฉืงื ืื ืฉืืคืชื ืขื ืืื ืืืืจ ืื ืืืืจ ืืขืืืื ืืืืืข ืฉืืืฆืื ืืฉืงื ืืื ืืื ืืืืืข ืฉืืื ืืก ืืฉืงื ืจ' ืืืกื ืืืืจ ืืฃ ืื ืฉืืคืชื ืขื ืืื ืืืืจ ืืคื ื ืฉืืืืจ ืืขืืืื ืืื ืฉืืคืชื ืขื ืืื ืืจืืฅ ืื ืจืืฅ ืืขืืืื ืืืืืข ืฉืืืฆืื ืืฉืงื ืืื ืืื ืืืืืข ืฉืืื ืก ืืฉืงื ืืื ืืืจื ืืืคื ืืืจ ืืืคื ืืืืืข ืฉืืื ืก ืืฉืงื ืืื ืืืื ืืื ืช"ืง ืืจ' ืืืืื ืืืจ ืขืืื ืืื ืืก ืขื ืืื ืืืืง ืืืื ืืื ืืืื: ืื ืืืจ ืฉืืฉ ืื ืฆืคืืจื ืืื': ืืฉ ืื ืฆืคืืจื ืืืื ืืืืข ืืืืฉื ืืืืื ืืฉ ืื ืขืฆื ืืืื ืื ืฆืคืืจื ืืืื ืืืืข ืืืืฉื ืืืื ื ืืืื ืืืื ืืืจ ืจื ืืกืื ืืืจ ืื ืจืืื ื ืืืืื ืืืจื ืืืงืื ืืืื ืืขืืจื ืืฆืืข ืืชืจื ืฉืืฉ ืื ืขืฆื ืืืื ืื ืฆืคืืจื ืืืื ืืืืข ืืืืฉื ืืืื ื ืืืื ืืืื ืืืจ ืจืื ืืจ ืืจ ืื ื ื"ืจ ืืืื ื ืืืฉืืื ื ื ืกืคืจืช ืขื ืื ืืื: ืื ืืืืื ืืืจืก ืืื': ืื ืืืื ืืืืจืก ืืืื ืืื ืืช ืืืฉ ืฉืืืื ืืื ืืช ืืืื ืืืื ืืืจืก ืืืชืืื ืืื ืืืชืืื ืกืื ืืชืจืงื ืืชื ืื (ืืืงืจื ืื) ืืืืืฉื ืขื ืืืื ืืืื ืืคื ืกืื ืืืฉื ืขืืื ืื ืชืจืงื ืืืฉื ืขืืื ืืื ืืื ืช"ื (ืืืงืจื ืื) ืืฉืจ ืืฉื ืขืืื ืืื ืื ืฉืืืืื ืืืฉืืื ืืฆื ืื ืฉืืืืจืื ืื ืขืืื ืื ืขืฉื ืืืืืชื ื:
|
42 |
+
|
43 |
+
ืืฃ ืื,ื ืืฉื ื ืื ืืจืืื ืืืื ืืื ื ื ืคืฉืืช ืจืืื ืืืื ืืื ื ืืืื ืืช ืืืฉ ืฉืจืืื ืืืื ืืื ื ืืืื ืืช ืืืื ื ืจืืื ืืืื ืืื ื ื ืคืฉืืช:
|
44 |
+
|
45 |
+
ืืฃ ืื,ื ืืืจื ืืืจ ืจื ืืืืื ืืืชืืื ืืืืจ ืชื ืื ื ืืื ืืืื ื ืืื ืืฉืจืื ืืืื ืืื ื ืืืื ืืช ืืืื ืืื ืืฉืจืื ืืืื ืืื ื ื ืคืฉืืช ืืืืื ื ืื ืืืชืืื ืืื ืืืืจ ืจื ืืืืื ืืืชืืื ืืืืจ ืืื ืืืชืืื ืืจ ืืืื ืืืชืืื ืืืืจ ืืฆืจืืื ืืื ืืฉืืขืื ื ืืจ ืืฉืื ืืจืืื ืืื ืืงืื ืืื ืืืืจ ืืืื ืจืืื ืืื ืืงืื ืืืื ืื ืืื ืืฉืืขืื ื ืืืืจ ืืฉืื ืืงืืชื ืืืคื ืืฉืจื ืืื ืืจ ืืงืืชื ืืืคื ืคืกืืื ืืืื ืื ืฆืจืืื:
|
46 |
+
|
47 |
+
ืืฃ ืื,ื ืืฉื ื ืื ืืืฉืจ ืืืื ืืฉืจ ืืืขืื ืืืฉ ืฉืืฉืจ ืืืขืื ืืืื ื ืืฉืจ ืืืื:
|
48 |
+
|
49 |
+
ืืฃ ืื,ื ืืืจื ืืืชืืื ืืื ื"ืจ ืืืื ื ืืืชืืื ืกืืื ืืืืช ืืขืื ืื ืืื ื
|
50 |
+
|
51 |
+
ืืฃ ื ,ื ืืืจื ืจืื ืืืืจ ืืื ืืชื ืื ืืื ืจืื ืืืืจ ืืืืจ ืื ืช"ื (ืืืจืื ืื) ืขื ืคืืื ืืืื ืื ืจืื ืืื ื ืืข ืืื ืื ืขื ืื ืจืืื๏ฟฝ๏ฟฝ ืืฆื ื ืืขืื ืืงืืฉ ืจืืืื ืื ืืขืื ืื ื ืืขืื ืืืื ืืืชืื (ืืืงืจื ืื) ืืืืื ืืจืืืช ืื ืืฃ ืจืืืื ืืืื ืืื ื ืืขืื ืฉืื ืืกืืื ืืืชืื (ืืืงืจื ืื) ืืื ืืจืื ืขืื ื ืืืื ืืฃ ืจืืืื ืฉืื ืืกืืื ืืืงืืฉ ื ืืขืื ืืจืืืื ืื ืจืืืื ืฉืื ืืงืจืืืื ืืฃ ื ืืขืื ืฉืื ืืงืจืืืื ืื ืื ืจืืืื ืืฉืืฉื ืืฃ ื ืืขืื ืืฉืืฉื ืืืื ืืื ืืืื ื ืืฉืืฉื ืืืคื ืื ื"ืฉ ืช"ื (ืืืงืจื ืื) ืืืืื ืื ืืืจื ืืืื ืื ืื ืืื ืืื ืื ืืืื ืื ืื ืืืืช ืฉืืคืืื ืืื ืืื ืจืืื ืืช ืื ืืขืื ืืืื ืกืืื ืืืื ืืฉืืืืชืื ืืจืื ืืืื ื ืืืื ืงืืืื ืืื ื ืืื ืงืืืจ ืืื ืืื ืืืื ืืืื ืขืืื ืืื ืืืืืจ ืจืื ืืืื ื ืืืื ืืกืชื ืืฉื ื ืืชื ื ืื ืืืฉืจ ืืืื ืืฉืจ ืืืขืื ืืืฉ ืืฉืจ ืืืขืื ืืืื ืืฉืจ ืืืื ืืืืจืื ื ืืืชืืื ืืื ืืืืจ ืจืื ืืืื ื ืืืชืืื ืกืืื ืืืืช ืืขืื ืื ืจืื ืืืื ื ืกืชืื ืืืจืื ื ืืฉืื ืืชื ื ืืื ื ืืืื ืืช ืื ืื ืืืื ืืืืืจืื ืืืืื ืืืื ืืืืืื ืืืื ืกืชืื ืืืื ืกืชืื ืืืืขืืช ืืืื ืกืชืื ืืจืืื ืขืืืฃ ืืืืืขืืช ืืืื ืืฉืื ืืงืชื ื ืื ืืื ืืืืชื ืืืื ื:
|
52 |
+
|
53 |
+
ืืฃ ื ,ื ืืฉื ื ืื ืฉืืืื ืืืขืฉืจืืช ืืืื ืืืืืช ืืืืืื ืืืฉ ืฉืืืื ืืืืืช ืืืืืื ืืืื ื ืืืื ืืืขืฉืจืืช:
|
54 |
+
|
55 |
+
ืืฃ ื ,ื ืืืจื ืืืชืืื ืืื ืืืชืืื ืืฉืจ ืืืืื ืืืืฆืื:
|
56 |
+
|
57 |
+
ืืฃ ื ,ื ืืฉื ื ืื ืฉืืืื ืืคืื ืืืื ืืืขืฉืจืืช ืืืฉ ืฉืืืื ืืืขืฉืจืืช ืืืื ื ืืืื ืืคืื:
|
58 |
+
|
59 |
+
ืืฃ ื ,ื ืืืจื ืืืชืืื ืืื ืืืชืืื ืชืื ื ืืืจืง ืฉืืื ื ืืืื ืืคืื ืืชื ื ืืื ืืืจื ืืคืื ืื ืฉืืื ืืืื ืื ืฉืืจ ืืืืืืื ืื ืืืจืฅ ืืืงืืืชื ืืืื ืืืื ืืกื ืืงืืื ืืืื ืืคืื ืืืื ืืืขืืื ืกืคืืื ืกืืื ืืงืืฆื ืื ืฉืืจ ืืืขืืื ืืคืงืจ ืืืืืืื ืื ืืืจืฅ ืืืขืืื ืืืืื ืืคืืจืืืช ืืืงืืืชื ืืืื ืืืขืืื ืชืื ื ืืืื ืืกื ืืงืืื ืืืขืืื ืืจืง ืืืืื ืืื ืืขืฉืจ ืชื ื ืื ืฉืืื ืืืื ืื ืฉืืจ ืืืืืืื ืื ืืืจืฅ ืืืื ืืืขืฉืจืืช ืืืืื ืืงืืืชื ืืืื ืืืื ืืกื ืืงืืื ืื ืงืชื ื ืื ืืื ืืื ืฉืืืื ืืืฆืืื ืืืืืื ืืชื ื ืืืื ืืช ืืฆืืื ืฉืืื ืืืจืง ืจ' ืืืกื ืืืืจ ืคืื ืืื ืืืช ืืืืช ืืื"ื ืืืืช ืขื ืืื ืืืจ ืจืื ืืจ ืืจ ืื ื ื"ืจ ืืืื ื ืขืืืฉืื ืฉืืจืขื ืืชืืืื ืืืืื ืื ืืื ืขืืืื ืืืื
|
60 |
+
"""
|
61 |
+
|
62 |
+
# Strip the page headers and split the passage into fixed-size word chunks
|
63 |
+
divided_sentences = divide_into_sentences(example_passage)
|
64 |
+
|
65 |
+
# Append the labelled chunks to the output text file (it is opened in append mode, so reruns add to it)
|
66 |
+
output_filename = "divided_sentences.txt"
|
67 |
+
write_sentences_to_file(divided_sentences, output_filename)
|
68 |
+
|
69 |
+
print(f"Divided sentences written to '{output_filename}'")
|
data_creation/craet_model_bible_talmud.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import re
|
3 |
+
import nltk
|
4 |
+
from nltk.tokenize import word_tokenize
|
5 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
6 |
+
from sklearn.svm import SVC
|
7 |
+
from sklearn.model_selection import train_test_split
|
8 |
+
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
|
9 |
+
import joblib
|
10 |
+
|
11 |
+
# Load the dataset (assuming it is in UTF-8 encoding)
data = pd.read_csv('bible_talmud_data.csv', encoding='utf-8')

# Separate features (text) and labels (0, 1, or 2)
X = data['text']
y = data['label']

# Split the raw texts BEFORE vectorizing. Fitting TF-IDF on the full dataset
# would let the held-out rows shape the vocabulary and IDF weights, leaking
# test information into training and inflating the reported metrics.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=15
)

# TF-IDF vectorizer that tokenizes with NLTK's word_tokenize.
# NOTE(review): word_tokenize is used with its default language setting, so
# this is not a Hebrew-specific tokenizer; confirm that is intended.
vectorizer = TfidfVectorizer(tokenizer=word_tokenize, lowercase=True)

# Learn vocabulary and IDF weights from the training texts only, then apply
# the fitted vectorizer unchanged to the test texts.
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Linear-kernel SVM; probability=True is requested so the fitted model can
# report class probabilities.
classifier = SVC(kernel='linear', C=2.0, probability=True)

# Train the SVM classifier on the training data
classifier.fit(X_train, y_train)

# Evaluate the model on the held-out test data (weighted averages because
# there are more than two classes)
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Save the trained model and the fitted vectorizer to files; both are needed
# at prediction time
model_filename = "text_identification_model.pkl"
vectorizer_filename = "text_identification_vectorizer.pkl"
joblib.dump(classifier, model_filename)
joblib.dump(vectorizer, vectorizer_filename)
|
data_creation/creat_csv.py
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import csv
|
2 |
+
|
3 |
+
def process_text(input_text, label='1'):
    """Turn a block of text into a list of ``line,label`` strings.

    The text is split on newlines; each line is stripped and blank lines are
    dropped. A line that does not already end with ``,<label>`` gets that
    suffix appended. Checking for the full ``,<label>`` suffix (rather than
    just the trailing digit) ensures a line whose text merely happens to end
    in that digit is still labelled.

    Args:
        input_text: Raw multi-line text, one record per line.
        label: Label appended after a comma; defaults to ``'1'``.

    Returns:
        The list of labelled, non-empty lines in their original order.
    """
    suffix = f',{label}'
    processed_lines = []
    for raw_line in input_text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if not line.endswith(suffix):
            line += suffix
        processed_lines.append(line)
    return processed_lines
|
13 |
+
|
14 |
+
def save_to_csv(processed_lines, output_file):
    """Write ``text,label`` lines to *output_file* as a two-column CSV.

    Each input string is split at its LAST comma into a text field and a
    label field, and the csv writer handles any quoting the text needs.
    Passing the pre-joined string as a single field would make the writer
    quote the whole ``text,label`` string into one column. A line with no
    comma is written as a single-field row.

    Args:
        processed_lines: Iterable of strings, normally of the form
            ``text,label``.
        output_file: Destination path; overwritten if it exists.
    """
    with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        for line in processed_lines:
            text, separator, label = line.rpartition(',')
            csv_writer.writerow([text, label] if separator else [line])
|
19 |
+
|
20 |
+
if __name__ == "__main__":
|
21 |
+
input_text = """
|
22 |
+
ืืืขื ืืืืคื ืืชืืื ื ืืืืืจ
|
23 |
+
|
24 |
+
ืื ืกื ืืืจ ืืืื ืชืืื ืืขืฆืจ ืืืืื ืื ืืืื
|
25 |
+
|
26 |
+
ืื ื ืืกืจืช ืจืืื ืืืืื ืจืคืืช ืชืืืง
|
27 |
+
|
28 |
+
ืืืฉื ืืงืืืื ืืืื ืืืจืืื ืืจืขืืช ืชืืืฅ
|
29 |
+
|
30 |
+
ืื ืขืชื ืชืืื ืืืื ืืชืื ืชืืข ืขืืื ืืชืืื
|
31 |
+
|
32 |
+
ืืื ืืจืืชื ืืกืืชื ืชืงืืชื ืืชื ืืจืืื
|
33 |
+
|
34 |
+
ืืืจ ื ื ืื ืืื ื ืงื ืืื ืืืืคื ืืฉืจืื ื ืืืื
|
35 |
+
|
36 |
+
ืืืฉืจ ืจืืืชื ืืจืฉื ืืื ืืืจืขื ืขืื ืืงืฆืจืื
|
37 |
+
|
38 |
+
ืื ืฉืืช ืืืื ืืืืื ืืืจืื ืืคื ืืืื
|
39 |
+
|
40 |
+
ืฉืืืช ืืจืื ืืงืื ืฉืื ืืฉื ื ืืคืืจืื ื ืชืขื
|
41 |
+
|
42 |
+
ืืืฉ ืืื ืืืื ืืจืฃ ืืื ื ืืืื ืืชืคืจืื
|
43 |
+
|
44 |
+
ืืืื ืืืจ ืืื ื ืืชืงื ืืื ื ืฉืืฅ ืื ืื
|
45 |
+
|
46 |
+
ืืฉืขืคืื ืืืืื ืืช ืืืื ืื ืคื ืชืจืืื ืขื ืื ืฉืื
|
47 |
+
|
48 |
+
ืคืื ืงืจืื ื ืืจืขืื ืืจื ืขืฆืืืชื ืืคืืื
|
49 |
+
|
50 |
+
ืืจืื ืขื ืคื ื ืืืืฃ ืชืกืืจ ืฉืขืจืช ืืฉืจื
|
51 |
+
|
52 |
+
ืืขืื ืืื ืืืืจ ืืจืืื ืชืืื ื ืื ืื ืขืื ื ืืืื ืืงืื ืืฉืืข
|
53 |
+
|
54 |
+
ืืื ืืฉ ืืืืื ืืฆืืง ืื ืืขืฉืื ืืืืจ ืืืจ
|
55 |
+
|
56 |
+
ืื ืืขืืืื ืื ืืืืื ืืืืืืืื ืืฉืื ืชืืื
|
57 |
+
|
58 |
+
ืืฃ ืฉืื ื ืืชื ืืืจ ืืฉืจ ืืขืคืจ ืืกืืื ืืืืืื ืืคื ื ืขืฉ
|
59 |
+
|
60 |
+
ืืืงืจ ืืขืจื ืืืชื ืืืื ืืฉืื ืื ืฆื ืืืืื
|
61 |
+
|
62 |
+
ืืื ื ืกืข ืืชืจื ืื ืืืืชื ืืื ืืืืื
|
63 |
+
|
64 |
+
ืงืจื ื ื ืืืฉ ืขืื ื ืืื ืื ืืงืืฉืื ืชืคื ื
|
65 |
+
|
66 |
+
ืื ืืืืื ืืืจื ืืขืฉ ืืคืชื ืชืืืช ืงื ืื
|
67 |
+
|
68 |
+
ืื ื ืจืืืชื ืืืื ืืฉืจืืฉ ืืืงืื ื ืืื ืคืชืื
|
69 |
+
|
70 |
+
ืืจืืงื ืื ืื ืืืฉืข ืืืืืื ืืฉืขืจ ืืืื ืืฆืื
|
71 |
+
|
72 |
+
ืืฉืจ ืงืฆืืจื ืจืขื ืืืื ืืื ืืฆื ืื ืืงืืื ืืฉืืฃ ืฆืืื ืืืื
|
73 |
+
|
74 |
+
ืื ืื ืืฆื ืืขืคืจ ืืื ืืืืืื ืื ืืฆืื ืขืื
|
75 |
+
|
76 |
+
ืื ืืื ืืขืื ืืืื ืืื ื ืจืฉืฃ ืืืืืื ืขืืฃ
|
77 |
+
|
78 |
+
ืืืื ืื ื ืืืจืฉ ืื ืื ืืื ืืืืื ืืฉืื ืืืจืชื
|
79 |
+
|
80 |
+
ืขืฉื ืืืืืช ืืืื ืืงืจ ื ืคืืืืช ืขื ืืื ืืกืคืจ
|
81 |
+
|
82 |
+
ืื ืชื ืืืจ ืขื ืคื ื ืืจืฅ ืืฉืื ืืื ืขื ืคื ื ืืืฆืืช
|
83 |
+
|
84 |
+
ืืฉืื ืฉืคืืื ืืืจืื ืืงืืจืื ืฉืืื ืืฉืข
|
85 |
+
|
86 |
+
ืืคืจ ืืืฉืืืช ืขืจืืืื ืืื ืชืขืฉืื ื ืืืืื ืชืืฉืื
|
87 |
+
|
88 |
+
ืืื ืืืืื ืืขืจืื ืืขืฆืช ื ืคืชืืื ื ืืืจื
|
89 |
+
|
90 |
+
ืืืื ืืคืืฉื ืืฉื ืืืืืื ืืืฉืฉื ืืฆืืจืื
|
91 |
+
|
92 |
+
ืืืฉืข ืืืจื ืืคืืื ืืืื ืืืง ืืืืื
|
93 |
+
|
94 |
+
ืืชืื ืืื ืชืงืื ืืขืืชื ืงืคืฆื ืคืื
|
95 |
+
|
96 |
+
ืื ืขื ืืืื ืืื ืืืชื ืืืืื ืืขืฉืจืช ืืืคืื ืืืจ ืืกืฃ ืืฉืงืื ืขื ืืื ืขืฉื ืืืืืื ืืืืื ืื ืื ืื ืืืื
|
97 |
+
|
98 |
+
ืืืกืจ ืืืื ืืช ืืืขืชื ืืขื ืืื ืืืชื ื ืืืื ืื ืืืืชื ืืืืื ืฆืจืจ ืืืืืืื
|
99 |
+
|
100 |
+
ืืืืืจ ืืืื ืืืื ืืืกืฃ ื ืชืื ืื ืืืขื ืืขืฉืืช ืื ืืืื ืืขืื ืื
|
101 |
+
|
102 |
+
ืืืงืจืื ืกืคืจื ืืืื ืืืืฉ ืืจืืฉืื ืืฉืืืฉื ืขืฉืจ ืืื ืื ืืืืชื ืืื ืืฉืจ ืฆืื ืืื ืื ืืืฉืืจืคื ื ืืืื ืืื ืืคืืืช ืืฉืจ ืขื ืืืื ื ืืืืื ื ืืื ืฉืจื ืขื ืืขื ืืืื ื ืืืืื ื ืืืชืื ืืขื ืืขื ืืืฉืื ื ืืฉื ืืืื ืืืฉืืจืฉ ื ืืชื ืื ืืชื ืืืืขืช ืืืื
|
103 |
+
|
104 |
+
ืื ืฉืืื ืกืคืจืื ืืื ืืจืฆืื ืื ืื ืืืื ืืช ืืืื ืืืฉืืื ืืืจื ืืืืื ืืช ืื ืืืืืืื ืื ืขืจ ืืขื ืืงื ืืฃ ืื ืฉืื ืืืื ืืื ืืฉืืืฉื ืขืฉืจ ืืืืฉ ืฉื ืื ืขืฉืจ ืืื ืืืฉ ืืืจ ืืฉืืื ืืืื
|
105 |
+
|
106 |
+
ืคืชืฉืื ืืืชื ืืื ืชื ืืช ืืื ืืืื ื ืืืืื ื ืืืื ืืื ืืขืืื ืืืืืช ืขืชืืื ืืืื ืืื
|
107 |
+
|
108 |
+
ืืจืฆืื ืืฆืื ืืืืคืื ืืืืจ ืืืื ืืืืช ื ืชื ื ืืฉืืฉื ืืืืจื ืืืืื ืืืื ืืฉืื ืืฉืชืืช ืืืขืืจ ืฉืืฉื ื ืืืื
|
109 |
+
|
110 |
+
|
111 |
+
ืืืจืืื ืืืข ืืช ืื ืืฉืจ ื ืขืฉื ืืืงืจืข ืืจืืื ืืช ืืืืื ืืืืืฉ ืฉืง ืืืคืจ ืืืฆื ืืชืื ืืขืืจ ืืืืขืง ืืขืงื ืืืื ืืืจื
|
112 |
+
|
113 |
+
ืืืืื ืขื ืืคื ื ืฉืขืจ ืืืื ืื ืืื ืืืื ืื ืฉืขืจ ืืืื ืืืืืฉ ืฉืง
|
114 |
+
|
115 |
+
ืืืื ืืืื ื ืืืืื ื ืืงืื ืืฉืจ ืืืจ ืืืื ืืืชื ืืืืข ืืื ืืืื ืืืืืืื ืืฆืื ืืืื ืืืกืคื ืฉืง ืืืคืจ ืืฆืข ืืจืืื
|
116 |
+
|
117 |
+
ืืชืืืืื ื [ืืชืืืื ื] ื ืขืจืืช ืืกืชืจ ืืกืจืืกืื ืืืืืื ืื ืืชืชืืืื ืืืืื ืืื ืืชืฉืื ืืืืื ืืืืืืฉ ๏ฟฝ๏ฟฝืช ืืจืืื ืืืืกืืจ ืฉืงื ืืขืืื ืืื ืงืื
|
118 |
+
|
119 |
+
ืืชืงืจื ืืกืชืจ ืืืชื ืืกืจืืกื ืืืื ืืฉืจ ืืขืืื ืืคื ืื ืืชืฆืืื ืขื ืืจืืื ืืืขืช ืื ืื ืืขื ืื ืื
|
120 |
+
|
121 |
+
ืืืฆื ืืชื ืื ืืจืืื ืื ืจืืื ืืขืืจ ืืฉืจ ืืคื ื ืฉืขืจ ืืืื
|
122 |
+
|
123 |
+
ืืืื ืื ืืจืืื ืืช ืื ืืฉืจ ืงืจืื ืืืช ืคืจืฉืช ืืืกืฃ ืืฉืจ ืืืจ ืืื ืืฉืงืื ืขื ืื ืื ืืืื ืืืืืืืื [ืืืืืืื] ืืืืื
|
124 |
+
|
125 |
+
ืืืช ืคืชืฉืื ืืชื ืืืช ืืฉืจ ื ืชื ืืฉืืฉื ืืืฉืืืื ื ืชื ืื ืืืจืืืช ืืช ืืกืชืจ ืืืืืื ืื ืืืฆืืืช ืขืืื ืืืื ืื ืืืื ืืืชืื ื ืื ืืืืงืฉ ืืืคื ืื ืขื ืขืื
|
126 |
+
|
127 |
+
ืืืืื ืืชื ืืืื ืืืกืชืจ ืืช ืืืจื ืืจืืื
|
128 |
+
|
129 |
+
ืืชืืืจ ืืกืชืจ ืืืชื ืืชืฆืืื ืื ืืจืืื
|
130 |
+
|
131 |
+
ืื ืขืืื ืืืื ืืขื ืืืื ืืช ืืืื ืืืืขืื ืืฉืจ ืื ืืืฉ ืืืฉื ืืฉืจ ืืืื ืื ืืืื ืื ืืืฆืจ ืืคื ืืืืช ืืฉืจ ืื ืืงืจื ืืืช ืืชื ืืืืืช ืืื ืืืฉืจ ืืืฉืื ืื ืืืื ืืช ืฉืจืืื ืืืื ืืืื ืืื ื ืื ื ืงืจืืชื ืืืื ืื ืืืื ืื ืฉืืืฉืื ืืื
|
132 |
+
|
133 |
+
ืืืืืื ืืืจืืื ืืช ืืืจื ืืกืชืจ
|
134 |
+
|
135 |
+
ืืืืืจ ืืจืืื ืืืฉืื ืื ืืกืชืจ ืื ืชืืื ืื ืคืฉื ืืืืื ืืืช ืืืื ืืื ืืืืืืื
|
136 |
+
|
137 |
+
ืื ืื ืืืจืฉ ืชืืจืืฉื ืืขืช ืืืืช ืจืื ืืืฆืื ืืขืืื ืืืืืืื ืืืงืื ืืืจ ืืืช ืืืืช ืืืื ืชืืืื ืืื ืืืืข ืื ืืขืช ืืืืช ืืืขืช ืืืืืืช
|
138 |
+
|
139 |
+
ืืชืืืจ ืืกืชืจ ืืืฉืื ืื ืืจืืื
|
140 |
+
ืืืืื ืืืื ืฆืืืจื ืื ืื ืืชืืคืืืช ืืืฃ ืืืื ืชืืื ืขืืื ืื ืฉืืื ืืืืืจืื
|
141 |
+
|
142 |
+
ืฉื ื ืฉืืื ืืฉื ื ืขืคืจืื ืชืืืื ืฆืืื ืืจืืขืื ืืฉืืฉื ืื
|
143 |
+
|
144 |
+
ืขื ืฉืืคืื ืืืื ืื ืกื ืืฆืืืื ืืื ืื ืื ืืจ ืืืืจ ืืื ืืืขืช ืืืืื ื
|
145 |
+
|
146 |
+
ืืื ืืคื ืจืขืืชื ืืืื ืืื ืื ืืชื ืืืื ืื ืืื ืืชื ืืืื ืื ืชืืืื ืชืฉืืจื ืืจืืฉ ืืื ื ืืจืืฉ ืฉื ืืจ ืืืจืืื ืืืขื ืืช ืืจืืืช ืืืจืจื ื ืืจืื
|
147 |
+
|
148 |
+
ืืืืชื ื ืืืชื ืืื ืืืืชืื ื ืืืื [ืืืืช] ืืขืื ืื ืืืื ืขื ืง ืืฆืืจื ืื
|
149 |
+
|
150 |
+
ืื ืืคื ืืืื ืืืชื ืืื ืื ืืื ืืืื ืืืื ืืจืื ืฉืื ืื ืืื ืืฉืืื
|
151 |
+
|
152 |
+
ื ืคืช ืชืืคื ื ืฉืคืชืืชืื ืืื ืืืฉ ืืืื ืชืืช ืืฉืื ื ืืจืื ืฉืืืชืื ืืจืื ืืื ืื ืื ื ืขืื ืืืชื ืืื ืื ื ืขืื ืืขืื ืืชืื
|
153 |
+
|
154 |
+
ืฉืืืื ืคืจืืก ืจืืื ืื ืขื ืคืจื ืืืืื ืืคืจืื ืขื ื ืจืืื
|
155 |
+
|
156 |
+
ื ืจื ืืืจืื ืงื ื ืืงื ืืื ืขื ืื ืขืฆื ืืืื ื ืืจ ืืืืืืช ืขื ืื ืจืืฉื ืืฉืืื
|
157 |
+
|
158 |
+
ืืขืื ืื ืื ืืืจ ืืื ืืืื ืื ืืืื ืื ืืื ืื
|
159 |
+
|
160 |
+
ืขืืจื ืฆืคืื ืืืืื ืชืืื ืืคืืื ืื ื ืืืื ืืฉืืื ืืื ืืืื ืืื ื ืืืืื ืคืจื ืืืืื
|
161 |
+
|
162 |
+
ืืืชื ืืื ื ืืืชื ืืื ืืจืืชื ืืืจื ืขื ืืฉืื ืืืืชื ืืขืจื ืขื ืืืฉื ืฉืชืืชื ืืื ื ืขื ืืืื ืืืื ืจืขืื ืฉืชื ืืฉืืจื ืืืืื
|
163 |
+
"""
|
164 |
+
|
165 |
+
processed_text = process_text(input_text)
|
166 |
+
|
167 |
+
output_file = 'processed_text.csv'
|
168 |
+
save_to_csv(processed_text, output_file)
|
data_creation/creat_model_bible.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import re
|
3 |
+
import nltk
|
4 |
+
from nltk.tokenize import word_tokenize
|
5 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
6 |
+
from sklearn.svm import SVC
|
7 |
+
from sklearn.model_selection import train_test_split
|
8 |
+
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
|
9 |
+
import joblib
|
10 |
+
|
11 |
+
|
12 |
+
"""
|
13 |
+
# Download the Hebrew stopwords (if not already downloaded)
|
14 |
+
nltk.download('stopwords')
|
15 |
+
|
16 |
+
# Function to remove punctuation and special characters from text
|
17 |
+
def remove_punctuation(text):
|
18 |
+
return re.sub(r'[^\w\s]', '', text)
|
19 |
+
|
20 |
+
# Function to remove custom stop words from text
|
21 |
+
def remove_custom_stopwords(text):
|
22 |
+
hebrew_stopwords = {'ืื ื', 'ืืชื', 'ืืช', 'ืื ืื ื', 'ืืชื', 'ืืชื', 'ืื', 'ืื'} # Add your custom Hebrew stopwords here
|
23 |
+
return ' '.join(word for word in text.split() if word not in hebrew_stopwords)
|
24 |
+
|
25 |
+
# Remove punctuation and custom stop words from the text data
|
26 |
+
data['text'] = data['text'].apply(remove_punctuation)
|
27 |
+
data['text'] = data['text'].apply(remove_custom_stopwords)
|
28 |
+
"""
|
29 |
+
|
30 |
+
# Load the dataset (assuming it is in UTF-8 encoding)
data = pd.read_csv('bible_data.csv', encoding='utf-8')

# Separate features (text) and labels (0 or 1)
X = data['text']
y = data['label']

# Split the raw texts BEFORE vectorizing. Fitting TF-IDF on the full dataset
# would let the held-out rows shape the vocabulary and IDF weights, leaking
# test information into training and inflating the reported metrics.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=47
)

# TF-IDF vectorizer that tokenizes with NLTK's word_tokenize.
# NOTE(review): word_tokenize is used with its default language setting, so
# this is not a Hebrew-specific tokenizer; confirm that is intended.
vectorizer = TfidfVectorizer(tokenizer=word_tokenize, lowercase=True)

# Learn vocabulary and IDF weights from the training texts only, then apply
# the fitted vectorizer unchanged to the test texts.
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Linear-kernel SVM; probability=True is requested so the fitted model can
# report class probabilities.
classifier = SVC(kernel='linear', C=0.5, probability=True)

# Train the SVM classifier on the training data
classifier.fit(X_train, y_train)

# Evaluate the model on the held-out test data (binary metrics with the
# scorers' default positive label)
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Save the trained model and the fitted vectorizer to files; both are needed
# at prediction time
model_filename = "is_this_bible_model.pkl"
vectorizer_filename = "is_this_bible_vectorizer.pkl"
joblib.dump(classifier, model_filename)
joblib.dump(vectorizer, vectorizer_filename)
|
data_creation/divided_sentences.txt
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ืืข"ืค ืฉืื ืืคืฉืจ ืืืื ืื ืื ืืจ' ืืืืจ ืืข"ืค ืฉืื ืืคืฉืจ ืืจืื ื,2
|
2 |
+
ืืืชื ื ืื ืืขืืืื ืจ"ื ืืืืจ ืื ืืืืฆืช ืืื ืืชืืืืช ืืื"ื ืื ืืืืฆืช,2
|
3 |
+
ืื ืืชืืืืช ืืื ื ืืืขื ื ืืฉืื ืืื ืืคืฉืจ ืืื ืื ืื ืชื ื ืืข"ืค,2
|
4 |
+
ืฉืื ืืคืฉืจ ืืื ืืืื ื ืจืื ื ืฉืื ืชืืชืื ืืชื ืืจืืฉื ืืืืขืื ืขืืืื ืืชื,2
|
5 |
+
ืืจืืฉื ืืจืื ืืืืจ ืืืขืืื ืืืืืฉ ืืืืขืืื ืืจืื ื ืืืขืืืืื ืืื ืืืืฉื ืืืืขืืื ืืื ื,2
|
6 |
+
ืืืื ืืกืชืื ืืื ืืืื ืืืืงื ืืื ืืฉืืื ืืืืจ ืืืื ืืื ืจืื ื ืืจ"ื,2
|
7 |
+
ืืขืืืื ืงืืื ืงื"ื ืืื ืืคืฉืจ ืืืื ืืชื ืืื ืชืจ ืืื ืื ืชืจ ืืฉืืื ืืจ"ื,2
|
8 |
+
ืืืื ื ืืืชืื (ืืืืงืื ืื) ืฉืืื ื ืืื ื ืืฉืขืจื ืฆืื ืืื ืืจืื ื ืืืคืื ืืืขื,2
|
9 |
+
ืืื ื"ืง ืืืื ืฉืฉืืื ื ืืื ื ืืืืืข ืฉืฉืขืจื ืฆืื ืืฉืืื ืืจ"ื ืืืื ื ืืืชืื,2
|
10 |
+
(ืืืืงืื ืื) ืืขืฉืืช ืืืฆืจืื ืืืื ืืืขื ืฉืื ื ืขืืจืื ืืื ืืจืื ื ืืืคืื ืืืขื,2
|
11 |
+
ืืื ื"ืง ืืืื ืฉืืื ืืืื ืืืืืข ืฉืืื ื ืขืืจืื ืืืืืขืืช ืืืื ืืื ืฉืื,2
|
12 |
+
ืืืื ืืืื ืืชืื ืื"ืง ืืงื"ื ืืืฉืจืื ืืืืจืคื ืืืื ืื ืืืจืช ืื ืืืฉืชืื,2
|
13 |
+
ืืืื ื ืื ืื ืืืจืช ืื ืืืืื ืขืืื ืืืื ืืชืืชืื ืกืืืื ื ืื ืื ืืืจ,2
|
14 |
+
ืจื ืืืืื ืืืจ ืจื ืืื ืชื ื ืืื ืจ' ืืฉืืขืื ืืืจ ืงืจื (ืืืืืจ,2
|
15 |
+
ื) ืืืฉ ืื ืืฉื ืื ืืขืฉื ืืื ืืืืืช ืืืื ืืฉืื ืืืชืื ืืฉื,2
|
16 |
+
ืืืืฉ ืืื ืขืื ืฉืื ืฉืืชืืจื ืื ืืืฉ ืืกืืื ืืื ืืฃ ืืฉื ืืกืืื ืืื,2
|
17 |
+
ืืืืื ืื ืืื ืื ืืื ืืืืฉ ืื ืืืฉ ืชืืชืื ืืื ืขืืืื ืืฃ,2
|
18 |
+
ืืฉื ืชืืชืื ืืื ืขืืืื ืชื ืื ื ืื ืืื ื"ืจ ืืืืขืืจ ืืจ' ืฆืืืง ืื,2
|
19 |
+
ืืื ืืคืจืฉืื ืืืื ื ืืืืจื ืืืื ืฉืื ืชืืชืื ืฉืื ืืื ืืฉืืืืื ืขื ืขืืืื,2
|
20 |
+
ืชื ืื ืจืฉื"ื ืืืืจ ืื ืืช ืืจืืื ืชืืชืื ืืืืจ ืืื ืืคื ื ืฉืจืืืืืช ืืืจืืฆืืืช ืื ืืช,2
|
21 |
+
ืืคืจืื ืขืืืื ืืืืจ ืืื ืืคื ื ืฉืืืื ืืช ืืจืืื ืจ"ืฉ ืื ืืืขืืจ ืืืืจ ืื ืืช,2
|
22 |
+
ืขืฉืืจืื ืฆื ืืืื ืืืืจ ืืื ืฉื ืืฉืืฃ ืืืคืงืจืืกืืชื ืื ืืช ืขื ืืื ืฆื ืฉืืื ืืืืจ,2
|
23 |
+
ืืื ืืคื ื ืฉืฉืืืืืช ืืื ืืื ืขืืืื ืืืืืขืืช ืืืื ืืคื ื ืฉื ืืฉืืื ืืืืื ืขื,2
|
24 |
+
ืืกืกืืื ืช"ืจ ืฆื ืฉืืื ืงืืื ืืฆื ืืืื ืจืื ืื ืื ื ืื ืืื ืจ',2
|
25 |
+
ืืืืฉืข ืืืืจ ืืขืืื ืื ืงืื ืฆื ืฉืืื ืืฆื ืืืื ืืืฅ ืืืืช ืฉืืืชื,2
|
26 |
+
ืืฉืืื ืชื ืฉืงืื ืฆื ืฉืืื ืืฆื ืืืื ืืืืจ ืืืืชื ื ืช"ืจ ืื ืื ืืืงืืช ื ืืืงืืช,2
|
27 |
+
ืขื ืคื ื ืฉืื ืืื ืืื ืจืื ืืืืขืืจ ืืืกืจ ืืืฉืชื ืืจืื ืืฉืืขืื ืืืกืจ,2
|
28 |
+
ืืืื ืจืื ืืืืื ืืืืจ ืืคื ื ืืคืจืง ืืืืืจ ืืคืจืง ื ืฉืื ืืืืงืืช ืืืชื ืชืื,2
|
29 |
+
ืืคืจืง ืืื ื ืฉืื ืืืืงืืช ืืืชื ืฉืืื ืืฉืืืื ืกืคืงืืช ืขื ืคื ื ืฉืื ืจ"ืฉ,2
|
30 |
+
ืืืืจ ืืฃ ืชืื ืืคืจืง ื ืฉืื ืืืืงืืช ืืืชื ืื ืืื ืช ืืฉื ืืืืืืจ ืืื ืื,2
|
31 |
+
ืืืงื ืืืฆื ืืืืื ืืื ืฉืื ืชืืื ืงืื ื ืืื ืฉืื ืชืืืืฅ ืืื ืืื,2
|
32 |
+
ื ืืื ืช ืืืืจ ืงืื ื ืืื ืฉืชืืื ืืืืืื ืืื ืฉืชืืืืฅ ืืืจ ืืจ ืจืื ืืืืื,2
|
33 |
+
ืืืืจ ืืคื ื ืืคืจืง ืืืืืจ ืืคืจืง ื ืฉืื ืืืืงืืช ืืืชื ืืฉืืื ืืคื ื ืืคืจืง ืืขื,2
|
34 |
+
ืืืืงื ืืื ืืฉืชืืื ืืืืจ ืืคืจืง ืฉืืื ื ืื ืื ืืื ืืืืจ ืืคืจืง ืืื ืื,2
|
35 |
+
ืืืืงื ืืืืืจ ืจืื ืงืื ื ืฉืืืืขื ืืืื ืฉื ืืชืื ืืื ื ืฆืจืืื ืืืืงื ืืืงื ืืืืื,2
|
36 |
+
ืกืืื ืื ืื ืืืจ ืจืื ืืืงื ืืืืืื ืืื ืืืืืฆื ืืขืื ืืืืงื ืชืื ืืคืจืง,2
|
37 |
+
ืืื ื ืฉืื ืืืืงืืช ืืืชื ืงืกืืจ ืชืื ืืคืจืง ืืืืืจ ืืคืจืง <ืืื> ืืืืืจ ืืคืจืง,2
|
38 |
+
ืืืืื ืืืงื ืืจืื ืกืืืื ื ืื ืฉืื ืืืืงื ืชืื ืืคืจืง ืืืืื ืืืงื ืืจืื ืื,2
|
39 |
+
ืกืืืื ื ืื ืฉืื ืืื ืืืงื ื ืฉืื ืจ"ืฉ ืืืืจ ืืฃ ืชืื ืืคืจืง ื ืฉืื ืืืืงืืช,2
|
40 |
+
ืืืชื ืงืกืืจ ืชืื ืืคืจืง ืืืคื ื ืืคืจืง ืืืขืื ืืืืงื ืืื ืืฉืชืืื ืืืืจ ืืคืจืง,2
|
41 |
+
ืฉืืื ื ืื ืื ืื ืืื ืช ืืฉื ืืืืืืจ ืืื ืื ืืืงื ืืื ืืื ืงืชื ื ืื,2
|
42 |
+
ืืืืขืืช ืืืื ืจืื ืืืืื ืืืชืื ืืคืจืง ืืืืืขืืช ืืืื ืจืื ืฉืืขืื ืืืืืจ ืืคืจืง,2
|
43 |
+
ืืืืช ืืื ืืืงื ืืจืื: ืืคื ื ืฉืืืจื ืืคืฉืจ ืื': ืื ืชื ืืื ืื,2
|
44 |
+
ืื ืชื ื ืืื ืจืืฉื ืืื ืชืืื ืืฉืื ืืงื ืืขื ืืืกืชืื ืืจืื ื ืคืฉืืื,2
|
45 |
+
ืืืื ืืจืืื ืืืื ืืจืืื ืืื ืืชืืื ืืกืชืืจื ืืขืื ืืจ"ื ืืงื ืืกืืืข ืืื,2
|
46 |
+
ืงืจืื ืงื"ื ืืืืืขืืช ืืืื ืืฉืื ืืงื ืืขื ืืืชื ื ืืืืฆื ืื: ืืืืฆื ืื,2
|
47 |
+
ืื ืืื ืืจืก ืฉืืื ืืื ืืก ืืืฆืื ืืืฉ ืฉืืืฆืื ืืืื ื ืืื ืืก ืื ืืืจ,2
|
48 |
+
ืฉืืฉ ืื ืฆืคืืจื ืืฉ ืื ืขืฆื ืืืฉ ืฉืืฉ ืื ืขืฆื ืืืื ืื,2
|
49 |
+
ืฆืคืืจื ืื ืืืืื ืืืจืก ืืืื ืืื ืืช ืืืฉ ืฉืืืื ืืื ืืช ืืืื ื,2
|
50 |
+
ืืืื ืืืจืก: ืืื ืืก ืคืกืื ืืื ืืืืช ืืคืกืื ืืฉืื ืืกืืจื ืืืฆืื ืืฉืจ ืืื,2
|
51 |
+
ืืืืช ืืคืกืื ืืฉืื ืืกืืจื ืืืจ ืจื ืืกื ืฉืื ืื ืืื ืืจืก ืฉืืขืืจื ืืืื ืก,2
|
52 |
+
ืืฉืงื ืืื ืืืจื ืืืฆืื ืืฉืงื ืืื ืืขื ืื ืืกืืจื ืืืื ืืื ืืขืื ืืืจ,2
|
53 |
+
ืืจ ืืืืจื ืืจืื ืืจื ื ืืื ืืคื ืฉืืื ืืืืจืื ืืื ืืกืืจื ืืืกืืจื ืชื ื,2
|
54 |
+
ืจืื ื ืืืฆื ืืืืงืื ืืื ืืจืก ืืืืข ืื ื ืืงื ืืืื ืก ืืฉืงื ืื ืืื,2
|
55 |
+
ืืืื ืขืจืืื ืืืื ืืื ืื ืืชื ืงืืจื ืืชืืื ืื ืื ืกื ืืืืืข ืฉืืื ืก ืืฉืงื,2
|
56 |
+
ืืื ืืื ืืืืืข ืฉืืืฆืื ืืฉืงื ืจืื ืืืืื ืืืืจ ืืืคืฃ ืืื ื ืงืืจื ืืชืืื,2
|
57 |
+
ืืืฆืืฃ ืขืืื ืืื ืืื ืืื ืก ืืืืืข ืฉืืื ืก ืืฉืงื ืืื ืืื ืืืืืข ืฉืืืฆืื,2
|
58 |
+
ืืฉืงื ืื ืฉืืคืชื ืขื ืืื ืืืืจ ืื ืืืืจ ืืขืืืื ืืืืืข ืฉืืืฆืื ืืฉืงื,2
|
59 |
+
ืืื ืืื ืืืืืข ืฉืืื ืืก ืืฉืงื ืจ' ืืืกื ืืืืจ ืืฃ ืื ืฉืืคืชื ืขื,2
|
60 |
+
ืืื ืืืืจ ืืคื ื ืฉืืืืจ ืืขืืืื ืืื ืฉืืคืชื ืขื ืืื ืืจืืฅ ืื ืจืืฅ,2
|
61 |
+
ืืขืืืื ืืืืืข ืฉืืืฆืื ืืฉืงื ืืื ืืื ืืืืืข ืฉืืื ืก ืืฉืงื ืืื ืืืจื ืืืคื,2
|
62 |
+
ืืืจ ืืืคื ืืืืืข ืฉืืื ืก ืืฉืงื ืืื ืืืื ืืื ืช"ืง ืืจ' ืืืืื ืืืจ,2
|
63 |
+
ืขืืื ืืื ืืก ืขื ืืื ืืืืง ืืืื ืืื ืืืื: ืื ืืืจ ืฉืืฉ ืื ืฆืคืืจื,2
|
64 |
+
ืืื': ืืฉ ืื ืฆืคืืจื ืืืื ืืืืข ืืืืฉื ืืืืื ืืฉ ืื ืขืฆื ืืืื,2
|
65 |
+
ืื ืฆืคืืจื ืืืื ืืืืข ืืืืฉื ืืืื ื ืืืื ืืืื ืืืจ ืจื ืืกืื ืืืจ,2
|
66 |
+
ืื ืจืืื ื ืืืืื ืืืจื ืืืงืื ืืืื ืืขืืจื ืืฆืืข ืืชืจื ืฉืืฉ ืื ืขืฆื,2
|
67 |
+
ืืืื ืื ืฆืคืืจื ืืืื ืืืืข ืืืืฉื ืืืื ื ืืืื ืืืื ืืืจ ืจืื ืืจ,2
|
68 |
+
ืืจ ืื ื ื"ืจ ืืืื ื ืืืฉืืื ื ื ืกืคืจืช ืขื ืื ืืื: ืื ืืืืื ืืืจืก,2
|
69 |
+
ืืื': ืื ืืืื ืืืืจืก ืืืื ืืื ืืช ืืืฉ ืฉืืืื ืืื ืืช ืืืื,2
|
70 |
+
ืืืื ืืืจืก ืืืชืืื ืืื ืืืชืืื ืกืื ืืชืจืงื ืืชื ืื (ืืืงืจื ืื) ืืืืืฉื ืขื,2
|
71 |
+
ืืืื ืืืื ืืคื ืกืื ืืืฉื ืขืืื ืื ืชืจืงื ืืืฉื ืขืืื ืืื ืืื,2
|
72 |
+
ืช"ื (ืืืงืจื ืื) ืืฉืจ ืืฉื ืขืืื ืืื ืื ืฉืืืืื ืืืฉืืื ืืฆื ืื,2
|
73 |
+
ืฉืืืืจืื ืื ืขืืื ืื ืขืฉื ืืืืืชื ื: ืื ืืจืืื ืืืื ืืื ื ื ืคืฉืืช ืจืืื ืืืื,2
|
74 |
+
ืืื ื ืืืื ืืช ืืืฉ ืฉืจืืื ืืืื ืืื ื ืืืื ืืช ืืืื ื ืจืืื ืืืื ืืื ื ื ืคืฉืืช:,2
|
75 |
+
ืืืจ ืจื ืืืืื ืืืชืืื ืืืืจ ืชื ืื ื ืืื ืืืื ื ืืื ืืฉืจืื ืืืื ืืื ื,2
|
76 |
+
ืืืื ืืช ืืืื ืืื ืืฉืจืื ืืืื ืืื ื ื ืคืฉืืช ืืืืื ื ืื ืืืชืืื ืืื ืืืืจ,2
|
77 |
+
ืจื ืืืืื ืืืชืืื ืืืืจ ืืื ืืืชืืื ืืจ ืืืื ืืืชืืื ืืืืจ ืืฆืจืืื ืืื,2
|
78 |
+
ืืฉืืขืื ื ืืจ ืืฉืื ืืจืืื ืืื ืืงืื ืืื ืืืืจ ืืืื ืจืืื ืืื ืืงืื,2
|
79 |
+
ืืืื ืื ืืื ืืฉืืขืื ื ืืืืจ ืืฉืื ืืงืืชื ืืืคื ืืฉืจื ืืื ืืจ ืืงืืชื,2
|
80 |
+
ืืืคื ืคืกืืื ืืืื ืื ืฆืจืืื: ืื ืืืฉืจ ืืืื ืืฉืจ ืืืขืื ืืืฉ ืฉืืฉืจ,2
|
81 |
+
ืืืขืื ืืืื ื ืืฉืจ ืืืื: ืืืชืืื ืืื ื"ืจ ืืืื ื ืืืชืืื ืกืืื ืืืืช ืืขืื ืื,2
|
82 |
+
ืืื ื ืจืื ืืืืจ ืืื ืืชื ืื ืืื ืจืื ืืืืจ ืืืืจ ืื ืช"ื (ืืืจืื,2
|
83 |
+
ืื) ืขื ืคืืื ืืืื ืื ืจืื ืืื ื ืืข ืืื ืื ืขื ืื ืจืืืื,2
|
84 |
+
ืืฆื ื ืืขืื ืืงืืฉ ืจืืืื ืื ืืขืื ืื ื ืืขืื ืืืื ืืืชืื (ืืืงืจื ืื) ืืืืื,2
|
85 |
+
ืืจืืืช ืื ืืฃ ืจืืืื ืืืื ืืื ื ืืขืื ืฉืื ืืกืืื ืืืชืื (ืืืงืจื ืื),2
|
86 |
+
ืืื ืืจืื ืขืื ื ืืืื ืืฃ ืจืืืื ืฉืื ืืกืืื ืืืงืืฉ ื ืืขืื ืืจืืืื ืื,2
|
87 |
+
ืจืืืื ืฉืื ืืงืจืืืื ืืฃ ื ืืขืื ืฉืื ืืงืจืืืื ืื ืื ืจืืืื ืืฉืืฉื ืืฃ,2
|
88 |
+
ื ืืขืื ืืฉืืฉื ืืืื ืืื ืืืื ื ืืฉืืฉื ืืืคื ืื ื"ืฉ ืช"ื (ืืืงืจื ืื),2
|
89 |
+
ืืืืื ืื ืืืจื ืืืื ืื ืื ืืื ืืื ืื ืืืื ืื ืื ืืืืช ืฉืืคืืื,2
|
90 |
+
ืืื ืืื ืจืืื ืืช ืื ืืขืื ืืืื ืกืืื ืืืื ืืฉืืืืชืื ืืจืื ืืืื ื ืืืื,2
|
91 |
+
ืงืืืื ืืื ื ืืื ืงืืืจ ืืื ืืื ืืืื ืืืื ืขืืื ืืื ืืืืืจ ืจืื,2
|
92 |
+
ืืืื ื ืืืื ืืกืชื ืืฉื ื ืืชื ื ืื ืืืฉืจ ืืืื ืืฉืจ ืืืขืื ืืืฉ ืืฉืจ,2
|
93 |
+
ืืืขืื ืืืื ืืฉืจ ืืืื ืืืืจืื ื ืืืชืืื ืืื ืืืืจ ืจืื ืืืื ื ืืืชืืื ืกืืื,2
|
94 |
+
ืืืืช ืืขืื ืื ืจืื ืืืื ื ืกืชืื ืืืจืื ื ืืฉืื ืืชื ื ืืื ื ืืืื ืืช ืื ืื ืืืื,2
|
95 |
+
ืืืืืจืื ืืืืื ืืืื ืืืืืื ืืืื ืกืชืื ืืืื ืกืชืื ืืืืขืืช ืืืื ืกืชืื ืืจืืื,2
|
96 |
+
ืขืืืฃ ืืืืืขืืช ืืืื ืืฉืื ืืงืชื ื ืื ืืื ืืืืชื ืืืื ื: ืื ืฉืืืื ืืืขืฉืจืืช,2
|
97 |
+
ืืืื ืืืืืช ืืืืืื ืืืฉ ืฉืืืื ืืืืืช ืืืืืื ืืืื ื ืืืื ืืืขืฉืจืืช: ืืืชืืื ืืื,2
|
98 |
+
ืืืชืืื ืืฉืจ ืืืืื ืืืืฆืื: ืื ืฉืืืื ืืคืื ืืืื ืืืขืฉืจืืช ืืืฉ ืฉืืืื ืืืขืฉืจืืช,2
|
99 |
+
ืืืื ื ืืืื ืืคืื: ืืืชืืื ืืื ืืืชืืื ืชืื ื ืืืจืง ืฉืืื ื ืืืื ืืคืื ืืชื ื,2
|
100 |
+
ืืื ืืืจื ืืคืื ืื ืฉืืื ืืืื ืื ืฉืืจ ืืืืืืื ืื ืืืจืฅ ืืืงืืืชื ืืืื,2
|
101 |
+
ืืืื ืืกื ืืงืืื ืืืื ืืคืื ืืืื ืืืขืืื ืกืคืืื ืกืืื ืืงืืฆื ืื ืฉืืจ ืืืขืืื ืืคืงืจ,2
|
102 |
+
ืืืืืืื ืื ืืืจืฅ ืืืขืืื ืืืืื ืืคืืจืืืช ืืืงืืืชื ืืืื ืืืขืืื ืชืื ื ืืืื ืืกื ืืงืืื,2
|
103 |
+
ืืืขืืื ืืจืง ืืืืื ืืื ืืขืฉืจ ืชื ื ืื ืฉืืื ืืืื ืื ืฉืืจ ืืืืืืื ืื,2
|
104 |
+
ืืืจืฅ ืืืื ืืืขืฉืจืืช ืืืืื ืืงืืืชื ืืืื ืืืื ืืกื ืืงืืื ืื ืงืชื ื ืื ืืื,2
|
105 |
+
ืืื ืฉืืืื ืืืฆืืื ืืืืืื ืืชื ื ืืืื ืืช ืืฆืืื ืฉืืื ืืืจืง ืจ' ืืืกื ืืืืจ,2
|
106 |
+
ืคืื ืืื ืืืช ืืืืช ืืื"ื ืืืืช ืขื ืืื ืืืจ ืจืื ืืจ ืืจ,2
|
107 |
+
ืื ื ื"ืจ ืืืื ื ืขืืืฉืื ืฉืืจืขื ืืชืืืื ืืืืื ืื ืืื ืขืืืื ืืืื,2
|
data_creation/graf_model.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import joblib
|
2 |
+
import numpy as np
|
3 |
+
import matplotlib.pyplot as plt
|
4 |
+
from sklearn.svm import SVC
|
5 |
+
|
6 |
+
# Load the trained model and vectorizer
|
7 |
+
model_filename = "text_identification_model.pkl"
|
8 |
+
vectorizer_filename = "text_identification_vectorizer.pkl"
|
9 |
+
loaded_classifier = joblib.load(model_filename)
|
10 |
+
vectorizer = joblib.load(vectorizer_filename)
|
11 |
+
|
12 |
+
# Create a sample text
|
13 |
+
sample_text = "ืงืืฅ ืืจืื ืื ืขืื ืืืจืื!"
|
14 |
+
|
15 |
+
# Transform the sample text using the vectorizer
|
16 |
+
sample_text_tfidf = vectorizer.transform([sample_text])
|
17 |
+
|
18 |
+
# Make predictions using the loaded model
|
19 |
+
predicted_class = loaded_classifier.predict(sample_text_tfidf)
|
20 |
+
|
21 |
+
# Visualize the decision boundaries (example with a simple 2D dataset)
|
22 |
+
# Modify this part according to your data and model
|
23 |
+
# For complex data, consider using libraries like plotly
|
24 |
+
# to create more informative visualizations
|
25 |
+
|
26 |
+
# Build a toy 2-D, three-label dataset purely for illustration. It is random
# and independent of the text model loaded earlier in this script.
X_visual = np.random.rand(300, 2) * 10
y_visual = np.random.randint(0, 3, size=300)

# Train a linear SVM on the generated data
svm_classifier = SVC(kernel='linear', C=1.0)
svm_classifier.fit(X_visual, y_visual)

# Plot the data points
plt.scatter(X_visual[:, 0], X_visual[:, 1], c=y_visual, cmap=plt.cm.Paired)

# Evaluate the SVM on a 50x50 grid spanning the current axis limits
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()

xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 50),
                     np.linspace(ylim[0], ylim[1], 50))

Z = svm_classifier.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# zorder=0 keeps the filled decision regions underneath the scatter points;
# without it the regions are drawn last and cover the data.
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8, zorder=0)

# Highlight the support vectors
plt.scatter(svm_classifier.support_vectors_[:, 0],
            svm_classifier.support_vectors_[:, 1],
            s=100, facecolors='none', edgecolors='k')

# Plot the predicted sample point
# NOTE(review): the coordinates are the first two columns of the sample's
# TF-IDF vector, which are not in the same feature space as the random toy
# data above, so the marker position is only illustrative - confirm intent.
plt.scatter(sample_text_tfidf[0, 0], sample_text_tfidf[0, 1], marker='x', color='red', label=f'Predicted Class: {predicted_class[0]}')

plt.title('Support Vector Machine Visualization')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()
|
data_creation/processed_text.csv
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"ืืืขื ืืืืคื ืืชืืื ื ืืืืืจ,1"
|
2 |
+
"ืื ืกื ืืืจ ืืืื ืชืืื ืืขืฆืจ ืืืืื ืื ืืืื,1"
|
3 |
+
"ืื ื ืืกืจืช ืจืืื ืืืืื ืจืคืืช ืชืืืง,1"
|
4 |
+
"ืืืฉื ืืงืืืื ืืืื ืืืจืืื ืืจืขืืช ืชืืืฅ,1"
|
5 |
+
"ืื ืขืชื ืชืืื ืืืื ืืชืื ืชืืข ืขืืื ืืชืืื,1"
|
6 |
+
"ืืื ืืจืืชื ืืกืืชื ืชืงืืชื ืืชื ืืจืืื,1"
|
7 |
+
"ืืืจ ื ื ืื ืืื ื ืงื ืืื ืืืืคื ืืฉืจืื ื ืืืื,1"
|
8 |
+
"ืืืฉืจ ืจืืืชื ืืจืฉื ืืื ืืืจืขื ืขืื ืืงืฆืจืื,1"
|
9 |
+
"ืื ืฉืืช ืืืื ืืืืื ืืืจืื ืืคื ืืืื,1"
|
10 |
+
"ืฉืืืช ืืจืื ืืงืื ืฉืื ืืฉื ื ืืคืืจืื ื ืชืขื,1"
|
11 |
+
"ืืืฉ ืืื ืืืื ืืจืฃ ืืื ื ืืืื ืืชืคืจืื,1"
|
12 |
+
"ืืืื ืืืจ ืืื ื ืืชืงื ืืื ื ืฉืืฅ ืื ืื,1"
|
13 |
+
"ืืฉืขืคืื ืืืืื ืืช ืืืื ืื ืคื ืชืจืืื ืขื ืื ืฉืื,1"
|
14 |
+
"ืคืื ืงืจืื ื ืืจืขืื ืืจื ืขืฆืืืชื ืืคืืื,1"
|
15 |
+
"ืืจืื ืขื ืคื ื ืืืืฃ ืชืกืืจ ืฉืขืจืช ืืฉืจื,1"
|
16 |
+
"ืืขืื ืืื ืืืืจ ืืจืืื ืชืืื ื ืื ืื ืขืื ื ืืืื ืืงืื ืืฉืืข,1"
|
17 |
+
"ืืื ืืฉ ืืืืื ืืฆืืง ืื ืืขืฉืื ืืืืจ ืืืจ,1"
|
18 |
+
"ืื ืืขืืืื ืื ืืืืื ืืืืืืืื ืืฉืื ืชืืื,1"
|
19 |
+
"ืืฃ ืฉืื ื ืืชื ืืืจ ืืฉืจ ืืขืคืจ ืืกืืื ืืืืืื ืืคื ื ืขืฉ,1"
|
20 |
+
"ืืืงืจ ืืขืจื ืืืชื ืืืื ืืฉืื ืื ืฆื ืืืืื,1"
|
21 |
+
"ืืื ื ืกืข ืืชืจื ืื ืืืืชื ืืื ืืืืื,1"
|
22 |
+
"ืงืจื ื ื ืืืฉ ืขืื ื ืืื ืื ืืงืืฉืื ืชืคื ื,1"
|
23 |
+
"ืื ืืืืื ืืืจื ืืขืฉ ืืคืชื ืชืืืช ืงื ืื,1"
|
24 |
+
"ืื ื ืจืืืชื ืืืื ืืฉืจืืฉ ืืืงืื ื ืืื ืคืชืื,1"
|
25 |
+
"ืืจืืงื ืื ืื ืืืฉืข ืืืืืื ืืฉืขืจ ืืืื ืืฆืื,1"
|
26 |
+
"ืืฉืจ ืงืฆืืจื ืจืขื ืืืื ืืื ืืฆื ืื ืืงืืื ืืฉืืฃ ืฆืืื ืืืื,1"
|
27 |
+
"ืื ืื ืืฆื ืืขืคืจ ืืื ืืืืืื ืื ืืฆืื ืขืื,1"
|
28 |
+
"ืื ืืื ืืขืื ืืืื ืืื ื ืจืฉืฃ ืืืืืื ืขืืฃ,1"
|
29 |
+
"ืืืื ืื ื ืืืจืฉ ืื ืื ืืื ืืืืื ืืฉืื ืืืจืชื,1"
|
30 |
+
"ืขืฉื ืืืืืช ืืืื ืืงืจ ื ืคืืืืช ืขื ืืื ืืกืคืจ,1"
|
31 |
+
"ืื ืชื ืืืจ ืขื ืคื ื ืืจืฅ ืืฉืื ืืื ืขื ืคื ื ืืืฆืืช,1"
|
32 |
+
"ืืฉืื ืฉืคืืื ืืืจืื ืืงืืจืื ืฉืืื ืืฉืข,1"
|
33 |
+
"ืืคืจ ืืืฉืืืช ืขืจืืืื ืืื ืชืขืฉืื ื ืืืืื ืชืืฉืื,1"
|
34 |
+
"ืืื ืืืืื ืืขืจืื ืืขืฆืช ื ืคืชืืื ื ืืืจื,1"
|
35 |
+
"ืืืื ืืคืืฉื ืืฉื ืืืืืื ืืืฉืฉื ืืฆืืจืื,1"
|
36 |
+
"ืืืฉืข ืืืจื ืืคืืื ืืืื ืืืง ืืืืื,1"
|
37 |
+
"ืืชืื ืืื ืชืงืื ืืขืืชื ืงืคืฆื ืคืื,1"
|
38 |
+
"ืื ืขื ืืืื ืืื ืืืชื ืืืืื ืืขืฉืจืช ืืืคืื ืืืจ ืืกืฃ ืืฉืงืื ืขื ืืื ืขืฉื ืืืืืื ืืืืื ืื ืื ืื ืืืื,1"
|
39 |
+
"ืืืกืจ ืืืื ืืช ืืืขืชื ืืขื ืืื ืืืชื ื ืืืื ืื ืืืืชื ืืืืื ืฆืจืจ ืืืืืืื,1"
|
40 |
+
"ืืืืืจ ืืืื ืืืื ืืืกืฃ ื ืชืื ืื ืืืขื ืืขืฉืืช ืื ืืืื ืืขืื ืื,1"
|
41 |
+
"ืืืงืจืื ืกืคืจื ืืืื ืืืืฉ ืืจืืฉืื ืืฉืืืฉื ืขืฉืจ ืืื ืื ืืืืชื ืืื ืืฉืจ ืฆืื ืืื ืื ืืืฉืืจืคื ื ืืืื ืืื ืืคืืืช ืืฉืจ ืขื ืืืื ื ืืืืื ื ืืื ืฉืจื ืขื ืืขื ืืืื ื ืืืืื ื ืืืชืื ืืขื ืืขื ืืืฉืื ื ืืฉื ืืืื ืืืฉืืจืฉ ื ืืชื ืื ืืชื ืืืืขืช ืืืื,1"
|
42 |
+
"ืื ืฉืืื ืกืคืจืื ืืื ืืจืฆืื ืื ืื ืืืื ืืช ืืืื ืืืฉืืื ืืืจื ืืืืื ืืช ืื ืืืืืืื ืื ืขืจ ืืขื ืืงื ืืฃ ืื ืฉืื ืืืื ืืื ืืฉืืืฉื ืขืฉืจ ืืืืฉ ืฉื ืื ืขืฉืจ ืืื ืืืฉ ืืืจ ืืฉืืื ืืืื,1"
|
43 |
+
"ืคืชืฉืื ืืืชื ืืื ืชื ืืช ืืื ืืืื ื ืืืืื ื ืืืื ืืื ืืขืืื ืืืืืช ืขืชืืื ืืืื ืืื,1"
|
44 |
+
"ืืจืฆืื ืืฆืื ืืืืคืื ืืืืจ ืืืื ืืืืช ื ืชื ื ืืฉืืฉื ืืืืจื ืืืืื ืืืื ืืฉืื ืืฉืชืืช ืืืขืืจ ืฉืืฉื ื ืืืื,1"
|
45 |
+
"ืืืจืืื ืืืข ืืช ืื ืืฉืจ ื ืขืฉื ืืืงืจืข ืืจืืื ืืช ืืืืื ืืืืืฉ ืฉืง ืืืคืจ ืืืฆื ืืชืื ืืขืืจ ืืืืขืง ืืขืงื ืืืื ืืืจื,1"
|
46 |
+
"ืืืืื ืขื ืืคื ื ืฉืขืจ ืืืื ืื ืืื ืืืื ืื ืฉืขืจ ืืืื ืืืืืฉ ืฉืง,1"
|
47 |
+
"ืืืื ืืืื ื ืืืืื ื ืืงืื ืืฉืจ ืืืจ ืืืื ืืืชื ืืืืข ืืื ืืืื ืืืืืืื ืืฆืื ืืืื ืืืกืคื ืฉืง ืืืคืจ ืืฆืข ืืจืืื,1"
|
48 |
+
"ืืชืืืืื ื [ืืชืืืื ื] ื ืขืจืืช ืืกืชืจ ืืกืจืืกืื ืืืืืื ืื ืืชืชืืืื ืืืืื ืืื ืืชืฉืื ืืืืื ืืืืืืฉ ืืช ืืจืืื ืืืืกืืจ ืฉืงื ืืขืืื ืืื ืงืื,1"
|
49 |
+
"ืืชืงืจื ืืกืชืจ ืืืชื ืืกืจืืกื ืืืื ืืฉืจ ืืขืืื ืืคื ืื ืืชืฆืืื ืขื ืืจืืื ืืืขืช ืื ืื ืืขื ืื ืื,1"
|
50 |
+
"ืืืฆื ืืชื ืื ืืจืืื ืื ืจืืื ืืขืืจ ืืฉืจ ืืคื ื ืฉืขืจ ืืืื,1"
|
51 |
+
"ืืืื ืื ืืจืืื ืืช ืื ืืฉืจ ืงืจืื ืืืช ืคืจืฉืช ืืืกืฃ ืืฉืจ ืืืจ ืืื ืืฉืงืื ืขื ืื ืื ืืืื ืืืืืืืื [ืืืืืืื] ืืืืื,1"
|
52 |
+
"ืืืช ืคืชืฉืื ืืชื ืืืช ืืฉืจ ื ืชื ื๏ฟฝ๏ฟฝืืฉื ืืืฉืืืื ื ืชื ืื ืืืจืืืช ืืช ืืกืชืจ ืืืืืื ืื ืืืฆืืืช ืขืืื ืืืื ืื ืืืื ืืืชืื ื ืื ืืืืงืฉ ืืืคื ืื ืขื ืขืื,1"
|
53 |
+
"ืืืืื ืืชื ืืืื ืืืกืชืจ ืืช ืืืจื ืืจืืื,1"
|
54 |
+
"ืืชืืืจ ืืกืชืจ ืืืชื ืืชืฆืืื ืื ืืจืืื,1"
|
55 |
+
"ืื ืขืืื ืืืื ืืขื ืืืื ืืช ืืืื ืืืืขืื ืืฉืจ ืื ืืืฉ ืืืฉื ืืฉืจ ืืืื ืื ืืืื ืื ืืืฆืจ ืืคื ืืืืช ืืฉืจ ืื ืืงืจื ืืืช ืืชื ืืืืืช ืืื ืืืฉืจ ืืืฉืื ืื ืืืื ืืช ืฉืจืืื ืืืื ืืืื ืืื ื ืื ื ืงืจืืชื ืืืื ืื ืืืื ืื ืฉืืืฉืื ืืื,1"
|
56 |
+
"ืืืืืื ืืืจืืื ืืช ืืืจื ืืกืชืจ,1"
|
57 |
+
"ืืืืืจ ืืจืืื ืืืฉืื ืื ืืกืชืจ ืื ืชืืื ืื ืคืฉื ืืืืื ืืืช ืืืื ืืื ืืืืืืื,1"
|
58 |
+
"ืื ืื ืืืจืฉ ืชืืจืืฉื ืืขืช ืืืืช ืจืื ืืืฆืื ืืขืืื ืืืืืืื ืืืงืื ืืืจ ืืืช ืืืืช ืืืื ืชืืืื ืืื ืืืืข ืื ืืขืช ืืืืช ืืืขืช ืืืืืืช,1"
|
59 |
+
"ืืชืืืจ ืืกืชืจ ืืืฉืื ืื ืืจืืื,1"
|
60 |
+
"ืืืืื ืืืื ืฆืืืจื ืื ืื ืืชืืคืืืช ืืืฃ ืืืื ืชืืื ืขืืื ืื ืฉืืื ืืืืืจืื,1"
|
61 |
+
"ืฉื ื ืฉืืื ืืฉื ื ืขืคืจืื ืชืืืื ืฆืืื ืืจืืขืื ืืฉืืฉื ืื,1"
|
62 |
+
"ืขื ืฉืืคืื ืืืื ืื ืกื ืืฆืืืื ืืื ืื ืื ืืจ ืืืืจ ืืื ืืืขืช ืืืืื ื,1"
|
63 |
+
"ืืื ืืคื ืจืขืืชื ืืืื ืืื ืื ืืชื ืืืื ืื ืืื ืืชื ืืืื ืื ืชืืืื ืชืฉืืจื ืืจืืฉ ืืื ื ืืจืืฉ ืฉื ืืจ ืืืจืืื ืืืขื ืืช ืืจืืืช ืืืจืจื ื ืืจืื,1"
|
64 |
+
"ืืืืชื ื ืืืชื ืืื ืืืืชืื ื ืืืื [ืืืืช] ืืขืื ืื ืืืื ืขื ืง ืืฆืืจื ืื,1"
|
65 |
+
"ืื ืืคื ืืืื ืืืชื ืืื ืื ืืื ืืืื ืืืื ืืจืื ืฉืื ืื ืืื ืืฉืืื,1"
|
66 |
+
"ื ืคืช ืชืืคื ื ืฉืคืชืืชืื ืืื ืืืฉ ืืืื ืชืืช ืืฉืื ื ืืจืื ืฉืืืชืื ืืจืื ืืื ืื ืื ื ืขืื ืืืชื ืืื ืื ื ืขืื ืืขืื ืืชืื,1"
|
67 |
+
"ืฉืืืื ืคืจืืก ืจืืื ืื ืขื ืคืจื ืืืืื ืืคืจืื ืขื ื ืจืืื,1"
|
68 |
+
"ื ืจื ืืืจืื ืงื ื ืืงื ืืื ืขื ืื ืขืฆื ืืืื ื ืืจ ืืืืืืช ืขื ืื ืจืืฉื ืืฉืืื,1"
|
69 |
+
"ืืขืื ืื ืื ืืืจ ืืื ืืืื ืื ืืืื ืื ืืื ืื,1"
|
70 |
+
"ืขืืจื ืฆืคืื ืืืืื ืชืืื ืืคืืื ืื ื ืืืื ืืฉืืื ืืื ืืืื ืืื ื ืืืืื ืคืจื ืืืืื,1"
|
71 |
+
"ืืืชื ืืื ื ืืืชื ืืื ืืจืืชื ืืืจื ืขื ืืฉืื ืืืืชื ืืขืจื ืขื ืืืฉื ืฉืชืืชื ืืื ื ืขื ืืืื ืืืื ืจืขืื ืฉืชื ืืฉืืจื ืืืืื,1"
|
data_creation/text_identification_model.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d34b21921901b9ca8fb1ed6ad2896e731a2bfb0d83a0202f874e2244bd0aa44c
|
3 |
+
size 180099
|
data_creation/text_identification_vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ef67b7027f2a506145ea801c6daeb177c535b97b2cbfefabafeb033e84487366
|
3 |
+
size 241793
|
data_creation/try_model.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sys import argv
|
2 |
+
import nltk
|
3 |
+
from nltk.tokenize import word_tokenize
|
4 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
+
import joblib
|
6 |
+
|
7 |
+
# Load the trained model from the file
|
8 |
+
loaded_classifier = joblib.load("text_identification_model.pkl")
|
9 |
+
|
10 |
+
# Load the TF-IDF vectorizer used for training
|
11 |
+
vectorizer = joblib.load("text_identification_vectorizer.pkl")
|
12 |
+
|
13 |
+
# Define labels for your categories
|
14 |
+
categories = {0: 'Other', 1: 'Bible', 2: 'Talmud'}
|
15 |
+
|
16 |
+
def parse_text(new_text):
    """Classify *new_text* and print its category and confidence score.

    The text is transformed with the module-level TF-IDF ``vectorizer`` and
    scored by ``loaded_classifier``; the predicted label is translated to a
    readable name through ``categories``.

    Args:
        new_text: The raw text to classify.
    """
    # Transform the new text using the TF-IDF vectorizer
    new_text_tfidf = vectorizer.transform([new_text])

    # Make predictions on the new text
    prediction = loaded_classifier.predict(new_text_tfidf)
    predicted_label = prediction[0]

    # Report the probability of the class that was actually predicted.
    # predict_proba columns are ordered like classifier.classes_, so look the
    # column up there instead of always reading column 1 ("Bible"), which
    # printed the wrong score for "Other" and "Talmud" predictions.
    probabilities = loaded_classifier.predict_proba(new_text_tfidf)
    class_index = list(loaded_classifier.classes_).index(predicted_label)
    confidence_score = probabilities[0, class_index]

    # Determine the predicted category label
    predicted_category = categories[predicted_label]

    # Print the prediction and the confidence score
    print(f"Text: {new_text} | Prediction: {predicted_category} | Confidence Score: {confidence_score:.4f}")
|
32 |
+
|
33 |
+
|
34 |
+
text_list = [
|
35 |
+
'ืืื ืืคื ืื ืื ืืฉืฉืืืขืื ืืฉืืจื ืฉืืื',
|
36 |
+
'ืืืฉืืช ืืขืจื: ืฉืืืฉื ืื ืฉืื ื ืฆืื ืืืืขืื ืืืื ืจืช',
|
37 |
+
'ืืืื ืืขืช ืืืื ืืืคืฉ ืืช ืืจืืฉืืื ืื ืจืืช ืืืืืขืชืื ืืช ืื ืชืืขืืืชืื',
|
38 |
+
'ืืืืืจ ืืฉื ืื ืื ื ืืฉืจืื',
|
39 |
+
'ืืืืจ ื ืฉืื ืืืื ืฉืขืืจ ืชื ืื ืืชื ื',
|
40 |
+
'ืืืจ ืืื ืืืื ืืจื ืืขืืจื',
|
41 |
+
'ืืืืื ืื ืงื ืืืื ืฉืขืืจื ืืืฉืื',]
|
42 |
+
|
43 |
+
|
44 |
+
# Classify the first command-line argument when one is given; otherwise run
# through every built-in sample in text_list.
for new_text in (argv[1:2] or text_list):
    parse_text(new_text)
|
is_this_bible_model.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fb51c50be730acff9cf5af92f2e322aee9572e2b4b9381c434559f2fa0a87da1
|
3 |
+
size 115035
|
is_this_bible_vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0d5a76c9d59793b194d9022b091115b85ecbef1f7feef33bd87b503aaabc93ea
|
3 |
+
size 181677
|
templates/index.html
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html dir="rtl" lang="he">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8">
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6 |
+
<title>ืืืืื ืคืกืืงื ืืชื "ื ืืืืฆืขืืช AI</title>
|
7 |
+
<style>
|
8 |
+
body {
|
9 |
+
font-family: 'Tahoma', sans-serif;
|
10 |
+
background-color: #f9f9f9;
|
11 |
+
color: #333;
|
12 |
+
margin: 0;
|
13 |
+
padding: 0;
|
14 |
+
display: flex;
|
15 |
+
align-items: center;
|
16 |
+
justify-content: center;
|
17 |
+
min-height: 100vh;
|
18 |
+
overflow: hidden;
|
19 |
+
}
|
20 |
+
.container {
|
21 |
+
width: 400px;
|
22 |
+
padding: 20px;
|
23 |
+
background-color: #fff;
|
24 |
+
border-radius: 15px;
|
25 |
+
text-align: center;
|
26 |
+
box-shadow: 0px 10px 25px rgba(0, 0, 0, 0.1);
|
27 |
+
}
|
28 |
+
.fixed-input {
|
29 |
+
width: 90%;
|
30 |
+
padding: 10px;
|
31 |
+
margin-bottom: 15px;
|
32 |
+
border: none;
|
33 |
+
border-radius: 8px;
|
34 |
+
background-color: #f2f2f2;
|
35 |
+
color: #333;
|
36 |
+
resize: none;
|
37 |
+
}
|
38 |
+
input[type="submit"] {
|
39 |
+
width: 90%;
|
40 |
+
padding: 10px;
|
41 |
+
border: none;
|
42 |
+
border-radius: 8px;
|
43 |
+
background-color: #5d8d77;
|
44 |
+
color: white;
|
45 |
+
cursor: pointer;
|
46 |
+
transition: background-color 0.3s ease-in-out;
|
47 |
+
}
|
48 |
+
input[type="submit"]:hover {
|
49 |
+
background-color: #507b66;
|
50 |
+
}
|
51 |
+
h1 {
|
52 |
+
color: #5d8d77;
|
53 |
+
margin-bottom: 5px;
|
54 |
+
}
|
55 |
+
h2 {
|
56 |
+
color: #777;
|
57 |
+
margin-top: 20px;
|
58 |
+
}
|
59 |
+
.result {
|
60 |
+
border-top: 1px solid #ddd;
|
61 |
+
padding-top: 20px;
|
62 |
+
margin-top: 20px;
|
63 |
+
transition: all 0.5s ease-in-out;
|
64 |
+
}
|
65 |
+
</style>
|
66 |
+
<script>
|
67 |
+
// Invoked from the form's onsubmit handler: sets the results panel to full
// opacity and gives it a 20px top margin.
function revealResults() {
    const panel = document.querySelector('.result');
    Object.assign(panel.style, { opacity: 1, marginTop: '20px' });
}
|
72 |
+
</script>
|
73 |
+
</head>
|
74 |
+
<body>
|
75 |
+
<div class="container">
|
76 |
+
<h1>ืืืืื ืคืกืืงื ืืชื "ื ืืืืฆืขืืช AI</h1>
|
77 |
+
<p>ืืงืืืื ืืช ืืืงืกื ืฉืชืจืฆื, ืืืื ืืื ืืื ืืื ืืืคืืข ืืชื "ื ืืืืฆืขืืช ืงืกื ืืืื ื ืืืืืืืชืืช</p>
|
78 |
+
<form method="POST" action="/" onsubmit="revealResults()">
|
79 |
+
<textarea class="fixed-input" name="new_text" rows="4" cols="50" placeholder="ืืงืืืื ืืช ืืืงืกื ืืื..."></textarea><br>
|
80 |
+
<input type="submit" value="ืืคืขืืื ืืช ืืงืกื">
|
81 |
+
</form>
|
82 |
+
|
83 |
+
<div class="result">
|
84 |
+
{% if prediction %}
|
85 |
+
<h2>ืืกืืืืช ืืชื "ืืืื ื ืืฉืคืื</h2>
|
86 |
+
<p>ืืืงืกื: {{ new_text }}</p>
|
87 |
+
<p>ืืืื ื...: {{ prediction }}</p>
|
88 |
+
<p>ืฆืืื ืืืืืช: {{ confidence_score }}</p>
|
89 |
+
{% endif %}
|
90 |
+
</div>
|
91 |
+
</div>
|
92 |
+
</body>
|
93 |
+
</html>
|
try_model.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sys import argv
|
2 |
+
#import re
|
3 |
+
import nltk
|
4 |
+
from nltk.corpus import stopwords
|
5 |
+
import joblib
|
6 |
+
|
7 |
+
|
8 |
+
"""
|
9 |
+
# Remove punctuation and special characters
|
10 |
+
def remove_punctuation(text):
|
11 |
+
return re.sub(r'[^\w\s]', '', text)
|
12 |
+
|
13 |
+
# Function to remove custom stop words from text
|
14 |
+
def remove_custom_stopwords(text):
|
15 |
+
hebrew_stopwords = set(stopwords.words('hebrew'))
|
16 |
+
additional_stopwords = {'ืื ื', 'ืืชื', 'ืืช', 'ืื ืื ื', 'ืืชื', 'ืืชื', 'ืื', 'ืื'}
|
17 |
+
hebrew_stopwords.update(additional_stopwords)
|
18 |
+
return ' '.join(word for word in text.split() if word not in hebrew_stopwords)
|
19 |
+
|
20 |
+
|
21 |
+
# Preprocess the new text (remove punctuation and custom stop words)
|
22 |
+
# ืื ืจืืฆืื ืืืืืืจ ืืช ืืคืื ืงืฆืืื ืืื ืคืขืืื ืืฉ ืืืขืืืจ ืืช ืืืฉืชื ื ืืืจื ืืืฉืชื ื new_text
|
23 |
+
new_text_cleaned = remove_custom_stopwords(remove_punctuation(new_text))
|
24 |
+
"""
|
25 |
+
|
26 |
+
|
27 |
+
# Load the trained model from the file
|
28 |
+
loaded_classifier = joblib.load("is_this_bible_model.pkl")
|
29 |
+
|
30 |
+
# Load the TF-IDF vectorizer used for training
|
31 |
+
vectorizer = joblib.load("is_this_bible_vectorizer.pkl")
|
32 |
+
|
33 |
+
def parse_text(new_text):
    """Print whether *new_text* is classified as 'Bible' or 'Other'.

    The text is transformed with the module-level ``vectorizer`` and scored by
    ``loaded_classifier``. The printed score is always column 1 of
    ``predict_proba`` (the "Bible" column, per the original comment),
    regardless of which label was predicted.

    Args:
        new_text: The raw text to classify.
    """
    features = vectorizer.transform([new_text])

    # A predicted label of 1 means "Bible"; anything else is reported as "Other".
    label = 'Bible' if loaded_classifier.predict(features)[0] == 1 else 'Other'

    # Probability assigned to column 1 for this single sample.
    confidence_score = loaded_classifier.predict_proba(features)[0, 1]

    print(f"Text: {new_text} | Prediction: {label} | Confidence Score: {confidence_score:.4f}")
|
46 |
+
|
47 |
+
|
48 |
+
text_list = [
|
49 |
+
'ืื ื ืืืฉื ืคื ืืฉืงื ืืืงืื ืืช ืืขืืืื ืฉืืืง ืืืชืืื ืืช ืฉืื ื ืืชืืืง ืงืฉืืจื ืืคืืืชืื 2.4, ืฉืืื ืื ืืช ืื',
|
50 |
+
'ืืื ืืคื ืื ืื ืืฉืฉืืืขืื ืืฉืืจื ืฉืืื',
|
51 |
+
'ืืืื ืืขืช ืืืื ืืืคืฉ ืืช ืืจืืฉืืื ืื ืจืืช ืืืืืขืชืื ืืช ืื ืชืืขืืืชืื',
|
52 |
+
'ืืืื ืฉืขืืื ืืืืืชืื ื ืืื ื ืฉืื ืืื ืืืื ืขืื ืขืืื ื ืืืืืชืื ื',
|
53 |
+
'ืื ื ืืกืชืืืชื ืืฉืืื ืืชื ืฆืืืช ืืืื',
|
54 |
+
'ืืฆื ืืื ืืขื ืืืื ืฉืื ืืื ืืืืืฉื',
|
55 |
+
'ืืืื ืื ืฉืืจ ืืฆืืื ืืื ืืชืจ ืืืจืืฉืืื ืงืืืฉ ืืืืจ ืื',
|
56 |
+
'ืฉืืจ ืืฉืืจืื ืืฉืจ ืืฉืืื',
|
57 |
+
'ืืฉืงื ื ืื ืฉืืงืืช ืคืืื ืื ืืืืื ืืืืื ืืืื',
|
58 |
+
'ืืืื ืจืง ืืื ืฉืืื ืืืืื ืชืืื ืืฉืืื ืืืืจ ืืื ืขื ืืื ืืกืชื ืืื ืื ืฉืืฉื ืงืฆืืืช',
|
59 |
+
'ืื ืืขืฉื ืฉืื ืืื ืืขืฉื ืฉืื ืืขืื ืื ืื ื ืืืืจ ืืืืจืื',
|
60 |
+
'ืืืื ืืจื ืืื ื ืืขืจืืืืช ืืืืฉื ืืจืขืืช ืืื ืื ืืืืงืื ืฉืืฉื ืื',
|
61 |
+
'ืืืืจื ืื ืืืช ืืฉืจืื ืืืืืจ ืืืงืืชื ืื ืืืื ืืืช ืืฉืคืื ืืืกื ืืฉืจ ืืขืฉื ืืชื ืืืื ืืื ืืื',
|
62 |
+
'ืื ืื ืืฉื ื ืืืคื ืืื ื ืขืืืื ืืขืืงืจ ืื ืืืืื',
|
63 |
+
'ืืืืจ ืืช ืืื ืืฉืืช ืืงืืฉื',
|
64 |
+
'ืืืฉืื ืืขืงื ืืืืืื ืืคื ืื ืื ืขืฉืื ืืืื',
|
65 |
+
'ืื ืื ืืืจืฆื ืืืืืืืชื ืืืืืช ืืืื',
|
66 |
+
'ืขืืืื :ืืืจ ืืืืจ ืชื "ื ,ืืืืจืขืืช ืืืื ืืชื "ื ืงืจืืื']
|
67 |
+
|
68 |
+
# Classify the first command-line argument when one is given; otherwise run
# through every built-in sample in text_list.
for new_text in (argv[1:2] or text_list):
    parse_text(new_text)
|
try_model_webui.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from flask import Flask, render_template, request
|
2 |
+
import webbrowser
|
3 |
+
import nltk
|
4 |
+
from nltk.corpus import stopwords
|
5 |
+
import joblib
|
6 |
+
|
7 |
+
app = Flask(__name__)
|
8 |
+
|
9 |
+
# Load the trained model and vectorizer outside the routes for better performance
|
10 |
+
loaded_classifier = joblib.load("is_this_bible_model.pkl")
|
11 |
+
vectorizer = joblib.load("is_this_bible_vectorizer.pkl")
|
12 |
+
|
13 |
+
def parse_text(new_text):
|
14 |
+
new_text_tfidf = vectorizer.transform([new_text])
|
15 |
+
prediction = loaded_classifier.predict(new_text_tfidf)
|
16 |
+
probabilities = loaded_classifier.predict_proba(new_text_tfidf)
|
17 |
+
confidence_score = probabilities[0, 1]
|
18 |
+
return 'ืชื "ื' if prediction[0] == 1 else 'ืืืจ', confidence_score
|
19 |
+
|
20 |
+
@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the classification form and, on POST, the result for the text.

    A GET request renders the empty form. A POST reads the ``new_text`` form
    field, classifies it with ``parse_text`` and renders the same template
    with the prediction and its confidence score filled in.

    Returns:
        The rendered ``index.html`` template.
    """
    prediction = None
    confidence_score = None
    new_text = None

    if request.method == 'POST':
        # .get() keeps a POST without the field from turning into an opaque
        # 400 Bad Request; the form is simply rendered again.
        new_text = request.form.get('new_text', '')
        # Do not classify empty or whitespace-only submissions.
        if new_text.strip():
            prediction, confidence_score = parse_text(new_text)
    return render_template('index.html', new_text=new_text, prediction=prediction, confidence_score=confidence_score)
|
31 |
+
|
32 |
+
|
33 |
+
if __name__ == '__main__':
    import os

    # With debug=True the reloader re-executes this script in a child process
    # (which has WERKZEUG_RUN_MAIN set to 'true'); open the browser only from
    # the initial process so a single tab is launched.
    if os.environ.get('WERKZEUG_RUN_MAIN') != 'true':
        webbrowser.open('http://127.0.0.1:5000/')

    # NOTE(review): debug=True enables the interactive debugger, which allows
    # arbitrary code execution; keep this for local use only and disable it
    # for any deployment reachable by others.
    app.run(debug=True)
|