Yurii Paniv committed on
Commit
0659669
·
1 Parent(s): aa0bba0

Replace apostrophe

Browse files
scripts/extract_text_corpus.py CHANGED
@@ -11,7 +11,7 @@ text_file = open(OUT_FILE, mode="a")
11
  tokenizer = nltk.SpaceTokenizer()
12
  paranthesis_regex = re.compile(r'\(.*\)')
13
  allowed_chars = ["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й", "к", "л",
14
- "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", "'"]
15
 
16
  for subdir, dirs, files in os.walk(FOLDER):
17
  for file in files:
@@ -25,6 +25,7 @@ for subdir, dirs, files in os.walk(FOLDER):
25
  input_file = open(file_path, encoding="cp1251")
26
  cleaned_text = input_file.read()
27
  cleaned_text = cleaned_text.lower()
 
28
  cleaned_text = paranthesis_regex.sub('', cleaned_text)
29
  cleaned_text = cleaned_text.strip()
30
  cleaned_text = cleaned_text.split(".")
 
11
  tokenizer = nltk.SpaceTokenizer()
12
  paranthesis_regex = re.compile(r'\(.*\)')
13
  allowed_chars = ["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й", "к", "л",
14
+ "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", ""]
15
 
16
  for subdir, dirs, files in os.walk(FOLDER):
17
  for file in files:
 
25
  input_file = open(file_path, encoding="cp1251")
26
  cleaned_text = input_file.read()
27
  cleaned_text = cleaned_text.lower()
28
+ cleaned_text = cleaned_text.replace("'", "’")
29
  cleaned_text = paranthesis_regex.sub('', cleaned_text)
30
  cleaned_text = cleaned_text.strip()
31
  cleaned_text = cleaned_text.split(".")
scripts/wiki_import.py CHANGED
@@ -15,7 +15,7 @@ text_file = open(OUT_PATH, mode="a")
15
  tokenizer = nltk.SpaceTokenizer()
16
  paranthesis_regex = re.compile(r'\(.*\)')
17
  allowed_chars = ["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й", "к", "л",
18
- "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", "'"]
19
 
20
  cleaner = Cleaner()
21
  # iter = 0
@@ -27,6 +27,7 @@ for title, text in iterate('../data/ukwiki-20210320-pages-articles-multistream.x
27
  cleaned_text = cleaned_text.replace("н. е.", "нашої ери")
28
  cleaned_text = cleaned_text.replace("ім.", "імені")
29
  cleaned_text = cleaned_text.replace("див.", "дивись")
 
30
  cleaned_text = paranthesis_regex.sub('', cleaned_text)
31
  cleaned_text = cleaned_text.strip()
32
  cleaned_text = cleaned_text.split(".")
 
15
  tokenizer = nltk.SpaceTokenizer()
16
  paranthesis_regex = re.compile(r'\(.*\)')
17
  allowed_chars = ["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й", "к", "л",
18
+ "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", ""]
19
 
20
  cleaner = Cleaner()
21
  # iter = 0
 
27
  cleaned_text = cleaned_text.replace("н. е.", "нашої ери")
28
  cleaned_text = cleaned_text.replace("ім.", "імені")
29
  cleaned_text = cleaned_text.replace("див.", "дивись")
30
+ cleaned_text = cleaned_text.replace("'", "’")
31
  cleaned_text = paranthesis_regex.sub('', cleaned_text)
32
  cleaned_text = cleaned_text.strip()
33
  cleaned_text = cleaned_text.split(".")