droussis committed on
Commit
0161a68
1 Parent(s): a63fb32

Upload preprocess.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. preprocess.sh +58 -0
preprocess.sh ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #
3
+ # USAGE preprocess.sh langid spmodel < input > output
4
+ #
5
+ # replace SPMENCODE with your own setup!
6
+ #
7
+ # CHANGES
8
+ #
9
+ # * issue with perl code that removes control characters
10
+ # unicode property Other = \p{C}) seems to remove
11
+ # newline characters as well --> add negative lookahead
12
+ # to avoid removing newline characters!
13
+ #
14
+
15
+ SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`
16
+
17
+ ## simple pre-processing steps adapted from Moses tools
18
+
19
+ sed -e 's/,/,/g' \
20
+ -e 's/。 */. /g' \
21
+ -e 's/、/,/g' \
22
+ -e 's/”/"/g' \
23
+ -e 's/“/"/g' \
24
+ -e 's/∶/:/g' \
25
+ -e 's/:/:/g' \
26
+ -e 's/?/\?/g' \
27
+ -e 's/《/"/g' \
28
+ -e 's/》/"/g' \
29
+ -e 's/)/\)/g' \
30
+ -e 's/!/\!/g' \
31
+ -e 's/(/\(/g' \
32
+ -e 's/;/;/g' \
33
+ -e 's/1/"/g' \
34
+ -e 's/」/"/g' \
35
+ -e 's/「/"/g' \
36
+ -e 's/0/0/g' \
37
+ -e 's/3/3/g' \
38
+ -e 's/2/2/g' \
39
+ -e 's/5/5/g' \
40
+ -e 's/6/6/g' \
41
+ -e 's/9/9/g' \
42
+ -e 's/7/7/g' \
43
+ -e 's/8/8/g' \
44
+ -e 's/4/4/g' \
45
+ -e 's/. */. /g' \
46
+ -e 's/~/\~/g' \
47
+ -e "s/’/\'/g" \
48
+ -e 's/…/\.\.\./g' \
49
+ -e 's/━/\-/g' \
50
+ -e 's/〈/\</g' \
51
+ -e 's/〉/\>/g' \
52
+ -e 's/【/\[/g' \
53
+ -e 's/】/\]/g' \
54
+ -e 's/%/\%/g' |
55
+ perl -C -pe 's/(?!\n)\p{C}/ /g;' |
56
+ perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
57
+ sed 's/ */ /g;s/^ *//g;s/ *$//g'
58
+