File size: 2,380 Bytes
10b0761
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


CWD=`pwd`
INSTALL_PATH=$CWD/tokenizers/thirdparty

MOSES=$INSTALL_PATH/mosesdecoder
if [ ! -d $MOSES ]; then
    echo 'Cloning Moses github repository (for tokenization scripts)...'
    git clone https://github.com/moses-smt/mosesdecoder.git $MOSES
    cd $MOSES
    # To deal with differences in handling ' vs "
    git checkout 03578921cc1a03402
    cd -
fi

WMT16_SCRIPTS=$INSTALL_PATH/wmt16-scripts
if [ ! -d $WMT16_SCRIPTS ]; then
    echo 'Cloning Romanian tokenization scripts'
    git clone https://github.com/rsennrich/wmt16-scripts.git $WMT16_SCRIPTS
fi

KYTEA=$INSTALL_PATH/kytea
if [ ! -f $KYTEA/bin/kytea ]; then
    git clone https://github.com/neubig/kytea.git $KYTEA
    cd $KYTEA
    autoreconf -i
    ./configure --prefix=`pwd`
    make
    make install
    cd ..
fi

export MECAB=$INSTALL_PATH/mecab-0.996-ko-0.9.2
if [ ! -f $MECAB/bin/mecab ]; then
    cd $INSTALL_PATH
    curl -LO https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
    tar zxfv mecab-0.996-ko-0.9.2.tar.gz
    cd mecab-0.996-ko-0.9.2/
    ./configure --prefix=`pwd`
    make
    make install

    cd ..
    curl -LO https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz
    tar zxfv mecab-ko-dic-2.1.1-20180720.tar.gz
    cd mecab-ko-dic-2.1.1-20180720/
    ./autogen.sh
    ./configure --prefix=`pwd` --with-dicdir=$MECAB/lib/mecab/dic/mecab-ko-dic --with-mecab-config=$MECAB/bin/mecab-config
    make
    sh -c 'echo "dicdir=$MECAB/lib/mecab/dic/mecab-ko-dic" > $MECAB/etc/mecabrc'
    make install
    cd $CWD
fi

INDIC_RESOURCES_PATH=$INSTALL_PATH/indic_nlp_resources
if [ ! -d $INDIC_RESOURCES_PATH ]; then
    echo 'Cloning indic_nlp_resources'
    git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git $INDIC_RESOURCES_PATH
fi


if [ ! -f $INSTALL_PATH/seg_my.py ]; then
    cd $INSTALL_PATH
    wget http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/wat2020.my-en.zip
    unzip wat2020.my-en.zip
    # switch to python3
    cat wat2020.my-en/myseg.py  |sed 's/^sys.std/###sys.std/g' | sed 's/### sys/sys/g' | sed 's/unichr/chr/g' > seg_my.py
    cd $CWD
fi


pip install pythainlp sacrebleu indic-nlp-library