File size: 4,273 Bytes
685cc58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# taken and modified from https://github.com/harvardnlp/im2markup
# tokenize latex formulas
import sys
import os
import re
import argparse
import subprocess
import shutil
from threading import Timer
from datetime import datetime


def run_cmd(cmd, timeout_sec=30):
    proc = subprocess.Popen(cmd, shell=True)
    kill_proc = lambda p: p.kill()
    timer = Timer(timeout_sec, kill_proc, [proc])
    try:
        timer.start()
        stdout,stderr = proc.communicate()
    finally:
        timer.cancel()
        
def tokenize_latex(latex_code, latex_type="", middle_file=""):
    if not latex_code:
        return False, latex_code
    if not latex_type:
        latex_type = "tabular" if "tabular" in latex_code else "formula"
    if not middle_file:
        middle_file = "out-" + datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + ".txt"
    temp_file = middle_file + '.tmp'
    
    if latex_type == "formula":
        with open(temp_file, 'w') as f:
            prepre = latex_code
            # replace split, align with aligned
            prepre = re.sub(r'\\begin{(split|align|alignedat|alignat|eqnarray)\*?}(.+?)\\end{\1\*?}', r'\\begin{aligned}\2\\end{aligned}', prepre, flags=re.S)
            prepre = re.sub(r'\\begin{(smallmatrix)\*?}(.+?)\\end{\1\*?}', r'\\begin{matrix}\2\\end{matrix}', prepre, flags=re.S)
            f.write(prepre)
    
        cmd = r"cat %s | node %s %s > %s " % (temp_file, os.path.join(os.path.dirname(__file__), 'preprocess_formula.js'), 'normalize', middle_file)
        ret = subprocess.call(cmd, shell=True)
        os.remove(temp_file)
        if ret != 0:
            return False, latex_code
        
        operators = '\s?'.join('|'.join(['arccos', 'arcsin', 'arctan', 'arg', 'cos', 'cosh', 'cot', 'coth', 'csc', 'deg', 'det', 'dim', 'exp', 'gcd', 'hom', 'inf',
                                        'injlim', 'ker', 'lg', 'lim', 'liminf', 'limsup', 'ln', 'log', 'max', 'min', 'Pr', 'projlim', 'sec', 'sin', 'sinh', 'sup', 'tan', 'tanh']))
        ops = re.compile(r'\\operatorname {(%s)}' % operators)
        with open(middle_file, 'r') as fin:
            for line in fin:
                tokens = line.strip().split()
                tokens_out = []
                for token in tokens:
                    tokens_out.append(token)
                post = ' '.join(tokens_out)
                # use \sin instead of \operatorname{sin}
                names = ['\\'+x.replace(' ', '') for x in re.findall(ops, post)]
                post = re.sub(ops, lambda match: str(names.pop(0)), post).replace(r'\\ \end{array}', r'\end{array}')
        os.remove(middle_file)
        return True, post
    
    elif latex_type == "tabular":
        latex_code = latex_code.replace("\\\\%", "\\\\ %")
        latex_code = latex_code.replace("\%", "<PERCENTAGE_TOKEN>")
        latex_code = latex_code.split('%')[0]
        latex_code = latex_code.replace("<PERCENTAGE_TOKEN>", "\%")
        if not "\\end{tabular}" in latex_code:
            latex_code += "\\end{tabular}"
        with open(middle_file, 'w') as f:
            f.write(latex_code.replace('\r', ' ').replace('\n', ' '))
        cmd = "perl -pe 's|hskip(.*?)(cm\\|in\\|pt\\|mm\\|em)|hspace{\\1\\2}|g' %s > %s"%(middle_file, temp_file)
        ret = subprocess.call(cmd, shell=True)
        if ret != 0:
            return False, latex_code
        os.remove(middle_file)
        cmd = r"cat %s | node %s %s > %s " % (temp_file, os.path.join(os.path.dirname(__file__), 'preprocess_tabular.js'), 'tokenize', middle_file)
        ret = subprocess.call(cmd, shell=True)
        os.remove(temp_file)
        if ret != 0:
            return False, latex_code
        with open(middle_file, 'r') as fin:
            for line in fin:
                tokens = line.strip().split()
                tokens_out = []
                for token in tokens:
                    tokens_out.append(token)
                post = ' '.join(tokens_out)
        os.remove(middle_file)
        return True, post
    else:
        print(f"latex type{latex_type} unrecognized.")
        return False, latex_code

if __name__ == '__main__':
    latex_code = open("2.txt", 'r').read().replace('\r', ' ')
    print("=>", latex_code)
    new_code = tokenize_latex(latex_code)
    print("=>", new_code)