File size: 1,003 Bytes
77841b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import spacy, re
from spacy.tokenizer import Tokenizer

"""
python -m spacy package ref_he packages -c /Users/nss/sefaria/project/sefaria/spacy_function_registry.py -b wheel,sdist -n ref_ner -v 1.0.0
python -m spacy huggingface-hub push packages/he_ref_ner-1.0.0/dist/he_ref_ner-1.0.0-py3-none-any.whl -o Sefaria
"""

@spacy.registry.tokenizers("inner_punct_tokenizer")
def inner_punct_tokenizer_factory():
    """Factory registered with spaCy under the name ``inner_punct_tokenizer``.

    Returns:
        A callable taking the ``nlp`` object and returning a ``Tokenizer``
        built on ``nlp.vocab``. Prefix and suffix patterns are compiled from
        ``nlp.Defaults``; the infix pattern is a fixed punctuation character
        class defined here. No special-case rules are passed and
        ``token_match`` is explicitly ``None``.
    """
    def inner_punct_tokenizer(nlp):
        # Prefix/suffix handling comes from the language defaults.
        find_prefix = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes).search
        find_suffix = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes).search

        # The language-default infixes (nlp.Defaults.infixes) are not used;
        # this explicit set of punctuation characters is matched as infixes
        # instead.
        punct_infix_pattern = re.compile(r'''[.,?!:;…‘’`“”"'~–\-/()<>]''')

        tokenizer = Tokenizer(
            nlp.vocab,
            prefix_search=find_prefix,
            suffix_search=find_suffix,
            infix_finditer=punct_infix_pattern.finditer,
            token_match=None,
        )
        return tokenizer

    return inner_punct_tokenizer