--- license: cc-by-4.0 language: - he inference: false --- # DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew State-of-the-art language model for Hebrew, released [here](https://arxiv.org/abs/2308.16687). This is the fine-tuned BERT-tiny model for the joint parsing of the following tasks: - Prefix Segmentation - Morphological Disambiguation - Lexicographical Analysis (Lemmatization) - Syntactical Parsing (Dependency-Tree) - Named-Entity Recognition For the bert-base models for other tasks, see [here](https://huggingface.co/collections/dicta-il/dictabert-6588e7cc08f83845fc42a18b). Sample usage: ```python from transformers import AutoModel, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained('dicta-il/dictabert-tiny-joint') model = AutoModel.from_pretrained('dicta-il/dictabert-tiny-joint', trust_remote_code=True) model.eval() sentence = 'בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים' print(model.predict([sentence], tokenizer)) ``` Output: ```json [ { "text": "בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים", "tokens": [ { "token": "בשנת", "syntax": { "word": "בשנת", "dep_head_idx": 2, "dep_func": "obl", "dep_head": "השלים" }, "seg": [ "ב", "שנת" ], "lex": "שנה", "morph": { "token": "בשנת", "pos": "NOUN", "feats": { "Gender": "Fem", "Number": "Sing" }, "prefixes": [ "ADP" ], "suffix": false } }, { "token": "1948", "syntax": { "word": "1948", "dep_head_idx": 0, "dep_func": "compound", "dep_head": "בשנת" }, "seg": [ "1948" ], "lex": "1948", "morph": { "token": "1948", "pos": "NUM", "feats": {}, "prefixes": [], "suffix": false } }, { "token": "השלים", "syntax": { "word": "השלים", "dep_head_idx": -1, "dep_func": "root", "dep_head": "הומוריסטיים" }, "seg": [ "השלים" ], "lex": "השלים", "morph": { "token": "השלים", "pos": "VERB", "feats": { "Gender": "Masc", "Number": "Sing", "Person": "3", "Tense": "Past" }, "prefixes": [], "suffix": false } }, { "token": 
"אפרים", "syntax": { "word": "אפרים", "dep_head_idx": 2, "dep_func": "nsubj", "dep_head": "השלים" }, "seg": [ "אפרים" ], "lex": "אפרים", "morph": { "token": "אפרים", "pos": "PROPN", "feats": {}, "prefixes": [], "suffix": false } }, { "token": "קישון", "syntax": { "word": "קישון", "dep_head_idx": 3, "dep_func": "flat", "dep_head": "אפרים" }, "seg": [ "קישון" ], "lex": "קישון", "morph": { "token": "קישון", "pos": "PROPN", "feats": {}, "prefixes": [], "suffix": false } }, { "token": "את", "syntax": { "word": "את", "dep_head_idx": 6, "dep_func": "case", "dep_head": "לימודיו" }, "seg": [ "את" ], "lex": "את", "morph": { "token": "את", "pos": "ADP", "feats": {}, "prefixes": [], "suffix": false } }, { "token": "לימודיו", "syntax": { "word": "לימודיו", "dep_head_idx": 2, "dep_func": "obj", "dep_head": "השלים" }, "seg": [ "לימודיו" ], "lex": "לימוד", "morph": { "token": "לימודיו", "pos": "NOUN", "feats": { "Gender": "Masc", "Number": "Plur" }, "prefixes": [], "suffix": "PRON", "suffix_feats": { "Gender": "Masc", "Number": "Sing", "Person": "3" } } }, { "token": "בפיסול", "syntax": { "word": "בפיסול", "dep_head_idx": 6, "dep_func": "nmod", "dep_head": "לימודיו" }, "seg": [ "ב", "פיסול" ], "lex": "פיסול", "morph": { "token": "בפיסול", "pos": "NOUN", "feats": { "Gender": "Masc", "Number": "Sing" }, "prefixes": [ "ADP" ], "suffix": false } }, { "token": "מתכת", "syntax": { "word": "מתכת", "dep_head_idx": 7, "dep_func": "compound", "dep_head": "בפיסול" }, "seg": [ "מתכת" ], "lex": "מתכת", "morph": { "token": "מתכת", "pos": "NOUN", "feats": { "Gender": "Fem", "Number": "Sing" }, "prefixes": [], "suffix": false } }, { "token": "ובתולדות", "syntax": { "word": "ובתולדות", "dep_head_idx": 7, "dep_func": "conj", "dep_head": "בפיסול" }, "seg": [ "וב", "תולדות" ], "lex": "תולדה", "morph": { "token": "ובתולדות", "pos": "NOUN", "feats": { "Gender": "Fem", "Number": "Plur" }, "prefixes": [ "CCONJ", "ADP" ], "suffix": false } }, { "token": "האמנות", "syntax": { "word": "האמנות", 
"dep_head_idx": 9, "dep_func": "compound", "dep_head": "ובתולדות" }, "seg": [ "ה", "אמנות" ], "lex": "אומנות", "morph": { "token": "האמנות", "pos": "NOUN", "feats": { "Gender": "Fem", "Number": "Sing" }, "prefixes": [ "DET" ], "suffix": false } }, { "token": "והחל", "syntax": { "word": "והחל", "dep_head_idx": 2, "dep_func": "conj", "dep_head": "השלים" }, "seg": [ "ו", "החל" ], "lex": "החל", "morph": { "token": "והחל", "pos": "VERB", "feats": { "Gender": "Masc", "Number": "Sing", "Person": "3", "Tense": "Past" }, "prefixes": [ "CCONJ" ], "suffix": false } }, { "token": "לפרסם", "syntax": { "word": "לפרסם", "dep_head_idx": 11, "dep_func": "xcomp", "dep_head": "והחל" }, "seg": [ "לפרסם" ], "lex": "פרסם", "morph": { "token": "לפרסם", "pos": "VERB", "feats": {}, "prefixes": [], "suffix": false } }, { "token": "מאמרים", "syntax": { "word": "מאמרים", "dep_head_idx": 12, "dep_func": "obj", "dep_head": "לפרסם" }, "seg": [ "מאמרים" ], "lex": "מאמר", "morph": { "token": "מאמרים", "pos": "NOUN", "feats": { "Gender": "Masc", "Number": "Plur" }, "prefixes": [], "suffix": false } }, { "token": "הומוריסטיים", "syntax": { "word": "הומוריסטיים", "dep_head_idx": 13, "dep_func": "amod", "dep_head": "מאמרים" }, "seg": [ "הומוריסטיים" ], "lex": "הומוריסטי", "morph": { "token": "הומוריסטיים", "pos": "ADJ", "feats": { "Gender": "Masc", "Number": "Plur" }, "prefixes": [], "suffix": false } } ], "root_idx": 2, "ner_entities": [ { "phrase": "1948", "label": "TIMEX" }, { "phrase": "אפרים קישון", "label": "PER" } ] } ] ``` ## Citation If you use DictaBERT in your research, please cite ```DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew``` **BibTeX:** ```bibtex @misc{shmidman2023dictabert, title={DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew}, author={Shaltiel Shmidman and Avi Shmidman and Moshe Koppel}, year={2023}, eprint={2308.16687}, archivePrefix={arXiv}, primaryClass={cs.CL} } ``` ## License Shield: [![CC BY 4.0][cc-by-shield]][cc-by] This work is licensed under a 
[Creative Commons Attribution 4.0 International License][cc-by]. [![CC BY 4.0][cc-by-image]][cc-by] [cc-by]: http://creativecommons.org/licenses/by/4.0/ [cc-by-image]: https://i.creativecommons.org/l/by/4.0/88x31.png [cc-by-shield]: https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg