96abhishekarora committed on
Commit
51ea4cf
·
1 Parent(s): edcf572

Modified validation and training for linktransformer model

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false
7
+ }
LT_training_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_save_dir": "models",
3
+ "model_save_name": "linkage_un_data_multi_fine_coarse",
4
+ "opt_model_description": "This model was trained on a dataset prepared by linking product classifications from [UN stats](https://unstats.un.org/unsd/classifications/Econ). \n This model is designed to link different products to their industrial classification (ISIC) - trained on variation brought on by product level correspondence. It was trained for 100 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json \n ",
5
+ "opt_model_lang": [
6
+ "en",
7
+ "fr",
8
+ "es"
9
+ ],
10
+ "train_batch_size": 64,
11
+ "num_epochs": 100,
12
+ "warm_up_perc": 1,
13
+ "learning_rate": 2e-06,
14
+ "val_perc": 0.2,
15
+ "wandb_names": {
16
+ "project": "linkage",
17
+ "id": "econabhishek",
18
+ "run": "linkage_un_data_multi_fine_coarse",
19
+ "entity": "econabhishek"
20
+ },
21
+ "add_pooling_layer": false,
22
+ "large_val": true,
23
+ "eval_steps_perc": 0.1,
24
+ "test_at_end": true,
25
+ "save_val_test_pickles": true,
26
+ "val_query_prop": 0.5,
27
+ "eval_type": "retrieval",
28
+ "training_dataset": "dataframe",
29
+ "base_model_path": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
30
+ "best_model_path": "models/linkage_un_data_multi_fine_coarse"
31
+ }
README.md ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pipeline_tag: sentence-similarity
3
+ language:
4
+ - en
5
+ - fr
6
+ - es
7
+ tags:
8
+ - linktransformer
9
+ - sentence-transformers
10
+ - sentence-similarity
11
+ - tabular-classification
12
+
13
+ ---
14
+
15
+ # dell-research-harvard/lt-un-data-fine-coarse-multi
16
+
17
+ This is a [LinkTransformer](https://github.com/dell-research-harvard/linktransformer) model. At its core, this model is a [sentence-transformers](https://www.SBERT.net) model - LinkTransformer just wraps around that class.
18
+ It is designed for quick and easy record linkage (entity-matching) through the LinkTransformer package. The tasks include clustering, deduplication, linking, aggregation and more.
19
+ Notwithstanding that, it can be used for any sentence similarity task within the sentence-transformers framework as well.
20
+ It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.
21
+ Take a look at the documentation of [sentence-transformers](https://www.sbert.net/index.html) if you want to use this model for more than what we support in our applications.
22
+
23
+
24
+ This model has been fine-tuned from the base model sentence-transformers/paraphrase-multilingual-mpnet-base-v2. It is pretrained for the following languages: en, fr, and es.
27
+
28
+
29
+ This model was trained on a dataset prepared by linking product classifications from [UN stats](https://unstats.un.org/unsd/classifications/Econ).
30
+ This model is designed to link different products to their industrial classification (ISIC) - trained on variation brought on by product level correspondence. It was trained for 100 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json
31
+
32
+
33
+ ## Usage (LinkTransformer)
34
+
35
+ Using this model becomes easy when you have [LinkTransformer](https://github.com/dell-research-harvard/linktransformer) installed:
36
+
37
+ ```
38
+ pip install -U linktransformer
39
+ ```
40
+
41
+ Then you can use the model like this:
42
+
43
+ ```python
44
+ import linktransformer as lt
45
+ import pandas as pd
46
+
47
+ ##Load the two dataframes that you want to link. For example, 2 dataframes with company names that are written differently
48
+ df1=pd.read_csv("data/df1.csv") ###This is the left dataframe with key CompanyName for instance
49
+ df2=pd.read_csv("data/df2.csv") ###This is the right dataframe with key CompanyName for instance
50
+
51
+ ###Merge the two dataframes on the key column!
52
+ df_merged = lt.merge(df1, df2, on="CompanyName", how="inner")
53
+
54
+ ##Done! The merged dataframe has a column called "score" that contains the similarity score between the two company names
55
+
56
+ ```
57
+
58
+
59
+ ## Training your own LinkTransformer model
60
+ Any Sentence Transformers model can be used as a backbone by simply adding a pooling layer. Any other transformer on HuggingFace can also be used by specifying the option add_pooling_layer=True.
61
+ The model was trained using SupCon loss.
62
+ Usage can be found in the package docs.
63
+ The training config can be found in the repo with the name LT_training_config.json
64
+ To replicate the training, you can download the file and specify the path in the config_path argument of the training function. You can also override the config by specifying the training_args argument.
65
+ Here is an example.
66
+
67
+
68
+ ```python
69
+
70
+ ##Consider the example in the paper that has a dataset of Mexican products and their tariff codes from 1947 and 1948 and we want to train a model to link the two tariff codes.
71
+ saved_model_path = train_model(
72
+ model_path="hiiamsid/sentence_similarity_spanish_es",
73
+ dataset_path=dataset_path,
74
+ left_col_names=["description47"],
75
+ right_col_names=['description48'],
76
+ left_id_name=['tariffcode47'],
77
+ right_id_name=['tariffcode48'],
78
+ log_wandb=False,
79
+ config_path=LINKAGE_CONFIG_PATH,
80
+ training_args={"num_epochs": 1}
81
+ )
82
+
83
+ ```
84
+
85
+
86
+ You can also use this package for deduplication (clusters a df on the supplied key column). Merging a fine class (like product) to a coarse class (like HS code) is also possible.
87
+ Read our paper and the documentation for more!
88
+
89
+
90
+
91
+ ## Evaluation Results
92
+
93
+ <!--- Describe how your model was evaluated -->
94
+
95
+ You can evaluate the model using the [LinkTransformer](https://github.com/dell-research-harvard/linktransformer) package's inference functions.
96
+ We have provided a few datasets in the package for you to try out. We plan to host more datasets on Huggingface and our website (Coming soon) that you can take a look at.
97
+
98
+
99
+ ## Training
100
+ The model was trained with the parameters:
101
+
102
+ **DataLoader**:
103
+
104
+ `torch.utils.data.dataloader.DataLoader` of length 51 with parameters:
105
+ ```
106
+ {'batch_size': 64, 'sampler': 'torch.utils.data.dataloader._InfiniteConstantSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
107
+ ```
108
+
109
+ **Loss**:
110
+
111
+ `linktransformer.modified_sbert.losses.SupConLoss_wandb`
112
+
113
+ Parameters of the fit()-Method:
114
+ ```
115
+ {
116
+ "epochs": 100,
117
+ "evaluation_steps": 510,
118
+ "evaluator": "sentence_transformers.evaluation.SequentialEvaluator.SequentialEvaluator",
119
+ "max_grad_norm": 1,
120
+ "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
121
+ "optimizer_params": {
122
+ "lr": 2e-06
123
+ },
124
+ "scheduler": "WarmupLinear",
125
+ "steps_per_epoch": null,
126
+ "warmup_steps": 5100,
127
+ "weight_decay": 0.01
128
+ }
129
+ ```
130
+
131
+
132
+
133
+ ```
134
+ LinkTransformer(
135
+ (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: XLMRobertaModel
136
+ (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
137
+ )
138
+ ```
139
+
140
+ ## Citing & Authors
141
+
142
+ <!--- Describe where people can find more information -->
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "models/linkage_un_data_multi_fine_coarse/",
3
+ "architectures": [
4
+ "XLMRobertaModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 3072,
16
+ "layer_norm_eps": 1e-05,
17
+ "max_position_embeddings": 514,
18
+ "model_type": "xlm-roberta",
19
+ "num_attention_heads": 12,
20
+ "num_hidden_layers": 12,
21
+ "output_past": true,
22
+ "pad_token_id": 1,
23
+ "position_embedding_type": "absolute",
24
+ "torch_dtype": "float32",
25
+ "transformers_version": "4.31.0",
26
+ "type_vocab_size": 1,
27
+ "use_cache": true,
28
+ "vocab_size": 250002
29
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "2.0.0",
4
+ "transformers": "4.7.0",
5
+ "pytorch": "1.9.0+cu102"
6
+ }
7
+ }
modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73df030296c8931f6c0a3a0086456ccfbb52ff44a1f4cc63e1c37cd007ee8e1e
3
+ size 1112238569
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 128,
3
+ "do_lower_case": false
4
+ }
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b60b6b43406a48bf3638526314f3d232d97058bc93472ff2de930d43686fa441
3
+ size 17082913
tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "clean_up_tokenization_spaces": true,
4
+ "cls_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "mask_token": {
7
+ "__type": "AddedToken",
8
+ "content": "<mask>",
9
+ "lstrip": true,
10
+ "normalized": true,
11
+ "rstrip": false,
12
+ "single_word": false
13
+ },
14
+ "model_max_length": 512,
15
+ "pad_token": "<pad>",
16
+ "sep_token": "</s>",
17
+ "tokenizer_class": "XLMRobertaTokenizer",
18
+ "unk_token": "<unk>"
19
+ }