ljw20180420 committed on
Commit
f6fa04c
·
verified ·
1 Parent(s): 98e362a

Upload folder using huggingface_hub

Browse files
FOREcasT_model/README.md ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - generated_from_trainer
5
+ datasets:
6
+ - crispr_data
7
+ model-index:
8
+ - name: SX_spcas9_FOREcasT
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ # SX_spcas9_FOREcasT
16
+
17
+ This model was trained on the crispr_data dataset.
18
+ It achieves the following results on the evaluation set:
19
+ - Loss: 90.0301
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 0.001
39
+ - train_batch_size: 100
40
+ - eval_batch_size: 100
41
+ - seed: 63036
42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
43
+ - lr_scheduler_type: linear
44
+ - lr_scheduler_warmup_ratio: 0.05
45
+ - num_epochs: 30.0
46
+
47
+ ### Training results
48
+
49
+ | Training Loss | Epoch | Step | Validation Loss |
50
+ |:-------------:|:-----:|:----:|:---------------:|
51
+ | 5299.9422 | 1.0 | 322 | 4486.6816 |
52
+ | 3409.13 | 2.0 | 644 | 2367.0308 |
53
+ | 1740.73 | 3.0 | 966 | 1208.4238 |
54
+ | 886.9884 | 4.0 | 1288 | 619.2037 |
55
+ | 463.7353 | 5.0 | 1610 | 335.7322 |
56
+ | 264.2752 | 6.0 | 1932 | 205.3073 |
57
+ | 173.0323 | 7.0 | 2254 | 145.6962 |
58
+ | 131.4609 | 8.0 | 2576 | 117.9419 |
59
+ | 111.7286 | 9.0 | 2898 | 104.6322 |
60
+ | 102.0464 | 10.0 | 3220 | 97.9003 |
61
+ | 97.07 | 11.0 | 3542 | 94.3013 |
62
+ | 94.5021 | 12.0 | 3864 | 92.5289 |
63
+ | 93.0652 | 13.0 | 4186 | 91.5912 |
64
+ | 92.3189 | 14.0 | 4508 | 91.0249 |
65
+ | 91.8618 | 15.0 | 4830 | 90.6213 |
66
+ | 91.6092 | 16.0 | 5152 | 90.4249 |
67
+ | 91.4372 | 17.0 | 5474 | 90.2542 |
68
+ | 91.3401 | 18.0 | 5796 | 90.2745 |
69
+ | 91.2793 | 19.0 | 6118 | 90.1836 |
70
+ | 91.2196 | 20.0 | 6440 | 90.1465 |
71
+ | 91.1831 | 21.0 | 6762 | 90.0652 |
72
+ | 91.1484 | 22.0 | 7084 | 90.1792 |
73
+ | 91.1333 | 23.0 | 7406 | 90.0813 |
74
+ | 91.1064 | 24.0 | 7728 | 90.1987 |
75
+ | 91.09 | 25.0 | 8050 | 90.0518 |
76
+ | 91.0658 | 26.0 | 8372 | 90.0653 |
77
+ | 91.0503 | 27.0 | 8694 | 90.0538 |
78
+ | 91.0277 | 28.0 | 9016 | 90.0120 |
79
+ | 91.013 | 29.0 | 9338 | 90.0356 |
80
+ | 90.9967 | 30.0 | 9660 | 90.0301 |
81
+
82
+
83
+ ### Framework versions
84
+
85
+ - Transformers 4.44.2
86
+ - Pytorch 2.4.0+cu124
87
+ - Datasets 2.21.0
88
+ - Tokenizers 0.19.1
FOREcasT_model/config.json CHANGED
@@ -1,5 +1,4 @@
1
  {
2
- "_name_or_path": "/home/ljw/sdc1/CRISPR_results/FOREcasT/SX_spcas9_FOREcasT",
3
  "architectures": [
4
  "FOREcasTModel"
5
  ],
 
1
  {
 
2
  "architectures": [
3
  "FOREcasTModel"
4
  ],
FOREcasT_model/model.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig, PreTrainedModel
2
+ import torch.nn as nn
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
class FOREcasTConfig(PretrainedConfig):
    """Configuration for the FOREcasT linear model.

    Attributes:
        reg_const: L2 regularization coefficient that FOREcasTModel applies
            to every feature whose label does NOT contain the letter 'I'
            (the deletion-related features).
        i1_reg_const: L2 regularization coefficient that FOREcasTModel
            applies to every feature whose label contains the letter 'I'
            (the insertion-related features).
        seed: Seed of the random generator used to initialize the weights.
    """

    model_type = "FOREcasT"
    # Name of the forward() argument that carries the training target.
    label_names = ["count"]

    def __init__(
        self,
        reg_const=0.01,  # regularization coefficient for deletion features
        i1_reg_const=0.01,  # regularization coefficient for insertion features
        seed=63036,  # random seed for weight initialization
        **kwargs,
    ):
        # Attributes are set before delegating to the base class, which
        # consumes the remaining generic keyword arguments.
        self.reg_const = reg_const
        self.i1_reg_const = i1_reg_const
        self.seed = seed
        super().__init__(**kwargs)
21
+
22
class FOREcasTModel(PreTrainedModel):
    """Single linear layer scoring editing outcomes from FOREcasT features.

    Each feature has one weight (no bias).  The training loss is the summed
    KL divergence between the softmax of the scores and the normalized
    observed counts, plus a per-feature weighted L2 penalty on the weights.
    """

    config_class = FOREcasTConfig

    @staticmethod
    def get_feature_label():
        """Return the ordered list of feature labels.

        The length of the returned list defines the input width of the
        linear layer, and whether a label contains the letter 'I' decides
        which regularization coefficient the corresponding weight receives
        (see ``__init__``).  The order of the labels is therefore significant.
        """
        nucleotides = ["A", "G", "C", "T"]

        def features_pairwise_label(features1_label, features2_label):
            # Cartesian product, first list varying slowest.
            return [
                f'PW_{label1}_vs_{label2}'
                for label1 in features1_label
                for label2 in features2_label
            ]

        feature_DelSize_label = ["Any Deletion", "D1", "D2-3", "D4-7", "D8-12", "D>12"]
        feature_InsSize_label = ["Any Insertion", "I1", "I2"]
        feature_DelLoc_label = [
            'DL-1--1', 'DL-2--2', 'DL-3--3', 'DL-4--6', 'DL-7--10',
            'DL-11--15', 'DL-16--30', 'DL<-30', 'DL>=0',
            'DR0-0', 'DR1-1', 'DR2-2', 'DR3-5', 'DR6-9',
            'DR10-14', 'DR15-29', 'DR<0', 'DR>=30',
        ]
        feature_InsSeq_label = [
            "I1_A", "I1_C", "I1_G", "I1_T",
            "I2_AA", "I2_AC", "I2_AG", "I2_AT",
            "I2_CA", "I2_CC", "I2_CG", "I2_CT",
            "I2_GA", "I2_GC", "I2_GG", "I2_GT",
            "I2_TA", "I2_TC", "I2_TG", "I2_TT",
        ]
        feature_InsLoc_label = ["IL-1--1", "IL-2--2", "IL-3--3", "IL<-3", "IL>=0"]
        feature_LocalCutSiteSequence_label = [
            f"CS{offset}_NT={nt}"
            for offset in range(-5, 4)
            for nt in nucleotides
        ]
        feature_LocalCutSiteSeqMatches_label = [
            f"M_CS{offset1}_{offset2}_NT={nt}"
            for offset1 in range(-3, 2)
            for offset2 in range(-3, offset1)
            for nt in nucleotides
        ]
        # All 'L' labels come first, followed by all 'R' labels.
        feature_LocalRelativeSequence_label = [
            f'{side}{offset}_NT={nt}'
            for side in ("L", "R")
            for offset in range(-3, 3)
            for nt in nucleotides
        ]
        # For every (left, right) offset pair: the 'X' label, then the 'M' label.
        feature_SeqMatches_label = [
            f'{kind}_L{loffset}_R{roffset}'
            for loffset in range(-3, 3)
            for roffset in range(-3, 3)
            for kind in ("X", "M")
        ]
        feature_I1or2Rpt_label = ['I1Rpt', 'I1NonRpt', 'I2Rpt', 'I2NonRpt']
        feature_microhomology_label = [
            'L_MH1-1', 'R_MH1-1', 'L_MH2-2', 'R_MH2-2', 'L_MH3-3', 'R_MH3-3',
            'L_MM1_MH3-3', 'R_MM1_MH3-3', 'L_MH4-6', 'R_MH4-6',
            'L_MM1_MH4-6', 'R_MM1_MH4-6', 'L_MH7-10', 'R_MH7-10',
            'L_MM1_MH7-10', 'R_MM1_MH7-10', 'L_MH11-15', 'R_MH11-15',
            'L_MM1_MH11-15', 'R_MM1_MH11-15', 'No MH',
        ]
        return (
            features_pairwise_label(feature_DelSize_label, feature_DelLoc_label) +
            feature_InsSize_label +
            feature_DelSize_label +
            feature_DelLoc_label +
            feature_InsLoc_label +
            feature_InsSeq_label +
            features_pairwise_label(feature_LocalCutSiteSequence_label, feature_InsSize_label + feature_DelSize_label) +
            features_pairwise_label(feature_microhomology_label + feature_LocalRelativeSequence_label, feature_DelSize_label + feature_DelLoc_label) +
            features_pairwise_label(feature_LocalCutSiteSeqMatches_label + feature_SeqMatches_label, feature_DelSize_label) +
            features_pairwise_label(feature_InsSeq_label + feature_LocalCutSiteSequence_label + feature_LocalCutSiteSeqMatches_label, feature_I1or2Rpt_label) +
            feature_I1or2Rpt_label +
            feature_LocalCutSiteSequence_label +
            feature_LocalCutSiteSeqMatches_label +
            feature_LocalRelativeSequence_label +
            feature_SeqMatches_label +
            feature_microhomology_label
        )

    def __init__(self, config) -> None:
        """Build the linear layer and the per-feature regularization buffer.

        Args:
            config: A FOREcasTConfig providing ``seed``, ``reg_const`` and
                ``i1_reg_const``.
        """
        super().__init__(config)
        # In more recent versions of PyTorch, you no longer need to explicitly
        # register_parameter; assigning an nn.Parameter (or a module holding
        # one) as a member is enough for it to be treated as trainable
        # (https://stackoverflow.com/questions/59234238/how-to-add-parameters-in-module-class-in-pytorch-custom-model).
        self.generator = torch.Generator().manual_seed(config.seed)
        # A feature counts as a "deletion" feature when its label has no 'I'.
        is_delete = torch.tensor(
            ['I' not in label for label in FOREcasTModel.get_feature_label()]
        )
        # Per-feature L2 coefficient: reg_const for deletion features,
        # i1_reg_const for all the others.  Registered as a buffer so that it
        # follows the module across devices without being trained.
        self.register_buffer(
            'reg_coff',
            is_delete * config.reg_const + (~is_delete) * config.i1_reg_const,
        )
        self.linear = nn.Linear(in_features=len(self.reg_coff), out_features=1, bias=False)
        self.initialize_weights()

    def initialize_weights(self):
        """Draw linear weights from N(0, 1) with the seeded generator; zero any bias."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, mean=0, std=1, generator=self.generator)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, feature, count=None) -> dict:
        """Score every outcome and, when counts are given, compute the loss.

        Args:
            feature: Tensor whose last dimension holds the feature vector
                (presumably shaped (batch, outcomes, features) -- TODO confirm
                against the data collator).
            count: Optional observed counts matching the shape of the
                returned logits.

        Returns:
            A dict with key ``"logit"`` and, if ``count`` is not None, also
            key ``"loss"``.
        """
        # Only drop the trailing size-1 output dimension of the linear layer.
        # A bare squeeze() would also drop a size-1 batch (or outcome)
        # dimension, which breaks the softmax axis and the logit.shape[0]
        # scaling used in kl_divergence.
        logit = self.linear(feature).squeeze(-1)
        if count is None:
            return {"logit": logit}
        return {
            "logit": logit,
            "loss": self.kl_divergence(logit, count)
        }

    def kl_divergence(self, logit, count):
        """Return the summed KL divergence plus the weighted L2 penalty.

        The target distribution is ``count + 0.5`` L1-normalized over the last
        dimension; the penalty is ``sum(reg_coff * weight ** 2)`` scaled by
        ``logit.shape[0]``.
        """
        observed = F.normalize(count + 0.5, p=1.0, dim=-1)  # add 0.5 to prevent log(0), see loadOligoFeaturesAndReadCounts
        divergence = F.kl_div(
            F.log_softmax(logit, dim=-1),
            observed,
            reduction='sum'
        )
        penalty = (self.reg_coff * (self.linear.weight ** 2)).sum()
        return divergence + logit.shape[0] * penalty
FOREcasT_model/runs/Oct28_17-30-29_ljw-System-Product-Name/events.out.tfevents.1730107830.ljw-System-Product-Name.282387.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec014376f4c178912a6d64e3fe4e6df67254c3ac67fc6483f19674b0ebcef1f3
3
+ size 19358
FOREcasT_model/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6534c7b2d9df1faa57fd5848215821482443cd8494c6f6c1d1af9993ab331ef
3
+ size 5304