hdallatorre committed on
Commit
ff80424
·
verified ·
1 Parent(s): 870bbf9

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +41 -9
README.md CHANGED
@@ -39,29 +39,61 @@ A small snippet of code is given here in order to retrieve both logits and embed
39
  from transformers import AutoTokenizer, AutoModel
40
  import torch
41
 
42
- tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/segment_nt_30kb", use_auth_token=hf_token, trust_remote_code=True)
43
- model = AutoModel.from_pretrained("InstaDeepAI/segment_nt_30kb", use_auth_token=hf_token, trust_remote_code=True)
44
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  # Choose the length to which the input sequences are padded. By default, the
47
  # model max length is chosen, but feel free to decrease it as the time taken to
48
  # obtain the embeddings increases significantly with it.
49
- max_length = tokenizer.model_max_length
 
 
 
 
 
 
50
 
51
  # Create a dummy dna sequence and tokenize it
52
  sequences = ["ATTCCGATTCCGATTCCG", "ATTTCTCTCTCTCTCTGAGATCGATCGATCGAT"]
53
- tokens_ids = tokenizer.batch_encode_plus(sequences, return_tensors="pt", padding="max_length", max_length = max_length)["input_ids"]
54
 
55
- # Compute the embeddings
56
- attention_mask = torch_tokens != tokenizer.pad_token_id
57
  outs = model(
58
- torch_tokens,
59
  attention_mask=attention_mask,
60
  output_hidden_states=True
61
  )
62
 
63
- logits = outs.logits.detach().numpy()
 
 
64
  probabilities = torch.nn.functional.softmax(logits, dim=-1)
 
 
 
 
 
 
 
 
65
  ```
66
 
67
 
 
39
  from transformers import AutoTokenizer, AutoModel
40
  import torch
41
 
42
+ features = [
43
+ "protein_coding_gene",
44
+ "lncRNA",
45
+ "exon",
46
+ "intron",
47
+ "splice_donor",
48
+ "splice_acceptor",
49
+ "5UTR",
50
+ "3UTR",
51
+ "CTCF-bound",
52
+ "polyA_signal",
53
+ "enhancer_Tissue_specific",
54
+ "enhancer_Tissue_invariant",
55
+ "promoter_Tissue_specific",
56
+ "promoter_Tissue_invariant",
57
+ ]
58
+
59
+ tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/segment_nt_30kb", trust_remote_code=True)
60
+ model = AutoModel.from_pretrained("InstaDeepAI/segment_nt_30kb", trust_remote_code=True)
61
 
62
  # Choose the length to which the input sequences are padded. By default, the
63
  # model max length is chosen, but feel free to decrease it as the time taken to
64
  # obtain the embeddings increases significantly with it.
65
+ # The number of DNA tokens (excluding the CLS token prepended) needs to be divisible by
66
+ # 2 to the power of the number of downsampling blocks, i.e. 4.
67
+ max_length = 12 + 1
68
+
69
+ assert (max_length - 1) % 4 == 0, (
70
+ "The number of DNA tokens (excluding the CLS token prepended) needs to be dividible by"
71
+ "2 to the power of the number of downsampling block, i.e 4.")
72
 
73
  # Create a dummy dna sequence and tokenize it
74
  sequences = ["ATTCCGATTCCGATTCCG", "ATTTCTCTCTCTCTCTGAGATCGATCGATCGAT"]
75
+ tokens = tokenizer.batch_encode_plus(sequences, return_tensors="pt", padding="max_length", max_length = max_length)["input_ids"]
76
 
77
+ # Infer
78
+ attention_mask = tokens != tokenizer.pad_token_id
79
  outs = model(
80
+ tokens,
81
  attention_mask=attention_mask,
82
  output_hidden_states=True
83
  )
84
 
85
+ # Obtain the logits over the genomic features
86
+ logits = outs.logits.detach()
87
+ # Transform them into probabilities
88
  probabilities = torch.nn.functional.softmax(logits, dim=-1)
89
+ print(f"Probabilities shape: {probabilities.shape}")
90
+
91
+ # Get probabilities associated with intron
92
+ idx_intron = features.index("intron")
93
+ probabilities_intron = probabilities[:,:,idx_intron]
94
+ print(f"Intron probabilities shape: {probabilities_intron.shape}")
95
+
96
+
97
  ```
98
 
99