Ligeti Balázs committed on
Commit
12bee07
·
1 Parent(s): d514933

Tokenizer base

Browse files
config_utils.py ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Config utils
2
+ import yaml
3
+ import pathlib
4
+ from os.path import join
5
+ import os
6
+ import numpy as np
7
+ import torch
8
+ from multiprocessing import cpu_count
9
+
10
class BaseConfig:
    """Base class for managing and validating configurations.

    Subclasses are expected to set ``self.parameters`` to a nested mapping of
    the form ``{parameter_class: {parameter_name: spec}}``.  The methods below
    read the following keys from each ``spec``:

    * ``'type'``        -- name of the expected type (e.g. ``'int'``, ``'float'``)
    * ``'default'``     -- default value of the parameter
    * ``'constraints'`` -- optional mapping with ``'options'``, ``'min'``, ``'max'``
    * ``'description'`` -- human readable description
    """

    # Integer width in bytes -> numpy signed integer dtype of that width.
    numpy_dtype_mapping = {1: np.int8,
                           2: np.int16,
                           4: np.int32,
                           8: np.int64}

    # Type names accepted in a spec whose value is only checked, never converted.
    _CONTAINER_TYPES = {"list": list,
                        "tuple": tuple,
                        "set": set,
                        "dict": dict}

    # Every type name understood by ``cast_to_expected_type`` (except "type",
    # which is passed through unchecked) mapped to the Python type used by
    # ``validate_type``.  Keeping both spellings here makes validation agree
    # with casting.
    _PYTHON_TYPES = {"integer": int,
                     "int": int,
                     "float": float,
                     "string": str,
                     "str": str,
                     "boolean": bool,
                     "bool": bool,
                     "list": list,
                     "tuple": tuple,
                     "set": set,
                     "dict": dict}

    def __init__(self):
        super().__init__()

    def cast_to_expected_type(self, parameter_class: str, parameter_name: str, value: any) -> any:
        """
        Cast the given value to the expected type.

        :param parameter_class: The class/category of the parameter.
        :type parameter_class: str
        :param parameter_name: The name of the parameter.
        :type parameter_name: str
        :param value: The value to be casted.
        :type value: any
        :return: Value casted to the expected type.
        :rtype: any
        :raises ValueError: If casting fails or the expected type is unknown.
        """
        expected_type = self.parameters[parameter_class][parameter_name]['type']

        if expected_type in ("integer", "int"):
            try:
                return int(value)
            except (TypeError, ValueError, OverflowError) as err:
                # int(None) raises TypeError and int(float('inf')) raises
                # OverflowError; report both as the documented ValueError.
                raise ValueError(f"Failed to cast value '{value}' to integer for parameter '{parameter_name}' in class '{parameter_class}'.") from err

        if expected_type == "float":
            try:
                return float(value)
            except (TypeError, ValueError) as err:
                raise ValueError(f"Failed to cast value '{value}' to float for parameter '{parameter_name}' in class '{parameter_class}'.") from err

        if expected_type in ("string", "str"):
            return str(value)

        if expected_type in ("boolean", "bool"):
            if isinstance(value, bool):
                return value
            lowered = str(value).lower()
            if lowered == "true":
                return True
            if lowered == "false":
                return False
            raise ValueError(f"Failed to cast value '{value}' to boolean for parameter '{parameter_name}' in class '{parameter_class}'.")

        if expected_type == "type":
            # For this type, we simply return the value without casting.
            # It assumes the configuration provides valid Python types.
            return value

        if expected_type in self._CONTAINER_TYPES:
            # Containers are never converted, only checked.
            if isinstance(value, self._CONTAINER_TYPES[expected_type]):
                return value
            raise ValueError(f"Failed to validate value '{value}' as a {expected_type} for parameter '{parameter_name}' in class '{parameter_class}'.")

        raise ValueError(f"Unknown expected type '{expected_type}' for parameter '{parameter_name}' in class '{parameter_class}'.")

    def get_parameter(self, parameter_class: str, parameter_name: str) -> any:
        """
        Retrieve the default value of a specified parameter.

        :param parameter_class: The class/category of the parameter (e.g., 'segmentation').
        :type parameter_class: str
        :param parameter_name: The name of the parameter.
        :type parameter_name: str
        :return: Default value of the parameter, casted to the expected type.
        :rtype: any
        :raises ValueError: If the default cannot be casted to the expected type.
        """
        default_value = self.parameters[parameter_class][parameter_name]['default']
        return self.cast_to_expected_type(parameter_class, parameter_name, default_value)

    def validate_type(self, parameter_class: str, parameter_name: str, value: any) -> bool:
        """
        Validate the type of a given value against the expected type.

        The same type names as in :meth:`cast_to_expected_type` are understood
        (``'integer'``/``'int'``, ``'float'``, ``'string'``/``'str'``,
        ``'boolean'``/``'bool'``, ``'list'``, ``'tuple'``, ``'set'``, ``'dict'``).
        Any other type name (including ``'type'``) is not checked and is
        reported as valid.

        :param parameter_class: The class/category of the parameter.
        :type parameter_class: str
        :param parameter_name: The name of the parameter.
        :type parameter_name: str
        :param value: The value to be validated.
        :type value: any
        :return: True if the value is of the expected type, otherwise False.
        :rtype: bool
        """
        expected_type = self.parameters[parameter_class][parameter_name]['type']
        python_type = self._PYTHON_TYPES.get(expected_type)

        if python_type is None:
            return True
        if python_type is int and isinstance(value, bool):
            # bool is a subclass of int; True/False is not a valid integer setting.
            return False
        return isinstance(value, python_type)

    def validate_value(self, parameter_class: str, parameter_name: str, value: any) -> bool:
        """
        Validate the value of a parameter against its constraints.

        A missing ``'constraints'`` entry means that every value is accepted.

        :param parameter_class: The class/category of the parameter.
        :type parameter_class: str
        :param parameter_name: The name of the parameter.
        :type parameter_name: str
        :param value: The value to be validated.
        :type value: any
        :return: True if the value meets the constraints, otherwise False.
        :rtype: bool
        """
        constraints = self.parameters[parameter_class][parameter_name].get('constraints', {})

        if 'options' in constraints and value not in constraints['options']:
            return False
        if 'min' in constraints and value < constraints['min']:
            return False
        if 'max' in constraints and value > constraints['max']:
            return False
        return True

    def validate(self, parameter_class: str, parameter_name: str, value: any):
        """
        Validate both the type and value of a parameter.

        :param parameter_class: The class/category of the parameter.
        :type parameter_class: str
        :param parameter_name: The name of the parameter.
        :type parameter_name: str
        :param value: The value to be validated.
        :type value: any
        :raises TypeError: If the value is not of the expected type.
        :raises ValueError: If the value does not meet the parameter's constraints.
        """
        spec = self.parameters[parameter_class][parameter_name]

        # The type is checked first so that the min/max comparisons in
        # validate_value operate on a value of the expected type.
        if not self.validate_type(parameter_class, parameter_name, value):
            raise TypeError(f"Invalid type for {parameter_name} for parameter class '{parameter_class}'. Expected {spec['type']}.")

        if not self.validate_value(parameter_class, parameter_name, value):
            raise ValueError(f"Invalid value for {parameter_name} for parameter class '{parameter_class}'. Constraints: {spec.get('constraints', {})}.")

    def describe(self, parameter_class: str, parameter_name: str) -> str:
        """
        Retrieve the description of a parameter.

        :param parameter_class: The class/category of the parameter.
        :type parameter_class: str
        :param parameter_name: The name of the parameter.
        :type parameter_name: str
        :return: Description of the parameter.
        :rtype: str
        """
        return self.parameters[parameter_class][parameter_name]['description']
182
+
183
+
184
+
185
class SeqConfig(BaseConfig):
    """Class to manage and validate sequence processing configurations.

    The parameter specification is read from a YAML file (see
    :meth:`_get_default_sequence_processing_config_file`).  The 'segmentation',
    'tokenization' and 'computation' parameter classes are resolved on
    construction and stored in ``segmentation_params``, ``tokenization_params``
    and ``computational_params``.
    """

    def __init__(self):
        super().__init__()
        self.default_seq_config_file = self._get_default_sequence_processing_config_file()
        with open(self.default_seq_config_file, 'r', encoding='utf-8') as file:
            self.parameters = yaml.safe_load(file)

        # Post-processing: the largest allowed shift is the default k-mer size minus one.
        # NOTE(review): this bound is derived from the *default* k-mer only. If a
        # caller overrides 'kmer', the bound is not refreshed; the original
        # author's note says an update should be triggered in that case.
        self.parameters['tokenization']['shift']['constraints']['max'] = self.parameters['tokenization']['kmer']['default'] - 1

        self.get_and_set_segmentation_parameters()
        self.get_and_set_tokenization_parameters()
        self.get_and_set_computational_parameters()

    def _get_default_sequence_processing_config_file(self) -> str:
        """
        Retrieve the default sequence processing configuration file.

        The ``SEQ_CONFIG_FILE`` environment variable takes precedence; when it
        is not set, ``configs/sequence_processing.yaml`` next to this module is
        used and a notice is printed.  As a side effect the directory of this
        module is stored in ``self.current_path``.

        :return: Path to the configuration file.
        :rtype: str
        """
        current_path = pathlib.Path(__file__).parent
        self.current_path = current_path
        default_config_file = join(current_path, 'configs', 'sequence_processing.yaml')

        env_config_file = os.environ.get('SEQ_CONFIG_FILE')
        if env_config_file is None:
            print(f"SEQ_CONFIG_FILE environment variable has not been set. Using default value: {default_config_file}")
            return default_config_file
        return env_config_file

    def _resolve_parameters(self, parameter_class: str, parameters: dict = None) -> dict:
        """
        Build the parameter set of one class: defaults updated with validated overrides.

        :param parameter_class: The class/category of the parameters.
        :type parameter_class: str
        :param parameters: Overrides; ``None`` is treated like an empty dict.
        :type parameters: dict
        :return: Mapping of parameter name to value.
        :rtype: dict
        :raises ValueError: If an override is unknown or violates its constraints.
        :raises TypeError: If an override has the wrong type.
        """
        resolved = {name: self.get_parameter(parameter_class, name) for name in self.parameters[parameter_class]}

        for param, param_value in (parameters or {}).items():
            if param not in resolved:
                raise ValueError(f"The provided {param} is an INVALID {parameter_class} parameter! The valid parameters are: {list(resolved.keys())}")
            self.validate(parameter_class, param, param_value)
            resolved[param] = param_value
        return resolved

    def get_and_set_segmentation_parameters(self, parameters: dict = None) -> dict:
        """
        Retrieve and validate the provided parameters for segmentation.

        The result is also stored in ``self.segmentation_params``.

        :param parameters: A dictionary of parameters to be validated.
        :type parameters: dict
        :return: A dictionary of validated segmentation parameters.
        :rtype: dict
        :raises ValueError: If an invalid segmentation parameter is provided.
        """
        segmentation_params = self._resolve_parameters('segmentation', parameters)
        self.segmentation_params = segmentation_params
        return segmentation_params

    def get_and_set_tokenization_parameters(self, parameters: dict = None) -> dict:
        """
        Retrieve and validate the provided parameters for tokenization and load the vocabulary.

        When ``vocabfile`` is ``'auto'`` the vocabulary shipped for the active
        k-mer size is used and its path is written back to ``vocabfile``.  The
        vocabulary is read one token per line; the line index becomes the token
        id and the mapping is stored under ``'vocabmap'``.  The result is also
        stored in ``self.tokenization_params``.

        :param parameters: A dictionary of parameters to be validated.
        :type parameters: dict
        :return: A dictionary of validated tokenization parameters.
        :rtype: dict
        :raises ValueError: If an invalid tokenization parameter is provided.
        """
        tokenization_params = self._resolve_parameters('tokenization', parameters)

        # Locate the vocabulary file. It is assumed to be an ordered list of tokens.
        vocabfile = tokenization_params['vocabfile']
        act_kmer = tokenization_params['kmer']
        if vocabfile == 'auto':
            vocabfile_path = join(self.current_path, 'data/prokbert_vocabs/', f'prokbert-base-dna{act_kmer}', 'vocab.txt')
            tokenization_params['vocabfile'] = vocabfile_path
        else:
            vocabfile_path = vocabfile

        with open(vocabfile_path, encoding='utf-8') as vocabfile_in:
            vocabmap = {line.strip(): i for i, line in enumerate(vocabfile_in)}
        tokenization_params['vocabmap'] = vocabmap

        self.tokenization_params = tokenization_params
        return tokenization_params

    def get_and_set_computational_parameters(self, parameters: dict = None) -> dict:
        """
        Retrieve and validate the provided computational parameters.

        A value of ``-1`` for ``cpu_cores_for_segmentation`` or
        ``cpu_cores_for_tokenization`` is replaced by the number of CPU cores.
        The numpy dtype selected by ``numpy_token_integer_prec_byte`` is stored
        under ``'np_tokentype'``.  The result is also stored in
        ``self.computational_params``.

        :param parameters: A dictionary of parameters to be validated.
        :type parameters: dict
        :return: A dictionary of validated computational parameters.
        :rtype: dict
        :raises ValueError: If an invalid computation parameter is provided.
        """
        computational_params = self._resolve_parameters('computation', parameters)

        # Resolve the "use all cores" sentinel after the overrides were applied,
        # so that an explicitly passed -1 is handled like a default of -1.
        core_count = cpu_count()
        for core_param in ('cpu_cores_for_segmentation', 'cpu_cores_for_tokenization'):
            if computational_params[core_param] == -1:
                computational_params[core_param] = core_count

        np_tokentype = SeqConfig.numpy_dtype_mapping[computational_params['numpy_token_integer_prec_byte']]
        computational_params['np_tokentype'] = np_tokentype
        self.computational_params = computational_params
        return computational_params

    def get_maximum_segment_length_from_token_count_from_params(self):
        """Calculate the maximum segment length from the current tokenization parameters.

        Uses ``token_limit``, ``shift`` and ``kmer`` of ``self.tokenization_params``.
        """
        max_token_counts = self.tokenization_params['token_limit']
        shift = self.tokenization_params['shift']
        kmer = self.tokenization_params['kmer']
        return self.get_maximum_segment_length_from_token_count(max_token_counts, shift, kmer)

    def get_maximum_token_count_from_max_length_from_params(self):
        """Calculate the maximum token count from the current tokenization parameters.

        Uses ``max_segment_length``, ``shift`` and ``kmer`` of ``self.tokenization_params``.
        """
        max_segment_length = self.tokenization_params['max_segment_length']
        shift = self.tokenization_params['shift']
        kmer = self.tokenization_params['kmer']
        return self.get_maximum_token_count_from_max_length(max_segment_length, shift, kmer)

    @staticmethod
    def get_maximum_segment_length_from_token_count(max_token_counts, shift, kmer):
        """Calculate how long a sequence can be covered by ``max_token_counts`` tokens.

        Computed as ``(max_token_counts - 3) * shift + kmer``.
        NOTE(review): the constant 3 presumably reserves room for special
        tokens -- confirm against the tokenizer.
        """
        return (max_token_counts - 3) * shift + kmer

    @staticmethod
    def get_maximum_token_count_from_max_length(max_segment_length, shift, kmer):
        """Calculate how many tokens are needed for a segment of ``max_segment_length``.

        Computed as ``ceil((max_segment_length - kmer) / shift + 3)``, i.e. the
        inverse of :meth:`get_maximum_segment_length_from_token_count`, rounded up.
        """
        return int(np.ceil((max_segment_length - kmer) / shift + 3))
331
+
332
class ProkBERTConfig(BaseConfig):
    """Class to manage and validate pretraining configurations.

    The parameter specification is read from a YAML file (see
    :meth:`_get_default_pretrain_config_file`).  The 'data_collator', 'model',
    'dataset' and 'pretraining' parameter classes are resolved by this class;
    the 'segmentation', 'tokenization' and 'computation' sections of the same
    file are passed as overrides to an internal :class:`SeqConfig`.
    """

    # Integer width in bytes -> torch integer dtype.
    # NOTE(review): 1 byte maps to the unsigned torch.uint8 here, while
    # BaseConfig.numpy_dtype_mapping maps 1 byte to the signed np.int8 --
    # confirm that the difference is intended.
    torch_dtype_mapping = {1: torch.uint8,
                           2: torch.int16,
                           4: torch.int32,
                           8: torch.int64}

    def __init__(self):
        super().__init__()

        self.default_pretrain_config_file = self._get_default_pretrain_config_file()
        with open(self.default_pretrain_config_file, 'r', encoding='utf-8') as file:
            self.parameters = yaml.safe_load(file)

        # Load and validate each parameter set
        self.data_collator_params = self.get_set_parameters('data_collator')
        self.model_params = self.get_set_parameters('model')
        self.dataset_params = self.get_set_parameters('dataset')
        self.pretraining_params = self.get_set_parameters('pretraining')

        # The sequence processing parameters are managed by a SeqConfig; the
        # matching sections of the pretraining file act as overrides.  An empty
        # YAML section is loaded as None and is treated as "no overrides".
        self.def_seq_config = SeqConfig()
        self.segmentation_params = self.def_seq_config.get_and_set_segmentation_parameters(self.parameters['segmentation'] or {})
        self.tokenization_params = self.def_seq_config.get_and_set_tokenization_parameters(self.parameters['tokenization'] or {})
        self.computation_params = self.def_seq_config.get_and_set_computational_parameters(self.parameters['computation'] or {})

        self.default_torchtype = ProkBERTConfig.torch_dtype_mapping[self.computation_params['numpy_token_integer_prec_byte']]

    def _get_default_pretrain_config_file(self) -> str:
        """
        Retrieve the default pretraining configuration file.

        The ``PRETRAIN_CONFIG_FILE`` environment variable takes precedence; when
        it is not set, ``configs/pretraining.yaml`` next to this module is used
        and a notice is printed.

        :return: Path to the configuration file.
        :rtype: str
        """
        current_path = pathlib.Path(__file__).parent
        default_config_file = join(current_path, 'configs', 'pretraining.yaml')

        env_config_file = os.environ.get('PRETRAIN_CONFIG_FILE')
        if env_config_file is None:
            print(f"PRETRAIN_CONFIG_FILE environment variable has not been set. Using default value: {default_config_file}")
            return default_config_file
        return env_config_file

    def get_set_parameters(self, parameter_class: str, parameters: dict = None) -> dict:
        """
        Retrieve and validate the provided parameters for a given parameter class.

        The defaults of the class are validated as well, then the overrides are
        validated and applied on top of them.

        :param parameter_class: The class/category of the parameter (e.g., 'data_collator').
        :type parameter_class: str
        :param parameters: A dictionary of parameters to be validated; ``None`` means no overrides.
        :type parameters: dict
        :return: A dictionary of validated parameters.
        :rtype: dict
        :raises ValueError: If an invalid parameter is provided.
        :raises TypeError: If a parameter has the wrong type.
        """
        class_params = {k: self.get_parameter(parameter_class, k) for k in self.parameters[parameter_class]}

        # First validate the defaults of the class as well
        for param, param_value in class_params.items():
            self.validate(parameter_class, param, param_value)

        for param, param_value in (parameters or {}).items():
            if param not in class_params:
                raise ValueError(f"The provided {param} is an INVALID {parameter_class} parameter! The valid parameters are: {list(class_params.keys())}")
            self.validate(parameter_class, param, param_value)
            class_params[param] = param_value

        return class_params

    def get_and_set_model_parameters(self, parameters: dict = None) -> dict:
        """ Setting the model parameters """
        self.model_params = self.get_set_parameters('model', parameters)
        return self.model_params

    def get_and_set_dataset_parameters(self, parameters: dict = None) -> dict:
        """ Setting the dataset parameters """
        self.dataset_params = self.get_set_parameters('dataset', parameters)
        return self.dataset_params

    def get_and_set_pretraining_parameters(self, parameters: dict = None) -> dict:
        """ Setting the pretraining parameters """
        self.pretraining_params = self.get_set_parameters('pretraining', parameters)
        return self.pretraining_params

    def get_and_set_datacollator_parameters(self, parameters: dict = None) -> dict:
        """ Setting the data collator parameters """
        self.data_collator_params = self.get_set_parameters('data_collator', parameters)
        return self.data_collator_params

    def get_and_set_segmentation_parameters(self, parameters: dict = None) -> dict:
        """ Setting the segmentation parameters through the internal SeqConfig """
        self.segmentation_params = self.def_seq_config.get_and_set_segmentation_parameters(parameters or {})
        return self.segmentation_params

    def get_and_set_tokenization_parameters(self, parameters: dict = None) -> dict:
        """ Setting the tokenization parameters through the internal SeqConfig """
        self.tokenization_params = self.def_seq_config.get_and_set_tokenization_parameters(parameters or {})
        return self.tokenization_params

    def get_and_set_computation_params(self, parameters: dict = None) -> dict:
        """ Setting the computation parameters through the internal SeqConfig

        ``default_torchtype`` is refreshed as well, so that it follows a changed
        ``numpy_token_integer_prec_byte``.
        """
        self.computation_params = self.def_seq_config.get_and_set_computational_parameters(parameters or {})
        self.default_torchtype = ProkBERTConfig.torch_dtype_mapping[self.computation_params['numpy_token_integer_prec_byte']]
        return self.computation_params
data/prokbert_vocabs/prokbert-base-dna1/vocab.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ A
7
+ C
8
+ G
9
+ T
data/prokbert_vocabs/prokbert-base-dna2/vocab.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ AA
7
+ AC
8
+ AG
9
+ AT
10
+ CA
11
+ CC
12
+ CG
13
+ CT
14
+ GA
15
+ GC
16
+ GG
17
+ GT
18
+ TA
19
+ TC
20
+ TG
21
+ TT
data/prokbert_vocabs/prokbert-base-dna3/vocab.txt ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ AAA
7
+ AAC
8
+ AAG
9
+ AAT
10
+ ACA
11
+ ACC
12
+ ACG
13
+ ACT
14
+ AGA
15
+ AGC
16
+ AGG
17
+ AGT
18
+ ATA
19
+ ATC
20
+ ATG
21
+ ATT
22
+ CAA
23
+ CAC
24
+ CAG
25
+ CAT
26
+ CCA
27
+ CCC
28
+ CCG
29
+ CCT
30
+ CGA
31
+ CGC
32
+ CGG
33
+ CGT
34
+ CTA
35
+ CTC
36
+ CTG
37
+ CTT
38
+ GAA
39
+ GAC
40
+ GAG
41
+ GAT
42
+ GCA
43
+ GCC
44
+ GCG
45
+ GCT
46
+ GGA
47
+ GGC
48
+ GGG
49
+ GGT
50
+ GTA
51
+ GTC
52
+ GTG
53
+ GTT
54
+ TAA
55
+ TAC
56
+ TAG
57
+ TAT
58
+ TCA
59
+ TCC
60
+ TCG
61
+ TCT
62
+ TGA
63
+ TGC
64
+ TGG
65
+ TGT
66
+ TTA
67
+ TTC
68
+ TTG
69
+ TTT
data/prokbert_vocabs/prokbert-base-dna4/vocab.txt ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ AAAA
7
+ AAAC
8
+ AAAG
9
+ AAAT
10
+ AACA
11
+ AACC
12
+ AACG
13
+ AACT
14
+ AAGA
15
+ AAGC
16
+ AAGG
17
+ AAGT
18
+ AATA
19
+ AATC
20
+ AATG
21
+ AATT
22
+ ACAA
23
+ ACAC
24
+ ACAG
25
+ ACAT
26
+ ACCA
27
+ ACCC
28
+ ACCG
29
+ ACCT
30
+ ACGA
31
+ ACGC
32
+ ACGG
33
+ ACGT
34
+ ACTA
35
+ ACTC
36
+ ACTG
37
+ ACTT
38
+ AGAA
39
+ AGAC
40
+ AGAG
41
+ AGAT
42
+ AGCA
43
+ AGCC
44
+ AGCG
45
+ AGCT
46
+ AGGA
47
+ AGGC
48
+ AGGG
49
+ AGGT
50
+ AGTA
51
+ AGTC
52
+ AGTG
53
+ AGTT
54
+ ATAA
55
+ ATAC
56
+ ATAG
57
+ ATAT
58
+ ATCA
59
+ ATCC
60
+ ATCG
61
+ ATCT
62
+ ATGA
63
+ ATGC
64
+ ATGG
65
+ ATGT
66
+ ATTA
67
+ ATTC
68
+ ATTG
69
+ ATTT
70
+ CAAA
71
+ CAAC
72
+ CAAG
73
+ CAAT
74
+ CACA
75
+ CACC
76
+ CACG
77
+ CACT
78
+ CAGA
79
+ CAGC
80
+ CAGG
81
+ CAGT
82
+ CATA
83
+ CATC
84
+ CATG
85
+ CATT
86
+ CCAA
87
+ CCAC
88
+ CCAG
89
+ CCAT
90
+ CCCA
91
+ CCCC
92
+ CCCG
93
+ CCCT
94
+ CCGA
95
+ CCGC
96
+ CCGG
97
+ CCGT
98
+ CCTA
99
+ CCTC
100
+ CCTG
101
+ CCTT
102
+ CGAA
103
+ CGAC
104
+ CGAG
105
+ CGAT
106
+ CGCA
107
+ CGCC
108
+ CGCG
109
+ CGCT
110
+ CGGA
111
+ CGGC
112
+ CGGG
113
+ CGGT
114
+ CGTA
115
+ CGTC
116
+ CGTG
117
+ CGTT
118
+ CTAA
119
+ CTAC
120
+ CTAG
121
+ CTAT
122
+ CTCA
123
+ CTCC
124
+ CTCG
125
+ CTCT
126
+ CTGA
127
+ CTGC
128
+ CTGG
129
+ CTGT
130
+ CTTA
131
+ CTTC
132
+ CTTG
133
+ CTTT
134
+ GAAA
135
+ GAAC
136
+ GAAG
137
+ GAAT
138
+ GACA
139
+ GACC
140
+ GACG
141
+ GACT
142
+ GAGA
143
+ GAGC
144
+ GAGG
145
+ GAGT
146
+ GATA
147
+ GATC
148
+ GATG
149
+ GATT
150
+ GCAA
151
+ GCAC
152
+ GCAG
153
+ GCAT
154
+ GCCA
155
+ GCCC
156
+ GCCG
157
+ GCCT
158
+ GCGA
159
+ GCGC
160
+ GCGG
161
+ GCGT
162
+ GCTA
163
+ GCTC
164
+ GCTG
165
+ GCTT
166
+ GGAA
167
+ GGAC
168
+ GGAG
169
+ GGAT
170
+ GGCA
171
+ GGCC
172
+ GGCG
173
+ GGCT
174
+ GGGA
175
+ GGGC
176
+ GGGG
177
+ GGGT
178
+ GGTA
179
+ GGTC
180
+ GGTG
181
+ GGTT
182
+ GTAA
183
+ GTAC
184
+ GTAG
185
+ GTAT
186
+ GTCA
187
+ GTCC
188
+ GTCG
189
+ GTCT
190
+ GTGA
191
+ GTGC
192
+ GTGG
193
+ GTGT
194
+ GTTA
195
+ GTTC
196
+ GTTG
197
+ GTTT
198
+ TAAA
199
+ TAAC
200
+ TAAG
201
+ TAAT
202
+ TACA
203
+ TACC
204
+ TACG
205
+ TACT
206
+ TAGA
207
+ TAGC
208
+ TAGG
209
+ TAGT
210
+ TATA
211
+ TATC
212
+ TATG
213
+ TATT
214
+ TCAA
215
+ TCAC
216
+ TCAG
217
+ TCAT
218
+ TCCA
219
+ TCCC
220
+ TCCG
221
+ TCCT
222
+ TCGA
223
+ TCGC
224
+ TCGG
225
+ TCGT
226
+ TCTA
227
+ TCTC
228
+ TCTG
229
+ TCTT
230
+ TGAA
231
+ TGAC
232
+ TGAG
233
+ TGAT
234
+ TGCA
235
+ TGCC
236
+ TGCG
237
+ TGCT
238
+ TGGA
239
+ TGGC
240
+ TGGG
241
+ TGGT
242
+ TGTA
243
+ TGTC
244
+ TGTG
245
+ TGTT
246
+ TTAA
247
+ TTAC
248
+ TTAG
249
+ TTAT
250
+ TTCA
251
+ TTCC
252
+ TTCG
253
+ TTCT
254
+ TTGA
255
+ TTGC
256
+ TTGG
257
+ TTGT
258
+ TTTA
259
+ TTTC
260
+ TTTG
261
+ TTTT
data/prokbert_vocabs/prokbert-base-dna5/vocab.txt ADDED
@@ -0,0 +1,1029 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ AAAAA
7
+ AAAAC
8
+ AAAAG
9
+ AAAAT
10
+ AAACA
11
+ AAACC
12
+ AAACG
13
+ AAACT
14
+ AAAGA
15
+ AAAGC
16
+ AAAGG
17
+ AAAGT
18
+ AAATA
19
+ AAATC
20
+ AAATG
21
+ AAATT
22
+ AACAA
23
+ AACAC
24
+ AACAG
25
+ AACAT
26
+ AACCA
27
+ AACCC
28
+ AACCG
29
+ AACCT
30
+ AACGA
31
+ AACGC
32
+ AACGG
33
+ AACGT
34
+ AACTA
35
+ AACTC
36
+ AACTG
37
+ AACTT
38
+ AAGAA
39
+ AAGAC
40
+ AAGAG
41
+ AAGAT
42
+ AAGCA
43
+ AAGCC
44
+ AAGCG
45
+ AAGCT
46
+ AAGGA
47
+ AAGGC
48
+ AAGGG
49
+ AAGGT
50
+ AAGTA
51
+ AAGTC
52
+ AAGTG
53
+ AAGTT
54
+ AATAA
55
+ AATAC
56
+ AATAG
57
+ AATAT
58
+ AATCA
59
+ AATCC
60
+ AATCG
61
+ AATCT
62
+ AATGA
63
+ AATGC
64
+ AATGG
65
+ AATGT
66
+ AATTA
67
+ AATTC
68
+ AATTG
69
+ AATTT
70
+ ACAAA
71
+ ACAAC
72
+ ACAAG
73
+ ACAAT
74
+ ACACA
75
+ ACACC
76
+ ACACG
77
+ ACACT
78
+ ACAGA
79
+ ACAGC
80
+ ACAGG
81
+ ACAGT
82
+ ACATA
83
+ ACATC
84
+ ACATG
85
+ ACATT
86
+ ACCAA
87
+ ACCAC
88
+ ACCAG
89
+ ACCAT
90
+ ACCCA
91
+ ACCCC
92
+ ACCCG
93
+ ACCCT
94
+ ACCGA
95
+ ACCGC
96
+ ACCGG
97
+ ACCGT
98
+ ACCTA
99
+ ACCTC
100
+ ACCTG
101
+ ACCTT
102
+ ACGAA
103
+ ACGAC
104
+ ACGAG
105
+ ACGAT
106
+ ACGCA
107
+ ACGCC
108
+ ACGCG
109
+ ACGCT
110
+ ACGGA
111
+ ACGGC
112
+ ACGGG
113
+ ACGGT
114
+ ACGTA
115
+ ACGTC
116
+ ACGTG
117
+ ACGTT
118
+ ACTAA
119
+ ACTAC
120
+ ACTAG
121
+ ACTAT
122
+ ACTCA
123
+ ACTCC
124
+ ACTCG
125
+ ACTCT
126
+ ACTGA
127
+ ACTGC
128
+ ACTGG
129
+ ACTGT
130
+ ACTTA
131
+ ACTTC
132
+ ACTTG
133
+ ACTTT
134
+ AGAAA
135
+ AGAAC
136
+ AGAAG
137
+ AGAAT
138
+ AGACA
139
+ AGACC
140
+ AGACG
141
+ AGACT
142
+ AGAGA
143
+ AGAGC
144
+ AGAGG
145
+ AGAGT
146
+ AGATA
147
+ AGATC
148
+ AGATG
149
+ AGATT
150
+ AGCAA
151
+ AGCAC
152
+ AGCAG
153
+ AGCAT
154
+ AGCCA
155
+ AGCCC
156
+ AGCCG
157
+ AGCCT
158
+ AGCGA
159
+ AGCGC
160
+ AGCGG
161
+ AGCGT
162
+ AGCTA
163
+ AGCTC
164
+ AGCTG
165
+ AGCTT
166
+ AGGAA
167
+ AGGAC
168
+ AGGAG
169
+ AGGAT
170
+ AGGCA
171
+ AGGCC
172
+ AGGCG
173
+ AGGCT
174
+ AGGGA
175
+ AGGGC
176
+ AGGGG
177
+ AGGGT
178
+ AGGTA
179
+ AGGTC
180
+ AGGTG
181
+ AGGTT
182
+ AGTAA
183
+ AGTAC
184
+ AGTAG
185
+ AGTAT
186
+ AGTCA
187
+ AGTCC
188
+ AGTCG
189
+ AGTCT
190
+ AGTGA
191
+ AGTGC
192
+ AGTGG
193
+ AGTGT
194
+ AGTTA
195
+ AGTTC
196
+ AGTTG
197
+ AGTTT
198
+ ATAAA
199
+ ATAAC
200
+ ATAAG
201
+ ATAAT
202
+ ATACA
203
+ ATACC
204
+ ATACG
205
+ ATACT
206
+ ATAGA
207
+ ATAGC
208
+ ATAGG
209
+ ATAGT
210
+ ATATA
211
+ ATATC
212
+ ATATG
213
+ ATATT
214
+ ATCAA
215
+ ATCAC
216
+ ATCAG
217
+ ATCAT
218
+ ATCCA
219
+ ATCCC
220
+ ATCCG
221
+ ATCCT
222
+ ATCGA
223
+ ATCGC
224
+ ATCGG
225
+ ATCGT
226
+ ATCTA
227
+ ATCTC
228
+ ATCTG
229
+ ATCTT
230
+ ATGAA
231
+ ATGAC
232
+ ATGAG
233
+ ATGAT
234
+ ATGCA
235
+ ATGCC
236
+ ATGCG
237
+ ATGCT
238
+ ATGGA
239
+ ATGGC
240
+ ATGGG
241
+ ATGGT
242
+ ATGTA
243
+ ATGTC
244
+ ATGTG
245
+ ATGTT
246
+ ATTAA
247
+ ATTAC
248
+ ATTAG
249
+ ATTAT
250
+ ATTCA
251
+ ATTCC
252
+ ATTCG
253
+ ATTCT
254
+ ATTGA
255
+ ATTGC
256
+ ATTGG
257
+ ATTGT
258
+ ATTTA
259
+ ATTTC
260
+ ATTTG
261
+ ATTTT
262
+ CAAAA
263
+ CAAAC
264
+ CAAAG
265
+ CAAAT
266
+ CAACA
267
+ CAACC
268
+ CAACG
269
+ CAACT
270
+ CAAGA
271
+ CAAGC
272
+ CAAGG
273
+ CAAGT
274
+ CAATA
275
+ CAATC
276
+ CAATG
277
+ CAATT
278
+ CACAA
279
+ CACAC
280
+ CACAG
281
+ CACAT
282
+ CACCA
283
+ CACCC
284
+ CACCG
285
+ CACCT
286
+ CACGA
287
+ CACGC
288
+ CACGG
289
+ CACGT
290
+ CACTA
291
+ CACTC
292
+ CACTG
293
+ CACTT
294
+ CAGAA
295
+ CAGAC
296
+ CAGAG
297
+ CAGAT
298
+ CAGCA
299
+ CAGCC
300
+ CAGCG
301
+ CAGCT
302
+ CAGGA
303
+ CAGGC
304
+ CAGGG
305
+ CAGGT
306
+ CAGTA
307
+ CAGTC
308
+ CAGTG
309
+ CAGTT
310
+ CATAA
311
+ CATAC
312
+ CATAG
313
+ CATAT
314
+ CATCA
315
+ CATCC
316
+ CATCG
317
+ CATCT
318
+ CATGA
319
+ CATGC
320
+ CATGG
321
+ CATGT
322
+ CATTA
323
+ CATTC
324
+ CATTG
325
+ CATTT
326
+ CCAAA
327
+ CCAAC
328
+ CCAAG
329
+ CCAAT
330
+ CCACA
331
+ CCACC
332
+ CCACG
333
+ CCACT
334
+ CCAGA
335
+ CCAGC
336
+ CCAGG
337
+ CCAGT
338
+ CCATA
339
+ CCATC
340
+ CCATG
341
+ CCATT
342
+ CCCAA
343
+ CCCAC
344
+ CCCAG
345
+ CCCAT
346
+ CCCCA
347
+ CCCCC
348
+ CCCCG
349
+ CCCCT
350
+ CCCGA
351
+ CCCGC
352
+ CCCGG
353
+ CCCGT
354
+ CCCTA
355
+ CCCTC
356
+ CCCTG
357
+ CCCTT
358
+ CCGAA
359
+ CCGAC
360
+ CCGAG
361
+ CCGAT
362
+ CCGCA
363
+ CCGCC
364
+ CCGCG
365
+ CCGCT
366
+ CCGGA
367
+ CCGGC
368
+ CCGGG
369
+ CCGGT
370
+ CCGTA
371
+ CCGTC
372
+ CCGTG
373
+ CCGTT
374
+ CCTAA
375
+ CCTAC
376
+ CCTAG
377
+ CCTAT
378
+ CCTCA
379
+ CCTCC
380
+ CCTCG
381
+ CCTCT
382
+ CCTGA
383
+ CCTGC
384
+ CCTGG
385
+ CCTGT
386
+ CCTTA
387
+ CCTTC
388
+ CCTTG
389
+ CCTTT
390
+ CGAAA
391
+ CGAAC
392
+ CGAAG
393
+ CGAAT
394
+ CGACA
395
+ CGACC
396
+ CGACG
397
+ CGACT
398
+ CGAGA
399
+ CGAGC
400
+ CGAGG
401
+ CGAGT
402
+ CGATA
403
+ CGATC
404
+ CGATG
405
+ CGATT
406
+ CGCAA
407
+ CGCAC
408
+ CGCAG
409
+ CGCAT
410
+ CGCCA
411
+ CGCCC
412
+ CGCCG
413
+ CGCCT
414
+ CGCGA
415
+ CGCGC
416
+ CGCGG
417
+ CGCGT
418
+ CGCTA
419
+ CGCTC
420
+ CGCTG
421
+ CGCTT
422
+ CGGAA
423
+ CGGAC
424
+ CGGAG
425
+ CGGAT
426
+ CGGCA
427
+ CGGCC
428
+ CGGCG
429
+ CGGCT
430
+ CGGGA
431
+ CGGGC
432
+ CGGGG
433
+ CGGGT
434
+ CGGTA
435
+ CGGTC
436
+ CGGTG
437
+ CGGTT
438
+ CGTAA
439
+ CGTAC
440
+ CGTAG
441
+ CGTAT
442
+ CGTCA
443
+ CGTCC
444
+ CGTCG
445
+ CGTCT
446
+ CGTGA
447
+ CGTGC
448
+ CGTGG
449
+ CGTGT
450
+ CGTTA
451
+ CGTTC
452
+ CGTTG
453
+ CGTTT
454
+ CTAAA
455
+ CTAAC
456
+ CTAAG
457
+ CTAAT
458
+ CTACA
459
+ CTACC
460
+ CTACG
461
+ CTACT
462
+ CTAGA
463
+ CTAGC
464
+ CTAGG
465
+ CTAGT
466
+ CTATA
467
+ CTATC
468
+ CTATG
469
+ CTATT
470
+ CTCAA
471
+ CTCAC
472
+ CTCAG
473
+ CTCAT
474
+ CTCCA
475
+ CTCCC
476
+ CTCCG
477
+ CTCCT
478
+ CTCGA
479
+ CTCGC
480
+ CTCGG
481
+ CTCGT
482
+ CTCTA
483
+ CTCTC
484
+ CTCTG
485
+ CTCTT
486
+ CTGAA
487
+ CTGAC
488
+ CTGAG
489
+ CTGAT
490
+ CTGCA
491
+ CTGCC
492
+ CTGCG
493
+ CTGCT
494
+ CTGGA
495
+ CTGGC
496
+ CTGGG
497
+ CTGGT
498
+ CTGTA
499
+ CTGTC
500
+ CTGTG
501
+ CTGTT
502
+ CTTAA
503
+ CTTAC
504
+ CTTAG
505
+ CTTAT
506
+ CTTCA
507
+ CTTCC
508
+ CTTCG
509
+ CTTCT
510
+ CTTGA
511
+ CTTGC
512
+ CTTGG
513
+ CTTGT
514
+ CTTTA
515
+ CTTTC
516
+ CTTTG
517
+ CTTTT
518
+ GAAAA
519
+ GAAAC
520
+ GAAAG
521
+ GAAAT
522
+ GAACA
523
+ GAACC
524
+ GAACG
525
+ GAACT
526
+ GAAGA
527
+ GAAGC
528
+ GAAGG
529
+ GAAGT
530
+ GAATA
531
+ GAATC
532
+ GAATG
533
+ GAATT
534
+ GACAA
535
+ GACAC
536
+ GACAG
537
+ GACAT
538
+ GACCA
539
+ GACCC
540
+ GACCG
541
+ GACCT
542
+ GACGA
543
+ GACGC
544
+ GACGG
545
+ GACGT
546
+ GACTA
547
+ GACTC
548
+ GACTG
549
+ GACTT
550
+ GAGAA
551
+ GAGAC
552
+ GAGAG
553
+ GAGAT
554
+ GAGCA
555
+ GAGCC
556
+ GAGCG
557
+ GAGCT
558
+ GAGGA
559
+ GAGGC
560
+ GAGGG
561
+ GAGGT
562
+ GAGTA
563
+ GAGTC
564
+ GAGTG
565
+ GAGTT
566
+ GATAA
567
+ GATAC
568
+ GATAG
569
+ GATAT
570
+ GATCA
571
+ GATCC
572
+ GATCG
573
+ GATCT
574
+ GATGA
575
+ GATGC
576
+ GATGG
577
+ GATGT
578
+ GATTA
579
+ GATTC
580
+ GATTG
581
+ GATTT
582
+ GCAAA
583
+ GCAAC
584
+ GCAAG
585
+ GCAAT
586
+ GCACA
587
+ GCACC
588
+ GCACG
589
+ GCACT
590
+ GCAGA
591
+ GCAGC
592
+ GCAGG
593
+ GCAGT
594
+ GCATA
595
+ GCATC
596
+ GCATG
597
+ GCATT
598
+ GCCAA
599
+ GCCAC
600
+ GCCAG
601
+ GCCAT
602
+ GCCCA
603
+ GCCCC
604
+ GCCCG
605
+ GCCCT
606
+ GCCGA
607
+ GCCGC
608
+ GCCGG
609
+ GCCGT
610
+ GCCTA
611
+ GCCTC
612
+ GCCTG
613
+ GCCTT
614
+ GCGAA
615
+ GCGAC
616
+ GCGAG
617
+ GCGAT
618
+ GCGCA
619
+ GCGCC
620
+ GCGCG
621
+ GCGCT
622
+ GCGGA
623
+ GCGGC
624
+ GCGGG
625
+ GCGGT
626
+ GCGTA
627
+ GCGTC
628
+ GCGTG
629
+ GCGTT
630
+ GCTAA
631
+ GCTAC
632
+ GCTAG
633
+ GCTAT
634
+ GCTCA
635
+ GCTCC
636
+ GCTCG
637
+ GCTCT
638
+ GCTGA
639
+ GCTGC
640
+ GCTGG
641
+ GCTGT
642
+ GCTTA
643
+ GCTTC
644
+ GCTTG
645
+ GCTTT
646
+ GGAAA
647
+ GGAAC
648
+ GGAAG
649
+ GGAAT
650
+ GGACA
651
+ GGACC
652
+ GGACG
653
+ GGACT
654
+ GGAGA
655
+ GGAGC
656
+ GGAGG
657
+ GGAGT
658
+ GGATA
659
+ GGATC
660
+ GGATG
661
+ GGATT
662
+ GGCAA
663
+ GGCAC
664
+ GGCAG
665
+ GGCAT
666
+ GGCCA
667
+ GGCCC
668
+ GGCCG
669
+ GGCCT
670
+ GGCGA
671
+ GGCGC
672
+ GGCGG
673
+ GGCGT
674
+ GGCTA
675
+ GGCTC
676
+ GGCTG
677
+ GGCTT
678
+ GGGAA
679
+ GGGAC
680
+ GGGAG
681
+ GGGAT
682
+ GGGCA
683
+ GGGCC
684
+ GGGCG
685
+ GGGCT
686
+ GGGGA
687
+ GGGGC
688
+ GGGGG
689
+ GGGGT
690
+ GGGTA
691
+ GGGTC
692
+ GGGTG
693
+ GGGTT
694
+ GGTAA
695
+ GGTAC
696
+ GGTAG
697
+ GGTAT
698
+ GGTCA
699
+ GGTCC
700
+ GGTCG
701
+ GGTCT
702
+ GGTGA
703
+ GGTGC
704
+ GGTGG
705
+ GGTGT
706
+ GGTTA
707
+ GGTTC
708
+ GGTTG
709
+ GGTTT
710
+ GTAAA
711
+ GTAAC
712
+ GTAAG
713
+ GTAAT
714
+ GTACA
715
+ GTACC
716
+ GTACG
717
+ GTACT
718
+ GTAGA
719
+ GTAGC
720
+ GTAGG
721
+ GTAGT
722
+ GTATA
723
+ GTATC
724
+ GTATG
725
+ GTATT
726
+ GTCAA
727
+ GTCAC
728
+ GTCAG
729
+ GTCAT
730
+ GTCCA
731
+ GTCCC
732
+ GTCCG
733
+ GTCCT
734
+ GTCGA
735
+ GTCGC
736
+ GTCGG
737
+ GTCGT
738
+ GTCTA
739
+ GTCTC
740
+ GTCTG
741
+ GTCTT
742
+ GTGAA
743
+ GTGAC
744
+ GTGAG
745
+ GTGAT
746
+ GTGCA
747
+ GTGCC
748
+ GTGCG
749
+ GTGCT
750
+ GTGGA
751
+ GTGGC
752
+ GTGGG
753
+ GTGGT
754
+ GTGTA
755
+ GTGTC
756
+ GTGTG
757
+ GTGTT
758
+ GTTAA
759
+ GTTAC
760
+ GTTAG
761
+ GTTAT
762
+ GTTCA
763
+ GTTCC
764
+ GTTCG
765
+ GTTCT
766
+ GTTGA
767
+ GTTGC
768
+ GTTGG
769
+ GTTGT
770
+ GTTTA
771
+ GTTTC
772
+ GTTTG
773
+ GTTTT
774
+ TAAAA
775
+ TAAAC
776
+ TAAAG
777
+ TAAAT
778
+ TAACA
779
+ TAACC
780
+ TAACG
781
+ TAACT
782
+ TAAGA
783
+ TAAGC
784
+ TAAGG
785
+ TAAGT
786
+ TAATA
787
+ TAATC
788
+ TAATG
789
+ TAATT
790
+ TACAA
791
+ TACAC
792
+ TACAG
793
+ TACAT
794
+ TACCA
795
+ TACCC
796
+ TACCG
797
+ TACCT
798
+ TACGA
799
+ TACGC
800
+ TACGG
801
+ TACGT
802
+ TACTA
803
+ TACTC
804
+ TACTG
805
+ TACTT
806
+ TAGAA
807
+ TAGAC
808
+ TAGAG
809
+ TAGAT
810
+ TAGCA
811
+ TAGCC
812
+ TAGCG
813
+ TAGCT
814
+ TAGGA
815
+ TAGGC
816
+ TAGGG
817
+ TAGGT
818
+ TAGTA
819
+ TAGTC
820
+ TAGTG
821
+ TAGTT
822
+ TATAA
823
+ TATAC
824
+ TATAG
825
+ TATAT
826
+ TATCA
827
+ TATCC
828
+ TATCG
829
+ TATCT
830
+ TATGA
831
+ TATGC
832
+ TATGG
833
+ TATGT
834
+ TATTA
835
+ TATTC
836
+ TATTG
837
+ TATTT
838
+ TCAAA
839
+ TCAAC
840
+ TCAAG
841
+ TCAAT
842
+ TCACA
843
+ TCACC
844
+ TCACG
845
+ TCACT
846
+ TCAGA
847
+ TCAGC
848
+ TCAGG
849
+ TCAGT
850
+ TCATA
851
+ TCATC
852
+ TCATG
853
+ TCATT
854
+ TCCAA
855
+ TCCAC
856
+ TCCAG
857
+ TCCAT
858
+ TCCCA
859
+ TCCCC
860
+ TCCCG
861
+ TCCCT
862
+ TCCGA
863
+ TCCGC
864
+ TCCGG
865
+ TCCGT
866
+ TCCTA
867
+ TCCTC
868
+ TCCTG
869
+ TCCTT
870
+ TCGAA
871
+ TCGAC
872
+ TCGAG
873
+ TCGAT
874
+ TCGCA
875
+ TCGCC
876
+ TCGCG
877
+ TCGCT
878
+ TCGGA
879
+ TCGGC
880
+ TCGGG
881
+ TCGGT
882
+ TCGTA
883
+ TCGTC
884
+ TCGTG
885
+ TCGTT
886
+ TCTAA
887
+ TCTAC
888
+ TCTAG
889
+ TCTAT
890
+ TCTCA
891
+ TCTCC
892
+ TCTCG
893
+ TCTCT
894
+ TCTGA
895
+ TCTGC
896
+ TCTGG
897
+ TCTGT
898
+ TCTTA
899
+ TCTTC
900
+ TCTTG
901
+ TCTTT
902
+ TGAAA
903
+ TGAAC
904
+ TGAAG
905
+ TGAAT
906
+ TGACA
907
+ TGACC
908
+ TGACG
909
+ TGACT
910
+ TGAGA
911
+ TGAGC
912
+ TGAGG
913
+ TGAGT
914
+ TGATA
915
+ TGATC
916
+ TGATG
917
+ TGATT
918
+ TGCAA
919
+ TGCAC
920
+ TGCAG
921
+ TGCAT
922
+ TGCCA
923
+ TGCCC
924
+ TGCCG
925
+ TGCCT
926
+ TGCGA
927
+ TGCGC
928
+ TGCGG
929
+ TGCGT
930
+ TGCTA
931
+ TGCTC
932
+ TGCTG
933
+ TGCTT
934
+ TGGAA
935
+ TGGAC
936
+ TGGAG
937
+ TGGAT
938
+ TGGCA
939
+ TGGCC
940
+ TGGCG
941
+ TGGCT
942
+ TGGGA
943
+ TGGGC
944
+ TGGGG
945
+ TGGGT
946
+ TGGTA
947
+ TGGTC
948
+ TGGTG
949
+ TGGTT
950
+ TGTAA
951
+ TGTAC
952
+ TGTAG
953
+ TGTAT
954
+ TGTCA
955
+ TGTCC
956
+ TGTCG
957
+ TGTCT
958
+ TGTGA
959
+ TGTGC
960
+ TGTGG
961
+ TGTGT
962
+ TGTTA
963
+ TGTTC
964
+ TGTTG
965
+ TGTTT
966
+ TTAAA
967
+ TTAAC
968
+ TTAAG
969
+ TTAAT
970
+ TTACA
971
+ TTACC
972
+ TTACG
973
+ TTACT
974
+ TTAGA
975
+ TTAGC
976
+ TTAGG
977
+ TTAGT
978
+ TTATA
979
+ TTATC
980
+ TTATG
981
+ TTATT
982
+ TTCAA
983
+ TTCAC
984
+ TTCAG
985
+ TTCAT
986
+ TTCCA
987
+ TTCCC
988
+ TTCCG
989
+ TTCCT
990
+ TTCGA
991
+ TTCGC
992
+ TTCGG
993
+ TTCGT
994
+ TTCTA
995
+ TTCTC
996
+ TTCTG
997
+ TTCTT
998
+ TTGAA
999
+ TTGAC
1000
+ TTGAG
1001
+ TTGAT
1002
+ TTGCA
1003
+ TTGCC
1004
+ TTGCG
1005
+ TTGCT
1006
+ TTGGA
1007
+ TTGGC
1008
+ TTGGG
1009
+ TTGGT
1010
+ TTGTA
1011
+ TTGTC
1012
+ TTGTG
1013
+ TTGTT
1014
+ TTTAA
1015
+ TTTAC
1016
+ TTTAG
1017
+ TTTAT
1018
+ TTTCA
1019
+ TTTCC
1020
+ TTTCG
1021
+ TTTCT
1022
+ TTTGA
1023
+ TTTGC
1024
+ TTTGG
1025
+ TTTGT
1026
+ TTTTA
1027
+ TTTTC
1028
+ TTTTG
1029
+ TTTTT
data/prokbert_vocabs/prokbert-base-dna6/vocab.txt ADDED
@@ -0,0 +1,4101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ AAAAAA
7
+ AAAAAC
8
+ AAAAAG
9
+ AAAAAT
10
+ AAAACA
11
+ AAAACC
12
+ AAAACG
13
+ AAAACT
14
+ AAAAGA
15
+ AAAAGC
16
+ AAAAGG
17
+ AAAAGT
18
+ AAAATA
19
+ AAAATC
20
+ AAAATG
21
+ AAAATT
22
+ AAACAA
23
+ AAACAC
24
+ AAACAG
25
+ AAACAT
26
+ AAACCA
27
+ AAACCC
28
+ AAACCG
29
+ AAACCT
30
+ AAACGA
31
+ AAACGC
32
+ AAACGG
33
+ AAACGT
34
+ AAACTA
35
+ AAACTC
36
+ AAACTG
37
+ AAACTT
38
+ AAAGAA
39
+ AAAGAC
40
+ AAAGAG
41
+ AAAGAT
42
+ AAAGCA
43
+ AAAGCC
44
+ AAAGCG
45
+ AAAGCT
46
+ AAAGGA
47
+ AAAGGC
48
+ AAAGGG
49
+ AAAGGT
50
+ AAAGTA
51
+ AAAGTC
52
+ AAAGTG
53
+ AAAGTT
54
+ AAATAA
55
+ AAATAC
56
+ AAATAG
57
+ AAATAT
58
+ AAATCA
59
+ AAATCC
60
+ AAATCG
61
+ AAATCT
62
+ AAATGA
63
+ AAATGC
64
+ AAATGG
65
+ AAATGT
66
+ AAATTA
67
+ AAATTC
68
+ AAATTG
69
+ AAATTT
70
+ AACAAA
71
+ AACAAC
72
+ AACAAG
73
+ AACAAT
74
+ AACACA
75
+ AACACC
76
+ AACACG
77
+ AACACT
78
+ AACAGA
79
+ AACAGC
80
+ AACAGG
81
+ AACAGT
82
+ AACATA
83
+ AACATC
84
+ AACATG
85
+ AACATT
86
+ AACCAA
87
+ AACCAC
88
+ AACCAG
89
+ AACCAT
90
+ AACCCA
91
+ AACCCC
92
+ AACCCG
93
+ AACCCT
94
+ AACCGA
95
+ AACCGC
96
+ AACCGG
97
+ AACCGT
98
+ AACCTA
99
+ AACCTC
100
+ AACCTG
101
+ AACCTT
102
+ AACGAA
103
+ AACGAC
104
+ AACGAG
105
+ AACGAT
106
+ AACGCA
107
+ AACGCC
108
+ AACGCG
109
+ AACGCT
110
+ AACGGA
111
+ AACGGC
112
+ AACGGG
113
+ AACGGT
114
+ AACGTA
115
+ AACGTC
116
+ AACGTG
117
+ AACGTT
118
+ AACTAA
119
+ AACTAC
120
+ AACTAG
121
+ AACTAT
122
+ AACTCA
123
+ AACTCC
124
+ AACTCG
125
+ AACTCT
126
+ AACTGA
127
+ AACTGC
128
+ AACTGG
129
+ AACTGT
130
+ AACTTA
131
+ AACTTC
132
+ AACTTG
133
+ AACTTT
134
+ AAGAAA
135
+ AAGAAC
136
+ AAGAAG
137
+ AAGAAT
138
+ AAGACA
139
+ AAGACC
140
+ AAGACG
141
+ AAGACT
142
+ AAGAGA
143
+ AAGAGC
144
+ AAGAGG
145
+ AAGAGT
146
+ AAGATA
147
+ AAGATC
148
+ AAGATG
149
+ AAGATT
150
+ AAGCAA
151
+ AAGCAC
152
+ AAGCAG
153
+ AAGCAT
154
+ AAGCCA
155
+ AAGCCC
156
+ AAGCCG
157
+ AAGCCT
158
+ AAGCGA
159
+ AAGCGC
160
+ AAGCGG
161
+ AAGCGT
162
+ AAGCTA
163
+ AAGCTC
164
+ AAGCTG
165
+ AAGCTT
166
+ AAGGAA
167
+ AAGGAC
168
+ AAGGAG
169
+ AAGGAT
170
+ AAGGCA
171
+ AAGGCC
172
+ AAGGCG
173
+ AAGGCT
174
+ AAGGGA
175
+ AAGGGC
176
+ AAGGGG
177
+ AAGGGT
178
+ AAGGTA
179
+ AAGGTC
180
+ AAGGTG
181
+ AAGGTT
182
+ AAGTAA
183
+ AAGTAC
184
+ AAGTAG
185
+ AAGTAT
186
+ AAGTCA
187
+ AAGTCC
188
+ AAGTCG
189
+ AAGTCT
190
+ AAGTGA
191
+ AAGTGC
192
+ AAGTGG
193
+ AAGTGT
194
+ AAGTTA
195
+ AAGTTC
196
+ AAGTTG
197
+ AAGTTT
198
+ AATAAA
199
+ AATAAC
200
+ AATAAG
201
+ AATAAT
202
+ AATACA
203
+ AATACC
204
+ AATACG
205
+ AATACT
206
+ AATAGA
207
+ AATAGC
208
+ AATAGG
209
+ AATAGT
210
+ AATATA
211
+ AATATC
212
+ AATATG
213
+ AATATT
214
+ AATCAA
215
+ AATCAC
216
+ AATCAG
217
+ AATCAT
218
+ AATCCA
219
+ AATCCC
220
+ AATCCG
221
+ AATCCT
222
+ AATCGA
223
+ AATCGC
224
+ AATCGG
225
+ AATCGT
226
+ AATCTA
227
+ AATCTC
228
+ AATCTG
229
+ AATCTT
230
+ AATGAA
231
+ AATGAC
232
+ AATGAG
233
+ AATGAT
234
+ AATGCA
235
+ AATGCC
236
+ AATGCG
237
+ AATGCT
238
+ AATGGA
239
+ AATGGC
240
+ AATGGG
241
+ AATGGT
242
+ AATGTA
243
+ AATGTC
244
+ AATGTG
245
+ AATGTT
246
+ AATTAA
247
+ AATTAC
248
+ AATTAG
249
+ AATTAT
250
+ AATTCA
251
+ AATTCC
252
+ AATTCG
253
+ AATTCT
254
+ AATTGA
255
+ AATTGC
256
+ AATTGG
257
+ AATTGT
258
+ AATTTA
259
+ AATTTC
260
+ AATTTG
261
+ AATTTT
262
+ ACAAAA
263
+ ACAAAC
264
+ ACAAAG
265
+ ACAAAT
266
+ ACAACA
267
+ ACAACC
268
+ ACAACG
269
+ ACAACT
270
+ ACAAGA
271
+ ACAAGC
272
+ ACAAGG
273
+ ACAAGT
274
+ ACAATA
275
+ ACAATC
276
+ ACAATG
277
+ ACAATT
278
+ ACACAA
279
+ ACACAC
280
+ ACACAG
281
+ ACACAT
282
+ ACACCA
283
+ ACACCC
284
+ ACACCG
285
+ ACACCT
286
+ ACACGA
287
+ ACACGC
288
+ ACACGG
289
+ ACACGT
290
+ ACACTA
291
+ ACACTC
292
+ ACACTG
293
+ ACACTT
294
+ ACAGAA
295
+ ACAGAC
296
+ ACAGAG
297
+ ACAGAT
298
+ ACAGCA
299
+ ACAGCC
300
+ ACAGCG
301
+ ACAGCT
302
+ ACAGGA
303
+ ACAGGC
304
+ ACAGGG
305
+ ACAGGT
306
+ ACAGTA
307
+ ACAGTC
308
+ ACAGTG
309
+ ACAGTT
310
+ ACATAA
311
+ ACATAC
312
+ ACATAG
313
+ ACATAT
314
+ ACATCA
315
+ ACATCC
316
+ ACATCG
317
+ ACATCT
318
+ ACATGA
319
+ ACATGC
320
+ ACATGG
321
+ ACATGT
322
+ ACATTA
323
+ ACATTC
324
+ ACATTG
325
+ ACATTT
326
+ ACCAAA
327
+ ACCAAC
328
+ ACCAAG
329
+ ACCAAT
330
+ ACCACA
331
+ ACCACC
332
+ ACCACG
333
+ ACCACT
334
+ ACCAGA
335
+ ACCAGC
336
+ ACCAGG
337
+ ACCAGT
338
+ ACCATA
339
+ ACCATC
340
+ ACCATG
341
+ ACCATT
342
+ ACCCAA
343
+ ACCCAC
344
+ ACCCAG
345
+ ACCCAT
346
+ ACCCCA
347
+ ACCCCC
348
+ ACCCCG
349
+ ACCCCT
350
+ ACCCGA
351
+ ACCCGC
352
+ ACCCGG
353
+ ACCCGT
354
+ ACCCTA
355
+ ACCCTC
356
+ ACCCTG
357
+ ACCCTT
358
+ ACCGAA
359
+ ACCGAC
360
+ ACCGAG
361
+ ACCGAT
362
+ ACCGCA
363
+ ACCGCC
364
+ ACCGCG
365
+ ACCGCT
366
+ ACCGGA
367
+ ACCGGC
368
+ ACCGGG
369
+ ACCGGT
370
+ ACCGTA
371
+ ACCGTC
372
+ ACCGTG
373
+ ACCGTT
374
+ ACCTAA
375
+ ACCTAC
376
+ ACCTAG
377
+ ACCTAT
378
+ ACCTCA
379
+ ACCTCC
380
+ ACCTCG
381
+ ACCTCT
382
+ ACCTGA
383
+ ACCTGC
384
+ ACCTGG
385
+ ACCTGT
386
+ ACCTTA
387
+ ACCTTC
388
+ ACCTTG
389
+ ACCTTT
390
+ ACGAAA
391
+ ACGAAC
392
+ ACGAAG
393
+ ACGAAT
394
+ ACGACA
395
+ ACGACC
396
+ ACGACG
397
+ ACGACT
398
+ ACGAGA
399
+ ACGAGC
400
+ ACGAGG
401
+ ACGAGT
402
+ ACGATA
403
+ ACGATC
404
+ ACGATG
405
+ ACGATT
406
+ ACGCAA
407
+ ACGCAC
408
+ ACGCAG
409
+ ACGCAT
410
+ ACGCCA
411
+ ACGCCC
412
+ ACGCCG
413
+ ACGCCT
414
+ ACGCGA
415
+ ACGCGC
416
+ ACGCGG
417
+ ACGCGT
418
+ ACGCTA
419
+ ACGCTC
420
+ ACGCTG
421
+ ACGCTT
422
+ ACGGAA
423
+ ACGGAC
424
+ ACGGAG
425
+ ACGGAT
426
+ ACGGCA
427
+ ACGGCC
428
+ ACGGCG
429
+ ACGGCT
430
+ ACGGGA
431
+ ACGGGC
432
+ ACGGGG
433
+ ACGGGT
434
+ ACGGTA
435
+ ACGGTC
436
+ ACGGTG
437
+ ACGGTT
438
+ ACGTAA
439
+ ACGTAC
440
+ ACGTAG
441
+ ACGTAT
442
+ ACGTCA
443
+ ACGTCC
444
+ ACGTCG
445
+ ACGTCT
446
+ ACGTGA
447
+ ACGTGC
448
+ ACGTGG
449
+ ACGTGT
450
+ ACGTTA
451
+ ACGTTC
452
+ ACGTTG
453
+ ACGTTT
454
+ ACTAAA
455
+ ACTAAC
456
+ ACTAAG
457
+ ACTAAT
458
+ ACTACA
459
+ ACTACC
460
+ ACTACG
461
+ ACTACT
462
+ ACTAGA
463
+ ACTAGC
464
+ ACTAGG
465
+ ACTAGT
466
+ ACTATA
467
+ ACTATC
468
+ ACTATG
469
+ ACTATT
470
+ ACTCAA
471
+ ACTCAC
472
+ ACTCAG
473
+ ACTCAT
474
+ ACTCCA
475
+ ACTCCC
476
+ ACTCCG
477
+ ACTCCT
478
+ ACTCGA
479
+ ACTCGC
480
+ ACTCGG
481
+ ACTCGT
482
+ ACTCTA
483
+ ACTCTC
484
+ ACTCTG
485
+ ACTCTT
486
+ ACTGAA
487
+ ACTGAC
488
+ ACTGAG
489
+ ACTGAT
490
+ ACTGCA
491
+ ACTGCC
492
+ ACTGCG
493
+ ACTGCT
494
+ ACTGGA
495
+ ACTGGC
496
+ ACTGGG
497
+ ACTGGT
498
+ ACTGTA
499
+ ACTGTC
500
+ ACTGTG
501
+ ACTGTT
502
+ ACTTAA
503
+ ACTTAC
504
+ ACTTAG
505
+ ACTTAT
506
+ ACTTCA
507
+ ACTTCC
508
+ ACTTCG
509
+ ACTTCT
510
+ ACTTGA
511
+ ACTTGC
512
+ ACTTGG
513
+ ACTTGT
514
+ ACTTTA
515
+ ACTTTC
516
+ ACTTTG
517
+ ACTTTT
518
+ AGAAAA
519
+ AGAAAC
520
+ AGAAAG
521
+ AGAAAT
522
+ AGAACA
523
+ AGAACC
524
+ AGAACG
525
+ AGAACT
526
+ AGAAGA
527
+ AGAAGC
528
+ AGAAGG
529
+ AGAAGT
530
+ AGAATA
531
+ AGAATC
532
+ AGAATG
533
+ AGAATT
534
+ AGACAA
535
+ AGACAC
536
+ AGACAG
537
+ AGACAT
538
+ AGACCA
539
+ AGACCC
540
+ AGACCG
541
+ AGACCT
542
+ AGACGA
543
+ AGACGC
544
+ AGACGG
545
+ AGACGT
546
+ AGACTA
547
+ AGACTC
548
+ AGACTG
549
+ AGACTT
550
+ AGAGAA
551
+ AGAGAC
552
+ AGAGAG
553
+ AGAGAT
554
+ AGAGCA
555
+ AGAGCC
556
+ AGAGCG
557
+ AGAGCT
558
+ AGAGGA
559
+ AGAGGC
560
+ AGAGGG
561
+ AGAGGT
562
+ AGAGTA
563
+ AGAGTC
564
+ AGAGTG
565
+ AGAGTT
566
+ AGATAA
567
+ AGATAC
568
+ AGATAG
569
+ AGATAT
570
+ AGATCA
571
+ AGATCC
572
+ AGATCG
573
+ AGATCT
574
+ AGATGA
575
+ AGATGC
576
+ AGATGG
577
+ AGATGT
578
+ AGATTA
579
+ AGATTC
580
+ AGATTG
581
+ AGATTT
582
+ AGCAAA
583
+ AGCAAC
584
+ AGCAAG
585
+ AGCAAT
586
+ AGCACA
587
+ AGCACC
588
+ AGCACG
589
+ AGCACT
590
+ AGCAGA
591
+ AGCAGC
592
+ AGCAGG
593
+ AGCAGT
594
+ AGCATA
595
+ AGCATC
596
+ AGCATG
597
+ AGCATT
598
+ AGCCAA
599
+ AGCCAC
600
+ AGCCAG
601
+ AGCCAT
602
+ AGCCCA
603
+ AGCCCC
604
+ AGCCCG
605
+ AGCCCT
606
+ AGCCGA
607
+ AGCCGC
608
+ AGCCGG
609
+ AGCCGT
610
+ AGCCTA
611
+ AGCCTC
612
+ AGCCTG
613
+ AGCCTT
614
+ AGCGAA
615
+ AGCGAC
616
+ AGCGAG
617
+ AGCGAT
618
+ AGCGCA
619
+ AGCGCC
620
+ AGCGCG
621
+ AGCGCT
622
+ AGCGGA
623
+ AGCGGC
624
+ AGCGGG
625
+ AGCGGT
626
+ AGCGTA
627
+ AGCGTC
628
+ AGCGTG
629
+ AGCGTT
630
+ AGCTAA
631
+ AGCTAC
632
+ AGCTAG
633
+ AGCTAT
634
+ AGCTCA
635
+ AGCTCC
636
+ AGCTCG
637
+ AGCTCT
638
+ AGCTGA
639
+ AGCTGC
640
+ AGCTGG
641
+ AGCTGT
642
+ AGCTTA
643
+ AGCTTC
644
+ AGCTTG
645
+ AGCTTT
646
+ AGGAAA
647
+ AGGAAC
648
+ AGGAAG
649
+ AGGAAT
650
+ AGGACA
651
+ AGGACC
652
+ AGGACG
653
+ AGGACT
654
+ AGGAGA
655
+ AGGAGC
656
+ AGGAGG
657
+ AGGAGT
658
+ AGGATA
659
+ AGGATC
660
+ AGGATG
661
+ AGGATT
662
+ AGGCAA
663
+ AGGCAC
664
+ AGGCAG
665
+ AGGCAT
666
+ AGGCCA
667
+ AGGCCC
668
+ AGGCCG
669
+ AGGCCT
670
+ AGGCGA
671
+ AGGCGC
672
+ AGGCGG
673
+ AGGCGT
674
+ AGGCTA
675
+ AGGCTC
676
+ AGGCTG
677
+ AGGCTT
678
+ AGGGAA
679
+ AGGGAC
680
+ AGGGAG
681
+ AGGGAT
682
+ AGGGCA
683
+ AGGGCC
684
+ AGGGCG
685
+ AGGGCT
686
+ AGGGGA
687
+ AGGGGC
688
+ AGGGGG
689
+ AGGGGT
690
+ AGGGTA
691
+ AGGGTC
692
+ AGGGTG
693
+ AGGGTT
694
+ AGGTAA
695
+ AGGTAC
696
+ AGGTAG
697
+ AGGTAT
698
+ AGGTCA
699
+ AGGTCC
700
+ AGGTCG
701
+ AGGTCT
702
+ AGGTGA
703
+ AGGTGC
704
+ AGGTGG
705
+ AGGTGT
706
+ AGGTTA
707
+ AGGTTC
708
+ AGGTTG
709
+ AGGTTT
710
+ AGTAAA
711
+ AGTAAC
712
+ AGTAAG
713
+ AGTAAT
714
+ AGTACA
715
+ AGTACC
716
+ AGTACG
717
+ AGTACT
718
+ AGTAGA
719
+ AGTAGC
720
+ AGTAGG
721
+ AGTAGT
722
+ AGTATA
723
+ AGTATC
724
+ AGTATG
725
+ AGTATT
726
+ AGTCAA
727
+ AGTCAC
728
+ AGTCAG
729
+ AGTCAT
730
+ AGTCCA
731
+ AGTCCC
732
+ AGTCCG
733
+ AGTCCT
734
+ AGTCGA
735
+ AGTCGC
736
+ AGTCGG
737
+ AGTCGT
738
+ AGTCTA
739
+ AGTCTC
740
+ AGTCTG
741
+ AGTCTT
742
+ AGTGAA
743
+ AGTGAC
744
+ AGTGAG
745
+ AGTGAT
746
+ AGTGCA
747
+ AGTGCC
748
+ AGTGCG
749
+ AGTGCT
750
+ AGTGGA
751
+ AGTGGC
752
+ AGTGGG
753
+ AGTGGT
754
+ AGTGTA
755
+ AGTGTC
756
+ AGTGTG
757
+ AGTGTT
758
+ AGTTAA
759
+ AGTTAC
760
+ AGTTAG
761
+ AGTTAT
762
+ AGTTCA
763
+ AGTTCC
764
+ AGTTCG
765
+ AGTTCT
766
+ AGTTGA
767
+ AGTTGC
768
+ AGTTGG
769
+ AGTTGT
770
+ AGTTTA
771
+ AGTTTC
772
+ AGTTTG
773
+ AGTTTT
774
+ ATAAAA
775
+ ATAAAC
776
+ ATAAAG
777
+ ATAAAT
778
+ ATAACA
779
+ ATAACC
780
+ ATAACG
781
+ ATAACT
782
+ ATAAGA
783
+ ATAAGC
784
+ ATAAGG
785
+ ATAAGT
786
+ ATAATA
787
+ ATAATC
788
+ ATAATG
789
+ ATAATT
790
+ ATACAA
791
+ ATACAC
792
+ ATACAG
793
+ ATACAT
794
+ ATACCA
795
+ ATACCC
796
+ ATACCG
797
+ ATACCT
798
+ ATACGA
799
+ ATACGC
800
+ ATACGG
801
+ ATACGT
802
+ ATACTA
803
+ ATACTC
804
+ ATACTG
805
+ ATACTT
806
+ ATAGAA
807
+ ATAGAC
808
+ ATAGAG
809
+ ATAGAT
810
+ ATAGCA
811
+ ATAGCC
812
+ ATAGCG
813
+ ATAGCT
814
+ ATAGGA
815
+ ATAGGC
816
+ ATAGGG
817
+ ATAGGT
818
+ ATAGTA
819
+ ATAGTC
820
+ ATAGTG
821
+ ATAGTT
822
+ ATATAA
823
+ ATATAC
824
+ ATATAG
825
+ ATATAT
826
+ ATATCA
827
+ ATATCC
828
+ ATATCG
829
+ ATATCT
830
+ ATATGA
831
+ ATATGC
832
+ ATATGG
833
+ ATATGT
834
+ ATATTA
835
+ ATATTC
836
+ ATATTG
837
+ ATATTT
838
+ ATCAAA
839
+ ATCAAC
840
+ ATCAAG
841
+ ATCAAT
842
+ ATCACA
843
+ ATCACC
844
+ ATCACG
845
+ ATCACT
846
+ ATCAGA
847
+ ATCAGC
848
+ ATCAGG
849
+ ATCAGT
850
+ ATCATA
851
+ ATCATC
852
+ ATCATG
853
+ ATCATT
854
+ ATCCAA
855
+ ATCCAC
856
+ ATCCAG
857
+ ATCCAT
858
+ ATCCCA
859
+ ATCCCC
860
+ ATCCCG
861
+ ATCCCT
862
+ ATCCGA
863
+ ATCCGC
864
+ ATCCGG
865
+ ATCCGT
866
+ ATCCTA
867
+ ATCCTC
868
+ ATCCTG
869
+ ATCCTT
870
+ ATCGAA
871
+ ATCGAC
872
+ ATCGAG
873
+ ATCGAT
874
+ ATCGCA
875
+ ATCGCC
876
+ ATCGCG
877
+ ATCGCT
878
+ ATCGGA
879
+ ATCGGC
880
+ ATCGGG
881
+ ATCGGT
882
+ ATCGTA
883
+ ATCGTC
884
+ ATCGTG
885
+ ATCGTT
886
+ ATCTAA
887
+ ATCTAC
888
+ ATCTAG
889
+ ATCTAT
890
+ ATCTCA
891
+ ATCTCC
892
+ ATCTCG
893
+ ATCTCT
894
+ ATCTGA
895
+ ATCTGC
896
+ ATCTGG
897
+ ATCTGT
898
+ ATCTTA
899
+ ATCTTC
900
+ ATCTTG
901
+ ATCTTT
902
+ ATGAAA
903
+ ATGAAC
904
+ ATGAAG
905
+ ATGAAT
906
+ ATGACA
907
+ ATGACC
908
+ ATGACG
909
+ ATGACT
910
+ ATGAGA
911
+ ATGAGC
912
+ ATGAGG
913
+ ATGAGT
914
+ ATGATA
915
+ ATGATC
916
+ ATGATG
917
+ ATGATT
918
+ ATGCAA
919
+ ATGCAC
920
+ ATGCAG
921
+ ATGCAT
922
+ ATGCCA
923
+ ATGCCC
924
+ ATGCCG
925
+ ATGCCT
926
+ ATGCGA
927
+ ATGCGC
928
+ ATGCGG
929
+ ATGCGT
930
+ ATGCTA
931
+ ATGCTC
932
+ ATGCTG
933
+ ATGCTT
934
+ ATGGAA
935
+ ATGGAC
936
+ ATGGAG
937
+ ATGGAT
938
+ ATGGCA
939
+ ATGGCC
940
+ ATGGCG
941
+ ATGGCT
942
+ ATGGGA
943
+ ATGGGC
944
+ ATGGGG
945
+ ATGGGT
946
+ ATGGTA
947
+ ATGGTC
948
+ ATGGTG
949
+ ATGGTT
950
+ ATGTAA
951
+ ATGTAC
952
+ ATGTAG
953
+ ATGTAT
954
+ ATGTCA
955
+ ATGTCC
956
+ ATGTCG
957
+ ATGTCT
958
+ ATGTGA
959
+ ATGTGC
960
+ ATGTGG
961
+ ATGTGT
962
+ ATGTTA
963
+ ATGTTC
964
+ ATGTTG
965
+ ATGTTT
966
+ ATTAAA
967
+ ATTAAC
968
+ ATTAAG
969
+ ATTAAT
970
+ ATTACA
971
+ ATTACC
972
+ ATTACG
973
+ ATTACT
974
+ ATTAGA
975
+ ATTAGC
976
+ ATTAGG
977
+ ATTAGT
978
+ ATTATA
979
+ ATTATC
980
+ ATTATG
981
+ ATTATT
982
+ ATTCAA
983
+ ATTCAC
984
+ ATTCAG
985
+ ATTCAT
986
+ ATTCCA
987
+ ATTCCC
988
+ ATTCCG
989
+ ATTCCT
990
+ ATTCGA
991
+ ATTCGC
992
+ ATTCGG
993
+ ATTCGT
994
+ ATTCTA
995
+ ATTCTC
996
+ ATTCTG
997
+ ATTCTT
998
+ ATTGAA
999
+ ATTGAC
1000
+ ATTGAG
1001
+ ATTGAT
1002
+ ATTGCA
1003
+ ATTGCC
1004
+ ATTGCG
1005
+ ATTGCT
1006
+ ATTGGA
1007
+ ATTGGC
1008
+ ATTGGG
1009
+ ATTGGT
1010
+ ATTGTA
1011
+ ATTGTC
1012
+ ATTGTG
1013
+ ATTGTT
1014
+ ATTTAA
1015
+ ATTTAC
1016
+ ATTTAG
1017
+ ATTTAT
1018
+ ATTTCA
1019
+ ATTTCC
1020
+ ATTTCG
1021
+ ATTTCT
1022
+ ATTTGA
1023
+ ATTTGC
1024
+ ATTTGG
1025
+ ATTTGT
1026
+ ATTTTA
1027
+ ATTTTC
1028
+ ATTTTG
1029
+ ATTTTT
1030
+ CAAAAA
1031
+ CAAAAC
1032
+ CAAAAG
1033
+ CAAAAT
1034
+ CAAACA
1035
+ CAAACC
1036
+ CAAACG
1037
+ CAAACT
1038
+ CAAAGA
1039
+ CAAAGC
1040
+ CAAAGG
1041
+ CAAAGT
1042
+ CAAATA
1043
+ CAAATC
1044
+ CAAATG
1045
+ CAAATT
1046
+ CAACAA
1047
+ CAACAC
1048
+ CAACAG
1049
+ CAACAT
1050
+ CAACCA
1051
+ CAACCC
1052
+ CAACCG
1053
+ CAACCT
1054
+ CAACGA
1055
+ CAACGC
1056
+ CAACGG
1057
+ CAACGT
1058
+ CAACTA
1059
+ CAACTC
1060
+ CAACTG
1061
+ CAACTT
1062
+ CAAGAA
1063
+ CAAGAC
1064
+ CAAGAG
1065
+ CAAGAT
1066
+ CAAGCA
1067
+ CAAGCC
1068
+ CAAGCG
1069
+ CAAGCT
1070
+ CAAGGA
1071
+ CAAGGC
1072
+ CAAGGG
1073
+ CAAGGT
1074
+ CAAGTA
1075
+ CAAGTC
1076
+ CAAGTG
1077
+ CAAGTT
1078
+ CAATAA
1079
+ CAATAC
1080
+ CAATAG
1081
+ CAATAT
1082
+ CAATCA
1083
+ CAATCC
1084
+ CAATCG
1085
+ CAATCT
1086
+ CAATGA
1087
+ CAATGC
1088
+ CAATGG
1089
+ CAATGT
1090
+ CAATTA
1091
+ CAATTC
1092
+ CAATTG
1093
+ CAATTT
1094
+ CACAAA
1095
+ CACAAC
1096
+ CACAAG
1097
+ CACAAT
1098
+ CACACA
1099
+ CACACC
1100
+ CACACG
1101
+ CACACT
1102
+ CACAGA
1103
+ CACAGC
1104
+ CACAGG
1105
+ CACAGT
1106
+ CACATA
1107
+ CACATC
1108
+ CACATG
1109
+ CACATT
1110
+ CACCAA
1111
+ CACCAC
1112
+ CACCAG
1113
+ CACCAT
1114
+ CACCCA
1115
+ CACCCC
1116
+ CACCCG
1117
+ CACCCT
1118
+ CACCGA
1119
+ CACCGC
1120
+ CACCGG
1121
+ CACCGT
1122
+ CACCTA
1123
+ CACCTC
1124
+ CACCTG
1125
+ CACCTT
1126
+ CACGAA
1127
+ CACGAC
1128
+ CACGAG
1129
+ CACGAT
1130
+ CACGCA
1131
+ CACGCC
1132
+ CACGCG
1133
+ CACGCT
1134
+ CACGGA
1135
+ CACGGC
1136
+ CACGGG
1137
+ CACGGT
1138
+ CACGTA
1139
+ CACGTC
1140
+ CACGTG
1141
+ CACGTT
1142
+ CACTAA
1143
+ CACTAC
1144
+ CACTAG
1145
+ CACTAT
1146
+ CACTCA
1147
+ CACTCC
1148
+ CACTCG
1149
+ CACTCT
1150
+ CACTGA
1151
+ CACTGC
1152
+ CACTGG
1153
+ CACTGT
1154
+ CACTTA
1155
+ CACTTC
1156
+ CACTTG
1157
+ CACTTT
1158
+ CAGAAA
1159
+ CAGAAC
1160
+ CAGAAG
1161
+ CAGAAT
1162
+ CAGACA
1163
+ CAGACC
1164
+ CAGACG
1165
+ CAGACT
1166
+ CAGAGA
1167
+ CAGAGC
1168
+ CAGAGG
1169
+ CAGAGT
1170
+ CAGATA
1171
+ CAGATC
1172
+ CAGATG
1173
+ CAGATT
1174
+ CAGCAA
1175
+ CAGCAC
1176
+ CAGCAG
1177
+ CAGCAT
1178
+ CAGCCA
1179
+ CAGCCC
1180
+ CAGCCG
1181
+ CAGCCT
1182
+ CAGCGA
1183
+ CAGCGC
1184
+ CAGCGG
1185
+ CAGCGT
1186
+ CAGCTA
1187
+ CAGCTC
1188
+ CAGCTG
1189
+ CAGCTT
1190
+ CAGGAA
1191
+ CAGGAC
1192
+ CAGGAG
1193
+ CAGGAT
1194
+ CAGGCA
1195
+ CAGGCC
1196
+ CAGGCG
1197
+ CAGGCT
1198
+ CAGGGA
1199
+ CAGGGC
1200
+ CAGGGG
1201
+ CAGGGT
1202
+ CAGGTA
1203
+ CAGGTC
1204
+ CAGGTG
1205
+ CAGGTT
1206
+ CAGTAA
1207
+ CAGTAC
1208
+ CAGTAG
1209
+ CAGTAT
1210
+ CAGTCA
1211
+ CAGTCC
1212
+ CAGTCG
1213
+ CAGTCT
1214
+ CAGTGA
1215
+ CAGTGC
1216
+ CAGTGG
1217
+ CAGTGT
1218
+ CAGTTA
1219
+ CAGTTC
1220
+ CAGTTG
1221
+ CAGTTT
1222
+ CATAAA
1223
+ CATAAC
1224
+ CATAAG
1225
+ CATAAT
1226
+ CATACA
1227
+ CATACC
1228
+ CATACG
1229
+ CATACT
1230
+ CATAGA
1231
+ CATAGC
1232
+ CATAGG
1233
+ CATAGT
1234
+ CATATA
1235
+ CATATC
1236
+ CATATG
1237
+ CATATT
1238
+ CATCAA
1239
+ CATCAC
1240
+ CATCAG
1241
+ CATCAT
1242
+ CATCCA
1243
+ CATCCC
1244
+ CATCCG
1245
+ CATCCT
1246
+ CATCGA
1247
+ CATCGC
1248
+ CATCGG
1249
+ CATCGT
1250
+ CATCTA
1251
+ CATCTC
1252
+ CATCTG
1253
+ CATCTT
1254
+ CATGAA
1255
+ CATGAC
1256
+ CATGAG
1257
+ CATGAT
1258
+ CATGCA
1259
+ CATGCC
1260
+ CATGCG
1261
+ CATGCT
1262
+ CATGGA
1263
+ CATGGC
1264
+ CATGGG
1265
+ CATGGT
1266
+ CATGTA
1267
+ CATGTC
1268
+ CATGTG
1269
+ CATGTT
1270
+ CATTAA
1271
+ CATTAC
1272
+ CATTAG
1273
+ CATTAT
1274
+ CATTCA
1275
+ CATTCC
1276
+ CATTCG
1277
+ CATTCT
1278
+ CATTGA
1279
+ CATTGC
1280
+ CATTGG
1281
+ CATTGT
1282
+ CATTTA
1283
+ CATTTC
1284
+ CATTTG
1285
+ CATTTT
1286
+ CCAAAA
1287
+ CCAAAC
1288
+ CCAAAG
1289
+ CCAAAT
1290
+ CCAACA
1291
+ CCAACC
1292
+ CCAACG
1293
+ CCAACT
1294
+ CCAAGA
1295
+ CCAAGC
1296
+ CCAAGG
1297
+ CCAAGT
1298
+ CCAATA
1299
+ CCAATC
1300
+ CCAATG
1301
+ CCAATT
1302
+ CCACAA
1303
+ CCACAC
1304
+ CCACAG
1305
+ CCACAT
1306
+ CCACCA
1307
+ CCACCC
1308
+ CCACCG
1309
+ CCACCT
1310
+ CCACGA
1311
+ CCACGC
1312
+ CCACGG
1313
+ CCACGT
1314
+ CCACTA
1315
+ CCACTC
1316
+ CCACTG
1317
+ CCACTT
1318
+ CCAGAA
1319
+ CCAGAC
1320
+ CCAGAG
1321
+ CCAGAT
1322
+ CCAGCA
1323
+ CCAGCC
1324
+ CCAGCG
1325
+ CCAGCT
1326
+ CCAGGA
1327
+ CCAGGC
1328
+ CCAGGG
1329
+ CCAGGT
1330
+ CCAGTA
1331
+ CCAGTC
1332
+ CCAGTG
1333
+ CCAGTT
1334
+ CCATAA
1335
+ CCATAC
1336
+ CCATAG
1337
+ CCATAT
1338
+ CCATCA
1339
+ CCATCC
1340
+ CCATCG
1341
+ CCATCT
1342
+ CCATGA
1343
+ CCATGC
1344
+ CCATGG
1345
+ CCATGT
1346
+ CCATTA
1347
+ CCATTC
1348
+ CCATTG
1349
+ CCATTT
1350
+ CCCAAA
1351
+ CCCAAC
1352
+ CCCAAG
1353
+ CCCAAT
1354
+ CCCACA
1355
+ CCCACC
1356
+ CCCACG
1357
+ CCCACT
1358
+ CCCAGA
1359
+ CCCAGC
1360
+ CCCAGG
1361
+ CCCAGT
1362
+ CCCATA
1363
+ CCCATC
1364
+ CCCATG
1365
+ CCCATT
1366
+ CCCCAA
1367
+ CCCCAC
1368
+ CCCCAG
1369
+ CCCCAT
1370
+ CCCCCA
1371
+ CCCCCC
1372
+ CCCCCG
1373
+ CCCCCT
1374
+ CCCCGA
1375
+ CCCCGC
1376
+ CCCCGG
1377
+ CCCCGT
1378
+ CCCCTA
1379
+ CCCCTC
1380
+ CCCCTG
1381
+ CCCCTT
1382
+ CCCGAA
1383
+ CCCGAC
1384
+ CCCGAG
1385
+ CCCGAT
1386
+ CCCGCA
1387
+ CCCGCC
1388
+ CCCGCG
1389
+ CCCGCT
1390
+ CCCGGA
1391
+ CCCGGC
1392
+ CCCGGG
1393
+ CCCGGT
1394
+ CCCGTA
1395
+ CCCGTC
1396
+ CCCGTG
1397
+ CCCGTT
1398
+ CCCTAA
1399
+ CCCTAC
1400
+ CCCTAG
1401
+ CCCTAT
1402
+ CCCTCA
1403
+ CCCTCC
1404
+ CCCTCG
1405
+ CCCTCT
1406
+ CCCTGA
1407
+ CCCTGC
1408
+ CCCTGG
1409
+ CCCTGT
1410
+ CCCTTA
1411
+ CCCTTC
1412
+ CCCTTG
1413
+ CCCTTT
1414
+ CCGAAA
1415
+ CCGAAC
1416
+ CCGAAG
1417
+ CCGAAT
1418
+ CCGACA
1419
+ CCGACC
1420
+ CCGACG
1421
+ CCGACT
1422
+ CCGAGA
1423
+ CCGAGC
1424
+ CCGAGG
1425
+ CCGAGT
1426
+ CCGATA
1427
+ CCGATC
1428
+ CCGATG
1429
+ CCGATT
1430
+ CCGCAA
1431
+ CCGCAC
1432
+ CCGCAG
1433
+ CCGCAT
1434
+ CCGCCA
1435
+ CCGCCC
1436
+ CCGCCG
1437
+ CCGCCT
1438
+ CCGCGA
1439
+ CCGCGC
1440
+ CCGCGG
1441
+ CCGCGT
1442
+ CCGCTA
1443
+ CCGCTC
1444
+ CCGCTG
1445
+ CCGCTT
1446
+ CCGGAA
1447
+ CCGGAC
1448
+ CCGGAG
1449
+ CCGGAT
1450
+ CCGGCA
1451
+ CCGGCC
1452
+ CCGGCG
1453
+ CCGGCT
1454
+ CCGGGA
1455
+ CCGGGC
1456
+ CCGGGG
1457
+ CCGGGT
1458
+ CCGGTA
1459
+ CCGGTC
1460
+ CCGGTG
1461
+ CCGGTT
1462
+ CCGTAA
1463
+ CCGTAC
1464
+ CCGTAG
1465
+ CCGTAT
1466
+ CCGTCA
1467
+ CCGTCC
1468
+ CCGTCG
1469
+ CCGTCT
1470
+ CCGTGA
1471
+ CCGTGC
1472
+ CCGTGG
1473
+ CCGTGT
1474
+ CCGTTA
1475
+ CCGTTC
1476
+ CCGTTG
1477
+ CCGTTT
1478
+ CCTAAA
1479
+ CCTAAC
1480
+ CCTAAG
1481
+ CCTAAT
1482
+ CCTACA
1483
+ CCTACC
1484
+ CCTACG
1485
+ CCTACT
1486
+ CCTAGA
1487
+ CCTAGC
1488
+ CCTAGG
1489
+ CCTAGT
1490
+ CCTATA
1491
+ CCTATC
1492
+ CCTATG
1493
+ CCTATT
1494
+ CCTCAA
1495
+ CCTCAC
1496
+ CCTCAG
1497
+ CCTCAT
1498
+ CCTCCA
1499
+ CCTCCC
1500
+ CCTCCG
1501
+ CCTCCT
1502
+ CCTCGA
1503
+ CCTCGC
1504
+ CCTCGG
1505
+ CCTCGT
1506
+ CCTCTA
1507
+ CCTCTC
1508
+ CCTCTG
1509
+ CCTCTT
1510
+ CCTGAA
1511
+ CCTGAC
1512
+ CCTGAG
1513
+ CCTGAT
1514
+ CCTGCA
1515
+ CCTGCC
1516
+ CCTGCG
1517
+ CCTGCT
1518
+ CCTGGA
1519
+ CCTGGC
1520
+ CCTGGG
1521
+ CCTGGT
1522
+ CCTGTA
1523
+ CCTGTC
1524
+ CCTGTG
1525
+ CCTGTT
1526
+ CCTTAA
1527
+ CCTTAC
1528
+ CCTTAG
1529
+ CCTTAT
1530
+ CCTTCA
1531
+ CCTTCC
1532
+ CCTTCG
1533
+ CCTTCT
1534
+ CCTTGA
1535
+ CCTTGC
1536
+ CCTTGG
1537
+ CCTTGT
1538
+ CCTTTA
1539
+ CCTTTC
1540
+ CCTTTG
1541
+ CCTTTT
1542
+ CGAAAA
1543
+ CGAAAC
1544
+ CGAAAG
1545
+ CGAAAT
1546
+ CGAACA
1547
+ CGAACC
1548
+ CGAACG
1549
+ CGAACT
1550
+ CGAAGA
1551
+ CGAAGC
1552
+ CGAAGG
1553
+ CGAAGT
1554
+ CGAATA
1555
+ CGAATC
1556
+ CGAATG
1557
+ CGAATT
1558
+ CGACAA
1559
+ CGACAC
1560
+ CGACAG
1561
+ CGACAT
1562
+ CGACCA
1563
+ CGACCC
1564
+ CGACCG
1565
+ CGACCT
1566
+ CGACGA
1567
+ CGACGC
1568
+ CGACGG
1569
+ CGACGT
1570
+ CGACTA
1571
+ CGACTC
1572
+ CGACTG
1573
+ CGACTT
1574
+ CGAGAA
1575
+ CGAGAC
1576
+ CGAGAG
1577
+ CGAGAT
1578
+ CGAGCA
1579
+ CGAGCC
1580
+ CGAGCG
1581
+ CGAGCT
1582
+ CGAGGA
1583
+ CGAGGC
1584
+ CGAGGG
1585
+ CGAGGT
1586
+ CGAGTA
1587
+ CGAGTC
1588
+ CGAGTG
1589
+ CGAGTT
1590
+ CGATAA
1591
+ CGATAC
1592
+ CGATAG
1593
+ CGATAT
1594
+ CGATCA
1595
+ CGATCC
1596
+ CGATCG
1597
+ CGATCT
1598
+ CGATGA
1599
+ CGATGC
1600
+ CGATGG
1601
+ CGATGT
1602
+ CGATTA
1603
+ CGATTC
1604
+ CGATTG
1605
+ CGATTT
1606
+ CGCAAA
1607
+ CGCAAC
1608
+ CGCAAG
1609
+ CGCAAT
1610
+ CGCACA
1611
+ CGCACC
1612
+ CGCACG
1613
+ CGCACT
1614
+ CGCAGA
1615
+ CGCAGC
1616
+ CGCAGG
1617
+ CGCAGT
1618
+ CGCATA
1619
+ CGCATC
1620
+ CGCATG
1621
+ CGCATT
1622
+ CGCCAA
1623
+ CGCCAC
1624
+ CGCCAG
1625
+ CGCCAT
1626
+ CGCCCA
1627
+ CGCCCC
1628
+ CGCCCG
1629
+ CGCCCT
1630
+ CGCCGA
1631
+ CGCCGC
1632
+ CGCCGG
1633
+ CGCCGT
1634
+ CGCCTA
1635
+ CGCCTC
1636
+ CGCCTG
1637
+ CGCCTT
1638
+ CGCGAA
1639
+ CGCGAC
1640
+ CGCGAG
1641
+ CGCGAT
1642
+ CGCGCA
1643
+ CGCGCC
1644
+ CGCGCG
1645
+ CGCGCT
1646
+ CGCGGA
1647
+ CGCGGC
1648
+ CGCGGG
1649
+ CGCGGT
1650
+ CGCGTA
1651
+ CGCGTC
1652
+ CGCGTG
1653
+ CGCGTT
1654
+ CGCTAA
1655
+ CGCTAC
1656
+ CGCTAG
1657
+ CGCTAT
1658
+ CGCTCA
1659
+ CGCTCC
1660
+ CGCTCG
1661
+ CGCTCT
1662
+ CGCTGA
1663
+ CGCTGC
1664
+ CGCTGG
1665
+ CGCTGT
1666
+ CGCTTA
1667
+ CGCTTC
1668
+ CGCTTG
1669
+ CGCTTT
1670
+ CGGAAA
1671
+ CGGAAC
1672
+ CGGAAG
1673
+ CGGAAT
1674
+ CGGACA
1675
+ CGGACC
1676
+ CGGACG
1677
+ CGGACT
1678
+ CGGAGA
1679
+ CGGAGC
1680
+ CGGAGG
1681
+ CGGAGT
1682
+ CGGATA
1683
+ CGGATC
1684
+ CGGATG
1685
+ CGGATT
1686
+ CGGCAA
1687
+ CGGCAC
1688
+ CGGCAG
1689
+ CGGCAT
1690
+ CGGCCA
1691
+ CGGCCC
1692
+ CGGCCG
1693
+ CGGCCT
1694
+ CGGCGA
1695
+ CGGCGC
1696
+ CGGCGG
1697
+ CGGCGT
1698
+ CGGCTA
1699
+ CGGCTC
1700
+ CGGCTG
1701
+ CGGCTT
1702
+ CGGGAA
1703
+ CGGGAC
1704
+ CGGGAG
1705
+ CGGGAT
1706
+ CGGGCA
1707
+ CGGGCC
1708
+ CGGGCG
1709
+ CGGGCT
1710
+ CGGGGA
1711
+ CGGGGC
1712
+ CGGGGG
1713
+ CGGGGT
1714
+ CGGGTA
1715
+ CGGGTC
1716
+ CGGGTG
1717
+ CGGGTT
1718
+ CGGTAA
1719
+ CGGTAC
1720
+ CGGTAG
1721
+ CGGTAT
1722
+ CGGTCA
1723
+ CGGTCC
1724
+ CGGTCG
1725
+ CGGTCT
1726
+ CGGTGA
1727
+ CGGTGC
1728
+ CGGTGG
1729
+ CGGTGT
1730
+ CGGTTA
1731
+ CGGTTC
1732
+ CGGTTG
1733
+ CGGTTT
1734
+ CGTAAA
1735
+ CGTAAC
1736
+ CGTAAG
1737
+ CGTAAT
1738
+ CGTACA
1739
+ CGTACC
1740
+ CGTACG
1741
+ CGTACT
1742
+ CGTAGA
1743
+ CGTAGC
1744
+ CGTAGG
1745
+ CGTAGT
1746
+ CGTATA
1747
+ CGTATC
1748
+ CGTATG
1749
+ CGTATT
1750
+ CGTCAA
1751
+ CGTCAC
1752
+ CGTCAG
1753
+ CGTCAT
1754
+ CGTCCA
1755
+ CGTCCC
1756
+ CGTCCG
1757
+ CGTCCT
1758
+ CGTCGA
1759
+ CGTCGC
1760
+ CGTCGG
1761
+ CGTCGT
1762
+ CGTCTA
1763
+ CGTCTC
1764
+ CGTCTG
1765
+ CGTCTT
1766
+ CGTGAA
1767
+ CGTGAC
1768
+ CGTGAG
1769
+ CGTGAT
1770
+ CGTGCA
1771
+ CGTGCC
1772
+ CGTGCG
1773
+ CGTGCT
1774
+ CGTGGA
1775
+ CGTGGC
1776
+ CGTGGG
1777
+ CGTGGT
1778
+ CGTGTA
1779
+ CGTGTC
1780
+ CGTGTG
1781
+ CGTGTT
1782
+ CGTTAA
1783
+ CGTTAC
1784
+ CGTTAG
1785
+ CGTTAT
1786
+ CGTTCA
1787
+ CGTTCC
1788
+ CGTTCG
1789
+ CGTTCT
1790
+ CGTTGA
1791
+ CGTTGC
1792
+ CGTTGG
1793
+ CGTTGT
1794
+ CGTTTA
1795
+ CGTTTC
1796
+ CGTTTG
1797
+ CGTTTT
1798
+ CTAAAA
1799
+ CTAAAC
1800
+ CTAAAG
1801
+ CTAAAT
1802
+ CTAACA
1803
+ CTAACC
1804
+ CTAACG
1805
+ CTAACT
1806
+ CTAAGA
1807
+ CTAAGC
1808
+ CTAAGG
1809
+ CTAAGT
1810
+ CTAATA
1811
+ CTAATC
1812
+ CTAATG
1813
+ CTAATT
1814
+ CTACAA
1815
+ CTACAC
1816
+ CTACAG
1817
+ CTACAT
1818
+ CTACCA
1819
+ CTACCC
1820
+ CTACCG
1821
+ CTACCT
1822
+ CTACGA
1823
+ CTACGC
1824
+ CTACGG
1825
+ CTACGT
1826
+ CTACTA
1827
+ CTACTC
1828
+ CTACTG
1829
+ CTACTT
1830
+ CTAGAA
1831
+ CTAGAC
1832
+ CTAGAG
1833
+ CTAGAT
1834
+ CTAGCA
1835
+ CTAGCC
1836
+ CTAGCG
1837
+ CTAGCT
1838
+ CTAGGA
1839
+ CTAGGC
1840
+ CTAGGG
1841
+ CTAGGT
1842
+ CTAGTA
1843
+ CTAGTC
1844
+ CTAGTG
1845
+ CTAGTT
1846
+ CTATAA
1847
+ CTATAC
1848
+ CTATAG
1849
+ CTATAT
1850
+ CTATCA
1851
+ CTATCC
1852
+ CTATCG
1853
+ CTATCT
1854
+ CTATGA
1855
+ CTATGC
1856
+ CTATGG
1857
+ CTATGT
1858
+ CTATTA
1859
+ CTATTC
1860
+ CTATTG
1861
+ CTATTT
1862
+ CTCAAA
1863
+ CTCAAC
1864
+ CTCAAG
1865
+ CTCAAT
1866
+ CTCACA
1867
+ CTCACC
1868
+ CTCACG
1869
+ CTCACT
1870
+ CTCAGA
1871
+ CTCAGC
1872
+ CTCAGG
1873
+ CTCAGT
1874
+ CTCATA
1875
+ CTCATC
1876
+ CTCATG
1877
+ CTCATT
1878
+ CTCCAA
1879
+ CTCCAC
1880
+ CTCCAG
1881
+ CTCCAT
1882
+ CTCCCA
1883
+ CTCCCC
1884
+ CTCCCG
1885
+ CTCCCT
1886
+ CTCCGA
1887
+ CTCCGC
1888
+ CTCCGG
1889
+ CTCCGT
1890
+ CTCCTA
1891
+ CTCCTC
1892
+ CTCCTG
1893
+ CTCCTT
1894
+ CTCGAA
1895
+ CTCGAC
1896
+ CTCGAG
1897
+ CTCGAT
1898
+ CTCGCA
1899
+ CTCGCC
1900
+ CTCGCG
1901
+ CTCGCT
1902
+ CTCGGA
1903
+ CTCGGC
1904
+ CTCGGG
1905
+ CTCGGT
1906
+ CTCGTA
1907
+ CTCGTC
1908
+ CTCGTG
1909
+ CTCGTT
1910
+ CTCTAA
1911
+ CTCTAC
1912
+ CTCTAG
1913
+ CTCTAT
1914
+ CTCTCA
1915
+ CTCTCC
1916
+ CTCTCG
1917
+ CTCTCT
1918
+ CTCTGA
1919
+ CTCTGC
1920
+ CTCTGG
1921
+ CTCTGT
1922
+ CTCTTA
1923
+ CTCTTC
1924
+ CTCTTG
1925
+ CTCTTT
1926
+ CTGAAA
1927
+ CTGAAC
1928
+ CTGAAG
1929
+ CTGAAT
1930
+ CTGACA
1931
+ CTGACC
1932
+ CTGACG
1933
+ CTGACT
1934
+ CTGAGA
1935
+ CTGAGC
1936
+ CTGAGG
1937
+ CTGAGT
1938
+ CTGATA
1939
+ CTGATC
1940
+ CTGATG
1941
+ CTGATT
1942
+ CTGCAA
1943
+ CTGCAC
1944
+ CTGCAG
1945
+ CTGCAT
1946
+ CTGCCA
1947
+ CTGCCC
1948
+ CTGCCG
1949
+ CTGCCT
1950
+ CTGCGA
1951
+ CTGCGC
1952
+ CTGCGG
1953
+ CTGCGT
1954
+ CTGCTA
1955
+ CTGCTC
1956
+ CTGCTG
1957
+ CTGCTT
1958
+ CTGGAA
1959
+ CTGGAC
1960
+ CTGGAG
1961
+ CTGGAT
1962
+ CTGGCA
1963
+ CTGGCC
1964
+ CTGGCG
1965
+ CTGGCT
1966
+ CTGGGA
1967
+ CTGGGC
1968
+ CTGGGG
1969
+ CTGGGT
1970
+ CTGGTA
1971
+ CTGGTC
1972
+ CTGGTG
1973
+ CTGGTT
1974
+ CTGTAA
1975
+ CTGTAC
1976
+ CTGTAG
1977
+ CTGTAT
1978
+ CTGTCA
1979
+ CTGTCC
1980
+ CTGTCG
1981
+ CTGTCT
1982
+ CTGTGA
1983
+ CTGTGC
1984
+ CTGTGG
1985
+ CTGTGT
1986
+ CTGTTA
1987
+ CTGTTC
1988
+ CTGTTG
1989
+ CTGTTT
1990
+ CTTAAA
1991
+ CTTAAC
1992
+ CTTAAG
1993
+ CTTAAT
1994
+ CTTACA
1995
+ CTTACC
1996
+ CTTACG
1997
+ CTTACT
1998
+ CTTAGA
1999
+ CTTAGC
2000
+ CTTAGG
2001
+ CTTAGT
2002
+ CTTATA
2003
+ CTTATC
2004
+ CTTATG
2005
+ CTTATT
2006
+ CTTCAA
2007
+ CTTCAC
2008
+ CTTCAG
2009
+ CTTCAT
2010
+ CTTCCA
2011
+ CTTCCC
2012
+ CTTCCG
2013
+ CTTCCT
2014
+ CTTCGA
2015
+ CTTCGC
2016
+ CTTCGG
2017
+ CTTCGT
2018
+ CTTCTA
2019
+ CTTCTC
2020
+ CTTCTG
2021
+ CTTCTT
2022
+ CTTGAA
2023
+ CTTGAC
2024
+ CTTGAG
2025
+ CTTGAT
2026
+ CTTGCA
2027
+ CTTGCC
2028
+ CTTGCG
2029
+ CTTGCT
2030
+ CTTGGA
2031
+ CTTGGC
2032
+ CTTGGG
2033
+ CTTGGT
2034
+ CTTGTA
2035
+ CTTGTC
2036
+ CTTGTG
2037
+ CTTGTT
2038
+ CTTTAA
2039
+ CTTTAC
2040
+ CTTTAG
2041
+ CTTTAT
2042
+ CTTTCA
2043
+ CTTTCC
2044
+ CTTTCG
2045
+ CTTTCT
2046
+ CTTTGA
2047
+ CTTTGC
2048
+ CTTTGG
2049
+ CTTTGT
2050
+ CTTTTA
2051
+ CTTTTC
2052
+ CTTTTG
2053
+ CTTTTT
2054
+ GAAAAA
2055
+ GAAAAC
2056
+ GAAAAG
2057
+ GAAAAT
2058
+ GAAACA
2059
+ GAAACC
2060
+ GAAACG
2061
+ GAAACT
2062
+ GAAAGA
2063
+ GAAAGC
2064
+ GAAAGG
2065
+ GAAAGT
2066
+ GAAATA
2067
+ GAAATC
2068
+ GAAATG
2069
+ GAAATT
2070
+ GAACAA
2071
+ GAACAC
2072
+ GAACAG
2073
+ GAACAT
2074
+ GAACCA
2075
+ GAACCC
2076
+ GAACCG
2077
+ GAACCT
2078
+ GAACGA
2079
+ GAACGC
2080
+ GAACGG
2081
+ GAACGT
2082
+ GAACTA
2083
+ GAACTC
2084
+ GAACTG
2085
+ GAACTT
2086
+ GAAGAA
2087
+ GAAGAC
2088
+ GAAGAG
2089
+ GAAGAT
2090
+ GAAGCA
2091
+ GAAGCC
2092
+ GAAGCG
2093
+ GAAGCT
2094
+ GAAGGA
2095
+ GAAGGC
2096
+ GAAGGG
2097
+ GAAGGT
2098
+ GAAGTA
2099
+ GAAGTC
2100
+ GAAGTG
2101
+ GAAGTT
2102
+ GAATAA
2103
+ GAATAC
2104
+ GAATAG
2105
+ GAATAT
2106
+ GAATCA
2107
+ GAATCC
2108
+ GAATCG
2109
+ GAATCT
2110
+ GAATGA
2111
+ GAATGC
2112
+ GAATGG
2113
+ GAATGT
2114
+ GAATTA
2115
+ GAATTC
2116
+ GAATTG
2117
+ GAATTT
2118
+ GACAAA
2119
+ GACAAC
2120
+ GACAAG
2121
+ GACAAT
2122
+ GACACA
2123
+ GACACC
2124
+ GACACG
2125
+ GACACT
2126
+ GACAGA
2127
+ GACAGC
2128
+ GACAGG
2129
+ GACAGT
2130
+ GACATA
2131
+ GACATC
2132
+ GACATG
2133
+ GACATT
2134
+ GACCAA
2135
+ GACCAC
2136
+ GACCAG
2137
+ GACCAT
2138
+ GACCCA
2139
+ GACCCC
2140
+ GACCCG
2141
+ GACCCT
2142
+ GACCGA
2143
+ GACCGC
2144
+ GACCGG
2145
+ GACCGT
2146
+ GACCTA
2147
+ GACCTC
2148
+ GACCTG
2149
+ GACCTT
2150
+ GACGAA
2151
+ GACGAC
2152
+ GACGAG
2153
+ GACGAT
2154
+ GACGCA
2155
+ GACGCC
2156
+ GACGCG
2157
+ GACGCT
2158
+ GACGGA
2159
+ GACGGC
2160
+ GACGGG
2161
+ GACGGT
2162
+ GACGTA
2163
+ GACGTC
2164
+ GACGTG
2165
+ GACGTT
2166
+ GACTAA
2167
+ GACTAC
2168
+ GACTAG
2169
+ GACTAT
2170
+ GACTCA
2171
+ GACTCC
2172
+ GACTCG
2173
+ GACTCT
2174
+ GACTGA
2175
+ GACTGC
2176
+ GACTGG
2177
+ GACTGT
2178
+ GACTTA
2179
+ GACTTC
2180
+ GACTTG
2181
+ GACTTT
2182
+ GAGAAA
2183
+ GAGAAC
2184
+ GAGAAG
2185
+ GAGAAT
2186
+ GAGACA
2187
+ GAGACC
2188
+ GAGACG
2189
+ GAGACT
2190
+ GAGAGA
2191
+ GAGAGC
2192
+ GAGAGG
2193
+ GAGAGT
2194
+ GAGATA
2195
+ GAGATC
2196
+ GAGATG
2197
+ GAGATT
2198
+ GAGCAA
2199
+ GAGCAC
2200
+ GAGCAG
2201
+ GAGCAT
2202
+ GAGCCA
2203
+ GAGCCC
2204
+ GAGCCG
2205
+ GAGCCT
2206
+ GAGCGA
2207
+ GAGCGC
2208
+ GAGCGG
2209
+ GAGCGT
2210
+ GAGCTA
2211
+ GAGCTC
2212
+ GAGCTG
2213
+ GAGCTT
2214
+ GAGGAA
2215
+ GAGGAC
2216
+ GAGGAG
2217
+ GAGGAT
2218
+ GAGGCA
2219
+ GAGGCC
2220
+ GAGGCG
2221
+ GAGGCT
2222
+ GAGGGA
2223
+ GAGGGC
2224
+ GAGGGG
2225
+ GAGGGT
2226
+ GAGGTA
2227
+ GAGGTC
2228
+ GAGGTG
2229
+ GAGGTT
2230
+ GAGTAA
2231
+ GAGTAC
2232
+ GAGTAG
2233
+ GAGTAT
2234
+ GAGTCA
2235
+ GAGTCC
2236
+ GAGTCG
2237
+ GAGTCT
2238
+ GAGTGA
2239
+ GAGTGC
2240
+ GAGTGG
2241
+ GAGTGT
2242
+ GAGTTA
2243
+ GAGTTC
2244
+ GAGTTG
2245
+ GAGTTT
2246
+ GATAAA
2247
+ GATAAC
2248
+ GATAAG
2249
+ GATAAT
2250
+ GATACA
2251
+ GATACC
2252
+ GATACG
2253
+ GATACT
2254
+ GATAGA
2255
+ GATAGC
2256
+ GATAGG
2257
+ GATAGT
2258
+ GATATA
2259
+ GATATC
2260
+ GATATG
2261
+ GATATT
2262
+ GATCAA
2263
+ GATCAC
2264
+ GATCAG
2265
+ GATCAT
2266
+ GATCCA
2267
+ GATCCC
2268
+ GATCCG
2269
+ GATCCT
2270
+ GATCGA
2271
+ GATCGC
2272
+ GATCGG
2273
+ GATCGT
2274
+ GATCTA
2275
+ GATCTC
2276
+ GATCTG
2277
+ GATCTT
2278
+ GATGAA
2279
+ GATGAC
2280
+ GATGAG
2281
+ GATGAT
2282
+ GATGCA
2283
+ GATGCC
2284
+ GATGCG
2285
+ GATGCT
2286
+ GATGGA
2287
+ GATGGC
2288
+ GATGGG
2289
+ GATGGT
2290
+ GATGTA
2291
+ GATGTC
2292
+ GATGTG
2293
+ GATGTT
2294
+ GATTAA
2295
+ GATTAC
2296
+ GATTAG
2297
+ GATTAT
2298
+ GATTCA
2299
+ GATTCC
2300
+ GATTCG
2301
+ GATTCT
2302
+ GATTGA
2303
+ GATTGC
2304
+ GATTGG
2305
+ GATTGT
2306
+ GATTTA
2307
+ GATTTC
2308
+ GATTTG
2309
+ GATTTT
2310
+ GCAAAA
2311
+ GCAAAC
2312
+ GCAAAG
2313
+ GCAAAT
2314
+ GCAACA
2315
+ GCAACC
2316
+ GCAACG
2317
+ GCAACT
2318
+ GCAAGA
2319
+ GCAAGC
2320
+ GCAAGG
2321
+ GCAAGT
2322
+ GCAATA
2323
+ GCAATC
2324
+ GCAATG
2325
+ GCAATT
2326
+ GCACAA
2327
+ GCACAC
2328
+ GCACAG
2329
+ GCACAT
2330
+ GCACCA
2331
+ GCACCC
2332
+ GCACCG
2333
+ GCACCT
2334
+ GCACGA
2335
+ GCACGC
2336
+ GCACGG
2337
+ GCACGT
2338
+ GCACTA
2339
+ GCACTC
2340
+ GCACTG
2341
+ GCACTT
2342
+ GCAGAA
2343
+ GCAGAC
2344
+ GCAGAG
2345
+ GCAGAT
2346
+ GCAGCA
2347
+ GCAGCC
2348
+ GCAGCG
2349
+ GCAGCT
2350
+ GCAGGA
2351
+ GCAGGC
2352
+ GCAGGG
2353
+ GCAGGT
2354
+ GCAGTA
2355
+ GCAGTC
2356
+ GCAGTG
2357
+ GCAGTT
2358
+ GCATAA
2359
+ GCATAC
2360
+ GCATAG
2361
+ GCATAT
2362
+ GCATCA
2363
+ GCATCC
2364
+ GCATCG
2365
+ GCATCT
2366
+ GCATGA
2367
+ GCATGC
2368
+ GCATGG
2369
+ GCATGT
2370
+ GCATTA
2371
+ GCATTC
2372
+ GCATTG
2373
+ GCATTT
2374
+ GCCAAA
2375
+ GCCAAC
2376
+ GCCAAG
2377
+ GCCAAT
2378
+ GCCACA
2379
+ GCCACC
2380
+ GCCACG
2381
+ GCCACT
2382
+ GCCAGA
2383
+ GCCAGC
2384
+ GCCAGG
2385
+ GCCAGT
2386
+ GCCATA
2387
+ GCCATC
2388
+ GCCATG
2389
+ GCCATT
2390
+ GCCCAA
2391
+ GCCCAC
2392
+ GCCCAG
2393
+ GCCCAT
2394
+ GCCCCA
2395
+ GCCCCC
2396
+ GCCCCG
2397
+ GCCCCT
2398
+ GCCCGA
2399
+ GCCCGC
2400
+ GCCCGG
2401
+ GCCCGT
2402
+ GCCCTA
2403
+ GCCCTC
2404
+ GCCCTG
2405
+ GCCCTT
2406
+ GCCGAA
2407
+ GCCGAC
2408
+ GCCGAG
2409
+ GCCGAT
2410
+ GCCGCA
2411
+ GCCGCC
2412
+ GCCGCG
2413
+ GCCGCT
2414
+ GCCGGA
2415
+ GCCGGC
2416
+ GCCGGG
2417
+ GCCGGT
2418
+ GCCGTA
2419
+ GCCGTC
2420
+ GCCGTG
2421
+ GCCGTT
2422
+ GCCTAA
2423
+ GCCTAC
2424
+ GCCTAG
2425
+ GCCTAT
2426
+ GCCTCA
2427
+ GCCTCC
2428
+ GCCTCG
2429
+ GCCTCT
2430
+ GCCTGA
2431
+ GCCTGC
2432
+ GCCTGG
2433
+ GCCTGT
2434
+ GCCTTA
2435
+ GCCTTC
2436
+ GCCTTG
2437
+ GCCTTT
2438
+ GCGAAA
2439
+ GCGAAC
2440
+ GCGAAG
2441
+ GCGAAT
2442
+ GCGACA
2443
+ GCGACC
2444
+ GCGACG
2445
+ GCGACT
2446
+ GCGAGA
2447
+ GCGAGC
2448
+ GCGAGG
2449
+ GCGAGT
2450
+ GCGATA
2451
+ GCGATC
2452
+ GCGATG
2453
+ GCGATT
2454
+ GCGCAA
2455
+ GCGCAC
2456
+ GCGCAG
2457
+ GCGCAT
2458
+ GCGCCA
2459
+ GCGCCC
2460
+ GCGCCG
2461
+ GCGCCT
2462
+ GCGCGA
2463
+ GCGCGC
2464
+ GCGCGG
2465
+ GCGCGT
2466
+ GCGCTA
2467
+ GCGCTC
2468
+ GCGCTG
2469
+ GCGCTT
2470
+ GCGGAA
2471
+ GCGGAC
2472
+ GCGGAG
2473
+ GCGGAT
2474
+ GCGGCA
2475
+ GCGGCC
2476
+ GCGGCG
2477
+ GCGGCT
2478
+ GCGGGA
2479
+ GCGGGC
2480
+ GCGGGG
2481
+ GCGGGT
2482
+ GCGGTA
2483
+ GCGGTC
2484
+ GCGGTG
2485
+ GCGGTT
2486
+ GCGTAA
2487
+ GCGTAC
2488
+ GCGTAG
2489
+ GCGTAT
2490
+ GCGTCA
2491
+ GCGTCC
2492
+ GCGTCG
2493
+ GCGTCT
2494
+ GCGTGA
2495
+ GCGTGC
2496
+ GCGTGG
2497
+ GCGTGT
2498
+ GCGTTA
2499
+ GCGTTC
2500
+ GCGTTG
2501
+ GCGTTT
2502
+ GCTAAA
2503
+ GCTAAC
2504
+ GCTAAG
2505
+ GCTAAT
2506
+ GCTACA
2507
+ GCTACC
2508
+ GCTACG
2509
+ GCTACT
2510
+ GCTAGA
2511
+ GCTAGC
2512
+ GCTAGG
2513
+ GCTAGT
2514
+ GCTATA
2515
+ GCTATC
2516
+ GCTATG
2517
+ GCTATT
2518
+ GCTCAA
2519
+ GCTCAC
2520
+ GCTCAG
2521
+ GCTCAT
2522
+ GCTCCA
2523
+ GCTCCC
2524
+ GCTCCG
2525
+ GCTCCT
2526
+ GCTCGA
2527
+ GCTCGC
2528
+ GCTCGG
2529
+ GCTCGT
2530
+ GCTCTA
2531
+ GCTCTC
2532
+ GCTCTG
2533
+ GCTCTT
2534
+ GCTGAA
2535
+ GCTGAC
2536
+ GCTGAG
2537
+ GCTGAT
2538
+ GCTGCA
2539
+ GCTGCC
2540
+ GCTGCG
2541
+ GCTGCT
2542
+ GCTGGA
2543
+ GCTGGC
2544
+ GCTGGG
2545
+ GCTGGT
2546
+ GCTGTA
2547
+ GCTGTC
2548
+ GCTGTG
2549
+ GCTGTT
2550
+ GCTTAA
2551
+ GCTTAC
2552
+ GCTTAG
2553
+ GCTTAT
2554
+ GCTTCA
2555
+ GCTTCC
2556
+ GCTTCG
2557
+ GCTTCT
2558
+ GCTTGA
2559
+ GCTTGC
2560
+ GCTTGG
2561
+ GCTTGT
2562
+ GCTTTA
2563
+ GCTTTC
2564
+ GCTTTG
2565
+ GCTTTT
2566
+ GGAAAA
2567
+ GGAAAC
2568
+ GGAAAG
2569
+ GGAAAT
2570
+ GGAACA
2571
+ GGAACC
2572
+ GGAACG
2573
+ GGAACT
2574
+ GGAAGA
2575
+ GGAAGC
2576
+ GGAAGG
2577
+ GGAAGT
2578
+ GGAATA
2579
+ GGAATC
2580
+ GGAATG
2581
+ GGAATT
2582
+ GGACAA
2583
+ GGACAC
2584
+ GGACAG
2585
+ GGACAT
2586
+ GGACCA
2587
+ GGACCC
2588
+ GGACCG
2589
+ GGACCT
2590
+ GGACGA
2591
+ GGACGC
2592
+ GGACGG
2593
+ GGACGT
2594
+ GGACTA
2595
+ GGACTC
2596
+ GGACTG
2597
+ GGACTT
2598
+ GGAGAA
2599
+ GGAGAC
2600
+ GGAGAG
2601
+ GGAGAT
2602
+ GGAGCA
2603
+ GGAGCC
2604
+ GGAGCG
2605
+ GGAGCT
2606
+ GGAGGA
2607
+ GGAGGC
2608
+ GGAGGG
2609
+ GGAGGT
2610
+ GGAGTA
2611
+ GGAGTC
2612
+ GGAGTG
2613
+ GGAGTT
2614
+ GGATAA
2615
+ GGATAC
2616
+ GGATAG
2617
+ GGATAT
2618
+ GGATCA
2619
+ GGATCC
2620
+ GGATCG
2621
+ GGATCT
2622
+ GGATGA
2623
+ GGATGC
2624
+ GGATGG
2625
+ GGATGT
2626
+ GGATTA
2627
+ GGATTC
2628
+ GGATTG
2629
+ GGATTT
2630
+ GGCAAA
2631
+ GGCAAC
2632
+ GGCAAG
2633
+ GGCAAT
2634
+ GGCACA
2635
+ GGCACC
2636
+ GGCACG
2637
+ GGCACT
2638
+ GGCAGA
2639
+ GGCAGC
2640
+ GGCAGG
2641
+ GGCAGT
2642
+ GGCATA
2643
+ GGCATC
2644
+ GGCATG
2645
+ GGCATT
2646
+ GGCCAA
2647
+ GGCCAC
2648
+ GGCCAG
2649
+ GGCCAT
2650
+ GGCCCA
2651
+ GGCCCC
2652
+ GGCCCG
2653
+ GGCCCT
2654
+ GGCCGA
2655
+ GGCCGC
2656
+ GGCCGG
2657
+ GGCCGT
2658
+ GGCCTA
2659
+ GGCCTC
2660
+ GGCCTG
2661
+ GGCCTT
2662
+ GGCGAA
2663
+ GGCGAC
2664
+ GGCGAG
2665
+ GGCGAT
2666
+ GGCGCA
2667
+ GGCGCC
2668
+ GGCGCG
2669
+ GGCGCT
2670
+ GGCGGA
2671
+ GGCGGC
2672
+ GGCGGG
2673
+ GGCGGT
2674
+ GGCGTA
2675
+ GGCGTC
2676
+ GGCGTG
2677
+ GGCGTT
2678
+ GGCTAA
2679
+ GGCTAC
2680
+ GGCTAG
2681
+ GGCTAT
2682
+ GGCTCA
2683
+ GGCTCC
2684
+ GGCTCG
2685
+ GGCTCT
2686
+ GGCTGA
2687
+ GGCTGC
2688
+ GGCTGG
2689
+ GGCTGT
2690
+ GGCTTA
2691
+ GGCTTC
2692
+ GGCTTG
2693
+ GGCTTT
2694
+ GGGAAA
2695
+ GGGAAC
2696
+ GGGAAG
2697
+ GGGAAT
2698
+ GGGACA
2699
+ GGGACC
2700
+ GGGACG
2701
+ GGGACT
2702
+ GGGAGA
2703
+ GGGAGC
2704
+ GGGAGG
2705
+ GGGAGT
2706
+ GGGATA
2707
+ GGGATC
2708
+ GGGATG
2709
+ GGGATT
2710
+ GGGCAA
2711
+ GGGCAC
2712
+ GGGCAG
2713
+ GGGCAT
2714
+ GGGCCA
2715
+ GGGCCC
2716
+ GGGCCG
2717
+ GGGCCT
2718
+ GGGCGA
2719
+ GGGCGC
2720
+ GGGCGG
2721
+ GGGCGT
2722
+ GGGCTA
2723
+ GGGCTC
2724
+ GGGCTG
2725
+ GGGCTT
2726
+ GGGGAA
2727
+ GGGGAC
2728
+ GGGGAG
2729
+ GGGGAT
2730
+ GGGGCA
2731
+ GGGGCC
2732
+ GGGGCG
2733
+ GGGGCT
2734
+ GGGGGA
2735
+ GGGGGC
2736
+ GGGGGG
2737
+ GGGGGT
2738
+ GGGGTA
2739
+ GGGGTC
2740
+ GGGGTG
2741
+ GGGGTT
2742
+ GGGTAA
2743
+ GGGTAC
2744
+ GGGTAG
2745
+ GGGTAT
2746
+ GGGTCA
2747
+ GGGTCC
2748
+ GGGTCG
2749
+ GGGTCT
2750
+ GGGTGA
2751
+ GGGTGC
2752
+ GGGTGG
2753
+ GGGTGT
2754
+ GGGTTA
2755
+ GGGTTC
2756
+ GGGTTG
2757
+ GGGTTT
2758
+ GGTAAA
2759
+ GGTAAC
2760
+ GGTAAG
2761
+ GGTAAT
2762
+ GGTACA
2763
+ GGTACC
2764
+ GGTACG
2765
+ GGTACT
2766
+ GGTAGA
2767
+ GGTAGC
2768
+ GGTAGG
2769
+ GGTAGT
2770
+ GGTATA
2771
+ GGTATC
2772
+ GGTATG
2773
+ GGTATT
2774
+ GGTCAA
2775
+ GGTCAC
2776
+ GGTCAG
2777
+ GGTCAT
2778
+ GGTCCA
2779
+ GGTCCC
2780
+ GGTCCG
2781
+ GGTCCT
2782
+ GGTCGA
2783
+ GGTCGC
2784
+ GGTCGG
2785
+ GGTCGT
2786
+ GGTCTA
2787
+ GGTCTC
2788
+ GGTCTG
2789
+ GGTCTT
2790
+ GGTGAA
2791
+ GGTGAC
2792
+ GGTGAG
2793
+ GGTGAT
2794
+ GGTGCA
2795
+ GGTGCC
2796
+ GGTGCG
2797
+ GGTGCT
2798
+ GGTGGA
2799
+ GGTGGC
2800
+ GGTGGG
2801
+ GGTGGT
2802
+ GGTGTA
2803
+ GGTGTC
2804
+ GGTGTG
2805
+ GGTGTT
2806
+ GGTTAA
2807
+ GGTTAC
2808
+ GGTTAG
2809
+ GGTTAT
2810
+ GGTTCA
2811
+ GGTTCC
2812
+ GGTTCG
2813
+ GGTTCT
2814
+ GGTTGA
2815
+ GGTTGC
2816
+ GGTTGG
2817
+ GGTTGT
2818
+ GGTTTA
2819
+ GGTTTC
2820
+ GGTTTG
2821
+ GGTTTT
2822
+ GTAAAA
2823
+ GTAAAC
2824
+ GTAAAG
2825
+ GTAAAT
2826
+ GTAACA
2827
+ GTAACC
2828
+ GTAACG
2829
+ GTAACT
2830
+ GTAAGA
2831
+ GTAAGC
2832
+ GTAAGG
2833
+ GTAAGT
2834
+ GTAATA
2835
+ GTAATC
2836
+ GTAATG
2837
+ GTAATT
2838
+ GTACAA
2839
+ GTACAC
2840
+ GTACAG
2841
+ GTACAT
2842
+ GTACCA
2843
+ GTACCC
2844
+ GTACCG
2845
+ GTACCT
2846
+ GTACGA
2847
+ GTACGC
2848
+ GTACGG
2849
+ GTACGT
2850
+ GTACTA
2851
+ GTACTC
2852
+ GTACTG
2853
+ GTACTT
2854
+ GTAGAA
2855
+ GTAGAC
2856
+ GTAGAG
2857
+ GTAGAT
2858
+ GTAGCA
2859
+ GTAGCC
2860
+ GTAGCG
2861
+ GTAGCT
2862
+ GTAGGA
2863
+ GTAGGC
2864
+ GTAGGG
2865
+ GTAGGT
2866
+ GTAGTA
2867
+ GTAGTC
2868
+ GTAGTG
2869
+ GTAGTT
2870
+ GTATAA
2871
+ GTATAC
2872
+ GTATAG
2873
+ GTATAT
2874
+ GTATCA
2875
+ GTATCC
2876
+ GTATCG
2877
+ GTATCT
2878
+ GTATGA
2879
+ GTATGC
2880
+ GTATGG
2881
+ GTATGT
2882
+ GTATTA
2883
+ GTATTC
2884
+ GTATTG
2885
+ GTATTT
2886
+ GTCAAA
2887
+ GTCAAC
2888
+ GTCAAG
2889
+ GTCAAT
2890
+ GTCACA
2891
+ GTCACC
2892
+ GTCACG
2893
+ GTCACT
2894
+ GTCAGA
2895
+ GTCAGC
2896
+ GTCAGG
2897
+ GTCAGT
2898
+ GTCATA
2899
+ GTCATC
2900
+ GTCATG
2901
+ GTCATT
2902
+ GTCCAA
2903
+ GTCCAC
2904
+ GTCCAG
2905
+ GTCCAT
2906
+ GTCCCA
2907
+ GTCCCC
2908
+ GTCCCG
2909
+ GTCCCT
2910
+ GTCCGA
2911
+ GTCCGC
2912
+ GTCCGG
2913
+ GTCCGT
2914
+ GTCCTA
2915
+ GTCCTC
2916
+ GTCCTG
2917
+ GTCCTT
2918
+ GTCGAA
2919
+ GTCGAC
2920
+ GTCGAG
2921
+ GTCGAT
2922
+ GTCGCA
2923
+ GTCGCC
2924
+ GTCGCG
2925
+ GTCGCT
2926
+ GTCGGA
2927
+ GTCGGC
2928
+ GTCGGG
2929
+ GTCGGT
2930
+ GTCGTA
2931
+ GTCGTC
2932
+ GTCGTG
2933
+ GTCGTT
2934
+ GTCTAA
2935
+ GTCTAC
2936
+ GTCTAG
2937
+ GTCTAT
2938
+ GTCTCA
2939
+ GTCTCC
2940
+ GTCTCG
2941
+ GTCTCT
2942
+ GTCTGA
2943
+ GTCTGC
2944
+ GTCTGG
2945
+ GTCTGT
2946
+ GTCTTA
2947
+ GTCTTC
2948
+ GTCTTG
2949
+ GTCTTT
2950
+ GTGAAA
2951
+ GTGAAC
2952
+ GTGAAG
2953
+ GTGAAT
2954
+ GTGACA
2955
+ GTGACC
2956
+ GTGACG
2957
+ GTGACT
2958
+ GTGAGA
2959
+ GTGAGC
2960
+ GTGAGG
2961
+ GTGAGT
2962
+ GTGATA
2963
+ GTGATC
2964
+ GTGATG
2965
+ GTGATT
2966
+ GTGCAA
2967
+ GTGCAC
2968
+ GTGCAG
2969
+ GTGCAT
2970
+ GTGCCA
2971
+ GTGCCC
2972
+ GTGCCG
2973
+ GTGCCT
2974
+ GTGCGA
2975
+ GTGCGC
2976
+ GTGCGG
2977
+ GTGCGT
2978
+ GTGCTA
2979
+ GTGCTC
2980
+ GTGCTG
2981
+ GTGCTT
2982
+ GTGGAA
2983
+ GTGGAC
2984
+ GTGGAG
2985
+ GTGGAT
2986
+ GTGGCA
2987
+ GTGGCC
2988
+ GTGGCG
2989
+ GTGGCT
2990
+ GTGGGA
2991
+ GTGGGC
2992
+ GTGGGG
2993
+ GTGGGT
2994
+ GTGGTA
2995
+ GTGGTC
2996
+ GTGGTG
2997
+ GTGGTT
2998
+ GTGTAA
2999
+ GTGTAC
3000
+ GTGTAG
3001
+ GTGTAT
3002
+ GTGTCA
3003
+ GTGTCC
3004
+ GTGTCG
3005
+ GTGTCT
3006
+ GTGTGA
3007
+ GTGTGC
3008
+ GTGTGG
3009
+ GTGTGT
3010
+ GTGTTA
3011
+ GTGTTC
3012
+ GTGTTG
3013
+ GTGTTT
3014
+ GTTAAA
3015
+ GTTAAC
3016
+ GTTAAG
3017
+ GTTAAT
3018
+ GTTACA
3019
+ GTTACC
3020
+ GTTACG
3021
+ GTTACT
3022
+ GTTAGA
3023
+ GTTAGC
3024
+ GTTAGG
3025
+ GTTAGT
3026
+ GTTATA
3027
+ GTTATC
3028
+ GTTATG
3029
+ GTTATT
3030
+ GTTCAA
3031
+ GTTCAC
3032
+ GTTCAG
3033
+ GTTCAT
3034
+ GTTCCA
3035
+ GTTCCC
3036
+ GTTCCG
3037
+ GTTCCT
3038
+ GTTCGA
3039
+ GTTCGC
3040
+ GTTCGG
3041
+ GTTCGT
3042
+ GTTCTA
3043
+ GTTCTC
3044
+ GTTCTG
3045
+ GTTCTT
3046
+ GTTGAA
3047
+ GTTGAC
3048
+ GTTGAG
3049
+ GTTGAT
3050
+ GTTGCA
3051
+ GTTGCC
3052
+ GTTGCG
3053
+ GTTGCT
3054
+ GTTGGA
3055
+ GTTGGC
3056
+ GTTGGG
3057
+ GTTGGT
3058
+ GTTGTA
3059
+ GTTGTC
3060
+ GTTGTG
3061
+ GTTGTT
3062
+ GTTTAA
3063
+ GTTTAC
3064
+ GTTTAG
3065
+ GTTTAT
3066
+ GTTTCA
3067
+ GTTTCC
3068
+ GTTTCG
3069
+ GTTTCT
3070
+ GTTTGA
3071
+ GTTTGC
3072
+ GTTTGG
3073
+ GTTTGT
3074
+ GTTTTA
3075
+ GTTTTC
3076
+ GTTTTG
3077
+ GTTTTT
3078
+ TAAAAA
3079
+ TAAAAC
3080
+ TAAAAG
3081
+ TAAAAT
3082
+ TAAACA
3083
+ TAAACC
3084
+ TAAACG
3085
+ TAAACT
3086
+ TAAAGA
3087
+ TAAAGC
3088
+ TAAAGG
3089
+ TAAAGT
3090
+ TAAATA
3091
+ TAAATC
3092
+ TAAATG
3093
+ TAAATT
3094
+ TAACAA
3095
+ TAACAC
3096
+ TAACAG
3097
+ TAACAT
3098
+ TAACCA
3099
+ TAACCC
3100
+ TAACCG
3101
+ TAACCT
3102
+ TAACGA
3103
+ TAACGC
3104
+ TAACGG
3105
+ TAACGT
3106
+ TAACTA
3107
+ TAACTC
3108
+ TAACTG
3109
+ TAACTT
3110
+ TAAGAA
3111
+ TAAGAC
3112
+ TAAGAG
3113
+ TAAGAT
3114
+ TAAGCA
3115
+ TAAGCC
3116
+ TAAGCG
3117
+ TAAGCT
3118
+ TAAGGA
3119
+ TAAGGC
3120
+ TAAGGG
3121
+ TAAGGT
3122
+ TAAGTA
3123
+ TAAGTC
3124
+ TAAGTG
3125
+ TAAGTT
3126
+ TAATAA
3127
+ TAATAC
3128
+ TAATAG
3129
+ TAATAT
3130
+ TAATCA
3131
+ TAATCC
3132
+ TAATCG
3133
+ TAATCT
3134
+ TAATGA
3135
+ TAATGC
3136
+ TAATGG
3137
+ TAATGT
3138
+ TAATTA
3139
+ TAATTC
3140
+ TAATTG
3141
+ TAATTT
3142
+ TACAAA
3143
+ TACAAC
3144
+ TACAAG
3145
+ TACAAT
3146
+ TACACA
3147
+ TACACC
3148
+ TACACG
3149
+ TACACT
3150
+ TACAGA
3151
+ TACAGC
3152
+ TACAGG
3153
+ TACAGT
3154
+ TACATA
3155
+ TACATC
3156
+ TACATG
3157
+ TACATT
3158
+ TACCAA
3159
+ TACCAC
3160
+ TACCAG
3161
+ TACCAT
3162
+ TACCCA
3163
+ TACCCC
3164
+ TACCCG
3165
+ TACCCT
3166
+ TACCGA
3167
+ TACCGC
3168
+ TACCGG
3169
+ TACCGT
3170
+ TACCTA
3171
+ TACCTC
3172
+ TACCTG
3173
+ TACCTT
3174
+ TACGAA
3175
+ TACGAC
3176
+ TACGAG
3177
+ TACGAT
3178
+ TACGCA
3179
+ TACGCC
3180
+ TACGCG
3181
+ TACGCT
3182
+ TACGGA
3183
+ TACGGC
3184
+ TACGGG
3185
+ TACGGT
3186
+ TACGTA
3187
+ TACGTC
3188
+ TACGTG
3189
+ TACGTT
3190
+ TACTAA
3191
+ TACTAC
3192
+ TACTAG
3193
+ TACTAT
3194
+ TACTCA
3195
+ TACTCC
3196
+ TACTCG
3197
+ TACTCT
3198
+ TACTGA
3199
+ TACTGC
3200
+ TACTGG
3201
+ TACTGT
3202
+ TACTTA
3203
+ TACTTC
3204
+ TACTTG
3205
+ TACTTT
3206
+ TAGAAA
3207
+ TAGAAC
3208
+ TAGAAG
3209
+ TAGAAT
3210
+ TAGACA
3211
+ TAGACC
3212
+ TAGACG
3213
+ TAGACT
3214
+ TAGAGA
3215
+ TAGAGC
3216
+ TAGAGG
3217
+ TAGAGT
3218
+ TAGATA
3219
+ TAGATC
3220
+ TAGATG
3221
+ TAGATT
3222
+ TAGCAA
3223
+ TAGCAC
3224
+ TAGCAG
3225
+ TAGCAT
3226
+ TAGCCA
3227
+ TAGCCC
3228
+ TAGCCG
3229
+ TAGCCT
3230
+ TAGCGA
3231
+ TAGCGC
3232
+ TAGCGG
3233
+ TAGCGT
3234
+ TAGCTA
3235
+ TAGCTC
3236
+ TAGCTG
3237
+ TAGCTT
3238
+ TAGGAA
3239
+ TAGGAC
3240
+ TAGGAG
3241
+ TAGGAT
3242
+ TAGGCA
3243
+ TAGGCC
3244
+ TAGGCG
3245
+ TAGGCT
3246
+ TAGGGA
3247
+ TAGGGC
3248
+ TAGGGG
3249
+ TAGGGT
3250
+ TAGGTA
3251
+ TAGGTC
3252
+ TAGGTG
3253
+ TAGGTT
3254
+ TAGTAA
3255
+ TAGTAC
3256
+ TAGTAG
3257
+ TAGTAT
3258
+ TAGTCA
3259
+ TAGTCC
3260
+ TAGTCG
3261
+ TAGTCT
3262
+ TAGTGA
3263
+ TAGTGC
3264
+ TAGTGG
3265
+ TAGTGT
3266
+ TAGTTA
3267
+ TAGTTC
3268
+ TAGTTG
3269
+ TAGTTT
3270
+ TATAAA
3271
+ TATAAC
3272
+ TATAAG
3273
+ TATAAT
3274
+ TATACA
3275
+ TATACC
3276
+ TATACG
3277
+ TATACT
3278
+ TATAGA
3279
+ TATAGC
3280
+ TATAGG
3281
+ TATAGT
3282
+ TATATA
3283
+ TATATC
3284
+ TATATG
3285
+ TATATT
3286
+ TATCAA
3287
+ TATCAC
3288
+ TATCAG
3289
+ TATCAT
3290
+ TATCCA
3291
+ TATCCC
3292
+ TATCCG
3293
+ TATCCT
3294
+ TATCGA
3295
+ TATCGC
3296
+ TATCGG
3297
+ TATCGT
3298
+ TATCTA
3299
+ TATCTC
3300
+ TATCTG
3301
+ TATCTT
3302
+ TATGAA
3303
+ TATGAC
3304
+ TATGAG
3305
+ TATGAT
3306
+ TATGCA
3307
+ TATGCC
3308
+ TATGCG
3309
+ TATGCT
3310
+ TATGGA
3311
+ TATGGC
3312
+ TATGGG
3313
+ TATGGT
3314
+ TATGTA
3315
+ TATGTC
3316
+ TATGTG
3317
+ TATGTT
3318
+ TATTAA
3319
+ TATTAC
3320
+ TATTAG
3321
+ TATTAT
3322
+ TATTCA
3323
+ TATTCC
3324
+ TATTCG
3325
+ TATTCT
3326
+ TATTGA
3327
+ TATTGC
3328
+ TATTGG
3329
+ TATTGT
3330
+ TATTTA
3331
+ TATTTC
3332
+ TATTTG
3333
+ TATTTT
3334
+ TCAAAA
3335
+ TCAAAC
3336
+ TCAAAG
3337
+ TCAAAT
3338
+ TCAACA
3339
+ TCAACC
3340
+ TCAACG
3341
+ TCAACT
3342
+ TCAAGA
3343
+ TCAAGC
3344
+ TCAAGG
3345
+ TCAAGT
3346
+ TCAATA
3347
+ TCAATC
3348
+ TCAATG
3349
+ TCAATT
3350
+ TCACAA
3351
+ TCACAC
3352
+ TCACAG
3353
+ TCACAT
3354
+ TCACCA
3355
+ TCACCC
3356
+ TCACCG
3357
+ TCACCT
3358
+ TCACGA
3359
+ TCACGC
3360
+ TCACGG
3361
+ TCACGT
3362
+ TCACTA
3363
+ TCACTC
3364
+ TCACTG
3365
+ TCACTT
3366
+ TCAGAA
3367
+ TCAGAC
3368
+ TCAGAG
3369
+ TCAGAT
3370
+ TCAGCA
3371
+ TCAGCC
3372
+ TCAGCG
3373
+ TCAGCT
3374
+ TCAGGA
3375
+ TCAGGC
3376
+ TCAGGG
3377
+ TCAGGT
3378
+ TCAGTA
3379
+ TCAGTC
3380
+ TCAGTG
3381
+ TCAGTT
3382
+ TCATAA
3383
+ TCATAC
3384
+ TCATAG
3385
+ TCATAT
3386
+ TCATCA
3387
+ TCATCC
3388
+ TCATCG
3389
+ TCATCT
3390
+ TCATGA
3391
+ TCATGC
3392
+ TCATGG
3393
+ TCATGT
3394
+ TCATTA
3395
+ TCATTC
3396
+ TCATTG
3397
+ TCATTT
3398
+ TCCAAA
3399
+ TCCAAC
3400
+ TCCAAG
3401
+ TCCAAT
3402
+ TCCACA
3403
+ TCCACC
3404
+ TCCACG
3405
+ TCCACT
3406
+ TCCAGA
3407
+ TCCAGC
3408
+ TCCAGG
3409
+ TCCAGT
3410
+ TCCATA
3411
+ TCCATC
3412
+ TCCATG
3413
+ TCCATT
3414
+ TCCCAA
3415
+ TCCCAC
3416
+ TCCCAG
3417
+ TCCCAT
3418
+ TCCCCA
3419
+ TCCCCC
3420
+ TCCCCG
3421
+ TCCCCT
3422
+ TCCCGA
3423
+ TCCCGC
3424
+ TCCCGG
3425
+ TCCCGT
3426
+ TCCCTA
3427
+ TCCCTC
3428
+ TCCCTG
3429
+ TCCCTT
3430
+ TCCGAA
3431
+ TCCGAC
3432
+ TCCGAG
3433
+ TCCGAT
3434
+ TCCGCA
3435
+ TCCGCC
3436
+ TCCGCG
3437
+ TCCGCT
3438
+ TCCGGA
3439
+ TCCGGC
3440
+ TCCGGG
3441
+ TCCGGT
3442
+ TCCGTA
3443
+ TCCGTC
3444
+ TCCGTG
3445
+ TCCGTT
3446
+ TCCTAA
3447
+ TCCTAC
3448
+ TCCTAG
3449
+ TCCTAT
3450
+ TCCTCA
3451
+ TCCTCC
3452
+ TCCTCG
3453
+ TCCTCT
3454
+ TCCTGA
3455
+ TCCTGC
3456
+ TCCTGG
3457
+ TCCTGT
3458
+ TCCTTA
3459
+ TCCTTC
3460
+ TCCTTG
3461
+ TCCTTT
3462
+ TCGAAA
3463
+ TCGAAC
3464
+ TCGAAG
3465
+ TCGAAT
3466
+ TCGACA
3467
+ TCGACC
3468
+ TCGACG
3469
+ TCGACT
3470
+ TCGAGA
3471
+ TCGAGC
3472
+ TCGAGG
3473
+ TCGAGT
3474
+ TCGATA
3475
+ TCGATC
3476
+ TCGATG
3477
+ TCGATT
3478
+ TCGCAA
3479
+ TCGCAC
3480
+ TCGCAG
3481
+ TCGCAT
3482
+ TCGCCA
3483
+ TCGCCC
3484
+ TCGCCG
3485
+ TCGCCT
3486
+ TCGCGA
3487
+ TCGCGC
3488
+ TCGCGG
3489
+ TCGCGT
3490
+ TCGCTA
3491
+ TCGCTC
3492
+ TCGCTG
3493
+ TCGCTT
3494
+ TCGGAA
3495
+ TCGGAC
3496
+ TCGGAG
3497
+ TCGGAT
3498
+ TCGGCA
3499
+ TCGGCC
3500
+ TCGGCG
3501
+ TCGGCT
3502
+ TCGGGA
3503
+ TCGGGC
3504
+ TCGGGG
3505
+ TCGGGT
3506
+ TCGGTA
3507
+ TCGGTC
3508
+ TCGGTG
3509
+ TCGGTT
3510
+ TCGTAA
3511
+ TCGTAC
3512
+ TCGTAG
3513
+ TCGTAT
3514
+ TCGTCA
3515
+ TCGTCC
3516
+ TCGTCG
3517
+ TCGTCT
3518
+ TCGTGA
3519
+ TCGTGC
3520
+ TCGTGG
3521
+ TCGTGT
3522
+ TCGTTA
3523
+ TCGTTC
3524
+ TCGTTG
3525
+ TCGTTT
3526
+ TCTAAA
3527
+ TCTAAC
3528
+ TCTAAG
3529
+ TCTAAT
3530
+ TCTACA
3531
+ TCTACC
3532
+ TCTACG
3533
+ TCTACT
3534
+ TCTAGA
3535
+ TCTAGC
3536
+ TCTAGG
3537
+ TCTAGT
3538
+ TCTATA
3539
+ TCTATC
3540
+ TCTATG
3541
+ TCTATT
3542
+ TCTCAA
3543
+ TCTCAC
3544
+ TCTCAG
3545
+ TCTCAT
3546
+ TCTCCA
3547
+ TCTCCC
3548
+ TCTCCG
3549
+ TCTCCT
3550
+ TCTCGA
3551
+ TCTCGC
3552
+ TCTCGG
3553
+ TCTCGT
3554
+ TCTCTA
3555
+ TCTCTC
3556
+ TCTCTG
3557
+ TCTCTT
3558
+ TCTGAA
3559
+ TCTGAC
3560
+ TCTGAG
3561
+ TCTGAT
3562
+ TCTGCA
3563
+ TCTGCC
3564
+ TCTGCG
3565
+ TCTGCT
3566
+ TCTGGA
3567
+ TCTGGC
3568
+ TCTGGG
3569
+ TCTGGT
3570
+ TCTGTA
3571
+ TCTGTC
3572
+ TCTGTG
3573
+ TCTGTT
3574
+ TCTTAA
3575
+ TCTTAC
3576
+ TCTTAG
3577
+ TCTTAT
3578
+ TCTTCA
3579
+ TCTTCC
3580
+ TCTTCG
3581
+ TCTTCT
3582
+ TCTTGA
3583
+ TCTTGC
3584
+ TCTTGG
3585
+ TCTTGT
3586
+ TCTTTA
3587
+ TCTTTC
3588
+ TCTTTG
3589
+ TCTTTT
3590
+ TGAAAA
3591
+ TGAAAC
3592
+ TGAAAG
3593
+ TGAAAT
3594
+ TGAACA
3595
+ TGAACC
3596
+ TGAACG
3597
+ TGAACT
3598
+ TGAAGA
3599
+ TGAAGC
3600
+ TGAAGG
3601
+ TGAAGT
3602
+ TGAATA
3603
+ TGAATC
3604
+ TGAATG
3605
+ TGAATT
3606
+ TGACAA
3607
+ TGACAC
3608
+ TGACAG
3609
+ TGACAT
3610
+ TGACCA
3611
+ TGACCC
3612
+ TGACCG
3613
+ TGACCT
3614
+ TGACGA
3615
+ TGACGC
3616
+ TGACGG
3617
+ TGACGT
3618
+ TGACTA
3619
+ TGACTC
3620
+ TGACTG
3621
+ TGACTT
3622
+ TGAGAA
3623
+ TGAGAC
3624
+ TGAGAG
3625
+ TGAGAT
3626
+ TGAGCA
3627
+ TGAGCC
3628
+ TGAGCG
3629
+ TGAGCT
3630
+ TGAGGA
3631
+ TGAGGC
3632
+ TGAGGG
3633
+ TGAGGT
3634
+ TGAGTA
3635
+ TGAGTC
3636
+ TGAGTG
3637
+ TGAGTT
3638
+ TGATAA
3639
+ TGATAC
3640
+ TGATAG
3641
+ TGATAT
3642
+ TGATCA
3643
+ TGATCC
3644
+ TGATCG
3645
+ TGATCT
3646
+ TGATGA
3647
+ TGATGC
3648
+ TGATGG
3649
+ TGATGT
3650
+ TGATTA
3651
+ TGATTC
3652
+ TGATTG
3653
+ TGATTT
3654
+ TGCAAA
3655
+ TGCAAC
3656
+ TGCAAG
3657
+ TGCAAT
3658
+ TGCACA
3659
+ TGCACC
3660
+ TGCACG
3661
+ TGCACT
3662
+ TGCAGA
3663
+ TGCAGC
3664
+ TGCAGG
3665
+ TGCAGT
3666
+ TGCATA
3667
+ TGCATC
3668
+ TGCATG
3669
+ TGCATT
3670
+ TGCCAA
3671
+ TGCCAC
3672
+ TGCCAG
3673
+ TGCCAT
3674
+ TGCCCA
3675
+ TGCCCC
3676
+ TGCCCG
3677
+ TGCCCT
3678
+ TGCCGA
3679
+ TGCCGC
3680
+ TGCCGG
3681
+ TGCCGT
3682
+ TGCCTA
3683
+ TGCCTC
3684
+ TGCCTG
3685
+ TGCCTT
3686
+ TGCGAA
3687
+ TGCGAC
3688
+ TGCGAG
3689
+ TGCGAT
3690
+ TGCGCA
3691
+ TGCGCC
3692
+ TGCGCG
3693
+ TGCGCT
3694
+ TGCGGA
3695
+ TGCGGC
3696
+ TGCGGG
3697
+ TGCGGT
3698
+ TGCGTA
3699
+ TGCGTC
3700
+ TGCGTG
3701
+ TGCGTT
3702
+ TGCTAA
3703
+ TGCTAC
3704
+ TGCTAG
3705
+ TGCTAT
3706
+ TGCTCA
3707
+ TGCTCC
3708
+ TGCTCG
3709
+ TGCTCT
3710
+ TGCTGA
3711
+ TGCTGC
3712
+ TGCTGG
3713
+ TGCTGT
3714
+ TGCTTA
3715
+ TGCTTC
3716
+ TGCTTG
3717
+ TGCTTT
3718
+ TGGAAA
3719
+ TGGAAC
3720
+ TGGAAG
3721
+ TGGAAT
3722
+ TGGACA
3723
+ TGGACC
3724
+ TGGACG
3725
+ TGGACT
3726
+ TGGAGA
3727
+ TGGAGC
3728
+ TGGAGG
3729
+ TGGAGT
3730
+ TGGATA
3731
+ TGGATC
3732
+ TGGATG
3733
+ TGGATT
3734
+ TGGCAA
3735
+ TGGCAC
3736
+ TGGCAG
3737
+ TGGCAT
3738
+ TGGCCA
3739
+ TGGCCC
3740
+ TGGCCG
3741
+ TGGCCT
3742
+ TGGCGA
3743
+ TGGCGC
3744
+ TGGCGG
3745
+ TGGCGT
3746
+ TGGCTA
3747
+ TGGCTC
3748
+ TGGCTG
3749
+ TGGCTT
3750
+ TGGGAA
3751
+ TGGGAC
3752
+ TGGGAG
3753
+ TGGGAT
3754
+ TGGGCA
3755
+ TGGGCC
3756
+ TGGGCG
3757
+ TGGGCT
3758
+ TGGGGA
3759
+ TGGGGC
3760
+ TGGGGG
3761
+ TGGGGT
3762
+ TGGGTA
3763
+ TGGGTC
3764
+ TGGGTG
3765
+ TGGGTT
3766
+ TGGTAA
3767
+ TGGTAC
3768
+ TGGTAG
3769
+ TGGTAT
3770
+ TGGTCA
3771
+ TGGTCC
3772
+ TGGTCG
3773
+ TGGTCT
3774
+ TGGTGA
3775
+ TGGTGC
3776
+ TGGTGG
3777
+ TGGTGT
3778
+ TGGTTA
3779
+ TGGTTC
3780
+ TGGTTG
3781
+ TGGTTT
3782
+ TGTAAA
3783
+ TGTAAC
3784
+ TGTAAG
3785
+ TGTAAT
3786
+ TGTACA
3787
+ TGTACC
3788
+ TGTACG
3789
+ TGTACT
3790
+ TGTAGA
3791
+ TGTAGC
3792
+ TGTAGG
3793
+ TGTAGT
3794
+ TGTATA
3795
+ TGTATC
3796
+ TGTATG
3797
+ TGTATT
3798
+ TGTCAA
3799
+ TGTCAC
3800
+ TGTCAG
3801
+ TGTCAT
3802
+ TGTCCA
3803
+ TGTCCC
3804
+ TGTCCG
3805
+ TGTCCT
3806
+ TGTCGA
3807
+ TGTCGC
3808
+ TGTCGG
3809
+ TGTCGT
3810
+ TGTCTA
3811
+ TGTCTC
3812
+ TGTCTG
3813
+ TGTCTT
3814
+ TGTGAA
3815
+ TGTGAC
3816
+ TGTGAG
3817
+ TGTGAT
3818
+ TGTGCA
3819
+ TGTGCC
3820
+ TGTGCG
3821
+ TGTGCT
3822
+ TGTGGA
3823
+ TGTGGC
3824
+ TGTGGG
3825
+ TGTGGT
3826
+ TGTGTA
3827
+ TGTGTC
3828
+ TGTGTG
3829
+ TGTGTT
3830
+ TGTTAA
3831
+ TGTTAC
3832
+ TGTTAG
3833
+ TGTTAT
3834
+ TGTTCA
3835
+ TGTTCC
3836
+ TGTTCG
3837
+ TGTTCT
3838
+ TGTTGA
3839
+ TGTTGC
3840
+ TGTTGG
3841
+ TGTTGT
3842
+ TGTTTA
3843
+ TGTTTC
3844
+ TGTTTG
3845
+ TGTTTT
3846
+ TTAAAA
3847
+ TTAAAC
3848
+ TTAAAG
3849
+ TTAAAT
3850
+ TTAACA
3851
+ TTAACC
3852
+ TTAACG
3853
+ TTAACT
3854
+ TTAAGA
3855
+ TTAAGC
3856
+ TTAAGG
3857
+ TTAAGT
3858
+ TTAATA
3859
+ TTAATC
3860
+ TTAATG
3861
+ TTAATT
3862
+ TTACAA
3863
+ TTACAC
3864
+ TTACAG
3865
+ TTACAT
3866
+ TTACCA
3867
+ TTACCC
3868
+ TTACCG
3869
+ TTACCT
3870
+ TTACGA
3871
+ TTACGC
3872
+ TTACGG
3873
+ TTACGT
3874
+ TTACTA
3875
+ TTACTC
3876
+ TTACTG
3877
+ TTACTT
3878
+ TTAGAA
3879
+ TTAGAC
3880
+ TTAGAG
3881
+ TTAGAT
3882
+ TTAGCA
3883
+ TTAGCC
3884
+ TTAGCG
3885
+ TTAGCT
3886
+ TTAGGA
3887
+ TTAGGC
3888
+ TTAGGG
3889
+ TTAGGT
3890
+ TTAGTA
3891
+ TTAGTC
3892
+ TTAGTG
3893
+ TTAGTT
3894
+ TTATAA
3895
+ TTATAC
3896
+ TTATAG
3897
+ TTATAT
3898
+ TTATCA
3899
+ TTATCC
3900
+ TTATCG
3901
+ TTATCT
3902
+ TTATGA
3903
+ TTATGC
3904
+ TTATGG
3905
+ TTATGT
3906
+ TTATTA
3907
+ TTATTC
3908
+ TTATTG
3909
+ TTATTT
3910
+ TTCAAA
3911
+ TTCAAC
3912
+ TTCAAG
3913
+ TTCAAT
3914
+ TTCACA
3915
+ TTCACC
3916
+ TTCACG
3917
+ TTCACT
3918
+ TTCAGA
3919
+ TTCAGC
3920
+ TTCAGG
3921
+ TTCAGT
3922
+ TTCATA
3923
+ TTCATC
3924
+ TTCATG
3925
+ TTCATT
3926
+ TTCCAA
3927
+ TTCCAC
3928
+ TTCCAG
3929
+ TTCCAT
3930
+ TTCCCA
3931
+ TTCCCC
3932
+ TTCCCG
3933
+ TTCCCT
3934
+ TTCCGA
3935
+ TTCCGC
3936
+ TTCCGG
3937
+ TTCCGT
3938
+ TTCCTA
3939
+ TTCCTC
3940
+ TTCCTG
3941
+ TTCCTT
3942
+ TTCGAA
3943
+ TTCGAC
3944
+ TTCGAG
3945
+ TTCGAT
3946
+ TTCGCA
3947
+ TTCGCC
3948
+ TTCGCG
3949
+ TTCGCT
3950
+ TTCGGA
3951
+ TTCGGC
3952
+ TTCGGG
3953
+ TTCGGT
3954
+ TTCGTA
3955
+ TTCGTC
3956
+ TTCGTG
3957
+ TTCGTT
3958
+ TTCTAA
3959
+ TTCTAC
3960
+ TTCTAG
3961
+ TTCTAT
3962
+ TTCTCA
3963
+ TTCTCC
3964
+ TTCTCG
3965
+ TTCTCT
3966
+ TTCTGA
3967
+ TTCTGC
3968
+ TTCTGG
3969
+ TTCTGT
3970
+ TTCTTA
3971
+ TTCTTC
3972
+ TTCTTG
3973
+ TTCTTT
3974
+ TTGAAA
3975
+ TTGAAC
3976
+ TTGAAG
3977
+ TTGAAT
3978
+ TTGACA
3979
+ TTGACC
3980
+ TTGACG
3981
+ TTGACT
3982
+ TTGAGA
3983
+ TTGAGC
3984
+ TTGAGG
3985
+ TTGAGT
3986
+ TTGATA
3987
+ TTGATC
3988
+ TTGATG
3989
+ TTGATT
3990
+ TTGCAA
3991
+ TTGCAC
3992
+ TTGCAG
3993
+ TTGCAT
3994
+ TTGCCA
3995
+ TTGCCC
3996
+ TTGCCG
3997
+ TTGCCT
3998
+ TTGCGA
3999
+ TTGCGC
4000
+ TTGCGG
4001
+ TTGCGT
4002
+ TTGCTA
4003
+ TTGCTC
4004
+ TTGCTG
4005
+ TTGCTT
4006
+ TTGGAA
4007
+ TTGGAC
4008
+ TTGGAG
4009
+ TTGGAT
4010
+ TTGGCA
4011
+ TTGGCC
4012
+ TTGGCG
4013
+ TTGGCT
4014
+ TTGGGA
4015
+ TTGGGC
4016
+ TTGGGG
4017
+ TTGGGT
4018
+ TTGGTA
4019
+ TTGGTC
4020
+ TTGGTG
4021
+ TTGGTT
4022
+ TTGTAA
4023
+ TTGTAC
4024
+ TTGTAG
4025
+ TTGTAT
4026
+ TTGTCA
4027
+ TTGTCC
4028
+ TTGTCG
4029
+ TTGTCT
4030
+ TTGTGA
4031
+ TTGTGC
4032
+ TTGTGG
4033
+ TTGTGT
4034
+ TTGTTA
4035
+ TTGTTC
4036
+ TTGTTG
4037
+ TTGTTT
4038
+ TTTAAA
4039
+ TTTAAC
4040
+ TTTAAG
4041
+ TTTAAT
4042
+ TTTACA
4043
+ TTTACC
4044
+ TTTACG
4045
+ TTTACT
4046
+ TTTAGA
4047
+ TTTAGC
4048
+ TTTAGG
4049
+ TTTAGT
4050
+ TTTATA
4051
+ TTTATC
4052
+ TTTATG
4053
+ TTTATT
4054
+ TTTCAA
4055
+ TTTCAC
4056
+ TTTCAG
4057
+ TTTCAT
4058
+ TTTCCA
4059
+ TTTCCC
4060
+ TTTCCG
4061
+ TTTCCT
4062
+ TTTCGA
4063
+ TTTCGC
4064
+ TTTCGG
4065
+ TTTCGT
4066
+ TTTCTA
4067
+ TTTCTC
4068
+ TTTCTG
4069
+ TTTCTT
4070
+ TTTGAA
4071
+ TTTGAC
4072
+ TTTGAG
4073
+ TTTGAT
4074
+ TTTGCA
4075
+ TTTGCC
4076
+ TTTGCG
4077
+ TTTGCT
4078
+ TTTGGA
4079
+ TTTGGC
4080
+ TTTGGG
4081
+ TTTGGT
4082
+ TTTGTA
4083
+ TTTGTC
4084
+ TTTGTG
4085
+ TTTGTT
4086
+ TTTTAA
4087
+ TTTTAC
4088
+ TTTTAG
4089
+ TTTTAT
4090
+ TTTTCA
4091
+ TTTTCC
4092
+ TTTTCG
4093
+ TTTTCT
4094
+ TTTTGA
4095
+ TTTTGC
4096
+ TTTTGG
4097
+ TTTTGT
4098
+ TTTTTA
4099
+ TTTTTC
4100
+ TTTTTG
4101
+ TTTTTT
data/prokbert_vocabs/prokbert-base-dna7/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/prokbert_vocabs/prokbert-base-dna8/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/prokbert_vocabs/prokbert-base-dna9/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
general_utils.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+
3
+ import pandas as pd
4
+ import os
5
+ import numpy as np
6
+ import subprocess
7
+ import shutil
8
+ """ Library for general utils, such as dataframe properties checking,
9
+ creating directories, checking files, etc.
10
+ """
11
+
12
+
13
def check_expected_columns(df: pd.DataFrame, expected_columns: list) -> bool:
    """
    Verify that a DataFrame carries every column the caller relies on.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame whose column labels are inspected.
    expected_columns : list
        Column labels that must all be present in ``df``.

    Returns
    -------
    bool
        True when no expected column is missing. The function never returns
        False; a missing column is reported by raising instead.

    Raises
    ------
    ValueError
        If one or more expected columns are absent. The message lists the
        missing labels in the order they appear in ``expected_columns``.

    Examples
    --------
    >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
    >>> check_expected_columns(df, ['A', 'B'])
    True

    >>> check_expected_columns(df, ['A', 'C'])
    ValueError: The following columns are missing: ['C']
    """
    absent = []
    for label in expected_columns:
        if label not in df.columns:
            absent.append(label)

    if not absent:
        return True

    raise ValueError(f"The following columns are missing: {absent}")
50
+
51
+
52
def is_valid_primary_key(df: pd.DataFrame, column_name: str) -> bool:
    """
    Decide whether a column could act as a primary key for a DataFrame.

    A column qualifies when it has no missing values and no repeated values.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame that owns the column.
    column_name : str
        Label of the column to examine.

    Returns
    -------
    bool
        True if the column is free of nulls and all of its values are
        distinct, False otherwise.

    Raises
    ------
    ValueError
        If ``column_name`` is not a column of ``df``.

    Examples
    --------
    >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    >>> is_valid_primary_key(df, 'A')
    True

    >>> df = pd.DataFrame({'A': [1, 2, 2], 'B': [4, 5, 6]})
    >>> is_valid_primary_key(df, 'A')
    False
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

    # A key must identify every row, so a single missing value disqualifies it.
    if df[column_name].isnull().any():
        return False

    # ... and it must identify each row unambiguously.
    return bool(df[column_name].is_unique)
96
+
97
def get_non_empty_files(start_path: str, extensions: tuple = ('.fasta', '.fna')) -> str:
    """
    Lazily list the non-empty sequence files found below a directory.

    The directory tree rooted at ``start_path`` is walked recursively. This is
    a generator: each matching file is yielded as it is found.

    :param start_path: Directory in which the search starts.
    :type start_path: str

    :param extensions: File extensions to accept (default is ('.fasta', '.fna')).
                       For every extension the gzip-compressed variant
                       (extension + '.gz') is accepted as well.
    :type extensions: tuple

    :return: Yields the bare file name (without its directory) of every file
             that has an accepted extension and a size greater than zero.
    :rtype: str
    """
    # Expand each extension into its plain and '.gz' form once, up front.
    accepted_suffixes = tuple(
        ext + compression for ext in extensions for compression in ('', '.gz')
    )

    for folder, _subfolders, names in os.walk(start_path):
        for name in names:
            if not name.endswith(accepted_suffixes):
                continue
            # The size is only queried for files that passed the name filter.
            if os.path.getsize(os.path.join(folder, name)) > 0:
                yield name
117
+
118
+
119
+
120
def truncate_zero_columns(arr: np.ndarray) -> np.ndarray:
    """
    Truncate all trailing columns composed entirely of zeros in a given 2D numpy array.

    Only the trailing run of all-zero columns is dropped; all-zero columns
    that precede a non-zero column are kept.

    :param arr: Input 2D numpy array.
    :type arr: np.ndarray

    :return: The leading columns of ``arr`` up to and including the last
             column that contains a non-zero value. The result is a slice of
             ``arr`` and keeps its dtype; when every column is zero, the
             result has shape ``(arr.shape[0], 0)``.
    :rtype: np.ndarray
    """
    # Positions of the columns that hold at least one non-zero entry.
    nonzero_columns = np.flatnonzero(np.any(arr, axis=0))

    if nonzero_columns.size == 0:
        # Slice rather than allocate, so the dtype of the input is preserved.
        return arr[:, :0]

    return arr[:, :nonzero_columns[-1] + 1]
136
+
137
+
138
+ import os
139
+
140
def create_directory_for_filepath(filepath: str) -> None:
    """
    Given a file path, creates the underlying directory structure if it doesn't already exist.

    The file itself is never created; only its parent directories are.

    Args:
        filepath (str): The path to the file for which the directory structure should be created.

    Raises:
        ValueError: If the provided path is empty or None.
        OSError: If there's an error creating the directory structure.
    """
    if not filepath:
        raise ValueError("The provided filepath is empty or None.")

    directory = os.path.dirname(filepath)

    # A bare file name has no directory part, and an existing path needs no work.
    if not directory or os.path.exists(directory):
        return

    try:
        # exist_ok tolerates another process creating the directory between
        # the existence check above and this call.
        os.makedirs(directory, exist_ok=True)
    except OSError as e:
        raise OSError(f"Error creating directory structure {directory}. Error: {e}") from e

    print(f"Directory structure {directory} created successfully.")
163
+
164
+ # Example usage:
165
+ # create_directory_for_filepath("/path/to/directory/that/might/not/exist/filename.txt")
166
+
167
def check_file_exists(file_path: str) -> bool:
    """
    Assert that a path exists on disk.

    Args:
        file_path (str): Path to check. Any existing path passes the check,
            since the test is ``os.path.exists``.

    Returns:
        bool: True when the path exists. The function never returns False.

    Raises:
        ValueError: If the path does not exist.
    """
    if not os.path.exists(file_path):
        raise ValueError(f"The provided file path '{file_path}' does not exist.")
    return True
181
+
182
def count_gpus():
    """
    Estimate the number of GPUs available on this machine.

    The result is the sum of two counts: the CUDA devices reported by
    ``torch.cuda.device_count()`` and the number of times the text
    ``'Device Type: GPU'`` occurs in the output of the ``clinfo`` command.
    Probing ``clinfo`` is best effort: if the command is missing, fails or
    produces undecodable output, it contributes zero.

    Returns:
        int: The combined GPU count.
    """
    # Imported locally so that importing this module does not require torch.
    import torch

    # Count NVIDIA GPUs
    nvidia_gpu_count = torch.cuda.device_count()

    # Count AMD GPUs
    # NOTE(review): clinfo lists every OpenCL device, so a GPU that is visible
    # to both CUDA and OpenCL may be counted twice -- confirm this is acceptable.
    amd_gpu_count = 0
    try:
        clinfo_output = subprocess.check_output('clinfo').decode('utf-8')
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
        # clinfo is not installed, exited with an error, or printed bytes that
        # are not valid UTF-8; treat all of these as "no extra GPUs found".
        pass
    else:
        amd_gpu_count = clinfo_output.count('Device Type: GPU')

    total_gpus = nvidia_gpu_count + amd_gpu_count

    return total_gpus
198
+
199
+
200
def create_hard_links(source_directory: str, target_directory: str, blacklist: list = None) -> str:
    """
    Creates hard links for all files from the source directory to the target directory.

    Only the top level of the source directory is processed. Entries whose
    name starts with '.' or '_', sub-directories, and blacklisted names are
    skipped. A link that already exists in the target directory is reported
    and skipped, matching the behaviour of ``create_selected_hard_links``.

    Args:
        source_directory (str): The directory containing the original files.
        target_directory (str): The directory where hard links will be created.
            It is created if it does not exist.
        blacklist (list): List of filenames to exclude from creating hard links.
            Defaults to no exclusions.

    Returns:
        str: A human-readable summary naming the target and source directories.

    Raises:
        ValueError: If the source directory does not exist.
    """
    # None stands in for "no exclusions" so no mutable default is shared between calls.
    if blacklist is None:
        blacklist = []

    # Ensure the provided directories exist
    if not os.path.exists(source_directory):
        raise ValueError(f"The source directory '{source_directory}' does not exist.")
    # exist_ok avoids a race between an existence check and the creation.
    os.makedirs(target_directory, exist_ok=True)

    # Iterate through the files in the source directory
    for filename in os.listdir(source_directory):
        source_file_path = os.path.join(source_directory, filename)
        target_file_path = os.path.join(target_directory, filename)

        # Check for files to skip
        if (filename.startswith('.') or
                filename.startswith('_') or
                os.path.isdir(source_file_path) or
                filename in blacklist):
            continue

        # Create a hard link
        try:
            os.link(source_file_path, target_file_path)
        except FileExistsError:
            print(f'The target hard link {target_file_path} exist. Skipping...')

    return f"Hard links created in {target_directory} from {source_directory}."
235
+
236
+ # Example usage
237
+ # create_hard_links("/path/to/source_directory", "/path/to/target_directory", blacklist=["file_to_skip.txt"])
238
+
239
def create_selected_hard_links(source_directory: str, target_directory: str, filenames: list) -> str:
    """
    Creates hard links for the specified files from the source directory to the target directory.

    Names that are not regular files in the source directory, and links that
    already exist in the target directory, are reported with a printed
    message and skipped.

    Args:
        source_directory (str): The directory containing the original files.
        target_directory (str): The directory where hard links will be created.
            It is created if it does not exist.
        filenames (list): List of filenames for which hard links should be created.

    Returns:
        str: A human-readable summary naming the target and source directories.

    Raises:
        ValueError: If the source directory does not exist.
    """
    # Ensure the provided directories exist
    if not os.path.exists(source_directory):
        raise ValueError(f"The source directory '{source_directory}' does not exist.")
    # exist_ok avoids a race between an existence check and the creation.
    os.makedirs(target_directory, exist_ok=True)

    # Iterate through the specified filenames
    for filename in filenames:
        source_file_path = os.path.join(source_directory, filename)
        target_file_path = os.path.join(target_directory, filename)

        # Ensure the file exists in the source directory
        if not os.path.isfile(source_file_path):
            print(f"Warning: {filename} does not exist in the source directory. Skipping.")
            continue

        # Create a hard link
        try:
            os.link(source_file_path, target_file_path)
        except FileExistsError:
            print(f'The target hard link {target_file_path} exist. Skipping...')

    return f"Hard links for specified files created in {target_directory} from {source_directory}."
275
+
276
def remove_hidden_files(directory: str) -> None:
    """
    Removes all files recursively in a folder that start with '.' or '_'.

    Sub-directories whose own name starts with '.' or '_' are not descended
    into, so their contents are left untouched. Directories themselves are
    never removed. Each removed file is reported with a printed message.

    Args:
        directory (str): The directory from which hidden files should be removed.

    Returns:
        None

    Raises:
        ValueError: If the directory does not exist.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        raise ValueError(f"The directory '{directory}' does not exist.")

    # The walk must be top-down: os.walk only honours in-place edits of
    # ``dirnames`` in that mode, and the pruning below relies on it.
    for dirpath, dirnames, filenames in os.walk(directory, topdown=True):

        # Filter out directories starting with '.' or '_'
        dirnames[:] = [d for d in dirnames if not d.startswith(('.', '_'))]

        # Remove files starting with '.' or '_'
        for filename in filenames:
            if filename.startswith(('.', '_')):
                file_path = os.path.join(dirpath, filename)
                os.remove(file_path)
                print(f"Removed: {file_path}")

    print(f"All hidden files removed from {directory}.")
prokbert_tokenizer.py ADDED
@@ -0,0 +1,429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Based on https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/tokenization_bert.py
3
+
4
+
5
+ # ProkBERT tokenizer stuff
6
+
7
+ import collections
8
+ import os
9
+ import unicodedata
10
+ from typing import List, Optional, Tuple, Union
11
+ from copy import deepcopy
12
+ from transformers import PreTrainedTokenizer
13
+ from transformers.tokenization_utils import _is_control, _is_punctuation, _is_whitespace
14
+ from transformers.utils import logging
15
+
16
+ # These utils contains the tools needed by the ProkBERT tokenizer
17
+
18
+ from config_utils import *
19
+ from sequtils import *
20
+
21
+ import logging as logger
22
+
23
+ #logger = logging.get_logger(__name__)
24
+
25
# File name under which the vocabulary is stored.
VOCAB_FILES_NAMES = dict(vocab_file="vocab.txt")

# models prokbert-mini-k6s1, prokbert-large-k6s2, prokbert-large-k6s1
_PROKBERT_MODEL_NAMES = (
    "prokbert-mini-k6s1",
    "prokbert-large-k6s1",
    "prokbert-large-k6s2",
)

# Every listed model maps to the same vocabulary file.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": dict.fromkeys(_PROKBERT_MODEL_NAMES, "prokbert-base-dna6/vocab.txt"),
}

# Per-model size used for ``max_model_input_sizes`` on the tokenizer class.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict.fromkeys(_PROKBERT_MODEL_NAMES, 1024)

# Per-model default init arguments; each model gets its own dict instance.
PRETRAINED_INIT_CONFIGURATION = {
    model_name: {"do_upper_case": True} for model_name in _PROKBERT_MODEL_NAMES
}
51
+
52
+
53
def load_vocab(vocab_file):
    """Read a one-token-per-line vocabulary file into an ordered token -> index map.

    The index of a token is the zero-based number of the line it appears on.
    Only the trailing newline is stripped from each line; other whitespace is
    kept as part of the token.
    """
    with open(vocab_file, "r", encoding="utf-8") as handle:
        return collections.OrderedDict(
            (line.rstrip("\n"), position) for position, line in enumerate(handle)
        )
62
+
63
+
64
class ProkBERTTokenizer(PreTrainedTokenizer):
    """Custom tokenizer for ProkBERT.

    Exposes the project's LCA k-mer tokenization helpers through the
    Hugging Face ``PreTrainedTokenizer`` interface. The tokenizer works in
    one of two operation spaces: ``'kmer'``, where ids map back to whole
    k-mers, and ``'sequence'``, where ids are mapped back to the pieces of a
    nucleotide sequence.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    nucleotide_abc = {'A', 'T', 'C', 'G'}
    # The nucleotide alphabet plus '*', the in-sequence mask character.
    extended_nucleotide_abc = {'A', 'T', 'C', 'G', '*'}
    sequence_unk_token = 'N'
    default_unk_token = "[UNK]"
    default_sep_token = "[SEP]"
    default_pad_token = "[PAD]"
    default_cls_token = "[CLS]"
    default_mask_token = "[MASK]"

    def __init__(self,
                 tokenization_params: Optional[Dict] = None,
                 segmentation_params: Optional[Dict] = None,
                 comp_params: Optional[Dict] = None,
                 operation_space: str = 'sequence',
                 **kwargs):
        """Initialize the ProkBERT tokenizer.

        Args:
            tokenization_params (Dict, optional): Tokenization parameters. Defaults to no overrides.
            segmentation_params (Dict, optional): Segmentation parameters. Defaults to no overrides.
            comp_params (Dict, optional): Computational parameters. Defaults to no overrides.
            operation_space (str, optional): Specifies the operation mode. Can be 'kmer' or 'sequence'.
                Defaults to 'sequence'.
        """
        super().__init__(cls_token=ProkBERTTokenizer.default_cls_token,
                         **kwargs)

        # None stands in for "no overrides" so no dict is shared between instances.
        tokenization_params = {} if tokenization_params is None else tokenization_params
        segmentation_params = {} if segmentation_params is None else segmentation_params
        comp_params = {} if comp_params is None else comp_params

        self.defconfig = SeqConfig()
        self.tokenization_params = self.defconfig.get_and_set_tokenization_parameters(tokenization_params)
        self.segmentation_params = self.defconfig.get_and_set_segmentation_parameters(segmentation_params)
        self.comp_params = self.defconfig.get_and_set_computational_parameters(comp_params)
        self.operation_space = operation_space

        self.vocab = self.tokenization_params['vocabmap']
        self.id2token = {v: k for k, v in self.vocab.items()}
        self.max_len = self.tokenization_params['max_segment_length']

        if self.operation_space == 'sequence':
            kmer_size = self.tokenization_params['kmer']
            # k-mers that contain at least one '*' (mask) character.
            token_extension = sorted(
                set(generate_kmers(ProkBERTTokenizer.extended_nucleotide_abc, kmer_size))
                - set(generate_kmers(ProkBERTTokenizer.nucleotide_abc, kmer_size))
            )
            self.extended_vocab = deepcopy(self.vocab)
            for token in token_extension:
                # NOTE(review): 4 is presumably the id of '[MASK]' -- confirm against the vocab.
                self.extended_vocab[token] = 4

            self.unk_token = ProkBERTTokenizer.sequence_unk_token * self.tokenization_params['shift']
            self.mask_token = '*'
            self.extended_vocab[self.mask_token] = self.vocab['[MASK]']

            # Register an all-'N' k-mer as the textual form of id 1.
            full_unk = 'N' * kmer_size
            self.vocab[full_unk] = 1
            self.id2token[1] = full_unk
            self.full_unk_token = full_unk

        else:
            self.extended_vocab = self.vocab
            self.unk_token = '[UNK]'

        # Set for both operation spaces: convert_ids_to_tokens reads
        # ``special_tokens`` in sequence mode as well.
        self.sep_token = '[SEP]'
        self.cls_token = '[CLS]'
        self.pad_token = '[PAD]'
        self.mask_token = '[MASK]'
        self.special_tokens = list(self.special_tokens_map.values())

    def __len__(self) -> int:
        """Return the number of entries in the vocabulary."""
        return len(self.vocab)

    def tokenize(self, text: str, lca_shift: int = 0, all: bool = False) -> Union[List[str], Tuple[List[List[str]], List[List[str]]]]:
        """
        Tokenizes a given segment.

        Args:
            text (str): The DNA segment to tokenize.
            lca_shift (int, optional): Which tokenized vector belonging to the specified LCA offset should be returned. Defaults to 0.
            all (bool, optional): If True, returns all possible tokenizations. Defaults to False.

        Returns:
            Union[List[str], Tuple[List[List[str]], List[List[str]]]]: The k-mers for the
            requested offset, or, when ``all`` is True, the tuple
            ``(tokenized_segments, kmerized_segments)`` covering every offset.

        Usage Example:
            >>> tokenizer = ProkBERTTokenizer(...)
            >>> segment = 'AATCAAGGAATTATTATCGTT'
            >>> tokens, kmers = tokenizer.tokenize(segment, all=True)
            >>> print(tokens)
            ...
        """
        tokenized_segments, kmerized_segments = lca_tokenize_segment(text, self.tokenization_params)
        if all:
            return tokenized_segments, kmerized_segments
        return kmerized_segments[lca_shift]

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # ``id2token`` is the reverse map built in __init__.
        return self.id2token.get(index, self.unk_token)

    def depr_convert_ids_to_tokens(self, ids: Union[int, List[int]]) -> List[str]:
        """
        Deprecated helper that converts tokens to their corresponding IDs.

        Despite its name and annotations (kept for backward compatibility),
        this method looks its input up in the token -> id vocabulary. Tokens
        that are not in the vocabulary map to the id of the unknown token.

        Args:
            ids (Union[int, List[int]]): The token, or list of tokens, to convert.

        Returns:
            List[int]: List of corresponding token IDs.

        Usage Example:
            >>> tokenizer = ProkBERTTokenizer(...)
            >>> tokens = ['AATCAA', 'TCAAGG']
            >>> ids = tokenizer.depr_convert_ids_to_tokens(tokens)
            >>> print(ids)
            ...
        """
        tokens = [ids] if isinstance(ids, (int, str)) else ids

        if self.operation_space == 'sequence':
            fallback_id = self.vocab[self.full_unk_token]
        else:
            fallback_id = self.vocab[self.unk_token]

        return [self.vocab.get(token, fallback_id) for token in tokens]

    def convert_ids_to_tokens(self, ids: Union[int, List[int]]) -> Union[str, List[str]]:
        """
        Converts token IDs back to their original tokens.

        In the 'kmer' operation space every id is mapped to its k-mer. In the
        'sequence' operation space the first k-mer is kept whole and every
        later token contributes only its last ``shift`` characters, so that
        joining the result gives a sequence. Special tokens in that tail
        contribute an empty string.

        Args:
            ids (Union[int, List[int]]): A token ID or a list of token IDs to convert.

        Returns:
            Union[str, List[str]]: A single token (str) when exactly one id is
            given, otherwise the list of tokens.

        Raises:
            ValueError: If ``operation_space`` is neither 'kmer' nor 'sequence'.

        Usage Example:
            >>> tokenizer = ProkBERTTokenizer(...)
            >>> ids = [213, 3343]
            >>> tokens = tokenizer.convert_ids_to_tokens(ids)
            >>> print(tokens)
            ...
        """
        if isinstance(ids, int):
            ids = [ids]
        if len(ids) == 0:
            return []
        if len(ids) == 1:
            return self.id2token.get(ids[0], self.unk_token)

        if self.operation_space == 'kmer':
            return [self.id2token.get(token_id, self.unk_token) for token_id in ids]

        if self.operation_space != 'sequence':
            raise ValueError(f"Unknown operation space: {self.operation_space!r}")

        token_list = []
        # Handling the sentence start.
        # NOTE(review): 2 is presumably the id of '[CLS]' -- confirm against the vocab.
        if ids[0] != 2:
            token_list.append(self.id2token.get(ids[0], self.unk_token))

        # The first real token is emitted in full (at least two ids are present here).
        token_list.append(self.id2token.get(ids[1], self.unk_token))

        # Adding the other tokens until the end
        shift = self.tokenization_params['shift']
        for token_id in ids[2:]:
            mapped_token = self.id2token.get(token_id, self.unk_token)
            if mapped_token in self.special_tokens:
                act_token_value = ''
            else:
                act_token_value = mapped_token[-shift:]
            token_list.append(act_token_value)

        return token_list

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Saves the vocabulary to a file.

        Writes one token per line, in the iteration order of ``self.vocab``,
        to ``<save_directory>/<filename_prefix>vocab.txt``.

        Args:
            save_directory (str): Directory in which the file is written.
            filename_prefix (str, optional): Prefix for the file name. Defaults to no prefix.

        Returns:
            Tuple[str]: One-element tuple holding the path of the written file.
        """
        if filename_prefix is None:
            filename_prefix = ""
        vocab_file_path = os.path.join(save_directory, filename_prefix + "vocab.txt")
        # NOTE(review): in sequence mode ``self.vocab`` also holds the all-'N'
        # k-mer added in __init__, so it is written as an extra line -- confirm intended.
        with open(vocab_file_path, "w", encoding="utf-8") as f:
            # A real newline is required: load_vocab splits the file by lines.
            f.writelines(token + "\n" for token in self.vocab)
        return (vocab_file_path,)

    @classmethod
    def from_pretrained(cls, vocab_file: str) -> 'ProkBERTTokenizer':
        """Loads a pre-trained tokenizer.

        Args:
            vocab_file (str): Path to the pre-trained tokenizer vocabulary file.

        Returns:
            ProkBERTTokenizer: Loaded tokenizer instance.
        """
        # NOTE(review): ``vocab_file`` lands in the first positional parameter
        # of __init__, which is ``tokenization_params`` (a dict) -- confirm the
        # config helper accepts a path here, or pass it under the right key.
        return cls(vocab_file)

    def encode_plus(self, text: str, lca_shift: int = 0, **kwargs) -> Dict[str, np.ndarray]:
        """
        Tokenizes a sequence and returns it in a format suitable for model input.

        The ids are padded with 0 up to ``max_segment_length``; padded
        positions get an attention mask of 0. Longer inputs are not truncated.

        Args:
            text (str): The sequence to tokenize.
            lca_shift (int, optional): LCA offset for tokenization. Defaults to 0.

        Returns:
            Dict[str, np.ndarray]: Dictionary containing token IDs and attention masks.

        Usage Example:
            >>> tokenizer = ProkBERTTokenizer(...)
            >>> segment = 'AATCAAGGAATTATTATCGTT'
            >>> encoded = tokenizer.encode_plus(segment)
            >>> print(encoded)
            ...
        """
        tokenized_segments, _ = lca_tokenize_segment(text, self.tokenization_params)
        # Copy so that padding never mutates the list handed back by the helper.
        input_ids = list(tokenized_segments[lca_shift])
        attention_mask = [1] * len(input_ids)

        # Padding
        padding = self.max_len - len(input_ids)
        if padding > 0:
            input_ids.extend([0] * padding)
            attention_mask.extend([0] * padding)

        token_dtype = self.comp_params['np_tokentype']
        return {
            "input_ids": np.array(input_ids, dtype=token_dtype),
            "attention_mask": np.array(attention_mask, dtype=token_dtype)
        }

    def batch_encode_plus(self, sequences: List[str], lca_shift: int = 0, all: bool = False, **kwargs) -> Dict[str, List[List[int]]]:
        """
        Tokenizes multiple sequences and returns them in a format suitable for model input. It is assumed that sequences
        have already been preprocessed (i.e., segmented) and quality controlled.

        Args:
            - sequences (List[str]): A list of DNA sequences to be tokenized.
            - lca_shift (int, default=0): The LCA offset or windows to get the tokenized vector. If the required offset is >= shift,
            an error is raised.
            - all (bool, default=False): Whether all possible tokenization vectors should be returned. If False, only the specified
            offset is used.
            - **kwargs: Additional arguments (like max_length, padding, etc.); currently unused.

        Returns:
            - Dict[str, List[List[int]]]: A dictionary containing token IDs, attention masks, and token type IDs.

        Raises:
            ValueError: If ``lca_shift`` is not smaller than the configured shift.
        """
        shift = self.tokenization_params['shift']
        if lca_shift >= shift:
            raise ValueError(f'The required offset {lca_shift} is invalid. The maximum offset should be < {shift}')

        # Parallel tokenization. First, create unique IDs for all sequences.
        sequence_ids = list(range(len(sequences)))
        to_tokenize_data = (sequences, sequence_ids)

        # Tokenize each sequence
        tokenization_results = batch_tokenize_segments_with_ids(
            to_tokenize_data,
            self.tokenization_params,
            self.comp_params['cpu_cores_for_tokenization'],
            self.comp_params['batch_size_tokenization'],
            self.comp_params['np_tokentype']
        )

        # Pick the vectors to emit: every offset of every sequence, or only
        # the requested offset of each sequence.
        selected_vectors = []
        for tokenized_vectors in tokenization_results.values():
            if all:
                selected_vectors.extend(tokenized_vectors)
            else:
                selected_vectors.append(tokenized_vectors[lca_shift])

        # Generate input ids, token type ids, and attention masks
        return {
            "input_ids": selected_vectors,
            "token_type_ids": [[0] * len(vector) for vector in selected_vectors],
            "attention_mask": [[1] * len(vector) for vector in selected_vectors]
        }

    def encode(self, segment: str, lca_shift: int = 0, all: bool = False, add_special_tokens: bool = True, **kwargs) -> List[int]:
        """
        Encode a DNA sequence into its corresponding token IDs.

        Args:
            segment (str): The DNA segment to encode.
            lca_shift (int, optional): LCA offset whose token IDs are returned. Defaults to 0.
            all (bool, optional): If True, return the token IDs of every offset as a list of lists. Defaults to False.
            add_special_tokens (bool, optional): Whether to keep the first and last id of each vector
                (special tokens like [CLS] and [SEP]). Defaults to True.

        Returns:
            List[int]: Encoded token IDs.

        Raises:
            ValueError: If ``lca_shift`` is not smaller than the configured shift.

        Usage Example:
            >>> tokenizer = ProkBERTTokenizer(...)
            >>> segment = 'AATCAAGGAATTATTATCGTT'
            >>> ids = tokenizer.encode(segment)
            >>> print(ids)
            ...
        """
        shift = self.tokenization_params['shift']
        if lca_shift >= shift:
            raise ValueError(f'The required offset {lca_shift} is invalid. The maximum offset should be < {shift}')

        tokenized_segments, _ = lca_tokenize_segment(segment, self.tokenization_params)

        # if all is set to True, then we return all the possible ids as a list
        if all:
            if add_special_tokens:
                return tokenized_segments
            # Drop the first and last id of every vector.
            return [token_id_set[1:len(token_id_set) - 1] for token_id_set in tokenized_segments]

        token_ids = tokenized_segments[lca_shift]
        if not add_special_tokens:
            token_ids = token_ids[1:len(token_ids) - 1]

        return token_ids

    def decode(self, ids):
        """Convert token IDs back to tokens and join them into one string."""
        tokens = self.convert_ids_to_tokens(ids)
        return ''.join(tokens)

    def batch_decode(self, token_ids_list: List[List[int]], **kwargs) -> List[str]:
        """
        Decodes multiple token ID sequences back into their original sequences.

        Args:
            token_ids_list (List[List[int]]): List of token ID sequences.

        Returns:
            List[str]: List of decoded sequences.

        Usage Example:
            >>> tokenizer = ProkBERTTokenizer(...)
            >>> ids = [[2, 213, 3343, 165, 2580, 248, 3905, 978, 3296, 3]]
            >>> sequences = tokenizer.batch_decode(ids)
            >>> print(sequences)
            ...
        """
        return [self.decode(token_ids) for token_ids in token_ids_list]
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "N"
7
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "cls_token": "[CLS]",
4
+ "model_max_length": 1000000000000000019884624838656,
5
+ "tokenizer_class": "ProkBERTTokenizer"
6
+ }
vocab.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ [PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\nAAAAAA\nAAAAAC\nAAAAAG\nAAAAAT\nAAAACA\nAAAACC\nAAAACG\nAAAACT\nAAAAGA\nAAAAGC\nAAAAGG\nAAAAGT\nAAAATA\nAAAATC\nAAAATG\nAAAATT\nAAACAA\nAAACAC\nAAACAG\nAAACAT\nAAACCA\nAAACCC\nAAACCG\nAAACCT\nAAACGA\nAAACGC\nAAACGG\nAAACGT\nAAACTA\nAAACTC\nAAACTG\nAAACTT\nAAAGAA\nAAAGAC\nAAAGAG\nAAAGAT\nAAAGCA\nAAAGCC\nAAAGCG\nAAAGCT\nAAAGGA\nAAAGGC\nAAAGGG\nAAAGGT\nAAAGTA\nAAAGTC\nAAAGTG\nAAAGTT\nAAATAA\nAAATAC\nAAATAG\nAAATAT\nAAATCA\nAAATCC\nAAATCG\nAAATCT\nAAATGA\nAAATGC\nAAATGG\nAAATGT\nAAATTA\nAAATTC\nAAATTG\nAAATTT\nAACAAA\nAACAAC\nAACAAG\nAACAAT\nAACACA\nAACACC\nAACACG\nAACACT\nAACAGA\nAACAGC\nAACAGG\nAACAGT\nAACATA\nAACATC\nAACATG\nAACATT\nAACCAA\nAACCAC\nAACCAG\nAACCAT\nAACCCA\nAACCCC\nAACCCG\nAACCCT\nAACCGA\nAACCGC\nAACCGG\nAACCGT\nAACCTA\nAACCTC\nAACCTG\nAACCTT\nAACGAA\nAACGAC\nAACGAG\nAACGAT\nAACGCA\nAACGCC\nAACGCG\nAACGCT\nAACGGA\nAACGGC\nAACGGG\nAACGGT\nAACGTA\nAACGTC\nAACGTG\nAACGTT\nAACTAA\nAACTAC\nAACTAG\nAACTAT\nAACTCA\nAACTCC\nAACTCG\nAACTCT\nAACTGA\nAACTGC\nAACTGG\nAACTGT\nAACTTA\nAACTTC\nAACTTG\nAACTTT\nAAGAAA\nAAGAAC\nAAGAAG\nAAGAAT\nAAGACA\nAAGACC\nAAGACG\nAAGACT\nAAGAGA\nAAGAGC\nAAGAGG\nAAGAGT\nAAGATA\nAAGATC\nAAGATG\nAAGATT\nAAGCAA\nAAGCAC\nAAGCAG\nAAGCAT\nAAGCCA\nAAGCCC\nAAGCCG\nAAGCCT\nAAGCGA\nAAGCGC\nAAGCGG\nAAGCGT\nAAGCTA\nAAGCTC\nAAGCTG\nAAGCTT\nAAGGAA\nAAGGAC\nAAGGAG\nAAGGAT\nAAGGCA\nAAGGCC\nAAGGCG\nAAGGCT\nAAGGGA\nAAGGGC\nAAGGGG\nAAGGGT\nAAGGTA\nAAGGTC\nAAGGTG\nAAGGTT\nAAGTAA\nAAGTAC\nAAGTAG\nAAGTAT\nAAGTCA\nAAGTCC\nAAGTCG\nAAGTCT\nAAGTGA\nAAGTGC\nAAGTGG\nAAGTGT\nAAGTTA\nAAGTTC\nAAGTTG\nAAGTTT\nAATAAA\nAATAAC\nAATAAG\nAATAAT\nAATACA\nAATACC\nAATACG\nAATACT\nAATAGA\nAATAGC\nAATAGG\nAATAGT\nAATATA\nAATATC\nAATATG\nAATATT\nAATCAA\nAATCAC\nAATCAG\nAATCAT\nAATCCA\nAATCCC\nAATCCG\nAATCCT\nAATCGA\nAATCGC\nAATCGG\nAATCGT\nAATCTA\nAATCTC\nAATCTG\nAATCTT\nAATGAA\nAATGAC\nAATGAG\nAATGAT\nAATGCA\nAATGCC\nAATGCG\nAATGCT\nAATGGA\nAATGGC\nAATGGG\nAATGGT\nAATGTA\nAATGTC\nAATGTG\nAATGTT\nAATTAA\nAATTAC\nAATTAG\nAATTAT\nAATTCA\nAA
TTCC\nAATTCG\nAATTCT\nAATTGA\nAATTGC\nAATTGG\nAATTGT\nAATTTA\nAATTTC\nAATTTG\nAATTTT\nACAAAA\nACAAAC\nACAAAG\nACAAAT\nACAACA\nACAACC\nACAACG\nACAACT\nACAAGA\nACAAGC\nACAAGG\nACAAGT\nACAATA\nACAATC\nACAATG\nACAATT\nACACAA\nACACAC\nACACAG\nACACAT\nACACCA\nACACCC\nACACCG\nACACCT\nACACGA\nACACGC\nACACGG\nACACGT\nACACTA\nACACTC\nACACTG\nACACTT\nACAGAA\nACAGAC\nACAGAG\nACAGAT\nACAGCA\nACAGCC\nACAGCG\nACAGCT\nACAGGA\nACAGGC\nACAGGG\nACAGGT\nACAGTA\nACAGTC\nACAGTG\nACAGTT\nACATAA\nACATAC\nACATAG\nACATAT\nACATCA\nACATCC\nACATCG\nACATCT\nACATGA\nACATGC\nACATGG\nACATGT\nACATTA\nACATTC\nACATTG\nACATTT\nACCAAA\nACCAAC\nACCAAG\nACCAAT\nACCACA\nACCACC\nACCACG\nACCACT\nACCAGA\nACCAGC\nACCAGG\nACCAGT\nACCATA\nACCATC\nACCATG\nACCATT\nACCCAA\nACCCAC\nACCCAG\nACCCAT\nACCCCA\nACCCCC\nACCCCG\nACCCCT\nACCCGA\nACCCGC\nACCCGG\nACCCGT\nACCCTA\nACCCTC\nACCCTG\nACCCTT\nACCGAA\nACCGAC\nACCGAG\nACCGAT\nACCGCA\nACCGCC\nACCGCG\nACCGCT\nACCGGA\nACCGGC\nACCGGG\nACCGGT\nACCGTA\nACCGTC\nACCGTG\nACCGTT\nACCTAA\nACCTAC\nACCTAG\nACCTAT\nACCTCA\nACCTCC\nACCTCG\nACCTCT\nACCTGA\nACCTGC\nACCTGG\nACCTGT\nACCTTA\nACCTTC\nACCTTG\nACCTTT\nACGAAA\nACGAAC\nACGAAG\nACGAAT\nACGACA\nACGACC\nACGACG\nACGACT\nACGAGA\nACGAGC\nACGAGG\nACGAGT\nACGATA\nACGATC\nACGATG\nACGATT\nACGCAA\nACGCAC\nACGCAG\nACGCAT\nACGCCA\nACGCCC\nACGCCG\nACGCCT\nACGCGA\nACGCGC\nACGCGG\nACGCGT\nACGCTA\nACGCTC\nACGCTG\nACGCTT\nACGGAA\nACGGAC\nACGGAG\nACGGAT\nACGGCA\nACGGCC\nACGGCG\nACGGCT\nACGGGA\nACGGGC\nACGGGG\nACGGGT\nACGGTA\nACGGTC\nACGGTG\nACGGTT\nACGTAA\nACGTAC\nACGTAG\nACGTAT\nACGTCA\nACGTCC\nACGTCG\nACGTCT\nACGTGA\nACGTGC\nACGTGG\nACGTGT\nACGTTA\nACGTTC\nACGTTG\nACGTTT\nACTAAA\nACTAAC\nACTAAG\nACTAAT\nACTACA\nACTACC\nACTACG\nACTACT\nACTAGA\nACTAGC\nACTAGG\nACTAGT\nACTATA\nACTATC\nACTATG\nACTATT\nACTCAA\nACTCAC\nACTCAG\nACTCAT\nACTCCA\nACTCCC\nACTCCG\nACTCCT\nACTCGA\nACTCGC\nACTCGG\nACTCGT\nACTCTA\nACTCTC\nACTCTG\nACTCTT\nACTGAA\nACTGAC\nACTGAG\nACTGAT\nACTGCA\nACTGCC\nACTGCG\nACTGCT\nACTGGA\nACTGGC\nACTGGG\nACTGGT\nACTGTA\nACTGTC\nACTGTG\nAC
TGTT\nACTTAA\nACTTAC\nACTTAG\nACTTAT\nACTTCA\nACTTCC\nACTTCG\nACTTCT\nACTTGA\nACTTGC\nACTTGG\nACTTGT\nACTTTA\nACTTTC\nACTTTG\nACTTTT\nAGAAAA\nAGAAAC\nAGAAAG\nAGAAAT\nAGAACA\nAGAACC\nAGAACG\nAGAACT\nAGAAGA\nAGAAGC\nAGAAGG\nAGAAGT\nAGAATA\nAGAATC\nAGAATG\nAGAATT\nAGACAA\nAGACAC\nAGACAG\nAGACAT\nAGACCA\nAGACCC\nAGACCG\nAGACCT\nAGACGA\nAGACGC\nAGACGG\nAGACGT\nAGACTA\nAGACTC\nAGACTG\nAGACTT\nAGAGAA\nAGAGAC\nAGAGAG\nAGAGAT\nAGAGCA\nAGAGCC\nAGAGCG\nAGAGCT\nAGAGGA\nAGAGGC\nAGAGGG\nAGAGGT\nAGAGTA\nAGAGTC\nAGAGTG\nAGAGTT\nAGATAA\nAGATAC\nAGATAG\nAGATAT\nAGATCA\nAGATCC\nAGATCG\nAGATCT\nAGATGA\nAGATGC\nAGATGG\nAGATGT\nAGATTA\nAGATTC\nAGATTG\nAGATTT\nAGCAAA\nAGCAAC\nAGCAAG\nAGCAAT\nAGCACA\nAGCACC\nAGCACG\nAGCACT\nAGCAGA\nAGCAGC\nAGCAGG\nAGCAGT\nAGCATA\nAGCATC\nAGCATG\nAGCATT\nAGCCAA\nAGCCAC\nAGCCAG\nAGCCAT\nAGCCCA\nAGCCCC\nAGCCCG\nAGCCCT\nAGCCGA\nAGCCGC\nAGCCGG\nAGCCGT\nAGCCTA\nAGCCTC\nAGCCTG\nAGCCTT\nAGCGAA\nAGCGAC\nAGCGAG\nAGCGAT\nAGCGCA\nAGCGCC\nAGCGCG\nAGCGCT\nAGCGGA\nAGCGGC\nAGCGGG\nAGCGGT\nAGCGTA\nAGCGTC\nAGCGTG\nAGCGTT\nAGCTAA\nAGCTAC\nAGCTAG\nAGCTAT\nAGCTCA\nAGCTCC\nAGCTCG\nAGCTCT\nAGCTGA\nAGCTGC\nAGCTGG\nAGCTGT\nAGCTTA\nAGCTTC\nAGCTTG\nAGCTTT\nAGGAAA\nAGGAAC\nAGGAAG\nAGGAAT\nAGGACA\nAGGACC\nAGGACG\nAGGACT\nAGGAGA\nAGGAGC\nAGGAGG\nAGGAGT\nAGGATA\nAGGATC\nAGGATG\nAGGATT\nAGGCAA\nAGGCAC\nAGGCAG\nAGGCAT\nAGGCCA\nAGGCCC\nAGGCCG\nAGGCCT\nAGGCGA\nAGGCGC\nAGGCGG\nAGGCGT\nAGGCTA\nAGGCTC\nAGGCTG\nAGGCTT\nAGGGAA\nAGGGAC\nAGGGAG\nAGGGAT\nAGGGCA\nAGGGCC\nAGGGCG\nAGGGCT\nAGGGGA\nAGGGGC\nAGGGGG\nAGGGGT\nAGGGTA\nAGGGTC\nAGGGTG\nAGGGTT\nAGGTAA\nAGGTAC\nAGGTAG\nAGGTAT\nAGGTCA\nAGGTCC\nAGGTCG\nAGGTCT\nAGGTGA\nAGGTGC\nAGGTGG\nAGGTGT\nAGGTTA\nAGGTTC\nAGGTTG\nAGGTTT\nAGTAAA\nAGTAAC\nAGTAAG\nAGTAAT\nAGTACA\nAGTACC\nAGTACG\nAGTACT\nAGTAGA\nAGTAGC\nAGTAGG\nAGTAGT\nAGTATA\nAGTATC\nAGTATG\nAGTATT\nAGTCAA\nAGTCAC\nAGTCAG\nAGTCAT\nAGTCCA\nAGTCCC\nAGTCCG\nAGTCCT\nAGTCGA\nAGTCGC\nAGTCGG\nAGTCGT\nAGTCTA\nAGTCTC\nAGTCTG\nAGTCTT\nAGTGAA\nAGTGAC\nAGTGAG\nAGTGAT\nAGTGCA\nAGTGCC\nAGTGCG\nAGTGCT\nAGTGGA\nAG
TGGC\nAGTGGG\nAGTGGT\nAGTGTA\nAGTGTC\nAGTGTG\nAGTGTT\nAGTTAA\nAGTTAC\nAGTTAG\nAGTTAT\nAGTTCA\nAGTTCC\nAGTTCG\nAGTTCT\nAGTTGA\nAGTTGC\nAGTTGG\nAGTTGT\nAGTTTA\nAGTTTC\nAGTTTG\nAGTTTT\nATAAAA\nATAAAC\nATAAAG\nATAAAT\nATAACA\nATAACC\nATAACG\nATAACT\nATAAGA\nATAAGC\nATAAGG\nATAAGT\nATAATA\nATAATC\nATAATG\nATAATT\nATACAA\nATACAC\nATACAG\nATACAT\nATACCA\nATACCC\nATACCG\nATACCT\nATACGA\nATACGC\nATACGG\nATACGT\nATACTA\nATACTC\nATACTG\nATACTT\nATAGAA\nATAGAC\nATAGAG\nATAGAT\nATAGCA\nATAGCC\nATAGCG\nATAGCT\nATAGGA\nATAGGC\nATAGGG\nATAGGT\nATAGTA\nATAGTC\nATAGTG\nATAGTT\nATATAA\nATATAC\nATATAG\nATATAT\nATATCA\nATATCC\nATATCG\nATATCT\nATATGA\nATATGC\nATATGG\nATATGT\nATATTA\nATATTC\nATATTG\nATATTT\nATCAAA\nATCAAC\nATCAAG\nATCAAT\nATCACA\nATCACC\nATCACG\nATCACT\nATCAGA\nATCAGC\nATCAGG\nATCAGT\nATCATA\nATCATC\nATCATG\nATCATT\nATCCAA\nATCCAC\nATCCAG\nATCCAT\nATCCCA\nATCCCC\nATCCCG\nATCCCT\nATCCGA\nATCCGC\nATCCGG\nATCCGT\nATCCTA\nATCCTC\nATCCTG\nATCCTT\nATCGAA\nATCGAC\nATCGAG\nATCGAT\nATCGCA\nATCGCC\nATCGCG\nATCGCT\nATCGGA\nATCGGC\nATCGGG\nATCGGT\nATCGTA\nATCGTC\nATCGTG\nATCGTT\nATCTAA\nATCTAC\nATCTAG\nATCTAT\nATCTCA\nATCTCC\nATCTCG\nATCTCT\nATCTGA\nATCTGC\nATCTGG\nATCTGT\nATCTTA\nATCTTC\nATCTTG\nATCTTT\nATGAAA\nATGAAC\nATGAAG\nATGAAT\nATGACA\nATGACC\nATGACG\nATGACT\nATGAGA\nATGAGC\nATGAGG\nATGAGT\nATGATA\nATGATC\nATGATG\nATGATT\nATGCAA\nATGCAC\nATGCAG\nATGCAT\nATGCCA\nATGCCC\nATGCCG\nATGCCT\nATGCGA\nATGCGC\nATGCGG\nATGCGT\nATGCTA\nATGCTC\nATGCTG\nATGCTT\nATGGAA\nATGGAC\nATGGAG\nATGGAT\nATGGCA\nATGGCC\nATGGCG\nATGGCT\nATGGGA\nATGGGC\nATGGGG\nATGGGT\nATGGTA\nATGGTC\nATGGTG\nATGGTT\nATGTAA\nATGTAC\nATGTAG\nATGTAT\nATGTCA\nATGTCC\nATGTCG\nATGTCT\nATGTGA\nATGTGC\nATGTGG\nATGTGT\nATGTTA\nATGTTC\nATGTTG\nATGTTT\nATTAAA\nATTAAC\nATTAAG\nATTAAT\nATTACA\nATTACC\nATTACG\nATTACT\nATTAGA\nATTAGC\nATTAGG\nATTAGT\nATTATA\nATTATC\nATTATG\nATTATT\nATTCAA\nATTCAC\nATTCAG\nATTCAT\nATTCCA\nATTCCC\nATTCCG\nATTCCT\nATTCGA\nATTCGC\nATTCGG\nATTCGT\nATTCTA\nATTCTC\nATTCTG\nATTCTT\nATTGAA\nATTGAC\nATTGAG\nAT
TGAT\nATTGCA\nATTGCC\nATTGCG\nATTGCT\nATTGGA\nATTGGC\nATTGGG\nATTGGT\nATTGTA\nATTGTC\nATTGTG\nATTGTT\nATTTAA\nATTTAC\nATTTAG\nATTTAT\nATTTCA\nATTTCC\nATTTCG\nATTTCT\nATTTGA\nATTTGC\nATTTGG\nATTTGT\nATTTTA\nATTTTC\nATTTTG\nATTTTT\nCAAAAA\nCAAAAC\nCAAAAG\nCAAAAT\nCAAACA\nCAAACC\nCAAACG\nCAAACT\nCAAAGA\nCAAAGC\nCAAAGG\nCAAAGT\nCAAATA\nCAAATC\nCAAATG\nCAAATT\nCAACAA\nCAACAC\nCAACAG\nCAACAT\nCAACCA\nCAACCC\nCAACCG\nCAACCT\nCAACGA\nCAACGC\nCAACGG\nCAACGT\nCAACTA\nCAACTC\nCAACTG\nCAACTT\nCAAGAA\nCAAGAC\nCAAGAG\nCAAGAT\nCAAGCA\nCAAGCC\nCAAGCG\nCAAGCT\nCAAGGA\nCAAGGC\nCAAGGG\nCAAGGT\nCAAGTA\nCAAGTC\nCAAGTG\nCAAGTT\nCAATAA\nCAATAC\nCAATAG\nCAATAT\nCAATCA\nCAATCC\nCAATCG\nCAATCT\nCAATGA\nCAATGC\nCAATGG\nCAATGT\nCAATTA\nCAATTC\nCAATTG\nCAATTT\nCACAAA\nCACAAC\nCACAAG\nCACAAT\nCACACA\nCACACC\nCACACG\nCACACT\nCACAGA\nCACAGC\nCACAGG\nCACAGT\nCACATA\nCACATC\nCACATG\nCACATT\nCACCAA\nCACCAC\nCACCAG\nCACCAT\nCACCCA\nCACCCC\nCACCCG\nCACCCT\nCACCGA\nCACCGC\nCACCGG\nCACCGT\nCACCTA\nCACCTC\nCACCTG\nCACCTT\nCACGAA\nCACGAC\nCACGAG\nCACGAT\nCACGCA\nCACGCC\nCACGCG\nCACGCT\nCACGGA\nCACGGC\nCACGGG\nCACGGT\nCACGTA\nCACGTC\nCACGTG\nCACGTT\nCACTAA\nCACTAC\nCACTAG\nCACTAT\nCACTCA\nCACTCC\nCACTCG\nCACTCT\nCACTGA\nCACTGC\nCACTGG\nCACTGT\nCACTTA\nCACTTC\nCACTTG\nCACTTT\nCAGAAA\nCAGAAC\nCAGAAG\nCAGAAT\nCAGACA\nCAGACC\nCAGACG\nCAGACT\nCAGAGA\nCAGAGC\nCAGAGG\nCAGAGT\nCAGATA\nCAGATC\nCAGATG\nCAGATT\nCAGCAA\nCAGCAC\nCAGCAG\nCAGCAT\nCAGCCA\nCAGCCC\nCAGCCG\nCAGCCT\nCAGCGA\nCAGCGC\nCAGCGG\nCAGCGT\nCAGCTA\nCAGCTC\nCAGCTG\nCAGCTT\nCAGGAA\nCAGGAC\nCAGGAG\nCAGGAT\nCAGGCA\nCAGGCC\nCAGGCG\nCAGGCT\nCAGGGA\nCAGGGC\nCAGGGG\nCAGGGT\nCAGGTA\nCAGGTC\nCAGGTG\nCAGGTT\nCAGTAA\nCAGTAC\nCAGTAG\nCAGTAT\nCAGTCA\nCAGTCC\nCAGTCG\nCAGTCT\nCAGTGA\nCAGTGC\nCAGTGG\nCAGTGT\nCAGTTA\nCAGTTC\nCAGTTG\nCAGTTT\nCATAAA\nCATAAC\nCATAAG\nCATAAT\nCATACA\nCATACC\nCATACG\nCATACT\nCATAGA\nCATAGC\nCATAGG\nCATAGT\nCATATA\nCATATC\nCATATG\nCATATT\nCATCAA\nCATCAC\nCATCAG\nCATCAT\nCATCCA\nCATCCC\nCATCCG\nCATCCT\nCATCGA\nCATCGC\nCATCGG\nCATCGT\nCATCTA\nCA
TCTC\nCATCTG\nCATCTT\nCATGAA\nCATGAC\nCATGAG\nCATGAT\nCATGCA\nCATGCC\nCATGCG\nCATGCT\nCATGGA\nCATGGC\nCATGGG\nCATGGT\nCATGTA\nCATGTC\nCATGTG\nCATGTT\nCATTAA\nCATTAC\nCATTAG\nCATTAT\nCATTCA\nCATTCC\nCATTCG\nCATTCT\nCATTGA\nCATTGC\nCATTGG\nCATTGT\nCATTTA\nCATTTC\nCATTTG\nCATTTT\nCCAAAA\nCCAAAC\nCCAAAG\nCCAAAT\nCCAACA\nCCAACC\nCCAACG\nCCAACT\nCCAAGA\nCCAAGC\nCCAAGG\nCCAAGT\nCCAATA\nCCAATC\nCCAATG\nCCAATT\nCCACAA\nCCACAC\nCCACAG\nCCACAT\nCCACCA\nCCACCC\nCCACCG\nCCACCT\nCCACGA\nCCACGC\nCCACGG\nCCACGT\nCCACTA\nCCACTC\nCCACTG\nCCACTT\nCCAGAA\nCCAGAC\nCCAGAG\nCCAGAT\nCCAGCA\nCCAGCC\nCCAGCG\nCCAGCT\nCCAGGA\nCCAGGC\nCCAGGG\nCCAGGT\nCCAGTA\nCCAGTC\nCCAGTG\nCCAGTT\nCCATAA\nCCATAC\nCCATAG\nCCATAT\nCCATCA\nCCATCC\nCCATCG\nCCATCT\nCCATGA\nCCATGC\nCCATGG\nCCATGT\nCCATTA\nCCATTC\nCCATTG\nCCATTT\nCCCAAA\nCCCAAC\nCCCAAG\nCCCAAT\nCCCACA\nCCCACC\nCCCACG\nCCCACT\nCCCAGA\nCCCAGC\nCCCAGG\nCCCAGT\nCCCATA\nCCCATC\nCCCATG\nCCCATT\nCCCCAA\nCCCCAC\nCCCCAG\nCCCCAT\nCCCCCA\nCCCCCC\nCCCCCG\nCCCCCT\nCCCCGA\nCCCCGC\nCCCCGG\nCCCCGT\nCCCCTA\nCCCCTC\nCCCCTG\nCCCCTT\nCCCGAA\nCCCGAC\nCCCGAG\nCCCGAT\nCCCGCA\nCCCGCC\nCCCGCG\nCCCGCT\nCCCGGA\nCCCGGC\nCCCGGG\nCCCGGT\nCCCGTA\nCCCGTC\nCCCGTG\nCCCGTT\nCCCTAA\nCCCTAC\nCCCTAG\nCCCTAT\nCCCTCA\nCCCTCC\nCCCTCG\nCCCTCT\nCCCTGA\nCCCTGC\nCCCTGG\nCCCTGT\nCCCTTA\nCCCTTC\nCCCTTG\nCCCTTT\nCCGAAA\nCCGAAC\nCCGAAG\nCCGAAT\nCCGACA\nCCGACC\nCCGACG\nCCGACT\nCCGAGA\nCCGAGC\nCCGAGG\nCCGAGT\nCCGATA\nCCGATC\nCCGATG\nCCGATT\nCCGCAA\nCCGCAC\nCCGCAG\nCCGCAT\nCCGCCA\nCCGCCC\nCCGCCG\nCCGCCT\nCCGCGA\nCCGCGC\nCCGCGG\nCCGCGT\nCCGCTA\nCCGCTC\nCCGCTG\nCCGCTT\nCCGGAA\nCCGGAC\nCCGGAG\nCCGGAT\nCCGGCA\nCCGGCC\nCCGGCG\nCCGGCT\nCCGGGA\nCCGGGC\nCCGGGG\nCCGGGT\nCCGGTA\nCCGGTC\nCCGGTG\nCCGGTT\nCCGTAA\nCCGTAC\nCCGTAG\nCCGTAT\nCCGTCA\nCCGTCC\nCCGTCG\nCCGTCT\nCCGTGA\nCCGTGC\nCCGTGG\nCCGTGT\nCCGTTA\nCCGTTC\nCCGTTG\nCCGTTT\nCCTAAA\nCCTAAC\nCCTAAG\nCCTAAT\nCCTACA\nCCTACC\nCCTACG\nCCTACT\nCCTAGA\nCCTAGC\nCCTAGG\nCCTAGT\nCCTATA\nCCTATC\nCCTATG\nCCTATT\nCCTCAA\nCCTCAC\nCCTCAG\nCCTCAT\nCCTCCA\nCCTCCC\nCCTCCG\nCC
TCCT\nCCTCGA\nCCTCGC\nCCTCGG\nCCTCGT\nCCTCTA\nCCTCTC\nCCTCTG\nCCTCTT\nCCTGAA\nCCTGAC\nCCTGAG\nCCTGAT\nCCTGCA\nCCTGCC\nCCTGCG\nCCTGCT\nCCTGGA\nCCTGGC\nCCTGGG\nCCTGGT\nCCTGTA\nCCTGTC\nCCTGTG\nCCTGTT\nCCTTAA\nCCTTAC\nCCTTAG\nCCTTAT\nCCTTCA\nCCTTCC\nCCTTCG\nCCTTCT\nCCTTGA\nCCTTGC\nCCTTGG\nCCTTGT\nCCTTTA\nCCTTTC\nCCTTTG\nCCTTTT\nCGAAAA\nCGAAAC\nCGAAAG\nCGAAAT\nCGAACA\nCGAACC\nCGAACG\nCGAACT\nCGAAGA\nCGAAGC\nCGAAGG\nCGAAGT\nCGAATA\nCGAATC\nCGAATG\nCGAATT\nCGACAA\nCGACAC\nCGACAG\nCGACAT\nCGACCA\nCGACCC\nCGACCG\nCGACCT\nCGACGA\nCGACGC\nCGACGG\nCGACGT\nCGACTA\nCGACTC\nCGACTG\nCGACTT\nCGAGAA\nCGAGAC\nCGAGAG\nCGAGAT\nCGAGCA\nCGAGCC\nCGAGCG\nCGAGCT\nCGAGGA\nCGAGGC\nCGAGGG\nCGAGGT\nCGAGTA\nCGAGTC\nCGAGTG\nCGAGTT\nCGATAA\nCGATAC\nCGATAG\nCGATAT\nCGATCA\nCGATCC\nCGATCG\nCGATCT\nCGATGA\nCGATGC\nCGATGG\nCGATGT\nCGATTA\nCGATTC\nCGATTG\nCGATTT\nCGCAAA\nCGCAAC\nCGCAAG\nCGCAAT\nCGCACA\nCGCACC\nCGCACG\nCGCACT\nCGCAGA\nCGCAGC\nCGCAGG\nCGCAGT\nCGCATA\nCGCATC\nCGCATG\nCGCATT\nCGCCAA\nCGCCAC\nCGCCAG\nCGCCAT\nCGCCCA\nCGCCCC\nCGCCCG\nCGCCCT\nCGCCGA\nCGCCGC\nCGCCGG\nCGCCGT\nCGCCTA\nCGCCTC\nCGCCTG\nCGCCTT\nCGCGAA\nCGCGAC\nCGCGAG\nCGCGAT\nCGCGCA\nCGCGCC\nCGCGCG\nCGCGCT\nCGCGGA\nCGCGGC\nCGCGGG\nCGCGGT\nCGCGTA\nCGCGTC\nCGCGTG\nCGCGTT\nCGCTAA\nCGCTAC\nCGCTAG\nCGCTAT\nCGCTCA\nCGCTCC\nCGCTCG\nCGCTCT\nCGCTGA\nCGCTGC\nCGCTGG\nCGCTGT\nCGCTTA\nCGCTTC\nCGCTTG\nCGCTTT\nCGGAAA\nCGGAAC\nCGGAAG\nCGGAAT\nCGGACA\nCGGACC\nCGGACG\nCGGACT\nCGGAGA\nCGGAGC\nCGGAGG\nCGGAGT\nCGGATA\nCGGATC\nCGGATG\nCGGATT\nCGGCAA\nCGGCAC\nCGGCAG\nCGGCAT\nCGGCCA\nCGGCCC\nCGGCCG\nCGGCCT\nCGGCGA\nCGGCGC\nCGGCGG\nCGGCGT\nCGGCTA\nCGGCTC\nCGGCTG\nCGGCTT\nCGGGAA\nCGGGAC\nCGGGAG\nCGGGAT\nCGGGCA\nCGGGCC\nCGGGCG\nCGGGCT\nCGGGGA\nCGGGGC\nCGGGGG\nCGGGGT\nCGGGTA\nCGGGTC\nCGGGTG\nCGGGTT\nCGGTAA\nCGGTAC\nCGGTAG\nCGGTAT\nCGGTCA\nCGGTCC\nCGGTCG\nCGGTCT\nCGGTGA\nCGGTGC\nCGGTGG\nCGGTGT\nCGGTTA\nCGGTTC\nCGGTTG\nCGGTTT\nCGTAAA\nCGTAAC\nCGTAAG\nCGTAAT\nCGTACA\nCGTACC\nCGTACG\nCGTACT\nCGTAGA\nCGTAGC\nCGTAGG\nCGTAGT\nCGTATA\nCGTATC\nCGTATG\nCGTATT\nCGTCAA\nCG
TCAC\nCGTCAG\nCGTCAT\nCGTCCA\nCGTCCC\nCGTCCG\nCGTCCT\nCGTCGA\nCGTCGC\nCGTCGG\nCGTCGT\nCGTCTA\nCGTCTC\nCGTCTG\nCGTCTT\nCGTGAA\nCGTGAC\nCGTGAG\nCGTGAT\nCGTGCA\nCGTGCC\nCGTGCG\nCGTGCT\nCGTGGA\nCGTGGC\nCGTGGG\nCGTGGT\nCGTGTA\nCGTGTC\nCGTGTG\nCGTGTT\nCGTTAA\nCGTTAC\nCGTTAG\nCGTTAT\nCGTTCA\nCGTTCC\nCGTTCG\nCGTTCT\nCGTTGA\nCGTTGC\nCGTTGG\nCGTTGT\nCGTTTA\nCGTTTC\nCGTTTG\nCGTTTT\nCTAAAA\nCTAAAC\nCTAAAG\nCTAAAT\nCTAACA\nCTAACC\nCTAACG\nCTAACT\nCTAAGA\nCTAAGC\nCTAAGG\nCTAAGT\nCTAATA\nCTAATC\nCTAATG\nCTAATT\nCTACAA\nCTACAC\nCTACAG\nCTACAT\nCTACCA\nCTACCC\nCTACCG\nCTACCT\nCTACGA\nCTACGC\nCTACGG\nCTACGT\nCTACTA\nCTACTC\nCTACTG\nCTACTT\nCTAGAA\nCTAGAC\nCTAGAG\nCTAGAT\nCTAGCA\nCTAGCC\nCTAGCG\nCTAGCT\nCTAGGA\nCTAGGC\nCTAGGG\nCTAGGT\nCTAGTA\nCTAGTC\nCTAGTG\nCTAGTT\nCTATAA\nCTATAC\nCTATAG\nCTATAT\nCTATCA\nCTATCC\nCTATCG\nCTATCT\nCTATGA\nCTATGC\nCTATGG\nCTATGT\nCTATTA\nCTATTC\nCTATTG\nCTATTT\nCTCAAA\nCTCAAC\nCTCAAG\nCTCAAT\nCTCACA\nCTCACC\nCTCACG\nCTCACT\nCTCAGA\nCTCAGC\nCTCAGG\nCTCAGT\nCTCATA\nCTCATC\nCTCATG\nCTCATT\nCTCCAA\nCTCCAC\nCTCCAG\nCTCCAT\nCTCCCA\nCTCCCC\nCTCCCG\nCTCCCT\nCTCCGA\nCTCCGC\nCTCCGG\nCTCCGT\nCTCCTA\nCTCCTC\nCTCCTG\nCTCCTT\nCTCGAA\nCTCGAC\nCTCGAG\nCTCGAT\nCTCGCA\nCTCGCC\nCTCGCG\nCTCGCT\nCTCGGA\nCTCGGC\nCTCGGG\nCTCGGT\nCTCGTA\nCTCGTC\nCTCGTG\nCTCGTT\nCTCTAA\nCTCTAC\nCTCTAG\nCTCTAT\nCTCTCA\nCTCTCC\nCTCTCG\nCTCTCT\nCTCTGA\nCTCTGC\nCTCTGG\nCTCTGT\nCTCTTA\nCTCTTC\nCTCTTG\nCTCTTT\nCTGAAA\nCTGAAC\nCTGAAG\nCTGAAT\nCTGACA\nCTGACC\nCTGACG\nCTGACT\nCTGAGA\nCTGAGC\nCTGAGG\nCTGAGT\nCTGATA\nCTGATC\nCTGATG\nCTGATT\nCTGCAA\nCTGCAC\nCTGCAG\nCTGCAT\nCTGCCA\nCTGCCC\nCTGCCG\nCTGCCT\nCTGCGA\nCTGCGC\nCTGCGG\nCTGCGT\nCTGCTA\nCTGCTC\nCTGCTG\nCTGCTT\nCTGGAA\nCTGGAC\nCTGGAG\nCTGGAT\nCTGGCA\nCTGGCC\nCTGGCG\nCTGGCT\nCTGGGA\nCTGGGC\nCTGGGG\nCTGGGT\nCTGGTA\nCTGGTC\nCTGGTG\nCTGGTT\nCTGTAA\nCTGTAC\nCTGTAG\nCTGTAT\nCTGTCA\nCTGTCC\nCTGTCG\nCTGTCT\nCTGTGA\nCTGTGC\nCTGTGG\nCTGTGT\nCTGTTA\nCTGTTC\nCTGTTG\nCTGTTT\nCTTAAA\nCTTAAC\nCTTAAG\nCTTAAT\nCTTACA\nCTTACC\nCTTACG\nCTTACT\nCTTAGA\nCTTAGC\nCTTAGG\nCT
TAGT\nCTTATA\nCTTATC\nCTTATG\nCTTATT\nCTTCAA\nCTTCAC\nCTTCAG\nCTTCAT\nCTTCCA\nCTTCCC\nCTTCCG\nCTTCCT\nCTTCGA\nCTTCGC\nCTTCGG\nCTTCGT\nCTTCTA\nCTTCTC\nCTTCTG\nCTTCTT\nCTTGAA\nCTTGAC\nCTTGAG\nCTTGAT\nCTTGCA\nCTTGCC\nCTTGCG\nCTTGCT\nCTTGGA\nCTTGGC\nCTTGGG\nCTTGGT\nCTTGTA\nCTTGTC\nCTTGTG\nCTTGTT\nCTTTAA\nCTTTAC\nCTTTAG\nCTTTAT\nCTTTCA\nCTTTCC\nCTTTCG\nCTTTCT\nCTTTGA\nCTTTGC\nCTTTGG\nCTTTGT\nCTTTTA\nCTTTTC\nCTTTTG\nCTTTTT\nGAAAAA\nGAAAAC\nGAAAAG\nGAAAAT\nGAAACA\nGAAACC\nGAAACG\nGAAACT\nGAAAGA\nGAAAGC\nGAAAGG\nGAAAGT\nGAAATA\nGAAATC\nGAAATG\nGAAATT\nGAACAA\nGAACAC\nGAACAG\nGAACAT\nGAACCA\nGAACCC\nGAACCG\nGAACCT\nGAACGA\nGAACGC\nGAACGG\nGAACGT\nGAACTA\nGAACTC\nGAACTG\nGAACTT\nGAAGAA\nGAAGAC\nGAAGAG\nGAAGAT\nGAAGCA\nGAAGCC\nGAAGCG\nGAAGCT\nGAAGGA\nGAAGGC\nGAAGGG\nGAAGGT\nGAAGTA\nGAAGTC\nGAAGTG\nGAAGTT\nGAATAA\nGAATAC\nGAATAG\nGAATAT\nGAATCA\nGAATCC\nGAATCG\nGAATCT\nGAATGA\nGAATGC\nGAATGG\nGAATGT\nGAATTA\nGAATTC\nGAATTG\nGAATTT\nGACAAA\nGACAAC\nGACAAG\nGACAAT\nGACACA\nGACACC\nGACACG\nGACACT\nGACAGA\nGACAGC\nGACAGG\nGACAGT\nGACATA\nGACATC\nGACATG\nGACATT\nGACCAA\nGACCAC\nGACCAG\nGACCAT\nGACCCA\nGACCCC\nGACCCG\nGACCCT\nGACCGA\nGACCGC\nGACCGG\nGACCGT\nGACCTA\nGACCTC\nGACCTG\nGACCTT\nGACGAA\nGACGAC\nGACGAG\nGACGAT\nGACGCA\nGACGCC\nGACGCG\nGACGCT\nGACGGA\nGACGGC\nGACGGG\nGACGGT\nGACGTA\nGACGTC\nGACGTG\nGACGTT\nGACTAA\nGACTAC\nGACTAG\nGACTAT\nGACTCA\nGACTCC\nGACTCG\nGACTCT\nGACTGA\nGACTGC\nGACTGG\nGACTGT\nGACTTA\nGACTTC\nGACTTG\nGACTTT\nGAGAAA\nGAGAAC\nGAGAAG\nGAGAAT\nGAGACA\nGAGACC\nGAGACG\nGAGACT\nGAGAGA\nGAGAGC\nGAGAGG\nGAGAGT\nGAGATA\nGAGATC\nGAGATG\nGAGATT\nGAGCAA\nGAGCAC\nGAGCAG\nGAGCAT\nGAGCCA\nGAGCCC\nGAGCCG\nGAGCCT\nGAGCGA\nGAGCGC\nGAGCGG\nGAGCGT\nGAGCTA\nGAGCTC\nGAGCTG\nGAGCTT\nGAGGAA\nGAGGAC\nGAGGAG\nGAGGAT\nGAGGCA\nGAGGCC\nGAGGCG\nGAGGCT\nGAGGGA\nGAGGGC\nGAGGGG\nGAGGGT\nGAGGTA\nGAGGTC\nGAGGTG\nGAGGTT\nGAGTAA\nGAGTAC\nGAGTAG\nGAGTAT\nGAGTCA\nGAGTCC\nGAGTCG\nGAGTCT\nGAGTGA\nGAGTGC\nGAGTGG\nGAGTGT\nGAGTTA\nGAGTTC\nGAGTTG\nGAGTTT\nGATAAA\nGATAAC\nGATAAG\nGATAAT\nGATACA\nGA
TACC\nGATACG\nGATACT\nGATAGA\nGATAGC\nGATAGG\nGATAGT\nGATATA\nGATATC\nGATATG\nGATATT\nGATCAA\nGATCAC\nGATCAG\nGATCAT\nGATCCA\nGATCCC\nGATCCG\nGATCCT\nGATCGA\nGATCGC\nGATCGG\nGATCGT\nGATCTA\nGATCTC\nGATCTG\nGATCTT\nGATGAA\nGATGAC\nGATGAG\nGATGAT\nGATGCA\nGATGCC\nGATGCG\nGATGCT\nGATGGA\nGATGGC\nGATGGG\nGATGGT\nGATGTA\nGATGTC\nGATGTG\nGATGTT\nGATTAA\nGATTAC\nGATTAG\nGATTAT\nGATTCA\nGATTCC\nGATTCG\nGATTCT\nGATTGA\nGATTGC\nGATTGG\nGATTGT\nGATTTA\nGATTTC\nGATTTG\nGATTTT\nGCAAAA\nGCAAAC\nGCAAAG\nGCAAAT\nGCAACA\nGCAACC\nGCAACG\nGCAACT\nGCAAGA\nGCAAGC\nGCAAGG\nGCAAGT\nGCAATA\nGCAATC\nGCAATG\nGCAATT\nGCACAA\nGCACAC\nGCACAG\nGCACAT\nGCACCA\nGCACCC\nGCACCG\nGCACCT\nGCACGA\nGCACGC\nGCACGG\nGCACGT\nGCACTA\nGCACTC\nGCACTG\nGCACTT\nGCAGAA\nGCAGAC\nGCAGAG\nGCAGAT\nGCAGCA\nGCAGCC\nGCAGCG\nGCAGCT\nGCAGGA\nGCAGGC\nGCAGGG\nGCAGGT\nGCAGTA\nGCAGTC\nGCAGTG\nGCAGTT\nGCATAA\nGCATAC\nGCATAG\nGCATAT\nGCATCA\nGCATCC\nGCATCG\nGCATCT\nGCATGA\nGCATGC\nGCATGG\nGCATGT\nGCATTA\nGCATTC\nGCATTG\nGCATTT\nGCCAAA\nGCCAAC\nGCCAAG\nGCCAAT\nGCCACA\nGCCACC\nGCCACG\nGCCACT\nGCCAGA\nGCCAGC\nGCCAGG\nGCCAGT\nGCCATA\nGCCATC\nGCCATG\nGCCATT\nGCCCAA\nGCCCAC\nGCCCAG\nGCCCAT\nGCCCCA\nGCCCCC\nGCCCCG\nGCCCCT\nGCCCGA\nGCCCGC\nGCCCGG\nGCCCGT\nGCCCTA\nGCCCTC\nGCCCTG\nGCCCTT\nGCCGAA\nGCCGAC\nGCCGAG\nGCCGAT\nGCCGCA\nGCCGCC\nGCCGCG\nGCCGCT\nGCCGGA\nGCCGGC\nGCCGGG\nGCCGGT\nGCCGTA\nGCCGTC\nGCCGTG\nGCCGTT\nGCCTAA\nGCCTAC\nGCCTAG\nGCCTAT\nGCCTCA\nGCCTCC\nGCCTCG\nGCCTCT\nGCCTGA\nGCCTGC\nGCCTGG\nGCCTGT\nGCCTTA\nGCCTTC\nGCCTTG\nGCCTTT\nGCGAAA\nGCGAAC\nGCGAAG\nGCGAAT\nGCGACA\nGCGACC\nGCGACG\nGCGACT\nGCGAGA\nGCGAGC\nGCGAGG\nGCGAGT\nGCGATA\nGCGATC\nGCGATG\nGCGATT\nGCGCAA\nGCGCAC\nGCGCAG\nGCGCAT\nGCGCCA\nGCGCCC\nGCGCCG\nGCGCCT\nGCGCGA\nGCGCGC\nGCGCGG\nGCGCGT\nGCGCTA\nGCGCTC\nGCGCTG\nGCGCTT\nGCGGAA\nGCGGAC\nGCGGAG\nGCGGAT\nGCGGCA\nGCGGCC\nGCGGCG\nGCGGCT\nGCGGGA\nGCGGGC\nGCGGGG\nGCGGGT\nGCGGTA\nGCGGTC\nGCGGTG\nGCGGTT\nGCGTAA\nGCGTAC\nGCGTAG\nGCGTAT\nGCGTCA\nGCGTCC\nGCGTCG\nGCGTCT\nGCGTGA\nGCGTGC\nGCGTGG\nGCGTGT\nGCGTTA\nGCGTTC\nGCGTTG\nGC
GTTT\nGCTAAA\nGCTAAC\nGCTAAG\nGCTAAT\nGCTACA\nGCTACC\nGCTACG\nGCTACT\nGCTAGA\nGCTAGC\nGCTAGG\nGCTAGT\nGCTATA\nGCTATC\nGCTATG\nGCTATT\nGCTCAA\nGCTCAC\nGCTCAG\nGCTCAT\nGCTCCA\nGCTCCC\nGCTCCG\nGCTCCT\nGCTCGA\nGCTCGC\nGCTCGG\nGCTCGT\nGCTCTA\nGCTCTC\nGCTCTG\nGCTCTT\nGCTGAA\nGCTGAC\nGCTGAG\nGCTGAT\nGCTGCA\nGCTGCC\nGCTGCG\nGCTGCT\nGCTGGA\nGCTGGC\nGCTGGG\nGCTGGT\nGCTGTA\nGCTGTC\nGCTGTG\nGCTGTT\nGCTTAA\nGCTTAC\nGCTTAG\nGCTTAT\nGCTTCA\nGCTTCC\nGCTTCG\nGCTTCT\nGCTTGA\nGCTTGC\nGCTTGG\nGCTTGT\nGCTTTA\nGCTTTC\nGCTTTG\nGCTTTT\nGGAAAA\nGGAAAC\nGGAAAG\nGGAAAT\nGGAACA\nGGAACC\nGGAACG\nGGAACT\nGGAAGA\nGGAAGC\nGGAAGG\nGGAAGT\nGGAATA\nGGAATC\nGGAATG\nGGAATT\nGGACAA\nGGACAC\nGGACAG\nGGACAT\nGGACCA\nGGACCC\nGGACCG\nGGACCT\nGGACGA\nGGACGC\nGGACGG\nGGACGT\nGGACTA\nGGACTC\nGGACTG\nGGACTT\nGGAGAA\nGGAGAC\nGGAGAG\nGGAGAT\nGGAGCA\nGGAGCC\nGGAGCG\nGGAGCT\nGGAGGA\nGGAGGC\nGGAGGG\nGGAGGT\nGGAGTA\nGGAGTC\nGGAGTG\nGGAGTT\nGGATAA\nGGATAC\nGGATAG\nGGATAT\nGGATCA\nGGATCC\nGGATCG\nGGATCT\nGGATGA\nGGATGC\nGGATGG\nGGATGT\nGGATTA\nGGATTC\nGGATTG\nGGATTT\nGGCAAA\nGGCAAC\nGGCAAG\nGGCAAT\nGGCACA\nGGCACC\nGGCACG\nGGCACT\nGGCAGA\nGGCAGC\nGGCAGG\nGGCAGT\nGGCATA\nGGCATC\nGGCATG\nGGCATT\nGGCCAA\nGGCCAC\nGGCCAG\nGGCCAT\nGGCCCA\nGGCCCC\nGGCCCG\nGGCCCT\nGGCCGA\nGGCCGC\nGGCCGG\nGGCCGT\nGGCCTA\nGGCCTC\nGGCCTG\nGGCCTT\nGGCGAA\nGGCGAC\nGGCGAG\nGGCGAT\nGGCGCA\nGGCGCC\nGGCGCG\nGGCGCT\nGGCGGA\nGGCGGC\nGGCGGG\nGGCGGT\nGGCGTA\nGGCGTC\nGGCGTG\nGGCGTT\nGGCTAA\nGGCTAC\nGGCTAG\nGGCTAT\nGGCTCA\nGGCTCC\nGGCTCG\nGGCTCT\nGGCTGA\nGGCTGC\nGGCTGG\nGGCTGT\nGGCTTA\nGGCTTC\nGGCTTG\nGGCTTT\nGGGAAA\nGGGAAC\nGGGAAG\nGGGAAT\nGGGACA\nGGGACC\nGGGACG\nGGGACT\nGGGAGA\nGGGAGC\nGGGAGG\nGGGAGT\nGGGATA\nGGGATC\nGGGATG\nGGGATT\nGGGCAA\nGGGCAC\nGGGCAG\nGGGCAT\nGGGCCA\nGGGCCC\nGGGCCG\nGGGCCT\nGGGCGA\nGGGCGC\nGGGCGG\nGGGCGT\nGGGCTA\nGGGCTC\nGGGCTG\nGGGCTT\nGGGGAA\nGGGGAC\nGGGGAG\nGGGGAT\nGGGGCA\nGGGGCC\nGGGGCG\nGGGGCT\nGGGGGA\nGGGGGC\nGGGGGG\nGGGGGT\nGGGGTA\nGGGGTC\nGGGGTG\nGGGGTT\nGGGTAA\nGGGTAC\nGGGTAG\nGGGTAT\nGGGTCA\nGGGTCC\nGGGTCG\nGGGTCT\nGGGTGA\nGG
GTGC\nGGGTGG\nGGGTGT\nGGGTTA\nGGGTTC\nGGGTTG\nGGGTTT\nGGTAAA\nGGTAAC\nGGTAAG\nGGTAAT\nGGTACA\nGGTACC\nGGTACG\nGGTACT\nGGTAGA\nGGTAGC\nGGTAGG\nGGTAGT\nGGTATA\nGGTATC\nGGTATG\nGGTATT\nGGTCAA\nGGTCAC\nGGTCAG\nGGTCAT\nGGTCCA\nGGTCCC\nGGTCCG\nGGTCCT\nGGTCGA\nGGTCGC\nGGTCGG\nGGTCGT\nGGTCTA\nGGTCTC\nGGTCTG\nGGTCTT\nGGTGAA\nGGTGAC\nGGTGAG\nGGTGAT\nGGTGCA\nGGTGCC\nGGTGCG\nGGTGCT\nGGTGGA\nGGTGGC\nGGTGGG\nGGTGGT\nGGTGTA\nGGTGTC\nGGTGTG\nGGTGTT\nGGTTAA\nGGTTAC\nGGTTAG\nGGTTAT\nGGTTCA\nGGTTCC\nGGTTCG\nGGTTCT\nGGTTGA\nGGTTGC\nGGTTGG\nGGTTGT\nGGTTTA\nGGTTTC\nGGTTTG\nGGTTTT\nGTAAAA\nGTAAAC\nGTAAAG\nGTAAAT\nGTAACA\nGTAACC\nGTAACG\nGTAACT\nGTAAGA\nGTAAGC\nGTAAGG\nGTAAGT\nGTAATA\nGTAATC\nGTAATG\nGTAATT\nGTACAA\nGTACAC\nGTACAG\nGTACAT\nGTACCA\nGTACCC\nGTACCG\nGTACCT\nGTACGA\nGTACGC\nGTACGG\nGTACGT\nGTACTA\nGTACTC\nGTACTG\nGTACTT\nGTAGAA\nGTAGAC\nGTAGAG\nGTAGAT\nGTAGCA\nGTAGCC\nGTAGCG\nGTAGCT\nGTAGGA\nGTAGGC\nGTAGGG\nGTAGGT\nGTAGTA\nGTAGTC\nGTAGTG\nGTAGTT\nGTATAA\nGTATAC\nGTATAG\nGTATAT\nGTATCA\nGTATCC\nGTATCG\nGTATCT\nGTATGA\nGTATGC\nGTATGG\nGTATGT\nGTATTA\nGTATTC\nGTATTG\nGTATTT\nGTCAAA\nGTCAAC\nGTCAAG\nGTCAAT\nGTCACA\nGTCACC\nGTCACG\nGTCACT\nGTCAGA\nGTCAGC\nGTCAGG\nGTCAGT\nGTCATA\nGTCATC\nGTCATG\nGTCATT\nGTCCAA\nGTCCAC\nGTCCAG\nGTCCAT\nGTCCCA\nGTCCCC\nGTCCCG\nGTCCCT\nGTCCGA\nGTCCGC\nGTCCGG\nGTCCGT\nGTCCTA\nGTCCTC\nGTCCTG\nGTCCTT\nGTCGAA\nGTCGAC\nGTCGAG\nGTCGAT\nGTCGCA\nGTCGCC\nGTCGCG\nGTCGCT\nGTCGGA\nGTCGGC\nGTCGGG\nGTCGGT\nGTCGTA\nGTCGTC\nGTCGTG\nGTCGTT\nGTCTAA\nGTCTAC\nGTCTAG\nGTCTAT\nGTCTCA\nGTCTCC\nGTCTCG\nGTCTCT\nGTCTGA\nGTCTGC\nGTCTGG\nGTCTGT\nGTCTTA\nGTCTTC\nGTCTTG\nGTCTTT\nGTGAAA\nGTGAAC\nGTGAAG\nGTGAAT\nGTGACA\nGTGACC\nGTGACG\nGTGACT\nGTGAGA\nGTGAGC\nGTGAGG\nGTGAGT\nGTGATA\nGTGATC\nGTGATG\nGTGATT\nGTGCAA\nGTGCAC\nGTGCAG\nGTGCAT\nGTGCCA\nGTGCCC\nGTGCCG\nGTGCCT\nGTGCGA\nGTGCGC\nGTGCGG\nGTGCGT\nGTGCTA\nGTGCTC\nGTGCTG\nGTGCTT\nGTGGAA\nGTGGAC\nGTGGAG\nGTGGAT\nGTGGCA\nGTGGCC\nGTGGCG\nGTGGCT\nGTGGGA\nGTGGGC\nGTGGGG\nGTGGGT\nGTGGTA\nGTGGTC\nGTGGTG\nGTGGTT\nGTGTAA\nGTGTAC\nGTGTAG\nGT
GTAT\nGTGTCA\nGTGTCC\nGTGTCG\nGTGTCT\nGTGTGA\nGTGTGC\nGTGTGG\nGTGTGT\nGTGTTA\nGTGTTC\nGTGTTG\nGTGTTT\nGTTAAA\nGTTAAC\nGTTAAG\nGTTAAT\nGTTACA\nGTTACC\nGTTACG\nGTTACT\nGTTAGA\nGTTAGC\nGTTAGG\nGTTAGT\nGTTATA\nGTTATC\nGTTATG\nGTTATT\nGTTCAA\nGTTCAC\nGTTCAG\nGTTCAT\nGTTCCA\nGTTCCC\nGTTCCG\nGTTCCT\nGTTCGA\nGTTCGC\nGTTCGG\nGTTCGT\nGTTCTA\nGTTCTC\nGTTCTG\nGTTCTT\nGTTGAA\nGTTGAC\nGTTGAG\nGTTGAT\nGTTGCA\nGTTGCC\nGTTGCG\nGTTGCT\nGTTGGA\nGTTGGC\nGTTGGG\nGTTGGT\nGTTGTA\nGTTGTC\nGTTGTG\nGTTGTT\nGTTTAA\nGTTTAC\nGTTTAG\nGTTTAT\nGTTTCA\nGTTTCC\nGTTTCG\nGTTTCT\nGTTTGA\nGTTTGC\nGTTTGG\nGTTTGT\nGTTTTA\nGTTTTC\nGTTTTG\nGTTTTT\nTAAAAA\nTAAAAC\nTAAAAG\nTAAAAT\nTAAACA\nTAAACC\nTAAACG\nTAAACT\nTAAAGA\nTAAAGC\nTAAAGG\nTAAAGT\nTAAATA\nTAAATC\nTAAATG\nTAAATT\nTAACAA\nTAACAC\nTAACAG\nTAACAT\nTAACCA\nTAACCC\nTAACCG\nTAACCT\nTAACGA\nTAACGC\nTAACGG\nTAACGT\nTAACTA\nTAACTC\nTAACTG\nTAACTT\nTAAGAA\nTAAGAC\nTAAGAG\nTAAGAT\nTAAGCA\nTAAGCC\nTAAGCG\nTAAGCT\nTAAGGA\nTAAGGC\nTAAGGG\nTAAGGT\nTAAGTA\nTAAGTC\nTAAGTG\nTAAGTT\nTAATAA\nTAATAC\nTAATAG\nTAATAT\nTAATCA\nTAATCC\nTAATCG\nTAATCT\nTAATGA\nTAATGC\nTAATGG\nTAATGT\nTAATTA\nTAATTC\nTAATTG\nTAATTT\nTACAAA\nTACAAC\nTACAAG\nTACAAT\nTACACA\nTACACC\nTACACG\nTACACT\nTACAGA\nTACAGC\nTACAGG\nTACAGT\nTACATA\nTACATC\nTACATG\nTACATT\nTACCAA\nTACCAC\nTACCAG\nTACCAT\nTACCCA\nTACCCC\nTACCCG\nTACCCT\nTACCGA\nTACCGC\nTACCGG\nTACCGT\nTACCTA\nTACCTC\nTACCTG\nTACCTT\nTACGAA\nTACGAC\nTACGAG\nTACGAT\nTACGCA\nTACGCC\nTACGCG\nTACGCT\nTACGGA\nTACGGC\nTACGGG\nTACGGT\nTACGTA\nTACGTC\nTACGTG\nTACGTT\nTACTAA\nTACTAC\nTACTAG\nTACTAT\nTACTCA\nTACTCC\nTACTCG\nTACTCT\nTACTGA\nTACTGC\nTACTGG\nTACTGT\nTACTTA\nTACTTC\nTACTTG\nTACTTT\nTAGAAA\nTAGAAC\nTAGAAG\nTAGAAT\nTAGACA\nTAGACC\nTAGACG\nTAGACT\nTAGAGA\nTAGAGC\nTAGAGG\nTAGAGT\nTAGATA\nTAGATC\nTAGATG\nTAGATT\nTAGCAA\nTAGCAC\nTAGCAG\nTAGCAT\nTAGCCA\nTAGCCC\nTAGCCG\nTAGCCT\nTAGCGA\nTAGCGC\nTAGCGG\nTAGCGT\nTAGCTA\nTAGCTC\nTAGCTG\nTAGCTT\nTAGGAA\nTAGGAC\nTAGGAG\nTAGGAT\nTAGGCA\nTAGGCC\nTAGGCG\nTAGGCT\nTAGGGA\nTAGGGC\nTAGGGG\nTAGGGT\nTAGGTA\nTA
GGTC\nTAGGTG\nTAGGTT\nTAGTAA\nTAGTAC\nTAGTAG\nTAGTAT\nTAGTCA\nTAGTCC\nTAGTCG\nTAGTCT\nTAGTGA\nTAGTGC\nTAGTGG\nTAGTGT\nTAGTTA\nTAGTTC\nTAGTTG\nTAGTTT\nTATAAA\nTATAAC\nTATAAG\nTATAAT\nTATACA\nTATACC\nTATACG\nTATACT\nTATAGA\nTATAGC\nTATAGG\nTATAGT\nTATATA\nTATATC\nTATATG\nTATATT\nTATCAA\nTATCAC\nTATCAG\nTATCAT\nTATCCA\nTATCCC\nTATCCG\nTATCCT\nTATCGA\nTATCGC\nTATCGG\nTATCGT\nTATCTA\nTATCTC\nTATCTG\nTATCTT\nTATGAA\nTATGAC\nTATGAG\nTATGAT\nTATGCA\nTATGCC\nTATGCG\nTATGCT\nTATGGA\nTATGGC\nTATGGG\nTATGGT\nTATGTA\nTATGTC\nTATGTG\nTATGTT\nTATTAA\nTATTAC\nTATTAG\nTATTAT\nTATTCA\nTATTCC\nTATTCG\nTATTCT\nTATTGA\nTATTGC\nTATTGG\nTATTGT\nTATTTA\nTATTTC\nTATTTG\nTATTTT\nTCAAAA\nTCAAAC\nTCAAAG\nTCAAAT\nTCAACA\nTCAACC\nTCAACG\nTCAACT\nTCAAGA\nTCAAGC\nTCAAGG\nTCAAGT\nTCAATA\nTCAATC\nTCAATG\nTCAATT\nTCACAA\nTCACAC\nTCACAG\nTCACAT\nTCACCA\nTCACCC\nTCACCG\nTCACCT\nTCACGA\nTCACGC\nTCACGG\nTCACGT\nTCACTA\nTCACTC\nTCACTG\nTCACTT\nTCAGAA\nTCAGAC\nTCAGAG\nTCAGAT\nTCAGCA\nTCAGCC\nTCAGCG\nTCAGCT\nTCAGGA\nTCAGGC\nTCAGGG\nTCAGGT\nTCAGTA\nTCAGTC\nTCAGTG\nTCAGTT\nTCATAA\nTCATAC\nTCATAG\nTCATAT\nTCATCA\nTCATCC\nTCATCG\nTCATCT\nTCATGA\nTCATGC\nTCATGG\nTCATGT\nTCATTA\nTCATTC\nTCATTG\nTCATTT\nTCCAAA\nTCCAAC\nTCCAAG\nTCCAAT\nTCCACA\nTCCACC\nTCCACG\nTCCACT\nTCCAGA\nTCCAGC\nTCCAGG\nTCCAGT\nTCCATA\nTCCATC\nTCCATG\nTCCATT\nTCCCAA\nTCCCAC\nTCCCAG\nTCCCAT\nTCCCCA\nTCCCCC\nTCCCCG\nTCCCCT\nTCCCGA\nTCCCGC\nTCCCGG\nTCCCGT\nTCCCTA\nTCCCTC\nTCCCTG\nTCCCTT\nTCCGAA\nTCCGAC\nTCCGAG\nTCCGAT\nTCCGCA\nTCCGCC\nTCCGCG\nTCCGCT\nTCCGGA\nTCCGGC\nTCCGGG\nTCCGGT\nTCCGTA\nTCCGTC\nTCCGTG\nTCCGTT\nTCCTAA\nTCCTAC\nTCCTAG\nTCCTAT\nTCCTCA\nTCCTCC\nTCCTCG\nTCCTCT\nTCCTGA\nTCCTGC\nTCCTGG\nTCCTGT\nTCCTTA\nTCCTTC\nTCCTTG\nTCCTTT\nTCGAAA\nTCGAAC\nTCGAAG\nTCGAAT\nTCGACA\nTCGACC\nTCGACG\nTCGACT\nTCGAGA\nTCGAGC\nTCGAGG\nTCGAGT\nTCGATA\nTCGATC\nTCGATG\nTCGATT\nTCGCAA\nTCGCAC\nTCGCAG\nTCGCAT\nTCGCCA\nTCGCCC\nTCGCCG\nTCGCCT\nTCGCGA\nTCGCGC\nTCGCGG\nTCGCGT\nTCGCTA\nTCGCTC\nTCGCTG\nTCGCTT\nTCGGAA\nTCGGAC\nTCGGAG\nTCGGAT\nTCGGCA\nTCGGCC\nTCGGCG\nTC
GGCT\nTCGGGA\nTCGGGC\nTCGGGG\nTCGGGT\nTCGGTA\nTCGGTC\nTCGGTG\nTCGGTT\nTCGTAA\nTCGTAC\nTCGTAG\nTCGTAT\nTCGTCA\nTCGTCC\nTCGTCG\nTCGTCT\nTCGTGA\nTCGTGC\nTCGTGG\nTCGTGT\nTCGTTA\nTCGTTC\nTCGTTG\nTCGTTT\nTCTAAA\nTCTAAC\nTCTAAG\nTCTAAT\nTCTACA\nTCTACC\nTCTACG\nTCTACT\nTCTAGA\nTCTAGC\nTCTAGG\nTCTAGT\nTCTATA\nTCTATC\nTCTATG\nTCTATT\nTCTCAA\nTCTCAC\nTCTCAG\nTCTCAT\nTCTCCA\nTCTCCC\nTCTCCG\nTCTCCT\nTCTCGA\nTCTCGC\nTCTCGG\nTCTCGT\nTCTCTA\nTCTCTC\nTCTCTG\nTCTCTT\nTCTGAA\nTCTGAC\nTCTGAG\nTCTGAT\nTCTGCA\nTCTGCC\nTCTGCG\nTCTGCT\nTCTGGA\nTCTGGC\nTCTGGG\nTCTGGT\nTCTGTA\nTCTGTC\nTCTGTG\nTCTGTT\nTCTTAA\nTCTTAC\nTCTTAG\nTCTTAT\nTCTTCA\nTCTTCC\nTCTTCG\nTCTTCT\nTCTTGA\nTCTTGC\nTCTTGG\nTCTTGT\nTCTTTA\nTCTTTC\nTCTTTG\nTCTTTT\nTGAAAA\nTGAAAC\nTGAAAG\nTGAAAT\nTGAACA\nTGAACC\nTGAACG\nTGAACT\nTGAAGA\nTGAAGC\nTGAAGG\nTGAAGT\nTGAATA\nTGAATC\nTGAATG\nTGAATT\nTGACAA\nTGACAC\nTGACAG\nTGACAT\nTGACCA\nTGACCC\nTGACCG\nTGACCT\nTGACGA\nTGACGC\nTGACGG\nTGACGT\nTGACTA\nTGACTC\nTGACTG\nTGACTT\nTGAGAA\nTGAGAC\nTGAGAG\nTGAGAT\nTGAGCA\nTGAGCC\nTGAGCG\nTGAGCT\nTGAGGA\nTGAGGC\nTGAGGG\nTGAGGT\nTGAGTA\nTGAGTC\nTGAGTG\nTGAGTT\nTGATAA\nTGATAC\nTGATAG\nTGATAT\nTGATCA\nTGATCC\nTGATCG\nTGATCT\nTGATGA\nTGATGC\nTGATGG\nTGATGT\nTGATTA\nTGATTC\nTGATTG\nTGATTT\nTGCAAA\nTGCAAC\nTGCAAG\nTGCAAT\nTGCACA\nTGCACC\nTGCACG\nTGCACT\nTGCAGA\nTGCAGC\nTGCAGG\nTGCAGT\nTGCATA\nTGCATC\nTGCATG\nTGCATT\nTGCCAA\nTGCCAC\nTGCCAG\nTGCCAT\nTGCCCA\nTGCCCC\nTGCCCG\nTGCCCT\nTGCCGA\nTGCCGC\nTGCCGG\nTGCCGT\nTGCCTA\nTGCCTC\nTGCCTG\nTGCCTT\nTGCGAA\nTGCGAC\nTGCGAG\nTGCGAT\nTGCGCA\nTGCGCC\nTGCGCG\nTGCGCT\nTGCGGA\nTGCGGC\nTGCGGG\nTGCGGT\nTGCGTA\nTGCGTC\nTGCGTG\nTGCGTT\nTGCTAA\nTGCTAC\nTGCTAG\nTGCTAT\nTGCTCA\nTGCTCC\nTGCTCG\nTGCTCT\nTGCTGA\nTGCTGC\nTGCTGG\nTGCTGT\nTGCTTA\nTGCTTC\nTGCTTG\nTGCTTT\nTGGAAA\nTGGAAC\nTGGAAG\nTGGAAT\nTGGACA\nTGGACC\nTGGACG\nTGGACT\nTGGAGA\nTGGAGC\nTGGAGG\nTGGAGT\nTGGATA\nTGGATC\nTGGATG\nTGGATT\nTGGCAA\nTGGCAC\nTGGCAG\nTGGCAT\nTGGCCA\nTGGCCC\nTGGCCG\nTGGCCT\nTGGCGA\nTGGCGC\nTGGCGG\nTGGCGT\nTGGCTA\nTGGCTC\nTGGCTG\nTGGCTT\nTGGGAA\nTG
GGAC\nTGGGAG\nTGGGAT\nTGGGCA\nTGGGCC\nTGGGCG\nTGGGCT\nTGGGGA\nTGGGGC\nTGGGGG\nTGGGGT\nTGGGTA\nTGGGTC\nTGGGTG\nTGGGTT\nTGGTAA\nTGGTAC\nTGGTAG\nTGGTAT\nTGGTCA\nTGGTCC\nTGGTCG\nTGGTCT\nTGGTGA\nTGGTGC\nTGGTGG\nTGGTGT\nTGGTTA\nTGGTTC\nTGGTTG\nTGGTTT\nTGTAAA\nTGTAAC\nTGTAAG\nTGTAAT\nTGTACA\nTGTACC\nTGTACG\nTGTACT\nTGTAGA\nTGTAGC\nTGTAGG\nTGTAGT\nTGTATA\nTGTATC\nTGTATG\nTGTATT\nTGTCAA\nTGTCAC\nTGTCAG\nTGTCAT\nTGTCCA\nTGTCCC\nTGTCCG\nTGTCCT\nTGTCGA\nTGTCGC\nTGTCGG\nTGTCGT\nTGTCTA\nTGTCTC\nTGTCTG\nTGTCTT\nTGTGAA\nTGTGAC\nTGTGAG\nTGTGAT\nTGTGCA\nTGTGCC\nTGTGCG\nTGTGCT\nTGTGGA\nTGTGGC\nTGTGGG\nTGTGGT\nTGTGTA\nTGTGTC\nTGTGTG\nTGTGTT\nTGTTAA\nTGTTAC\nTGTTAG\nTGTTAT\nTGTTCA\nTGTTCC\nTGTTCG\nTGTTCT\nTGTTGA\nTGTTGC\nTGTTGG\nTGTTGT\nTGTTTA\nTGTTTC\nTGTTTG\nTGTTTT\nTTAAAA\nTTAAAC\nTTAAAG\nTTAAAT\nTTAACA\nTTAACC\nTTAACG\nTTAACT\nTTAAGA\nTTAAGC\nTTAAGG\nTTAAGT\nTTAATA\nTTAATC\nTTAATG\nTTAATT\nTTACAA\nTTACAC\nTTACAG\nTTACAT\nTTACCA\nTTACCC\nTTACCG\nTTACCT\nTTACGA\nTTACGC\nTTACGG\nTTACGT\nTTACTA\nTTACTC\nTTACTG\nTTACTT\nTTAGAA\nTTAGAC\nTTAGAG\nTTAGAT\nTTAGCA\nTTAGCC\nTTAGCG\nTTAGCT\nTTAGGA\nTTAGGC\nTTAGGG\nTTAGGT\nTTAGTA\nTTAGTC\nTTAGTG\nTTAGTT\nTTATAA\nTTATAC\nTTATAG\nTTATAT\nTTATCA\nTTATCC\nTTATCG\nTTATCT\nTTATGA\nTTATGC\nTTATGG\nTTATGT\nTTATTA\nTTATTC\nTTATTG\nTTATTT\nTTCAAA\nTTCAAC\nTTCAAG\nTTCAAT\nTTCACA\nTTCACC\nTTCACG\nTTCACT\nTTCAGA\nTTCAGC\nTTCAGG\nTTCAGT\nTTCATA\nTTCATC\nTTCATG\nTTCATT\nTTCCAA\nTTCCAC\nTTCCAG\nTTCCAT\nTTCCCA\nTTCCCC\nTTCCCG\nTTCCCT\nTTCCGA\nTTCCGC\nTTCCGG\nTTCCGT\nTTCCTA\nTTCCTC\nTTCCTG\nTTCCTT\nTTCGAA\nTTCGAC\nTTCGAG\nTTCGAT\nTTCGCA\nTTCGCC\nTTCGCG\nTTCGCT\nTTCGGA\nTTCGGC\nTTCGGG\nTTCGGT\nTTCGTA\nTTCGTC\nTTCGTG\nTTCGTT\nTTCTAA\nTTCTAC\nTTCTAG\nTTCTAT\nTTCTCA\nTTCTCC\nTTCTCG\nTTCTCT\nTTCTGA\nTTCTGC\nTTCTGG\nTTCTGT\nTTCTTA\nTTCTTC\nTTCTTG\nTTCTTT\nTTGAAA\nTTGAAC\nTTGAAG\nTTGAAT\nTTGACA\nTTGACC\nTTGACG\nTTGACT\nTTGAGA\nTTGAGC\nTTGAGG\nTTGAGT\nTTGATA\nTTGATC\nTTGATG\nTTGATT\nTTGCAA\nTTGCAC\nTTGCAG\nTTGCAT\nTTGCCA\nTTGCCC\nTTGCCG\nTTGCCT\nTTGCGA\nTTGCGC\nTTGCGG\nTT
GCGT\nTTGCTA\nTTGCTC\nTTGCTG\nTTGCTT\nTTGGAA\nTTGGAC\nTTGGAG\nTTGGAT\nTTGGCA\nTTGGCC\nTTGGCG\nTTGGCT\nTTGGGA\nTTGGGC\nTTGGGG\nTTGGGT\nTTGGTA\nTTGGTC\nTTGGTG\nTTGGTT\nTTGTAA\nTTGTAC\nTTGTAG\nTTGTAT\nTTGTCA\nTTGTCC\nTTGTCG\nTTGTCT\nTTGTGA\nTTGTGC\nTTGTGG\nTTGTGT\nTTGTTA\nTTGTTC\nTTGTTG\nTTGTTT\nTTTAAA\nTTTAAC\nTTTAAG\nTTTAAT\nTTTACA\nTTTACC\nTTTACG\nTTTACT\nTTTAGA\nTTTAGC\nTTTAGG\nTTTAGT\nTTTATA\nTTTATC\nTTTATG\nTTTATT\nTTTCAA\nTTTCAC\nTTTCAG\nTTTCAT\nTTTCCA\nTTTCCC\nTTTCCG\nTTTCCT\nTTTCGA\nTTTCGC\nTTTCGG\nTTTCGT\nTTTCTA\nTTTCTC\nTTTCTG\nTTTCTT\nTTTGAA\nTTTGAC\nTTTGAG\nTTTGAT\nTTTGCA\nTTTGCC\nTTTGCG\nTTTGCT\nTTTGGA\nTTTGGC\nTTTGGG\nTTTGGT\nTTTGTA\nTTTGTC\nTTTGTG\nTTTGTT\nTTTTAA\nTTTTAC\nTTTTAG\nTTTTAT\nTTTTCA\nTTTTCC\nTTTTCG\nTTTTCT\nTTTTGA\nTTTTGC\nTTTTGG\nTTTTGT\nTTTTTA\nTTTTTC\nTTTTTG\nTTTTTT\nNNNNNN\n