austindavis committed
Commit 8e4af20
1 Parent(s): 0a4d537

Create agents/uci_tokenizers.py

Files changed (1):
  agents/uci_tokenizers.py (+314, -0)

agents/uci_tokenizers.py ADDED
from typing import List

import chess
import tiktoken
import tokenizers
from tokenizers import models, pre_tokenizers, processors
from torch import Tensor as TT
from transformers import PreTrainedTokenizerFast
from transformers.tokenization_utils_fast import BatchEncoding


def getTiktokenizer() -> tiktoken.Encoding:
    """
    Defines a tiktoken-based BPE encoder for UCI chess moves. This
    tokenizer effectively tokenizes UCI moves by square name.
    One notable variation is that promotion pieces must be upper-case.

    Vocabulary:
        Special Tokens (4): "<|pad|>", "<|startoftext|>", "<|endoftext|>", "<|unknown|>"
        Square Tokens (64): a1 through h8
        Promote Tokens (4): Q, B, R, N
        UNUSED (8192 - 4 - 64 - 4 = 8120): filler tokens of the form <|unused####|>
    """
    special_tokens = ["<|pad|>", "<|startoftext|>", "<|endoftext|>", "<|unknown|>"]
    unused_tokens = [f"<|unused{i:04d}|>" for i in range(8120)]
    chess_vocab = special_tokens + chess.SQUARE_NAMES + list("QBRN") + unused_tokens
    mergeable_ranks = {k.encode(): v for (v, k) in enumerate(chess_vocab)}
    chess_pat_str = r"[a-h][1-8]|[QBRN]"

    enc = tiktoken.Encoding(
        name="chess_enc",
        pat_str=chess_pat_str,  # alternatively: r"\d|\s"
        mergeable_ranks=mergeable_ranks,
        special_tokens={k: v for (v, k) in enumerate(special_tokens)},
    )

    return enc

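# A minimal usage sketch (editor's illustration, not part of the committed
# file). In the vocab order above, square tokens take IDs 4-67, so "e2" maps
# to 16 and "e4" to 32:
#
#   >>> enc = getTiktokenizer()
#   >>> enc.encode("e2e4")
#   [16, 32]
#   >>> enc.decode([16, 32])
#   'e2e4'
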
class UciTokenizer(PreTrainedTokenizerFast):
    _PAD_TOKEN: str
    _UNK_TOKEN: str
    _EOS_TOKEN: str
    _BOS_TOKEN: str

    stoi: dict[str, int]
    """String-to-integer mapping. This is the vocab."""

    itos: dict[int, str]
    """Integer-to-string mapping."""

    def __init__(
        self,
        stoi,
        itos,
        pad_token,
        unk_token,
        bos_token,
        eos_token,
        name_or_path,
        **kwargs,
    ):
        self.stoi = stoi
        self.itos = itos

        self._PAD_TOKEN = pad_token
        self._UNK_TOKEN = unk_token
        self._EOS_TOKEN = eos_token
        self._BOS_TOKEN = bos_token

        # Define the model
        tok_model = models.WordLevel(vocab=self.stoi, unk_token=self._UNK_TOKEN)

        slow_tokenizer = tokenizers.Tokenizer(tok_model)
        slow_tokenizer.pre_tokenizer = self._init_pretokenizer()

        # Post-processing adds special tokens unless explicitly ignored
        post_proc = processors.TemplateProcessing(
            single=f"{bos_token} $0",
            pair=None,
            special_tokens=[(bos_token, 1)],
        )
        slow_tokenizer.post_processor = post_proc

        super().__init__(
            tokenizer_object=slow_tokenizer,
            unk_token=self._UNK_TOKEN,
            bos_token=self._BOS_TOKEN,
            eos_token=self._EOS_TOKEN,
            pad_token=self._PAD_TOKEN,
            name_or_path=name_or_path,
            **kwargs,
        )

        # Override the decode behavior to ensure spaces are correctly handled.
        # Defined as a closure so it captures `self` from __init__ rather than
        # taking it as a parameter.
        def _decode(
            token_ids: int | List[int] | dict | TT,
            skip_special_tokens=False,
            clean_up_tokenization_spaces=False,
        ) -> str:
            if isinstance(token_ids, int):
                return self.itos.get(token_ids, self._UNK_TOKEN)

            if isinstance(token_ids, dict):
                token_ids = token_ids["input_ids"]

            if isinstance(token_ids, TT):
                token_ids = token_ids.tolist()

            if isinstance(token_ids, list):
                tokens_str = [self.itos.get(xi, self._UNK_TOKEN) for xi in token_ids]
                processed_tokens = self._process_str_tokens(tokens_str)

                return " ".join(processed_tokens)

            raise ValueError(
                f"Unknown input type to decode() for argument 'token_ids'. Received: {type(token_ids)}"
            )

        self._decode = _decode

    def _init_pretokenizer(self) -> pre_tokenizers.PreTokenizer:
        raise NotImplementedError

    def _process_str_tokens(self, tokens_str: list[str]) -> list[str]:
        raise NotImplementedError

    def get_id2square_list(self) -> list[int]:
        raise NotImplementedError

class UciTileTokenizer(UciTokenizer):
    """UCI tokenizer which converts start tiles, end tiles, and promotion pieces each into individual tokens."""

    SPECIAL_TOKENS = ["<|pad|>", "<|startoftext|>", "<|endoftext|>", "<|unknown|>"]

    stoi = {
        tok: idx
        for idx, tok in enumerate(SPECIAL_TOKENS + chess.SQUARE_NAMES + list("QRBN"))
    }

    itos = {
        idx: tok
        for idx, tok in enumerate(SPECIAL_TOKENS + chess.SQUARE_NAMES + list("QRBN"))
    }

    id2square: List[int] = list(range(4, 68))
    """
    List mapping token IDs to squares on the chess board. Order is file then
    rank, i.e.: a1, b1, c1, ..., f8, g8, h8
    """

    def get_id2square_list(self) -> List[int]:
        return self.id2square

    def __init__(self, **kwargs):
        super().__init__(
            self.stoi,
            self.itos,
            pad_token="<|pad|>",
            unk_token="<|unknown|>",
            bos_token="<|startoftext|>",
            eos_token="<|endoftext|>",
            name_or_path="austindavis/uci_tile_tokenizer",
            clean_up_tokenization_spaces=False,
            **kwargs,
        )

    def _init_pretokenizer(self):
        # Pre-tokenizer to split input into UCI moves: first on whitespace,
        # then after every digit or promotion letter.
        pattern = tokenizers.Regex(r"\d|[QBRN]")
        pre_tokenizer = pre_tokenizers.Sequence(
            [
                pre_tokenizers.Whitespace(),
                pre_tokenizers.Split(pattern=pattern, behavior="merged_with_previous"),
            ]
        )
        return pre_tokenizer

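    # Editor's sketch (not part of the commit): the pre-tokenizer above cuts
    # each move after every digit or promotion letter, so every square or
    # promotion piece becomes its own word for the WordLevel model. Roughly:
    #
    #   >>> UciTileTokenizer()._init_pretokenizer().pre_tokenize_str("e2e4 e7e8Q")
    #   [('e2', (0, 2)), ('e4', (2, 4)), ('e7', (5, 7)), ('e8', (7, 9)), ('Q', (9, 10))]
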
    def _process_str_tokens(self, token_str: list[str]):
        moves = []
        next_move = ""
        for token in token_str:
            # Skip special tokens
            if token in self.all_special_tokens:
                continue

            # Handle promotions
            if len(token) == 1:
                next_move += token
                continue

            # Handle square tokens if there's room in the current move
            if len(next_move) < 4:
                next_move += token
                continue

            moves.append(next_move)
            next_move = token

        moves.append(next_move)
        return moves

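    # Editor's sketch (not part of the commit): `_process_str_tokens` is the
    # inverse of the pre-tokenizer, stitching square and promotion tokens back
    # into UCI moves, e.g.
    #
    #   ["e2", "e4", "e7", "e8", "Q"]  ->  ["e2e4", "e7e8Q"]
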
    @staticmethod
    def compute_players(encoding: BatchEncoding, according_to="output"):
        """
        Determines which player (white=True, black=False) is associated with each
        token in the sequence. This method works based on chess move sequences
        tokenized using the UciTileTokenizer.

        Parameters
        ----------
        encoding : BatchEncoding
            Tokenized input of a chess game, where each token represents a move or
            special token.
        according_to : str, optional (default='output')
            Specifies the perspective for associating players:
            - 'output': Returns the player whose next move is predicted by the
              sequence (the output move).
            - Otherwise: Returns the player associated with the input tokens
              (i.e., which player made each move).

        Returns
        -------
        List[bool]
            A list of boolean values indicating the player for each token:
            - True for white (player 1),
            - False for black (player 2).

            The list length corresponds to the number of tokens in the sequence,
            including special tokens if any.

        Example
        -------
        >>> tok = UciTileTokenizer()
        >>> encoding = tok('e2e4 d7d5 e4d5 e7e6 d5e6 d8g5 e6e7 g5f6 e7f8Q')
        >>> print(encoding['input_ids'])
        [1, 16, 32, 55, 39, 32, 39, 56, 48, 39, 48, 63, 42, 48, 56, 42, 49, 56, 65, 68]
        >>> tok.compute_players(encoding)
        [True, True, False, False, True, True, False, False, True, True, False, False, True, True, False, False, True, True, True, False]
        >>> tok.compute_players(encoding, according_to='input')
        [True, True, True, False, False, True, True, False, False, True, True, False, False, True, True, False, False, True, True, True]

        Notes
        -----
        This method does not rely on board position calculations. Therefore, when
        using `according_to='output'`, it cannot reliably predict which player is
        responsible for selecting the final token of the sequence. For instance,
        if a pawn is moved to the back rank (e.g., 'e7e8'), then white must select
        the promotion piece on the next token; however, this algorithm will predict
        that black is responsible for selecting the next token instead of white.
        """
        input_ids = encoding["input_ids"]

        # A single (unbatched) encoding holds a flat list of ints; a batch holds
        # one list per sequence. Forward `according_to` in either case.
        if len(input_ids) > 0 and isinstance(input_ids[0], int):
            return UciTileTokenizer._compute_players_single(input_ids, according_to)

        return [
            UciTileTokenizer._compute_players_single(ids, according_to)
            for ids in input_ids
        ]

    @staticmethod
    def _compute_players_single(input_ids: list[int], according_to: str = "output"):
        players = [] if according_to == "output" else [True]
        current_player = False
        num_tokens_in_ply = 0
        has_specials = False

        for token_id in input_ids:
            # Skip <|startoftext|> (ID 1)
            if token_id == 1:
                has_specials = True
                continue

            if num_tokens_in_ply == 0:
                # Check if promotion OR unknown token ID
                if token_id > 67 or token_id == 3:
                    players.append(current_player)
                    num_tokens_in_ply = 0
                else:
                    num_tokens_in_ply += 1
                    current_player = not current_player
                    players.append(current_player)
            elif num_tokens_in_ply == 1:
                num_tokens_in_ply = 0
                players.append(current_player)
            else:
                raise ValueError("Illegal move sequence")

        if according_to == "output":
            # Anticipate what the output should be based on the final input token;
            # see the Notes section of compute_players for the promotion caveat.
            if num_tokens_in_ply == 0:
                if token_id > 67:
                    players.append(not current_player)
                else:
                    players.append(current_player)
            else:
                players.append(current_player)

        return players if has_specials else players[1:]

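# Editor's worked trace (not part of the commit): for the unbatched IDs of
# "e2e4 e7e8Q" without specials, i.e. [16, 32, 56, 64, 68], the ply counter
# attributes [e2, e4] to white and [e7, e8, Q] to black, so
# according_to='input' yields [True, True, False, False, False].
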
if __name__ == "__main__":
    tok = UciTileTokenizer()
    encoding = tok("e2e4Q b7b8N e2e7 a1", add_special_tokens=True)
    print(f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='output')=}")
    print(f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='input')=}")

    encoding = tok("e2e4Q b7b8N e2e7 a1", add_special_tokens=False)
    print(f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='output')=}")
    print(f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='input')=}")

    encoding = tok("e2e4 d7d5 e4d5 e7e6 d5e6 d8g5 e6e7 g5f6 e7f8Q")
    print(encoding["input_ids"])
    print(tok.compute_players(encoding))
    print(tok.compute_players(encoding, according_to="input"))
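
# Editor's sketch (not part of the commit): because the `_decode` override joins
# moves with single spaces and `_process_str_tokens` drops special tokens, a
# round trip through the tokenizer should recover the original move string:
#
#   >>> tok = UciTileTokenizer()
#   >>> tok.decode(tok("e2e4 e7e8Q")["input_ids"])
#   'e2e4 e7e8Q'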