from torch import Tensor from transformers import AutoTokenizer, AutoModel from ctranslate2 import Translator from typing import Union from fastapi import FastAPI from pydantic import BaseModel def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: last_hidden = last_hidden_states.masked_fill( ~attention_mask[..., None].bool(), 0.0) return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] # text-ada replacement embeddingTokenizer = AutoTokenizer.from_pretrained( './multilingual-e5-base') embeddingModel = AutoModel.from_pretrained('./multilingual-e5-base') # chatGpt replacement inferenceTokenizer = AutoTokenizer.from_pretrained( "./ct2fast-flan-alpaca-xl") inferenceTranslator = Translator( "./ct2fast-flan-alpaca-xl", compute_type="int8", device="cpu") class EmbeddingRequest(BaseModel): input: Union[str, None] = None class TokensCountRequest(BaseModel): input: Union[str, None] = None class InferenceRequest(BaseModel): input: Union[str, None] = None max_length: Union[int, None] = 0 app = FastAPI() @app.get("/") async def root(): return {"message": "Hello World"} @app.post("/text-embedding") async def text_embedding(request: EmbeddingRequest): input = request.input # Process the input data batch_dict = embeddingTokenizer([input], max_length=512, padding=True, truncation=True, return_tensors='pt') outputs = embeddingModel(**batch_dict) embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask']) # create response return { 'embedding': embeddings[0].tolist() } @app.post('/inference') async def inference(request: InferenceRequest): input_text = request.input max_length = 256 try: max_length = int(request.max_length) max_length = min(1024, max_length) except: pass # process request input_tokens = inferenceTokenizer.convert_ids_to_tokens( inferenceTokenizer.encode(input_text)) results = inferenceTranslator.translate_batch( [input_tokens], beam_size=1, max_input_length=0, max_decoding_length=max_length, num_hypotheses=1, repetition_penalty=1.3, sampling_topk=40, sampling_temperature=0.7, use_vmap=False) output_tokens = results[0].hypotheses[0] output_text = inferenceTokenizer.decode( inferenceTokenizer.convert_tokens_to_ids(output_tokens)) # create response return { 'generated_text': output_text } @app.post('/tokens-count') async def tokens_count(request: TokensCountRequest): input_text = request.input tokens = inferenceTokenizer.convert_ids_to_tokens( inferenceTokenizer.encode(input_text)) # create response return { 'tokens': tokens, 'total': len(tokens) }