import logging
from abc import abstractmethod
from typing import Any, List, Optional

import nltk
from nltk import word_tokenize
from pydantic import BaseModel

logger = logging.getLogger(__name__)


class BaseTextTokenizer(BaseModel):
    """Abstract interface for text tokenizers."""

    @abstractmethod
    def tokenize_text(self, text: str) -> List[str]:
        """Split ``text`` into a list of tokens."""
        ...
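

# Illustrative sketch (not part of the original module): a second, trivial
# implementation showing how the BaseTextTokenizer contract is satisfied.
# The class name WhitespaceTextTokenizer is hypothetical.
class WhitespaceTextTokenizer(BaseTextTokenizer):
    """Naive tokenizer that splits on runs of whitespace."""

    def tokenize_text(self, text: str) -> List[str]:
        return text.split()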


class NLTKTextTokenizer(BaseTextTokenizer):
    """Tokenizer that delegates to NLTK's ``word_tokenize``."""

    tokenizer_name: Optional[str] = "punkt"

    def __init__(self, **data: Any):
        super().__init__(**data)
        # Fetch the tokenizer data on first use if it is not installed locally.
        try:
            nltk.data.find(f"tokenizers/{self.tokenizer_name}")
        except LookupError:
            nltk.download(self.tokenizer_name)

    def tokenize_text(self, text: str) -> List[str]:
        return word_tokenize(text)
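

# A minimal usage sketch (illustrative only): instantiate the NLTK-backed
# tokenizer and split a sample sentence. Assumes network access for the
# one-time "punkt" download; the sample text below is hypothetical.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    tokenizer = NLTKTextTokenizer()
    tokens = tokenizer.tokenize_text("Pydantic models validate data; NLTK splits it.")
    # word_tokenize separates punctuation into its own tokens, e.g.
    # ['Pydantic', 'models', 'validate', 'data', ';', 'NLTK', 'splits', 'it', '.']
    logger.info("tokens: %s", tokens)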