File size: 4,822 Bytes
1ce5e18 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
from typing import Dict
from .base import GenericTensor, Pipeline
# Can't use @add_end_docstrings(PIPELINE_INIT_ARGS) here because this one does not accept `binary_output`
class FeatureExtractionPipeline(Pipeline):
"""
Feature extraction pipeline using no model head. This pipeline extracts the hidden states from the base
transformer, which can be used as features in downstream tasks.
Example:
```python
>>> from transformers import pipeline
>>> extractor = pipeline(model="bert-base-uncased", task="feature-extraction")
>>> result = extractor("This is a simple test.", return_tensors=True)
>>> result.shape # This is a tensor of shape [1, sequence_lenth, hidden_dimension] representing the input string.
torch.Size([1, 8, 768])
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
This feature extraction pipeline can currently be loaded from [`pipeline`] using the task identifier:
`"feature-extraction"`.
All models may be used for this pipeline. See a list of all models, including community-contributed models on
[huggingface.co/models](https://huggingface.co/models).
Arguments:
model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
[`PreTrainedModel`] for PyTorch and [`TFPreTrainedModel`] for TensorFlow.
tokenizer ([`PreTrainedTokenizer`]):
The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
[`PreTrainedTokenizer`].
modelcard (`str` or [`ModelCard`], *optional*):
Model card attributed to the model for this pipeline.
framework (`str`, *optional*):
The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be
installed.
If no framework is specified, will default to the one currently installed. If no framework is specified and
both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model is
provided.
return_tensors (`bool`, *optional*):
If `True`, returns a tensor according to the specified framework, otherwise returns a list.
task (`str`, defaults to `""`):
A task-identifier for the pipeline.
args_parser ([`~pipelines.ArgumentHandler`], *optional*):
Reference to the object in charge of parsing supplied pipeline parameters.
device (`int`, *optional*, defaults to -1):
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on
the associated CUDA device id.
tokenize_kwargs (`dict`, *optional*):
Additional dictionary of keyword arguments passed along to the tokenizer.
"""
def _sanitize_parameters(self, truncation=None, tokenize_kwargs=None, return_tensors=None, **kwargs):
if tokenize_kwargs is None:
tokenize_kwargs = {}
if truncation is not None:
if "truncation" in tokenize_kwargs:
raise ValueError(
"truncation parameter defined twice (given as keyword argument as well as in tokenize_kwargs)"
)
tokenize_kwargs["truncation"] = truncation
preprocess_params = tokenize_kwargs
postprocess_params = {}
if return_tensors is not None:
postprocess_params["return_tensors"] = return_tensors
return preprocess_params, {}, postprocess_params
def preprocess(self, inputs, **tokenize_kwargs) -> Dict[str, GenericTensor]:
return_tensors = self.framework
model_inputs = self.tokenizer(inputs, return_tensors=return_tensors, **tokenize_kwargs)
return model_inputs
def _forward(self, model_inputs):
model_outputs = self.model(**model_inputs)
return model_outputs
def postprocess(self, model_outputs, return_tensors=False):
# [0] is the first available tensor, logits or last_hidden_state.
if return_tensors:
return model_outputs[0]
if self.framework == "pt":
return model_outputs[0].tolist()
elif self.framework == "tf":
return model_outputs[0].numpy().tolist()
def __call__(self, *args, **kwargs):
"""
Extract the features of the input(s).
Args:
args (`str` or `List[str]`): One or several texts (or one list of texts) to get the features of.
Return:
A nested list of `float`: The features computed by the model.
"""
return super().__call__(*args, **kwargs)
|