"""Ingest examples into FAISS."""
import pickle

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
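
# Few-shot examples for query rephrasing: each entry pairs an ambiguous
# follow-up "question" (plus the "chat_history" that gave it context) with a
# standalone "answer" that restates the question so it makes sense on its own.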
rephrase_documents = [
{
"question": "how do i load those?",
"chat_history": "Human: What types of tasks can I do with Pipelines?\nAssistant: \n\nThere are a few different types of tasks pipelines can do. Some examples: Text classification, Text generation, name entity recognition, question answering, summarization, translation, image classification, image segmentation, object detection, audio classification, and visual question answering.",
"answer": "How do I load a pipeline for a specific task",
},
{
"question": "how do i install this package?",
"chat_history": "",
"answer": "How do I install transformers?",
},
{
"question": "where do i find the models?",
"chat_history": "Human: can you write me a code snippet for that?\nAssistant: \n\nYes, you can load a pretained model with the from_pretrained() method. Here is a [link](https://huggingface.co/docs/transformers/autoclass_tutorial) to the documentation that provides a code snippet for loading a pretrained model with AutoClass.",
"answer": "Where do I find the models that can be loaded into an autoclass?",
},
{
"question": "how do I finetune a pre-trained model?",
"chat_history": "Human: List all methods of a pipeline please\nAssistant: \n\nTo answer your question, you can find a list of all the methods of the Pipeline class in the [API reference documentation](https://huggingface.co/docs/transformers/main_classes/pipelines).",
"answer": "What are some methods for finetuning a pre-trained model?",
},
{
"question": "can you write me a code snippet for that?",
"chat_history": "Human: how do I do train on multiple gpus?\nAssistant: \n\nTo perform distributed training, you can use the [Accelerate](https://huggingface.co/docs/transformers/accelerate) library. This example shows how to perform distributed training on multiple GPUs with accelerate. For more information on distributed training, check out the [Full Accelerate Documentation](https://huggingface.co/docs/accelerate/).",
"answer": "Can you provide a code snippet for training on multiple GPUs with accelerate?",
},
{
"question": "show me how to do it with trainer",
"chat_history": "Human: How do I finetune a pre-trained model?\nAssistant: \n\nYou can fine-tune a pretrained model with 🤗 Transformers Trainer, in TensorFlow with Keras, and in native PyTorch. For more information on how to do this, visit our [training tutorial](https://huggingface.co/docs/transformers/training)",
"answer": "How do I finetune a pre-trained model with Transformers trainer?",
}
]
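
# Instructor embeddings accept natural-language instructions that steer the
# vectors toward a domain; here both the stored examples and incoming queries
# are anchored to Hugging Face documentation text.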
model_name = "hkunlp/instructor-large"
embed_instruction = "Represent the text from the Hugging Face code documentation"
query_instruction = "Query the most relevant text from the Hugging Face code documentation"

embedding = HuggingFaceInstructEmbeddings(
    model_name=model_name,
    embed_instruction=embed_instruction,
    query_instruction=query_instruction,
)
example_selector = SemanticSimilarityExampleSelector.from_examples(
# This is the list of examples available to select from.
rephrase_documents,
# This is the embedding class used to produce embeddings which are used to measure semantic similarity.
embedding,
# This is the VectorStore class that is used to store the embeddings and do a similarity search over.
FAISS,
# This is the number of examples to produce.
k=4
)
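
# Quick sanity check (illustrative; the sample question below is hypothetical,
# not part of the original script): retrieve the k stored examples most
# semantically similar to a new user question.
sample = example_selector.select_examples({"question": "how do i use a tokenizer?"})
print(f"Selected {len(sample)} similar examples for the sample question")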
print("beginning pickle")
with open("rephrase_eg.pkl", 'wb') as f:
pickle.dump(example_selector, f)
print("Rephrase pickle complete")