File size: 3,979 Bytes
a3c3fcf
9d38059
 
a3c3fcf
 
fa8c8ef
a3c3fcf
 
 
 
 
9d38059
 
a3c3fcf
 
9d38059
 
 
 
a3c3fcf
9d38059
 
a3c3fcf
 
 
9d38059
 
a3c3fcf
 
 
9d38059
 
 
a3c3fcf
 
9d38059
a3c3fcf
 
 
 
 
9d38059
 
fa8c8ef
 
 
 
 
a3c3fcf
 
 
 
fa8c8ef
a3c3fcf
 
 
 
 
9d38059
a3c3fcf
 
 
9d38059
a3c3fcf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""Ingest examples into FAISS."""
import os
from pathlib import Path
import pickle
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts.example_selector import \
    SemanticSimilarityExampleSelector

# Few-shot examples for rewriting a follow-up question: each row holds the
# user's latest question, the chat history that preceded it (may be empty),
# and the rewritten question stored under the "answer" key.
_REPHRASE_FIELDS = ("question", "chat_history", "answer")

# One tuple per example, with values in the same order as _REPHRASE_FIELDS.
_REPHRASE_ROWS = (
    (
        "how do i load those?",
        "Human: What types of tasks can I do with Pipelines?\nAssistant: \n\nThere are a few different types of tasks pipelines can do. Some examples: Text classification, Text generation, name entity recognition, question answering, summarization, translation, image classification, image segmentation, object detection, audio classification, and visual question answering.",
        "How do I load a pipeline for a specific task",
    ),
    (
        "how do i install this package?",
        "",
        "How do I install transformers?",
    ),
    (
        "where do i find the models?",
        "Human: can you write me a code snippet for that?\nAssistant: \n\nYes, you can load a pretained model with the from_pretrained() method. Here is a [link](https://huggingface.co/docs/transformers/autoclass_tutorial) to the documentation that provides a code snippet for loading a pretrained model with AutoClass.",
        "Where do I find the models that can be loaded into an autoclass?",
    ),
    (
        "how do I finetune a pre-trained model?",
        "Human: List all methods of a pipeline please\nAssistant: \n\nTo answer your question, you can find a list of all the methods of the Pipeline class in the [API reference documentation](https://huggingface.co/docs/transformers/main_classes/pipelines).",
        "What are some methods for finetuning a pre-trained model?",
    ),
    (
        "can you write me a code snippet for that?",
        "Human: how do I do train on multiple gpus?\nAssistant: \n\nTo perform distributed training, you can use the [Accelerate](https://huggingface.co/docs/transformers/accelerate) library. This example shows how to perform distributed training on multiple GPUs with accelerate. For more information on distributed training, check out the [Full Accelerate Documentation](https://huggingface.co/docs/accelerate/).",
        "Can you provide a code snippet for training on multiple GPUs with accelerate?",
    ),
    (
        "show me how to do it with trainer",
        "Human: How do I finetune a pre-trained model?\nAssistant: \n\nYou can fine-tune a pretrained model with 🤗 Transformers Trainer, in TensorFlow with Keras, and in native PyTorch. For more information on how to do this, visit our [training tutorial](https://huggingface.co/docs/transformers/training)",
        "How do I finetune a pre-trained model with Transformers trainer?",
    ),
)

# Expand the rows into the list of plain dicts (key order: question,
# chat_history, answer).
rephrase_documents = [dict(zip(_REPHRASE_FIELDS, row)) for row in _REPHRASE_ROWS]

# Embedding model configuration: the checkpoint name plus the two instruction
# strings handed to the embedder (one for embedding text, one for queries,
# going by the keyword names they are passed under).
model_name = "hkunlp/instructor-large"
embed_instruction = "Represent the text from the Hugging Face code documentation"
query_instruction = "Query the most relevant text from the Hugging Face code documentation"

# Instantiate the embedder once; it is reused when building the selector below.
embedding = HuggingFaceInstructEmbeddings(
    model_name=model_name,
    embed_instruction=embed_instruction,
    query_instruction=query_instruction,
)

# Build the similarity-based example selector from the rephrase examples.
example_selector = SemanticSimilarityExampleSelector.from_examples(
    rephrase_documents,  # pool of examples available to select from
    embedding,  # produces the embeddings used to measure semantic similarity
    FAISS,  # vector store class that holds the embeddings and runs the search
    k=4,  # number of examples to produce
)

# Serialize the finished selector to rephrase_eg.pkl in the current working
# directory, printing a progress message before and after.
print("beginning pickle")
with open("rephrase_eg.pkl", mode="wb") as pickle_file:
    pickle.dump(example_selector, pickle_file)

print("Rephrase pickle complete")