enoreyes committed
Commit a3c3fcf
1 Parent(s): 17305b3

Update ingest_examples.py

Files changed (1)
  1. ingest_examples.py +39 -199
ingest_examples.py CHANGED
@@ -1,219 +1,59 @@
-"""Ingest examples into Weaviate."""
+"""Ingest examples into FAISS."""
 import os
 from pathlib import Path
-
-import weaviate
-
-WEAVIATE_URL = os.environ["WEAVIATE_URL"]
-client = weaviate.Client(
-    url=WEAVIATE_URL,
-    additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
-)
-
-client.schema.delete_class("Rephrase")
-client.schema.delete_class("QA")
-client.schema.get()
-schema = {
-    "classes": [
-        {
-            "class": "Rephrase",
-            "description": "Rephrase Examples",
-            "vectorizer": "text2vec-openai",
-            "moduleConfig": {
-                "text2vec-openai": {
-                    "model": "ada",
-                    "modelVersion": "002",
-                    "type": "text",
-                }
-            },
-            "properties": [
-                {
-                    "dataType": ["text"],
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": False,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "content",
-                },
-                {
-                    "dataType": ["text"],
-                    "description": "The link",
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": True,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "question",
-                },
-                {
-                    "dataType": ["text"],
-                    "description": "The link",
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": True,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "answer",
-                },
-                {
-                    "dataType": ["text"],
-                    "description": "The link",
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": True,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "chat_history",
-                },
-            ],
-        },
-    ]
-}
-
-client.schema.create(schema)
-
-documents = [
+import pickle
+from langchain.vectorstores import FAISS
+from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.prompts.example_selector import \
+    SemanticSimilarityExampleSelector
+
+rephrase_documents = [
     {
         "question": "how do i load those?",
-        "chat_history": "Human: What types of memory exist?\nAssistant: \n\nThere are a few different types of memory: Buffer, Summary, and Conversational Memory.",
-        "answer": "How do I load Buffer, Summary, and Conversational Memory",
+        "chat_history": "Human: What types of tasks can I do with Pipelines?\nAssistant: \n\nThere are a few different types of tasks pipelines can do. Some examples: Text classification, Text generation, named entity recognition, question answering, summarization, translation, image classification, image segmentation, object detection, audio classification, and visual question answering.",
+        "answer": "How do I load a pipeline for a specific task",
     },
     {
         "question": "how do i install this package?",
         "chat_history": "",
-        "answer": "How do I install langchain?",
+        "answer": "How do I install transformers?",
     },
     {
-        "question": "how do I set serpapi_api_key?",
-        "chat_history": "Human: can you write me a code snippet for that?\nAssistant: \n\nYes, you can create an Agent with a custom LLMChain in LangChain. Here is a [link](https://langchain.readthedocs.io/en/latest/modules/agents/examples/custom_agent.html) to the documentation that provides a code snippet for creating a custom Agent.",
-        "answer": "How do I set the serpapi_api_key?",
+        "question": "where do i find the models?",
+        "chat_history": "Human: can you write me a code snippet for that?\nAssistant: \n\nYes, you can load a pretrained model with the from_pretrained() method. Here is a [link](https://huggingface.co/docs/transformers/autoclass_tutorial) to the documentation that provides a code snippet for loading a pretrained model with AutoClass.",
+        "answer": "Where do I find the models that can be loaded into an autoclass?",
     },
     {
-        "question": "What are some methods for data augmented generation?",
-        "chat_history": "Human: List all methods of an Agent class please\nAssistant: \n\nTo answer your question, you can find a list of all the methods of the Agent class in the [API reference documentation](https://langchain.readthedocs.io/en/latest/modules/agents/reference.html).",
-        "answer": "What are some methods for data augmented generation?",
+        "question": "how do I finetune a pre-trained model?",
+        "chat_history": "Human: List all methods of a pipeline please\nAssistant: \n\nTo answer your question, you can find a list of all the methods of the Pipeline class in the [API reference documentation](https://huggingface.co/docs/transformers/main_classes/pipelines).",
+        "answer": "What are some methods for finetuning a pre-trained model?",
     },
     {
         "question": "can you write me a code snippet for that?",
-        "chat_history": "Human: how do I create an agent with custom LLMChain?\nAssistant: \n\nTo create an Agent with a custom LLMChain in LangChain, you can use the [Custom Agent example](https://langchain.readthedocs.io/en/latest/modules/agents/examples/custom_agent.html). This example shows how to create a custom LLMChain and use an existing Agent class to parse the output. For more information on Agents and Tools, check out the [Key Concepts](https://langchain.readthedocs.io/en/latest/modules/agents/key_concepts.html) documentation.",
-        "answer": "Can you provide a code snippet for creating an Agent with a custom LLMChain?",
+        "chat_history": "Human: how do I train on multiple gpus?\nAssistant: \n\nTo perform distributed training, you can use the [Accelerate](https://huggingface.co/docs/transformers/accelerate) library. This example shows how to perform distributed training on multiple GPUs with accelerate. For more information on distributed training, check out the [Full Accelerate Documentation](https://huggingface.co/docs/accelerate/).",
+        "answer": "Can you provide a code snippet for training on multiple GPUs with accelerate?",
     },
+    {
+        "question": "show me how to do it with trainer",
+        "chat_history": "Human: How do I finetune a pre-trained model?\nAssistant: \n\nYou can fine-tune a pretrained model with 🤗 Transformers Trainer, in TensorFlow with Keras, and in native PyTorch. For more information on how to do this, visit our [training tutorial](https://huggingface.co/docs/transformers/training)",
+        "answer": "How do I finetune a pre-trained model with Transformers trainer?",
+    }
 ]
-from langchain.prompts.example_selector.semantic_similarity import \
-    sorted_values
-
-for d in documents:
-    d["content"] = " ".join(sorted_values(d))
-with client.batch as batch:
-    for text in documents:
-        batch.add_data_object(
-            text,
-            "Rephrase",
-        )
 
-client.schema.get()
-schema = {
-    "classes": [
-        {
-            "class": "QA",
-            "description": "Rephrase Examples",
-            "vectorizer": "text2vec-openai",
-            "moduleConfig": {
-                "text2vec-openai": {
-                    "model": "ada",
-                    "modelVersion": "002",
-                    "type": "text",
-                }
-            },
-            "properties": [
-                {
-                    "dataType": ["text"],
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": False,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "content",
-                },
-                {
-                    "dataType": ["text"],
-                    "description": "The link",
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": True,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "question",
-                },
-                {
-                    "dataType": ["text"],
-                    "description": "The link",
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": True,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "answer",
-                },
-                {
-                    "dataType": ["text"],
-                    "description": "The link",
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": True,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "summaries",
-                },
-                {
-                    "dataType": ["text"],
-                    "description": "The link",
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": True,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "sources",
-                },
-            ],
-        },
-    ]
-}
-
-client.schema.create(schema)
+example_selector = SemanticSimilarityExampleSelector.from_examples(
+    # This is the list of examples available to select from.
+    rephrase_documents,
+    # This is the embedding class used to produce embeddings which are used to measure semantic similarity.
+    HuggingFaceEmbeddings(),
+    # This is the VectorStore class that is used to store the embeddings and do a similarity search over.
+    FAISS,
+    # This is the number of examples to produce.
+    k=4
+)
 
-documents = [
-    {
-        "question": "how do i install langchain?",
-        "answer": "```pip install langchain```",
-        "summaries": ">Example:\nContent:\n---------\nYou can pip install langchain package by running 'pip install langchain'\n----------\nSource: foo.html",
-        "sources": "foo.html",
-    },
-    {
-        "question": "how do i import an openai LLM?",
-        "answer": "```from langchain.llm import OpenAI```",
-        "summaries": ">Example:\nContent:\n---------\nyou can import the open ai wrapper (OpenAI) from the langchain.llm module\n----------\nSource: bar.html",
-        "sources": "bar.html",
-    },
-]
-from langchain.prompts.example_selector.semantic_similarity import \
-    sorted_values
+print("beginning pickle")
+with open("rephrase_eg.pkl", 'wb') as f:
+    pickle.dump(example_selector, f)
 
-for d in documents:
-    d["content"] = " ".join(sorted_values(d))
-with client.batch as batch:
-    for text in documents:
-        batch.add_data_object(
-            text,
-            "QA",
-        )
+print("Rephrase pickle complete")