Vadim212 committed on
Commit 0416ac9
1 Parent(s): b11ef22

Upload 8 files

Files changed (8)
  1. .gitattributes +35 -35
  2. app.py +40 -0
  3. app2.py +16 -0
  4. doc_faiss_search.py +49 -0
  5. doc_faiss_train.py +105 -0
  6. faiss_test.py +50 -0
  7. faiss_train.py +99 -0
  8. requirements.txt +5 -0
.gitattributes CHANGED
@@ -1,35 +1,35 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,40 @@
+ import csv
+ import xml.etree.ElementTree as ET
+ import glob, os
+
+ rootFolder = "c:/317"
+
+ # export the XML help topics in rootFolder to a single CSV file
+ file = open(rootFolder + "/result.csv", "w", encoding="utf-8")
+ file.write("prompt,text,rejected_text\n")
+
+ def parseXML(xmlFile):
+
+     # derive the prompt from the file name,
+     # e.g. "Using_WinRT_Viewer.Search_Panel.xml" -> "WinRT Viewer Search Panel"
+     prompt = xmlFile.replace("Using_", "").replace(".xml", "").replace(".", " ").replace("_", " ")
+     text = ""
+
+     try:
+         tree = ET.parse(rootFolder + "/" + xmlFile)
+         root = tree.getroot()
+
+         # concatenate all <text> elements; empty elements have item.text == None
+         for item in root.findall(".//text"):
+             text += item.text or ""
+
+         # keep only topics without the Cyrillic letter "а", i.e. skip Russian-language topics
+         if text.find("а") == -1:
+             #file.write("### prompt\n")
+             file.write(prompt + "," + text.replace(",", " ") + "\n")
+             #file.write("### text\n")
+             #file.write(text.replace(",", " "))
+
+     except Exception as error:
+         print(error)
+
+
+ os.chdir(rootFolder)
+ for xmlFile in glob.glob("*.xml"):
+     print(xmlFile)
+     parseXML(xmlFile)
+
+
+ ## parseXML('Using_WinRT_Viewer.Search_Panel.xml')
+
+ file.close()
app2.py ADDED
@@ -0,0 +1,16 @@
+ import torch
+
+ # quick CUDA smoke test: create a small tensor directly on the GPU
+ device = torch.device("cuda")
+
+ tenz = torch.tensor([1., 2.], device=device)
+ #tenz.toDevice(device)
+
+ print(torch.cuda.is_available())
+
+ from datasets import Dataset
+
+ # minimal datasets check: map() derives a new column "b" from "a"
+ dataset = Dataset.from_dict({"a": [0, 1, 2]})
+ dataset_with_duplicates = dataset.map(lambda batch: {"b": batch["a"] * 2})
+ print(dataset_with_duplicates.shape)
+ print(len(dataset_with_duplicates))
+ print(dataset_with_duplicates[:])
doc_faiss_search.py ADDED
@@ -0,0 +1,49 @@
+ from datasets import load_dataset, load_from_disk, Dataset
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ import pandas as pd
+
+ model_ckpt = "nomic-ai/nomic-embed-text-v1.5"
+
+ tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+ model = AutoModel.from_pretrained(model_ckpt, trust_remote_code=True)
+
+ # search runs on CPU; the embeddings and index were produced by doc_faiss_train.py
+ device = torch.device("cpu")
+ model.to(device)
+
+ def cls_pooling(model_output):
+     # use the [CLS] token embedding as the sentence representation
+     return model_output.last_hidden_state[:, 0]
+
+ def get_embeddings(text_list):
+     encoded_input = tokenizer(
+         text_list, padding=True, truncation=True, return_tensors="pt"
+     )
+     encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
+     model_output = model(**encoded_input)
+     return cls_pooling(model_output)
+
+
+ # load the embedded documents and the saved FAISS index
+ embeddings_dataset = Dataset.load_from_disk("dataset/embeddings")
+
+ embeddings_dataset.load_faiss_index("embeddings", "index/embeddings")
+
+ question = "Download license key"
+
+ question_embedding = get_embeddings([question]).cpu().detach().numpy()
+
+ scores, samples = embeddings_dataset.get_nearest_examples(
+     "embeddings", question_embedding, k=10
+ )
+
+ samples_df = pd.DataFrame.from_dict(samples)
+ samples_df["scores"] = scores
+ samples_df.sort_values("scores", ascending=True, inplace=True)
+
+ for _, row in samples_df.iterrows():
+     print(f"COMMENT: {row.text}")
+     print(f"SCORE: {row.scores}")
+     print(f"PROMPT: {row.prompt}")
+     print("=" * 50)
+     print()
doc_faiss_train.py ADDED
@@ -0,0 +1,105 @@
+ from datasets import load_dataset, load_from_disk, Dataset
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ import pandas as pd
+ import xml.etree.ElementTree as ET
+ import glob, os
+
+ rootFolder = "c:/317"
+ file = open(rootFolder + "/result.csv", "w", encoding="utf-8")
+
+ def parseXML(xmlFile):
+
+     # derive the prompt from the file name,
+     # e.g. "Using_WinRT_Viewer.Search_Panel.xml" -> "WinRT Viewer Search Panel"
+     prompt = xmlFile.replace("Using_", "").replace(".xml", "").replace(".", " ").replace("_", " ")
+     text = ""
+
+     try:
+         tree = ET.parse(rootFolder + "/" + xmlFile)
+         root = tree.getroot()
+
+         # concatenate all <text> elements; empty elements have item.text == None
+         for item in root.findall(".//text"):
+             text += (item.text or "") + " "
+
+         # truncate long topics to 500 characters
+         if len(text) > 500:
+             text = text[:500]
+
+         # keep only topics without the Cyrillic letter "а", i.e. skip Russian-language topics
+         if text.find("а") == -1:
+             file.write(text + "\n")
+             return {"text": text, "prompt": prompt}
+         else:
+             return None
+
+     except Exception as error:
+         print(error)
+
+
+ def generator():
+
+     # glob's root_dir parameter requires Python 3.10+
+     for xmlFile in glob.glob("*.xml", root_dir=rootFolder):
+         print(xmlFile)
+         data = parseXML(xmlFile)
+         if data is not None:
+             yield data
+
+
+ # build a Hugging Face dataset of {"text", "prompt"} records from the XML topics
+ ds = Dataset.from_generator(generator)
+
+ file.close()
+
+ ##########################################################
+
+
+ model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
+ #model_ckpt = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
+ # model_ckpt = "sentence-transformers/msmarco-bert-base-dot-v5"
+ model_ckpt = "nomic-ai/nomic-embed-text-v1.5"
+
+
+ tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+ model = AutoModel.from_pretrained(model_ckpt, trust_remote_code=True)
+
+ device = torch.device("cuda")
+ model.to(device)
+
+ def cls_pooling(model_output):
+     # use the [CLS] token embedding as the sentence representation
+     return model_output.last_hidden_state[:, 0]
+
+ def get_embeddings(text_list):
+     encoded_input = tokenizer(
+         text_list, padding=True, truncation=True, return_tensors="pt"
+     )
+     encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
+     model_output = model(**encoded_input)
+     return cls_pooling(model_output)
+
+ # embed every document, then persist the dataset and its FAISS index for doc_faiss_search.py
+ embeddings_dataset = ds.map(
+     lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
+ )
+
+ embeddings_dataset.save_to_disk("dataset/embeddings")
+
+ embeddings_dataset = Dataset.load_from_disk("dataset/embeddings")
+
+ embeddings_dataset.add_faiss_index(column="embeddings")
+
+ embeddings_dataset.save_faiss_index("embeddings", "index/embeddings")
+
+ # sanity-check query against the freshly built index
+ question = "Download license key"
+
+ question_embedding = get_embeddings([question]).cpu().detach().numpy()
+
+ scores, samples = embeddings_dataset.get_nearest_examples(
+     "embeddings", question_embedding, k=10
+ )
+
+ samples_df = pd.DataFrame.from_dict(samples)
+ samples_df["scores"] = scores
+ samples_df.sort_values("scores", ascending=True, inplace=True)
+
+ for _, row in samples_df.iterrows():
+     print(f"COMMENT: {row.text}")
+     print(f"SCORE: {row.scores}")
+     print(f"PROMPT: {row.prompt}")
+     print("=" * 50)
+     print()
faiss_test.py ADDED
@@ -0,0 +1,50 @@
+ from datasets import load_dataset, load_from_disk, Dataset
+ import os
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ import pandas as pd
+ import faiss
+
+ ########################
+
+ model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
+ tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+ model = AutoModel.from_pretrained(model_ckpt)
+
+ device = torch.device("cuda")
+ model.to(device)
+
+ def cls_pooling(model_output):
+     # use the [CLS] token embedding as the sentence representation
+     return model_output.last_hidden_state[:, 0]
+
+ def get_embeddings(text_list):
+     encoded_input = tokenizer(
+         text_list, padding=True, truncation=True, return_tensors="pt"
+     )
+     encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
+     model_output = model(**encoded_input)
+     return cls_pooling(model_output)
+
+ # load a previously saved embeddings dataset from disk and (re)build the FAISS index
+ embeddings_dataset = load_from_disk("dataset/embeddings")
+
+ embeddings_dataset.add_faiss_index(column="embeddings")
+
+ question = "How can I load a dataset offline?"
+ question_embedding = get_embeddings([question]).cpu().detach().numpy()
+
+ scores, samples = embeddings_dataset.get_nearest_examples(
+     "embeddings", question_embedding, k=5
+ )
+
+ samples_df = pd.DataFrame.from_dict(samples)
+ samples_df["scores"] = scores
+ samples_df.sort_values("scores", ascending=False, inplace=True)
+
+ for _, row in samples_df.iterrows():
+     print(f"COMMENT: {row.comments}")
+     print(f"SCORE: {row.scores}")
+     print(f"TITLE: {row.title}")
+     print(f"URL: {row.html_url}")
+     print("=" * 50)
+     print()
faiss_train.py ADDED
@@ -0,0 +1,99 @@
+ from datasets import load_dataset, load_from_disk, Dataset
+ import os
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ import pandas as pd
+
+ # cache the GitHub issues dataset locally so repeated runs work offline
+ datasetPath = "dataset/github.ds"
+
+ if os.path.exists(datasetPath):
+     issues_dataset = load_from_disk(datasetPath)
+ else:
+     issues_dataset = load_dataset("lewtun/github-issues", split="train")
+     issues_dataset.save_to_disk(datasetPath)
+
+ # keep only real issues that have at least one comment
+ issues_dataset = issues_dataset.filter(
+     lambda x: (x["is_pull_request"] == False and len(x["comments"]) > 0)
+ )
+
+ columns = issues_dataset.column_names
+ columns_to_keep = ["title", "body", "html_url", "comments"]
+ columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
+ issues_dataset = issues_dataset.remove_columns(columns_to_remove)
+
+ # explode the comments list so each row holds a single comment
+ issues_dataset.set_format("pandas")
+ df = issues_dataset[:]
+
+ comments_df = df.explode("comments", ignore_index=True)
+
+ comments_dataset = Dataset.from_pandas(comments_df)
+
+ comments_dataset = comments_dataset.map(
+     lambda x: {"comment_length": len(x["comments"].split())}
+ )
+
+ # drop very short comments
+ comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] > 15)
+
+ def concatenate_text(examples):
+     return {
+         "text": examples["title"]
+         + " \n "
+         + examples["body"]
+         + " \n "
+         + examples["comments"]
+     }
+
+
+ comments_dataset = comments_dataset.map(concatenate_text)
+
+ ########################
+
+ model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
+ tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+ model = AutoModel.from_pretrained(model_ckpt)
+
+ device = torch.device("cuda")
+ model.to(device)
+
+ def cls_pooling(model_output):
+     # use the [CLS] token embedding as the sentence representation
+     return model_output.last_hidden_state[:, 0]
+
+ def get_embeddings(text_list):
+     encoded_input = tokenizer(
+         text_list, padding=True, truncation=True, return_tensors="pt"
+     )
+     encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
+     model_output = model(**encoded_input)
+     return cls_pooling(model_output)
+
+ embedding = get_embeddings(comments_dataset["text"][0])
+
+ # embed every comment and build an in-memory FAISS index
+ embeddings_dataset = comments_dataset.map(
+     lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
+ )
+
+ embeddings_dataset.add_faiss_index(column="embeddings")
+
+ # embeddings_dataset.save_to_disk("dataset/embeddings")
+
+ question = "How can I load a dataset offline?"
+ question_embedding = get_embeddings([question]).cpu().detach().numpy()
+
+ scores, samples = embeddings_dataset.get_nearest_examples(
+     "embeddings", question_embedding, k=5
+ )
+
+ samples_df = pd.DataFrame.from_dict(samples)
+ samples_df["scores"] = scores
+ samples_df.sort_values("scores", ascending=False, inplace=True)
+
+ for _, row in samples_df.iterrows():
+     print(f"COMMENT: {row.comments}")
+     print(f"SCORE: {row.scores}")
+     print(f"TITLE: {row.title}")
+     print(f"URL: {row.html_url}")
+     print("=" * 50)
+     print()
+
+
+ print(issues_dataset)
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ datasets
+ transformers
+ torch
+ pandas
+ #faiss-gpu