Red-tech-hub committed
Commit 19353ca • 1 Parent(s): 442880a

[update] new vectors

.env CHANGED
@@ -1,2 +1,4 @@
 TRANSFORMERS_CACHE=/code/model/cache
-HF_HUB_DISABLE_SYMLINKS_WARNING=true
+HF_HUB_DISABLE_SYMLINKS_WARNING=true
+PINECONE_API_KEY="04e7b9a8-4d29-4c1a-a4bd-f61d84cbbc58"
+HF_HOME=/code/model/cache
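
For context, a minimal sketch of how these .env entries are consumed by the scripts in this commit (finetunePinecone.py reads the key the same way); the variable names come from the diff above, nothing else is assumed:

import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory into os.environ
pinecone_key = os.environ.get("PINECONE_API_KEY")  # added in this commit
hf_cache = os.getenv("HF_HOME")                    # /code/model/cache, mirrors TRANSFORMERS_CACHE
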
chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/data_level0.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e61ddecf856f7bfc716cdcf4c732fd6e919f653f5f0bcfbaff15e8d0d10ad097
-size 3212000
+oid sha256:c43c0fbe34b585fd92affd208ed4762d98c01ed73533d075ce449ef6c622c872
+size 1676000
chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/header.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fdb00e89b6ee7733fd37556b1da3447d9895ad7431512096c0e073ed667a25d0
+oid sha256:35e84f099c65ade720d9a85b056a1619549b039e5ec79157f87d43bb6918187f
 size 100
chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/index_metadata.pickle RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:67d02ffeefff7683db7f9af38d1ea492ebfa8f648f20ad28562938189ebf4f8b
+oid sha256:cd6a6304deeab5d34b507b65b3b8d295efe81f2a272f8ad148f7d78cc578f679
 size 55974
chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/length.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c7e634c1869715e5681fdf53cba91ef68fe057bdffe0330374fd92b4db85540e
+oid sha256:dee53a6241ae7881c9ab2e4e091f69b64ef544e12c427f9278da1e9b5b9c93c5
 size 4000
chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/link_lists.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:903583451dafbf32b09d933ad25705452073842325e3f3caddc8a4382f8fb655
+oid sha256:21cb3111a44e08a70ca4290809114061e0a51f7f7a0a22afc52280db70942449
 size 8624
chroma_db/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a891484ada865ece4c707faf558848011741f497d5431653122592f0258d46f4
-size 23683072
+oid sha256:a1c050a90eba5d3d16210b23d6b8578fc53b8b77eab8b19ddd3f4c81910dae16
+size 21774336
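
All of the chroma_db blobs above are Git LFS pointer files: the repo stores only a spec version line, the SHA-256 of the real binary, and its byte size, so this commit swaps pointers rather than binaries. A minimal sketch of how the oid/size pair could be recomputed locally, assuming the actual binaries have been fetched (e.g. via `git lfs pull`):

import hashlib
import os

def lfs_pointer(path: str) -> str:
    # Reproduces the three pointer fields shown in the diffs above.
    oid = hashlib.sha256(open(path, "rb").read()).hexdigest()
    return (
        "version https://git-lfs.github.com/spec/v1\n"
        f"oid sha256:{oid}\n"
        f"size {os.path.getsize(path)}\n"
    )

print(lfs_pointer("chroma_db/chroma.sqlite3"))
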
finetune.py CHANGED
@@ -8,15 +8,15 @@ from transformers import AutoModelForCausalLM
 
 load_dotenv()
 
-ollama_ef = AutoModelForCausalLM.from_pretrained("nomic-embed-text-v1.5.Q5_K_S.gguf",
-                                                 model_type='llama',
-                                                 max_new_tokens = 10960,
-                                                 threads = 3,
-                                                 )
+# ollama_ef = AutoModelForCausalLM.from_pretrained("nomic-embed-text-v1.5.Q5_K_S.gguf",
+#                                                  model_type='llama',
+#                                                  max_new_tokens = 10960,
+#                                                  threads = 3,
+#                                                  )
 
 csv_files = []
 root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-cve_csv_path = os.path.join(root_dir, 'data\\cve')
+cve_csv_path = os.path.join(root_dir, 'codevulnerabilityai\\data\\cve')
 
 csv_files.extend([os.path.join(cve_csv_path, f) for f in os.listdir(cve_csv_path) if f.endswith('.csv')])
 
@@ -36,7 +36,7 @@ chroma_db_directory = str("chroma_db/")
 
 client = chromadb.PersistentClient(path=os.path.join(chroma_data_path, chroma_db_directory))
 
-collection = client.get_or_create_collection(name="CVE", embedding_function=ollama_ef)
+collection = client.get_or_create_collection(name="CVE")
 
 documents_to_add = []
 ids_to_add = []
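
The dropped embedding_function is the substantive change here: the removed ollama_ef was a causal-LM handle (the model_type/threads keywords look like ctransformers' AutoModelForCausalLM rather than transformers'), and it never implemented Chroma's EmbeddingFunction interface, which is presumably why it was commented out. Without the argument, chromadb falls back to its built-in default embedder (Sentence Transformers all-MiniLM-L6-v2 served via ONNX). A minimal sketch of the behavior the edited script now relies on, assuming the chromadb==0.5 pin from requirements.txt:

import chromadb

client = chromadb.PersistentClient(path="chroma_db/")
collection = client.get_or_create_collection(name="CVE")  # no embedding_function: default embedder

# Documents passed as text are embedded automatically by the default embedder.
collection.add(documents=["buffer overflow in the example parser"], ids=["demo-1"])
print(collection.query(query_texts=["overflow"], n_results=1))
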
finetunePinecone.py ADDED
@@ -0,0 +1,112 @@
+import uuid
+import chromadb
+import pandas as pd
+import os
+from dotenv import load_dotenv
+import json
+from transformers import AutoModelForCausalLM
+
+from pinecone.grpc import PineconeGRPC as Pinecone
+from pinecone import ServerlessSpec
+
+
+load_dotenv()
+
+ollama_ef = AutoModelForCausalLM.from_pretrained("nomic-embed-text-v1.5.Q5_K_S.gguf",
+                                                 model_type='llama',
+                                                 max_new_tokens = 10960,
+                                                 threads = 3,
+                                                 )
+
+csv_files = []
+root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+cve_csv_path = os.path.join(root_dir, 'data\\cve')
+
+csv_files.extend([os.path.join(cve_csv_path, f) for f in os.listdir(cve_csv_path) if f.endswith('.csv')])
+
+dtype_dict = {
+    'Name': str,
+    'Status': str,
+    'Description': str,
+    'References': str,
+    'Phase': str,
+    'Votes': str,
+    'Comments': str
+}
+
+
+pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
+
+chroma_data_path = str(os.getenv('CHROMA_DATA_PATH'))
+
+chroma_db_directory = str("chroma_db/")
+
+client = chromadb.PersistentClient(path=os.path.join(chroma_data_path, chroma_db_directory))
+
+collection = client.get_or_create_collection(name="CVE", embedding_function=ollama_ef)
+
+index_name = "code-vulnerability-ai"
+
+documents_to_add = []
+ids_to_add = []
+metadata_to_add = []
+documents_to_add_string = []
+
+batch_size = 10
+current_batch = 0
+
+if csv_files:
+    for csv_file in csv_files:
+        print(f"Processing {csv_file}...")
+        df = pd.read_csv(csv_file, on_bad_lines='skip', dtype=dtype_dict)
+
+        documents = df['Description'].fillna('').astype(str).tolist()
+
+        if not df.empty and 'Description' in df.columns:
+            for index, row in df.iterrows():
+                metadata_parts = row['Name'].split(';')
+                metadata = {
+                    "Name": str(metadata_parts[0].strip()),
+                    "Status": str(metadata_parts[1].strip()) if len(metadata_parts) > 1 else "",
+                    "Description": str(metadata_parts[2].strip()) if len(metadata_parts) > 2 else "",
+                    "References": str(metadata_parts[3].strip()) if len(metadata_parts) > 3 else "",
+                    "Phase": str(metadata_parts[4].strip()) if len(metadata_parts) > 4 else "",
+                    "Votes": str(metadata_parts[5].strip()) if len(metadata_parts) > 5 else "",
+                }
+                document_id = str(uuid.uuid4())
+
+                document_content = metadata["Description"]
+
+                document = {'id': document_id, 'content': document_content}
+
+                documents_to_add.append(document)
+                documents_to_add_string.append(json.dumps(documents_to_add))
+                ids_to_add.append(document_id)
+                metadata_to_add.append(metadata)
+
+                current_batch += 1
+                if current_batch % batch_size == 0:
+                    print(f"Batch {current_batch // batch_size} added to the collection.")
+                    collection.add(documents=documents_to_add_string, ids=ids_to_add, metadatas=metadata_to_add)
+                    documents_to_add = []
+                    ids_to_add = []
+                    metadata_to_add = []
+                    documents_to_add_string = []
+                    print(f"Batch {current_batch // batch_size} completed.")
+
+        else:
+            print(f"Skipping file {csv_file} due to empty DataFrame or missing 'Description' column")
+else:
+    print("No CSV files found in the directory. Skipping processing.")
+
+# Add the remaining documents if there are less than 100 left
+if documents_to_add:
+    print(f"Adding remaining {len(documents_to_add)} documents to the collection.")
+    collection.add(documents=documents_to_add_string, ids=ids_to_add, metadatas=metadata_to_add)
+
+# results = collection.query(
+#     query_texts=["Dotnet"],
+#     n_results=3,
+# )
+
+# print(results)
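
Despite the filename, the new script only instantiates the Pinecone client and defines index_name; every write still goes to the Chroma collection. A minimal sketch of what the missing Pinecone half could look like with the pinecone-client v3+ serverless API; the dimension (768, matching nomic-embed-text-v1.5's default output size) and the cloud/region are assumptions, not taken from the commit:

import os
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
index_name = "code-vulnerability-ai"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,     # assumed: nomic-embed-text-v1.5 defaults to 768-dim vectors
        metric="cosine",   # assumed
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # assumed placement
    )

index = pc.Index(index_name)
# Upsert takes (id, values, metadata) tuples; the zero vector here is a placeholder.
index.upsert(vectors=[("demo-1", [0.0] * 768, {"Name": "CVE-1999-0001"})])
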
requirements.txt CHANGED
@@ -10,7 +10,8 @@ langchain==0.1.11
 langchain_core==0.1.48
 langchain_community==0.0.36
 langserve==0.1.1
-chromadb==0.4.24
+chromadb==0.5
 starlette==0.37.2
 typer==0.10.0
-sentence-transformers
+sentence-transformers
+pinecone-client
run.py CHANGED
@@ -11,14 +11,14 @@ os.environ['TRANSFORMERS_CACHE'] = '/code/model/cache/'
 
 model_kwargs = {'trust_remote_code': True}
 
-embedding = HuggingFaceEmbeddings(
-    model_name="nomic-ai/nomic-embed-text-v1.5",
-    model_kwargs=model_kwargs
-)
+# embedding = HuggingFaceEmbeddings(
+#     model_name="nomic-ai/nomic-embed-text-v1.5",
+#     model_kwargs=model_kwargs
+# )
 
 db = Chroma(
     persist_directory="./chroma_db",
-    embedding_function=embedding,
+    # embedding_function=embedding,
     collection_name='CVE'
 )
 
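
With the embedding commented out on the read side too, both scripts now lean on the same default embedder: when no embedding function is set, langchain_community's Chroma (0.0.36 as pinned) forwards the raw query as query_texts to the underlying collection, which embeds it with its default model. A minimal sketch of the resulting retrieval path; the query string is illustrative only:

from langchain_community.vectorstores import Chroma

db = Chroma(
    persist_directory="./chroma_db",
    collection_name='CVE',
)

# Query text is embedded by chromadb's default embedder, matching how
# finetune.py now writes the collection.
for doc in db.similarity_search("SQL injection in login form", k=3):
    print(doc.metadata.get("Name"), doc.page_content[:80])
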