Spaces:
Paused
Paused
update metadata, url and category
Browse files
app.py
CHANGED
@@ -5,7 +5,7 @@ from langchain.llms import OpenAI
|
|
5 |
from langchain.vectorstores import Qdrant
|
6 |
from openai.error import InvalidRequestError
|
7 |
from qdrant_client import QdrantClient
|
8 |
-
from config import
|
9 |
|
10 |
|
11 |
PERSIST_DIR_NAME = "nvdajp-book"
|
@@ -13,7 +13,7 @@ PERSIST_DIR_NAME = "nvdajp-book"
|
|
13 |
|
14 |
def get_retrieval_qa() -> RetrievalQA:
|
15 |
embeddings = OpenAIEmbeddings()
|
16 |
-
db_url, db_api_key, db_collection_name =
|
17 |
client = QdrantClient(url=db_url, api_key=db_api_key)
|
18 |
db = Qdrant(client=client, collection_name=db_collection_name, embeddings=embeddings)
|
19 |
retriever = db.as_retriever()
|
@@ -22,21 +22,17 @@ def get_retrieval_qa() -> RetrievalQA:
|
|
22 |
)
|
23 |
|
24 |
|
25 |
-
def _remove_prefix_path(p: str):
    """Drop the local crawl directory from the start of ``p``.

    A path that does not begin with that directory is returned unchanged.
    """
    crawl_root = "data/rtdocs/nvdajp-book.readthedocs.io/"
    return p[len(crawl_root):] if p.startswith(crawl_root) else p
|
28 |
-
|
29 |
-
|
30 |
def get_related_url(metadata):
|
31 |
-
|
32 |
-
url = "https://nvdajp-book.readthedocs.io/"
|
33 |
for m in metadata:
|
34 |
-
p = m['source']
|
35 |
-
|
36 |
-
if
|
37 |
continue
|
38 |
-
|
39 |
-
|
|
|
|
|
40 |
|
41 |
|
42 |
def main(query: str):
|
|
|
5 |
from langchain.vectorstores import Qdrant
|
6 |
from openai.error import InvalidRequestError
|
7 |
from qdrant_client import QdrantClient
|
8 |
+
from config import DB_CONFIG
|
9 |
|
10 |
|
11 |
PERSIST_DIR_NAME = "nvdajp-book"
|
|
|
13 |
|
14 |
def get_retrieval_qa() -> RetrievalQA:
|
15 |
embeddings = OpenAIEmbeddings()
|
16 |
+
db_url, db_api_key, db_collection_name = DB_CONFIG
|
17 |
client = QdrantClient(url=db_url, api_key=db_api_key)
|
18 |
db = Qdrant(client=client, collection_name=db_collection_name, embeddings=embeddings)
|
19 |
retriever = db.as_retriever()
|
|
|
22 |
)
|
23 |
|
24 |
|
|
|
|
|
|
|
|
|
|
|
25 |
def get_related_url(metadata):
    """Yield one HTML paragraph per distinct URL found in ``metadata``.

    Args:
        metadata: Iterable of mappings; each one must provide the keys
            ``"url"`` and ``"category"``.

    Yields:
        A ``<p>`` snippet that links to the URL and names its category.
        Only the first entry for a given URL is emitted; later entries with
        the same URL are skipped.

    Raises:
        KeyError: If an entry lacks ``"url"``, or a not-yet-seen entry lacks
            ``"category"``.
    """
    # Imported locally so the function does not depend on the module's import block.
    import html

    seen_urls = set()
    for m in metadata:
        url = m["url"]
        if url in seen_urls:
            continue
        seen_urls.add(url)
        # The values are placed into an HTML attribute and into text content,
        # so escape them (quotes included) to keep the markup well-formed.
        safe_url = html.escape(str(url), quote=True)
        safe_category = html.escape(str(m["category"]), quote=True)
        yield f'<p>URL: <a href="{safe_url}">{safe_url}</a> (category: {safe_category})</p>'
|
36 |
|
37 |
|
38 |
def main(query: str):
|
config.py
CHANGED
@@ -1,8 +1,21 @@
|
|
1 |
import os
|
2 |
|
3 |
|
|
|
|
|
|
|
4 |
def get_db_config():
    """Return the Qdrant settings as ``(url, api_key, collection_name)``.

    The URL and API key are read from the ``QDRANT_URL`` and
    ``QDRANT_API_KEY`` environment variables; a missing variable raises
    ``KeyError``. The collection name is fixed.
    """
    env = os.environ
    return env["QDRANT_URL"], env["QDRANT_API_KEY"], "nvdajp-book"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
|
3 |
|
4 |
+
# Selects which settings DB_CONFIG is built from: True uses get_db_config()
# (values from environment variables), False uses the localhost settings.
SAAS = True
|
5 |
+
|
6 |
+
|
7 |
def get_db_config():
    """Build the ``(url, api_key, collection_name)`` tuple for Qdrant.

    ``QDRANT_URL`` and ``QDRANT_API_KEY`` must be set in the environment;
    looking up a missing one raises ``KeyError``. The collection name is a
    fixed string.
    """
    settings = (
        os.environ["QDRANT_URL"],
        os.environ["QDRANT_API_KEY"],
        "nvdajp-book",
    )
    return settings
|
12 |
+
|
13 |
+
|
14 |
+
def get_local_db_config():
    """Return settings for a Qdrant instance reachable at ``localhost``.

    Returns:
        A ``(url, api_key, collection_name)`` tuple. ``api_key`` is always
        ``None``; no environment variables are read.
    """
    return "localhost", None, "nvdajp-book"


# Backward-compatible alias: the function was first introduced under this
# misspelled name, and existing references still use it.
get_local_db_congin = get_local_db_config
|
19 |
+
|
20 |
+
|
21 |
+
# Evaluated once at import time as a (url, api_key, collection_name) tuple.
# With SAAS enabled this reads QDRANT_URL / QDRANT_API_KEY from the
# environment, so importing this module raises KeyError if either is unset.
DB_CONFIG = get_db_config() if SAAS else get_local_db_congin()
|
store.py
CHANGED
@@ -3,16 +3,29 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
3 |
from langchain.embeddings import OpenAIEmbeddings
|
4 |
from langchain.vectorstores import Qdrant
|
5 |
# from qdrant_client import QdrantClient
|
6 |
-
from config import
|
7 |
|
8 |
|
9 |
CHUNK_SIZE = 500
|
10 |
|
11 |
|
|
|
|
|
|
|
|
|
|
|
12 |
def get_documents(path: str):
|
13 |
loader = ReadTheDocsLoader(path, encoding="utf-8")
|
14 |
docs = loader.load()
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
|
18 |
def get_text_chunk(docs):
|
@@ -23,7 +36,7 @@ def get_text_chunk(docs):
|
|
23 |
|
24 |
def store(texts):
|
25 |
embeddings = OpenAIEmbeddings()
|
26 |
-
db_url, db_api_key, db_collection_name =
|
27 |
# client = QdrantClient(url=db_url, api_key=db_api_key, prefer_grpc=True)
|
28 |
_ = Qdrant.from_documents(
|
29 |
texts,
|
@@ -48,6 +61,9 @@ if __name__ == "__main__":
|
|
48 |
args = sys.argv
|
49 |
if len(args) != 2:
|
50 |
print("No args, you need two args for html_path")
|
|
|
|
|
|
|
51 |
else:
|
52 |
path = args[1]
|
53 |
# dir_name = args[2]
|
|
|
3 |
from langchain.embeddings import OpenAIEmbeddings
|
4 |
from langchain.vectorstores import Qdrant
|
5 |
# from qdrant_client import QdrantClient
|
6 |
+
from config import DB_CONFIG
|
7 |
|
8 |
|
9 |
CHUNK_SIZE = 500
|
10 |
|
11 |
|
12 |
+
def _remove_prefix_path(p: str):
    """Return ``p`` without the leading local crawl directory.

    If ``p`` does not start with that directory it is returned as-is.
    """
    crawl_root = "data/rtdocs/nvdajp-book.readthedocs.io/"
    if p.startswith(crawl_root):
        return p[len(crawl_root):]
    return p
|
15 |
+
|
16 |
+
|
17 |
def get_documents(path: str):
    """Load documents from ``path`` and yield them with enriched metadata.

    Documents are read with ``ReadTheDocsLoader`` (UTF-8). Each document keeps
    its original metadata, with three keys set or overwritten:

    * ``category`` - the fixed string ``"ja-book"``
    * ``source`` - the original ``source`` value with the local crawl
      directory prefix removed
    * ``url`` - the documentation base URL followed by the new ``source``

    Args:
        path: Directory passed to ``ReadTheDocsLoader``.

    Yields:
        The loaded document objects, with ``metadata`` replaced by the merged
        mapping. This is a generator, so nothing is loaded until the first
        item is requested.

    Raises:
        KeyError: If a loaded document's metadata has no ``"source"`` key.
    """
    base_url = "https://nvdajp-book.readthedocs.io/"
    loader = ReadTheDocsLoader(path, encoding="utf-8")
    for doc in loader.load():
        source = _remove_prefix_path(doc.metadata["source"])
        # In a dict union the right-hand operand wins, so these values
        # override any same-named keys produced by the loader.
        doc.metadata = doc.metadata | {
            "category": "ja-book",
            "source": source,
            "url": f"{base_url}{source}",
        }
        yield doc
|
29 |
|
30 |
|
31 |
def get_text_chunk(docs):
|
|
|
36 |
|
37 |
def store(texts):
|
38 |
embeddings = OpenAIEmbeddings()
|
39 |
+
db_url, db_api_key, db_collection_name = DB_CONFIG
|
40 |
# client = QdrantClient(url=db_url, api_key=db_api_key, prefer_grpc=True)
|
41 |
_ = Qdrant.from_documents(
|
42 |
texts,
|
|
|
61 |
args = sys.argv
|
62 |
if len(args) != 2:
|
63 |
print("No args, you need two args for html_path")
|
64 |
+
docs = get_documents("data/rtdocs/nvdajp-book.readthedocs.io/ja/latest")
|
65 |
+
print(type(docs))
|
66 |
+
breakpoint()
|
67 |
else:
|
68 |
path = args[1]
|
69 |
# dir_name = args[2]
|