efeno committed on
Commit
8ceb20f
·
1 Parent(s): ae88a6b

concurrent github requests

Browse files
api/concurrent_external_services.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ from fastapi import HTTPException
4
+ from dotenv import load_dotenv
5
+ from llama_index import download_loader
6
+ from llama_hub.github_repo import GithubRepositoryReader, GithubClient
7
+ from llama_index import VectorStoreIndex
8
+ from llama_index.vector_stores import DeepLakeVectorStore
9
+ from llama_index.storage.storage_context import StorageContext
10
+ import yaml
11
+
12
# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Fetch the OpenAI API key from the environment.
openai_api_key = os.getenv("OPENAI_API_KEY")


# Fail at import time when the key is missing or empty, so a misconfigured
# deployment is reported as soon as this module is imported.
if not openai_api_key:
    raise EnvironmentError("OpenAI API key not found in environment variables")
21
+
22
+
23
def get_validate_token(token_name):
    """Return the value of the environment variable named ``token_name``.

    Raises:
        EnvironmentError: If the variable is unset or set to an empty string.
    """
    value = os.getenv(token_name)
    if value:
        return value
    raise EnvironmentError(f"{token_name} not found in environment variables")
28
+
29
+
30
class InitiazlizeGithubService:
    """Load the files of a GitHub repository through ``GithubRepositoryReader``.

    The constructor reads ``GITHUB_TOKEN`` from the environment and builds a
    ``GithubClient`` from it. ``owner`` and ``repo`` hold the most recently
    validated repository coordinates.
    """

    # Accepts http(s)://[www.]github.com/<owner>/<repo>. The owner and repo
    # segments stop at the first "/", "?", "#" or whitespace so that paths,
    # query strings and fragments never leak into the repository name.
    _GITHUB_URL_PATTERN = re.compile(
        r"https?://(?:www\.)?github\.com/([^/?#\s]+)/([^/?#\s]+)"
    )

    def __init__(self):
        self.owner = None
        self.repo = None
        # Raises EnvironmentError when GITHUB_TOKEN is unset or empty.
        self.github_token = get_validate_token("GITHUB_TOKEN")
        self.github_client = self.initialize_github_client(self.github_token)
        # The return value is discarded, as in the original code; the call is
        # kept for whatever side effects it has. TODO confirm it is still
        # needed now that the reader is imported directly from llama_hub.
        download_loader("GithubRepositoryReader")

    def initialize_github_client(self, github_token):
        """Return a ``GithubClient`` authenticated with ``github_token``."""
        return GithubClient(github_token)

    def parse_github_url(self, url):
        """Extract ``(owner, repo)`` from a GitHub repository URL.

        Surrounding whitespace, a trailing ``.git``, and anything after the
        repository segment (sub-paths, ``?query``, ``#fragment``) are ignored.

        Returns:
            A ``(owner, repo)`` tuple, or ``(None, None)`` when ``url`` is
            empty or is not a GitHub repository URL.
        """
        if not url:
            return (None, None)

        match = self._GITHUB_URL_PATTERN.match(url.strip())
        if match is None:
            return (None, None)

        owner, repo = match.groups()
        # Clone URLs end in ".git"; that suffix is not part of the repo name.
        if repo.endswith(".git"):
            repo = repo[: -len(".git")]
        if not repo:
            return (None, None)
        return (owner, repo)

    def validate_owner_repo(self, owner, repo):
        """Record ``owner``/``repo`` on the instance when both are truthy.

        Returns:
            ``True`` if both values were non-empty (and were stored),
            ``False`` otherwise (instance state is left untouched).
        """
        if owner and repo:
            self.owner = owner
            self.repo = repo
            return True

        return False

    def load_repo_data(self, owner, repo, file_type, branch="main"):
        """Load every file with extension ``file_type`` from a repository.

        Args:
            owner: Repository owner, as returned by ``parse_github_url``.
            repo: Repository name, as returned by ``parse_github_url``.
            file_type: File extension passed to the reader's INCLUDE filter.
            branch: Branch to read. Defaults to ``"main"``, the value that
                was previously hard-coded.

        Returns:
            The documents returned by ``GithubRepositoryReader.load_data``.

        Raises:
            HTTPException: Status 400 when ``owner`` or ``repo`` is empty.
        """
        if not self.validate_owner_repo(owner, repo):
            raise HTTPException(
                status_code=400,
                detail="Invalid GitHub URL. Please enter a valid GitHub URL",
            )

        # Use the arguments rather than self.owner / self.repo: the instance
        # may be shared, and another caller could overwrite those attributes
        # between validation and use.
        loader = GithubRepositoryReader(
            self.github_client,
            owner=owner,
            repo=repo,
            filter_file_extensions=(
                [file_type],
                GithubRepositoryReader.FilterType.INCLUDE,
            ),
            verbose=False,
            concurrent_requests=25,
        )

        print(f"Loading {repo} repository by {owner}, file type: {file_type}")

        docs = loader.load_data(branch=branch)
        print("Documents uploaded:")
        for doc in docs:
            print(doc.metadata)

        return docs
84
+
85
+
86
class InitiazlizeActiveloopService:
    """Own the DeepLake vector store and the index/query engine built on it.

    The constructor validates ``ACTIVELOOP_TOKEN``, reads the dataset path
    from ``resources.yaml`` and opens the vector store with ``overwrite=True``.
    ``index`` and ``query_engine`` stay ``None`` until
    ``upload_to_activeloop`` has been called.
    """

    def __init__(self):
        # Raises EnvironmentError when ACTIVELOOP_TOKEN is unset or empty.
        # The value is only stored here; it is not passed to the vector store
        # constructor below.
        self.active_loop_token = get_validate_token("ACTIVELOOP_TOKEN")
        self.dataset_path = self.get_user_info("dataset_path")
        self.vector_store = DeepLakeVectorStore(
            dataset_path=f"hub://{self.dataset_path}",
            overwrite=True,
            runtime={"tensor_db": True},
        )

        self.storage_context = StorageContext.from_defaults(
            vector_store=self.vector_store
        )

        # Defined up front so the attributes always exist; previously they
        # were created only inside upload_to_activeloop().
        self.index = None
        self.query_engine = None

    def upload_to_activeloop(self, docs):
        """Index ``docs`` into the vector store and refresh the query engine.

        Each call replaces ``self.index`` and ``self.query_engine``.
        """
        self.index = VectorStoreIndex.from_documents(
            docs, storage_context=self.storage_context
        )
        self.query_engine = self.index.as_query_engine()

    def get_user_info(self, user_info):
        """Return ``info[user_info]`` from ``resources.yaml``.

        The file is resolved relative to the current working directory.

        Raises:
            FileNotFoundError: If ``resources.yaml`` does not exist.
            KeyError: If ``info`` or ``user_info`` is missing from the file.
        """
        # Explicit encoding so the result does not depend on the platform's
        # default locale encoding.
        with open("resources.yaml", "r", encoding="utf-8") as file:
            yaml_data = yaml.safe_load(file)

        return yaml_data["info"][user_info]
api/concurrent_requests.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import textwrap
2
+ from fastapi import FastAPI
3
+ from pydantic import BaseModel
4
+ from dotenv import load_dotenv
5
+
6
+ from api.concurrent_external_services import (
7
+ InitiazlizeGithubService,
8
+ InitiazlizeActiveloopService,
9
+ )
10
+ import asyncio
11
+
12
# Load environment variables
load_dotenv()

# Module-level service instances shared by all request handlers in this file.
# Both constructors run when this module is imported.
github_service = InitiazlizeGithubService()
activeloop_service = InitiazlizeActiveloopService()

# FastAPI application object the route decorators below register on.
app = FastAPI()
19
+
20
+
21
class GitHubRepoRequest(BaseModel):
    """Request body of the ``/upload`` endpoint."""

    githubRepoUrl: str  # URL of the GitHub repository to load
23
+
24
+
25
class UserCodeRequest(BaseModel):
    """Request body of the ``/retrieve`` endpoint."""

    userCode: str  # text passed as the query to the query engine
27
+
28
+
29
async def process_file(owner, repo, file_type):
    """Load one file type from the repository and index it.

    The GitHub load is a blocking call. It runs in the event loop's default
    executor so that several ``process_file`` coroutines scheduled with
    ``asyncio.gather`` overlap their downloads; without an ``await`` here
    they would run strictly one after another and block the event loop.

    The upload stays on the event-loop thread, so uploads from different
    coroutines never run at the same time.

    Args:
        owner: Repository owner.
        repo: Repository name.
        file_type: File extension to load.

    Returns:
        A status dict describing the processed file type.
    """
    loop = asyncio.get_running_loop()
    docs = await loop.run_in_executor(
        None, github_service.load_repo_data, owner, repo, file_type
    )
    activeloop_service.upload_to_activeloop(docs)
    return {
        "status": "success",
        "message": f"File type {file_type} processed successfully",
    }
36
+
37
+
38
@app.post("/upload")
async def scrape_and_upload_to_activeloop(repo_request: GitHubRepoRequest):
    """Load a GitHub repository and upload its documents, one job per file type.

    Args:
        repo_request: Request body carrying the repository URL.

    Returns:
        A status dict once every file type has been processed.
    """
    print(f"repo from user: {repo_request.githubRepoUrl}")

    owner, repo = github_service.parse_github_url(repo_request.githubRepoUrl)
    # Every extension carries its leading dot; the notebook entry was
    # previously written as "ipynb", inconsistent with the others.
    file_types = [".py", ".js", ".ts", ".md", ".ipynb"]

    results = await asyncio.gather(
        *(process_file(owner, repo, file_type) for file_type in file_types)
    )
    print(results)
    return {"status": "success", "message": "Repo processed successfully"}
59
+
60
+
61
@app.post("/retrieve")
async def find_similar_code_and_explain(code_request: UserCodeRequest):
    """Run the user's text as a query against the uploaded repository index.

    Args:
        code_request: Request body carrying the query text.

    Returns:
        A dict with the query engine's answer under ``"answer"``.

    Raises:
        HTTPException: Status 400 when no query engine is available yet,
            i.e. nothing has been uploaded.
    """
    # Imported locally: this module's top-level imports only bring in FastAPI.
    from fastapi import HTTPException

    print(f"code from user: {code_request.userCode}")

    # The query engine is created by the upload step; before that the
    # attribute may be missing or None.
    query_engine = getattr(activeloop_service, "query_engine", None)
    if query_engine is None:
        raise HTTPException(
            status_code=400,
            detail="No repository has been uploaded yet. Please upload a repository first",
        )

    intro_question = code_request.userCode
    print(f"Test question: {intro_question}")
    print("=" * 50)

    answer = query_engine.query(intro_question)
    print(f"Answer: {answer.__dict__}\n")
    return {
        "answer": answer,
    }
api/external_services.py CHANGED
@@ -51,23 +51,21 @@ class InitiazlizeGithubService:
51
 
52
  return False
53
 
54
- def load_repo_data(self, owner, repo, file_type):
55
  if self.validate_owner_repo(owner, repo):
56
  loader = GithubRepositoryReader(
57
  self.github_client,
58
  owner=self.owner,
59
  repo=self.repo,
60
  filter_file_extensions=(
61
- [file_type],
62
  GithubRepositoryReader.FilterType.INCLUDE,
63
  ),
64
  verbose=False,
65
  concurrent_requests=25,
66
  )
67
 
68
- print(
69
- f"Loading {self.repo} repository by {self.owner}, file type: {file_type}"
70
- )
71
 
72
  docs = loader.load_data(branch="main")
73
  print("Documents uploaded:")
 
51
 
52
  return False
53
 
54
+ def load_repo_data(self, owner, repo):
55
  if self.validate_owner_repo(owner, repo):
56
  loader = GithubRepositoryReader(
57
  self.github_client,
58
  owner=self.owner,
59
  repo=self.repo,
60
  filter_file_extensions=(
61
+ [".py", ".js", ".ts", ".md", ".ipynb"],
62
  GithubRepositoryReader.FilterType.INCLUDE,
63
  ),
64
  verbose=False,
65
  concurrent_requests=25,
66
  )
67
 
68
+ print(f"Loading {self.repo} repository by {self.owner}")
 
 
69
 
70
  docs = loader.load_data(branch="main")
71
  print("Documents uploaded:")
api/main.py CHANGED
@@ -4,7 +4,6 @@ from pydantic import BaseModel
4
  from dotenv import load_dotenv
5
 
6
  from api.external_services import InitiazlizeGithubService, InitiazlizeActiveloopService
7
- import asyncio
8
 
9
  # Load environment variables
10
  load_dotenv()
@@ -23,15 +22,6 @@ class UserCodeRequest(BaseModel):
23
  userCode: str
24
 
25
 
26
- async def process_file(owner, repo, file_type):
27
- docs = github_service.load_repo_data(owner, repo, file_type)
28
- activeloop_service.upload_to_activeloop(docs)
29
- return {
30
- "status": "success",
31
- "message": f"File type {file_type} processed successfully",
32
- }
33
-
34
-
35
  @app.post("/upload")
36
  async def scrape_and_upload_to_activeloop(repo_request: GitHubRepoRequest):
37
  # Add logic to scrape and upload to ActiveLoop
@@ -41,17 +31,9 @@ async def scrape_and_upload_to_activeloop(repo_request: GitHubRepoRequest):
41
  print(f"repo from user: {repo_request.githubRepoUrl}")
42
 
43
  owner, repo = github_service.parse_github_url(repo_request.githubRepoUrl)
44
- file_types = [".py", ".js", ".ts", ".md", "ipynb"]
45
- tasks = []
46
-
47
- for file_type in file_types:
48
- task = process_file(owner, repo, file_type)
49
- tasks.append(task)
50
 
51
- results = await asyncio.gather(*tasks)
52
- # docs = github_service.load_repo_data(owner, repo, file_type)
53
- # activeloop_service.upload_to_activeloop(docs)
54
- print(results)
55
  return {"status": "success", "message": "Repo processed successfully"}
56
 
57
 
 
4
  from dotenv import load_dotenv
5
 
6
  from api.external_services import InitiazlizeGithubService, InitiazlizeActiveloopService
 
7
 
8
  # Load environment variables
9
  load_dotenv()
 
22
  userCode: str
23
 
24
 
 
 
 
 
 
 
 
 
 
25
  @app.post("/upload")
26
  async def scrape_and_upload_to_activeloop(repo_request: GitHubRepoRequest):
27
  # Add logic to scrape and upload to ActiveLoop
 
31
  print(f"repo from user: {repo_request.githubRepoUrl}")
32
 
33
  owner, repo = github_service.parse_github_url(repo_request.githubRepoUrl)
34
+ docs = github_service.load_repo_data(owner, repo)
35
+ activeloop_service.upload_to_activeloop(docs)
 
 
 
 
36
 
 
 
 
 
37
  return {"status": "success", "message": "Repo processed successfully"}
38
 
39