File size: 3,497 Bytes
8f0c52a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import re
import os
from fastapi import HTTPException
from dotenv import load_dotenv
from llama_index import download_loader
from llama_hub.github_repo import GithubRepositoryReader, GithubClient
from llama_index import VectorStoreIndex
from llama_index.vector_stores import DeepLakeVectorStore
from llama_index.storage.storage_context import StorageContext
import yaml

# Populate the process environment through python-dotenv before any key lookup.
load_dotenv()

# The OpenAI key is mandatory for this module: read it once at import time and
# fail fast when it is missing or empty.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is None or openai_api_key == "":
    raise EnvironmentError("OpenAI API key not found in environment variables")


def get_validate_token(token_name):
    """Return the value of the environment variable named *token_name*.

    Args:
        token_name: Name of the environment variable to look up.

    Returns:
        The non-empty string stored in that variable.

    Raises:
        EnvironmentError: If the variable is unset or set to an empty string.
    """
    value = os.environ.get(token_name)
    if value:
        return value
    raise EnvironmentError(f"{token_name} not found in environment variables")


class InitiazlizeGithubService:
    """Fetch the documents of a GitHub repository through ``GithubRepositoryReader``.

    The instance keeps the GitHub client built from ``GITHUB_TOKEN`` and
    remembers the last owner/repository pair accepted by
    ``validate_owner_repo``.
    """

    # Captures the owner and repository path segments of a github.com URL.
    # The segments stop at "/", "?", "#" or whitespace so that query strings,
    # fragments and deeper paths (``/tree/main``) never leak into the repo name.
    _GITHUB_URL_PATTERN = re.compile(
        r"https?://(?:www\.)?github\.com/([^/?#\s]+)/([^/?#\s]+)"
    )

    def __init__(self):
        """Read ``GITHUB_TOKEN`` from the environment and build the client.

        Raises:
            EnvironmentError: If ``GITHUB_TOKEN`` is unset or empty.
        """
        self.owner = None
        self.repo = None
        self.github_token = get_validate_token("GITHUB_TOKEN")  # Check for GitHub Token
        self.github_client = self.initialize_github_client(self.github_token)
        # NOTE(review): the return value is discarded and the reader class is
        # already imported at module level — confirm this call is still needed.
        download_loader("GithubRepositoryReader")

    def initialize_github_client(self, github_token):
        """Return a ``GithubClient`` constructed from *github_token*."""
        return GithubClient(github_token)

    def parse_github_url(self, url):
        """Split a GitHub repository URL into ``(owner, repo)``.

        Accepts ``http``/``https`` URLs with or without ``www.``. Any trailing
        path, query string or fragment is ignored, and a ``.git`` suffix on the
        repository segment is removed.

        Args:
            url: The URL to parse. Non-string values are treated as invalid.

        Returns:
            A ``(owner, repo)`` tuple of strings, or ``(None, None)`` when the
            URL is not a recognisable GitHub repository URL.
        """
        if not isinstance(url, str):
            return (None, None)

        match = self._GITHUB_URL_PATTERN.match(url.strip())
        if match is None:
            return (None, None)

        owner, repo = match.groups()
        # Clone URLs end in ".git"; the reader wants the bare repository name.
        if repo.endswith(".git"):
            repo = repo[: -len(".git")]
        if not repo:
            return (None, None)

        return (owner, repo)

    def validate_owner_repo(self, owner, repo):
        """Store *owner* and *repo* on the instance when both are non-empty.

        Returns:
            ``True`` if both values are truthy (and were stored), else ``False``.
        """
        if owner and repo:
            self.owner = owner
            self.repo = repo
            return True

        return False

    def load_repo_data(self, owner, repo, branch="main"):
        """Load the repository's documents from the given branch.

        Args:
            owner: Repository owner (user or organisation name).
            repo: Repository name.
            branch: Branch to read from. Defaults to ``"main"``, the value that
                was previously hard-coded.

        Returns:
            The documents returned by ``GithubRepositoryReader.load_data``.

        Raises:
            HTTPException: With status 400 when *owner* or *repo* is empty.
        """
        if not self.validate_owner_repo(owner, repo):
            raise HTTPException(
                status_code=400,
                detail="Invalid GitHub URL. Please enter a valid GitHub URL",
            )

        loader = GithubRepositoryReader(
            self.github_client,
            owner=self.owner,
            repo=self.repo,
            # Extension list paired with FilterType.INCLUDE.
            filter_file_extensions=(
                [".py", ".js", ".ts", ".md"],
                GithubRepositoryReader.FilterType.INCLUDE,
            ),
            verbose=False,
            concurrent_requests=5,
        )

        print(f"Loading {self.repo} repository by {self.owner}")

        docs = loader.load_data(branch=branch)
        print("Documents uploaded:")
        for doc in docs:
            print(doc.metadata)

        return docs


class InitiazlizeActiveloopService:
    """Index documents into a DeepLake vector store and expose a query engine.

    The dataset location is read from the ``info.dataset_path`` entry of
    ``resources.yaml``; the Activeloop token is read from ``ACTIVELOOP_TOKEN``.
    """

    def __init__(self):
        """Validate the token, read the dataset path and build the vector store.

        Raises:
            EnvironmentError: If ``ACTIVELOOP_TOKEN`` is unset or empty.
            FileNotFoundError: If ``resources.yaml`` cannot be found.
            KeyError: If ``info`` or ``dataset_path`` is missing from the YAML.
        """
        self.active_loop_token = get_validate_token(
            "ACTIVELOOP_TOKEN"
        )  # Check for Activeloop Token
        self.dataset_path = self.get_user_info("dataset_path")

        # Filled in by upload_to_activeloop(); declared here so the attributes
        # exist (as None) before any upload instead of raising AttributeError.
        self.index = None
        self.query_engine = None

        # NOTE(review): overwrite=True presumably replaces any existing dataset
        # at this path each time the service is constructed — confirm intended.
        self.vector_store = DeepLakeVectorStore(
            dataset_path=f"hub://{self.dataset_path}",
            overwrite=True,
            runtime={"tensor_db": True},
        )

        self.storage_context = StorageContext.from_defaults(
            vector_store=self.vector_store
        )

    def upload_to_activeloop(self, docs):
        """Build a ``VectorStoreIndex`` from *docs* and derive a query engine.

        The index is created with this instance's storage context; both the
        index and the query engine are stored on the instance.

        Args:
            docs: Documents to pass to ``VectorStoreIndex.from_documents``.
        """
        self.index = VectorStoreIndex.from_documents(
            docs, storage_context=self.storage_context
        )
        self.query_engine = self.index.as_query_engine()

    def get_user_info(self, user_info):
        """Return ``info[user_info]`` from ``resources.yaml``.

        The file path is relative, so it is resolved against the current
        working directory.

        Args:
            user_info: Key to look up under the top-level ``info`` mapping.

        Returns:
            The value stored under that key.

        Raises:
            FileNotFoundError: If ``resources.yaml`` does not exist.
            KeyError: If ``info`` or *user_info* is not present.
        """
        # Explicit UTF-8 keeps decoding independent of the platform locale.
        with open("resources.yaml", "r", encoding="utf-8") as file:
            yaml_data = yaml.safe_load(file)

        return yaml_data["info"][user_info]