import os
import json
import asyncio
import requests

from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv()

from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Resolve paths relative to this file
current_file_path = os.path.dirname(os.path.abspath(__file__))
root_path = os.path.abspath(current_file_path)
data_path = os.path.join(root_path, "data_simple")
db_path = os.path.join(root_path, "database", "init")
os.makedirs(data_path, exist_ok=True)  # make sure the output directories exist
os.makedirs(db_path, exist_ok=True)

# 1. Fetch GitHub repositories whose star count falls in a closed interval,
#    sorted by stars descending, for later saving.
def get_top_repo_by_star(per_page=100, page=1, min_star_num=0, max_star_num=500000):
    query = f'stars:{min_star_num}..{max_star_num} pushed:>2021-01-01'
    sort = 'stars'
    order = 'desc'
    # Double quotes here: nesting the same quote type inside an f-string is a
    # syntax error before Python 3.12.
    search_url = f"{os.getenv('GITHUB_API_URL')}/search/repositories?q={query}&sort={sort}&order={order}&per_page={per_page}&page={page}"
    headers = {"Authorization": f"token {os.getenv('GITHUB_TOKEN')}"}

    response = requests.get(search_url, headers=headers)
    if response.status_code == 200:
        payload = response.json()
        total_count = payload['total_count']
        total_page = total_count // per_page + 1
        print(f"Total page: {total_page}, current page: {page}")
        if payload['incomplete_results']: print("Incomplete results")
        return payload['items'], payload['items'][-1]['stargazers_count'], total_count
    else:
        print(f"Failed to retrieve repositories: {response.status_code}")
        # Abort immediately on a failed request
        exit(1)

def save_repo_by_star(max_star=500000):
    # GitHub caps each search request at 100 results, so page stays fixed at 1
    # and we walk downward through star counts instead of paginating.
    top_repositories, max_star, count = get_top_repo_by_star(per_page=100, page=1, min_star_num=1000, max_star_num=max_star)

    for i, repo in enumerate(top_repositories):
        owner = repo['owner']['login']
        name = repo['name']
        unique_id = f"{name} -- {owner}"
        stars = repo['stargazers_count']
        print(f"Repository {i}: {name}, Stars: {stars}")

        # Save each repository as a JSON file
        with open(os.path.join(data_path, f'{unique_id}.json'), 'w') as f:
            json.dump(repo, f, indent=4)

    # Fewer than 100 results means the bottom of the star range was reached
    if count < 100: exit(1)

    return max_star

def main_repo():
    max_star = 500000  # the most-starred repository has roughly 500k stars
    num = 1
    while True:
        print("=" * 50)
        print(f"Round {num}, Max star: {max_star}")
        max_star = save_repo_by_star(max_star)
        num += 1
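
# Hedged sketch (not in the original script): GitHub's search API is
# rate-limited (30 authenticated requests per minute at the time of writing),
# so a loop like main_repo() can start receiving 403s. One option is to poll
# the documented /rate_limit endpoint between rounds; the threshold below is
# an illustrative choice.
import time

def wait_for_search_quota(min_remaining=2):
    url = f"{os.getenv('GITHUB_API_URL')}/rate_limit"
    headers = {"Authorization": f"token {os.getenv('GITHUB_TOKEN')}"}
    search = requests.get(url, headers=headers).json()["resources"]["search"]
    if search["remaining"] < min_remaining:
        # "reset" is a Unix timestamp; sleep until the quota refills
        time.sleep(max(search["reset"] - time.time(), 0) + 1)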

# 2. Convert the saved data into vectors
async def create_vector_db(docs, embeddings, batch_size=800):
    # Initialize the index with the first batch
    vector_db = await FAISS.afrom_documents(docs[0:batch_size], embeddings)
    if len(docs) <= batch_size: return vector_db

    # Create one embedding task per remaining batch
    tasks = []
    for start_idx in range(batch_size, len(docs), batch_size):
        end_idx = min(start_idx + batch_size, len(docs))
        tasks.append(FAISS.afrom_documents(docs[start_idx:end_idx], embeddings))

    # Run the batches concurrently
    results = await asyncio.gather(*tasks)

    # Merge the per-batch indexes into the first one
    for temp_db in results:
        vector_db.merge_from(temp_db)
    return vector_db
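
# Hedged usage sketch (illustrative, not part of the pipeline): builds a tiny
# index from made-up documents to show the batching and merging behaviour.
async def demo_create_vector_db(embeddings):
    sample_docs = [Document(page_content=f"sample repo {i}") for i in range(5)]
    db = await create_vector_db(sample_docs, embeddings, batch_size=2)
    print(db.index.ntotal)  # number of vectors stored in the raw FAISS index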

async def main_convert_to_vector():
    # List the saved repository files
    files = os.listdir(data_path)

    # Build one Document per repository
    docs = []
    for file in tqdm(files):
        if not file.endswith(".json"): continue
        with open(os.path.join(data_path, file), "r", encoding="utf-8") as f:
            data = json.load(f)

        content_map = {
            "name": data["name"],
            "description": data["description"],
        }
        content = json.dumps(content_map)
        doc = Document(page_content=content,
                       metadata={"html_url": data["html_url"],
                                 "topics": data["topics"],
                                 "created_at": data["created_at"],
                                 "updated_at": data["updated_at"],
                                 "star_count": data["stargazers_count"]})
        docs.append(doc)
    print(f"Total {len(docs)} documents.")

    # Initialize the embedding model
    embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"),
                                  base_url=os.getenv("OPENAI_BASE_URL"),
                                  model="text-embedding-3-small")
    print("Embedding model initialized: text-embedding-3-small")

    # Embed the documents, or load the cached index if one already exists
    if os.path.exists(os.path.join(db_path, "init.faiss")):
        vector_db = FAISS.load_local(db_path, embeddings=embeddings,
                                     index_name="init",
                                     allow_dangerous_deserialization=True)
    else:
        vector_db = await create_vector_db(docs, embeddings=embeddings)
        vector_db.save_local(db_path, index_name="init")
    return vector_db
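
# Hedged usage sketch (illustrative): query the finished index with
# LangChain's similarity_search; the query string and k below are made up.
def search_repos(vector_db, query, k=5):
    for doc in vector_db.similarity_search(query, k=k):
        print(doc.metadata["html_url"], doc.metadata["star_count"])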

if __name__ == "__main__":
    # 1. Fetch repository metadata
    # main_repo()

    # 2. Build the vector database
    asyncio.run(main_convert_to_vector())