Spaces:
Runtime error
Runtime error
gamingflexer
committed on
Commit
•
59a1246
1
Parent(s):
424c175
Refactor
Browse files
- src/scrapper/extractor.py +2 -1
- src/scrapper/main.py +4 -1
src/scrapper/extractor.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
from langchain_community.chat_models import ChatOpenAI
|
2 |
-
from langchain import PromptTemplate
|
|
|
3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
4 |
import tiktoken
|
5 |
from typing import Union
|
|
|
1 |
from langchain_community.chat_models import ChatOpenAI
|
2 |
+
from langchain.prompts import PromptTemplate
|
3 |
+
from langchain.chains import LLMChain
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
import tiktoken
|
6 |
from typing import Union
|
src/scrapper/main.py
CHANGED
@@ -3,6 +3,7 @@ from scrapper.arxiv import get_paper_id,Arxiv
|
|
3 |
from scrapper.extractor import get_google_scrape,init_extractor
|
4 |
from tqdm import tqdm
|
5 |
import os
|
|
|
6 |
|
7 |
class ArxivPaper:
|
8 |
|
@@ -33,6 +34,7 @@ class ArxivPaper:
|
|
33 |
|
34 |
def get_paper_details_batch(self, paper_ids: list, path: str = "./data/papers"):
|
35 |
path_author = os.path.join(path, self.author_name.replace(" ", "_"))
|
|
|
36 |
for i in tqdm(paper_ids):
|
37 |
paper = Arxiv(i)
|
38 |
paper.load()
|
@@ -41,4 +43,5 @@ class ArxivPaper:
|
|
41 |
extractor=self.extractor,
|
42 |
text_splitter=self.text_splitter,)
|
43 |
paper.chunker()
|
44 |
-
paper.save_chunks(include_metadata=True, path=path_author)
|
|
|
|
3 |
from scrapper.extractor import get_google_scrape,init_extractor
|
4 |
from tqdm import tqdm
|
5 |
import os
|
6 |
+
from config import OPENAI_API_KEY
|
7 |
|
8 |
class ArxivPaper:
|
9 |
|
|
|
34 |
|
35 |
def get_paper_details_batch(self, paper_ids: list, path: str = "./data/papers"):
|
36 |
path_author = os.path.join(path, self.author_name.replace(" ", "_"))
|
37 |
+
data = {}
|
38 |
for i in tqdm(paper_ids):
|
39 |
paper = Arxiv(i)
|
40 |
paper.load()
|
|
|
43 |
extractor=self.extractor,
|
44 |
text_splitter=self.text_splitter,)
|
45 |
paper.chunker()
|
46 |
+
paper.save_chunks(include_metadata=True, path=path_author)
|
47 |
+
|