Spaces:
Runtime error
Runtime error
html template
Browse files- Dockerfile +1 -1
- __pycache__/main.cpython-310.pyc +0 -0
- __pycache__/models.cpython-310.pyc +0 -0
- chunk_config.json → configs/chunk_config.json +0 -0
- embed_config.json → configs/embed_config.json +0 -0
- src/main.py → main.py +17 -12
- src/models.py → models.py +2 -2
- requirements.txt +2 -1
- src/__init__.py +0 -0
- style.css → static/style.css +0 -0
- home.html → templates/index.html +4 -4
Dockerfile
CHANGED
@@ -13,4 +13,4 @@ RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
|
13 |
|
14 |
COPY --chown=user . .
|
15 |
|
16 |
-
CMD ["uvicorn", "
|
|
|
13 |
|
14 |
COPY --chown=user . .
|
15 |
|
16 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
__pycache__/main.cpython-310.pyc
ADDED
Binary file (6.21 kB). View file
|
|
__pycache__/models.cpython-310.pyc
ADDED
Binary file (1.79 kB). View file
|
|
chunk_config.json → configs/chunk_config.json
RENAMED
File without changes
|
embed_config.json → configs/embed_config.json
RENAMED
File without changes
|
src/main.py → main.py
RENAMED
@@ -7,8 +7,10 @@ import os
|
|
7 |
import tempfile
|
8 |
import requests
|
9 |
|
10 |
-
from fastapi import FastAPI, BackgroundTasks
|
11 |
-
from fastapi.responses import
|
|
|
|
|
12 |
|
13 |
from aiohttp import ClientSession
|
14 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
@@ -16,7 +18,7 @@ from datasets import Dataset, load_dataset
|
|
16 |
from tqdm import tqdm
|
17 |
from tqdm.asyncio import tqdm_asyncio
|
18 |
|
19 |
-
from
|
20 |
|
21 |
logging.basicConfig(level=logging.INFO)
|
22 |
logger = logging.getLogger(__name__)
|
@@ -35,15 +37,18 @@ INPUT_SPLITS = os.getenv("INPUT_SPLITS")
|
|
35 |
# name of column to load from input dataset
|
36 |
INPUT_TEXT_COL = os.getenv("INPUT_TEXT_COL")
|
37 |
|
38 |
-
INPUT_SPLITS = [spl.strip() for spl in INPUT_SPLITS.split(",") if spl]
|
39 |
|
40 |
app = FastAPI()
|
41 |
app.state.seen_Sha = set()
|
42 |
|
|
|
|
|
43 |
|
44 |
-
@app.get("/")
|
45 |
-
async def
|
46 |
-
return
|
|
|
47 |
|
48 |
|
49 |
@app.post("/webhook")
|
@@ -56,13 +61,13 @@ async def post_webhook(
|
|
56 |
and payload.event.scope.startswith("repo.content")
|
57 |
and payload.repo.type == "dataset"
|
58 |
# webhook posts multiple requests with the same update, this addresses that
|
59 |
-
and payload.repo.headSha not in app.state.
|
60 |
):
|
61 |
# no-op
|
62 |
logger.info("Update detected, no action taken")
|
63 |
return {"processed": False}
|
64 |
|
65 |
-
app.state.
|
66 |
task_queue.add_task(chunk_dataset, ds_name=payload.repo.name)
|
67 |
task_queue.add_task(embed_dataset, ds_name=CHUNKED_DS_NAME)
|
68 |
|
@@ -206,6 +211,6 @@ def embed_dataset(ds_name):
|
|
206 |
|
207 |
# For debugging
|
208 |
|
209 |
-
|
210 |
-
|
211 |
-
|
|
|
7 |
import tempfile
|
8 |
import requests
|
9 |
|
10 |
+
from fastapi import FastAPI, Request, BackgroundTasks
|
11 |
+
from fastapi.responses import HTMLResponse
|
12 |
+
from fastapi.staticfiles import StaticFiles
|
13 |
+
from fastapi.templating import Jinja2Templates
|
14 |
|
15 |
from aiohttp import ClientSession
|
16 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
18 |
from tqdm import tqdm
|
19 |
from tqdm.asyncio import tqdm_asyncio
|
20 |
|
21 |
+
from models import chunk_config, embed_config, WebhookPayload
|
22 |
|
23 |
logging.basicConfig(level=logging.INFO)
|
24 |
logger = logging.getLogger(__name__)
|
|
|
37 |
# name of column to load from input dataset
|
38 |
INPUT_TEXT_COL = os.getenv("INPUT_TEXT_COL")
|
39 |
|
40 |
+
# INPUT_SPLITS = [spl.strip() for spl in INPUT_SPLITS.split(",") if spl]
|
41 |
|
42 |
app = FastAPI()
|
43 |
app.state.seen_Sha = set()
|
44 |
|
45 |
+
app.mount("/static", StaticFiles(directory="static"), name="static")
|
46 |
+
templates = Jinja2Templates(directory="templates")
|
47 |
|
48 |
+
@app.get("/", response_class=HTMLResponse)
|
49 |
+
async def root(request: Request):
|
50 |
+
return templates.TemplateResponse(request=request, name="index.html")
|
51 |
+
# return FileResponse("/Users/spetrov/Documents/PROJECTS/hub_etl_pipeline/auto-chunk-embed/templates/index.html")
|
52 |
|
53 |
|
54 |
@app.post("/webhook")
|
|
|
61 |
and payload.event.scope.startswith("repo.content")
|
62 |
and payload.repo.type == "dataset"
|
63 |
# webhook posts multiple requests with the same update, this addresses that
|
64 |
+
and payload.repo.headSha not in app.state.seen_Sha
|
65 |
):
|
66 |
# no-op
|
67 |
logger.info("Update detected, no action taken")
|
68 |
return {"processed": False}
|
69 |
|
70 |
+
app.state.seen_Sha.add(payload.repo.headSha)
|
71 |
task_queue.add_task(chunk_dataset, ds_name=payload.repo.name)
|
72 |
task_queue.add_task(embed_dataset, ds_name=CHUNKED_DS_NAME)
|
73 |
|
|
|
211 |
|
212 |
# For debugging
|
213 |
|
214 |
+
import uvicorn
|
215 |
+
if __name__ == "__main__":
|
216 |
+
uvicorn.run(app, host="0.0.0.0", port=7860)
|
src/models.py → models.py
RENAMED
@@ -34,10 +34,10 @@ class WebhookPayload(BaseModel):
|
|
34 |
repo: WebhookPayloadRepo
|
35 |
|
36 |
|
37 |
-
with open(os.path.join(os.getcwd(), "chunk_config.json")) as c:
|
38 |
data = json.load(c)
|
39 |
chunk_config = ChunkConfig.model_validate_json(json.dumps(data))
|
40 |
|
41 |
-
with open(os.path.join(os.getcwd(), "embed_config.json")) as c:
|
42 |
data = json.load(c)
|
43 |
embed_config = EmbedConfig.model_validate_json(json.dumps(data))
|
|
|
34 |
repo: WebhookPayloadRepo
|
35 |
|
36 |
|
37 |
+
with open(os.path.join(os.getcwd(), "configs/chunk_config.json")) as c:
|
38 |
data = json.load(c)
|
39 |
chunk_config = ChunkConfig.model_validate_json(json.dumps(data))
|
40 |
|
41 |
+
with open(os.path.join(os.getcwd(), "configs/embed_config.json")) as c:
|
42 |
data = json.load(c)
|
43 |
embed_config = EmbedConfig.model_validate_json(json.dumps(data))
|
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
fastapi==0.
|
2 |
requests==2.31.*
|
3 |
huggingface-hub==0.20.*
|
4 |
uvicorn[standard]==0.17.*
|
@@ -7,3 +7,4 @@ datasets==2.16.*
|
|
7 |
langchain==0.0.*
|
8 |
aiohttp==3.8.*
|
9 |
spacy==3.7.*
|
|
|
|
1 |
+
fastapi==0.110.*
|
2 |
requests==2.31.*
|
3 |
huggingface-hub==0.20.*
|
4 |
uvicorn[standard]==0.17.*
|
|
|
7 |
langchain==0.0.*
|
8 |
aiohttp==3.8.*
|
9 |
spacy==3.7.*
|
10 |
+
jinja2==3.1.*
|
src/__init__.py
DELETED
File without changes
|
style.css → static/style.css
RENAMED
File without changes
|
home.html → templates/index.html
RENAMED
@@ -1,10 +1,10 @@
|
|
1 |
<!DOCTYPE html>
|
2 |
<html>
|
3 |
<head>
|
4 |
-
<meta charset="utf-8"
|
5 |
-
<meta name="viewport" content="width=device-width"
|
6 |
<title>Auto chunking and embedding</title>
|
7 |
-
<link
|
8 |
</head>
|
9 |
<body>
|
10 |
<div class="card">
|
@@ -12,7 +12,7 @@
|
|
12 |
|
13 |
<p>This is a webhook space to chunk and embed a dataset when it changes.</p>
|
14 |
|
15 |
-
<p>Use
|
16 |
</div>
|
17 |
</body>
|
18 |
</html>
|
|
|
1 |
<!DOCTYPE html>
|
2 |
<html>
|
3 |
<head>
|
4 |
+
<meta charset="utf-8">
|
5 |
+
<meta name="viewport" content="width=device-width">
|
6 |
<title>Auto chunking and embedding</title>
|
7 |
+
<link href="{{ url_for('static', path='/style.css') }}" rel="stylesheet">
|
8 |
</head>
|
9 |
<body>
|
10 |
<div class="card">
|
|
|
12 |
|
13 |
<p>This is a webhook space to chunk and embed a dataset when it changes.</p>
|
14 |
|
15 |
+
<p>Use <a href="https://huggingface.co/docs/hub/webhooks-guide-auto-retrain" target="_blank">this</a> as a reference</p>
|
16 |
</div>
|
17 |
</body>
|
18 |
</html>
|