plaggy committed on
Commit
187981b
·
1 Parent(s): ce236eb

html template

Browse files
Dockerfile CHANGED
@@ -13,4 +13,4 @@ RUN pip install --no-cache-dir --upgrade -r requirements.txt
13
 
14
  COPY --chown=user . .
15
 
16
- CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "7860"]
 
13
 
14
  COPY --chown=user . .
15
 
16
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
__pycache__/main.cpython-310.pyc ADDED
Binary file (6.21 kB). View file
 
__pycache__/models.cpython-310.pyc ADDED
Binary file (1.79 kB). View file
 
chunk_config.json → configs/chunk_config.json RENAMED
File without changes
embed_config.json → configs/embed_config.json RENAMED
File without changes
src/main.py → main.py RENAMED
@@ -7,8 +7,10 @@ import os
7
  import tempfile
8
  import requests
9
 
10
- from fastapi import FastAPI, BackgroundTasks
11
- from fastapi.responses import FileResponse
 
 
12
 
13
  from aiohttp import ClientSession
14
  from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -16,7 +18,7 @@ from datasets import Dataset, load_dataset
16
  from tqdm import tqdm
17
  from tqdm.asyncio import tqdm_asyncio
18
 
19
- from src.models import chunk_config, embed_config, WebhookPayload
20
 
21
  logging.basicConfig(level=logging.INFO)
22
  logger = logging.getLogger(__name__)
@@ -35,15 +37,18 @@ INPUT_SPLITS = os.getenv("INPUT_SPLITS")
35
  # name of column to load from input dataset
36
  INPUT_TEXT_COL = os.getenv("INPUT_TEXT_COL")
37
 
38
- INPUT_SPLITS = [spl.strip() for spl in INPUT_SPLITS.split(",") if spl]
39
 
40
  app = FastAPI()
41
  app.state.seen_Sha = set()
42
 
 
 
43
 
44
- @app.get("/")
45
- async def home():
46
- return FileResponse("home.html")
 
47
 
48
 
49
  @app.post("/webhook")
@@ -56,13 +61,13 @@ async def post_webhook(
56
  and payload.event.scope.startswith("repo.content")
57
  and payload.repo.type == "dataset"
58
  # webhook posts multiple requests with the same update, this addresses that
59
- and payload.repo.headSha not in app.state.last_Sha
60
  ):
61
  # no-op
62
  logger.info("Update detected, no action taken")
63
  return {"processed": False}
64
 
65
- app.state.last_Sha.add(payload.repo.headSha)
66
  task_queue.add_task(chunk_dataset, ds_name=payload.repo.name)
67
  task_queue.add_task(embed_dataset, ds_name=CHUNKED_DS_NAME)
68
 
@@ -206,6 +211,6 @@ def embed_dataset(ds_name):
206
 
207
  # For debugging
208
 
209
- # import uvicorn
210
- # if __name__ == "__main__":
211
- # uvicorn.run(app, host="0.0.0.0", port=7860)
 
7
  import tempfile
8
  import requests
9
 
10
+ from fastapi import FastAPI, Request, BackgroundTasks
11
+ from fastapi.responses import HTMLResponse
12
+ from fastapi.staticfiles import StaticFiles
13
+ from fastapi.templating import Jinja2Templates
14
 
15
  from aiohttp import ClientSession
16
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
18
  from tqdm import tqdm
19
  from tqdm.asyncio import tqdm_asyncio
20
 
21
+ from models import chunk_config, embed_config, WebhookPayload
22
 
23
  logging.basicConfig(level=logging.INFO)
24
  logger = logging.getLogger(__name__)
 
37
  # name of column to load from input dataset
38
  INPUT_TEXT_COL = os.getenv("INPUT_TEXT_COL")
39
 
40
+ # INPUT_SPLITS = [spl.strip() for spl in INPUT_SPLITS.split(",") if spl]
41
 
42
  app = FastAPI()
43
  app.state.seen_Sha = set()
44
 
45
+ app.mount("/static", StaticFiles(directory="static"), name="static")
46
+ templates = Jinja2Templates(directory="templates")
47
 
48
+ @app.get("/", response_class=HTMLResponse)
49
+ async def root(request: Request):
50
+ return templates.TemplateResponse(request=request, name="index.html")
51
+ # return FileResponse("/Users/spetrov/Documents/PROJECTS/hub_etl_pipeline/auto-chunk-embed/templates/index.html")
52
 
53
 
54
  @app.post("/webhook")
 
61
  and payload.event.scope.startswith("repo.content")
62
  and payload.repo.type == "dataset"
63
  # webhook posts multiple requests with the same update, this addresses that
64
+ and payload.repo.headSha not in app.state.seen_Sha
65
  ):
66
  # no-op
67
  logger.info("Update detected, no action taken")
68
  return {"processed": False}
69
 
70
+ app.state.seen_Sha.add(payload.repo.headSha)
71
  task_queue.add_task(chunk_dataset, ds_name=payload.repo.name)
72
  task_queue.add_task(embed_dataset, ds_name=CHUNKED_DS_NAME)
73
 
 
211
 
212
  # For debugging
213
 
214
+ import uvicorn
215
+ if __name__ == "__main__":
216
+ uvicorn.run(app, host="0.0.0.0", port=7860)
src/models.py → models.py RENAMED
@@ -34,10 +34,10 @@ class WebhookPayload(BaseModel):
34
  repo: WebhookPayloadRepo
35
 
36
 
37
- with open(os.path.join(os.getcwd(), "chunk_config.json")) as c:
38
  data = json.load(c)
39
  chunk_config = ChunkConfig.model_validate_json(json.dumps(data))
40
 
41
- with open(os.path.join(os.getcwd(), "embed_config.json")) as c:
42
  data = json.load(c)
43
  embed_config = EmbedConfig.model_validate_json(json.dumps(data))
 
34
  repo: WebhookPayloadRepo
35
 
36
 
37
+ with open(os.path.join(os.getcwd(), "configs/chunk_config.json")) as c:
38
  data = json.load(c)
39
  chunk_config = ChunkConfig.model_validate_json(json.dumps(data))
40
 
41
+ with open(os.path.join(os.getcwd(), "configs/embed_config.json")) as c:
42
  data = json.load(c)
43
  embed_config = EmbedConfig.model_validate_json(json.dumps(data))
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- fastapi==0.104.*
2
  requests==2.31.*
3
  huggingface-hub==0.20.*
4
  uvicorn[standard]==0.17.*
@@ -7,3 +7,4 @@ datasets==2.16.*
7
  langchain==0.0.*
8
  aiohttp==3.8.*
9
  spacy==3.7.*
 
 
1
+ fastapi==0.110.*
2
  requests==2.31.*
3
  huggingface-hub==0.20.*
4
  uvicorn[standard]==0.17.*
 
7
  langchain==0.0.*
8
  aiohttp==3.8.*
9
  spacy==3.7.*
10
+ jinja2==3.1.*
src/__init__.py DELETED
File without changes
style.css → static/style.css RENAMED
File without changes
home.html → templates/index.html RENAMED
@@ -1,10 +1,10 @@
1
  <!DOCTYPE html>
2
  <html>
3
  <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
  <title>Auto chunking and embedding</title>
7
- <link rel="stylesheet" href="style.css" />
8
  </head>
9
  <body>
10
  <div class="card">
@@ -12,7 +12,7 @@
12
 
13
  <p>This is a webhook space to chunk and embed a dataset when it changes.</p>
14
 
15
- <p>Use this as a reference <a href="https://huggingface.co/docs/hub/webhooks-guide-auto-retrain" target="_blank">here</a>!</p>
16
  </div>
17
  </body>
18
  </html>
 
1
  <!DOCTYPE html>
2
  <html>
3
  <head>
4
+ <meta charset="utf-8">
5
+ <meta name="viewport" content="width=device-width">
6
  <title>Auto chunking and embedding</title>
7
+ <link href="{{ url_for('static', path='/style.css') }}" rel="stylesheet">
8
  </head>
9
  <body>
10
  <div class="card">
 
12
 
13
  <p>This is a webhook space to chunk and embed a dataset when it changes.</p>
14
 
15
+ <p>Use <a href="https://huggingface.co/docs/hub/webhooks-guide-auto-retrain" target="_blank">this</a> as a reference</p>
16
  </div>
17
  </body>
18
  </html>