Spaces:
Sleeping
Sleeping
File size: 1,182 Bytes
ca2fff7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
from typing import Annotated
from fastapi import APIRouter, UploadFile, File, Body
from langchain.schema import Document
import io
import os
from pypdf import PdfReader
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from db.vector_store import Store
async def generate_documents(file: UploadFile, file_name: str):
    """Stream one ``Document`` per text chunk of an uploaded file.

    Chunk lists are pulled from ``convert_documents(file)``. Every chunk in
    the first list is tagged with page 1, every chunk in the second list
    with page 2, and so on.

    Args:
        file: The upload handed through to ``convert_documents``.
        file_name: Stored under the ``"file"`` metadata key of each document.

    Yields:
        A ``Document`` whose ``page_content`` is one chunk and whose metadata
        holds the ``"file"`` name and the 1-based ``"page"`` counter.
    """
    page_number = 1
    async for chunks in convert_documents(file):
        for chunk in chunks:
            yield Document(
                page_content=chunk,
                metadata={"file": file_name, "page": page_number},
            )
        # Advance once per chunk list, whether or not it held any chunks.
        page_number += 1
async def convert_documents(file: UploadFile):
    """Split an uploaded PDF or text file into token-bounded chunks.

    This is an async generator. For an upload whose ``content_type`` is
    exactly ``application/pdf`` it yields one ``split_text`` result per page,
    in page order. For an upload whose ``content_type`` contains ``"text"``
    it yields a single ``split_text`` result for the whole UTF-8 decoded
    body. Any other content type, including a missing one, yields nothing.

    Args:
        file: The uploaded file. Its ``content_type`` selects the parser and
            its body is read fully into memory.

    Yields:
        The value returned by the splitter's ``split_text`` for one PDF page
        or for the entire text body.

    Raises:
        UnicodeDecodeError: If a text upload is not valid UTF-8.
    """
    # ``content_type`` is None when the client sent no Content-Type; treat
    # that as unsupported rather than failing on ``"text" in None``.
    content_type = file.content_type or ""
    is_pdf = content_type == "application/pdf"
    if not is_pdf and "text" not in content_type:
        return

    # Build the splitter only once the upload is known to be supported.
    splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0)
    content = await file.read()

    if is_pdf:
        # Parse the PDF document from the in-memory bytes.
        pdf_reader = PdfReader(io.BytesIO(content))
        try:
            for page in pdf_reader.pages:
                yield splitter.split_text(page.extract_text())
        except Exception as e:
            # Best-effort: a failure while reading or splitting a page ends
            # the stream after the pages already yielded.
            print(f"Exception {e}")
    else:
        yield splitter.split_text(content.decode("utf-8"))