Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ from datasets import load_dataset
|
|
4 |
from fastapi.middleware.cors import CORSMiddleware
|
5 |
import pdfplumber
|
6 |
import pytesseract
|
|
|
7 |
|
8 |
# Loading
|
9 |
import os
|
@@ -272,6 +273,8 @@ async def download_database():
|
|
272 |
async def convert_upload_file(file: UploadFile = File(...)):
|
273 |
import pytesseract
|
274 |
from pdf2image import convert_from_path
|
|
|
|
|
275 |
|
276 |
file_savePath = join(temp_path,file.filename)
|
277 |
|
@@ -287,78 +290,26 @@ async def convert_upload_file(file: UploadFile = File(...)):
|
|
287 |
ocr_text = pytesseract.image_to_string(image,lang='vie')
|
288 |
text=text+ocr_text+'\n'
|
289 |
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
def filter_by_labels(elements, labels, format):
|
309 |
-
for element in elements:
|
310 |
-
for label in labels:
|
311 |
-
if label.lower() in element.text.lower():
|
312 |
-
return extract_value_from_text(element.text, format)
|
313 |
-
return None
|
314 |
-
|
315 |
-
def filter_by_values(elements, values):
|
316 |
-
for element in elements:
|
317 |
-
for value in values:
|
318 |
-
if value.lower() in element.text.lower():
|
319 |
-
return value
|
320 |
-
return None
|
321 |
-
|
322 |
-
def get_elements_by_schemas(elements, schemas):
|
323 |
-
result_elements=[]
|
324 |
-
for schema in schemas:
|
325 |
-
result_element={}
|
326 |
-
filterred_by_type_elements = filter_by_type(elements, schema['layout_type'])
|
327 |
-
if 'labels' in schema:
|
328 |
-
filterred_by_label_elements = filter_by_labels(filterred_by_type_elements, schema['labels'], schema['format'])
|
329 |
-
if filterred_by_label_elements is not None:
|
330 |
-
result_element[schema['name']] = filterred_by_label_elements
|
331 |
-
result_elements.append(result_element)
|
332 |
-
elif 'values' in schema:
|
333 |
-
fitered_by_value_elements = filter_by_values(filterred_by_type_elements, schema['values'])
|
334 |
-
if fitered_by_value_elements is not None:
|
335 |
-
result_element[schema['name']] = fitered_by_value_elements
|
336 |
-
result_elements.append(result_element)
|
337 |
-
else:
|
338 |
-
if filterred_by_type_elements is not None:
|
339 |
-
result_element[schema['name']] = filterred_by_type_elements[0].text
|
340 |
-
result_elements.append(result_element)
|
341 |
-
|
342 |
-
return result_elements
|
343 |
-
|
344 |
-
|
345 |
-
@app.post("/pdf2metadata/")
|
346 |
-
async def extract_upload_file(file: UploadFile = File(...)):
|
347 |
-
from unstructured.partition.pdf import partition_pdf
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
file_savePath = join(temp_path,file.filename)
|
352 |
-
|
353 |
-
with open(file_savePath,'wb') as f:
|
354 |
-
shutil.copyfileobj(file.file, f)
|
355 |
|
356 |
-
|
357 |
-
elements = partition_pdf(file_savePath, languages=["vie"])
|
358 |
|
359 |
-
schemas = [{'name':'publisher','layout_type':'Title','position':0,'from_last':False},{'name':'number','layout_type':'Text','position':0,'from_last':False, 'label':['Số','Luật số']}]
|
360 |
-
|
361 |
-
return get_elements_by_schemas(elements, schemas)
|
362 |
|
363 |
@app.get("/")
|
364 |
def api_home():
|
|
|
4 |
from fastapi.middleware.cors import CORSMiddleware
|
5 |
import pdfplumber
|
6 |
import pytesseract
|
7 |
+
from models import Article, Chapter, Law
|
8 |
|
9 |
# Loading
|
10 |
import os
|
|
|
273 |
async def convert_upload_file(file: UploadFile = File(...)):
|
274 |
import pytesseract
|
275 |
from pdf2image import convert_from_path
|
276 |
+
from octoai.client import OctoAI
|
277 |
+
from octoai.text_gen import ChatCompletionResponseFormat, ChatMessage
|
278 |
|
279 |
file_savePath = join(temp_path,file.filename)
|
280 |
|
|
|
290 |
ocr_text = pytesseract.image_to_string(image,lang='vie')
|
291 |
text=text+ocr_text+'\n'
|
292 |
|
293 |
+
client = OctoAI()
|
294 |
+
|
295 |
+
completion = client.text_gen.create_chat_completion(
|
296 |
+
model="meta-llama-3-8b-instruct",
|
297 |
+
messages=[
|
298 |
+
ChatMessage(role="system", content="You are a helpful assistant."),
|
299 |
+
ChatMessage(role="user", content=text),
|
300 |
+
],
|
301 |
+
max_tokens=512,
|
302 |
+
presence_penalty=0,
|
303 |
+
temperature=0.1,
|
304 |
+
top_p=0.9,
|
305 |
+
response_format=ChatCompletionResponseFormat(
|
306 |
+
type="json_object",
|
307 |
+
schema=Law.model_json_schema(),
|
308 |
+
),
|
309 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
310 |
|
311 |
+
return {content:text,metadate:completion.choices[0].message.content}
|
|
|
312 |
|
|
|
|
|
|
|
313 |
|
314 |
@app.get("/")
|
315 |
def api_home():
|